From 300e39a1cd3103f720496ede72ad309edd26c07b Mon Sep 17 00:00:00 2001 From: xiexionghang Date: Fri, 26 Jul 2019 09:20:55 +0800 Subject: [PATCH] for paddle clone --- AUTHORS.md | 70 + BCLOUD | 126 +- CMakeLists.txt | 242 + CODE_OF_CONDUCT.md | 46 + CODE_OF_CONDUCT_cn.md | 50 + CONTRIBUTING.md | 162 + Dockerfile | 211 + ISSUE_TEMPLATE.md | 14 + LICENSE | 203 + README.md | 115 +- README_cn.md | 91 + RELEASE.md | 3 + cmake/FindGperftools.cmake | 63 + cmake/FindNumPy.cmake | 38 + cmake/anakin_subgraph.cmake | 45 + cmake/cblas.cmake | 94 + cmake/ccache.cmake | 9 + cmake/configure.cmake | 152 + cmake/coveralls.cmake | 102 + cmake/coverallsGcovJsons.cmake | 401 + cmake/cuda.cmake | 224 + cmake/cudnn.cmake | 102 + cmake/cupti.cmake | 41 + cmake/external/boost.cmake | 60 + cmake/external/brpc.cmake | 70 + cmake/external/cares.cmake | 45 + cmake/external/cub.cmake | 33 + cmake/external/dgc.cmake | 40 + cmake/external/dlpack.cmake | 29 + cmake/external/eigen.cmake | 62 + cmake/external/gflags.cmake | 64 + cmake/external/glog.cmake | 67 + cmake/external/grpc.cmake | 78 + cmake/external/gtest.cmake | 82 + cmake/external/leveldb.cmake | 41 + cmake/external/libmct.cmake | 75 + cmake/external/libxsmm.cmake | 55 + cmake/external/mkldnn.cmake | 120 + cmake/external/mklml.cmake | 78 + cmake/external/ngraph.cmake | 84 + cmake/external/openblas.cmake | 93 + cmake/external/protobuf.cmake | 251 + cmake/external/pslib.cmake | 72 + cmake/external/pslib_brpc.cmake | 72 + cmake/external/pybind11.cmake | 46 + cmake/external/python.cmake | 83 + cmake/external/rocprim.cmake | 44 + cmake/external/snappy.cmake | 65 + cmake/external/snappystream.cmake | 63 + cmake/external/threadpool.cmake | 28 + cmake/external/warpctc.cmake | 81 + cmake/external/xbyak.cmake | 57 + cmake/external/xxhash.cmake | 68 + cmake/external/zlib.cmake | 54 + cmake/flags.cmake | 194 + cmake/generic.cmake | 816 + cmake/hip.cmake | 53 + cmake/inference_lib.cmake | 296 + cmake/make_resource.py | 25 + cmake/operators.cmake | 227 + cmake/package.cmake | 21 + cmake/python_module.cmake | 43 + cmake/simd.cmake | 99 + cmake/system.cmake | 85 + cmake/tensorrt.cmake | 38 + cmake/util.cmake | 55 + cmake/version.cmake | 63 + doc/README.md | 7 + go/glide.lock | 233 + go/glide.yaml | 33 + paddle/.common_test_util.sh | 120 + paddle/.gitignore | 43 + paddle/.set_port.sh | 18 + paddle/.set_python_path.sh | 35 + paddle/CMakeLists.txt | 4 + paddle/contrib/float16/.gitignore | 1 + paddle/contrib/float16/README.md | 171 + paddle/contrib/float16/float16_benchmark.md | 97 + .../contrib/float16/float16_inference_demo.py | 362 + paddle/contrib/float16/float16_transpiler.py | 256 + paddle/contrib/float16/run_float16_demo.sh | 111 + paddle/fluid/.clang-format | 5 + paddle/fluid/API.spec | 1043 ++ paddle/fluid/CMakeLists.txt | 12 + paddle/fluid/framework/.gitignore | 2 + paddle/fluid/framework/CMakeLists.txt | 244 + paddle/fluid/framework/archive.h | 609 + paddle/fluid/framework/array.h | 137 + paddle/fluid/framework/async_executor.cc | 172 + paddle/fluid/framework/async_executor.h | 94 + paddle/fluid/framework/attribute.cc | 81 + paddle/fluid/framework/attribute.h | 345 + paddle/fluid/framework/block_desc.cc | 242 + paddle/fluid/framework/block_desc.h | 123 + paddle/fluid/framework/blocking_queue.h | 110 + paddle/fluid/framework/channel.h | 460 + paddle/fluid/framework/commit.h | 21 + paddle/fluid/framework/commit.h.in | 21 + .../fluid/framework/data_device_transform.cc | 45 + .../fluid/framework/data_device_transform.h | 28 + .../framework/data_device_transform_test.cu | 169 + paddle/fluid/framework/data_feed.cc | 1100 ++ paddle/fluid/framework/data_feed.h | 557 + paddle/fluid/framework/data_feed.proto | 33 + paddle/fluid/framework/data_feed_factory.cc | 71 + paddle/fluid/framework/data_feed_factory.h | 29 + paddle/fluid/framework/data_feed_test.cc | 330 + paddle/fluid/framework/data_layout.h | 73 + .../fluid/framework/data_layout_transform.cc | 185 + .../fluid/framework/data_layout_transform.h | 79 + .../framework/data_layout_transform_test.cc | 45 + paddle/fluid/framework/data_set.cc | 652 + paddle/fluid/framework/data_set.h | 198 + paddle/fluid/framework/data_transform.cc | 116 + paddle/fluid/framework/data_transform.h | 44 + paddle/fluid/framework/data_type.cc | 98 + paddle/fluid/framework/data_type.h | 86 + paddle/fluid/framework/data_type_test.cc | 40 + paddle/fluid/framework/data_type_transform.cc | 106 + paddle/fluid/framework/data_type_transform.cu | 1 + paddle/fluid/framework/data_type_transform.h | 33 + .../framework/data_type_transform_test.cc | 192 + .../framework/data_type_transform_test.cu | 261 + paddle/fluid/framework/dataset_factory.cc | 66 + paddle/fluid/framework/dataset_factory.h | 29 + paddle/fluid/framework/ddim.cc | 141 + paddle/fluid/framework/ddim.h | 209 + paddle/fluid/framework/ddim_test.cc | 85 + paddle/fluid/framework/details/CMakeLists.txt | 99 + .../framework/details/all_reduce_op_handle.cc | 168 + .../framework/details/all_reduce_op_handle.h | 71 + .../details/async_ssa_graph_executor.cc | 210 + .../details/async_ssa_graph_executor.h | 67 + .../framework/details/broadcast_op_handle.cc | 169 + .../framework/details/broadcast_op_handle.h | 82 + .../details/broadcast_op_handle_test.cc | 57 + .../details/broadcast_op_handle_test.h | 276 + .../fluid/framework/details/build_strategy.cc | 407 + .../fluid/framework/details/build_strategy.h | 189 + .../details/computation_op_handle.cc | 53 + .../framework/details/computation_op_handle.h | 65 + .../fluid/framework/details/container_cast.h | 40 + paddle/fluid/framework/details/cow_ptr.h | 56 + .../fluid/framework/details/cow_ptr_test.cc | 43 + .../framework/details/dgc_const_values.h | 32 + .../details/eager_deletion_op_handle.cc | 150 + .../details/eager_deletion_op_handle.h | 78 + .../framework/details/exception_holder.h | 115 + .../framework/details/execution_strategy.h | 47 + .../fast_threaded_ssa_graph_executor.cc | 286 + .../fast_threaded_ssa_graph_executor.h | 95 + .../details/fetch_barrier_op_handle.cc | 64 + .../details/fetch_barrier_op_handle.h | 63 + .../framework/details/fetch_op_handle.cc | 92 + .../fluid/framework/details/fetch_op_handle.h | 62 + .../details/fused_all_reduce_op_handle.cc | 240 + .../details/fused_all_reduce_op_handle.h | 81 + .../details/fused_broadcast_op_handle.cc | 50 + .../details/fused_broadcast_op_handle.h | 57 + .../details/fused_broadcast_op_handle_test.cc | 171 + .../framework/details/gather_op_handle.cc | 115 + .../framework/details/gather_op_handle.h | 52 + .../details/gather_op_handle_test.cc | 212 + .../fluid/framework/details/graph_test_base.h | 80 + .../framework/details/multi_devices_helper.cc | 20 + .../framework/details/multi_devices_helper.h | 77 + .../fluid/framework/details/nccl_op_handle.h | 235 + .../fluid/framework/details/op_handle_base.cc | 254 + .../fluid/framework/details/op_handle_base.h | 142 + paddle/fluid/framework/details/op_registry.h | 245 + .../details/parallel_ssa_graph_executor.cc | 181 + .../details/parallel_ssa_graph_executor.h | 61 + .../framework/details/reduce_and_gather.h | 122 + .../framework/details/reduce_op_handle.cc | 330 + .../framework/details/reduce_op_handle.h | 117 + .../details/reduce_op_handle_test.cc | 294 + .../fluid/framework/details/rpc_op_handle.cc | 50 + .../fluid/framework/details/rpc_op_handle.h | 56 + .../details/scale_loss_grad_op_handle.cc | 91 + .../details/scale_loss_grad_op_handle.h | 51 + .../scope_buffered_ssa_graph_executor.cc | 152 + .../scope_buffered_ssa_graph_executor.h | 78 + .../details/share_tensor_buffer_op_handle.cc | 133 + .../details/share_tensor_buffer_op_handle.h | 74 + .../details/sparse_all_reduce_op_handle.cc | 186 + .../details/sparse_all_reduce_op_handle.h | 52 + .../framework/details/ssa_graph_executor.cc | 42 + .../framework/details/ssa_graph_executor.h | 44 + .../details/threaded_ssa_graph_executor.cc | 349 + .../details/threaded_ssa_graph_executor.h | 119 + paddle/fluid/framework/details/var_handle.cc | 39 + paddle/fluid/framework/details/var_handle.h | 166 + .../framework/details/variable_visitor.cc | 139 + .../framework/details/variable_visitor.h | 36 + paddle/fluid/framework/device_worker.cc | 27 + paddle/fluid/framework/device_worker.h | 305 + .../fluid/framework/device_worker_factory.cc | 68 + .../fluid/framework/device_worker_factory.h | 31 + paddle/fluid/framework/device_worker_test.cc | 24 + paddle/fluid/framework/dim.h | 101 + paddle/fluid/framework/dim_test.cu | 86 + paddle/fluid/framework/dist_multi_trainer.cc | 78 + paddle/fluid/framework/dlpack_tensor.cc | 124 + paddle/fluid/framework/dlpack_tensor.h | 45 + paddle/fluid/framework/dlpack_tensor_test.cc | 101 + paddle/fluid/framework/downpour_worker.cc | 523 + paddle/fluid/framework/eigen.h | 119 + paddle/fluid/framework/eigen_test.cc | 132 + paddle/fluid/framework/executor.cc | 498 + paddle/fluid/framework/executor.h | 129 + paddle/fluid/framework/executor_gc_helper.cc | 189 + paddle/fluid/framework/executor_gc_helper.h | 42 + .../fluid/framework/executor_thread_worker.cc | 698 + .../fluid/framework/executor_thread_worker.h | 245 + paddle/fluid/framework/expect.h | 32 + paddle/fluid/framework/feed_fetch_method.cc | 67 + paddle/fluid/framework/feed_fetch_method.h | 33 + paddle/fluid/framework/feed_fetch_type.h | 28 + paddle/fluid/framework/fleet/CMakeLists.txt | 7 + paddle/fluid/framework/fleet/fleet_wrapper.cc | 625 + paddle/fluid/framework/fleet/fleet_wrapper.h | 192 + paddle/fluid/framework/fleet/nccl_wrapper.cc | 78 + paddle/fluid/framework/fleet/nccl_wrapper.h | 83 + paddle/fluid/framework/framework.proto | 188 + paddle/fluid/framework/garbage_collector.cc | 131 + paddle/fluid/framework/garbage_collector.h | 148 + paddle/fluid/framework/grad_op_desc_maker.h | 194 + paddle/fluid/framework/hogwild_worker.cc | 178 + paddle/fluid/framework/inlined_vector.h | 69 + paddle/fluid/framework/inlined_vector_test.cc | 82 + paddle/fluid/framework/inplace_op_inference.h | 62 + .../framework/inplace_op_inference_test.cc | 328 + paddle/fluid/framework/io/CMakeLists.txt | 2 + paddle/fluid/framework/io/fs.cc | 456 + paddle/fluid/framework/io/fs.h | 101 + paddle/fluid/framework/io/shell.cc | 323 + paddle/fluid/framework/io/shell.h | 66 + paddle/fluid/framework/ir/CMakeLists.txt | 136 + .../framework/ir/attention_lstm_fuse_pass.cc | 284 + .../framework/ir/attention_lstm_fuse_pass.h | 30 + .../framework/ir/coalesce_grad_tensor_pass.cc | 488 + .../framework/ir/coalesce_grad_tensor_pass.h | 29 + .../ir/conv_affine_channel_fuse_pass.cc | 218 + .../ir/conv_affine_channel_fuse_pass.h | 49 + .../fluid/framework/ir/conv_bn_fuse_pass.cc | 296 + paddle/fluid/framework/ir/conv_bn_fuse_pass.h | 49 + .../ir/conv_elementwise_add2_act_fuse.cc | 104 + .../ir/conv_elementwise_add2_act_fuse_pass.cc | 105 + .../ir/conv_elementwise_add2_act_fuse_pass.h | 33 + .../ir/conv_elementwise_add_act_fuse_pass.cc | 102 + .../ir/conv_elementwise_add_act_fuse_pass.h | 33 + .../ir/conv_elementwise_add_fuse_pass.cc | 89 + .../ir/conv_elementwise_add_fuse_pass.h | 33 + .../ir/delete_quant_dequant_op_pass.cc | 82 + .../ir/delete_quant_dequant_op_pass.h | 34 + .../ir/embedding_fc_lstm_fuse_pass.cc | 243 + .../ir/embedding_fc_lstm_fuse_pass.h | 42 + paddle/fluid/framework/ir/fc_fuse_pass.cc | 111 + paddle/fluid/framework/ir/fc_fuse_pass.h | 39 + .../fluid/framework/ir/fc_fuse_pass_tester.cc | 99 + paddle/fluid/framework/ir/fc_gru_fuse_pass.cc | 180 + paddle/fluid/framework/ir/fc_gru_fuse_pass.h | 50 + .../fluid/framework/ir/fc_lstm_fuse_pass.cc | 184 + paddle/fluid/framework/ir/fc_lstm_fuse_pass.h | 51 + .../ir/fillconstant_elementwisemul_fuse.cc | 83 + .../ir/fillconstant_elementwisemul_fuse.h | 34 + .../framework/ir/fuse_elewise_add_act_pass.cc | 370 + .../framework/ir/fuse_elewise_add_act_pass.h | 74 + .../ir/fuse_optimizer_ops_pass/CMakeLists.txt | 4 + .../fuse_adam_op_pass.cc | 209 + .../fuse_momentum_op_pass.cc | 92 + .../fuse_optimizer_op_pass.cc | 364 + .../fuse_optimizer_op_pass.h | 98 + .../fuse_sgd_op_pass.cc | 70 + paddle/fluid/framework/ir/fuse_pass_base.cc | 64 + paddle/fluid/framework/ir/fuse_pass_base.h | 57 + .../ir/fuse_relu_depthwise_conv_pass.cc | 157 + .../ir/fuse_relu_depthwise_conv_pass.h | 41 + paddle/fluid/framework/ir/graph.cc | 209 + paddle/fluid/framework/ir/graph.h | 241 + paddle/fluid/framework/ir/graph_helper.cc | 395 + paddle/fluid/framework/ir/graph_helper.h | 90 + .../fluid/framework/ir/graph_helper_test.cc | 227 + .../framework/ir/graph_pattern_detector.cc | 1948 +++ .../framework/ir/graph_pattern_detector.h | 1046 ++ .../ir/graph_pattern_detector_tester.cc | 206 + paddle/fluid/framework/ir/graph_test.cc | 210 + .../framework/ir/graph_to_program_pass.cc | 83 + .../framework/ir/graph_to_program_pass.h | 34 + .../ir/graph_to_program_pass_test.cc | 112 + paddle/fluid/framework/ir/graph_traits.cc | 142 + paddle/fluid/framework/ir/graph_traits.h | 128 + paddle/fluid/framework/ir/graph_viz_pass.cc | 135 + paddle/fluid/framework/ir/graph_viz_pass.h | 54 + .../ir/identity_scale_op_clean_pass.cc | 82 + .../ir/identity_scale_op_clean_pass.h | 33 + .../framework/ir/infer_clean_graph_pass.cc | 67 + paddle/fluid/framework/ir/is_test_pass.cc | 55 + paddle/fluid/framework/ir/is_test_pass.h | 30 + .../fluid/framework/ir/is_test_pass_tester.cc | 120 + .../framework/ir/lock_free_optimize_pass.cc | 355 + .../framework/ir/lock_free_optimize_pass.h | 127 + .../ir/memory_optimize_pass/CMakeLists.txt | 24 + .../buffer_shared_inplace_op_pass.cc | 160 + .../eager_deletion_pass.cc | 292 + .../memory_optimize_pass/inplace_op_pass.cc | 487 + .../memory_optimization_var_info.h | 105 + .../memory_optimize_helper.cc | 569 + .../memory_optimize_helper.h | 189 + .../memory_optimize_helper_test.cc | 525 + .../memory_optimize_pass.cc | 224 + .../memory_optimize_pass.h | 72 + .../memory_optimize_pass/memory_reuse_pass.cc | 286 + .../memory_optimize_pass/memory_reuse_pass.h | 128 + .../ir/memory_optimize_pass/op_graph_view.cc | 71 + .../ir/memory_optimize_pass/op_graph_view.h | 80 + .../record_skip_memory_opt_vars_pass.cc | 170 + .../recurrent_op_eager_deletion_pass.cc | 76 + .../recurrent_op_eager_deletion_pass.h | 43 + .../reference_count_pass.cc | 381 + .../reference_count_pass_helper.cc | 35 + .../reference_count_pass_helper.h | 52 + .../while_op_eager_deletion_pass.cc | 60 + .../ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc | 139 + .../ir/mkldnn/conv_bias_mkldnn_fuse_pass.h | 49 + .../conv_bias_mkldnn_fuse_pass_tester.cc | 155 + .../ir/mkldnn/conv_brelu_mkldnn_fuse_pass.cc | 71 + .../ir/mkldnn/conv_brelu_mkldnn_fuse_pass.h | 39 + .../conv_brelu_mkldnn_fuse_pass_tester.cc | 135 + .../conv_concat_relu_mkldnn_fuse_pass.cc | 119 + .../conv_concat_relu_mkldnn_fuse_pass.h | 53 + ...onv_concat_relu_mkldnn_fuse_pass_tester.cc | 157 + .../conv_elementwise_add_mkldnn_fuse_pass.cc | 345 + .../conv_elementwise_add_mkldnn_fuse_pass.h | 134 + ...elementwise_add_mkldnn_fuse_pass_tester.cc | 274 + .../ir/mkldnn/conv_relu_mkldnn_fuse_pass.cc | 76 + .../ir/mkldnn/conv_relu_mkldnn_fuse_pass.h | 39 + .../conv_relu_mkldnn_fuse_pass_tester.cc | 127 + .../framework/ir/mkldnn/cpu_quantize_pass.cc | 365 + .../framework/ir/mkldnn/cpu_quantize_pass.h | 74 + .../ir/mkldnn/cpu_quantize_pass_tester.cc | 310 + .../ir/mkldnn/cpu_quantize_placement_pass.cc | 56 + .../ir/mkldnn/cpu_quantize_placement_pass.h | 33 + .../cpu_quantize_placement_pass_tester.cc | 129 + .../ir/mkldnn/cpu_quantize_squash_pass.cc | 142 + .../ir/mkldnn/cpu_quantize_squash_pass.h | 57 + .../mkldnn/cpu_quantize_squash_pass_tester.cc | 178 + .../ir/mkldnn/depthwise_conv_mkldnn_pass.cc | 56 + .../ir/mkldnn/depthwise_conv_mkldnn_pass.h | 33 + .../depthwise_conv_mkldnn_pass_tester.cc | 123 + .../framework/ir/mkldnn/fc_mkldnn_pass.cc | 77 + .../framework/ir/mkldnn/fc_mkldnn_pass.h | 38 + .../ir/mkldnn/mkldnn_placement_pass.cc | 51 + .../ir/mkldnn/mkldnn_placement_pass.h | 34 + .../ir/mkldnn/mkldnn_placement_pass_tester.cc | 136 + .../framework/ir/multi_batch_merge_pass.cc | 336 + .../framework/ir/multi_batch_merge_pass.h | 44 + .../multi_devices_graph_pass/CMakeLists.txt | 17 + .../all_reduce_deps_pass.cc | 219 + .../backward_optimizer_op_deps_pass.cc | 223 + .../fuse_all_reduce_op_pass.cc | 206 + .../modify_op_lock_and_record_event_pass.cc | 59 + .../multi_devices_graph_check_pass.cc | 102 + .../multi_devices_graph_pass.cc | 1118 ++ .../multi_devices_graph_pass.h | 215 + .../multi_devices_graph_print_pass.cc | 101 + .../multi_devices_graph_print_pass.h | 44 + .../sequential_execution_pass.cc | 113 + .../framework/ir/ngraph_subgraph_pass.cc | 158 + .../fluid/framework/ir/ngraph_subgraph_pass.h | 42 + paddle/fluid/framework/ir/node.cc | 43 + paddle/fluid/framework/ir/node.h | 174 + paddle/fluid/framework/ir/node_test.cc | 80 + paddle/fluid/framework/ir/pass.cc | 52 + paddle/fluid/framework/ir/pass.h | 232 + paddle/fluid/framework/ir/pass_builder.cc | 43 + paddle/fluid/framework/ir/pass_builder.h | 49 + paddle/fluid/framework/ir/pass_test.cc | 111 + .../ir/quant_conv2d_dequant_fuse_pass.cc | 208 + .../ir/quant_conv2d_dequant_fuse_pass.h | 35 + .../ir/repeated_fc_relu_fuse_pass.cc | 384 + .../framework/ir/repeated_fc_relu_fuse_pass.h | 41 + .../ir/runtime_context_cache_pass.cc | 37 + .../framework/ir/runtime_context_cache_pass.h | 31 + .../framework/ir/seq_concat_fc_fuse_pass.cc | 255 + .../framework/ir/seq_concat_fc_fuse_pass.h | 35 + .../ir/seqconv_eltadd_relu_fuse_pass.cc | 99 + .../ir/seqconv_eltadd_relu_fuse_pass.h | 38 + .../framework/ir/seqpool_concat_fuse_pass.cc | 213 + .../framework/ir/seqpool_concat_fuse_pass.h | 52 + .../ir/seqpool_concat_fuse_pass_tester.cc | 198 + .../ir/shuffle_channel_detect_pass.cc | 93 + .../ir/shuffle_channel_detect_pass.h | 34 + ...lify_anakin_priorbox_detection_out_pass.cc | 233 + ...plify_anakin_priorbox_detection_out_pass.h | 39 + .../framework/ir/squared_mat_sub_fuse_pass.cc | 377 + .../framework/ir/squared_mat_sub_fuse_pass.h | 41 + .../framework/ir/sync_batch_norm_pass.cc | 45 + .../ir/sync_batch_norm_pass_tester.cc | 80 + .../ir/transpose_flatten_concat_fuse_pass.cc | 137 + .../ir/transpose_flatten_concat_fuse_pass.h | 39 + paddle/fluid/framework/library_type.h | 72 + paddle/fluid/framework/lod_rank_table.cc | 58 + paddle/fluid/framework/lod_rank_table.h | 61 + paddle/fluid/framework/lod_tensor.cc | 432 + paddle/fluid/framework/lod_tensor.h | 242 + paddle/fluid/framework/lod_tensor_array.h | 25 + paddle/fluid/framework/lod_tensor_test.cc | 332 + paddle/fluid/framework/lod_tensor_test.cu | 72 + paddle/fluid/framework/mixed_vector.h | 537 + paddle/fluid/framework/mixed_vector_test.cc | 72 + paddle/fluid/framework/mixed_vector_test.cu | 75 + paddle/fluid/framework/multi_trainer.cc | 80 + paddle/fluid/framework/naive_executor.cc | 131 + paddle/fluid/framework/naive_executor.h | 69 + paddle/fluid/framework/naive_executor_test.cc | 70 + .../framework/no_need_buffer_vars_inference.h | 60 + paddle/fluid/framework/op_desc.cc | 822 + paddle/fluid/framework/op_desc.h | 153 + paddle/fluid/framework/op_info.cc | 43 + paddle/fluid/framework/op_info.h | 123 + paddle/fluid/framework/op_kernel_type.cc | 54 + paddle/fluid/framework/op_kernel_type.h | 110 + paddle/fluid/framework/op_kernel_type_test.cc | 57 + paddle/fluid/framework/op_proto_maker.cc | 94 + paddle/fluid/framework/op_proto_maker.h | 108 + paddle/fluid/framework/op_proto_maker_test.cc | 49 + paddle/fluid/framework/op_registry.cc | 68 + paddle/fluid/framework/op_registry.h | 326 + paddle/fluid/framework/op_registry_test.cc | 370 + paddle/fluid/framework/operator.cc | 1214 ++ paddle/fluid/framework/operator.h | 511 + .../fluid/framework/operator_kernel_configs.h | 118 + paddle/fluid/framework/operator_test.cc | 317 + paddle/fluid/framework/parallel_executor.cc | 782 + paddle/fluid/framework/parallel_executor.h | 93 + paddle/fluid/framework/pipeline_trainer.cc | 265 + paddle/fluid/framework/program_desc.cc | 195 + paddle/fluid/framework/program_desc.h | 90 + paddle/fluid/framework/program_desc_test.cc | 178 + paddle/fluid/framework/proto_desc.h | 26 + paddle/fluid/framework/prune.cc | 187 + paddle/fluid/framework/prune.h | 26 + paddle/fluid/framework/prune_test.cc | 151 + paddle/fluid/framework/pull_dense_worker.cc | 142 + paddle/fluid/framework/python_headers.h | 34 + paddle/fluid/framework/reader.cc | 77 + paddle/fluid/framework/reader.h | 151 + paddle/fluid/framework/reader_test.cc | 52 + paddle/fluid/framework/revision.cc | 29 + paddle/fluid/framework/revision.h | 23 + paddle/fluid/framework/rw_lock.h | 99 + paddle/fluid/framework/rw_lock_test.cc | 81 + paddle/fluid/framework/scope.cc | 256 + paddle/fluid/framework/scope.h | 154 + paddle/fluid/framework/scope_pool.cc | 54 + paddle/fluid/framework/scope_pool.h | 46 + paddle/fluid/framework/scope_test.cc | 71 + paddle/fluid/framework/section_worker.cc | 411 + paddle/fluid/framework/selected_rows.cc | 234 + paddle/fluid/framework/selected_rows.h | 167 + paddle/fluid/framework/selected_rows_test.cc | 202 + paddle/fluid/framework/shape_inference.cc | 46 + paddle/fluid/framework/shape_inference.h | 85 + paddle/fluid/framework/tensor.cc | 113 + paddle/fluid/framework/tensor.h | 211 + paddle/fluid/framework/tensor_impl.h | 81 + paddle/fluid/framework/tensor_test.cc | 259 + paddle/fluid/framework/tensor_util.cc | 541 + paddle/fluid/framework/tensor_util.cu | 1 + paddle/fluid/framework/tensor_util.h | 157 + paddle/fluid/framework/tensor_util_test.cc | 396 + paddle/fluid/framework/tensor_util_test.cu | 260 + paddle/fluid/framework/threadpool.cc | 113 + paddle/fluid/framework/threadpool.h | 144 + paddle/fluid/framework/threadpool_test.cc | 61 + paddle/fluid/framework/trainer.cc | 23 + paddle/fluid/framework/trainer.h | 148 + paddle/fluid/framework/trainer_desc.proto | 120 + paddle/fluid/framework/trainer_factory.cc | 70 + paddle/fluid/framework/trainer_factory.h | 30 + paddle/fluid/framework/trainer_test.cc | 27 + .../fluid/framework/transfer_scope_cache.cc | 99 + paddle/fluid/framework/transfer_scope_cache.h | 39 + paddle/fluid/framework/tuple.h | 71 + paddle/fluid/framework/tuple_test.cc | 65 + paddle/fluid/framework/type_defs.h | 70 + paddle/fluid/framework/unroll_array_ops.h | 132 + .../fluid/framework/unroll_array_ops_test.cc | 83 + paddle/fluid/framework/var_desc.cc | 275 + paddle/fluid/framework/var_desc.h | 124 + paddle/fluid/framework/var_type.h | 70 + paddle/fluid/framework/var_type_inference.h | 156 + .../framework/var_type_inference_test.cc | 116 + paddle/fluid/framework/var_type_traits.cc | 123 + paddle/fluid/framework/var_type_traits.h | 191 + .../fluid/framework/var_type_traits_test.cc | 122 + paddle/fluid/framework/variable.h | 104 + paddle/fluid/framework/variable_helper.cc | 83 + paddle/fluid/framework/variable_helper.h | 25 + paddle/fluid/framework/variable_test.cc | 43 + paddle/fluid/framework/version.cc | 36 + paddle/fluid/framework/version.h | 47 + paddle/fluid/framework/version_test.cc | 30 + paddle/fluid/imperative/CMakeLists.txt | 10 + paddle/fluid/imperative/README.md | 212 + paddle/fluid/imperative/backward_strategy.h | 38 + paddle/fluid/imperative/engine.cc | 53 + paddle/fluid/imperative/engine.h | 39 + paddle/fluid/imperative/flags.cc | 30 + paddle/fluid/imperative/flags.h | 26 + paddle/fluid/imperative/layer.cc | 469 + paddle/fluid/imperative/layer.h | 550 + paddle/fluid/imperative/nccl_context.cc | 134 + paddle/fluid/imperative/nccl_context.h | 81 + paddle/fluid/imperative/nccl_context_test.cc | 52 + paddle/fluid/imperative/profiler.cc | 62 + paddle/fluid/imperative/profiler.h | 25 + paddle/fluid/imperative/tracer.cc | 292 + paddle/fluid/imperative/tracer.h | 60 + paddle/fluid/imperative/type_defs.h | 43 + paddle/fluid/inference/CMakeLists.txt | 108 + paddle/fluid/inference/anakin/CMakeLists.txt | 5 + .../inference/anakin/convert/CMakeLists.txt | 23 + .../inference/anakin/convert/activation.cc | 64 + .../inference/anakin/convert/activation.h | 72 + .../anakin/convert/affine_channel.cc | 55 + .../inference/anakin/convert/affine_channel.h | 40 + .../inference/anakin/convert/batch_norm.cc | 85 + .../inference/anakin/convert/batch_norm.h | 37 + .../fluid/inference/anakin/convert/concat.cc | 41 + .../fluid/inference/anakin/convert/concat.h | 39 + .../fluid/inference/anakin/convert/conv2d.cc | 109 + .../fluid/inference/anakin/convert/conv2d.h | 37 + .../inference/anakin/convert/conv2d_fusion.cc | 115 + .../inference/anakin/convert/conv2d_fusion.h | 37 + .../anakin/convert/density_prior_box.cc | 112 + .../anakin/convert/density_prior_box.h | 40 + .../inference/anakin/convert/detection_out.cc | 69 + .../inference/anakin/convert/detection_out.h | 39 + .../fluid/inference/anakin/convert/dropout.cc | 55 + .../fluid/inference/anakin/convert/dropout.h | 39 + .../inference/anakin/convert/elementwise.cc | 75 + .../inference/anakin/convert/elementwise.h | 55 + paddle/fluid/inference/anakin/convert/fc.cc | 122 + paddle/fluid/inference/anakin/convert/fc.h | 51 + .../fluid/inference/anakin/convert/flatten.cc | 48 + .../fluid/inference/anakin/convert/flatten.h | 37 + .../fluid/inference/anakin/convert/helper.cc | 32 + .../fluid/inference/anakin/convert/helper.h | 95 + .../inference/anakin/convert/im2sequence.cc | 58 + .../inference/anakin/convert/im2sequence.h | 39 + .../inference/anakin/convert/op_converter.h | 237 + .../fluid/inference/anakin/convert/pool2d.cc | 74 + .../fluid/inference/anakin/convert/pool2d.h | 39 + paddle/fluid/inference/anakin/convert/relu.cc | 61 + paddle/fluid/inference/anakin/convert/relu.h | 51 + .../fluid/inference/anakin/convert/reshape.cc | 49 + .../fluid/inference/anakin/convert/reshape.h | 37 + .../inference/anakin/convert/roi_align.cc | 54 + .../inference/anakin/convert/roi_align.h | 39 + .../fluid/inference/anakin/convert/scale.cc | 52 + paddle/fluid/inference/anakin/convert/scale.h | 39 + .../anakin/convert/shuffle_channel.cc | 47 + .../anakin/convert/shuffle_channel.h | 38 + .../fluid/inference/anakin/convert/softmax.cc | 47 + .../fluid/inference/anakin/convert/softmax.h | 37 + .../fluid/inference/anakin/convert/split.cc | 59 + paddle/fluid/inference/anakin/convert/split.h | 39 + paddle/fluid/inference/anakin/convert/sum.cc | 50 + paddle/fluid/inference/anakin/convert/sum.h | 39 + .../anakin/convert/test_activation_op.cc | 92 + .../anakin/convert/test_affine_channel_op.cc | 72 + .../anakin/convert/test_batch_norm_op.cc | 87 + .../anakin/convert/test_concat_op.cc | 67 + .../anakin/convert/test_conv2d_op.cc | 75 + .../anakin/convert/test_dropout_op.cc | 69 + .../anakin/convert/test_elementwise_op.cc | 81 + .../inference/anakin/convert/test_fc_op.cc | 64 + .../anakin/convert/test_flatten_op.cc | 64 + .../anakin/convert/test_im2sequence_op.cc | 55 + .../anakin/convert/test_pool2d_op.cc | 119 + .../inference/anakin/convert/test_relu_op.cc | 70 + .../anakin/convert/test_reshape_op.cc | 102 + .../anakin/convert/test_softmax_op.cc | 63 + .../inference/anakin/convert/test_split_op.cc | 119 + .../inference/anakin/convert/test_sum_op.cc | 64 + .../anakin/convert/test_transpose_op.cc | 100 + .../inference/anakin/convert/transpose.cc | 52 + .../inference/anakin/convert/transpose.h | 37 + .../inference/anakin/convert/ut_helper.h | 227 + paddle/fluid/inference/anakin/engine.cc | 207 + paddle/fluid/inference/anakin/engine.h | 168 + paddle/fluid/inference/anakin/op_teller.cc | 74 + paddle/fluid/inference/anakin/op_teller.h | 70 + .../inference/anakin/test_anakin_engine.cc | 91 + .../fluid/inference/analysis/CMakeLists.txt | 68 + paddle/fluid/inference/analysis/README.md | 58 + .../fluid/inference/analysis/analysis_pass.cc | 15 + .../fluid/inference/analysis/analysis_pass.h | 53 + paddle/fluid/inference/analysis/analyzer.cc | 42 + paddle/fluid/inference/analysis/analyzer.h | 62 + .../inference/analysis/analyzer_tester.cc | 98 + paddle/fluid/inference/analysis/argument.cc | 15 + paddle/fluid/inference/analysis/argument.h | 218 + paddle/fluid/inference/analysis/device.h | 24 + paddle/fluid/inference/analysis/dot.h | 161 + paddle/fluid/inference/analysis/dot_tester.cc | 61 + paddle/fluid/inference/analysis/flags.h | 22 + paddle/fluid/inference/analysis/helper.cc | 80 + paddle/fluid/inference/analysis/helper.h | 257 + .../inference/analysis/ir_pass_manager.cc | 184 + .../inference/analysis/ir_pass_manager.h | 63 + .../analysis/ir_passes/CMakeLists.txt | 28 + .../ir_passes/anakin_subgraph_pass.cc | 283 + .../analysis/ir_passes/anakin_subgraph_pass.h | 51 + .../analysis/ir_passes/subgraph_detector.cc | 467 + .../analysis/ir_passes/subgraph_detector.h | 160 + .../analysis/ir_passes/subgraph_util.cc | 169 + .../analysis/ir_passes/subgraph_util.h | 50 + .../ir_passes/tensorrt_subgraph_pass.cc | 298 + .../ir_passes/tensorrt_subgraph_pass.h | 42 + .../inference/analysis/passes/CMakeLists.txt | 22 + .../adjust_cudnn_workspace_size_pass.cc | 43 + .../passes/adjust_cudnn_workspace_size_pass.h | 41 + .../passes/inference_op_replace_pass.cc | 47 + .../passes/inference_op_replace_pass.h | 43 + .../analysis/passes/ir_analysis_pass.cc | 53 + .../analysis/passes/ir_analysis_pass.h | 40 + .../analysis/passes/ir_graph_build_pass.cc | 88 + .../analysis/passes/ir_graph_build_pass.h | 49 + .../passes/ir_graph_to_program_pass.cc | 49 + .../passes/ir_graph_to_program_pass.h | 32 + .../ir_params_sync_among_devices_pass.cc | 86 + .../ir_params_sync_among_devices_pass.h | 40 + .../analysis/passes/memory_optimize_pass.cc | 845 ++ .../analysis/passes/memory_optimize_pass.h | 112 + .../fluid/inference/analysis/passes/passes.cc | 51 + .../fluid/inference/analysis/passes/passes.h | 44 + paddle/fluid/inference/analysis/ut_helper.h | 35 + paddle/fluid/inference/api/CMakeLists.txt | 82 + paddle/fluid/inference/api/README.md | 17 + paddle/fluid/inference/api/analysis_config.cc | 460 + .../fluid/inference/api/analysis_predictor.cc | 1001 ++ .../fluid/inference/api/analysis_predictor.h | 187 + .../api/analysis_predictor_tester.cc | 491 + paddle/fluid/inference/api/api.cc | 113 + .../fluid/inference/api/api_anakin_engine.cc | 454 + .../fluid/inference/api/api_anakin_engine.h | 126 + paddle/fluid/inference/api/api_impl.cc | 341 + paddle/fluid/inference/api/api_impl.h | 81 + paddle/fluid/inference/api/api_impl_tester.cc | 309 + paddle/fluid/inference/api/api_tester.cc | 70 + paddle/fluid/inference/api/demo_ci/.gitignore | 1 + .../inference/api/demo_ci/CMakeLists.txt | 163 + paddle/fluid/inference/api/demo_ci/README.md | 26 + paddle/fluid/inference/api/demo_ci/clean.sh | 4 + paddle/fluid/inference/api/demo_ci/run.sh | 126 + .../api/demo_ci/simple_on_word2vec.cc | 144 + .../api/demo_ci/trt_mobilenet_demo.cc | 80 + paddle/fluid/inference/api/demo_ci/utils.h | 140 + .../fluid/inference/api/demo_ci/vis_demo.cc | 90 + .../api/demo_ci/windows_inference.md | 19 + .../inference/api/details/CMakeLists.txt | 18 + .../api/details/reset_tensor_array.cc | 73 + .../api/details/reset_tensor_array.h | 57 + .../inference/api/details/zero_copy_tensor.cc | 196 + .../api/details/zero_copy_tensor_dummy.cc | 48 + paddle/fluid/inference/api/helper.cc | 44 + paddle/fluid/inference/api/helper.h | 341 + paddle/fluid/inference/api/high_level_api.md | 60 + .../fluid/inference/api/high_level_api_cn.md | 87 + .../fluid/inference/api/mkldnn_quantizer.cc | 477 + paddle/fluid/inference/api/mkldnn_quantizer.h | 104 + .../inference/api/mkldnn_quantizer_config.cc | 48 + .../inference/api/paddle_anakin_config.h | 49 + .../inference/api/paddle_analysis_config.h | 336 + paddle/fluid/inference/api/paddle_api.h | 357 + .../inference/api/paddle_inference_api.h | 33 + .../inference/api/paddle_inference_pass.h | 33 + .../api/paddle_mkldnn_quantizer_config.h | 105 + .../inference/api/paddle_pass_builder.cc | 216 + .../fluid/inference/api/paddle_pass_builder.h | 157 + paddle/fluid/inference/check_symbol.sh | 12 + paddle/fluid/inference/engine.h | 63 + paddle/fluid/inference/io.cc | 194 + paddle/fluid/inference/io.h | 55 + paddle/fluid/inference/paddle_fluid.map | 7 + paddle/fluid/inference/paddle_fluid.sym | 1 + .../fluid/inference/tensorrt/CMakeLists.txt | 6 + .../inference/tensorrt/convert/CMakeLists.txt | 44 + .../tensorrt/convert/activation_op.cc | 89 + .../tensorrt/convert/batch_norm_op.cc | 132 + .../inference/tensorrt/convert/concat_op.cc | 53 + .../inference/tensorrt/convert/conv2d_op.cc | 156 + .../inference/tensorrt/convert/dropout_op.cc | 68 + .../tensorrt/convert/elementwise_op.cc | 264 + .../fluid/inference/tensorrt/convert/fc_op.cc | 151 + .../tensorrt/convert/io_converter.cc | 74 + .../inference/tensorrt/convert/io_converter.h | 82 + .../tensorrt/convert/leaky_relu_op.cc | 89 + .../inference/tensorrt/convert/mul_op.cc | 52 + .../inference/tensorrt/convert/op_converter.h | 231 + .../inference/tensorrt/convert/pad_op.cc | 62 + .../inference/tensorrt/convert/pool2d_op.cc | 167 + .../inference/tensorrt/convert/prelu_op.cc | 70 + .../inference/tensorrt/convert/softmax_op.cc | 53 + .../inference/tensorrt/convert/split_op.cc | 67 + .../tensorrt/convert/test_activation_op.cc | 55 + .../tensorrt/convert/test_batch_norm_op.cc | 71 + .../tensorrt/convert/test_concat_op.cc | 49 + .../tensorrt/convert/test_conv2d_op.cc | 90 + .../tensorrt/convert/test_dropout_op.cc | 58 + .../tensorrt/convert/test_elementwise_op.cc | 107 + .../inference/tensorrt/convert/test_fc_op.cc | 46 + .../tensorrt/convert/test_io_converter.cc | 76 + .../tensorrt/convert/test_leaky_relu_op.cc | 48 + .../inference/tensorrt/convert/test_mul_op.cc | 49 + .../tensorrt/convert/test_op_converter.cc | 71 + .../inference/tensorrt/convert/test_pad_op.cc | 52 + .../tensorrt/convert/test_pool2d_op.cc | 74 + .../tensorrt/convert/test_prelu_op.cc | 93 + .../tensorrt/convert/test_softmax_op.cc | 49 + .../tensorrt/convert/test_split_op.cc | 115 + .../inference/tensorrt/convert/ut_helper.h | 241 + paddle/fluid/inference/tensorrt/engine.cc | 225 + paddle/fluid/inference/tensorrt/engine.h | 257 + paddle/fluid/inference/tensorrt/helper.h | 108 + paddle/fluid/inference/tensorrt/op_teller.cc | 49 + paddle/fluid/inference/tensorrt/op_teller.h | 70 + .../inference/tensorrt/plugin/CMakeLists.txt | 5 + .../tensorrt/plugin/avg_pool_op_plugin.cu | 71 + .../tensorrt/plugin/avg_pool_op_plugin.h | 115 + .../tensorrt/plugin/elementwise_op_plugin.cu | 145 + .../tensorrt/plugin/elementwise_op_plugin.h | 93 + .../tensorrt/plugin/prelu_op_plugin.cu | 82 + .../tensorrt/plugin/prelu_op_plugin.h | 87 + .../tensorrt/plugin/split_op_plugin.cu | 184 + .../tensorrt/plugin/split_op_plugin.h | 79 + .../inference/tensorrt/plugin/trt_plugin.cc | 61 + .../inference/tensorrt/plugin/trt_plugin.h | 118 + .../tensorrt/plugin/trt_plugin_factory.cc | 48 + .../tensorrt/plugin/trt_plugin_factory.h | 78 + .../tensorrt/plugin/trt_plugin_utils.h | 134 + .../fluid/inference/tensorrt/test_engine.cc | 232 + .../fluid/inference/tensorrt/test_tensorrt.cc | 155 + .../inference/tensorrt/trt_int8_calibrator.cc | 147 + .../inference/tensorrt/trt_int8_calibrator.h | 128 + .../fluid/inference/tests/api/CMakeLists.txt | 271 + .../inference/tests/api/anakin_mlu_tester.cc | 98 + .../inference/tests/api/anakin_rnn2_tester.cc | 261 + .../tests/api/analyzer_bert_tester.cc | 296 + .../tests/api/analyzer_dam_tester.cc | 360 + .../analyzer_image_classification_tester.cc | 146 + ...alyzer_int8_image_classification_tester.cc | 171 + .../analyzer_int8_object_detection_tester.cc | 281 + .../tests/api/analyzer_lac_tester.cc | 193 + .../tests/api/analyzer_mm_dnn_tester.cc | 268 + .../tests/api/analyzer_ner_tester.cc | 177 + .../tests/api/analyzer_pyramid_dnn_tester.cc | 204 + .../tests/api/analyzer_rnn1_tester.cc | 309 + .../tests/api/analyzer_rnn2_tester.cc | 173 + .../tests/api/analyzer_save_model_tester.cc | 57 + .../tests/api/analyzer_seq_conv1_tester.cc | 175 + .../tests/api/analyzer_seq_pool1_tester.cc | 232 + .../analyzer_text_classification_tester.cc | 142 + .../tests/api/analyzer_transformer_tester.cc | 238 + .../tests/api/analyzer_vis_tester.cc | 163 + .../inference/tests/api/config_printer.h | 89 + .../api/full_ILSVRC2012_val_preprocess.py | 213 + .../api/full_pascalvoc_test_preprocess.py | 187 + .../tests/api/int8_mkldnn_quantization.md | 105 + .../fluid/inference/tests/api/tester_helper.h | 730 + .../inference/tests/api/trt_mobilenet_test.cc | 49 + .../inference/tests/api/trt_resnet50_test.cc | 30 + .../inference/tests/api/trt_resnext_test.cc | 30 + .../inference/tests/api/trt_test_helper.h | 137 + .../fluid/inference/tests/book/CMakeLists.txt | 50 + .../tests/book/test_inference_fit_a_line.cc | 89 + .../test_inference_image_classification.cc | 98 + .../test_inference_label_semantic_roles.cc | 87 + .../tests/book/test_inference_nlp.cc | 243 + .../book/test_inference_recognize_digits.cc | 68 + .../book/test_inference_recommender_system.cc | 87 + .../test_inference_rnn_encoder_decoder.cc | 65 + .../test_inference_understand_sentiment.cc | 63 + .../tests/book/test_inference_word2vec.cc | 68 + paddle/fluid/inference/tests/test.cmake | 82 + paddle/fluid/inference/tests/test_helper.h | 252 + .../tests/test_multi_thread_helper.h | 90 + paddle/fluid/inference/utils/CMakeLists.txt | 2 + paddle/fluid/inference/utils/benchmark.cc | 49 + paddle/fluid/inference/utils/benchmark.h | 54 + .../fluid/inference/utils/benchmark_tester.cc | 39 + paddle/fluid/inference/utils/singleton.h | 77 + paddle/fluid/memory/CMakeLists.txt | 12 + paddle/fluid/memory/allocation/CMakeLists.txt | 57 + .../memory/allocation/aligned_allocator.cc | 58 + .../memory/allocation/aligned_allocator.h | 43 + paddle/fluid/memory/allocation/allocator.cc | 30 + paddle/fluid/memory/allocation/allocator.h | 203 + .../memory/allocation/allocator_facade.cc | 176 + .../memory/allocation/allocator_facade.h | 55 + .../allocator_facade_abs_flags_test.cc | 103 + .../allocator_facade_frac_flags_test.cc | 96 + .../memory/allocation/allocator_strategy.cc | 50 + .../memory/allocation/allocator_strategy.h | 30 + .../auto_growth_best_fit_allocator.cc | 124 + .../auto_growth_best_fit_allocator.h | 86 + ...o_growth_best_fit_allocator_facade_test.cc | 148 + .../memory/allocation/best_fit_allocator.cc | 170 + .../memory/allocation/best_fit_allocator.h | 132 + .../allocation/best_fit_allocator_test.cc | 139 + .../allocation/best_fit_allocator_test.cu | 88 + .../memory/allocation/buffered_allocator.cc | 77 + .../memory/allocation/buffered_allocator.h | 58 + .../allocation/buffered_allocator_test.cc | 145 + .../fluid/memory/allocation/cpu_allocator.cc | 47 + .../fluid/memory/allocation/cpu_allocator.h | 45 + .../fluid/memory/allocation/cuda_allocator.cc | 48 + .../fluid/memory/allocation/cuda_allocator.h | 40 + .../memory/allocation/locked_allocator.cc | 47 + .../memory/allocation/locked_allocator.h | 41 + .../allocation/naive_best_fit_allocator.cc | 312 + .../allocation/naive_best_fit_allocator.h | 41 + .../memory/allocation/pinned_allocator.cc | 34 + .../memory/allocation/pinned_allocator.h | 34 + .../memory/allocation/retry_allocator.cc | 64 + .../fluid/memory/allocation/retry_allocator.h | 58 + .../memory/allocation/retry_allocator_test.cc | 98 + .../allocation/test_aligned_allocator.cc | 81 + paddle/fluid/memory/detail/CMakeLists.txt | 13 + paddle/fluid/memory/detail/buddy_allocator.cc | 352 + paddle/fluid/memory/detail/buddy_allocator.h | 117 + .../memory/detail/buddy_allocator_test.cc | 133 + paddle/fluid/memory/detail/memory_block.cc | 156 + paddle/fluid/memory/detail/memory_block.h | 130 + .../fluid/memory/detail/memory_block_desc.cc | 74 + paddle/fluid/memory/detail/meta_cache.cc | 58 + .../fluid/memory/detail/system_allocator.cc | 219 + paddle/fluid/memory/detail/system_allocator.h | 72 + .../memory/detail/system_allocator_test.cc | 65 + paddle/fluid/memory/malloc.cc | 33 + paddle/fluid/memory/malloc.h | 32 + paddle/fluid/memory/memcpy.cc | 169 + paddle/fluid/memory/memcpy.h | 58 + paddle/fluid/memory/memory.h | 18 + paddle/fluid/memory/pinned_memory_test.cu | 146 + .../fluid/op_use_default_grad_op_maker.spec | 43 + paddle/fluid/operators/CMakeLists.txt | 118 + paddle/fluid/operators/activation_cudnn.cu.cc | 40 + .../fluid/operators/activation_cudnn_op.cu.cc | 189 + paddle/fluid/operators/activation_op.cc | 890 ++ paddle/fluid/operators/activation_op.cu | 88 + paddle/fluid/operators/activation_op.h | 1551 ++ .../operators/add_position_encoding_op.cc | 124 + .../operators/add_position_encoding_op.h | 106 + paddle/fluid/operators/affine_channel_op.cc | 334 + paddle/fluid/operators/affine_channel_op.cu | 194 + .../operators/affine_grid_cudnn_op.cu.cc | 112 + paddle/fluid/operators/affine_grid_op.cc | 234 + paddle/fluid/operators/affine_grid_op.h | 178 + paddle/fluid/operators/anakin/CMakeLists.txt | 2 + .../operators/anakin/anakin_engine_op.cc | 53 + .../fluid/operators/anakin/anakin_engine_op.h | 138 + paddle/fluid/operators/arg_max_op.cc | 32 + paddle/fluid/operators/arg_max_op.cu | 29 + paddle/fluid/operators/arg_min_max_op_base.h | 162 + paddle/fluid/operators/arg_min_op.cc | 32 + paddle/fluid/operators/arg_min_op.cu | 29 + paddle/fluid/operators/argsort_op.cc | 87 + paddle/fluid/operators/argsort_op.cu | 151 + paddle/fluid/operators/argsort_op.h | 81 + paddle/fluid/operators/array_operator.h | 58 + .../fluid/operators/array_to_lod_tensor_op.cc | 222 + paddle/fluid/operators/assign_op.cc | 145 + paddle/fluid/operators/assign_value_op.cc | 76 + paddle/fluid/operators/assign_value_op.cu.cc | 19 + paddle/fluid/operators/assign_value_op.h | 51 + paddle/fluid/operators/attention_lstm_op.cc | 426 + paddle/fluid/operators/attention_lstm_op.h | 41 + .../fluid/operators/average_accumulates_op.cc | 214 + .../fluid/operators/average_accumulates_op.cu | 67 + .../fluid/operators/average_accumulates_op.h | 114 + paddle/fluid/operators/batch_norm_op.cc | 638 + paddle/fluid/operators/batch_norm_op.cu | 418 + paddle/fluid/operators/batch_norm_op.h | 121 + paddle/fluid/operators/batch_size_like.h | 81 + .../fluid/operators/beam_search_decode_op.cc | 223 + .../fluid/operators/beam_search_decode_op.h | 215 + .../operators/beam_search_decode_op_test.cc | 129 + paddle/fluid/operators/beam_search_op.cc | 145 + paddle/fluid/operators/beam_search_op.cu.cc | 24 + paddle/fluid/operators/beam_search_op.h | 56 + .../fluid/operators/benchmark/CMakeLists.txt | 3 + paddle/fluid/operators/benchmark/op_tester.cc | 522 + paddle/fluid/operators/benchmark/op_tester.h | 75 + .../operators/benchmark/op_tester_config.cc | 229 + .../operators/benchmark/op_tester_config.h | 83 + .../operators/bilinear_tensor_product_op.cc | 195 + .../operators/bilinear_tensor_product_op.cu | 29 + .../operators/bilinear_tensor_product_op.h | 183 + paddle/fluid/operators/bpr_loss_op.cc | 165 + paddle/fluid/operators/bpr_loss_op.h | 118 + paddle/fluid/operators/cast_op.cc | 95 + paddle/fluid/operators/cast_op.cu | 25 + paddle/fluid/operators/cast_op.h | 66 + paddle/fluid/operators/chunk_eval_op.cc | 178 + paddle/fluid/operators/chunk_eval_op.h | 266 + paddle/fluid/operators/clip_by_norm_op.cc | 23 + paddle/fluid/operators/clip_by_norm_op.cu | 20 + paddle/fluid/operators/clip_by_norm_op.h | 133 + paddle/fluid/operators/clip_op.cc | 106 + paddle/fluid/operators/clip_op.cu | 21 + paddle/fluid/operators/clip_op.h | 118 + paddle/fluid/operators/coalesce_tensor_op.cc | 234 + .../fluid/operators/collective/CMakeLists.txt | 39 + .../operators/collective/c_allgather_op.cc | 86 + .../operators/collective/c_allgather_op.cu.cc | 77 + .../operators/collective/c_allgather_op.h | 38 + .../collective/c_allreduce_max_op.cc | 39 + .../collective/c_allreduce_max_op.cu.cc | 25 + .../collective/c_allreduce_min_op.cc | 39 + .../collective/c_allreduce_min_op.cu.cc | 25 + .../operators/collective/c_allreduce_op.h | 139 + .../collective/c_allreduce_prod_op.cc | 39 + .../collective/c_allreduce_prod_op.cu.cc | 25 + .../collective/c_allreduce_sum_op.cc | 54 + .../collective/c_allreduce_sum_op.cu.cc | 25 + .../operators/collective/c_broadcast_op.cc | 70 + .../operators/collective/c_broadcast_op.cu.cc | 87 + .../operators/collective/c_broadcast_op.h | 37 + .../operators/collective/c_comm_init_op.cc | 87 + .../operators/collective/c_gen_nccl_id_op.cc | 150 + .../collective/c_reducescatter_op.cc | 91 + .../collective/c_reducescatter_op.cu.cc | 74 + .../operators/collective/c_reducescatter_op.h | 38 + .../collective/c_sync_calc_stream_op.cc | 75 + .../collective/c_sync_comm_stream_op.cc | 77 + paddle/fluid/operators/concat_op.cc | 215 + paddle/fluid/operators/concat_op.cu.cc | 32 + paddle/fluid/operators/concat_op.h | 126 + .../operators/controlflow/CMakeLists.txt | 7 + .../fluid/operators/controlflow/compare_op.cc | 139 + .../fluid/operators/controlflow/compare_op.cu | 24 + .../fluid/operators/controlflow/compare_op.h | 100 + .../controlflow/conditional_block_infer_op.cc | 74 + .../controlflow/conditional_block_op.cc | 189 + .../controlflow/conditional_block_op.h | 111 + paddle/fluid/operators/controlflow/feed_op.cc | 86 + .../fluid/operators/controlflow/fetch_op.cc | 89 + .../operators/controlflow/get_places_op.cc | 116 + .../fluid/operators/controlflow/logical_op.cc | 157 + .../fluid/operators/controlflow/logical_op.cu | 24 + .../fluid/operators/controlflow/logical_op.h | 94 + .../fluid/operators/controlflow/op_variant.cc | 72 + .../fluid/operators/controlflow/op_variant.h | 69 + .../controlflow/recurrent_op_helper.cc | 265 + .../controlflow/recurrent_op_helper.h | 52 + .../controlflow/tensor_array_read_write_op.cc | 232 + .../fluid/operators/controlflow/while_op.cc | 459 + .../operators/controlflow/while_op_helper.cc | 200 + .../operators/controlflow/while_op_helper.h | 43 + paddle/fluid/operators/conv_cudnn_helper.h | 413 + paddle/fluid/operators/conv_cudnn_op.cu.cc | 520 + paddle/fluid/operators/conv_cudnn_op_cache.h | 48 + paddle/fluid/operators/conv_fusion_op.cc | 108 + paddle/fluid/operators/conv_fusion_op.cu.cc | 235 + paddle/fluid/operators/conv_op.cc | 652 + paddle/fluid/operators/conv_op.cu.cc | 43 + paddle/fluid/operators/conv_op.h | 485 + paddle/fluid/operators/conv_shift_op.cc | 205 + paddle/fluid/operators/conv_shift_op.cu | 197 + paddle/fluid/operators/conv_shift_op.h | 33 + .../operators/conv_transpose_cudnn_op.cu.cc | 266 + paddle/fluid/operators/conv_transpose_op.cc | 422 + .../fluid/operators/conv_transpose_op.cu.cc | 42 + paddle/fluid/operators/conv_transpose_op.h | 391 + paddle/fluid/operators/cos_sim_op.cc | 175 + paddle/fluid/operators/cos_sim_op.cu | 21 + paddle/fluid/operators/cos_sim_op.h | 139 + paddle/fluid/operators/crf_decoding_op.cc | 137 + paddle/fluid/operators/crf_decoding_op.h | 107 + paddle/fluid/operators/crop_op.cc | 212 + paddle/fluid/operators/crop_op.cu | 20 + paddle/fluid/operators/crop_op.h | 175 + paddle/fluid/operators/cross_entropy_op.cc | 389 + paddle/fluid/operators/cross_entropy_op.cu | 39 + paddle/fluid/operators/cross_entropy_op.h | 264 + paddle/fluid/operators/ctc_align_op.cc | 91 + paddle/fluid/operators/ctc_align_op.cu | 100 + paddle/fluid/operators/ctc_align_op.h | 83 + paddle/fluid/operators/cudnn_lstm_op.cc | 247 + paddle/fluid/operators/cudnn_lstm_op.cu.cc | 261 + paddle/fluid/operators/cudnn_rnn_cache.h | 255 + paddle/fluid/operators/cum_op.h | 113 + paddle/fluid/operators/cumsum_op.cc | 81 + paddle/fluid/operators/cumsum_op.cu | 22 + paddle/fluid/operators/cvm_op.cc | 154 + paddle/fluid/operators/cvm_op.h | 126 + paddle/fluid/operators/data_norm_op.cc | 409 + paddle/fluid/operators/data_norm_op.h | 35 + paddle/fluid/operators/deformable_conv_op.cc | 278 + paddle/fluid/operators/deformable_conv_op.cu | 752 + .../operators/deformable_psroi_pooling_op.cc | 270 + .../operators/deformable_psroi_pooling_op.cu | 532 + .../operators/deformable_psroi_pooling_op.h | 488 + paddle/fluid/operators/delete_var_op.cc | 57 + paddle/fluid/operators/dequantize_op.cc | 45 + paddle/fluid/operators/dequantize_op.h | 54 + paddle/fluid/operators/detail/safe_ref.h | 45 + .../fluid/operators/detail/strided_memcpy.h | 114 + .../fluid/operators/detection/CMakeLists.txt | 61 + .../detection/anchor_generator_op.cc | 153 + .../detection/anchor_generator_op.cu | 132 + .../operators/detection/anchor_generator_op.h | 109 + paddle/fluid/operators/detection/bbox_util.h | 153 + .../operators/detection/bipartite_match_op.cc | 290 + .../fluid/operators/detection/box_clip_op.cc | 86 + .../fluid/operators/detection/box_clip_op.cu | 74 + .../fluid/operators/detection/box_clip_op.h | 50 + .../fluid/operators/detection/box_coder_op.cc | 192 + .../fluid/operators/detection/box_coder_op.cu | 209 + .../fluid/operators/detection/box_coder_op.h | 247 + .../detection/box_decoder_and_assign_op.cc | 169 + .../detection/box_decoder_and_assign_op.cu | 147 + .../detection/box_decoder_and_assign_op.h | 103 + .../detection/collect_fpn_proposals_op.cc | 108 + .../detection/collect_fpn_proposals_op.cu | 209 + .../detection/collect_fpn_proposals_op.h | 149 + .../detection/density_prior_box_op.cc | 180 + .../detection/density_prior_box_op.cu | 172 + .../detection/density_prior_box_op.h | 154 + .../detection/distribute_fpn_proposals_op.cc | 93 + .../detection/distribute_fpn_proposals_op.cu | 201 + .../detection/distribute_fpn_proposals_op.h | 147 + .../detection/generate_mask_labels_op.cc | 441 + .../detection/generate_proposal_labels_op.cc | 591 + .../detection/generate_proposals_op.cc | 500 + .../detection/generate_proposals_op.cu | 466 + paddle/fluid/operators/detection/gpc.cc | 2206 +++ paddle/fluid/operators/detection/gpc.h | 246 + .../operators/detection/iou_similarity_op.cc | 97 + .../operators/detection/iou_similarity_op.cu | 21 + .../operators/detection/iou_similarity_op.h | 92 + paddle/fluid/operators/detection/mask_util.cc | 229 + paddle/fluid/operators/detection/mask_util.h | 30 + .../operators/detection/mask_util_test.cc | 115 + .../detection/mine_hard_examples_op.cc | 341 + .../operators/detection/multiclass_nms_op.cc | 537 + paddle/fluid/operators/detection/poly_util.cc | 132 + paddle/fluid/operators/detection/poly_util.h | 73 + .../detection/polygon_box_transform_op.cc | 107 + .../detection/polygon_box_transform_op.cu | 76 + .../fluid/operators/detection/prior_box_op.cc | 245 + .../fluid/operators/detection/prior_box_op.cu | 183 + .../fluid/operators/detection/prior_box_op.h | 205 + .../retinanet_detection_output_op.cc | 566 + .../detection/roi_perspective_transform_op.cc | 653 + .../detection/roi_perspective_transform_op.cu | 511 + .../detection/rpn_target_assign_op.cc | 1035 ++ .../detection/sigmoid_focal_loss_op.cc | 208 + .../detection/sigmoid_focal_loss_op.cu | 181 + .../detection/sigmoid_focal_loss_op.h | 128 + .../operators/detection/target_assign_op.cc | 159 + .../operators/detection/target_assign_op.cu | 63 + .../operators/detection/target_assign_op.h | 141 + .../fluid/operators/detection/yolo_box_op.cc | 167 + .../fluid/operators/detection/yolo_box_op.cu | 120 + .../fluid/operators/detection/yolo_box_op.h | 149 + .../operators/detection/yolov3_loss_op.cc | 299 + .../operators/detection/yolov3_loss_op.h | 502 + paddle/fluid/operators/detection_map_op.cc | 198 + paddle/fluid/operators/detection_map_op.h | 480 + paddle/fluid/operators/dgc_clip_by_norm_op.cc | 67 + paddle/fluid/operators/dgc_clip_by_norm_op.cu | 20 + paddle/fluid/operators/dgc_clip_by_norm_op.h | 49 + paddle/fluid/operators/dgc_op.cc | 138 + paddle/fluid/operators/dgc_op.cu | 20 + paddle/fluid/operators/dgc_op.h | 132 + paddle/fluid/operators/diag_op.cc | 60 + paddle/fluid/operators/diag_op.cu | 23 + paddle/fluid/operators/diag_op.h | 59 + .../operators/distributed/CMakeLists.txt | 66 + .../async_sparse_param_update_recorder.cc | 27 + .../async_sparse_param_update_recorder.h | 183 + ...async_sparse_param_update_recorder_test.cc | 99 + .../operators/distributed/brpc/brpc_client.cc | 456 + .../operators/distributed/brpc/brpc_client.h | 173 + .../distributed/brpc/brpc_rdma_pool.cc | 84 + .../distributed/brpc/brpc_rdma_pool.h | 56 + .../distributed/brpc/brpc_sendrecvop_utils.cc | 207 + .../distributed/brpc/brpc_sendrecvop_utils.h | 49 + .../distributed/brpc/brpc_serde_test.cc | 175 + .../operators/distributed/brpc/brpc_server.cc | 403 + .../operators/distributed/brpc/brpc_server.h | 53 + .../brpc/brpc_variable_response.cc | 73 + .../distributed/brpc/brpc_variable_response.h | 67 + .../distributed/collective_client.cc | 59 + .../operators/distributed/collective_client.h | 93 + .../distributed/collective_server.cc | 74 + .../operators/distributed/collective_server.h | 110 + .../distributed/collective_server_test.cc | 130 + .../operators/distributed/communicator.cc | 325 + .../operators/distributed/communicator.h | 218 + .../distributed/communicator_test.cc | 110 + .../fluid/operators/distributed/distributed.h | 35 + .../operators/distributed/distributed_pb.h | 30 + .../grpc/grpc_bytebuffer_stream.cc | 88 + .../distributed/grpc/grpc_bytebuffer_stream.h | 170 + .../operators/distributed/grpc/grpc_client.cc | 481 + .../operators/distributed/grpc/grpc_client.h | 272 + .../operators/distributed/grpc/grpc_serde.cc | 162 + .../operators/distributed/grpc/grpc_serde.h | 52 + .../distributed/grpc/grpc_serde_test.cc | 224 + .../operators/distributed/grpc/grpc_server.cc | 594 + .../operators/distributed/grpc/grpc_server.h | 88 + .../operators/distributed/grpc/grpc_service.h | 136 + .../grpc/grpc_variable_response.cc | 330 + .../distributed/grpc/grpc_variable_response.h | 56 + .../distributed/parameter_prefetch.cc | 241 + .../distributed/parameter_prefetch.h | 83 + .../operators/distributed/parameter_recv.cc | 138 + .../operators/distributed/parameter_recv.h | 34 + .../operators/distributed/parameter_send.cc | 175 + .../operators/distributed/parameter_send.h | 35 + .../distributed/proto_encoder_helper.h | 153 + .../operators/distributed/request_handler.h | 244 + .../distributed/request_handler_impl.cc | 233 + .../distributed/request_handler_impl.h | 131 + .../fluid/operators/distributed/rpc_client.cc | 31 + .../fluid/operators/distributed/rpc_client.h | 121 + .../fluid/operators/distributed/rpc_common.h | 79 + .../fluid/operators/distributed/rpc_server.cc | 234 + .../fluid/operators/distributed/rpc_server.h | 137 + .../operators/distributed/rpc_server_test.cc | 183 + .../operators/distributed/send_recv.proto.in | 87 + .../operators/distributed/sendrecvop_utils.cc | 109 + .../operators/distributed/sendrecvop_utils.h | 93 + .../operators/distributed/varhandle_test.cc | 55 + .../distributed/variable_response.cc | 227 + .../operators/distributed/variable_response.h | 132 + .../operators/distributed_ops/CMakeLists.txt | 39 + .../operators/distributed_ops/allreduce_op.cc | 80 + .../distributed_ops/allreduce_op.cu.cc | 25 + .../operators/distributed_ops/allreduce_op.h | 86 + .../operators/distributed_ops/broadcast_op.cc | 76 + .../distributed_ops/broadcast_op.cu.cc | 81 + .../distributed_ops/checkpoint_notify_op.cc | 90 + .../operators/distributed_ops/fake_init_op.cc | 85 + .../distributed_ops/fetch_barrier_op.cc | 85 + .../distributed_ops/gen_nccl_id_op.cc | 286 + .../distributed_ops/listen_and_serv_op.cc | 523 + .../distributed_ops/listen_and_serv_op.h | 115 + .../operators/distributed_ops/merge_ids_op.cc | 132 + .../operators/distributed_ops/merge_ids_op.h | 105 + .../operators/distributed_ops/prefetch_op.cc | 100 + .../operators/distributed_ops/recv_op.cc | 150 + .../distributed_ops/ref_by_trainer_id_op.cc | 77 + .../ref_by_trainer_id_op.cu.cc | 26 + .../distributed_ops/ref_by_trainer_id_op.h | 48 + .../distributed_ops/send_barrier_op.cc | 91 + .../operators/distributed_ops/send_op.cc | 140 + .../distributed_ops/send_recv_op_test.cc | 255 + .../distributed_ops/send_recv_util.h | 73 + .../distributed_ops/split_byref_op.cc | 102 + .../distributed_ops/split_byref_op.cu.cc | 19 + .../distributed_ops/split_byref_op.h | 43 + .../operators/distributed_ops/split_ids_op.cc | 108 + .../operators/distributed_ops/split_ids_op.h | 125 + .../distributed_ops/test_send_nccl_id.cc | 106 + paddle/fluid/operators/dropout_op.cc | 159 + paddle/fluid/operators/dropout_op.cu | 129 + paddle/fluid/operators/dropout_op.h | 129 + paddle/fluid/operators/dropout_op_test.cc | 106 + paddle/fluid/operators/edit_distance_op.cc | 128 + paddle/fluid/operators/edit_distance_op.cu | 180 + paddle/fluid/operators/edit_distance_op.h | 121 + .../operators/elementwise/CMakeLists.txt | 4 + .../elementwise/elementwise_add_op.cc | 80 + .../elementwise/elementwise_add_op.cu | 39 + .../elementwise/elementwise_add_op.h | 191 + .../elementwise/elementwise_div_op.cc | 107 + .../elementwise/elementwise_div_op.cu | 45 + .../elementwise/elementwise_div_op.h | 192 + .../elementwise/elementwise_floordiv_op.cc | 38 + .../elementwise/elementwise_floordiv_op.cu | 23 + .../elementwise/elementwise_floordiv_op.h | 55 + .../elementwise/elementwise_max_op.cc | 69 + .../elementwise/elementwise_max_op.cu | 30 + .../elementwise/elementwise_max_op.h | 76 + .../elementwise/elementwise_min_op.cc | 69 + .../elementwise/elementwise_min_op.cu | 30 + .../elementwise/elementwise_min_op.h | 75 + .../elementwise/elementwise_mod_op.cc | 36 + .../elementwise/elementwise_mod_op.cu | 22 + .../elementwise/elementwise_mod_op.h | 55 + .../elementwise/elementwise_mul_op.cc | 103 + .../elementwise/elementwise_mul_op.cu | 97 + .../elementwise/elementwise_mul_op.h | 182 + .../operators/elementwise/elementwise_op.h | 402 + .../elementwise/elementwise_op_function.h | 1656 ++ .../elementwise/elementwise_pow_op.cc | 35 + .../elementwise/elementwise_pow_op.cu | 18 + .../elementwise/elementwise_pow_op.h | 45 + .../elementwise/elementwise_sub_op.cc | 80 + .../elementwise/elementwise_sub_op.cu | 45 + .../elementwise/elementwise_sub_op.h | 100 + .../mkldnn/elementwise_add_mkldnn_op.cc | 228 + .../mkldnn/elementwise_mul_mkldnn_op.cc | 189 + .../test_elementwise_add_op_inplace.cc | 136 + paddle/fluid/operators/expand_op.cc | 213 + paddle/fluid/operators/expand_op.cu | 25 + paddle/fluid/operators/expand_op.h | 208 + paddle/fluid/operators/fake_dequantize_op.cc | 199 + paddle/fluid/operators/fake_dequantize_op.cu | 119 + paddle/fluid/operators/fake_dequantize_op.h | 96 + paddle/fluid/operators/fake_quantize_op.cc | 532 + paddle/fluid/operators/fake_quantize_op.cu | 345 + paddle/fluid/operators/fake_quantize_op.h | 277 + paddle/fluid/operators/fc_op.cc | 173 + paddle/fluid/operators/fc_op.h | 68 + paddle/fluid/operators/fill_any_like_op.cc | 64 + paddle/fluid/operators/fill_any_like_op.cu | 27 + paddle/fluid/operators/fill_any_like_op.h | 60 + .../fill_constant_batch_size_like_op.cc | 68 + .../fill_constant_batch_size_like_op.cu.cc | 28 + .../fill_constant_batch_size_like_op.h | 45 + paddle/fluid/operators/fill_constant_op.cc | 90 + paddle/fluid/operators/fill_constant_op.cu.cc | 22 + paddle/fluid/operators/fill_constant_op.h | 64 + paddle/fluid/operators/fill_op.cc | 116 + paddle/fluid/operators/fill_zeros_like_op.cc | 105 + .../fluid/operators/fill_zeros_like_op.cu.cc | 38 + paddle/fluid/operators/fill_zeros_like_op.h | 36 + paddle/fluid/operators/flatten_op.cc | 303 + paddle/fluid/operators/fsp_op.cc | 128 + paddle/fluid/operators/fsp_op.cu | 24 + paddle/fluid/operators/fsp_op.h | 136 + paddle/fluid/operators/fused/CMakeLists.txt | 10 + .../fused/fused_elemwise_activation_op.cc | 357 + .../fused/fused_elemwise_activation_op.cu | 30 + .../fused/fused_elemwise_activation_op.h | 477 + .../fused/fused_embedding_fc_lstm_op.cc | 597 + .../fused/fused_embedding_fc_lstm_op.h | 41 + .../fused/fused_embedding_seq_pool_op.cc | 163 + .../fused/fused_embedding_seq_pool_op.h | 157 + .../fused/fusion_conv_inception_op.cc | 111 + .../fused/fusion_conv_inception_op.cu | 272 + paddle/fluid/operators/fused/fusion_gru_op.cc | 402 + paddle/fluid/operators/fused/fusion_gru_op.h | 41 + .../fluid/operators/fused/fusion_lstm_op.cc | 481 + paddle/fluid/operators/fused/fusion_lstm_op.h | 41 + .../fused/fusion_repeated_fc_relu_op.cc | 152 + .../fused/fusion_repeated_fc_relu_op.h | 41 + .../fused/fusion_seqconv_eltadd_relu_op.cc | 228 + .../fused/fusion_seqconv_eltadd_relu_op.h | 42 + .../fused/fusion_seqexpand_concat_fc_op.cc | 205 + .../fused/fusion_seqexpand_concat_fc_op.h | 42 + .../fused/fusion_seqpool_concat_op.cc | 134 + .../fused/fusion_seqpool_concat_op.h | 41 + .../fused/fusion_squared_mat_sub_op.cc | 144 + .../fused/fusion_squared_mat_sub_op.h | 42 + .../fusion_transpose_flatten_concat_op.cc | 114 + .../fusion_transpose_flatten_concat_op.cu.cc | 115 + .../fusion_transpose_flatten_concat_op.h | 50 + paddle/fluid/operators/gather.cu.h | 82 + paddle/fluid/operators/gather.h | 64 + paddle/fluid/operators/gather_op.cc | 144 + paddle/fluid/operators/gather_op.cu | 102 + paddle/fluid/operators/gather_op.h | 102 + paddle/fluid/operators/gather_test.cc | 54 + .../gaussian_random_batch_size_like_op.cc | 77 + paddle/fluid/operators/gaussian_random_op.cc | 135 + paddle/fluid/operators/gaussian_random_op.cu | 68 + .../get_tensor_from_selected_rows_op.cc | 114 + .../operators/grid_sampler_cudnn_op.cu.cc | 132 + paddle/fluid/operators/grid_sampler_op.cc | 208 + paddle/fluid/operators/grid_sampler_op.h | 330 + paddle/fluid/operators/group_norm_op.cc | 213 + paddle/fluid/operators/group_norm_op.cu | 306 + paddle/fluid/operators/group_norm_op.h | 192 + paddle/fluid/operators/gru_op.cc | 390 + paddle/fluid/operators/gru_op.cu.cc | 114 + paddle/fluid/operators/gru_op.h | 175 + paddle/fluid/operators/gru_unit_op.cc | 248 + paddle/fluid/operators/gru_unit_op.cu | 23 + paddle/fluid/operators/gru_unit_op.h | 253 + paddle/fluid/operators/hash_op.cc | 68 + paddle/fluid/operators/hash_op.h | 72 + .../operators/hierarchical_sigmoid_op.cc | 283 + .../fluid/operators/hierarchical_sigmoid_op.h | 261 + paddle/fluid/operators/hinge_loss_op.cc | 132 + paddle/fluid/operators/hinge_loss_op.cu | 22 + paddle/fluid/operators/hinge_loss_op.h | 71 + paddle/fluid/operators/huber_loss_op.cc | 147 + paddle/fluid/operators/huber_loss_op.cu | 22 + paddle/fluid/operators/huber_loss_op.h | 123 + paddle/fluid/operators/im2sequence_op.cc | 178 + paddle/fluid/operators/im2sequence_op.cu | 23 + paddle/fluid/operators/im2sequence_op.h | 194 + paddle/fluid/operators/increment_op.cc | 92 + paddle/fluid/operators/increment_op.cu | 22 + paddle/fluid/operators/increment_op.h | 39 + paddle/fluid/operators/interpolate_op.cc | 263 + paddle/fluid/operators/interpolate_op.cu | 365 + paddle/fluid/operators/interpolate_op.h | 336 + paddle/fluid/operators/is_empty_op.cc | 66 + paddle/fluid/operators/is_empty_op.cu.cc | 23 + paddle/fluid/operators/is_empty_op.h | 40 + paddle/fluid/operators/isfinite_op.cc | 114 + paddle/fluid/operators/isfinite_op.cu | 31 + paddle/fluid/operators/isfinite_op.h | 71 + paddle/fluid/operators/jit/CMakeLists.txt | 25 + paddle/fluid/operators/jit/README.en.md | 103 + paddle/fluid/operators/jit/README.md | 94 + paddle/fluid/operators/jit/benchmark.cc | 568 + paddle/fluid/operators/jit/gen/CMakeLists.txt | 36 + paddle/fluid/operators/jit/gen/act.cc | 164 + paddle/fluid/operators/jit/gen/act.h | 331 + paddle/fluid/operators/jit/gen/blas.cc | 191 + paddle/fluid/operators/jit/gen/blas.h | 121 + paddle/fluid/operators/jit/gen/embseqpool.cc | 150 + paddle/fluid/operators/jit/gen/embseqpool.h | 81 + paddle/fluid/operators/jit/gen/gru.cc | 117 + paddle/fluid/operators/jit/gen/gru.h | 113 + paddle/fluid/operators/jit/gen/hopv.cc | 104 + paddle/fluid/operators/jit/gen/hopv.h | 90 + paddle/fluid/operators/jit/gen/jitcode.h | 129 + paddle/fluid/operators/jit/gen/lstm.cc | 143 + paddle/fluid/operators/jit/gen/lstm.h | 118 + paddle/fluid/operators/jit/gen/matmul.cc | 128 + paddle/fluid/operators/jit/gen/matmul.h | 62 + paddle/fluid/operators/jit/gen/seqpool.cc | 86 + paddle/fluid/operators/jit/gen/seqpool.h | 214 + paddle/fluid/operators/jit/gen/sgd.cc | 131 + paddle/fluid/operators/jit/gen/sgd.h | 60 + paddle/fluid/operators/jit/gen/vbroadcast.cc | 91 + paddle/fluid/operators/jit/gen/vbroadcast.h | 53 + paddle/fluid/operators/jit/gen_base.cc | 91 + paddle/fluid/operators/jit/gen_base.h | 85 + paddle/fluid/operators/jit/helper.cc | 139 + paddle/fluid/operators/jit/helper.h | 270 + paddle/fluid/operators/jit/kernel_base.h | 357 + paddle/fluid/operators/jit/kernel_key.cc | 69 + paddle/fluid/operators/jit/kernel_key.h | 53 + paddle/fluid/operators/jit/kernel_pool.cc | 41 + paddle/fluid/operators/jit/kernel_pool.h | 120 + paddle/fluid/operators/jit/kernels.h | 160 + paddle/fluid/operators/jit/macro.h | 32 + .../fluid/operators/jit/more/CMakeLists.txt | 17 + .../jit/more/intrinsic/CMakeLists.txt | 9 + .../jit/more/intrinsic/crf_decoding.cc | 180 + .../jit/more/intrinsic/crf_decoding.h | 41 + .../jit/more/intrinsic/layer_norm.cc | 168 + .../operators/jit/more/intrinsic/layer_norm.h | 42 + .../operators/jit/more/mix/CMakeLists.txt | 15 + paddle/fluid/operators/jit/more/mix/mix.cc | 256 + paddle/fluid/operators/jit/more/mix/mix.h | 65 + .../operators/jit/more/mkl/CMakeLists.txt | 20 + paddle/fluid/operators/jit/more/mkl/mkl.cc | 310 + paddle/fluid/operators/jit/more/mkl/mkl.h | 233 + .../fluid/operators/jit/refer/CMakeLists.txt | 40 + paddle/fluid/operators/jit/refer/refer.cc | 61 + paddle/fluid/operators/jit/refer/refer.h | 580 + paddle/fluid/operators/jit/registry.h | 168 + paddle/fluid/operators/jit/test.cc | 1382 ++ paddle/fluid/operators/kldiv_loss_op.cc | 173 + paddle/fluid/operators/kldiv_loss_op.cu | 22 + paddle/fluid/operators/kldiv_loss_op.h | 119 + paddle/fluid/operators/l1_norm_op.cc | 93 + paddle/fluid/operators/l1_norm_op.cu | 21 + paddle/fluid/operators/l1_norm_op.h | 65 + paddle/fluid/operators/label_smooth_op.cc | 143 + paddle/fluid/operators/label_smooth_op.cu | 26 + paddle/fluid/operators/label_smooth_op.h | 67 + paddle/fluid/operators/layer_norm_op.cc | 208 + paddle/fluid/operators/layer_norm_op.cu | 529 + paddle/fluid/operators/layer_norm_op.h | 349 + paddle/fluid/operators/linear_chain_crf_op.cc | 317 + paddle/fluid/operators/linear_chain_crf_op.cu | 27 + paddle/fluid/operators/linear_chain_crf_op.h | 353 + paddle/fluid/operators/linspace_op.cc | 84 + paddle/fluid/operators/linspace_op.cu | 75 + paddle/fluid/operators/linspace_op.h | 51 + paddle/fluid/operators/load_combine_op.cc | 92 + paddle/fluid/operators/load_combine_op.cu | 25 + paddle/fluid/operators/load_combine_op.h | 105 + paddle/fluid/operators/load_op.cc | 68 + paddle/fluid/operators/load_op.cu | 24 + paddle/fluid/operators/load_op.h | 102 + paddle/fluid/operators/lod_array_length_op.cc | 75 + paddle/fluid/operators/lod_rank_table_op.cc | 81 + paddle/fluid/operators/lod_reset_op.cc | 210 + paddle/fluid/operators/lod_reset_op.cu | 29 + paddle/fluid/operators/lod_reset_op.h | 91 + .../fluid/operators/lod_tensor_to_array_op.cc | 233 + paddle/fluid/operators/log_loss_op.cc | 136 + paddle/fluid/operators/log_loss_op.cu | 21 + paddle/fluid/operators/log_loss_op.h | 75 + .../fluid/operators/lookup_sparse_table_op.cc | 120 + paddle/fluid/operators/lookup_table_op.cc | 200 + paddle/fluid/operators/lookup_table_op.cu | 205 + paddle/fluid/operators/lookup_table_op.h | 209 + paddle/fluid/operators/lrn_op.cc | 297 + paddle/fluid/operators/lrn_op.cu | 178 + paddle/fluid/operators/lrn_op.h | 133 + paddle/fluid/operators/lstm_op.cc | 319 + paddle/fluid/operators/lstm_op.cu.cc | 23 + paddle/fluid/operators/lstm_op.h | 379 + paddle/fluid/operators/lstm_unit_op.cc | 109 + paddle/fluid/operators/lstm_unit_op.cu | 180 + paddle/fluid/operators/lstm_unit_op.h | 151 + paddle/fluid/operators/lstmp_op.cc | 366 + paddle/fluid/operators/lstmp_op.cu | 24 + paddle/fluid/operators/lstmp_op.h | 522 + paddle/fluid/operators/margin_rank_loss_op.cc | 139 + paddle/fluid/operators/margin_rank_loss_op.cu | 24 + paddle/fluid/operators/margin_rank_loss_op.h | 98 + paddle/fluid/operators/math.h | 42 + paddle/fluid/operators/math/CMakeLists.txt | 79 + paddle/fluid/operators/math/algorithm.h | 90 + paddle/fluid/operators/math/beam_search.cc | 289 + paddle/fluid/operators/math/beam_search.cu | 426 + paddle/fluid/operators/math/beam_search.h | 117 + .../fluid/operators/math/beam_search_test.cc | 142 + paddle/fluid/operators/math/blas.cc | 52 + paddle/fluid/operators/math/blas.h | 326 + paddle/fluid/operators/math/blas_impl.cu.h | 380 + paddle/fluid/operators/math/blas_impl.h | 657 + .../fluid/operators/math/compound_functors.h | 254 + paddle/fluid/operators/math/concat.hip.cu | 15 + .../fluid/operators/math/concat_and_split.cc | 120 + .../fluid/operators/math/concat_and_split.cu | 411 + .../fluid/operators/math/concat_and_split.h | 77 + paddle/fluid/operators/math/concat_test.cc | 395 + .../fluid/operators/math/context_project.cc | 26 + .../fluid/operators/math/context_project.cu | 25 + paddle/fluid/operators/math/context_project.h | 324 + .../fluid/operators/math/cos_sim_functor.cc | 48 + .../fluid/operators/math/cos_sim_functor.cu | 64 + paddle/fluid/operators/math/cos_sim_functor.h | 166 + paddle/fluid/operators/math/cpu_vec.h | 630 + paddle/fluid/operators/math/cpu_vec_test.cc | 324 + paddle/fluid/operators/math/cross_entropy.cc | 73 + paddle/fluid/operators/math/cross_entropy.cu | 93 + paddle/fluid/operators/math/cross_entropy.h | 67 + paddle/fluid/operators/math/depthwise_conv.cu | 665 + paddle/fluid/operators/math/depthwise_conv.h | 67 + .../operators/math/detail/CMakeLists.txt | 1 + .../math/detail/activation_functions.h | 189 + .../operators/math/detail/avx_functions.cc | 88 + .../fluid/operators/math/detail/avx_mathfun.h | 731 + .../operators/math/detail/gru_cpu_kernel.h | 499 + .../operators/math/detail/gru_gpu_kernel.h | 319 + .../fluid/operators/math/detail/gru_kernel.h | 201 + .../operators/math/detail/lstm_cpu_kernel.h | 327 + .../operators/math/detail/lstm_gpu_kernel.h | 257 + .../fluid/operators/math/detail/lstm_kernel.h | 188 + paddle/fluid/operators/math/fc_compute.h | 55 + paddle/fluid/operators/math/functors.h | 125 + paddle/fluid/operators/math/gru_compute.cc | 104 + paddle/fluid/operators/math/gru_compute.cu | 200 + paddle/fluid/operators/math/gru_compute.h | 62 + paddle/fluid/operators/math/im2col.cc | 271 + paddle/fluid/operators/math/im2col.cu | 396 + paddle/fluid/operators/math/im2col.h | 103 + paddle/fluid/operators/math/im2col_cfo_cpu.h | 252 + paddle/fluid/operators/math/im2col_test.cc | 272 + paddle/fluid/operators/math/lstm_compute.cc | 83 + paddle/fluid/operators/math/lstm_compute.cu | 57 + paddle/fluid/operators/math/lstm_compute.h | 71 + paddle/fluid/operators/math/math_function.cc | 151 + paddle/fluid/operators/math/math_function.cu | 153 + paddle/fluid/operators/math/math_function.h | 74 + .../fluid/operators/math/math_function_impl.h | 172 + .../operators/math/math_function_test.cc | 284 + .../operators/math/math_function_test.cu | 467 + .../fluid/operators/math/matrix_bit_code.cc | 363 + paddle/fluid/operators/math/matrix_bit_code.h | 267 + paddle/fluid/operators/math/maxouting.cc | 101 + paddle/fluid/operators/math/maxouting.cu | 147 + paddle/fluid/operators/math/maxouting.h | 42 + paddle/fluid/operators/math/padding.h | 124 + paddle/fluid/operators/math/pooling.cc | 860 ++ paddle/fluid/operators/math/pooling.cu | 1219 ++ paddle/fluid/operators/math/pooling.h | 229 + paddle/fluid/operators/math/prelu.cu | 148 + paddle/fluid/operators/math/prelu.h | 49 + paddle/fluid/operators/math/sample_prob.cc | 26 + paddle/fluid/operators/math/sample_prob.cu | 161 + paddle/fluid/operators/math/sample_prob.h | 118 + paddle/fluid/operators/math/sampler.cc | 98 + paddle/fluid/operators/math/sampler.h | 127 + .../operators/math/selected_rows_functor.cc | 448 + .../operators/math/selected_rows_functor.cu | 472 + .../operators/math/selected_rows_functor.h | 109 + .../math/selected_rows_functor_test.cc | 518 + .../math/selected_rows_functor_test.cu.cc | 308 + paddle/fluid/operators/math/sequence2batch.cc | 64 + paddle/fluid/operators/math/sequence2batch.cu | 78 + paddle/fluid/operators/math/sequence2batch.h | 177 + .../fluid/operators/math/sequence_padding.cc | 155 + .../fluid/operators/math/sequence_padding.cu | 170 + .../fluid/operators/math/sequence_padding.h | 104 + .../operators/math/sequence_padding_test.cc | 120 + .../fluid/operators/math/sequence_pooling.cc | 392 + .../fluid/operators/math/sequence_pooling.cu | 381 + .../fluid/operators/math/sequence_pooling.h | 47 + .../operators/math/sequence_pooling_test.cc | 126 + paddle/fluid/operators/math/sequence_scale.cc | 46 + paddle/fluid/operators/math/sequence_scale.cu | 58 + paddle/fluid/operators/math/sequence_scale.h | 55 + paddle/fluid/operators/math/softmax.cc | 31 + paddle/fluid/operators/math/softmax.cu | 113 + paddle/fluid/operators/math/softmax.h | 58 + paddle/fluid/operators/math/softmax_impl.h | 216 + paddle/fluid/operators/math/tree2col.cc | 197 + paddle/fluid/operators/math/tree2col.cu | 208 + paddle/fluid/operators/math/tree2col.h | 90 + paddle/fluid/operators/math/unpooling.cc | 91 + paddle/fluid/operators/math/unpooling.cu | 128 + paddle/fluid/operators/math/unpooling.h | 38 + paddle/fluid/operators/math/vol2col.cc | 201 + paddle/fluid/operators/math/vol2col.cu | 264 + paddle/fluid/operators/math/vol2col.h | 89 + paddle/fluid/operators/math/vol2col_test.cc | 128 + paddle/fluid/operators/matmul_op.cc | 451 + paddle/fluid/operators/max_sequence_len_op.cc | 70 + paddle/fluid/operators/maxout_op.cc | 108 + paddle/fluid/operators/maxout_op.cu.cc | 24 + paddle/fluid/operators/maxout_op.h | 62 + paddle/fluid/operators/mean_iou_op.cc | 109 + paddle/fluid/operators/mean_iou_op.cu | 164 + paddle/fluid/operators/mean_iou_op.h | 117 + paddle/fluid/operators/mean_op.cc | 103 + paddle/fluid/operators/mean_op.cu | 26 + paddle/fluid/operators/mean_op.h | 66 + paddle/fluid/operators/merge_lod_tensor_op.cc | 198 + .../fluid/operators/merge_selected_rows_op.cc | 100 + .../operators/merge_selected_rows_op.cu.cc | 23 + .../fluid/operators/merge_selected_rows_op.h | 36 + paddle/fluid/operators/metrics/CMakeLists.txt | 2 + paddle/fluid/operators/metrics/accuracy_op.cc | 102 + paddle/fluid/operators/metrics/accuracy_op.cu | 101 + paddle/fluid/operators/metrics/accuracy_op.h | 70 + paddle/fluid/operators/metrics/auc_op.cc | 113 + paddle/fluid/operators/metrics/auc_op.h | 169 + .../operators/metrics/precision_recall_op.cc | 190 + .../operators/metrics/precision_recall_op.h | 161 + paddle/fluid/operators/minus_op.cc | 110 + paddle/fluid/operators/minus_op.cu | 19 + paddle/fluid/operators/minus_op.h | 40 + .../operators/mkldnn/activation_mkldnn_op.cc | 282 + .../operators/mkldnn/batch_norm_mkldnn_op.cc | 529 + .../operators/mkldnn/concat_mkldnn_op.cc | 235 + .../fluid/operators/mkldnn/conv_mkldnn_op.cc | 939 ++ .../mkldnn/conv_transpose_mkldnn_op.cc | 230 + .../operators/mkldnn/dequantize_mkldnn_op.cc | 125 + paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc | 280 + .../mkldnn/gaussian_random_mkldnn_op.cc | 55 + .../fluid/operators/mkldnn/lrn_mkldnn_op.cc | 174 + .../operators/mkldnn/mkldnn_activation_op.h | 66 + .../fluid/operators/mkldnn/mul_mkldnn_op.cc | 433 + .../fluid/operators/mkldnn/pool_mkldnn_op.cc | 224 + .../operators/mkldnn/quantize_mkldnn_op.cc | 130 + .../operators/mkldnn/requantize_mkldnn_op.cc | 94 + .../operators/mkldnn/softmax_mkldnn_op.cc | 280 + .../fluid/operators/mkldnn/sum_mkldnn_op.cc | 153 + .../operators/mkldnn/transpose_mkldnn_op.cc | 161 + .../fluid/operators/modified_huber_loss_op.cc | 127 + .../fluid/operators/modified_huber_loss_op.cu | 78 + .../fluid/operators/modified_huber_loss_op.h | 106 + paddle/fluid/operators/mul_op.cc | 303 + paddle/fluid/operators/mul_op.cu.cc | 30 + paddle/fluid/operators/mul_op.h | 206 + paddle/fluid/operators/multiplex_op.cc | 169 + paddle/fluid/operators/multiplex_op.cu | 107 + paddle/fluid/operators/multiplex_op.h | 86 + paddle/fluid/operators/nccl/CMakeLists.txt | 13 + .../fluid/operators/nccl/nccl_gpu_common.cc | 65 + paddle/fluid/operators/nccl/nccl_gpu_common.h | 45 + paddle/fluid/operators/nccl/nccl_op.cc | 243 + paddle/fluid/operators/nccl/nccl_op.cu.cc | 161 + .../fluid/operators/nccl/nccl_op_test.cu.cc | 287 + paddle/fluid/operators/nce_op.cc | 265 + paddle/fluid/operators/nce_op.h | 406 + paddle/fluid/operators/ngraph/CMakeLists.txt | 6 + .../fluid/operators/ngraph/ngraph_bridge.cc | 72 + paddle/fluid/operators/ngraph/ngraph_bridge.h | 51 + .../fluid/operators/ngraph/ngraph_engine.cc | 660 + paddle/fluid/operators/ngraph/ngraph_engine.h | 120 + .../operators/ngraph/ngraph_engine_op.cc | 52 + .../fluid/operators/ngraph/ngraph_engine_op.h | 56 + .../fluid/operators/ngraph/ops/CMakeLists.txt | 8 + .../fluid/operators/ngraph/ops/accuracy_op.h | 70 + .../operators/ngraph/ops/activation_op.h | 117 + paddle/fluid/operators/ngraph/ops/adam_op.h | 79 + paddle/fluid/operators/ngraph/ops/assign_op.h | 43 + .../operators/ngraph/ops/batch_norm_op.h | 163 + .../operators/ngraph/ops/binary_unary_op.h | 61 + paddle/fluid/operators/ngraph/ops/cast_op.h | 62 + paddle/fluid/operators/ngraph/ops/concat_op.h | 50 + paddle/fluid/operators/ngraph/ops/conv2d_op.h | 242 + .../operators/ngraph/ops/cross_entropy_op.h | 243 + .../fluid/operators/ngraph/ops/dropout_op.h | 110 + .../operators/ngraph/ops/elementwise_add_op.h | 93 + .../ops/elementwise_binary_prepare_node.h | 78 + .../operators/ngraph/ops/elementwise_div_op.h | 103 + .../operators/ngraph/ops/elementwise_mul_op.h | 111 + .../operators/ngraph/ops/elementwise_node.h | 77 + .../ngraph/ops/elementwise_scalar_op.h | 59 + .../operators/ngraph/ops/fill_constant_op.h | 51 + .../operators/ngraph/ops/fill_zeros_like_op.h | 45 + paddle/fluid/operators/ngraph/ops/gather_op.h | 77 + .../fluid/operators/ngraph/ops/increment_op.h | 49 + .../operators/ngraph/ops/layer_norm_op.h | 195 + .../operators/ngraph/ops/lookup_table_op.h | 103 + paddle/fluid/operators/ngraph/ops/lrn_op.h | 54 + paddle/fluid/operators/ngraph/ops/matmul_op.h | 248 + paddle/fluid/operators/ngraph/ops/mean_op.h | 72 + .../fluid/operators/ngraph/ops/momentum_op.h | 106 + paddle/fluid/operators/ngraph/ops/mul_op.h | 143 + paddle/fluid/operators/ngraph/ops/op_bridge.h | 84 + paddle/fluid/operators/ngraph/ops/pool2d_op.h | 191 + .../operators/ngraph/ops/reduce_sum_op.h | 161 + .../fluid/operators/ngraph/ops/reshape_op.h | 117 + paddle/fluid/operators/ngraph/ops/scale_op.h | 44 + paddle/fluid/operators/ngraph/ops/slice_op.h | 111 + .../fluid/operators/ngraph/ops/softmax_op.h | 93 + .../ops/softmax_with_cross_entropy_op.h | 90 + paddle/fluid/operators/ngraph/ops/stack_op.h | 56 + paddle/fluid/operators/ngraph/ops/sum_op.h | 58 + paddle/fluid/operators/ngraph/ops/top_k_op.h | 49 + .../fluid/operators/ngraph/ops/transpose_op.h | 101 + paddle/fluid/operators/norm_op.cc | 111 + paddle/fluid/operators/norm_op.cu | 161 + paddle/fluid/operators/norm_op.h | 130 + paddle/fluid/operators/one_hot_op.cc | 125 + paddle/fluid/operators/one_hot_op.cu | 98 + paddle/fluid/operators/one_hot_op.h | 95 + .../fluid/operators/optimizers/CMakeLists.txt | 2 + .../fluid/operators/optimizers/adadelta_op.cc | 129 + .../fluid/operators/optimizers/adadelta_op.cu | 19 + .../fluid/operators/optimizers/adadelta_op.h | 82 + .../fluid/operators/optimizers/adagrad_op.cc | 151 + .../fluid/operators/optimizers/adagrad_op.cu | 117 + .../fluid/operators/optimizers/adagrad_op.h | 109 + paddle/fluid/operators/optimizers/adam_op.cc | 150 + paddle/fluid/operators/optimizers/adam_op.cu | 19 + paddle/fluid/operators/optimizers/adam_op.h | 566 + .../fluid/operators/optimizers/adamax_op.cc | 147 + .../fluid/operators/optimizers/adamax_op.cu | 19 + paddle/fluid/operators/optimizers/adamax_op.h | 80 + .../optimizers/decayed_adagrad_op.cc | 116 + .../optimizers/decayed_adagrad_op.cu | 19 + .../operators/optimizers/decayed_adagrad_op.h | 69 + paddle/fluid/operators/optimizers/ftrl_op.cc | 154 + paddle/fluid/operators/optimizers/ftrl_op.cu | 17 + paddle/fluid/operators/optimizers/ftrl_op.h | 109 + paddle/fluid/operators/optimizers/lamb_op.cc | 85 + paddle/fluid/operators/optimizers/lamb_op.cu | 20 + paddle/fluid/operators/optimizers/lamb_op.h | 305 + .../operators/optimizers/lars_momentum_op.cc | 85 + .../operators/optimizers/lars_momentum_op.cu | 94 + .../operators/optimizers/lars_momentum_op.h | 72 + .../fluid/operators/optimizers/momentum_op.cc | 95 + .../fluid/operators/optimizers/momentum_op.cu | 24 + .../fluid/operators/optimizers/momentum_op.h | 397 + .../optimizers/proximal_adagrad_op.cc | 121 + .../optimizers/proximal_adagrad_op.cu | 18 + .../optimizers/proximal_adagrad_op.h | 68 + .../operators/optimizers/proximal_gd_op.cc | 102 + .../operators/optimizers/proximal_gd_op.cu | 18 + .../operators/optimizers/proximal_gd_op.h | 64 + .../fluid/operators/optimizers/rmsprop_op.cc | 146 + .../fluid/operators/optimizers/rmsprop_op.cu | 18 + .../fluid/operators/optimizers/rmsprop_op.h | 258 + paddle/fluid/operators/optimizers/sgd_op.cc | 108 + paddle/fluid/operators/optimizers/sgd_op.cu | 128 + paddle/fluid/operators/optimizers/sgd_op.h | 133 + paddle/fluid/operators/pad2d_op.cc | 663 + paddle/fluid/operators/pad2d_op.cu | 463 + .../fluid/operators/pad_constant_like_op.cc | 226 + .../fluid/operators/pad_constant_like_op.cu | 25 + paddle/fluid/operators/pad_constant_like_op.h | 93 + paddle/fluid/operators/pad_op.cc | 149 + paddle/fluid/operators/pad_op.cu | 20 + paddle/fluid/operators/pad_op.h | 63 + paddle/fluid/operators/pixel_shuffle_op.cc | 135 + paddle/fluid/operators/pixel_shuffle_op.cu | 26 + paddle/fluid/operators/pixel_shuffle_op.h | 82 + paddle/fluid/operators/pool_cudnn_op.cu.cc | 190 + paddle/fluid/operators/pool_op.cc | 504 + paddle/fluid/operators/pool_op.cu.cc | 33 + paddle/fluid/operators/pool_op.h | 191 + paddle/fluid/operators/pool_with_index_op.cc | 321 + .../fluid/operators/pool_with_index_op.cu.cc | 43 + paddle/fluid/operators/pool_with_index_op.h | 115 + .../operators/positive_negative_pair_op.cc | 185 + .../operators/positive_negative_pair_op.h | 113 + paddle/fluid/operators/prelu_op.cc | 132 + paddle/fluid/operators/prelu_op.cu | 211 + paddle/fluid/operators/prelu_op.h | 126 + paddle/fluid/operators/print_op.cc | 286 + paddle/fluid/operators/psroi_pool_op.cc | 189 + paddle/fluid/operators/psroi_pool_op.cu | 294 + paddle/fluid/operators/psroi_pool_op.h | 252 + paddle/fluid/operators/py_func_op.cc | 312 + paddle/fluid/operators/py_func_op.h | 24 + paddle/fluid/operators/quantize_op.cc | 47 + paddle/fluid/operators/quantize_op.h | 46 + paddle/fluid/operators/random_crop_op.cc | 83 + paddle/fluid/operators/random_crop_op.cu | 21 + paddle/fluid/operators/random_crop_op.h | 187 + paddle/fluid/operators/range_op.cc | 69 + paddle/fluid/operators/range_op.cu | 67 + paddle/fluid/operators/range_op.h | 56 + paddle/fluid/operators/rank_loss_op.cc | 150 + paddle/fluid/operators/rank_loss_op.cu | 22 + paddle/fluid/operators/rank_loss_op.h | 80 + paddle/fluid/operators/reader/CMakeLists.txt | 41 + .../fluid/operators/reader/blocking_queue.h | 133 + .../fluid/operators/reader/buffered_reader.cc | 185 + .../fluid/operators/reader/buffered_reader.h | 75 + .../reader/create_batch_reader_op.cc | 151 + .../operators/reader/create_ctr_reader_op.cc | 97 + .../reader/create_custom_reader_op.cc | 189 + .../reader/create_double_buffer_reader_op.cc | 85 + .../reader/create_multi_pass_reader_op.cc | 93 + .../operators/reader/create_py_reader_op.cc | 65 + .../reader/create_random_data_generator_op.cc | 107 + .../reader/create_recordio_file_reader_op.cc | 93 + .../reader/create_shuffle_reader_op.cc | 124 + .../reader/lod_tensor_blocking_queue.h | 91 + .../fluid/operators/reader/open_files_op.cc | 277 + paddle/fluid/operators/reader/py_reader.cc | 42 + paddle/fluid/operators/reader/py_reader.h | 45 + paddle/fluid/operators/reader/read_op.cc | 129 + .../reader/reader_blocking_queue_test.cc | 243 + .../operators/reader/reader_op_registry.cc | 144 + .../operators/reader/reader_op_registry.h | 113 + paddle/fluid/operators/recurrent_op.cc | 640 + paddle/fluid/operators/recurrent_op.h | 226 + .../fluid/operators/reduce_ops/CMakeLists.txt | 24 + .../fluid/operators/reduce_ops/cub_reduce.h | 328 + .../operators/reduce_ops/reduce_all_op.cc | 20 + .../operators/reduce_ops/reduce_all_op.cu | 19 + .../operators/reduce_ops/reduce_all_op.h | 29 + .../operators/reduce_ops/reduce_any_op.cc | 20 + .../operators/reduce_ops/reduce_any_op.cu | 19 + .../operators/reduce_ops/reduce_any_op.h | 29 + .../operators/reduce_ops/reduce_max_op.cc | 34 + .../operators/reduce_ops/reduce_max_op.cu | 25 + .../reduce_ops/reduce_max_op.part.cu | 25 + .../operators/reduce_ops/reduce_mean_op.cc | 94 + .../operators/reduce_ops/reduce_mean_op.cu | 71 + .../operators/reduce_ops/reduce_mean_op.h | 39 + .../reduce_ops/reduce_mean_op.part.cu | 26 + .../operators/reduce_ops/reduce_min_max_op.h | 50 + .../operators/reduce_ops/reduce_min_op.cc | 34 + .../operators/reduce_ops/reduce_min_op.cu | 25 + .../reduce_ops/reduce_min_op.part.cu | 25 + paddle/fluid/operators/reduce_ops/reduce_op.h | 285 + .../operators/reduce_ops/reduce_op_function.h | 109 + .../operators/reduce_ops/reduce_prod_op.cc | 35 + .../operators/reduce_ops/reduce_prod_op.cu | 25 + .../operators/reduce_ops/reduce_prod_op.h | 39 + .../reduce_ops/reduce_prod_op.part.cu | 25 + .../operators/reduce_ops/reduce_sum_op.cc | 35 + .../operators/reduce_ops/reduce_sum_op.cu | 66 + .../operators/reduce_ops/reduce_sum_op.h | 97 + .../reduce_ops/reduce_sum_op.part.cu | 26 + .../reorder_lod_tensor_by_rank_op.cc | 273 + paddle/fluid/operators/requantize_op.cc | 46 + paddle/fluid/operators/requantize_op.h | 47 + paddle/fluid/operators/reshape_op.cc | 461 + paddle/fluid/operators/reverse_op.cc | 107 + paddle/fluid/operators/reverse_op.cu | 24 + paddle/fluid/operators/reverse_op.h | 87 + .../fluid/operators/rnn_memory_helper_op.cc | 166 + paddle/fluid/operators/roi_align_op.cc | 184 + paddle/fluid/operators/roi_align_op.cu | 362 + paddle/fluid/operators/roi_align_op.h | 338 + paddle/fluid/operators/roi_pool_op.cc | 194 + paddle/fluid/operators/roi_pool_op.cu | 263 + paddle/fluid/operators/roi_pool_op.h | 213 + paddle/fluid/operators/row_conv_op.cc | 298 + paddle/fluid/operators/row_conv_op.cu | 418 + paddle/fluid/operators/row_conv_op.h | 33 + paddle/fluid/operators/sample_logits_op.cc | 243 + paddle/fluid/operators/sample_logits_op.cu | 257 + paddle/fluid/operators/sample_logits_op.h | 245 + paddle/fluid/operators/sampling_id_op.cc | 77 + paddle/fluid/operators/sampling_id_op.cu | 19 + paddle/fluid/operators/sampling_id_op.h | 80 + paddle/fluid/operators/save_combine_op.cc | 89 + paddle/fluid/operators/save_combine_op.cu | 23 + paddle/fluid/operators/save_combine_op.h | 95 + .../operators/save_load_combine_op_test.cc | 346 + paddle/fluid/operators/save_load_op_test.cc | 160 + paddle/fluid/operators/save_op.cc | 98 + paddle/fluid/operators/save_op.cu | 27 + paddle/fluid/operators/save_op.h | 146 + paddle/fluid/operators/scale_op.cc | 112 + paddle/fluid/operators/scale_op.cu | 27 + paddle/fluid/operators/scale_op.h | 60 + paddle/fluid/operators/scatter.cu.h | 113 + paddle/fluid/operators/scatter.h | 148 + paddle/fluid/operators/scatter_op.cc | 139 + paddle/fluid/operators/scatter_op.cu | 67 + paddle/fluid/operators/scatter_op.h | 90 + paddle/fluid/operators/scatter_test.cc | 58 + paddle/fluid/operators/selu_op.cc | 135 + paddle/fluid/operators/selu_op.cu | 22 + paddle/fluid/operators/selu_op.h | 123 + .../operators/sequence_ops/CMakeLists.txt | 2 + .../sequence_ops/sequence_concat_op.cc | 133 + .../sequence_ops/sequence_concat_op.cu.cc | 26 + .../sequence_ops/sequence_concat_op.h | 149 + .../sequence_ops/sequence_conv_op.cc | 236 + .../sequence_ops/sequence_conv_op.cu.cc | 25 + .../operators/sequence_ops/sequence_conv_op.h | 157 + .../sequence_ops/sequence_enumerate_op.cc | 93 + .../sequence_ops/sequence_enumerate_op.cu | 86 + .../sequence_ops/sequence_enumerate_op.h | 77 + .../sequence_ops/sequence_erase_op.cc | 90 + .../sequence_ops/sequence_erase_op.cu | 119 + .../sequence_ops/sequence_erase_op.h | 73 + .../sequence_ops/sequence_expand_as_op.cc | 207 + .../sequence_ops/sequence_expand_as_op.cu | 134 + .../sequence_ops/sequence_expand_as_op.h | 148 + .../sequence_ops/sequence_expand_op.cc | 260 + .../sequence_ops/sequence_expand_op.cu | 220 + .../sequence_ops/sequence_expand_op.h | 218 + .../sequence_ops/sequence_mask_op.cc | 104 + .../sequence_ops/sequence_mask_op.cu | 26 + .../operators/sequence_ops/sequence_mask_op.h | 128 + .../operators/sequence_ops/sequence_pad_op.cc | 243 + .../operators/sequence_ops/sequence_pad_op.cu | 29 + .../operators/sequence_ops/sequence_pad_op.h | 76 + .../sequence_ops/sequence_pool_op.cc | 164 + .../sequence_ops/sequence_pool_op.cu | 22 + .../operators/sequence_ops/sequence_pool_op.h | 86 + .../sequence_ops/sequence_reshape_op.cc | 134 + .../sequence_ops/sequence_reshape_op.cu | 30 + .../sequence_ops/sequence_reshape_op.h | 86 + .../sequence_ops/sequence_reverse_op.cc | 29 + .../sequence_ops/sequence_reverse_op.cu | 25 + .../sequence_ops/sequence_reverse_op.h | 171 + .../sequence_ops/sequence_scatter_op.cc | 179 + .../sequence_ops/sequence_scatter_op.h | 122 + .../sequence_ops/sequence_slice_op.cc | 152 + .../sequence_ops/sequence_slice_op.cu | 23 + .../sequence_ops/sequence_slice_op.h | 170 + .../sequence_softmax_cudnn_op.cu.cc | 107 + .../sequence_ops/sequence_softmax_op.cc | 169 + .../sequence_ops/sequence_softmax_op.cu | 169 + .../sequence_ops/sequence_softmax_op.h | 139 + .../sequence_ops/sequence_unpad_op.cc | 175 + .../sequence_ops/sequence_unpad_op.cu | 30 + .../sequence_ops/sequence_unpad_op.h | 102 + paddle/fluid/operators/shape_op.cc | 59 + paddle/fluid/operators/shape_op.cu | 21 + paddle/fluid/operators/shape_op.h | 38 + paddle/fluid/operators/shard_index_op.cc | 115 + paddle/fluid/operators/shard_index_op.cu | 77 + paddle/fluid/operators/shard_index_op.h | 58 + .../fluid/operators/shrink_rnn_memory_op.cc | 187 + paddle/fluid/operators/shuffle_channel_op.cc | 125 + paddle/fluid/operators/shuffle_channel_op.cu | 126 + paddle/fluid/operators/shuffle_channel_op.h | 95 + .../sigmoid_cross_entropy_with_logits_op.cc | 191 + .../sigmoid_cross_entropy_with_logits_op.cu | 190 + .../sigmoid_cross_entropy_with_logits_op.h | 113 + paddle/fluid/operators/sign_op.cc | 71 + paddle/fluid/operators/sign_op.cu | 23 + paddle/fluid/operators/sign_op.h | 39 + paddle/fluid/operators/similarity_focus_op.cc | 86 + paddle/fluid/operators/similarity_focus_op.h | 168 + paddle/fluid/operators/size_op.cc | 57 + paddle/fluid/operators/size_op.cu | 21 + paddle/fluid/operators/size_op.h | 34 + paddle/fluid/operators/slice_op.cc | 215 + paddle/fluid/operators/slice_op.cu | 149 + paddle/fluid/operators/slice_op.h | 217 + paddle/fluid/operators/smooth_l1_loss_op.cc | 188 + paddle/fluid/operators/smooth_l1_loss_op.cu | 22 + paddle/fluid/operators/smooth_l1_loss_op.h | 184 + paddle/fluid/operators/softmax_cudnn_op.cu.cc | 84 + paddle/fluid/operators/softmax_op.cc | 237 + paddle/fluid/operators/softmax_op.cu.cc | 27 + paddle/fluid/operators/softmax_op.h | 107 + .../softmax_with_cross_entropy_op.cc | 292 + .../softmax_with_cross_entropy_op.cu | 520 + .../operators/softmax_with_cross_entropy_op.h | 126 + paddle/fluid/operators/space_to_depth_op.cc | 193 + paddle/fluid/operators/space_to_depth_op.cu | 30 + paddle/fluid/operators/space_to_depth_op.h | 127 + paddle/fluid/operators/spectral_norm_op.cc | 228 + paddle/fluid/operators/spectral_norm_op.cu | 22 + paddle/fluid/operators/spectral_norm_op.h | 273 + paddle/fluid/operators/split_lod_tensor_op.cc | 194 + paddle/fluid/operators/split_op.cc | 128 + paddle/fluid/operators/split_op.cu.cc | 21 + paddle/fluid/operators/split_op.h | 68 + .../fluid/operators/split_selected_rows_op.cc | 97 + .../fluid/operators/split_selected_rows_op.cu | 19 + .../fluid/operators/split_selected_rows_op.h | 95 + paddle/fluid/operators/spp_op.cc | 102 + paddle/fluid/operators/spp_op.cu.cc | 23 + paddle/fluid/operators/spp_op.h | 165 + .../fluid/operators/squared_l2_distance_op.cc | 174 + .../fluid/operators/squared_l2_distance_op.cu | 22 + .../fluid/operators/squared_l2_distance_op.h | 125 + paddle/fluid/operators/squared_l2_norm_op.cc | 99 + paddle/fluid/operators/squared_l2_norm_op.cu | 22 + paddle/fluid/operators/squared_l2_norm_op.h | 66 + paddle/fluid/operators/squeeze_op.cc | 311 + paddle/fluid/operators/stack_op.cc | 32 + paddle/fluid/operators/stack_op.cu | 32 + paddle/fluid/operators/stack_op.h | 262 + paddle/fluid/operators/strided_memcpy.h | 124 + paddle/fluid/operators/strided_memcpy_test.cc | 159 + paddle/fluid/operators/sum_op.cc | 261 + paddle/fluid/operators/sum_op.cu | 269 + paddle/fluid/operators/sum_op.h | 191 + paddle/fluid/operators/sync_batch_norm_op.cc | 20 + paddle/fluid/operators/sync_batch_norm_op.cu | 452 + .../teacher_student_sigmoid_loss_op.cc | 192 + .../teacher_student_sigmoid_loss_op.h | 118 + paddle/fluid/operators/temporal_shift_op.cc | 170 + paddle/fluid/operators/temporal_shift_op.cu | 168 + paddle/fluid/operators/temporal_shift_op.h | 129 + .../operators/tensor_array_to_tensor_op.cc | 245 + .../fluid/operators/tensorrt/CMakeLists.txt | 5 + .../operators/tensorrt/tensorrt_engine_op.cc | 60 + .../operators/tensorrt/tensorrt_engine_op.h | 272 + .../tensorrt/tensorrt_engine_op_test.cc | 228 + paddle/fluid/operators/top_k_op.cc | 94 + paddle/fluid/operators/top_k_op.cu | 380 + paddle/fluid/operators/top_k_op.h | 98 + paddle/fluid/operators/transpose_op.cc | 301 + paddle/fluid/operators/transpose_op.cu.cc | 45 + paddle/fluid/operators/transpose_op.h | 99 + paddle/fluid/operators/tree_conv_op.cc | 176 + paddle/fluid/operators/tree_conv_op.cu | 24 + paddle/fluid/operators/tree_conv_op.h | 146 + .../operators/truncated_gaussian_random_op.cc | 255 + .../operators/truncated_gaussian_random_op.cu | 77 + paddle/fluid/operators/unfold_op.cc | 184 + paddle/fluid/operators/unfold_op.cu | 26 + paddle/fluid/operators/unfold_op.h | 127 + .../uniform_random_batch_size_like_op.cc | 72 + paddle/fluid/operators/uniform_random_op.cc | 141 + paddle/fluid/operators/uniform_random_op.cu | 83 + paddle/fluid/operators/unique_op.cc | 61 + paddle/fluid/operators/unique_op.h | 83 + paddle/fluid/operators/unpool_op.cc | 146 + paddle/fluid/operators/unpool_op.cu.cc | 24 + paddle/fluid/operators/unpool_op.h | 73 + paddle/fluid/operators/unsqueeze_op.cc | 293 + paddle/fluid/operators/unstack_op.cc | 26 + paddle/fluid/operators/unstack_op.h | 135 + paddle/fluid/operators/warpctc_cudnn_op.cu.cc | 197 + paddle/fluid/operators/warpctc_op.cc | 186 + paddle/fluid/operators/warpctc_op.cu.cc | 22 + paddle/fluid/operators/warpctc_op.h | 242 + paddle/fluid/operators/where_op.cc | 58 + paddle/fluid/operators/where_op.cu | 81 + paddle/fluid/operators/where_op.h | 95 + paddle/fluid/platform/CMakeLists.txt | 122 + paddle/fluid/platform/assert.h | 48 + paddle/fluid/platform/collective_helper.cc | 102 + paddle/fluid/platform/collective_helper.h | 97 + paddle/fluid/platform/cpu_helper.cc | 51 + paddle/fluid/platform/cpu_helper.h | 26 + paddle/fluid/platform/cpu_helper_test.cc | 22 + paddle/fluid/platform/cpu_info.cc | 155 + paddle/fluid/platform/cpu_info.h | 78 + paddle/fluid/platform/cpu_info_test.cc | 34 + paddle/fluid/platform/cuda_device_function.h | 164 + paddle/fluid/platform/cuda_device_guard.cc | 22 + paddle/fluid/platform/cuda_device_guard.h | 45 + paddle/fluid/platform/cuda_helper.h | 58 + paddle/fluid/platform/cuda_helper_test.cu | 237 + paddle/fluid/platform/cuda_primitives.h | 132 + paddle/fluid/platform/cuda_profiler.h | 38 + paddle/fluid/platform/cudnn_desc.h | 205 + paddle/fluid/platform/cudnn_desc_test.cc | 41 + paddle/fluid/platform/cudnn_helper.h | 490 + paddle/fluid/platform/cudnn_helper_test.cc | 157 + .../fluid/platform/cudnn_workspace_helper.h | 23 + .../details/cuda_transform_iterator_cast.h | 71 + paddle/fluid/platform/device_context.cc | 534 + paddle/fluid/platform/device_context.h | 465 + paddle/fluid/platform/device_context_test.cu | 68 + .../fluid/platform/device_memory_aligment.cc | 34 + .../fluid/platform/device_memory_aligment.h | 27 + paddle/fluid/platform/device_tracer.cc | 723 + paddle/fluid/platform/device_tracer.h | 155 + paddle/fluid/platform/dynload/CMakeLists.txt | 24 + paddle/fluid/platform/dynload/cublas.cc | 40 + paddle/fluid/platform/dynload/cublas.h | 125 + paddle/fluid/platform/dynload/cudnn.cc | 67 + paddle/fluid/platform/dynload/cudnn.h | 193 + paddle/fluid/platform/dynload/cupti.cc | 35 + paddle/fluid/platform/dynload/cupti.h | 87 + .../fluid/platform/dynload/cupti_lib_path.h | 17 + .../platform/dynload/cupti_lib_path.h.in | 17 + paddle/fluid/platform/dynload/curand.cc | 30 + paddle/fluid/platform/dynload/curand.h | 66 + .../fluid/platform/dynload/dynamic_loader.cc | 258 + .../fluid/platform/dynload/dynamic_loader.h | 38 + paddle/fluid/platform/dynload/mklml.cc | 30 + paddle/fluid/platform/dynload/mklml.h | 99 + paddle/fluid/platform/dynload/nccl.cc | 30 + paddle/fluid/platform/dynload/nccl.h | 76 + paddle/fluid/platform/dynload/tensorrt.cc | 30 + paddle/fluid/platform/dynload/tensorrt.h | 69 + paddle/fluid/platform/dynload/warpctc.cc | 30 + paddle/fluid/platform/dynload/warpctc.h | 63 + .../fluid/platform/dynload/warpctc_lib_path.h | 17 + .../platform/dynload/warpctc_lib_path.h.in | 17 + paddle/fluid/platform/enforce.cc | 19 + paddle/fluid/platform/enforce.h | 401 + paddle/fluid/platform/enforce_test.cc | 255 + paddle/fluid/platform/event.h | 101 + paddle/fluid/platform/float16.h | 1086 ++ paddle/fluid/platform/float16_test.cc | 177 + paddle/fluid/platform/float16_test.cu | 317 + paddle/fluid/platform/for_range.h | 85 + paddle/fluid/platform/gpu_info.cc | 346 + paddle/fluid/platform/gpu_info.h | 97 + paddle/fluid/platform/hostdevice.h | 24 + paddle/fluid/platform/init.cc | 240 + paddle/fluid/platform/init.h | 36 + paddle/fluid/platform/init_test.cc | 40 + paddle/fluid/platform/lock_guard_ptr.h | 55 + paddle/fluid/platform/lodtensor_printer.cc | 45 + paddle/fluid/platform/lodtensor_printer.h | 24 + .../fluid/platform/lodtensor_printer_test.cc | 22 + paddle/fluid/platform/macros.h | 30 + paddle/fluid/platform/mkldnn_helper.h | 172 + paddle/fluid/platform/mkldnn_reuse.h | 1424 ++ paddle/fluid/platform/nccl_helper.h | 323 + paddle/fluid/platform/ngraph_helper.h | 199 + paddle/fluid/platform/place.cc | 78 + paddle/fluid/platform/place.h | 128 + paddle/fluid/platform/place_test.cc | 44 + paddle/fluid/platform/port.h | 175 + paddle/fluid/platform/profiler.cc | 655 + paddle/fluid/platform/profiler.cu | 50 + paddle/fluid/platform/profiler.h | 189 + paddle/fluid/platform/profiler.proto | 58 + paddle/fluid/platform/profiler_test.cc | 145 + .../fluid/platform/stream_callback_manager.cc | 63 + .../fluid/platform/stream_callback_manager.h | 50 + paddle/fluid/platform/temporary_allocator.cc | 121 + paddle/fluid/platform/temporary_allocator.h | 68 + .../platform/temporary_allocator_test.cc | 222 + paddle/fluid/platform/timer.cc | 63 + paddle/fluid/platform/timer.h | 61 + paddle/fluid/platform/timer_test.cc | 45 + paddle/fluid/platform/transform.h | 110 + paddle/fluid/platform/transform_test.cu | 97 + paddle/fluid/platform/variant.h | 53 + paddle/fluid/pybind/.gitignore | 1 + paddle/fluid/pybind/CMakeLists.txt | 50 + paddle/fluid/pybind/communicator_py.cc | 48 + paddle/fluid/pybind/communicator_py.h | 27 + paddle/fluid/pybind/const_value.cc | 76 + paddle/fluid/pybind/const_value.h | 27 + paddle/fluid/pybind/data_set_py.cc | 110 + paddle/fluid/pybind/data_set_py.h | 28 + paddle/fluid/pybind/exception.cc | 38 + paddle/fluid/pybind/exception.h | 27 + paddle/fluid/pybind/fleet_wrapper_py.cc | 66 + paddle/fluid/pybind/fleet_wrapper_py.h | 28 + paddle/fluid/pybind/imperative.cc | 354 + paddle/fluid/pybind/imperative.h | 30 + paddle/fluid/pybind/inference_api.cc | 316 + paddle/fluid/pybind/inference_api.h | 23 + paddle/fluid/pybind/ir.cc | 187 + paddle/fluid/pybind/ir.h | 25 + paddle/fluid/pybind/nccl_wrapper_py.cc | 53 + paddle/fluid/pybind/nccl_wrapper_py.h | 28 + paddle/fluid/pybind/protobuf.cc | 247 + paddle/fluid/pybind/protobuf.h | 35 + paddle/fluid/pybind/pybind.cc | 1616 ++ paddle/fluid/pybind/pybind_boost_headers.h | 124 + paddle/fluid/pybind/reader_py.cc | 179 + paddle/fluid/pybind/reader_py.h | 25 + paddle/fluid/pybind/recordio.cc | 88 + paddle/fluid/pybind/recordio.h | 27 + paddle/fluid/pybind/tensor_py.h | 519 + paddle/fluid/recordio/CMakeLists.txt | 9 + paddle/fluid/recordio/README.md | 13 + paddle/fluid/recordio/chunk.cc | 174 + paddle/fluid/recordio/chunk.h | 73 + paddle/fluid/recordio/chunk_test.cc | 47 + paddle/fluid/recordio/header.cc | 70 + paddle/fluid/recordio/header.h | 66 + paddle/fluid/recordio/header_test.cc | 29 + paddle/fluid/recordio/scanner.cc | 57 + paddle/fluid/recordio/scanner.h | 43 + paddle/fluid/recordio/writer.cc | 40 + paddle/fluid/recordio/writer.h | 44 + paddle/fluid/recordio/writer_scanner_test.cc | 70 + paddle/fluid/string/CMakeLists.txt | 7 + paddle/fluid/string/piece.cc | 136 + paddle/fluid/string/piece.h | 105 + paddle/fluid/string/piece_test.cc | 293 + paddle/fluid/string/pretty_log.cc | 22 + paddle/fluid/string/pretty_log.h | 87 + paddle/fluid/string/printf.h | 124 + paddle/fluid/string/printf_test.cc | 31 + paddle/fluid/string/split.h | 37 + paddle/fluid/string/split_test.cc | 28 + paddle/fluid/string/string_helper.cc | 103 + paddle/fluid/string/string_helper.h | 159 + paddle/fluid/string/tinyformat/tinyformat.h | 892 ++ paddle/fluid/string/to_string.h | 51 + paddle/fluid/string/to_string_test.cc | 38 + paddle/fluid/train/CMakeLists.txt | 31 + paddle/fluid/train/demo/CMakeLists.txt | 78 + paddle/fluid/train/demo/README.md | 65 + paddle/fluid/train/demo/demo_network.py | 47 + paddle/fluid/train/demo/demo_trainer.cc | 115 + .../train/test_train_recognize_digits.cc | 89 + paddle/scripts/CMakeLists.txt | 7 + paddle/scripts/Dockerfile.tmp | 177 + paddle/scripts/README.md | 191 + paddle/scripts/build_docker_images.sh | 14 + ...paddle-development-environment-gpu.graffle | Bin 0 -> 2912 bytes .../paddle-development-environment-gpu.png | Bin 0 -> 91803 bytes .../paddle-development-environment.graffle | Bin 0 -> 2691 bytes .../doc/paddle-development-environment.png | Bin 0 -> 67311 bytes paddle/scripts/docker/root/.bashrc | 46 + paddle/scripts/docker/root/.gitconfig | 43 + .../docker/root/.scripts/git-completion.sh | 2663 ++++ .../docker/root/.scripts/git-prompt.sh | 445 + paddle/scripts/fast_install.sh | 1126 ++ paddle/scripts/installation_validate.py | 18 + paddle/scripts/paddle_build.sh | 1096 ++ paddle/scripts/paddle_docker_build.sh | 71 + paddle/scripts/submit_local.sh.in | 169 + paddle/testing/CMakeLists.txt | 5 + paddle/testing/paddle_gtest_main.cc | 89 + patches/grpc/completion_queue.h | 386 + patches/grpc/grpc_library.h | 64 + python/.gitignore | 9 + python/CMakeLists.txt | 117 + python/__init__.py | 13 + python/paddle/.gitignore | 1 + python/paddle/__init__.py | 29 + python/paddle/batch.py | 49 + python/paddle/compat.py | 237 + python/paddle/dataset/__init__.py | 48 + python/paddle/dataset/cifar.py | 156 + python/paddle/dataset/common.py | 244 + python/paddle/dataset/conll05.py | 258 + python/paddle/dataset/flowers.py | 230 + python/paddle/dataset/image.py | 416 + python/paddle/dataset/imdb.py | 151 + python/paddle/dataset/imikolov.py | 166 + python/paddle/dataset/mnist.py | 136 + python/paddle/dataset/movielens.py | 271 + python/paddle/dataset/mq2007.py | 335 + python/paddle/dataset/sentiment.py | 144 + python/paddle/dataset/tests/CMakeLists.txt | 1 + python/paddle/dataset/tests/cat.jpg | Bin 0 -> 57218 bytes python/paddle/dataset/tests/cifar_test.py | 58 + python/paddle/dataset/tests/common_test.py | 97 + python/paddle/dataset/tests/flowers_test.py | 53 + python/paddle/dataset/tests/imdb_test.py | 57 + python/paddle/dataset/tests/imikolov_test.py | 69 + python/paddle/dataset/tests/mnist_test.py | 46 + python/paddle/dataset/tests/mq2007_test.py | 35 + python/paddle/dataset/tests/test_image.py | 45 + python/paddle/dataset/tests/test_sentiment.py | 56 + python/paddle/dataset/tests/voc2012_test.py | 44 + python/paddle/dataset/tests/wmt16_test.py | 68 + python/paddle/dataset/uci_housing.py | 157 + python/paddle/dataset/voc2012.py | 87 + python/paddle/dataset/wmt14.py | 178 + python/paddle/dataset/wmt16.py | 357 + python/paddle/distributed/__init__.py | 13 + python/paddle/distributed/launch.py | 222 + python/paddle/distributed/launch_ps.py | 151 + python/paddle/fluid/.gitignore | 2 + python/paddle/fluid/__init__.py | 221 + python/paddle/fluid/annotations.py | 39 + python/paddle/fluid/average.py | 90 + python/paddle/fluid/backward.py | 847 ++ python/paddle/fluid/clip.py | 402 + python/paddle/fluid/communicator.py | 106 + python/paddle/fluid/compiler.py | 349 + python/paddle/fluid/contrib/__init__.py | 50 + .../paddle/fluid/contrib/decoder/__init__.py | 20 + .../contrib/decoder/beam_search_decoder.py | 842 ++ .../contrib/extend_optimizer/__init__.py | 20 + .../extend_optimizer_with_weight_decay.py | 152 + python/paddle/fluid/contrib/inferencer.py | 112 + .../paddle/fluid/contrib/layers/__init__.py | 23 + python/paddle/fluid/contrib/layers/nn.py | 90 + .../paddle/fluid/contrib/layers/rnn_impl.py | 743 + .../paddle/fluid/contrib/memory_usage_calc.py | 120 + .../fluid/contrib/mixed_precision/__init__.py | 21 + .../contrib/mixed_precision/decorator.py | 241 + .../contrib/mixed_precision/fp16_lists.py | 275 + .../contrib/mixed_precision/fp16_utils.py | 372 + python/paddle/fluid/contrib/model_stat.py | 194 + python/paddle/fluid/contrib/op_frequence.py | 104 + .../paddle/fluid/contrib/quantize/__init__.py | 20 + .../contrib/quantize/quantize_transpiler.py | 621 + python/paddle/fluid/contrib/reader/README.md | 25 + .../paddle/fluid/contrib/reader/__init__.py | 20 + .../contrib/reader/distributed_reader.py | 65 + python/paddle/fluid/contrib/slim/__init__.py | 16 + .../fluid/contrib/slim/core/__init__.py | 22 + .../fluid/contrib/slim/core/compressor.py | 546 + .../paddle/fluid/contrib/slim/core/config.py | 130 + .../fluid/contrib/slim/core/strategy.py | 51 + .../contrib/slim/distillation/__init__.py | 21 + .../distillation/distillation_strategy.py | 103 + .../contrib/slim/distillation/distiller.py | 276 + .../fluid/contrib/slim/graph/__init__.py | 20 + .../fluid/contrib/slim/graph/executor.py | 58 + .../fluid/contrib/slim/graph/graph_wrapper.py | 526 + .../paddle/fluid/contrib/slim/nas/__init__.py | 29 + .../contrib/slim/nas/controller_server.py | 107 + .../contrib/slim/nas/light_nas_strategy.py | 187 + python/paddle/fluid/contrib/slim/nas/lock.py | 36 + .../fluid/contrib/slim/nas/search_agent.py | 67 + .../fluid/contrib/slim/nas/search_space.py | 43 + .../fluid/contrib/slim/prune/__init__.py | 24 + .../contrib/slim/prune/auto_prune_strategy.py | 272 + .../contrib/slim/prune/prune_strategy.py | 958 ++ .../paddle/fluid/contrib/slim/prune/pruner.py | 107 + .../contrib/slim/quantization/__init__.py | 28 + .../mkldnn_post_training_strategy.py | 120 + .../quantization/quantization_mkldnn_pass.py | 256 + .../slim/quantization/quantization_pass.py | 1258 ++ .../quantization/quantization_strategy.py | 249 + .../fluid/contrib/slim/searcher/__init__.py | 18 + .../fluid/contrib/slim/searcher/controller.py | 147 + .../fluid/contrib/slim/tests/CMakeLists.txt | 158 + .../slim/tests/QAT_mkldnn_int8_readme.md | 76 + .../fluid/contrib/slim/tests/__init__.py | 13 + .../slim/tests/auto_pruning/compress.yaml | 30 + .../slim/tests/configs/filter_pruning.yaml | 34 + .../slim/tests/distillation/compress.yaml | 53 + .../slim/tests/filter_pruning/compress.yaml | 34 + .../slim/tests/light_nas/compress.yaml | 22 + .../slim/tests/light_nas/light_nas_space.py | 277 + .../slim/tests/light_nas/light_nasnet.py | 339 + .../fluid/contrib/slim/tests/mobilenet.py | 215 + .../contrib/slim/tests/qat_int8_comparison.py | 306 + .../slim/tests/quantization/compress.yaml | 50 + .../quantization/config_mkldnn_int8.yaml | 28 + ..._int8_mkldnn_post_training_quantization.md | 131 + .../contrib/slim/tests/test_auto_pruning.py | 86 + .../slim/tests/test_distillation_strategy.py | 96 + .../fluid/contrib/slim/tests/test_factory.py | 37 + .../contrib/slim/tests/test_filter_pruning.py | 89 + .../fluid/contrib/slim/tests/test_graph.py | 146 + .../contrib/slim/tests/test_graph_wrapper.py | 146 + .../contrib/slim/tests/test_light_nas.py | 66 + .../test_mkldnn_int8_quantization_strategy.py | 269 + .../tests/test_quantization_mkldnn_pass.py | 195 + .../slim/tests/test_quantization_pass.py | 486 + .../tests/test_quantization_scale_pass.py | 198 + .../slim/tests/test_quantization_strategy.py | 82 + .../paddle/fluid/contrib/tests/CMakeLists.txt | 6 + .../contrib/tests/test_distributed_reader.py | 44 + .../tests/test_image_classification_fp16.py | 303 + .../contrib/tests/test_quantize_transpiler.py | 281 + .../contrib/tests/test_weight_decay_extend.py | 151 + python/paddle/fluid/contrib/trainer.py | 1258 ++ python/paddle/fluid/contrib/utils/__init__.py | 23 + .../paddle/fluid/contrib/utils/hdfs_utils.py | 603 + .../fluid/contrib/utils/lookup_table_utils.py | 496 + python/paddle/fluid/core.py | 211 + python/paddle/fluid/data_feed_desc.py | 250 + python/paddle/fluid/data_feeder.py | 494 + python/paddle/fluid/dataset.py | 621 + python/paddle/fluid/debugger.py | 367 + python/paddle/fluid/default_scope_funcs.py | 103 + python/paddle/fluid/device_worker.py | 239 + .../paddle/fluid/distribute_lookup_table.py | 79 + python/paddle/fluid/distributed/__init__.py | 12 + python/paddle/fluid/distributed/downpour.py | 168 + python/paddle/fluid/distributed/fleet.py | 76 + python/paddle/fluid/distributed/helper.py | 85 + python/paddle/fluid/distributed/node.py | 179 + .../paddle/fluid/distributed/ps_instance.py | 160 + python/paddle/fluid/distributed/ps_pb2.py | 2426 +++ python/paddle/fluid/dygraph/__init__.py | 49 + .../paddle/fluid/dygraph/backward_strategy.py | 19 + python/paddle/fluid/dygraph/base.py | 201 + python/paddle/fluid/dygraph/checkpoint.py | 241 + .../fluid/dygraph/layer_object_helper.py | 223 + python/paddle/fluid/dygraph/layers.py | 274 + .../fluid/dygraph/learning_rate_scheduler.py | 486 + python/paddle/fluid/dygraph/nn.py | 2640 ++++ python/paddle/fluid/dygraph/parallel.py | 189 + .../paddle/fluid/dygraph/parallel_helper.py | 43 + python/paddle/fluid/dygraph/profiler.py | 30 + python/paddle/fluid/dygraph/tracer.py | 72 + python/paddle/fluid/dygraph_grad_clip.py | 287 + python/paddle/fluid/evaluator.py | 431 + python/paddle/fluid/executor.py | 1008 ++ python/paddle/fluid/framework.py | 3883 +++++ python/paddle/fluid/graphviz.py | 272 + python/paddle/fluid/incubate/__init__.py | 17 + .../fluid/incubate/data_generator/__init__.py | 375 + .../data_generator/test_data_generator.py | 36 + .../paddle/fluid/incubate/fleet/__init__.py | 14 + .../fluid/incubate/fleet/base/__init__.py | 12 + .../fluid/incubate/fleet/base/fleet_base.py | 341 + .../fluid/incubate/fleet/base/role_maker.py | 496 + .../incubate/fleet/collective/__init__.py | 283 + .../fleet/parameter_server/__init__.py | 13 + .../distribute_transpiler/__init__.py | 347 + .../fleet/parameter_server/pslib/__init__.py | 486 + .../fleet/parameter_server/pslib/node.py | 238 + .../pslib/optimizer_factory.py | 169 + .../fleet/parameter_server/pslib/ps_pb2.py | 2426 +++ .../incubate/fleet/tests/cluster_train.sh | 33 + .../fleet/tests/ctr_dataset_reader.py | 100 + .../incubate/fleet/tests/fleet_deep_ctr.py | 204 + .../fluid/incubate/fleet/utils/__init__.py | 13 + .../paddle/fluid/incubate/fleet/utils/hdfs.py | 510 + python/paddle/fluid/inferencer.py | 16 + python/paddle/fluid/initializer.py | 937 ++ python/paddle/fluid/install_check.py | 144 + python/paddle/fluid/io.py | 1380 ++ python/paddle/fluid/layer_helper.py | 175 + python/paddle/fluid/layer_helper_base.py | 383 + python/paddle/fluid/layers/__init__.py | 49 + python/paddle/fluid/layers/collective.py | 180 + python/paddle/fluid/layers/control_flow.py | 2196 +++ python/paddle/fluid/layers/detection.py | 2991 ++++ python/paddle/fluid/layers/device.py | 43 + python/paddle/fluid/layers/distributions.py | 398 + python/paddle/fluid/layers/io.py | 1292 ++ .../fluid/layers/layer_function_generator.py | 349 + .../fluid/layers/learning_rate_scheduler.py | 500 + python/paddle/fluid/layers/math_op_patch.py | 194 + python/paddle/fluid/layers/metric_op.py | 195 + python/paddle/fluid/layers/nn.py | 12607 ++++++++++++++++ python/paddle/fluid/layers/ops.py | 165 + python/paddle/fluid/layers/tensor.py | 1026 ++ python/paddle/fluid/layers/utils.py | 61 + python/paddle/fluid/lod_tensor.py | 166 + python/paddle/fluid/log_helper.py | 52 + python/paddle/fluid/metrics.py | 889 ++ python/paddle/fluid/net_drawer.py | 129 + python/paddle/fluid/nets.py | 533 + python/paddle/fluid/op.py | 292 + python/paddle/fluid/optimizer.py | 2954 ++++ python/paddle/fluid/parallel_executor.py | 350 + python/paddle/fluid/param_attr.py | 238 + python/paddle/fluid/profiler.py | 283 + python/paddle/fluid/reader.py | 675 + python/paddle/fluid/recordio_writer.py | 132 + python/paddle/fluid/regularizer.py | 270 + python/paddle/fluid/sampcd_processor.py | 561 + python/paddle/fluid/tests/.gitignore | 4 + python/paddle/fluid/tests/CMakeLists.txt | 14 + python/paddle/fluid/tests/__init__.py | 13 + python/paddle/fluid/tests/book/.gitignore | 1 + python/paddle/fluid/tests/book/CMakeLists.txt | 11 + python/paddle/fluid/tests/book/__init__.py | 13 + .../tests/book/high-level-api/CMakeLists.txt | 29 + .../high-level-api/cifar10_small_test_set.py | 106 + .../high-level-api/test_fit_a_line_new_api.py | 187 + ...est_image_classification_resnet_new_api.py | 194 + .../test_image_classification_vgg_new_api.py | 168 + .../test_label_semantic_roles_new_api.py | 276 + .../test_machine_translation_new_api.py | 335 + .../test_recognize_digits_conv_new_api.py | 166 + .../test_recognize_digits_mlp_new_api.py | 148 + .../test_recommender_system_new_api.py | 276 + .../test_understand_sentiment_conv_new_api.py | 166 + ...nderstand_sentiment_dynamic_rnn_new_api.py | 181 + ...derstand_sentiment_stacked_lstm_new_api.py | 176 + .../high-level-api/test_word2vec_new_api.py | 187 + .../tests/book/notest_understand_sentiment.py | 369 + .../fluid/tests/book/test_fit_a_line.py | 165 + .../tests/book/test_image_classification.py | 290 + .../tests/book/test_label_semantic_roles.py | 378 + .../tests/book/test_machine_translation.py | 347 + .../fluid/tests/book/test_recognize_digits.py | 275 + .../tests/book/test_recommender_system.py | 328 + .../tests/book/test_rnn_encoder_decoder.py | 283 + .../paddle/fluid/tests/book/test_word2vec.py | 280 + .../book_memory_optimization/CMakeLists.txt | 11 + .../test_memopt_image_classification_train.py | 168 + .../test_memopt_machine_translation.py | 139 + .../tests/demo/executor_train_dataset.py | 96 + python/paddle/fluid/tests/demo/fc_gan.py | 173 + .../fluid/tests/demo/file_reader/.gitignore | 1 + .../file_reader/convert_data_to_recordio.py | 63 + .../fluid/tests/demo/file_reader/train.py | 140 + .../paddle/fluid/tests/demo/pipeline_train.py | 508 + python/paddle/fluid/tests/demo/pyreader.py | 102 + .../fluid/tests/test_beam_search_decoder.py | 267 + .../paddle/fluid/tests/test_communicator.py | 32 + python/paddle/fluid/tests/test_cpp_reader.py | 94 + python/paddle/fluid/tests/test_data_feeder.py | 81 + python/paddle/fluid/tests/test_detection.py | 569 + python/paddle/fluid/tests/test_error_clip.py | 82 + python/paddle/fluid/tests/test_if_else_op.py | 225 + python/paddle/fluid/tests/test_lod_tensor.py | 130 + .../tests/test_python_operator_overriding.py | 78 + .../paddle/fluid/tests/unittests/.gitignore | 8 + .../fluid/tests/unittests/CMakeLists.txt | 272 + .../paddle/fluid/tests/unittests/__init__.py | 13 + .../paddle/fluid/tests/unittests/benchmark.py | 116 + .../fluid/tests/unittests/benchmark_sum_op.py | 84 + .../unittests/collective_allgather_op.py | 69 + .../unittests/collective_allreduce_op.py | 68 + .../unittests/collective_broadcast_op.py | 70 + .../unittests/collective_reducescatter_op.py | 70 + .../tests/unittests/ctr_dataset_reader.py | 100 + .../fluid/tests/unittests/decorator_helper.py | 45 + .../tests/unittests/dist_allreduce_op.py | 120 + .../paddle/fluid/tests/unittests/dist_ctr.py | 119 + .../fluid/tests/unittests/dist_ctr_reader.py | 172 + .../fluid/tests/unittests/dist_fleet_ctr.py | 163 + .../fluid/tests/unittests/dist_mnist.py | 112 + .../tests/unittests/dist_mnist_batch_merge.py | 80 + .../fluid/tests/unittests/dist_mnist_lars.py | 73 + .../fluid/tests/unittests/dist_save_load.py | 207 + .../fluid/tests/unittests/dist_se_resnext.py | 265 + .../fluid/tests/unittests/dist_simnet_bow.py | 255 + .../unittests/dist_text_classification.py | 231 + .../fluid/tests/unittests/dist_transformer.py | 1729 +++ .../fluid/tests/unittests/dist_word2vec.py | 128 + .../fluid/tests/unittests/fake_reader.py | 34 + .../fluid/tests/unittests/gradient_checker.py | 392 + .../unittests/ir_memory_optimize_net_base.py | 146 + .../tests/unittests/mkldnn/CMakeLists.txt | 6 + .../fluid/tests/unittests/mkldnn/__init__.py | 13 + .../tests/unittests/mkldnn/mkldnn_op_test.py | 86 + .../mkldnn/test_activation_mkldnn_op.py | 122 + .../mkldnn/test_batch_norm_mkldnn_op.py | 70 + .../mkldnn/test_concat_int8_mkldnn_op.py | 124 + .../unittests/mkldnn/test_concat_mkldnn_op.py | 61 + .../mkldnn/test_conv2d_int8_mkldnn_op.py | 352 + .../unittests/mkldnn/test_conv2d_mkldnn_op.py | 179 + .../mkldnn/test_conv2d_transpose_mkldnn_op.py | 103 + .../unittests/mkldnn/test_conv3d_mkldnn_op.py | 59 + .../mkldnn/test_dequantize_mkldnn_op.py | 73 + .../mkldnn/test_elementwise_add_mkldnn_op.py | 132 + .../mkldnn/test_elementwise_mul_mkldnn_op.py | 266 + .../unittests/mkldnn/test_fc_mkldnn_op.py | 71 + .../mkldnn/test_gaussian_random_mkldnn_op.py | 28 + .../unittests/mkldnn/test_lrn_mkldnn_op.py | 51 + .../mkldnn/test_mul_int8_mkldnn_op.py | 166 + .../mkldnn/test_pool2d_int8_mkldnn_op.py | 110 + .../unittests/mkldnn/test_pool2d_mkldnn_op.py | 57 + .../mkldnn/test_quantize_mkldnn_op.py | 76 + .../mkldnn/test_requantize_mkldnn_op.py | 93 + .../mkldnn/test_softmax_mkldnn_op.py | 57 + .../unittests/mkldnn/test_sum_mkldnn_op.py | 28 + .../mkldnn/test_transpose_int8_mkldnn_op.py | 78 + .../mkldnn/test_transpose_mkldnn_op.py | 66 + .../fluid/tests/unittests/multi_process.py | 35 + .../tests/unittests/ngraph/CMakeLists.txt | 6 + .../fluid/tests/unittests/ngraph/__init__.py | 13 + .../ngraph/test_accuracy_ngraph_op.py | 21 + .../ngraph/test_activation_ngraph_op.py | 48 + .../unittests/ngraph/test_adam_ngraph_op.py | 21 + .../unittests/ngraph/test_assign_ngraph_op.py | 22 + .../ngraph/test_batch_norm_ngraph_op.py | 21 + .../unittests/ngraph/test_cast_ngraph_op.py | 21 + .../ngraph/test_compare_ngraph_op.py | 23 + .../unittests/ngraph/test_concat_ngraph_op.py | 21 + .../unittests/ngraph/test_conv2d_ngraph_op.py | 54 + .../ngraph/test_cross_entropy_ngraph_op.py | 21 + .../ngraph/test_dropout_ngraph_op.py | 21 + .../ngraph/test_elementwise_add_ngraph_op.py | 21 + .../ngraph/test_elementwise_div_ngraph_op.py | 22 + .../ngraph/test_elementwise_max_ngraph_op.py | 22 + .../ngraph/test_elementwise_min_ngraph_op.py | 22 + .../ngraph/test_elementwise_mul_ngraph_op.py | 22 + .../ngraph/test_elementwise_pow_ngraph_op.py | 22 + .../ngraph/test_elementwise_sub_ngraph_op.py | 22 + .../ngraph/test_fill_constant_ngraph_op.py | 47 + .../ngraph/test_fill_zeros_like_ngraph_op.py | 20 + .../unittests/ngraph/test_gather_ngraph_op.py | 21 + .../ngraph/test_increment_ngraph_op.py | 45 + .../ngraph/test_layer_norm_ngraph_op.py | 30 + .../ngraph/test_logical_ngraph_op.py | 28 + .../ngraph/test_lookup_table_ngraph_op.py | 21 + .../unittests/ngraph/test_lrn_ngraph_op.py | 29 + .../unittests/ngraph/test_matmul_ngraph_op.py | 21 + .../unittests/ngraph/test_mean_ngraph_op.py | 20 + .../ngraph/test_momentum_ngraph_op.py | 21 + .../unittests/ngraph/test_mul_ngraph_op.py | 21 + .../unittests/ngraph/test_pool2d_ngraph_op.py | 39 + .../unittests/ngraph/test_reduce_ngraph_op.py | 37 + .../ngraph/test_reshape_ngraph_op.py | 23 + .../unittests/ngraph/test_scale_ngraph_op.py | 19 + .../unittests/ngraph/test_slice_ngraph_op.py | 22 + .../ngraph/test_softmax_ngraph_op.py | 20 + ...st_softmax_with_cross_entropy_ngraph_op.py | 20 + .../unittests/ngraph/test_stack_ngraph_op.py | 22 + .../unittests/ngraph/test_sum_ngraph_op.py | 19 + .../unittests/ngraph/test_top_k_ngraph_op.py | 20 + .../ngraph/test_transpose_ngraph_op.py | 22 + .../paddle/fluid/tests/unittests/op_test.py | 668 + .../tests/unittests/parallel_dygraph_mnist.py | 136 + .../unittests/parallel_dygraph_se_resnext.py | 313 + .../unittests/parallel_executor_test_base.py | 139 + .../fluid/tests/unittests/simple_nets.py | 94 + .../fluid/tests/unittests/test_accuracy_op.py | 60 + .../unittests/test_activation_nn_grad.py | 127 + .../tests/unittests/test_activation_op.py | 759 + .../fluid/tests/unittests/test_adadelta_op.py | 112 + .../fluid/tests/unittests/test_adagrad_op.py | 192 + .../fluid/tests/unittests/test_adam_op.py | 334 + .../fluid/tests/unittests/test_adamax_op.py | 188 + .../test_add_position_encoding_op.py | 134 + .../tests/unittests/test_affine_channel_op.py | 118 + .../tests/unittests/test_affine_grid_op.py | 79 + .../fluid/tests/unittests/test_allgather.py | 31 + .../fluid/tests/unittests/test_allreduce.py | 31 + .../unittests/test_anchor_generator_op.py | 112 + .../tests/unittests/test_arg_min_max_op.py | 92 + .../fluid/tests/unittests/test_argsort_op.py | 58 + .../unittests/test_array_read_write_op.py | 105 + .../fluid/tests/unittests/test_assign_op.py | 37 + .../tests/unittests/test_assign_value_op.py | 56 + .../test_async_ssa_graph_executor_mnist.py | 192 + .../tests/unittests/test_attention_lstm_op.py | 208 + .../fluid/tests/unittests/test_auc_op.py | 61 + .../unittests/test_auc_single_pred_op.py | 64 + .../test_backward_find_no_grad_vars.py | 57 + .../fluid/tests/unittests/test_base_layer.py | 79 + .../tests/unittests/test_basic_gru_api.py | 334 + .../tests/unittests/test_basic_gru_unit_op.py | 144 + .../tests/unittests/test_basic_lstm_api.py | 305 + .../unittests/test_basic_lstm_unit_op.py | 133 + .../tests/unittests/test_batch_norm_op.py | 523 + .../unittests/test_beam_search_decode_op.py | 114 + .../tests/unittests/test_beam_search_op.py | 100 + .../unittests/test_bilinear_interp_op.py | 352 + .../test_bilinear_tensor_product_op.py | 53 + .../unittests/test_bipartite_match_op.py | 159 + .../fluid/tests/unittests/test_box_clip_op.py | 70 + .../tests/unittests/test_box_coder_op.py | 239 + .../test_box_decoder_and_assign_op.py | 96 + .../fluid/tests/unittests/test_bpr_loss_op.py | 52 + .../fluid/tests/unittests/test_broadcast.py | 31 + .../test_buffer_shared_inplace_pass.py | 177 + .../tests/unittests/test_calc_gradient.py | 39 + .../fluid/tests/unittests/test_cast_op.py | 73 + .../tests/unittests/test_chunk_eval_op.py | 233 + .../tests/unittests/test_clip_by_norm_op.py | 123 + .../fluid/tests/unittests/test_clip_op.py | 74 + .../unittests/test_coalesce_tensor_op.py | 96 + .../test_collect_fpn_proposals_op.py | 100 + .../tests/unittests/test_collective_base.py | 273 + .../fluid/tests/unittests/test_compare_op.py | 49 + .../fluid/tests/unittests/test_compat.py | 505 + .../fluid/tests/unittests/test_concat_op.py | 93 + .../tests/unittests/test_conditional_block.py | 56 + .../fluid/tests/unittests/test_const_value.py | 30 + .../tests/unittests/test_conv2d_fusion_op.py | 190 + .../fluid/tests/unittests/test_conv2d_op.py | 464 + .../unittests/test_conv2d_transpose_op.py | 316 + .../fluid/tests/unittests/test_conv3d_op.py | 330 + .../unittests/test_conv3d_transpose_op.py | 276 + .../tests/unittests/test_conv_shift_op.py | 63 + .../fluid/tests/unittests/test_cos_sim_op.py | 109 + .../unittests/test_create_op_doc_string.py | 27 + .../tests/unittests/test_crf_decoding_op.py | 180 + .../fluid/tests/unittests/test_crop_op.py | 128 + .../tests/unittests/test_cross_entropy2_op.py | 82 + .../tests/unittests/test_cross_entropy_op.py | 306 + .../fluid/tests/unittests/test_ctc_align.py | 91 + .../fluid/tests/unittests/test_cumsum_op.py | 129 + .../fluid/tests/unittests/test_cvm_op.py | 132 + .../fluid/tests/unittests/test_dataset.py | 172 + .../fluid/tests/unittests/test_debugger.py | 62 + .../unittests/test_decayed_adagrad_op.py | 87 + .../unittests/test_decoupled_py_reader.py | 176 + .../test_decoupled_py_reader_data_check.py | 95 + .../unittests/test_default_scope_funcs.py | 49 + .../unittests/test_deformable_conv_op.py | 294 + .../test_deformable_psroi_pooling.py | 369 + .../unittests/test_density_prior_box_op.py | 150 + .../fluid/tests/unittests/test_desc_clone.py | 203 + .../tests/unittests/test_detection_map_op.py | 285 + .../fluid/tests/unittests/test_dgc_op.py | 138 + .../paddle/fluid/tests/unittests/test_diag.py | 43 + .../tests/unittests/test_dist_allreduce_op.py | 35 + .../fluid/tests/unittests/test_dist_base.py | 733 + .../fluid/tests/unittests/test_dist_ctr.py | 45 + .../tests/unittests/test_dist_fleet_base.py | 269 + .../tests/unittests/test_dist_fleet_ctr.py | 53 + .../fluid/tests/unittests/test_dist_mnist.py | 76 + .../test_dist_mnist_backward_deps.py | 35 + .../unittests/test_dist_mnist_batch_merge.py | 67 + .../unittests/test_dist_mnist_dgc_nccl.py | 35 + .../unittests/test_dist_mnist_hallreduce.py | 35 + .../tests/unittests/test_dist_mnist_lars.py | 30 + .../unittests/test_dist_mnist_multi_comm.py | 35 + .../tests/unittests/test_dist_mnist_pg.py | 40 + .../test_dist_mnist_ring_allreduce.py | 34 + .../tests/unittests/test_dist_save_load.py | 157 + .../tests/unittests/test_dist_se_resnext.py | 79 + .../unittests/test_dist_se_resnext_nccl.py | 64 + .../tests/unittests/test_dist_simnet_bow.py | 151 + .../test_dist_text_classification.py | 40 + .../fluid/tests/unittests/test_dist_train.py | 145 + .../tests/unittests/test_dist_transformer.py | 79 + .../tests/unittests/test_dist_transpiler.py | 993 ++ .../tests/unittests/test_dist_word2vec.py | 46 + .../test_distribute_fpn_proposals_op.py | 121 + .../tests/unittests/test_distributions.py | 377 + .../fluid/tests/unittests/test_dropout_op.py | 184 + .../unittests/test_dygraph_multi_forward.py | 200 + .../fluid/tests/unittests/test_dyn_rnn.py | 287 + .../unittests/test_dynrnn_gradient_check.py | 379 + .../unittests/test_dynrnn_static_input.py | 211 + .../test_eager_deletion_conditional_block.py | 23 + .../test_eager_deletion_delete_vars.py | 183 + .../test_eager_deletion_dynamic_rnn_base.py | 97 + .../unittests/test_eager_deletion_gru_net.py | 51 + .../unittests/test_eager_deletion_lstm_net.py | 52 + .../unittests/test_eager_deletion_mnist.py | 34 + .../test_eager_deletion_padding_rnn.py | 660 + .../test_eager_deletion_recurrent_op.py | 692 + .../test_eager_deletion_transformer.py | 26 + .../unittests/test_eager_deletion_while_op.py | 153 + .../tests/unittests/test_edit_distance_op.py | 187 + .../unittests/test_elementwise_add_op.py | 282 + .../unittests/test_elementwise_div_op.py | 151 + .../unittests/test_elementwise_floordiv_op.py | 69 + .../unittests/test_elementwise_gradient_op.py | 105 + .../unittests/test_elementwise_max_op.py | 132 + .../unittests/test_elementwise_min_op.py | 132 + .../unittests/test_elementwise_mod_op.py | 69 + .../unittests/test_elementwise_mul_op.py | 144 + .../unittests/test_elementwise_nn_grad.py | 247 + .../unittests/test_elementwise_pow_op.py | 45 + .../unittests/test_elementwise_sub_op.py | 121 + .../fluid/tests/unittests/test_exception.py | 35 + .../tests/unittests/test_executor_and_mul.py | 45 + .../test_executor_and_use_program_cache.py | 100 + .../fluid/tests/unittests/test_expand_op.py | 196 + .../unittests/test_fake_dequantize_op.py | 163 + .../tests/unittests/test_fake_init_op.py | 52 + .../tests/unittests/test_fake_quantize_op.py | 214 + .../fluid/tests/unittests/test_fc_op.py | 110 + .../tests/unittests/test_feed_fetch_method.py | 46 + .../fluid/tests/unittests/test_fetch_var.py | 39 + .../tests/unittests/test_fill_any_like_op.py | 81 + .../test_fill_constant_batch_size_like_op.py | 78 + .../tests/unittests/test_fill_constant_op.py | 108 + .../fluid/tests/unittests/test_fill_op.py | 40 + .../unittests/test_fill_zeros_like2_op.py | 50 + .../unittests/test_fill_zeros_like_op.py | 43 + .../fluid/tests/unittests/test_flatten_op.py | 73 + .../unittests/test_framework_debug_str.py | 29 + .../fluid/tests/unittests/test_fsp_op.py | 62 + .../fluid/tests/unittests/test_ftrl_op.py | 78 + .../unittests/test_fuse_all_reduce_pass.py | 133 + .../test_fuse_elewise_add_act_pass.py | 80 + .../unittests/test_fuse_optimizer_pass.py | 146 + .../test_fuse_relu_depthwise_conv_pass.py | 149 + .../test_fused_elemwise_activation_op.py | 330 + .../unittests/test_fused_emb_seq_pool_op.py | 51 + .../test_fused_embedding_fc_lstm_op.py | 218 + .../tests/unittests/test_fusion_gru_op.py | 141 + .../tests/unittests/test_fusion_lstm_op.py | 194 + .../test_fusion_repeated_fc_relu_op.py | 85 + .../test_fusion_seqconv_eltadd_relu_op.py | 94 + .../test_fusion_seqexpand_concat_fc_op.py | 139 + .../test_fusion_seqpool_concat_op.py | 118 + .../test_fusion_squared_mat_sub_op.py | 53 + ...test_fusion_transpose_flatten_concat_op.py | 105 + .../fluid/tests/unittests/test_gather_op.py | 110 + ...test_gaussian_random_batch_size_like_op.py | 48 + .../unittests/test_gaussian_random_op.py | 76 + .../unittests/test_generate_mask_labels_op.py | 421 + .../test_generate_proposal_labels_op.py | 357 + .../unittests/test_generate_proposals_op.py | 330 + .../tests/unittests/test_get_places_op.py | 34 + .../test_get_tensor_from_selected_rows_op.py | 65 + .../unittests/test_grad_clip_minimize.py | 254 + .../tests/unittests/test_gradient_clip.py | 162 + .../tests/unittests/test_grid_sampler_op.py | 123 + .../tests/unittests/test_group_norm_op.py | 143 + .../fluid/tests/unittests/test_gru_op.py | 233 + .../fluid/tests/unittests/test_gru_unit_op.py | 153 + .../fluid/tests/unittests/test_hash_op.py | 106 + .../tests/unittests/test_hinge_loss_op.py | 44 + .../fluid/tests/unittests/test_hsigmoid_op.py | 349 + .../test_hsigmoid_remote_table_op.py | 269 + .../tests/unittests/test_huber_loss_op.py | 64 + .../tests/unittests/test_im2sequence_op.py | 283 + .../test_image_classification_layer.py | 85 + .../tests/unittests/test_imperative_base.py | 31 + .../tests/unittests/test_imperative_basic.py | 410 + .../unittests/test_imperative_checkpoint.py | 168 + .../unittests/test_imperative_decorator.py | 67 + .../tests/unittests/test_imperative_deepcf.py | 293 + .../tests/unittests/test_imperative_gan.py | 228 + .../tests/unittests/test_imperative_gnn.py | 169 + .../tests/unittests/test_imperative_mnist.py | 236 + .../test_imperative_mnist_sorted_gradient.py | 149 + .../test_imperative_ocr_attention_model.py | 584 + .../unittests/test_imperative_optimizer.py | 250 + .../unittests/test_imperative_ptb_rnn.py | 345 + ...test_imperative_ptb_rnn_sorted_gradient.py | 165 + .../test_imperative_recurrent_usage.py | 92 + .../test_imperative_reinforcement.py | 175 + .../tests/unittests/test_imperative_resnet.py | 383 + .../test_imperative_resnet_sorted_gradient.py | 230 + .../test_imperative_save_load_optimizer.py | 196 + .../unittests/test_imperative_se_resnext.py | 476 + .../unittests/test_imperative_transformer.py | 1082 ++ ..._imperative_transformer_sorted_gradient.py | 361 + .../fluid/tests/unittests/test_infer_shape.py | 117 + .../unittests/test_inference_model_io.py | 150 + .../fluid/tests/unittests/test_initializer.py | 520 + ...test_inplace_softmax_with_cross_entropy.py | 137 + .../tests/unittests/test_install_check.py | 26 + .../tests/unittests/test_iou_similarity_op.py | 71 + .../fluid/tests/unittests/test_ir_graph.py | 146 + .../tests/unittests/test_ir_inplace_pass.py | 80 + .../test_ir_memory_optimize_ifelse_op.py | 123 + .../unittests/test_ir_memory_optimize_nlp.py | 55 + .../unittests/test_ir_memory_optimize_pass.py | 123 + .../test_ir_memory_optimize_transformer.py | 73 + .../fluid/tests/unittests/test_is_empty_op.py | 40 + .../fluid/tests/unittests/test_isfinite_op.py | 97 + .../tests/unittests/test_kldiv_loss_op.py | 82 + .../fluid/tests/unittests/test_l1_norm_op.py | 44 + .../tests/unittests/test_label_smooth_op.py | 57 + .../fluid/tests/unittests/test_lamb_op.py | 296 + .../fluid/tests/unittests/test_launch.sh | 30 + .../tests/unittests/test_layer_norm_op.py | 179 + .../fluid/tests/unittests/test_layers.py | 2133 +++ .../unittests/test_learning_rate_scheduler.py | 270 + .../unittests/test_linear_chain_crf_op.py | 161 + .../fluid/tests/unittests/test_linspace.py | 71 + .../unittests/test_listen_and_serv_op.py | 166 + .../unittests/test_lod_array_length_op.py | 37 + .../tests/unittests/test_lod_rank_table.py | 45 + .../tests/unittests/test_lod_reset_op.py | 105 + .../tests/unittests/test_lod_tensor_array.py | 54 + .../unittests/test_lod_tensor_array_ops.py | 223 + .../fluid/tests/unittests/test_log_loss_op.py | 49 + .../fluid/tests/unittests/test_logical_op.py | 51 + .../unittests/test_lookup_remote_table_op.py | 203 + .../unittests/test_lookup_sparse_table_op.py | 118 + .../tests/unittests/test_lookup_table_op.py | 154 + .../fluid/tests/unittests/test_lrn_op.py | 93 + .../tests/unittests/test_lstm_cudnn_op.py | 192 + .../fluid/tests/unittests/test_lstm_op.py | 318 + .../tests/unittests/test_lstm_unit_op.py | 54 + .../fluid/tests/unittests/test_lstmp_op.py | 319 + .../unittests/test_margin_rank_loss_op.py | 55 + .../tests/unittests/test_math_op_patch.py | 191 + .../fluid/tests/unittests/test_matmul_op.py | 174 + .../fluid/tests/unittests/test_maxout_op.py | 53 + .../fluid/tests/unittests/test_mean_iou.py | 116 + .../fluid/tests/unittests/test_mean_op.py | 60 + .../test_memory_optimization_transpiler.py | 118 + .../tests/unittests/test_memory_usage.py | 69 + .../tests/unittests/test_merge_ids_op.py | 53 + .../unittests/test_merge_selectedrows_op.py | 73 + .../fluid/tests/unittests/test_metrics.py | 49 + .../unittests/test_mine_hard_examples_op.py | 102 + .../fluid/tests/unittests/test_minus_op.py | 39 + .../test_mix_precision_all_reduce_fuse.py | 91 + .../unittests/test_modified_huber_loss_op.py | 64 + .../fluid/tests/unittests/test_momentum_op.py | 238 + .../fluid/tests/unittests/test_mul_op.py | 161 + .../tests/unittests/test_multi_file_reader.py | 81 + .../tests/unittests/test_multi_pass_reader.py | 69 + .../tests/unittests/test_multiclass_nms_op.py | 363 + .../unittests/test_multihead_attention.py | 100 + .../tests/unittests/test_multiplex_op.py | 61 + .../fluid/tests/unittests/test_name_scope.py | 45 + .../paddle/fluid/tests/unittests/test_nce.py | 223 + .../unittests/test_nce_remote_table_op.py | 236 + .../tests/unittests/test_nearest_interp_op.py | 293 + .../unittests/test_network_with_dtype.py | 76 + .../fluid/tests/unittests/test_nn_grad.py | 122 + .../fluid/tests/unittests/test_norm_op.py | 89 + .../unittests/test_normalization_wrapper.py | 99 + .../tests/unittests/test_npair_loss_op.py | 101 + .../fluid/tests/unittests/test_nvprof.py | 48 + .../fluid/tests/unittests/test_one_hot_op.py | 179 + .../tests/unittests/test_op_support_gpu.py | 28 + .../fluid/tests/unittests/test_operator.py | 220 + .../tests/unittests/test_operator_desc.py | 109 + .../fluid/tests/unittests/test_optimizer.py | 548 + .../fluid/tests/unittests/test_pad2d_op.py | 127 + .../tests/unittests/test_pad_constant_like.py | 69 + .../fluid/tests/unittests/test_pad_op.py | 71 + .../unittests/test_parallel_dygraph_mnist.py | 37 + .../test_parallel_dygraph_se_resnext.py | 35 + .../unittests/test_parallel_executor_crf.py | 243 + .../test_parallel_executor_crf_auto_growth.py | 19 + .../test_parallel_executor_drop_scope.py | 77 + .../test_parallel_executor_dry_run.py | 81 + .../test_parallel_executor_fetch_feed.py | 177 + .../unittests/test_parallel_executor_mnist.py | 191 + .../unittests/test_parallel_executor_pg.py | 82 + .../test_parallel_executor_seresnext.py | 396 + ...test_parallel_executor_test_while_train.py | 98 + .../test_parallel_executor_transformer.py | 184 + ...rallel_executor_transformer_auto_growth.py | 22 + .../fluid/tests/unittests/test_parameter.py | 51 + ...test_partial_eager_deletion_transformer.py | 26 + .../tests/unittests/test_pass_builder.py | 113 + .../fluid/tests/unittests/test_pipeline.py | 152 + .../tests/unittests/test_pixel_shuffle.py | 50 + .../unittests/test_polygon_box_transform.py | 70 + .../fluid/tests/unittests/test_pool2d_op.py | 361 + .../fluid/tests/unittests/test_pool3d_op.py | 408 + .../fluid/tests/unittests/test_pool_max_op.py | 253 + .../test_positive_negative_pair_op.py | 123 + .../unittests/test_precision_recall_op.py | 189 + .../fluid/tests/unittests/test_prelu_op.py | 82 + .../tests/unittests/test_preprocessor.py | 96 + .../fluid/tests/unittests/test_print_op.py | 125 + .../tests/unittests/test_prior_box_op.py | 191 + .../fluid/tests/unittests/test_profiler.py | 130 + .../fluid/tests/unittests/test_program.py | 137 + .../tests/unittests/test_program_code.py | 82 + .../fluid/tests/unittests/test_protobuf.py | 43 + .../tests/unittests/test_protobuf_descs.py | 217 + .../unittests/test_proximal_adagrad_op.py | 52 + .../tests/unittests/test_proximal_gd_op.py | 49 + .../tests/unittests/test_psroi_pool_op.py | 134 + .../fluid/tests/unittests/test_py_func_op.py | 190 + .../test_py_reader_lod_level_share.py | 43 + .../unittests/test_py_reader_pin_memory.py | 130 + .../unittests/test_py_reader_push_pop.py | 102 + .../test_py_reader_sample_generator.py | 137 + .../test_py_reader_using_executor.py | 266 + .../fluid/tests/unittests/test_pyreader.py | 84 + .../tests/unittests/test_random_crop_op.py | 49 + .../fluid/tests/unittests/test_range.py | 70 + .../tests/unittests/test_rank_loss_op.py | 48 + .../tests/unittests/test_reader_reset.py | 103 + .../tests/unittests/test_recordio_reader.py | 92 + .../tests/unittests/test_recurrent_op.py | 468 + .../fluid/tests/unittests/test_reduce_op.py | 401 + .../tests/unittests/test_reducescatter.py | 32 + .../unittests/test_ref_by_trainer_id_op.py | 36 + .../fluid/tests/unittests/test_registry.py | 33 + .../fluid/tests/unittests/test_regularizer.py | 235 + .../unittests/test_reorder_lod_tensor.py | 213 + .../fluid/tests/unittests/test_reshape_op.py | 133 + .../test_retinanet_detection_output.py | 412 + .../fluid/tests/unittests/test_reverse_op.py | 69 + .../fluid/tests/unittests/test_rmsprop_op.py | 226 + .../unittests/test_rnn_memory_helper_op.py | 137 + .../tests/unittests/test_roi_align_op.py | 174 + .../test_roi_perspective_transform_op.py | 318 + .../fluid/tests/unittests/test_roi_pool_op.py | 142 + .../fluid/tests/unittests/test_row_conv_op.py | 112 + .../unittests/test_rpn_target_assign_op.py | 397 + .../tests/unittests/test_sampling_id_op.py | 83 + .../fluid/tests/unittests/test_scale_op.py | 140 + .../fluid/tests/unittests/test_scatter_op.py | 135 + .../fluid/tests/unittests/test_scope.py | 53 + .../tests/unittests/test_selected_rows.py | 54 + .../fluid/tests/unittests/test_selu_op.py | 71 + .../fluid/tests/unittests/test_seq_conv.py | 245 + .../fluid/tests/unittests/test_seq_pool.py | 343 + .../tests/unittests/test_sequence_concat.py | 78 + .../unittests/test_sequence_enumerate_op.py | 116 + .../tests/unittests/test_sequence_erase_op.py | 113 + .../tests/unittests/test_sequence_expand.py | 138 + .../unittests/test_sequence_expand_as.py | 86 + .../tests/unittests/test_sequence_mask.py | 156 + .../tests/unittests/test_sequence_pad_op.py | 145 + .../tests/unittests/test_sequence_reshape.py | 85 + .../tests/unittests/test_sequence_reverse.py | 81 + .../unittests/test_sequence_scatter_op.py | 72 + .../tests/unittests/test_sequence_slice_op.py | 87 + .../unittests/test_sequence_softmax_op.py | 94 + .../tests/unittests/test_sequence_unpad_op.py | 89 + .../fluid/tests/unittests/test_sgd_op.py | 190 + .../fluid/tests/unittests/test_shape_op.py | 49 + .../tests/unittests/test_shard_index_op.py | 85 + .../tests/unittests/test_shrink_rnn_memory.py | 108 + .../unittests/test_shuffle_channel_op.py | 52 + ...st_sigmoid_cross_entropy_with_logits_op.py | 246 + .../unittests/test_sigmoid_focal_loss_op.py | 132 + .../fluid/tests/unittests/test_sign_op.py | 38 + .../unittests/test_similarity_focus_op.py | 217 + .../fluid/tests/unittests/test_size_op.py | 57 + .../fluid/tests/unittests/test_slice_op.py | 257 + .../fluid/tests/unittests/test_slice_var.py | 65 + .../tests/unittests/test_smooth_l1_loss_op.py | 109 + .../fluid/tests/unittests/test_softmax_op.py | 193 + .../test_softmax_with_cross_entropy_op.py | 412 + .../tests/unittests/test_space_to_depth_op.py | 135 + .../tests/unittests/test_spectral_norm_op.py | 122 + .../test_split_and_merge_lod_tensor_op.py | 186 + .../tests/unittests/test_split_ids_op.py | 93 + .../fluid/tests/unittests/test_split_op.py | 49 + .../unittests/test_split_selected_rows_op.py | 137 + .../fluid/tests/unittests/test_spp_op.py | 84 + .../unittests/test_squared_l2_distance_op.py | 87 + .../unittests/test_squared_l2_norm_op.py | 45 + .../fluid/tests/unittests/test_squeeze_op.py | 75 + .../fluid/tests/unittests/test_stack_op.py | 92 + .../fluid/tests/unittests/test_sum_op.py | 229 + .../fluid/tests/unittests/test_switch.py | 66 + .../unittests/test_sync_batch_norm_op.py | 160 + .../tests/unittests/test_target_assign_op.py | 166 + .../test_teacher_student_sigmoid_loss_op.py | 59 + .../tests/unittests/test_temporal_shift_op.py | 81 + .../fluid/tests/unittests/test_tensor.py | 261 + .../unittests/test_tensor_array_to_tensor.py | 142 + .../tests/unittests/test_tensor_to_numpy.py | 53 + .../fluid/tests/unittests/test_top_k_op.py | 155 + .../tests/unittests/test_transpose_op.py | 82 + .../tests/unittests/test_tree_conv_op.py | 120 + .../test_truncated_gaussian_random_op.py | 69 + .../fluid/tests/unittests/test_unfold_op.py | 102 + .../test_uniform_random_batch_size_like_op.py | 44 + .../tests/unittests/test_uniform_random_op.py | 85 + .../fluid/tests/unittests/test_unique.py | 72 + .../fluid/tests/unittests/test_unique_name.py | 45 + .../fluid/tests/unittests/test_unpool_op.py | 99 + .../tests/unittests/test_unsqueeze_op.py | 83 + .../fluid/tests/unittests/test_unstack_op.py | 81 + .../fluid/tests/unittests/test_variable.py | 187 + .../fluid/tests/unittests/test_version.py | 50 + .../fluid/tests/unittests/test_warpctc_op.py | 260 + .../tests/unittests/test_weight_decay.py | 181 + .../unittests/test_weight_normalization.py | 123 + .../fluid/tests/unittests/test_where.py | 92 + .../fluid/tests/unittests/test_while_op.py | 101 + .../fluid/tests/unittests/test_yolo_box_op.py | 117 + .../tests/unittests/test_yolov3_loss_op.py | 260 + .../paddle/fluid/tests/unittests/testsuite.py | 194 + .../tests/unittests/transformer_model.py | 493 + python/paddle/fluid/trainer_desc.py | 134 + python/paddle/fluid/trainer_factory.py | 43 + python/paddle/fluid/transpiler/__init__.py | 29 + python/paddle/fluid/transpiler/collective.py | 370 + .../fluid/transpiler/details/__init__.py | 20 + .../fluid/transpiler/details/checkport.py | 56 + .../fluid/transpiler/details/program_utils.py | 203 + .../paddle/fluid/transpiler/details/ufind.py | 66 + .../transpiler/details/vars_distributed.py | 269 + .../fluid/transpiler/distribute_transpiler.py | 2310 +++ .../fluid/transpiler/inference_transpiler.py | 661 + .../memory_optimization_transpiler.py | 624 + .../paddle/fluid/transpiler/ps_dispatcher.py | 110 + python/paddle/fluid/unique_name.py | 170 + python/paddle/fluid/wrapped_decorator.py | 30 + python/paddle/libs/__init__.py | 15 + python/paddle/reader/__init__.py | 71 + python/paddle/reader/creator.py | 96 + python/paddle/reader/decorator.py | 564 + python/paddle/reader/tests/CMakeLists.txt | 2 + python/paddle/reader/tests/__init__.py | 13 + python/paddle/reader/tests/creator_test.py | 75 + python/paddle/reader/tests/decorator_test.py | 223 + .../paddle/reader/tests/test_data_creator.txt | 3 + .../reader/tests/test_reader_recordio.dat | Bin 0 -> 76 bytes .../reader/tests/test_recordio_creator.dat | Bin 0 -> 88 bytes python/paddle/utils/__init__.py | 16 + python/paddle/utils/image_util.py | 224 + python/paddle/utils/plot.py | 116 + python/paddle/utils/plotcurve.py | 155 + python/paddle/utils/preprocess_img.py | 156 + python/paddle/utils/preprocess_util.py | 362 + python/paddle/utils/show_pb.py | 59 + python/paddle/utils/torch2paddle.py | 92 + python/requirements.txt | 18 + python/setup.py.in | 250 + tools/__init__.py | 13 + tools/aws_benchmarking/README.md | 184 + tools/aws_benchmarking/client/Dockerfile | 7 + .../client/cluster_launcher.py | 415 + .../aws_benchmarking/client/requirements.txt | 6 + tools/aws_benchmarking/diagram.png | Bin 0 -> 40790 bytes tools/aws_benchmarking/server/Dockerfile | 7 + .../aws_benchmarking/server/cluster_master.py | 735 + tools/aws_benchmarking/server/logs/master.log | 0 .../server/pserver.sh.template | 2 + .../aws_benchmarking/server/requirements.txt | 4 + .../server/trainer.sh.template | 2 + tools/check_ctest_hung.py | 55 + tools/check_pr_approval.py | 49 + tools/codestyle/.gitignore | 1 + tools/codestyle/clang_format.hook | 15 + tools/codestyle/copyright.hook | 121 + tools/codestyle/cpplint_pre_commit.hook | 27 + tools/codestyle/docstring_checker.py | 349 + tools/codestyle/pylint_pre_commit.hook | 19 + tools/codestyle/test_docstring_checker.py | 232 + tools/continuous_integration/bisect.py | 141 + tools/diff_api.py | 35 + tools/diff_use_default_grad_op_maker.py | 66 + tools/document_preview.sh | 17 + ...generate_op_use_grad_op_desc_maker_spec.py | 29 + tools/manylinux1/Dockerfile.x64 | 64 + tools/manylinux1/README.md | 65 + tools/manylinux1/build_all.sh | 31 + tools/manylinux1/build_scripts/build.sh | 161 + tools/manylinux1/build_scripts/build_utils.sh | 195 + .../manylinux1/build_scripts/install_nccl2.sh | 27 + .../build_scripts/manylinux1-check.py | 70 + .../build_scripts/python-tag-abi-tag.py | 21 + tools/manylinux1/build_scripts/ssl-check.py | 46 + tools/print_signatures.py | 99 + tools/test_runner.py | 53 + tools/timeline.py | 306 + 3027 files changed, 486398 insertions(+), 74 deletions(-) create mode 100644 AUTHORS.md create mode 100644 CMakeLists.txt create mode 100644 CODE_OF_CONDUCT.md create mode 100644 CODE_OF_CONDUCT_cn.md create mode 100644 CONTRIBUTING.md create mode 100644 Dockerfile create mode 100644 ISSUE_TEMPLATE.md create mode 100644 LICENSE create mode 100644 README_cn.md create mode 100644 RELEASE.md create mode 100644 cmake/FindGperftools.cmake create mode 100644 cmake/FindNumPy.cmake create mode 100644 cmake/anakin_subgraph.cmake create mode 100644 cmake/cblas.cmake create mode 100644 cmake/ccache.cmake create mode 100644 cmake/configure.cmake create mode 100644 cmake/coveralls.cmake create mode 100644 cmake/coverallsGcovJsons.cmake create mode 100644 cmake/cuda.cmake create mode 100644 cmake/cudnn.cmake create mode 100644 cmake/cupti.cmake create mode 100644 cmake/external/boost.cmake create mode 100644 cmake/external/brpc.cmake create mode 100644 cmake/external/cares.cmake create mode 100644 cmake/external/cub.cmake create mode 100644 cmake/external/dgc.cmake create mode 100644 cmake/external/dlpack.cmake create mode 100644 cmake/external/eigen.cmake create mode 100644 cmake/external/gflags.cmake create mode 100644 cmake/external/glog.cmake create mode 100644 cmake/external/grpc.cmake create mode 100644 cmake/external/gtest.cmake create mode 100644 cmake/external/leveldb.cmake create mode 100644 cmake/external/libmct.cmake create mode 100644 cmake/external/libxsmm.cmake create mode 100644 cmake/external/mkldnn.cmake create mode 100644 cmake/external/mklml.cmake create mode 100644 cmake/external/ngraph.cmake create mode 100644 cmake/external/openblas.cmake create mode 100644 cmake/external/protobuf.cmake create mode 100644 cmake/external/pslib.cmake create mode 100644 cmake/external/pslib_brpc.cmake create mode 100644 cmake/external/pybind11.cmake create mode 100644 cmake/external/python.cmake create mode 100644 cmake/external/rocprim.cmake create mode 100644 cmake/external/snappy.cmake create mode 100644 cmake/external/snappystream.cmake create mode 100644 cmake/external/threadpool.cmake create mode 100644 cmake/external/warpctc.cmake create mode 100644 cmake/external/xbyak.cmake create mode 100644 cmake/external/xxhash.cmake create mode 100644 cmake/external/zlib.cmake create mode 100644 cmake/flags.cmake create mode 100644 cmake/generic.cmake create mode 100644 cmake/hip.cmake create mode 100644 cmake/inference_lib.cmake create mode 100644 cmake/make_resource.py create mode 100644 cmake/operators.cmake create mode 100644 cmake/package.cmake create mode 100644 cmake/python_module.cmake create mode 100644 cmake/simd.cmake create mode 100644 cmake/system.cmake create mode 100644 cmake/tensorrt.cmake create mode 100644 cmake/util.cmake create mode 100644 cmake/version.cmake create mode 100644 doc/README.md create mode 100644 go/glide.lock create mode 100644 go/glide.yaml create mode 100644 paddle/.common_test_util.sh create mode 100644 paddle/.gitignore create mode 100755 paddle/.set_port.sh create mode 100755 paddle/.set_python_path.sh create mode 100644 paddle/CMakeLists.txt create mode 100644 paddle/contrib/float16/.gitignore create mode 100644 paddle/contrib/float16/README.md create mode 100644 paddle/contrib/float16/float16_benchmark.md create mode 100644 paddle/contrib/float16/float16_inference_demo.py create mode 100644 paddle/contrib/float16/float16_transpiler.py create mode 100755 paddle/contrib/float16/run_float16_demo.sh create mode 100644 paddle/fluid/.clang-format create mode 100644 paddle/fluid/API.spec create mode 100644 paddle/fluid/CMakeLists.txt create mode 100644 paddle/fluid/framework/.gitignore create mode 100644 paddle/fluid/framework/CMakeLists.txt create mode 100644 paddle/fluid/framework/archive.h create mode 100644 paddle/fluid/framework/array.h create mode 100644 paddle/fluid/framework/async_executor.cc create mode 100644 paddle/fluid/framework/async_executor.h create mode 100644 paddle/fluid/framework/attribute.cc create mode 100644 paddle/fluid/framework/attribute.h create mode 100644 paddle/fluid/framework/block_desc.cc create mode 100644 paddle/fluid/framework/block_desc.h create mode 100644 paddle/fluid/framework/blocking_queue.h create mode 100644 paddle/fluid/framework/channel.h create mode 100644 paddle/fluid/framework/commit.h create mode 100644 paddle/fluid/framework/commit.h.in create mode 100644 paddle/fluid/framework/data_device_transform.cc create mode 100644 paddle/fluid/framework/data_device_transform.h create mode 100644 paddle/fluid/framework/data_device_transform_test.cu create mode 100644 paddle/fluid/framework/data_feed.cc create mode 100644 paddle/fluid/framework/data_feed.h create mode 100644 paddle/fluid/framework/data_feed.proto create mode 100644 paddle/fluid/framework/data_feed_factory.cc create mode 100644 paddle/fluid/framework/data_feed_factory.h create mode 100644 paddle/fluid/framework/data_feed_test.cc create mode 100644 paddle/fluid/framework/data_layout.h create mode 100644 paddle/fluid/framework/data_layout_transform.cc create mode 100644 paddle/fluid/framework/data_layout_transform.h create mode 100644 paddle/fluid/framework/data_layout_transform_test.cc create mode 100644 paddle/fluid/framework/data_set.cc create mode 100644 paddle/fluid/framework/data_set.h create mode 100644 paddle/fluid/framework/data_transform.cc create mode 100644 paddle/fluid/framework/data_transform.h create mode 100644 paddle/fluid/framework/data_type.cc create mode 100644 paddle/fluid/framework/data_type.h create mode 100644 paddle/fluid/framework/data_type_test.cc create mode 100644 paddle/fluid/framework/data_type_transform.cc create mode 120000 paddle/fluid/framework/data_type_transform.cu create mode 100644 paddle/fluid/framework/data_type_transform.h create mode 100644 paddle/fluid/framework/data_type_transform_test.cc create mode 100644 paddle/fluid/framework/data_type_transform_test.cu create mode 100644 paddle/fluid/framework/dataset_factory.cc create mode 100644 paddle/fluid/framework/dataset_factory.h create mode 100644 paddle/fluid/framework/ddim.cc create mode 100644 paddle/fluid/framework/ddim.h create mode 100644 paddle/fluid/framework/ddim_test.cc create mode 100644 paddle/fluid/framework/details/CMakeLists.txt create mode 100644 paddle/fluid/framework/details/all_reduce_op_handle.cc create mode 100644 paddle/fluid/framework/details/all_reduce_op_handle.h create mode 100644 paddle/fluid/framework/details/async_ssa_graph_executor.cc create mode 100644 paddle/fluid/framework/details/async_ssa_graph_executor.h create mode 100644 paddle/fluid/framework/details/broadcast_op_handle.cc create mode 100644 paddle/fluid/framework/details/broadcast_op_handle.h create mode 100644 paddle/fluid/framework/details/broadcast_op_handle_test.cc create mode 100644 paddle/fluid/framework/details/broadcast_op_handle_test.h create mode 100644 paddle/fluid/framework/details/build_strategy.cc create mode 100644 paddle/fluid/framework/details/build_strategy.h create mode 100644 paddle/fluid/framework/details/computation_op_handle.cc create mode 100644 paddle/fluid/framework/details/computation_op_handle.h create mode 100644 paddle/fluid/framework/details/container_cast.h create mode 100644 paddle/fluid/framework/details/cow_ptr.h create mode 100644 paddle/fluid/framework/details/cow_ptr_test.cc create mode 100644 paddle/fluid/framework/details/dgc_const_values.h create mode 100644 paddle/fluid/framework/details/eager_deletion_op_handle.cc create mode 100644 paddle/fluid/framework/details/eager_deletion_op_handle.h create mode 100644 paddle/fluid/framework/details/exception_holder.h create mode 100644 paddle/fluid/framework/details/execution_strategy.h create mode 100644 paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc create mode 100644 paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h create mode 100644 paddle/fluid/framework/details/fetch_barrier_op_handle.cc create mode 100644 paddle/fluid/framework/details/fetch_barrier_op_handle.h create mode 100644 paddle/fluid/framework/details/fetch_op_handle.cc create mode 100644 paddle/fluid/framework/details/fetch_op_handle.h create mode 100644 paddle/fluid/framework/details/fused_all_reduce_op_handle.cc create mode 100644 paddle/fluid/framework/details/fused_all_reduce_op_handle.h create mode 100644 paddle/fluid/framework/details/fused_broadcast_op_handle.cc create mode 100644 paddle/fluid/framework/details/fused_broadcast_op_handle.h create mode 100644 paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc create mode 100644 paddle/fluid/framework/details/gather_op_handle.cc create mode 100644 paddle/fluid/framework/details/gather_op_handle.h create mode 100644 paddle/fluid/framework/details/gather_op_handle_test.cc create mode 100644 paddle/fluid/framework/details/graph_test_base.h create mode 100644 paddle/fluid/framework/details/multi_devices_helper.cc create mode 100644 paddle/fluid/framework/details/multi_devices_helper.h create mode 100644 paddle/fluid/framework/details/nccl_op_handle.h create mode 100644 paddle/fluid/framework/details/op_handle_base.cc create mode 100644 paddle/fluid/framework/details/op_handle_base.h create mode 100644 paddle/fluid/framework/details/op_registry.h create mode 100644 paddle/fluid/framework/details/parallel_ssa_graph_executor.cc create mode 100644 paddle/fluid/framework/details/parallel_ssa_graph_executor.h create mode 100644 paddle/fluid/framework/details/reduce_and_gather.h create mode 100644 paddle/fluid/framework/details/reduce_op_handle.cc create mode 100644 paddle/fluid/framework/details/reduce_op_handle.h create mode 100644 paddle/fluid/framework/details/reduce_op_handle_test.cc create mode 100644 paddle/fluid/framework/details/rpc_op_handle.cc create mode 100644 paddle/fluid/framework/details/rpc_op_handle.h create mode 100644 paddle/fluid/framework/details/scale_loss_grad_op_handle.cc create mode 100644 paddle/fluid/framework/details/scale_loss_grad_op_handle.h create mode 100644 paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc create mode 100644 paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h create mode 100644 paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc create mode 100644 paddle/fluid/framework/details/share_tensor_buffer_op_handle.h create mode 100644 paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc create mode 100644 paddle/fluid/framework/details/sparse_all_reduce_op_handle.h create mode 100644 paddle/fluid/framework/details/ssa_graph_executor.cc create mode 100644 paddle/fluid/framework/details/ssa_graph_executor.h create mode 100644 paddle/fluid/framework/details/threaded_ssa_graph_executor.cc create mode 100644 paddle/fluid/framework/details/threaded_ssa_graph_executor.h create mode 100644 paddle/fluid/framework/details/var_handle.cc create mode 100644 paddle/fluid/framework/details/var_handle.h create mode 100644 paddle/fluid/framework/details/variable_visitor.cc create mode 100644 paddle/fluid/framework/details/variable_visitor.h create mode 100644 paddle/fluid/framework/device_worker.cc create mode 100644 paddle/fluid/framework/device_worker.h create mode 100644 paddle/fluid/framework/device_worker_factory.cc create mode 100644 paddle/fluid/framework/device_worker_factory.h create mode 100644 paddle/fluid/framework/device_worker_test.cc create mode 100644 paddle/fluid/framework/dim.h create mode 100644 paddle/fluid/framework/dim_test.cu create mode 100644 paddle/fluid/framework/dist_multi_trainer.cc create mode 100644 paddle/fluid/framework/dlpack_tensor.cc create mode 100644 paddle/fluid/framework/dlpack_tensor.h create mode 100644 paddle/fluid/framework/dlpack_tensor_test.cc create mode 100644 paddle/fluid/framework/downpour_worker.cc create mode 100644 paddle/fluid/framework/eigen.h create mode 100644 paddle/fluid/framework/eigen_test.cc create mode 100644 paddle/fluid/framework/executor.cc create mode 100644 paddle/fluid/framework/executor.h create mode 100644 paddle/fluid/framework/executor_gc_helper.cc create mode 100644 paddle/fluid/framework/executor_gc_helper.h create mode 100644 paddle/fluid/framework/executor_thread_worker.cc create mode 100644 paddle/fluid/framework/executor_thread_worker.h create mode 100644 paddle/fluid/framework/expect.h create mode 100644 paddle/fluid/framework/feed_fetch_method.cc create mode 100644 paddle/fluid/framework/feed_fetch_method.h create mode 100644 paddle/fluid/framework/feed_fetch_type.h create mode 100644 paddle/fluid/framework/fleet/CMakeLists.txt create mode 100644 paddle/fluid/framework/fleet/fleet_wrapper.cc create mode 100644 paddle/fluid/framework/fleet/fleet_wrapper.h create mode 100644 paddle/fluid/framework/fleet/nccl_wrapper.cc create mode 100644 paddle/fluid/framework/fleet/nccl_wrapper.h create mode 100644 paddle/fluid/framework/framework.proto create mode 100644 paddle/fluid/framework/garbage_collector.cc create mode 100644 paddle/fluid/framework/garbage_collector.h create mode 100644 paddle/fluid/framework/grad_op_desc_maker.h create mode 100644 paddle/fluid/framework/hogwild_worker.cc create mode 100644 paddle/fluid/framework/inlined_vector.h create mode 100644 paddle/fluid/framework/inlined_vector_test.cc create mode 100644 paddle/fluid/framework/inplace_op_inference.h create mode 100644 paddle/fluid/framework/inplace_op_inference_test.cc create mode 100644 paddle/fluid/framework/io/CMakeLists.txt create mode 100644 paddle/fluid/framework/io/fs.cc create mode 100644 paddle/fluid/framework/io/fs.h create mode 100644 paddle/fluid/framework/io/shell.cc create mode 100644 paddle/fluid/framework/io/shell.h create mode 100644 paddle/fluid/framework/ir/CMakeLists.txt create mode 100644 paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/attention_lstm_fuse_pass.h create mode 100644 paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc create mode 100644 paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h create mode 100644 paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h create mode 100644 paddle/fluid/framework/ir/conv_bn_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/conv_bn_fuse_pass.h create mode 100644 paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc create mode 100644 paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h create mode 100644 paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h create mode 100644 paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h create mode 100644 paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc create mode 100644 paddle/fluid/framework/ir/delete_quant_dequant_op_pass.h create mode 100644 paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h create mode 100644 paddle/fluid/framework/ir/fc_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/fc_fuse_pass.h create mode 100644 paddle/fluid/framework/ir/fc_fuse_pass_tester.cc create mode 100644 paddle/fluid/framework/ir/fc_gru_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/fc_gru_fuse_pass.h create mode 100644 paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/fc_lstm_fuse_pass.h create mode 100644 paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.cc create mode 100644 paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.h create mode 100644 paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc create mode 100644 paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h create mode 100644 paddle/fluid/framework/ir/fuse_optimizer_ops_pass/CMakeLists.txt create mode 100644 paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc create mode 100644 paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc create mode 100644 paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc create mode 100644 paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h create mode 100644 paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc create mode 100644 paddle/fluid/framework/ir/fuse_pass_base.cc create mode 100644 paddle/fluid/framework/ir/fuse_pass_base.h create mode 100644 paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc create mode 100644 paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h create mode 100644 paddle/fluid/framework/ir/graph.cc create mode 100644 paddle/fluid/framework/ir/graph.h create mode 100644 paddle/fluid/framework/ir/graph_helper.cc create mode 100644 paddle/fluid/framework/ir/graph_helper.h create mode 100644 paddle/fluid/framework/ir/graph_helper_test.cc create mode 100644 paddle/fluid/framework/ir/graph_pattern_detector.cc create mode 100644 paddle/fluid/framework/ir/graph_pattern_detector.h create mode 100644 paddle/fluid/framework/ir/graph_pattern_detector_tester.cc create mode 100644 paddle/fluid/framework/ir/graph_test.cc create mode 100644 paddle/fluid/framework/ir/graph_to_program_pass.cc create mode 100644 paddle/fluid/framework/ir/graph_to_program_pass.h create mode 100644 paddle/fluid/framework/ir/graph_to_program_pass_test.cc create mode 100644 paddle/fluid/framework/ir/graph_traits.cc create mode 100644 paddle/fluid/framework/ir/graph_traits.h create mode 100644 paddle/fluid/framework/ir/graph_viz_pass.cc create mode 100644 paddle/fluid/framework/ir/graph_viz_pass.h create mode 100644 paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc create mode 100644 paddle/fluid/framework/ir/identity_scale_op_clean_pass.h create mode 100644 paddle/fluid/framework/ir/infer_clean_graph_pass.cc create mode 100644 paddle/fluid/framework/ir/is_test_pass.cc create mode 100644 paddle/fluid/framework/ir/is_test_pass.h create mode 100644 paddle/fluid/framework/ir/is_test_pass_tester.cc create mode 100644 paddle/fluid/framework/ir/lock_free_optimize_pass.cc create mode 100644 paddle/fluid/framework/ir/lock_free_optimize_pass.h create mode 100644 paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt create mode 100644 paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc create mode 100644 paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc create mode 100644 paddle/fluid/framework/ir/memory_optimize_pass/inplace_op_pass.cc create mode 100644 paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h create mode 100644 paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.cc create mode 100644 paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h create mode 100644 paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper_test.cc create mode 100644 paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.cc create mode 100644 paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.h create mode 100644 paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc create mode 100644 paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h create mode 100644 paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc create mode 100644 paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h create mode 100644 paddle/fluid/framework/ir/memory_optimize_pass/record_skip_memory_opt_vars_pass.cc create mode 100644 paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc create mode 100644 paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.h create mode 100644 paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass.cc create mode 100644 paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.cc create mode 100644 paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h create mode 100644 paddle/fluid/framework/ir/memory_optimize_pass/while_op_eager_deletion_pass.cc create mode 100644 paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h create mode 100644 paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc create mode 100644 paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass.h create mode 100644 paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass_tester.cc create mode 100644 paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h create mode 100644 paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc create mode 100644 paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h create mode 100644 paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc create mode 100644 paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h create mode 100644 paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc create mode 100644 paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc create mode 100644 paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h create mode 100644 paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc create mode 100644 paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc create mode 100644 paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h create mode 100644 paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc create mode 100644 paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc create mode 100644 paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h create mode 100644 paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc create mode 100644 paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc create mode 100644 paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h create mode 100644 paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc create mode 100644 paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc create mode 100644 paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.h create mode 100644 paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc create mode 100644 paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h create mode 100644 paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc create mode 100644 paddle/fluid/framework/ir/multi_batch_merge_pass.cc create mode 100644 paddle/fluid/framework/ir/multi_batch_merge_pass.h create mode 100644 paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt create mode 100644 paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc create mode 100644 paddle/fluid/framework/ir/multi_devices_graph_pass/backward_optimizer_op_deps_pass.cc create mode 100644 paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc create mode 100644 paddle/fluid/framework/ir/multi_devices_graph_pass/modify_op_lock_and_record_event_pass.cc create mode 100644 paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_check_pass.cc create mode 100644 paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc create mode 100644 paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h create mode 100644 paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.cc create mode 100644 paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.h create mode 100644 paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc create mode 100644 paddle/fluid/framework/ir/ngraph_subgraph_pass.cc create mode 100644 paddle/fluid/framework/ir/ngraph_subgraph_pass.h create mode 100644 paddle/fluid/framework/ir/node.cc create mode 100644 paddle/fluid/framework/ir/node.h create mode 100644 paddle/fluid/framework/ir/node_test.cc create mode 100644 paddle/fluid/framework/ir/pass.cc create mode 100644 paddle/fluid/framework/ir/pass.h create mode 100644 paddle/fluid/framework/ir/pass_builder.cc create mode 100644 paddle/fluid/framework/ir/pass_builder.h create mode 100644 paddle/fluid/framework/ir/pass_test.cc create mode 100644 paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.h create mode 100644 paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h create mode 100644 paddle/fluid/framework/ir/runtime_context_cache_pass.cc create mode 100644 paddle/fluid/framework/ir/runtime_context_cache_pass.h create mode 100644 paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h create mode 100644 paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h create mode 100644 paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h create mode 100644 paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc create mode 100644 paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc create mode 100644 paddle/fluid/framework/ir/shuffle_channel_detect_pass.h create mode 100644 paddle/fluid/framework/ir/simplify_anakin_priorbox_detection_out_pass.cc create mode 100644 paddle/fluid/framework/ir/simplify_anakin_priorbox_detection_out_pass.h create mode 100644 paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h create mode 100644 paddle/fluid/framework/ir/sync_batch_norm_pass.cc create mode 100644 paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc create mode 100644 paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h create mode 100644 paddle/fluid/framework/library_type.h create mode 100644 paddle/fluid/framework/lod_rank_table.cc create mode 100644 paddle/fluid/framework/lod_rank_table.h create mode 100644 paddle/fluid/framework/lod_tensor.cc create mode 100644 paddle/fluid/framework/lod_tensor.h create mode 100644 paddle/fluid/framework/lod_tensor_array.h create mode 100644 paddle/fluid/framework/lod_tensor_test.cc create mode 100644 paddle/fluid/framework/lod_tensor_test.cu create mode 100644 paddle/fluid/framework/mixed_vector.h create mode 100644 paddle/fluid/framework/mixed_vector_test.cc create mode 100644 paddle/fluid/framework/mixed_vector_test.cu create mode 100644 paddle/fluid/framework/multi_trainer.cc create mode 100644 paddle/fluid/framework/naive_executor.cc create mode 100644 paddle/fluid/framework/naive_executor.h create mode 100644 paddle/fluid/framework/naive_executor_test.cc create mode 100644 paddle/fluid/framework/no_need_buffer_vars_inference.h create mode 100644 paddle/fluid/framework/op_desc.cc create mode 100644 paddle/fluid/framework/op_desc.h create mode 100644 paddle/fluid/framework/op_info.cc create mode 100644 paddle/fluid/framework/op_info.h create mode 100644 paddle/fluid/framework/op_kernel_type.cc create mode 100644 paddle/fluid/framework/op_kernel_type.h create mode 100644 paddle/fluid/framework/op_kernel_type_test.cc create mode 100644 paddle/fluid/framework/op_proto_maker.cc create mode 100644 paddle/fluid/framework/op_proto_maker.h create mode 100644 paddle/fluid/framework/op_proto_maker_test.cc create mode 100644 paddle/fluid/framework/op_registry.cc create mode 100644 paddle/fluid/framework/op_registry.h create mode 100644 paddle/fluid/framework/op_registry_test.cc create mode 100644 paddle/fluid/framework/operator.cc create mode 100644 paddle/fluid/framework/operator.h create mode 100644 paddle/fluid/framework/operator_kernel_configs.h create mode 100644 paddle/fluid/framework/operator_test.cc create mode 100644 paddle/fluid/framework/parallel_executor.cc create mode 100644 paddle/fluid/framework/parallel_executor.h create mode 100644 paddle/fluid/framework/pipeline_trainer.cc create mode 100644 paddle/fluid/framework/program_desc.cc create mode 100644 paddle/fluid/framework/program_desc.h create mode 100644 paddle/fluid/framework/program_desc_test.cc create mode 100644 paddle/fluid/framework/proto_desc.h create mode 100644 paddle/fluid/framework/prune.cc create mode 100644 paddle/fluid/framework/prune.h create mode 100644 paddle/fluid/framework/prune_test.cc create mode 100644 paddle/fluid/framework/pull_dense_worker.cc create mode 100644 paddle/fluid/framework/python_headers.h create mode 100644 paddle/fluid/framework/reader.cc create mode 100644 paddle/fluid/framework/reader.h create mode 100644 paddle/fluid/framework/reader_test.cc create mode 100644 paddle/fluid/framework/revision.cc create mode 100644 paddle/fluid/framework/revision.h create mode 100644 paddle/fluid/framework/rw_lock.h create mode 100644 paddle/fluid/framework/rw_lock_test.cc create mode 100644 paddle/fluid/framework/scope.cc create mode 100644 paddle/fluid/framework/scope.h create mode 100644 paddle/fluid/framework/scope_pool.cc create mode 100644 paddle/fluid/framework/scope_pool.h create mode 100644 paddle/fluid/framework/scope_test.cc create mode 100644 paddle/fluid/framework/section_worker.cc create mode 100644 paddle/fluid/framework/selected_rows.cc create mode 100644 paddle/fluid/framework/selected_rows.h create mode 100644 paddle/fluid/framework/selected_rows_test.cc create mode 100644 paddle/fluid/framework/shape_inference.cc create mode 100644 paddle/fluid/framework/shape_inference.h create mode 100644 paddle/fluid/framework/tensor.cc create mode 100644 paddle/fluid/framework/tensor.h create mode 100644 paddle/fluid/framework/tensor_impl.h create mode 100644 paddle/fluid/framework/tensor_test.cc create mode 100644 paddle/fluid/framework/tensor_util.cc create mode 120000 paddle/fluid/framework/tensor_util.cu create mode 100644 paddle/fluid/framework/tensor_util.h create mode 100644 paddle/fluid/framework/tensor_util_test.cc create mode 100644 paddle/fluid/framework/tensor_util_test.cu create mode 100644 paddle/fluid/framework/threadpool.cc create mode 100644 paddle/fluid/framework/threadpool.h create mode 100644 paddle/fluid/framework/threadpool_test.cc create mode 100644 paddle/fluid/framework/trainer.cc create mode 100644 paddle/fluid/framework/trainer.h create mode 100644 paddle/fluid/framework/trainer_desc.proto create mode 100644 paddle/fluid/framework/trainer_factory.cc create mode 100644 paddle/fluid/framework/trainer_factory.h create mode 100644 paddle/fluid/framework/trainer_test.cc create mode 100644 paddle/fluid/framework/transfer_scope_cache.cc create mode 100644 paddle/fluid/framework/transfer_scope_cache.h create mode 100644 paddle/fluid/framework/tuple.h create mode 100644 paddle/fluid/framework/tuple_test.cc create mode 100644 paddle/fluid/framework/type_defs.h create mode 100644 paddle/fluid/framework/unroll_array_ops.h create mode 100644 paddle/fluid/framework/unroll_array_ops_test.cc create mode 100644 paddle/fluid/framework/var_desc.cc create mode 100644 paddle/fluid/framework/var_desc.h create mode 100644 paddle/fluid/framework/var_type.h create mode 100644 paddle/fluid/framework/var_type_inference.h create mode 100644 paddle/fluid/framework/var_type_inference_test.cc create mode 100644 paddle/fluid/framework/var_type_traits.cc create mode 100644 paddle/fluid/framework/var_type_traits.h create mode 100644 paddle/fluid/framework/var_type_traits_test.cc create mode 100644 paddle/fluid/framework/variable.h create mode 100644 paddle/fluid/framework/variable_helper.cc create mode 100644 paddle/fluid/framework/variable_helper.h create mode 100644 paddle/fluid/framework/variable_test.cc create mode 100644 paddle/fluid/framework/version.cc create mode 100644 paddle/fluid/framework/version.h create mode 100644 paddle/fluid/framework/version_test.cc create mode 100644 paddle/fluid/imperative/CMakeLists.txt create mode 100644 paddle/fluid/imperative/README.md create mode 100644 paddle/fluid/imperative/backward_strategy.h create mode 100644 paddle/fluid/imperative/engine.cc create mode 100644 paddle/fluid/imperative/engine.h create mode 100644 paddle/fluid/imperative/flags.cc create mode 100644 paddle/fluid/imperative/flags.h create mode 100644 paddle/fluid/imperative/layer.cc create mode 100644 paddle/fluid/imperative/layer.h create mode 100644 paddle/fluid/imperative/nccl_context.cc create mode 100644 paddle/fluid/imperative/nccl_context.h create mode 100644 paddle/fluid/imperative/nccl_context_test.cc create mode 100644 paddle/fluid/imperative/profiler.cc create mode 100644 paddle/fluid/imperative/profiler.h create mode 100644 paddle/fluid/imperative/tracer.cc create mode 100644 paddle/fluid/imperative/tracer.h create mode 100644 paddle/fluid/imperative/type_defs.h create mode 100644 paddle/fluid/inference/CMakeLists.txt create mode 100644 paddle/fluid/inference/anakin/CMakeLists.txt create mode 100644 paddle/fluid/inference/anakin/convert/CMakeLists.txt create mode 100644 paddle/fluid/inference/anakin/convert/activation.cc create mode 100644 paddle/fluid/inference/anakin/convert/activation.h create mode 100644 paddle/fluid/inference/anakin/convert/affine_channel.cc create mode 100644 paddle/fluid/inference/anakin/convert/affine_channel.h create mode 100644 paddle/fluid/inference/anakin/convert/batch_norm.cc create mode 100644 paddle/fluid/inference/anakin/convert/batch_norm.h create mode 100644 paddle/fluid/inference/anakin/convert/concat.cc create mode 100644 paddle/fluid/inference/anakin/convert/concat.h create mode 100644 paddle/fluid/inference/anakin/convert/conv2d.cc create mode 100644 paddle/fluid/inference/anakin/convert/conv2d.h create mode 100644 paddle/fluid/inference/anakin/convert/conv2d_fusion.cc create mode 100644 paddle/fluid/inference/anakin/convert/conv2d_fusion.h create mode 100644 paddle/fluid/inference/anakin/convert/density_prior_box.cc create mode 100644 paddle/fluid/inference/anakin/convert/density_prior_box.h create mode 100644 paddle/fluid/inference/anakin/convert/detection_out.cc create mode 100644 paddle/fluid/inference/anakin/convert/detection_out.h create mode 100644 paddle/fluid/inference/anakin/convert/dropout.cc create mode 100644 paddle/fluid/inference/anakin/convert/dropout.h create mode 100644 paddle/fluid/inference/anakin/convert/elementwise.cc create mode 100644 paddle/fluid/inference/anakin/convert/elementwise.h create mode 100644 paddle/fluid/inference/anakin/convert/fc.cc create mode 100644 paddle/fluid/inference/anakin/convert/fc.h create mode 100644 paddle/fluid/inference/anakin/convert/flatten.cc create mode 100644 paddle/fluid/inference/anakin/convert/flatten.h create mode 100644 paddle/fluid/inference/anakin/convert/helper.cc create mode 100644 paddle/fluid/inference/anakin/convert/helper.h create mode 100644 paddle/fluid/inference/anakin/convert/im2sequence.cc create mode 100644 paddle/fluid/inference/anakin/convert/im2sequence.h create mode 100644 paddle/fluid/inference/anakin/convert/op_converter.h create mode 100644 paddle/fluid/inference/anakin/convert/pool2d.cc create mode 100644 paddle/fluid/inference/anakin/convert/pool2d.h create mode 100644 paddle/fluid/inference/anakin/convert/relu.cc create mode 100644 paddle/fluid/inference/anakin/convert/relu.h create mode 100644 paddle/fluid/inference/anakin/convert/reshape.cc create mode 100644 paddle/fluid/inference/anakin/convert/reshape.h create mode 100644 paddle/fluid/inference/anakin/convert/roi_align.cc create mode 100644 paddle/fluid/inference/anakin/convert/roi_align.h create mode 100644 paddle/fluid/inference/anakin/convert/scale.cc create mode 100644 paddle/fluid/inference/anakin/convert/scale.h create mode 100644 paddle/fluid/inference/anakin/convert/shuffle_channel.cc create mode 100644 paddle/fluid/inference/anakin/convert/shuffle_channel.h create mode 100644 paddle/fluid/inference/anakin/convert/softmax.cc create mode 100644 paddle/fluid/inference/anakin/convert/softmax.h create mode 100644 paddle/fluid/inference/anakin/convert/split.cc create mode 100644 paddle/fluid/inference/anakin/convert/split.h create mode 100644 paddle/fluid/inference/anakin/convert/sum.cc create mode 100644 paddle/fluid/inference/anakin/convert/sum.h create mode 100644 paddle/fluid/inference/anakin/convert/test_activation_op.cc create mode 100644 paddle/fluid/inference/anakin/convert/test_affine_channel_op.cc create mode 100644 paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc create mode 100644 paddle/fluid/inference/anakin/convert/test_concat_op.cc create mode 100644 paddle/fluid/inference/anakin/convert/test_conv2d_op.cc create mode 100644 paddle/fluid/inference/anakin/convert/test_dropout_op.cc create mode 100644 paddle/fluid/inference/anakin/convert/test_elementwise_op.cc create mode 100644 paddle/fluid/inference/anakin/convert/test_fc_op.cc create mode 100644 paddle/fluid/inference/anakin/convert/test_flatten_op.cc create mode 100644 paddle/fluid/inference/anakin/convert/test_im2sequence_op.cc create mode 100644 paddle/fluid/inference/anakin/convert/test_pool2d_op.cc create mode 100644 paddle/fluid/inference/anakin/convert/test_relu_op.cc create mode 100644 paddle/fluid/inference/anakin/convert/test_reshape_op.cc create mode 100644 paddle/fluid/inference/anakin/convert/test_softmax_op.cc create mode 100644 paddle/fluid/inference/anakin/convert/test_split_op.cc create mode 100644 paddle/fluid/inference/anakin/convert/test_sum_op.cc create mode 100644 paddle/fluid/inference/anakin/convert/test_transpose_op.cc create mode 100644 paddle/fluid/inference/anakin/convert/transpose.cc create mode 100644 paddle/fluid/inference/anakin/convert/transpose.h create mode 100644 paddle/fluid/inference/anakin/convert/ut_helper.h create mode 100644 paddle/fluid/inference/anakin/engine.cc create mode 100644 paddle/fluid/inference/anakin/engine.h create mode 100644 paddle/fluid/inference/anakin/op_teller.cc create mode 100644 paddle/fluid/inference/anakin/op_teller.h create mode 100644 paddle/fluid/inference/anakin/test_anakin_engine.cc create mode 100644 paddle/fluid/inference/analysis/CMakeLists.txt create mode 100644 paddle/fluid/inference/analysis/README.md create mode 100644 paddle/fluid/inference/analysis/analysis_pass.cc create mode 100644 paddle/fluid/inference/analysis/analysis_pass.h create mode 100644 paddle/fluid/inference/analysis/analyzer.cc create mode 100644 paddle/fluid/inference/analysis/analyzer.h create mode 100644 paddle/fluid/inference/analysis/analyzer_tester.cc create mode 100644 paddle/fluid/inference/analysis/argument.cc create mode 100644 paddle/fluid/inference/analysis/argument.h create mode 100644 paddle/fluid/inference/analysis/device.h create mode 100644 paddle/fluid/inference/analysis/dot.h create mode 100644 paddle/fluid/inference/analysis/dot_tester.cc create mode 100644 paddle/fluid/inference/analysis/flags.h create mode 100644 paddle/fluid/inference/analysis/helper.cc create mode 100644 paddle/fluid/inference/analysis/helper.h create mode 100644 paddle/fluid/inference/analysis/ir_pass_manager.cc create mode 100644 paddle/fluid/inference/analysis/ir_pass_manager.h create mode 100644 paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt create mode 100644 paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc create mode 100644 paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h create mode 100644 paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc create mode 100644 paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h create mode 100644 paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc create mode 100644 paddle/fluid/inference/analysis/ir_passes/subgraph_util.h create mode 100644 paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc create mode 100644 paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h create mode 100644 paddle/fluid/inference/analysis/passes/CMakeLists.txt create mode 100644 paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.cc create mode 100644 paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.h create mode 100644 paddle/fluid/inference/analysis/passes/inference_op_replace_pass.cc create mode 100644 paddle/fluid/inference/analysis/passes/inference_op_replace_pass.h create mode 100644 paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc create mode 100644 paddle/fluid/inference/analysis/passes/ir_analysis_pass.h create mode 100644 paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc create mode 100644 paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h create mode 100644 paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc create mode 100644 paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h create mode 100644 paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc create mode 100644 paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h create mode 100644 paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc create mode 100644 paddle/fluid/inference/analysis/passes/memory_optimize_pass.h create mode 100644 paddle/fluid/inference/analysis/passes/passes.cc create mode 100644 paddle/fluid/inference/analysis/passes/passes.h create mode 100644 paddle/fluid/inference/analysis/ut_helper.h create mode 100755 paddle/fluid/inference/api/CMakeLists.txt create mode 100644 paddle/fluid/inference/api/README.md create mode 100644 paddle/fluid/inference/api/analysis_config.cc create mode 100644 paddle/fluid/inference/api/analysis_predictor.cc create mode 100644 paddle/fluid/inference/api/analysis_predictor.h create mode 100644 paddle/fluid/inference/api/analysis_predictor_tester.cc create mode 100644 paddle/fluid/inference/api/api.cc create mode 100644 paddle/fluid/inference/api/api_anakin_engine.cc create mode 100644 paddle/fluid/inference/api/api_anakin_engine.h create mode 100644 paddle/fluid/inference/api/api_impl.cc create mode 100644 paddle/fluid/inference/api/api_impl.h create mode 100644 paddle/fluid/inference/api/api_impl_tester.cc create mode 100644 paddle/fluid/inference/api/api_tester.cc create mode 100644 paddle/fluid/inference/api/demo_ci/.gitignore create mode 100644 paddle/fluid/inference/api/demo_ci/CMakeLists.txt create mode 100644 paddle/fluid/inference/api/demo_ci/README.md create mode 100755 paddle/fluid/inference/api/demo_ci/clean.sh create mode 100755 paddle/fluid/inference/api/demo_ci/run.sh create mode 100644 paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc create mode 100644 paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc create mode 100644 paddle/fluid/inference/api/demo_ci/utils.h create mode 100644 paddle/fluid/inference/api/demo_ci/vis_demo.cc create mode 100644 paddle/fluid/inference/api/demo_ci/windows_inference.md create mode 100644 paddle/fluid/inference/api/details/CMakeLists.txt create mode 100644 paddle/fluid/inference/api/details/reset_tensor_array.cc create mode 100644 paddle/fluid/inference/api/details/reset_tensor_array.h create mode 100644 paddle/fluid/inference/api/details/zero_copy_tensor.cc create mode 100644 paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc create mode 100644 paddle/fluid/inference/api/helper.cc create mode 100644 paddle/fluid/inference/api/helper.h create mode 100644 paddle/fluid/inference/api/high_level_api.md create mode 100644 paddle/fluid/inference/api/high_level_api_cn.md create mode 100644 paddle/fluid/inference/api/mkldnn_quantizer.cc create mode 100644 paddle/fluid/inference/api/mkldnn_quantizer.h create mode 100644 paddle/fluid/inference/api/mkldnn_quantizer_config.cc create mode 100644 paddle/fluid/inference/api/paddle_anakin_config.h create mode 100644 paddle/fluid/inference/api/paddle_analysis_config.h create mode 100644 paddle/fluid/inference/api/paddle_api.h create mode 100644 paddle/fluid/inference/api/paddle_inference_api.h create mode 100644 paddle/fluid/inference/api/paddle_inference_pass.h create mode 100644 paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h create mode 100644 paddle/fluid/inference/api/paddle_pass_builder.cc create mode 100644 paddle/fluid/inference/api/paddle_pass_builder.h create mode 100755 paddle/fluid/inference/check_symbol.sh create mode 100644 paddle/fluid/inference/engine.h create mode 100644 paddle/fluid/inference/io.cc create mode 100644 paddle/fluid/inference/io.h create mode 100644 paddle/fluid/inference/paddle_fluid.map create mode 100644 paddle/fluid/inference/paddle_fluid.sym create mode 100644 paddle/fluid/inference/tensorrt/CMakeLists.txt create mode 100644 paddle/fluid/inference/tensorrt/convert/CMakeLists.txt create mode 100644 paddle/fluid/inference/tensorrt/convert/activation_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/concat_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/conv2d_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/dropout_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/elementwise_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/fc_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/io_converter.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/io_converter.h create mode 100644 paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/mul_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/op_converter.h create mode 100644 paddle/fluid/inference/tensorrt/convert/pad_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/pool2d_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/prelu_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/softmax_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/split_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/test_activation_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/test_batch_norm_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/test_concat_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/test_fc_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/test_io_converter.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/test_mul_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/test_op_converter.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/test_pad_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/test_split_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/ut_helper.h create mode 100644 paddle/fluid/inference/tensorrt/engine.cc create mode 100644 paddle/fluid/inference/tensorrt/engine.h create mode 100644 paddle/fluid/inference/tensorrt/helper.h create mode 100644 paddle/fluid/inference/tensorrt/op_teller.cc create mode 100644 paddle/fluid/inference/tensorrt/op_teller.h create mode 100644 paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt create mode 100644 paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu create mode 100644 paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h create mode 100644 paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu create mode 100644 paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h create mode 100644 paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu create mode 100644 paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h create mode 100644 paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu create mode 100644 paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h create mode 100644 paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc create mode 100644 paddle/fluid/inference/tensorrt/plugin/trt_plugin.h create mode 100644 paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc create mode 100644 paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h create mode 100644 paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h create mode 100644 paddle/fluid/inference/tensorrt/test_engine.cc create mode 100644 paddle/fluid/inference/tensorrt/test_tensorrt.cc create mode 100644 paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc create mode 100644 paddle/fluid/inference/tensorrt/trt_int8_calibrator.h create mode 100644 paddle/fluid/inference/tests/api/CMakeLists.txt create mode 100644 paddle/fluid/inference/tests/api/anakin_mlu_tester.cc create mode 100644 paddle/fluid/inference/tests/api/anakin_rnn2_tester.cc create mode 100644 paddle/fluid/inference/tests/api/analyzer_bert_tester.cc create mode 100644 paddle/fluid/inference/tests/api/analyzer_dam_tester.cc create mode 100644 paddle/fluid/inference/tests/api/analyzer_image_classification_tester.cc create mode 100644 paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc create mode 100644 paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc create mode 100644 paddle/fluid/inference/tests/api/analyzer_lac_tester.cc create mode 100644 paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc create mode 100644 paddle/fluid/inference/tests/api/analyzer_ner_tester.cc create mode 100644 paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc create mode 100644 paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc create mode 100644 paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc create mode 100644 paddle/fluid/inference/tests/api/analyzer_save_model_tester.cc create mode 100644 paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc create mode 100644 paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc create mode 100644 paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc create mode 100644 paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc create mode 100644 paddle/fluid/inference/tests/api/analyzer_vis_tester.cc create mode 100644 paddle/fluid/inference/tests/api/config_printer.h create mode 100644 paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py create mode 100644 paddle/fluid/inference/tests/api/full_pascalvoc_test_preprocess.py create mode 100644 paddle/fluid/inference/tests/api/int8_mkldnn_quantization.md create mode 100644 paddle/fluid/inference/tests/api/tester_helper.h create mode 100644 paddle/fluid/inference/tests/api/trt_mobilenet_test.cc create mode 100644 paddle/fluid/inference/tests/api/trt_resnet50_test.cc create mode 100644 paddle/fluid/inference/tests/api/trt_resnext_test.cc create mode 100644 paddle/fluid/inference/tests/api/trt_test_helper.h create mode 100644 paddle/fluid/inference/tests/book/CMakeLists.txt create mode 100644 paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc create mode 100644 paddle/fluid/inference/tests/book/test_inference_image_classification.cc create mode 100644 paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc create mode 100644 paddle/fluid/inference/tests/book/test_inference_nlp.cc create mode 100644 paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc create mode 100644 paddle/fluid/inference/tests/book/test_inference_recommender_system.cc create mode 100644 paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc create mode 100644 paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc create mode 100644 paddle/fluid/inference/tests/book/test_inference_word2vec.cc create mode 100644 paddle/fluid/inference/tests/test.cmake create mode 100644 paddle/fluid/inference/tests/test_helper.h create mode 100644 paddle/fluid/inference/tests/test_multi_thread_helper.h create mode 100644 paddle/fluid/inference/utils/CMakeLists.txt create mode 100644 paddle/fluid/inference/utils/benchmark.cc create mode 100644 paddle/fluid/inference/utils/benchmark.h create mode 100644 paddle/fluid/inference/utils/benchmark_tester.cc create mode 100644 paddle/fluid/inference/utils/singleton.h create mode 100644 paddle/fluid/memory/CMakeLists.txt create mode 100644 paddle/fluid/memory/allocation/CMakeLists.txt create mode 100644 paddle/fluid/memory/allocation/aligned_allocator.cc create mode 100644 paddle/fluid/memory/allocation/aligned_allocator.h create mode 100644 paddle/fluid/memory/allocation/allocator.cc create mode 100644 paddle/fluid/memory/allocation/allocator.h create mode 100644 paddle/fluid/memory/allocation/allocator_facade.cc create mode 100644 paddle/fluid/memory/allocation/allocator_facade.h create mode 100644 paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc create mode 100644 paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc create mode 100644 paddle/fluid/memory/allocation/allocator_strategy.cc create mode 100644 paddle/fluid/memory/allocation/allocator_strategy.h create mode 100644 paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc create mode 100644 paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h create mode 100644 paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc create mode 100644 paddle/fluid/memory/allocation/best_fit_allocator.cc create mode 100644 paddle/fluid/memory/allocation/best_fit_allocator.h create mode 100644 paddle/fluid/memory/allocation/best_fit_allocator_test.cc create mode 100644 paddle/fluid/memory/allocation/best_fit_allocator_test.cu create mode 100644 paddle/fluid/memory/allocation/buffered_allocator.cc create mode 100644 paddle/fluid/memory/allocation/buffered_allocator.h create mode 100644 paddle/fluid/memory/allocation/buffered_allocator_test.cc create mode 100644 paddle/fluid/memory/allocation/cpu_allocator.cc create mode 100644 paddle/fluid/memory/allocation/cpu_allocator.h create mode 100644 paddle/fluid/memory/allocation/cuda_allocator.cc create mode 100644 paddle/fluid/memory/allocation/cuda_allocator.h create mode 100644 paddle/fluid/memory/allocation/locked_allocator.cc create mode 100644 paddle/fluid/memory/allocation/locked_allocator.h create mode 100644 paddle/fluid/memory/allocation/naive_best_fit_allocator.cc create mode 100644 paddle/fluid/memory/allocation/naive_best_fit_allocator.h create mode 100644 paddle/fluid/memory/allocation/pinned_allocator.cc create mode 100644 paddle/fluid/memory/allocation/pinned_allocator.h create mode 100644 paddle/fluid/memory/allocation/retry_allocator.cc create mode 100644 paddle/fluid/memory/allocation/retry_allocator.h create mode 100644 paddle/fluid/memory/allocation/retry_allocator_test.cc create mode 100644 paddle/fluid/memory/allocation/test_aligned_allocator.cc create mode 100644 paddle/fluid/memory/detail/CMakeLists.txt create mode 100644 paddle/fluid/memory/detail/buddy_allocator.cc create mode 100644 paddle/fluid/memory/detail/buddy_allocator.h create mode 100644 paddle/fluid/memory/detail/buddy_allocator_test.cc create mode 100644 paddle/fluid/memory/detail/memory_block.cc create mode 100644 paddle/fluid/memory/detail/memory_block.h create mode 100644 paddle/fluid/memory/detail/memory_block_desc.cc create mode 100644 paddle/fluid/memory/detail/meta_cache.cc create mode 100644 paddle/fluid/memory/detail/system_allocator.cc create mode 100644 paddle/fluid/memory/detail/system_allocator.h create mode 100644 paddle/fluid/memory/detail/system_allocator_test.cc create mode 100644 paddle/fluid/memory/malloc.cc create mode 100644 paddle/fluid/memory/malloc.h create mode 100644 paddle/fluid/memory/memcpy.cc create mode 100644 paddle/fluid/memory/memcpy.h create mode 100644 paddle/fluid/memory/memory.h create mode 100644 paddle/fluid/memory/pinned_memory_test.cu create mode 100644 paddle/fluid/op_use_default_grad_op_maker.spec create mode 100644 paddle/fluid/operators/CMakeLists.txt create mode 100644 paddle/fluid/operators/activation_cudnn.cu.cc create mode 100644 paddle/fluid/operators/activation_cudnn_op.cu.cc create mode 100644 paddle/fluid/operators/activation_op.cc create mode 100644 paddle/fluid/operators/activation_op.cu create mode 100644 paddle/fluid/operators/activation_op.h create mode 100644 paddle/fluid/operators/add_position_encoding_op.cc create mode 100644 paddle/fluid/operators/add_position_encoding_op.h create mode 100644 paddle/fluid/operators/affine_channel_op.cc create mode 100644 paddle/fluid/operators/affine_channel_op.cu create mode 100644 paddle/fluid/operators/affine_grid_cudnn_op.cu.cc create mode 100644 paddle/fluid/operators/affine_grid_op.cc create mode 100644 paddle/fluid/operators/affine_grid_op.h create mode 100644 paddle/fluid/operators/anakin/CMakeLists.txt create mode 100644 paddle/fluid/operators/anakin/anakin_engine_op.cc create mode 100644 paddle/fluid/operators/anakin/anakin_engine_op.h create mode 100644 paddle/fluid/operators/arg_max_op.cc create mode 100644 paddle/fluid/operators/arg_max_op.cu create mode 100644 paddle/fluid/operators/arg_min_max_op_base.h create mode 100644 paddle/fluid/operators/arg_min_op.cc create mode 100644 paddle/fluid/operators/arg_min_op.cu create mode 100644 paddle/fluid/operators/argsort_op.cc create mode 100644 paddle/fluid/operators/argsort_op.cu create mode 100644 paddle/fluid/operators/argsort_op.h create mode 100644 paddle/fluid/operators/array_operator.h create mode 100644 paddle/fluid/operators/array_to_lod_tensor_op.cc create mode 100644 paddle/fluid/operators/assign_op.cc create mode 100644 paddle/fluid/operators/assign_value_op.cc create mode 100644 paddle/fluid/operators/assign_value_op.cu.cc create mode 100644 paddle/fluid/operators/assign_value_op.h create mode 100644 paddle/fluid/operators/attention_lstm_op.cc create mode 100644 paddle/fluid/operators/attention_lstm_op.h create mode 100644 paddle/fluid/operators/average_accumulates_op.cc create mode 100644 paddle/fluid/operators/average_accumulates_op.cu create mode 100644 paddle/fluid/operators/average_accumulates_op.h create mode 100644 paddle/fluid/operators/batch_norm_op.cc create mode 100644 paddle/fluid/operators/batch_norm_op.cu create mode 100644 paddle/fluid/operators/batch_norm_op.h create mode 100644 paddle/fluid/operators/batch_size_like.h create mode 100644 paddle/fluid/operators/beam_search_decode_op.cc create mode 100644 paddle/fluid/operators/beam_search_decode_op.h create mode 100644 paddle/fluid/operators/beam_search_decode_op_test.cc create mode 100644 paddle/fluid/operators/beam_search_op.cc create mode 100644 paddle/fluid/operators/beam_search_op.cu.cc create mode 100644 paddle/fluid/operators/beam_search_op.h create mode 100644 paddle/fluid/operators/benchmark/CMakeLists.txt create mode 100644 paddle/fluid/operators/benchmark/op_tester.cc create mode 100644 paddle/fluid/operators/benchmark/op_tester.h create mode 100644 paddle/fluid/operators/benchmark/op_tester_config.cc create mode 100644 paddle/fluid/operators/benchmark/op_tester_config.h create mode 100644 paddle/fluid/operators/bilinear_tensor_product_op.cc create mode 100644 paddle/fluid/operators/bilinear_tensor_product_op.cu create mode 100644 paddle/fluid/operators/bilinear_tensor_product_op.h create mode 100644 paddle/fluid/operators/bpr_loss_op.cc create mode 100644 paddle/fluid/operators/bpr_loss_op.h create mode 100644 paddle/fluid/operators/cast_op.cc create mode 100644 paddle/fluid/operators/cast_op.cu create mode 100644 paddle/fluid/operators/cast_op.h create mode 100644 paddle/fluid/operators/chunk_eval_op.cc create mode 100644 paddle/fluid/operators/chunk_eval_op.h create mode 100644 paddle/fluid/operators/clip_by_norm_op.cc create mode 100644 paddle/fluid/operators/clip_by_norm_op.cu create mode 100644 paddle/fluid/operators/clip_by_norm_op.h create mode 100644 paddle/fluid/operators/clip_op.cc create mode 100644 paddle/fluid/operators/clip_op.cu create mode 100644 paddle/fluid/operators/clip_op.h create mode 100644 paddle/fluid/operators/coalesce_tensor_op.cc create mode 100644 paddle/fluid/operators/collective/CMakeLists.txt create mode 100644 paddle/fluid/operators/collective/c_allgather_op.cc create mode 100644 paddle/fluid/operators/collective/c_allgather_op.cu.cc create mode 100644 paddle/fluid/operators/collective/c_allgather_op.h create mode 100644 paddle/fluid/operators/collective/c_allreduce_max_op.cc create mode 100644 paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc create mode 100644 paddle/fluid/operators/collective/c_allreduce_min_op.cc create mode 100644 paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc create mode 100644 paddle/fluid/operators/collective/c_allreduce_op.h create mode 100644 paddle/fluid/operators/collective/c_allreduce_prod_op.cc create mode 100644 paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc create mode 100644 paddle/fluid/operators/collective/c_allreduce_sum_op.cc create mode 100644 paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc create mode 100644 paddle/fluid/operators/collective/c_broadcast_op.cc create mode 100644 paddle/fluid/operators/collective/c_broadcast_op.cu.cc create mode 100644 paddle/fluid/operators/collective/c_broadcast_op.h create mode 100644 paddle/fluid/operators/collective/c_comm_init_op.cc create mode 100644 paddle/fluid/operators/collective/c_gen_nccl_id_op.cc create mode 100644 paddle/fluid/operators/collective/c_reducescatter_op.cc create mode 100644 paddle/fluid/operators/collective/c_reducescatter_op.cu.cc create mode 100644 paddle/fluid/operators/collective/c_reducescatter_op.h create mode 100644 paddle/fluid/operators/collective/c_sync_calc_stream_op.cc create mode 100644 paddle/fluid/operators/collective/c_sync_comm_stream_op.cc create mode 100644 paddle/fluid/operators/concat_op.cc create mode 100644 paddle/fluid/operators/concat_op.cu.cc create mode 100644 paddle/fluid/operators/concat_op.h create mode 100644 paddle/fluid/operators/controlflow/CMakeLists.txt create mode 100644 paddle/fluid/operators/controlflow/compare_op.cc create mode 100644 paddle/fluid/operators/controlflow/compare_op.cu create mode 100644 paddle/fluid/operators/controlflow/compare_op.h create mode 100644 paddle/fluid/operators/controlflow/conditional_block_infer_op.cc create mode 100644 paddle/fluid/operators/controlflow/conditional_block_op.cc create mode 100644 paddle/fluid/operators/controlflow/conditional_block_op.h create mode 100644 paddle/fluid/operators/controlflow/feed_op.cc create mode 100644 paddle/fluid/operators/controlflow/fetch_op.cc create mode 100644 paddle/fluid/operators/controlflow/get_places_op.cc create mode 100644 paddle/fluid/operators/controlflow/logical_op.cc create mode 100644 paddle/fluid/operators/controlflow/logical_op.cu create mode 100644 paddle/fluid/operators/controlflow/logical_op.h create mode 100644 paddle/fluid/operators/controlflow/op_variant.cc create mode 100644 paddle/fluid/operators/controlflow/op_variant.h create mode 100644 paddle/fluid/operators/controlflow/recurrent_op_helper.cc create mode 100644 paddle/fluid/operators/controlflow/recurrent_op_helper.h create mode 100644 paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc create mode 100644 paddle/fluid/operators/controlflow/while_op.cc create mode 100644 paddle/fluid/operators/controlflow/while_op_helper.cc create mode 100644 paddle/fluid/operators/controlflow/while_op_helper.h create mode 100644 paddle/fluid/operators/conv_cudnn_helper.h create mode 100644 paddle/fluid/operators/conv_cudnn_op.cu.cc create mode 100644 paddle/fluid/operators/conv_cudnn_op_cache.h create mode 100644 paddle/fluid/operators/conv_fusion_op.cc create mode 100644 paddle/fluid/operators/conv_fusion_op.cu.cc create mode 100644 paddle/fluid/operators/conv_op.cc create mode 100644 paddle/fluid/operators/conv_op.cu.cc create mode 100644 paddle/fluid/operators/conv_op.h create mode 100644 paddle/fluid/operators/conv_shift_op.cc create mode 100644 paddle/fluid/operators/conv_shift_op.cu create mode 100644 paddle/fluid/operators/conv_shift_op.h create mode 100644 paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc create mode 100644 paddle/fluid/operators/conv_transpose_op.cc create mode 100644 paddle/fluid/operators/conv_transpose_op.cu.cc create mode 100644 paddle/fluid/operators/conv_transpose_op.h create mode 100644 paddle/fluid/operators/cos_sim_op.cc create mode 100644 paddle/fluid/operators/cos_sim_op.cu create mode 100644 paddle/fluid/operators/cos_sim_op.h create mode 100644 paddle/fluid/operators/crf_decoding_op.cc create mode 100644 paddle/fluid/operators/crf_decoding_op.h create mode 100644 paddle/fluid/operators/crop_op.cc create mode 100644 paddle/fluid/operators/crop_op.cu create mode 100644 paddle/fluid/operators/crop_op.h create mode 100644 paddle/fluid/operators/cross_entropy_op.cc create mode 100644 paddle/fluid/operators/cross_entropy_op.cu create mode 100644 paddle/fluid/operators/cross_entropy_op.h create mode 100644 paddle/fluid/operators/ctc_align_op.cc create mode 100644 paddle/fluid/operators/ctc_align_op.cu create mode 100644 paddle/fluid/operators/ctc_align_op.h create mode 100644 paddle/fluid/operators/cudnn_lstm_op.cc create mode 100644 paddle/fluid/operators/cudnn_lstm_op.cu.cc create mode 100644 paddle/fluid/operators/cudnn_rnn_cache.h create mode 100644 paddle/fluid/operators/cum_op.h create mode 100644 paddle/fluid/operators/cumsum_op.cc create mode 100644 paddle/fluid/operators/cumsum_op.cu create mode 100644 paddle/fluid/operators/cvm_op.cc create mode 100644 paddle/fluid/operators/cvm_op.h create mode 100644 paddle/fluid/operators/data_norm_op.cc create mode 100644 paddle/fluid/operators/data_norm_op.h create mode 100644 paddle/fluid/operators/deformable_conv_op.cc create mode 100644 paddle/fluid/operators/deformable_conv_op.cu create mode 100644 paddle/fluid/operators/deformable_psroi_pooling_op.cc create mode 100644 paddle/fluid/operators/deformable_psroi_pooling_op.cu create mode 100644 paddle/fluid/operators/deformable_psroi_pooling_op.h create mode 100644 paddle/fluid/operators/delete_var_op.cc create mode 100644 paddle/fluid/operators/dequantize_op.cc create mode 100644 paddle/fluid/operators/dequantize_op.h create mode 100644 paddle/fluid/operators/detail/safe_ref.h create mode 100644 paddle/fluid/operators/detail/strided_memcpy.h create mode 100644 paddle/fluid/operators/detection/CMakeLists.txt create mode 100644 paddle/fluid/operators/detection/anchor_generator_op.cc create mode 100644 paddle/fluid/operators/detection/anchor_generator_op.cu create mode 100644 paddle/fluid/operators/detection/anchor_generator_op.h create mode 100644 paddle/fluid/operators/detection/bbox_util.h create mode 100644 paddle/fluid/operators/detection/bipartite_match_op.cc create mode 100644 paddle/fluid/operators/detection/box_clip_op.cc create mode 100644 paddle/fluid/operators/detection/box_clip_op.cu create mode 100644 paddle/fluid/operators/detection/box_clip_op.h create mode 100644 paddle/fluid/operators/detection/box_coder_op.cc create mode 100644 paddle/fluid/operators/detection/box_coder_op.cu create mode 100644 paddle/fluid/operators/detection/box_coder_op.h create mode 100644 paddle/fluid/operators/detection/box_decoder_and_assign_op.cc create mode 100644 paddle/fluid/operators/detection/box_decoder_and_assign_op.cu create mode 100644 paddle/fluid/operators/detection/box_decoder_and_assign_op.h create mode 100644 paddle/fluid/operators/detection/collect_fpn_proposals_op.cc create mode 100644 paddle/fluid/operators/detection/collect_fpn_proposals_op.cu create mode 100644 paddle/fluid/operators/detection/collect_fpn_proposals_op.h create mode 100644 paddle/fluid/operators/detection/density_prior_box_op.cc create mode 100644 paddle/fluid/operators/detection/density_prior_box_op.cu create mode 100644 paddle/fluid/operators/detection/density_prior_box_op.h create mode 100644 paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc create mode 100644 paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu create mode 100644 paddle/fluid/operators/detection/distribute_fpn_proposals_op.h create mode 100644 paddle/fluid/operators/detection/generate_mask_labels_op.cc create mode 100644 paddle/fluid/operators/detection/generate_proposal_labels_op.cc create mode 100644 paddle/fluid/operators/detection/generate_proposals_op.cc create mode 100644 paddle/fluid/operators/detection/generate_proposals_op.cu create mode 100644 paddle/fluid/operators/detection/gpc.cc create mode 100644 paddle/fluid/operators/detection/gpc.h create mode 100644 paddle/fluid/operators/detection/iou_similarity_op.cc create mode 100644 paddle/fluid/operators/detection/iou_similarity_op.cu create mode 100644 paddle/fluid/operators/detection/iou_similarity_op.h create mode 100644 paddle/fluid/operators/detection/mask_util.cc create mode 100644 paddle/fluid/operators/detection/mask_util.h create mode 100644 paddle/fluid/operators/detection/mask_util_test.cc create mode 100644 paddle/fluid/operators/detection/mine_hard_examples_op.cc create mode 100644 paddle/fluid/operators/detection/multiclass_nms_op.cc create mode 100644 paddle/fluid/operators/detection/poly_util.cc create mode 100644 paddle/fluid/operators/detection/poly_util.h create mode 100644 paddle/fluid/operators/detection/polygon_box_transform_op.cc create mode 100644 paddle/fluid/operators/detection/polygon_box_transform_op.cu create mode 100644 paddle/fluid/operators/detection/prior_box_op.cc create mode 100644 paddle/fluid/operators/detection/prior_box_op.cu create mode 100644 paddle/fluid/operators/detection/prior_box_op.h create mode 100644 paddle/fluid/operators/detection/retinanet_detection_output_op.cc create mode 100644 paddle/fluid/operators/detection/roi_perspective_transform_op.cc create mode 100644 paddle/fluid/operators/detection/roi_perspective_transform_op.cu create mode 100644 paddle/fluid/operators/detection/rpn_target_assign_op.cc create mode 100644 paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc create mode 100644 paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu create mode 100644 paddle/fluid/operators/detection/sigmoid_focal_loss_op.h create mode 100644 paddle/fluid/operators/detection/target_assign_op.cc create mode 100644 paddle/fluid/operators/detection/target_assign_op.cu create mode 100644 paddle/fluid/operators/detection/target_assign_op.h create mode 100644 paddle/fluid/operators/detection/yolo_box_op.cc create mode 100644 paddle/fluid/operators/detection/yolo_box_op.cu create mode 100644 paddle/fluid/operators/detection/yolo_box_op.h create mode 100644 paddle/fluid/operators/detection/yolov3_loss_op.cc create mode 100644 paddle/fluid/operators/detection/yolov3_loss_op.h create mode 100644 paddle/fluid/operators/detection_map_op.cc create mode 100644 paddle/fluid/operators/detection_map_op.h create mode 100644 paddle/fluid/operators/dgc_clip_by_norm_op.cc create mode 100644 paddle/fluid/operators/dgc_clip_by_norm_op.cu create mode 100644 paddle/fluid/operators/dgc_clip_by_norm_op.h create mode 100644 paddle/fluid/operators/dgc_op.cc create mode 100644 paddle/fluid/operators/dgc_op.cu create mode 100644 paddle/fluid/operators/dgc_op.h create mode 100644 paddle/fluid/operators/diag_op.cc create mode 100644 paddle/fluid/operators/diag_op.cu create mode 100644 paddle/fluid/operators/diag_op.h create mode 100644 paddle/fluid/operators/distributed/CMakeLists.txt create mode 100644 paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc create mode 100644 paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h create mode 100644 paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc create mode 100644 paddle/fluid/operators/distributed/brpc/brpc_client.cc create mode 100644 paddle/fluid/operators/distributed/brpc/brpc_client.h create mode 100644 paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc create mode 100644 paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h create mode 100644 paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc create mode 100644 paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h create mode 100644 paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc create mode 100644 paddle/fluid/operators/distributed/brpc/brpc_server.cc create mode 100644 paddle/fluid/operators/distributed/brpc/brpc_server.h create mode 100644 paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc create mode 100644 paddle/fluid/operators/distributed/brpc/brpc_variable_response.h create mode 100644 paddle/fluid/operators/distributed/collective_client.cc create mode 100644 paddle/fluid/operators/distributed/collective_client.h create mode 100644 paddle/fluid/operators/distributed/collective_server.cc create mode 100644 paddle/fluid/operators/distributed/collective_server.h create mode 100644 paddle/fluid/operators/distributed/collective_server_test.cc create mode 100644 paddle/fluid/operators/distributed/communicator.cc create mode 100644 paddle/fluid/operators/distributed/communicator.h create mode 100644 paddle/fluid/operators/distributed/communicator_test.cc create mode 100644 paddle/fluid/operators/distributed/distributed.h create mode 100644 paddle/fluid/operators/distributed/distributed_pb.h create mode 100644 paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc create mode 100644 paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h create mode 100644 paddle/fluid/operators/distributed/grpc/grpc_client.cc create mode 100644 paddle/fluid/operators/distributed/grpc/grpc_client.h create mode 100644 paddle/fluid/operators/distributed/grpc/grpc_serde.cc create mode 100644 paddle/fluid/operators/distributed/grpc/grpc_serde.h create mode 100644 paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc create mode 100644 paddle/fluid/operators/distributed/grpc/grpc_server.cc create mode 100644 paddle/fluid/operators/distributed/grpc/grpc_server.h create mode 100644 paddle/fluid/operators/distributed/grpc/grpc_service.h create mode 100644 paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc create mode 100644 paddle/fluid/operators/distributed/grpc/grpc_variable_response.h create mode 100644 paddle/fluid/operators/distributed/parameter_prefetch.cc create mode 100644 paddle/fluid/operators/distributed/parameter_prefetch.h create mode 100644 paddle/fluid/operators/distributed/parameter_recv.cc create mode 100644 paddle/fluid/operators/distributed/parameter_recv.h create mode 100644 paddle/fluid/operators/distributed/parameter_send.cc create mode 100644 paddle/fluid/operators/distributed/parameter_send.h create mode 100644 paddle/fluid/operators/distributed/proto_encoder_helper.h create mode 100644 paddle/fluid/operators/distributed/request_handler.h create mode 100644 paddle/fluid/operators/distributed/request_handler_impl.cc create mode 100644 paddle/fluid/operators/distributed/request_handler_impl.h create mode 100644 paddle/fluid/operators/distributed/rpc_client.cc create mode 100644 paddle/fluid/operators/distributed/rpc_client.h create mode 100644 paddle/fluid/operators/distributed/rpc_common.h create mode 100644 paddle/fluid/operators/distributed/rpc_server.cc create mode 100644 paddle/fluid/operators/distributed/rpc_server.h create mode 100644 paddle/fluid/operators/distributed/rpc_server_test.cc create mode 100644 paddle/fluid/operators/distributed/send_recv.proto.in create mode 100644 paddle/fluid/operators/distributed/sendrecvop_utils.cc create mode 100644 paddle/fluid/operators/distributed/sendrecvop_utils.h create mode 100644 paddle/fluid/operators/distributed/varhandle_test.cc create mode 100644 paddle/fluid/operators/distributed/variable_response.cc create mode 100644 paddle/fluid/operators/distributed/variable_response.h create mode 100644 paddle/fluid/operators/distributed_ops/CMakeLists.txt create mode 100644 paddle/fluid/operators/distributed_ops/allreduce_op.cc create mode 100644 paddle/fluid/operators/distributed_ops/allreduce_op.cu.cc create mode 100644 paddle/fluid/operators/distributed_ops/allreduce_op.h create mode 100644 paddle/fluid/operators/distributed_ops/broadcast_op.cc create mode 100644 paddle/fluid/operators/distributed_ops/broadcast_op.cu.cc create mode 100644 paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc create mode 100644 paddle/fluid/operators/distributed_ops/fake_init_op.cc create mode 100644 paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc create mode 100644 paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc create mode 100644 paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc create mode 100644 paddle/fluid/operators/distributed_ops/listen_and_serv_op.h create mode 100644 paddle/fluid/operators/distributed_ops/merge_ids_op.cc create mode 100644 paddle/fluid/operators/distributed_ops/merge_ids_op.h create mode 100644 paddle/fluid/operators/distributed_ops/prefetch_op.cc create mode 100644 paddle/fluid/operators/distributed_ops/recv_op.cc create mode 100644 paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.cc create mode 100644 paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.cu.cc create mode 100644 paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.h create mode 100644 paddle/fluid/operators/distributed_ops/send_barrier_op.cc create mode 100644 paddle/fluid/operators/distributed_ops/send_op.cc create mode 100644 paddle/fluid/operators/distributed_ops/send_recv_op_test.cc create mode 100644 paddle/fluid/operators/distributed_ops/send_recv_util.h create mode 100644 paddle/fluid/operators/distributed_ops/split_byref_op.cc create mode 100644 paddle/fluid/operators/distributed_ops/split_byref_op.cu.cc create mode 100644 paddle/fluid/operators/distributed_ops/split_byref_op.h create mode 100644 paddle/fluid/operators/distributed_ops/split_ids_op.cc create mode 100644 paddle/fluid/operators/distributed_ops/split_ids_op.h create mode 100644 paddle/fluid/operators/distributed_ops/test_send_nccl_id.cc create mode 100644 paddle/fluid/operators/dropout_op.cc create mode 100644 paddle/fluid/operators/dropout_op.cu create mode 100644 paddle/fluid/operators/dropout_op.h create mode 100644 paddle/fluid/operators/dropout_op_test.cc create mode 100644 paddle/fluid/operators/edit_distance_op.cc create mode 100644 paddle/fluid/operators/edit_distance_op.cu create mode 100644 paddle/fluid/operators/edit_distance_op.h create mode 100644 paddle/fluid/operators/elementwise/CMakeLists.txt create mode 100644 paddle/fluid/operators/elementwise/elementwise_add_op.cc create mode 100644 paddle/fluid/operators/elementwise/elementwise_add_op.cu create mode 100644 paddle/fluid/operators/elementwise/elementwise_add_op.h create mode 100644 paddle/fluid/operators/elementwise/elementwise_div_op.cc create mode 100644 paddle/fluid/operators/elementwise/elementwise_div_op.cu create mode 100644 paddle/fluid/operators/elementwise/elementwise_div_op.h create mode 100644 paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc create mode 100644 paddle/fluid/operators/elementwise/elementwise_floordiv_op.cu create mode 100644 paddle/fluid/operators/elementwise/elementwise_floordiv_op.h create mode 100644 paddle/fluid/operators/elementwise/elementwise_max_op.cc create mode 100644 paddle/fluid/operators/elementwise/elementwise_max_op.cu create mode 100644 paddle/fluid/operators/elementwise/elementwise_max_op.h create mode 100644 paddle/fluid/operators/elementwise/elementwise_min_op.cc create mode 100644 paddle/fluid/operators/elementwise/elementwise_min_op.cu create mode 100644 paddle/fluid/operators/elementwise/elementwise_min_op.h create mode 100644 paddle/fluid/operators/elementwise/elementwise_mod_op.cc create mode 100644 paddle/fluid/operators/elementwise/elementwise_mod_op.cu create mode 100644 paddle/fluid/operators/elementwise/elementwise_mod_op.h create mode 100644 paddle/fluid/operators/elementwise/elementwise_mul_op.cc create mode 100644 paddle/fluid/operators/elementwise/elementwise_mul_op.cu create mode 100644 paddle/fluid/operators/elementwise/elementwise_mul_op.h create mode 100644 paddle/fluid/operators/elementwise/elementwise_op.h create mode 100644 paddle/fluid/operators/elementwise/elementwise_op_function.h create mode 100644 paddle/fluid/operators/elementwise/elementwise_pow_op.cc create mode 100644 paddle/fluid/operators/elementwise/elementwise_pow_op.cu create mode 100644 paddle/fluid/operators/elementwise/elementwise_pow_op.h create mode 100644 paddle/fluid/operators/elementwise/elementwise_sub_op.cc create mode 100644 paddle/fluid/operators/elementwise/elementwise_sub_op.cu create mode 100644 paddle/fluid/operators/elementwise/elementwise_sub_op.h create mode 100644 paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc create mode 100644 paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc create mode 100644 paddle/fluid/operators/elementwise/test_elementwise_add_op_inplace.cc create mode 100644 paddle/fluid/operators/expand_op.cc create mode 100644 paddle/fluid/operators/expand_op.cu create mode 100644 paddle/fluid/operators/expand_op.h create mode 100644 paddle/fluid/operators/fake_dequantize_op.cc create mode 100644 paddle/fluid/operators/fake_dequantize_op.cu create mode 100644 paddle/fluid/operators/fake_dequantize_op.h create mode 100644 paddle/fluid/operators/fake_quantize_op.cc create mode 100644 paddle/fluid/operators/fake_quantize_op.cu create mode 100644 paddle/fluid/operators/fake_quantize_op.h create mode 100644 paddle/fluid/operators/fc_op.cc create mode 100644 paddle/fluid/operators/fc_op.h create mode 100644 paddle/fluid/operators/fill_any_like_op.cc create mode 100644 paddle/fluid/operators/fill_any_like_op.cu create mode 100644 paddle/fluid/operators/fill_any_like_op.h create mode 100644 paddle/fluid/operators/fill_constant_batch_size_like_op.cc create mode 100644 paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc create mode 100644 paddle/fluid/operators/fill_constant_batch_size_like_op.h create mode 100644 paddle/fluid/operators/fill_constant_op.cc create mode 100644 paddle/fluid/operators/fill_constant_op.cu.cc create mode 100644 paddle/fluid/operators/fill_constant_op.h create mode 100644 paddle/fluid/operators/fill_op.cc create mode 100644 paddle/fluid/operators/fill_zeros_like_op.cc create mode 100644 paddle/fluid/operators/fill_zeros_like_op.cu.cc create mode 100644 paddle/fluid/operators/fill_zeros_like_op.h create mode 100644 paddle/fluid/operators/flatten_op.cc create mode 100644 paddle/fluid/operators/fsp_op.cc create mode 100644 paddle/fluid/operators/fsp_op.cu create mode 100644 paddle/fluid/operators/fsp_op.h create mode 100644 paddle/fluid/operators/fused/CMakeLists.txt create mode 100644 paddle/fluid/operators/fused/fused_elemwise_activation_op.cc create mode 100644 paddle/fluid/operators/fused/fused_elemwise_activation_op.cu create mode 100644 paddle/fluid/operators/fused/fused_elemwise_activation_op.h create mode 100644 paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc create mode 100644 paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.h create mode 100644 paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc create mode 100644 paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h create mode 100644 paddle/fluid/operators/fused/fusion_conv_inception_op.cc create mode 100644 paddle/fluid/operators/fused/fusion_conv_inception_op.cu create mode 100644 paddle/fluid/operators/fused/fusion_gru_op.cc create mode 100644 paddle/fluid/operators/fused/fusion_gru_op.h create mode 100644 paddle/fluid/operators/fused/fusion_lstm_op.cc create mode 100644 paddle/fluid/operators/fused/fusion_lstm_op.h create mode 100644 paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc create mode 100644 paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.h create mode 100644 paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc create mode 100644 paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.h create mode 100644 paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc create mode 100644 paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.h create mode 100644 paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc create mode 100644 paddle/fluid/operators/fused/fusion_seqpool_concat_op.h create mode 100644 paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc create mode 100644 paddle/fluid/operators/fused/fusion_squared_mat_sub_op.h create mode 100644 paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cc create mode 100644 paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.cu.cc create mode 100644 paddle/fluid/operators/fused/fusion_transpose_flatten_concat_op.h create mode 100644 paddle/fluid/operators/gather.cu.h create mode 100644 paddle/fluid/operators/gather.h create mode 100644 paddle/fluid/operators/gather_op.cc create mode 100644 paddle/fluid/operators/gather_op.cu create mode 100644 paddle/fluid/operators/gather_op.h create mode 100644 paddle/fluid/operators/gather_test.cc create mode 100644 paddle/fluid/operators/gaussian_random_batch_size_like_op.cc create mode 100644 paddle/fluid/operators/gaussian_random_op.cc create mode 100644 paddle/fluid/operators/gaussian_random_op.cu create mode 100644 paddle/fluid/operators/get_tensor_from_selected_rows_op.cc create mode 100644 paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc create mode 100644 paddle/fluid/operators/grid_sampler_op.cc create mode 100644 paddle/fluid/operators/grid_sampler_op.h create mode 100644 paddle/fluid/operators/group_norm_op.cc create mode 100644 paddle/fluid/operators/group_norm_op.cu create mode 100644 paddle/fluid/operators/group_norm_op.h create mode 100644 paddle/fluid/operators/gru_op.cc create mode 100644 paddle/fluid/operators/gru_op.cu.cc create mode 100644 paddle/fluid/operators/gru_op.h create mode 100644 paddle/fluid/operators/gru_unit_op.cc create mode 100644 paddle/fluid/operators/gru_unit_op.cu create mode 100644 paddle/fluid/operators/gru_unit_op.h create mode 100644 paddle/fluid/operators/hash_op.cc create mode 100644 paddle/fluid/operators/hash_op.h create mode 100644 paddle/fluid/operators/hierarchical_sigmoid_op.cc create mode 100644 paddle/fluid/operators/hierarchical_sigmoid_op.h create mode 100644 paddle/fluid/operators/hinge_loss_op.cc create mode 100644 paddle/fluid/operators/hinge_loss_op.cu create mode 100644 paddle/fluid/operators/hinge_loss_op.h create mode 100644 paddle/fluid/operators/huber_loss_op.cc create mode 100644 paddle/fluid/operators/huber_loss_op.cu create mode 100644 paddle/fluid/operators/huber_loss_op.h create mode 100644 paddle/fluid/operators/im2sequence_op.cc create mode 100644 paddle/fluid/operators/im2sequence_op.cu create mode 100644 paddle/fluid/operators/im2sequence_op.h create mode 100644 paddle/fluid/operators/increment_op.cc create mode 100644 paddle/fluid/operators/increment_op.cu create mode 100644 paddle/fluid/operators/increment_op.h create mode 100644 paddle/fluid/operators/interpolate_op.cc create mode 100644 paddle/fluid/operators/interpolate_op.cu create mode 100644 paddle/fluid/operators/interpolate_op.h create mode 100644 paddle/fluid/operators/is_empty_op.cc create mode 100644 paddle/fluid/operators/is_empty_op.cu.cc create mode 100644 paddle/fluid/operators/is_empty_op.h create mode 100644 paddle/fluid/operators/isfinite_op.cc create mode 100644 paddle/fluid/operators/isfinite_op.cu create mode 100644 paddle/fluid/operators/isfinite_op.h create mode 100644 paddle/fluid/operators/jit/CMakeLists.txt create mode 100644 paddle/fluid/operators/jit/README.en.md create mode 100644 paddle/fluid/operators/jit/README.md create mode 100644 paddle/fluid/operators/jit/benchmark.cc create mode 100644 paddle/fluid/operators/jit/gen/CMakeLists.txt create mode 100644 paddle/fluid/operators/jit/gen/act.cc create mode 100644 paddle/fluid/operators/jit/gen/act.h create mode 100644 paddle/fluid/operators/jit/gen/blas.cc create mode 100644 paddle/fluid/operators/jit/gen/blas.h create mode 100644 paddle/fluid/operators/jit/gen/embseqpool.cc create mode 100644 paddle/fluid/operators/jit/gen/embseqpool.h create mode 100644 paddle/fluid/operators/jit/gen/gru.cc create mode 100644 paddle/fluid/operators/jit/gen/gru.h create mode 100644 paddle/fluid/operators/jit/gen/hopv.cc create mode 100644 paddle/fluid/operators/jit/gen/hopv.h create mode 100644 paddle/fluid/operators/jit/gen/jitcode.h create mode 100644 paddle/fluid/operators/jit/gen/lstm.cc create mode 100644 paddle/fluid/operators/jit/gen/lstm.h create mode 100644 paddle/fluid/operators/jit/gen/matmul.cc create mode 100644 paddle/fluid/operators/jit/gen/matmul.h create mode 100644 paddle/fluid/operators/jit/gen/seqpool.cc create mode 100644 paddle/fluid/operators/jit/gen/seqpool.h create mode 100644 paddle/fluid/operators/jit/gen/sgd.cc create mode 100644 paddle/fluid/operators/jit/gen/sgd.h create mode 100644 paddle/fluid/operators/jit/gen/vbroadcast.cc create mode 100644 paddle/fluid/operators/jit/gen/vbroadcast.h create mode 100644 paddle/fluid/operators/jit/gen_base.cc create mode 100644 paddle/fluid/operators/jit/gen_base.h create mode 100644 paddle/fluid/operators/jit/helper.cc create mode 100644 paddle/fluid/operators/jit/helper.h create mode 100644 paddle/fluid/operators/jit/kernel_base.h create mode 100644 paddle/fluid/operators/jit/kernel_key.cc create mode 100644 paddle/fluid/operators/jit/kernel_key.h create mode 100644 paddle/fluid/operators/jit/kernel_pool.cc create mode 100644 paddle/fluid/operators/jit/kernel_pool.h create mode 100644 paddle/fluid/operators/jit/kernels.h create mode 100644 paddle/fluid/operators/jit/macro.h create mode 100644 paddle/fluid/operators/jit/more/CMakeLists.txt create mode 100644 paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt create mode 100644 paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc create mode 100644 paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h create mode 100644 paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc create mode 100644 paddle/fluid/operators/jit/more/intrinsic/layer_norm.h create mode 100644 paddle/fluid/operators/jit/more/mix/CMakeLists.txt create mode 100644 paddle/fluid/operators/jit/more/mix/mix.cc create mode 100644 paddle/fluid/operators/jit/more/mix/mix.h create mode 100644 paddle/fluid/operators/jit/more/mkl/CMakeLists.txt create mode 100644 paddle/fluid/operators/jit/more/mkl/mkl.cc create mode 100644 paddle/fluid/operators/jit/more/mkl/mkl.h create mode 100644 paddle/fluid/operators/jit/refer/CMakeLists.txt create mode 100644 paddle/fluid/operators/jit/refer/refer.cc create mode 100644 paddle/fluid/operators/jit/refer/refer.h create mode 100644 paddle/fluid/operators/jit/registry.h create mode 100644 paddle/fluid/operators/jit/test.cc create mode 100644 paddle/fluid/operators/kldiv_loss_op.cc create mode 100644 paddle/fluid/operators/kldiv_loss_op.cu create mode 100644 paddle/fluid/operators/kldiv_loss_op.h create mode 100644 paddle/fluid/operators/l1_norm_op.cc create mode 100644 paddle/fluid/operators/l1_norm_op.cu create mode 100644 paddle/fluid/operators/l1_norm_op.h create mode 100644 paddle/fluid/operators/label_smooth_op.cc create mode 100644 paddle/fluid/operators/label_smooth_op.cu create mode 100644 paddle/fluid/operators/label_smooth_op.h create mode 100644 paddle/fluid/operators/layer_norm_op.cc create mode 100644 paddle/fluid/operators/layer_norm_op.cu create mode 100644 paddle/fluid/operators/layer_norm_op.h create mode 100644 paddle/fluid/operators/linear_chain_crf_op.cc create mode 100644 paddle/fluid/operators/linear_chain_crf_op.cu create mode 100644 paddle/fluid/operators/linear_chain_crf_op.h create mode 100644 paddle/fluid/operators/linspace_op.cc create mode 100644 paddle/fluid/operators/linspace_op.cu create mode 100644 paddle/fluid/operators/linspace_op.h create mode 100644 paddle/fluid/operators/load_combine_op.cc create mode 100644 paddle/fluid/operators/load_combine_op.cu create mode 100644 paddle/fluid/operators/load_combine_op.h create mode 100644 paddle/fluid/operators/load_op.cc create mode 100644 paddle/fluid/operators/load_op.cu create mode 100644 paddle/fluid/operators/load_op.h create mode 100644 paddle/fluid/operators/lod_array_length_op.cc create mode 100644 paddle/fluid/operators/lod_rank_table_op.cc create mode 100644 paddle/fluid/operators/lod_reset_op.cc create mode 100644 paddle/fluid/operators/lod_reset_op.cu create mode 100644 paddle/fluid/operators/lod_reset_op.h create mode 100644 paddle/fluid/operators/lod_tensor_to_array_op.cc create mode 100644 paddle/fluid/operators/log_loss_op.cc create mode 100644 paddle/fluid/operators/log_loss_op.cu create mode 100644 paddle/fluid/operators/log_loss_op.h create mode 100644 paddle/fluid/operators/lookup_sparse_table_op.cc create mode 100644 paddle/fluid/operators/lookup_table_op.cc create mode 100644 paddle/fluid/operators/lookup_table_op.cu create mode 100644 paddle/fluid/operators/lookup_table_op.h create mode 100644 paddle/fluid/operators/lrn_op.cc create mode 100644 paddle/fluid/operators/lrn_op.cu create mode 100644 paddle/fluid/operators/lrn_op.h create mode 100644 paddle/fluid/operators/lstm_op.cc create mode 100644 paddle/fluid/operators/lstm_op.cu.cc create mode 100644 paddle/fluid/operators/lstm_op.h create mode 100644 paddle/fluid/operators/lstm_unit_op.cc create mode 100644 paddle/fluid/operators/lstm_unit_op.cu create mode 100644 paddle/fluid/operators/lstm_unit_op.h create mode 100644 paddle/fluid/operators/lstmp_op.cc create mode 100644 paddle/fluid/operators/lstmp_op.cu create mode 100644 paddle/fluid/operators/lstmp_op.h create mode 100644 paddle/fluid/operators/margin_rank_loss_op.cc create mode 100644 paddle/fluid/operators/margin_rank_loss_op.cu create mode 100644 paddle/fluid/operators/margin_rank_loss_op.h create mode 100644 paddle/fluid/operators/math.h create mode 100644 paddle/fluid/operators/math/CMakeLists.txt create mode 100644 paddle/fluid/operators/math/algorithm.h create mode 100644 paddle/fluid/operators/math/beam_search.cc create mode 100644 paddle/fluid/operators/math/beam_search.cu create mode 100644 paddle/fluid/operators/math/beam_search.h create mode 100644 paddle/fluid/operators/math/beam_search_test.cc create mode 100644 paddle/fluid/operators/math/blas.cc create mode 100644 paddle/fluid/operators/math/blas.h create mode 100644 paddle/fluid/operators/math/blas_impl.cu.h create mode 100644 paddle/fluid/operators/math/blas_impl.h create mode 100644 paddle/fluid/operators/math/compound_functors.h create mode 100644 paddle/fluid/operators/math/concat.hip.cu create mode 100644 paddle/fluid/operators/math/concat_and_split.cc create mode 100644 paddle/fluid/operators/math/concat_and_split.cu create mode 100644 paddle/fluid/operators/math/concat_and_split.h create mode 100644 paddle/fluid/operators/math/concat_test.cc create mode 100644 paddle/fluid/operators/math/context_project.cc create mode 100644 paddle/fluid/operators/math/context_project.cu create mode 100644 paddle/fluid/operators/math/context_project.h create mode 100644 paddle/fluid/operators/math/cos_sim_functor.cc create mode 100644 paddle/fluid/operators/math/cos_sim_functor.cu create mode 100644 paddle/fluid/operators/math/cos_sim_functor.h create mode 100644 paddle/fluid/operators/math/cpu_vec.h create mode 100644 paddle/fluid/operators/math/cpu_vec_test.cc create mode 100644 paddle/fluid/operators/math/cross_entropy.cc create mode 100644 paddle/fluid/operators/math/cross_entropy.cu create mode 100644 paddle/fluid/operators/math/cross_entropy.h create mode 100644 paddle/fluid/operators/math/depthwise_conv.cu create mode 100644 paddle/fluid/operators/math/depthwise_conv.h create mode 100644 paddle/fluid/operators/math/detail/CMakeLists.txt create mode 100644 paddle/fluid/operators/math/detail/activation_functions.h create mode 100644 paddle/fluid/operators/math/detail/avx_functions.cc create mode 100644 paddle/fluid/operators/math/detail/avx_mathfun.h create mode 100644 paddle/fluid/operators/math/detail/gru_cpu_kernel.h create mode 100644 paddle/fluid/operators/math/detail/gru_gpu_kernel.h create mode 100644 paddle/fluid/operators/math/detail/gru_kernel.h create mode 100644 paddle/fluid/operators/math/detail/lstm_cpu_kernel.h create mode 100644 paddle/fluid/operators/math/detail/lstm_gpu_kernel.h create mode 100644 paddle/fluid/operators/math/detail/lstm_kernel.h create mode 100644 paddle/fluid/operators/math/fc_compute.h create mode 100644 paddle/fluid/operators/math/functors.h create mode 100644 paddle/fluid/operators/math/gru_compute.cc create mode 100644 paddle/fluid/operators/math/gru_compute.cu create mode 100644 paddle/fluid/operators/math/gru_compute.h create mode 100644 paddle/fluid/operators/math/im2col.cc create mode 100644 paddle/fluid/operators/math/im2col.cu create mode 100644 paddle/fluid/operators/math/im2col.h create mode 100644 paddle/fluid/operators/math/im2col_cfo_cpu.h create mode 100644 paddle/fluid/operators/math/im2col_test.cc create mode 100644 paddle/fluid/operators/math/lstm_compute.cc create mode 100644 paddle/fluid/operators/math/lstm_compute.cu create mode 100644 paddle/fluid/operators/math/lstm_compute.h create mode 100644 paddle/fluid/operators/math/math_function.cc create mode 100644 paddle/fluid/operators/math/math_function.cu create mode 100644 paddle/fluid/operators/math/math_function.h create mode 100644 paddle/fluid/operators/math/math_function_impl.h create mode 100644 paddle/fluid/operators/math/math_function_test.cc create mode 100644 paddle/fluid/operators/math/math_function_test.cu create mode 100644 paddle/fluid/operators/math/matrix_bit_code.cc create mode 100644 paddle/fluid/operators/math/matrix_bit_code.h create mode 100644 paddle/fluid/operators/math/maxouting.cc create mode 100644 paddle/fluid/operators/math/maxouting.cu create mode 100644 paddle/fluid/operators/math/maxouting.h create mode 100644 paddle/fluid/operators/math/padding.h create mode 100644 paddle/fluid/operators/math/pooling.cc create mode 100644 paddle/fluid/operators/math/pooling.cu create mode 100644 paddle/fluid/operators/math/pooling.h create mode 100644 paddle/fluid/operators/math/prelu.cu create mode 100644 paddle/fluid/operators/math/prelu.h create mode 100644 paddle/fluid/operators/math/sample_prob.cc create mode 100644 paddle/fluid/operators/math/sample_prob.cu create mode 100644 paddle/fluid/operators/math/sample_prob.h create mode 100644 paddle/fluid/operators/math/sampler.cc create mode 100644 paddle/fluid/operators/math/sampler.h create mode 100644 paddle/fluid/operators/math/selected_rows_functor.cc create mode 100644 paddle/fluid/operators/math/selected_rows_functor.cu create mode 100644 paddle/fluid/operators/math/selected_rows_functor.h create mode 100644 paddle/fluid/operators/math/selected_rows_functor_test.cc create mode 100644 paddle/fluid/operators/math/selected_rows_functor_test.cu.cc create mode 100644 paddle/fluid/operators/math/sequence2batch.cc create mode 100644 paddle/fluid/operators/math/sequence2batch.cu create mode 100644 paddle/fluid/operators/math/sequence2batch.h create mode 100644 paddle/fluid/operators/math/sequence_padding.cc create mode 100644 paddle/fluid/operators/math/sequence_padding.cu create mode 100644 paddle/fluid/operators/math/sequence_padding.h create mode 100644 paddle/fluid/operators/math/sequence_padding_test.cc create mode 100644 paddle/fluid/operators/math/sequence_pooling.cc create mode 100644 paddle/fluid/operators/math/sequence_pooling.cu create mode 100644 paddle/fluid/operators/math/sequence_pooling.h create mode 100644 paddle/fluid/operators/math/sequence_pooling_test.cc create mode 100644 paddle/fluid/operators/math/sequence_scale.cc create mode 100644 paddle/fluid/operators/math/sequence_scale.cu create mode 100644 paddle/fluid/operators/math/sequence_scale.h create mode 100644 paddle/fluid/operators/math/softmax.cc create mode 100644 paddle/fluid/operators/math/softmax.cu create mode 100644 paddle/fluid/operators/math/softmax.h create mode 100644 paddle/fluid/operators/math/softmax_impl.h create mode 100644 paddle/fluid/operators/math/tree2col.cc create mode 100644 paddle/fluid/operators/math/tree2col.cu create mode 100644 paddle/fluid/operators/math/tree2col.h create mode 100644 paddle/fluid/operators/math/unpooling.cc create mode 100644 paddle/fluid/operators/math/unpooling.cu create mode 100644 paddle/fluid/operators/math/unpooling.h create mode 100644 paddle/fluid/operators/math/vol2col.cc create mode 100644 paddle/fluid/operators/math/vol2col.cu create mode 100644 paddle/fluid/operators/math/vol2col.h create mode 100644 paddle/fluid/operators/math/vol2col_test.cc create mode 100644 paddle/fluid/operators/matmul_op.cc create mode 100644 paddle/fluid/operators/max_sequence_len_op.cc create mode 100644 paddle/fluid/operators/maxout_op.cc create mode 100644 paddle/fluid/operators/maxout_op.cu.cc create mode 100644 paddle/fluid/operators/maxout_op.h create mode 100644 paddle/fluid/operators/mean_iou_op.cc create mode 100644 paddle/fluid/operators/mean_iou_op.cu create mode 100644 paddle/fluid/operators/mean_iou_op.h create mode 100644 paddle/fluid/operators/mean_op.cc create mode 100644 paddle/fluid/operators/mean_op.cu create mode 100644 paddle/fluid/operators/mean_op.h create mode 100644 paddle/fluid/operators/merge_lod_tensor_op.cc create mode 100644 paddle/fluid/operators/merge_selected_rows_op.cc create mode 100644 paddle/fluid/operators/merge_selected_rows_op.cu.cc create mode 100644 paddle/fluid/operators/merge_selected_rows_op.h create mode 100644 paddle/fluid/operators/metrics/CMakeLists.txt create mode 100644 paddle/fluid/operators/metrics/accuracy_op.cc create mode 100644 paddle/fluid/operators/metrics/accuracy_op.cu create mode 100644 paddle/fluid/operators/metrics/accuracy_op.h create mode 100644 paddle/fluid/operators/metrics/auc_op.cc create mode 100644 paddle/fluid/operators/metrics/auc_op.h create mode 100644 paddle/fluid/operators/metrics/precision_recall_op.cc create mode 100644 paddle/fluid/operators/metrics/precision_recall_op.h create mode 100644 paddle/fluid/operators/minus_op.cc create mode 100644 paddle/fluid/operators/minus_op.cu create mode 100644 paddle/fluid/operators/minus_op.h create mode 100644 paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc create mode 100644 paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc create mode 100644 paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc create mode 100644 paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc create mode 100644 paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc create mode 100644 paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc create mode 100644 paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc create mode 100644 paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc create mode 100644 paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc create mode 100644 paddle/fluid/operators/mkldnn/mkldnn_activation_op.h create mode 100644 paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc create mode 100644 paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc create mode 100644 paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc create mode 100644 paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc create mode 100644 paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc create mode 100644 paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc create mode 100644 paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc create mode 100644 paddle/fluid/operators/modified_huber_loss_op.cc create mode 100644 paddle/fluid/operators/modified_huber_loss_op.cu create mode 100644 paddle/fluid/operators/modified_huber_loss_op.h create mode 100644 paddle/fluid/operators/mul_op.cc create mode 100644 paddle/fluid/operators/mul_op.cu.cc create mode 100644 paddle/fluid/operators/mul_op.h create mode 100644 paddle/fluid/operators/multiplex_op.cc create mode 100644 paddle/fluid/operators/multiplex_op.cu create mode 100644 paddle/fluid/operators/multiplex_op.h create mode 100644 paddle/fluid/operators/nccl/CMakeLists.txt create mode 100644 paddle/fluid/operators/nccl/nccl_gpu_common.cc create mode 100644 paddle/fluid/operators/nccl/nccl_gpu_common.h create mode 100644 paddle/fluid/operators/nccl/nccl_op.cc create mode 100644 paddle/fluid/operators/nccl/nccl_op.cu.cc create mode 100644 paddle/fluid/operators/nccl/nccl_op_test.cu.cc create mode 100644 paddle/fluid/operators/nce_op.cc create mode 100644 paddle/fluid/operators/nce_op.h create mode 100644 paddle/fluid/operators/ngraph/CMakeLists.txt create mode 100644 paddle/fluid/operators/ngraph/ngraph_bridge.cc create mode 100644 paddle/fluid/operators/ngraph/ngraph_bridge.h create mode 100644 paddle/fluid/operators/ngraph/ngraph_engine.cc create mode 100644 paddle/fluid/operators/ngraph/ngraph_engine.h create mode 100644 paddle/fluid/operators/ngraph/ngraph_engine_op.cc create mode 100644 paddle/fluid/operators/ngraph/ngraph_engine_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/CMakeLists.txt create mode 100644 paddle/fluid/operators/ngraph/ops/accuracy_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/activation_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/adam_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/assign_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/batch_norm_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/binary_unary_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/cast_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/concat_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/conv2d_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/cross_entropy_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/dropout_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/elementwise_add_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/elementwise_binary_prepare_node.h create mode 100644 paddle/fluid/operators/ngraph/ops/elementwise_div_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/elementwise_mul_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/elementwise_node.h create mode 100644 paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/fill_constant_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/fill_zeros_like_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/gather_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/increment_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/layer_norm_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/lookup_table_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/lrn_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/matmul_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/mean_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/momentum_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/mul_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/op_bridge.h create mode 100644 paddle/fluid/operators/ngraph/ops/pool2d_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/reduce_sum_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/reshape_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/scale_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/slice_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/softmax_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/softmax_with_cross_entropy_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/stack_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/sum_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/top_k_op.h create mode 100644 paddle/fluid/operators/ngraph/ops/transpose_op.h create mode 100644 paddle/fluid/operators/norm_op.cc create mode 100644 paddle/fluid/operators/norm_op.cu create mode 100644 paddle/fluid/operators/norm_op.h create mode 100644 paddle/fluid/operators/one_hot_op.cc create mode 100644 paddle/fluid/operators/one_hot_op.cu create mode 100644 paddle/fluid/operators/one_hot_op.h create mode 100644 paddle/fluid/operators/optimizers/CMakeLists.txt create mode 100644 paddle/fluid/operators/optimizers/adadelta_op.cc create mode 100644 paddle/fluid/operators/optimizers/adadelta_op.cu create mode 100644 paddle/fluid/operators/optimizers/adadelta_op.h create mode 100644 paddle/fluid/operators/optimizers/adagrad_op.cc create mode 100644 paddle/fluid/operators/optimizers/adagrad_op.cu create mode 100644 paddle/fluid/operators/optimizers/adagrad_op.h create mode 100644 paddle/fluid/operators/optimizers/adam_op.cc create mode 100644 paddle/fluid/operators/optimizers/adam_op.cu create mode 100644 paddle/fluid/operators/optimizers/adam_op.h create mode 100644 paddle/fluid/operators/optimizers/adamax_op.cc create mode 100644 paddle/fluid/operators/optimizers/adamax_op.cu create mode 100644 paddle/fluid/operators/optimizers/adamax_op.h create mode 100644 paddle/fluid/operators/optimizers/decayed_adagrad_op.cc create mode 100644 paddle/fluid/operators/optimizers/decayed_adagrad_op.cu create mode 100644 paddle/fluid/operators/optimizers/decayed_adagrad_op.h create mode 100644 paddle/fluid/operators/optimizers/ftrl_op.cc create mode 100644 paddle/fluid/operators/optimizers/ftrl_op.cu create mode 100644 paddle/fluid/operators/optimizers/ftrl_op.h create mode 100644 paddle/fluid/operators/optimizers/lamb_op.cc create mode 100644 paddle/fluid/operators/optimizers/lamb_op.cu create mode 100644 paddle/fluid/operators/optimizers/lamb_op.h create mode 100644 paddle/fluid/operators/optimizers/lars_momentum_op.cc create mode 100644 paddle/fluid/operators/optimizers/lars_momentum_op.cu create mode 100644 paddle/fluid/operators/optimizers/lars_momentum_op.h create mode 100644 paddle/fluid/operators/optimizers/momentum_op.cc create mode 100644 paddle/fluid/operators/optimizers/momentum_op.cu create mode 100644 paddle/fluid/operators/optimizers/momentum_op.h create mode 100644 paddle/fluid/operators/optimizers/proximal_adagrad_op.cc create mode 100644 paddle/fluid/operators/optimizers/proximal_adagrad_op.cu create mode 100644 paddle/fluid/operators/optimizers/proximal_adagrad_op.h create mode 100644 paddle/fluid/operators/optimizers/proximal_gd_op.cc create mode 100644 paddle/fluid/operators/optimizers/proximal_gd_op.cu create mode 100644 paddle/fluid/operators/optimizers/proximal_gd_op.h create mode 100644 paddle/fluid/operators/optimizers/rmsprop_op.cc create mode 100644 paddle/fluid/operators/optimizers/rmsprop_op.cu create mode 100644 paddle/fluid/operators/optimizers/rmsprop_op.h create mode 100644 paddle/fluid/operators/optimizers/sgd_op.cc create mode 100644 paddle/fluid/operators/optimizers/sgd_op.cu create mode 100644 paddle/fluid/operators/optimizers/sgd_op.h create mode 100644 paddle/fluid/operators/pad2d_op.cc create mode 100644 paddle/fluid/operators/pad2d_op.cu create mode 100644 paddle/fluid/operators/pad_constant_like_op.cc create mode 100644 paddle/fluid/operators/pad_constant_like_op.cu create mode 100644 paddle/fluid/operators/pad_constant_like_op.h create mode 100644 paddle/fluid/operators/pad_op.cc create mode 100644 paddle/fluid/operators/pad_op.cu create mode 100644 paddle/fluid/operators/pad_op.h create mode 100644 paddle/fluid/operators/pixel_shuffle_op.cc create mode 100644 paddle/fluid/operators/pixel_shuffle_op.cu create mode 100644 paddle/fluid/operators/pixel_shuffle_op.h create mode 100644 paddle/fluid/operators/pool_cudnn_op.cu.cc create mode 100644 paddle/fluid/operators/pool_op.cc create mode 100644 paddle/fluid/operators/pool_op.cu.cc create mode 100644 paddle/fluid/operators/pool_op.h create mode 100644 paddle/fluid/operators/pool_with_index_op.cc create mode 100644 paddle/fluid/operators/pool_with_index_op.cu.cc create mode 100644 paddle/fluid/operators/pool_with_index_op.h create mode 100644 paddle/fluid/operators/positive_negative_pair_op.cc create mode 100644 paddle/fluid/operators/positive_negative_pair_op.h create mode 100644 paddle/fluid/operators/prelu_op.cc create mode 100644 paddle/fluid/operators/prelu_op.cu create mode 100644 paddle/fluid/operators/prelu_op.h create mode 100644 paddle/fluid/operators/print_op.cc create mode 100644 paddle/fluid/operators/psroi_pool_op.cc create mode 100644 paddle/fluid/operators/psroi_pool_op.cu create mode 100644 paddle/fluid/operators/psroi_pool_op.h create mode 100644 paddle/fluid/operators/py_func_op.cc create mode 100644 paddle/fluid/operators/py_func_op.h create mode 100644 paddle/fluid/operators/quantize_op.cc create mode 100644 paddle/fluid/operators/quantize_op.h create mode 100644 paddle/fluid/operators/random_crop_op.cc create mode 100644 paddle/fluid/operators/random_crop_op.cu create mode 100644 paddle/fluid/operators/random_crop_op.h create mode 100644 paddle/fluid/operators/range_op.cc create mode 100644 paddle/fluid/operators/range_op.cu create mode 100644 paddle/fluid/operators/range_op.h create mode 100644 paddle/fluid/operators/rank_loss_op.cc create mode 100644 paddle/fluid/operators/rank_loss_op.cu create mode 100644 paddle/fluid/operators/rank_loss_op.h create mode 100644 paddle/fluid/operators/reader/CMakeLists.txt create mode 100644 paddle/fluid/operators/reader/blocking_queue.h create mode 100644 paddle/fluid/operators/reader/buffered_reader.cc create mode 100644 paddle/fluid/operators/reader/buffered_reader.h create mode 100644 paddle/fluid/operators/reader/create_batch_reader_op.cc create mode 100644 paddle/fluid/operators/reader/create_ctr_reader_op.cc create mode 100644 paddle/fluid/operators/reader/create_custom_reader_op.cc create mode 100644 paddle/fluid/operators/reader/create_double_buffer_reader_op.cc create mode 100644 paddle/fluid/operators/reader/create_multi_pass_reader_op.cc create mode 100644 paddle/fluid/operators/reader/create_py_reader_op.cc create mode 100644 paddle/fluid/operators/reader/create_random_data_generator_op.cc create mode 100644 paddle/fluid/operators/reader/create_recordio_file_reader_op.cc create mode 100644 paddle/fluid/operators/reader/create_shuffle_reader_op.cc create mode 100644 paddle/fluid/operators/reader/lod_tensor_blocking_queue.h create mode 100644 paddle/fluid/operators/reader/open_files_op.cc create mode 100644 paddle/fluid/operators/reader/py_reader.cc create mode 100644 paddle/fluid/operators/reader/py_reader.h create mode 100644 paddle/fluid/operators/reader/read_op.cc create mode 100644 paddle/fluid/operators/reader/reader_blocking_queue_test.cc create mode 100644 paddle/fluid/operators/reader/reader_op_registry.cc create mode 100644 paddle/fluid/operators/reader/reader_op_registry.h create mode 100644 paddle/fluid/operators/recurrent_op.cc create mode 100644 paddle/fluid/operators/recurrent_op.h create mode 100644 paddle/fluid/operators/reduce_ops/CMakeLists.txt create mode 100644 paddle/fluid/operators/reduce_ops/cub_reduce.h create mode 100644 paddle/fluid/operators/reduce_ops/reduce_all_op.cc create mode 100644 paddle/fluid/operators/reduce_ops/reduce_all_op.cu create mode 100644 paddle/fluid/operators/reduce_ops/reduce_all_op.h create mode 100644 paddle/fluid/operators/reduce_ops/reduce_any_op.cc create mode 100644 paddle/fluid/operators/reduce_ops/reduce_any_op.cu create mode 100644 paddle/fluid/operators/reduce_ops/reduce_any_op.h create mode 100644 paddle/fluid/operators/reduce_ops/reduce_max_op.cc create mode 100644 paddle/fluid/operators/reduce_ops/reduce_max_op.cu create mode 100644 paddle/fluid/operators/reduce_ops/reduce_max_op.part.cu create mode 100644 paddle/fluid/operators/reduce_ops/reduce_mean_op.cc create mode 100644 paddle/fluid/operators/reduce_ops/reduce_mean_op.cu create mode 100644 paddle/fluid/operators/reduce_ops/reduce_mean_op.h create mode 100644 paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu create mode 100644 paddle/fluid/operators/reduce_ops/reduce_min_max_op.h create mode 100644 paddle/fluid/operators/reduce_ops/reduce_min_op.cc create mode 100644 paddle/fluid/operators/reduce_ops/reduce_min_op.cu create mode 100644 paddle/fluid/operators/reduce_ops/reduce_min_op.part.cu create mode 100644 paddle/fluid/operators/reduce_ops/reduce_op.h create mode 100644 paddle/fluid/operators/reduce_ops/reduce_op_function.h create mode 100644 paddle/fluid/operators/reduce_ops/reduce_prod_op.cc create mode 100644 paddle/fluid/operators/reduce_ops/reduce_prod_op.cu create mode 100644 paddle/fluid/operators/reduce_ops/reduce_prod_op.h create mode 100644 paddle/fluid/operators/reduce_ops/reduce_prod_op.part.cu create mode 100644 paddle/fluid/operators/reduce_ops/reduce_sum_op.cc create mode 100644 paddle/fluid/operators/reduce_ops/reduce_sum_op.cu create mode 100644 paddle/fluid/operators/reduce_ops/reduce_sum_op.h create mode 100644 paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu create mode 100644 paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc create mode 100644 paddle/fluid/operators/requantize_op.cc create mode 100644 paddle/fluid/operators/requantize_op.h create mode 100644 paddle/fluid/operators/reshape_op.cc create mode 100644 paddle/fluid/operators/reverse_op.cc create mode 100644 paddle/fluid/operators/reverse_op.cu create mode 100644 paddle/fluid/operators/reverse_op.h create mode 100644 paddle/fluid/operators/rnn_memory_helper_op.cc create mode 100644 paddle/fluid/operators/roi_align_op.cc create mode 100644 paddle/fluid/operators/roi_align_op.cu create mode 100644 paddle/fluid/operators/roi_align_op.h create mode 100644 paddle/fluid/operators/roi_pool_op.cc create mode 100644 paddle/fluid/operators/roi_pool_op.cu create mode 100644 paddle/fluid/operators/roi_pool_op.h create mode 100644 paddle/fluid/operators/row_conv_op.cc create mode 100644 paddle/fluid/operators/row_conv_op.cu create mode 100644 paddle/fluid/operators/row_conv_op.h create mode 100644 paddle/fluid/operators/sample_logits_op.cc create mode 100644 paddle/fluid/operators/sample_logits_op.cu create mode 100644 paddle/fluid/operators/sample_logits_op.h create mode 100644 paddle/fluid/operators/sampling_id_op.cc create mode 100644 paddle/fluid/operators/sampling_id_op.cu create mode 100644 paddle/fluid/operators/sampling_id_op.h create mode 100644 paddle/fluid/operators/save_combine_op.cc create mode 100644 paddle/fluid/operators/save_combine_op.cu create mode 100644 paddle/fluid/operators/save_combine_op.h create mode 100644 paddle/fluid/operators/save_load_combine_op_test.cc create mode 100644 paddle/fluid/operators/save_load_op_test.cc create mode 100644 paddle/fluid/operators/save_op.cc create mode 100644 paddle/fluid/operators/save_op.cu create mode 100644 paddle/fluid/operators/save_op.h create mode 100644 paddle/fluid/operators/scale_op.cc create mode 100644 paddle/fluid/operators/scale_op.cu create mode 100644 paddle/fluid/operators/scale_op.h create mode 100644 paddle/fluid/operators/scatter.cu.h create mode 100644 paddle/fluid/operators/scatter.h create mode 100644 paddle/fluid/operators/scatter_op.cc create mode 100644 paddle/fluid/operators/scatter_op.cu create mode 100644 paddle/fluid/operators/scatter_op.h create mode 100644 paddle/fluid/operators/scatter_test.cc create mode 100644 paddle/fluid/operators/selu_op.cc create mode 100644 paddle/fluid/operators/selu_op.cu create mode 100644 paddle/fluid/operators/selu_op.h create mode 100644 paddle/fluid/operators/sequence_ops/CMakeLists.txt create mode 100644 paddle/fluid/operators/sequence_ops/sequence_concat_op.cc create mode 100644 paddle/fluid/operators/sequence_ops/sequence_concat_op.cu.cc create mode 100644 paddle/fluid/operators/sequence_ops/sequence_concat_op.h create mode 100644 paddle/fluid/operators/sequence_ops/sequence_conv_op.cc create mode 100644 paddle/fluid/operators/sequence_ops/sequence_conv_op.cu.cc create mode 100644 paddle/fluid/operators/sequence_ops/sequence_conv_op.h create mode 100644 paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc create mode 100644 paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu create mode 100644 paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h create mode 100644 paddle/fluid/operators/sequence_ops/sequence_erase_op.cc create mode 100644 paddle/fluid/operators/sequence_ops/sequence_erase_op.cu create mode 100644 paddle/fluid/operators/sequence_ops/sequence_erase_op.h create mode 100644 paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc create mode 100644 paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu create mode 100644 paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h create mode 100644 paddle/fluid/operators/sequence_ops/sequence_expand_op.cc create mode 100644 paddle/fluid/operators/sequence_ops/sequence_expand_op.cu create mode 100644 paddle/fluid/operators/sequence_ops/sequence_expand_op.h create mode 100644 paddle/fluid/operators/sequence_ops/sequence_mask_op.cc create mode 100644 paddle/fluid/operators/sequence_ops/sequence_mask_op.cu create mode 100644 paddle/fluid/operators/sequence_ops/sequence_mask_op.h create mode 100644 paddle/fluid/operators/sequence_ops/sequence_pad_op.cc create mode 100644 paddle/fluid/operators/sequence_ops/sequence_pad_op.cu create mode 100644 paddle/fluid/operators/sequence_ops/sequence_pad_op.h create mode 100644 paddle/fluid/operators/sequence_ops/sequence_pool_op.cc create mode 100644 paddle/fluid/operators/sequence_ops/sequence_pool_op.cu create mode 100644 paddle/fluid/operators/sequence_ops/sequence_pool_op.h create mode 100644 paddle/fluid/operators/sequence_ops/sequence_reshape_op.cc create mode 100644 paddle/fluid/operators/sequence_ops/sequence_reshape_op.cu create mode 100644 paddle/fluid/operators/sequence_ops/sequence_reshape_op.h create mode 100644 paddle/fluid/operators/sequence_ops/sequence_reverse_op.cc create mode 100644 paddle/fluid/operators/sequence_ops/sequence_reverse_op.cu create mode 100644 paddle/fluid/operators/sequence_ops/sequence_reverse_op.h create mode 100644 paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc create mode 100644 paddle/fluid/operators/sequence_ops/sequence_scatter_op.h create mode 100644 paddle/fluid/operators/sequence_ops/sequence_slice_op.cc create mode 100644 paddle/fluid/operators/sequence_ops/sequence_slice_op.cu create mode 100644 paddle/fluid/operators/sequence_ops/sequence_slice_op.h create mode 100644 paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc create mode 100644 paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc create mode 100644 paddle/fluid/operators/sequence_ops/sequence_softmax_op.cu create mode 100644 paddle/fluid/operators/sequence_ops/sequence_softmax_op.h create mode 100644 paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc create mode 100644 paddle/fluid/operators/sequence_ops/sequence_unpad_op.cu create mode 100644 paddle/fluid/operators/sequence_ops/sequence_unpad_op.h create mode 100644 paddle/fluid/operators/shape_op.cc create mode 100644 paddle/fluid/operators/shape_op.cu create mode 100644 paddle/fluid/operators/shape_op.h create mode 100644 paddle/fluid/operators/shard_index_op.cc create mode 100644 paddle/fluid/operators/shard_index_op.cu create mode 100644 paddle/fluid/operators/shard_index_op.h create mode 100644 paddle/fluid/operators/shrink_rnn_memory_op.cc create mode 100644 paddle/fluid/operators/shuffle_channel_op.cc create mode 100644 paddle/fluid/operators/shuffle_channel_op.cu create mode 100644 paddle/fluid/operators/shuffle_channel_op.h create mode 100644 paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc create mode 100644 paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu create mode 100644 paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h create mode 100644 paddle/fluid/operators/sign_op.cc create mode 100644 paddle/fluid/operators/sign_op.cu create mode 100644 paddle/fluid/operators/sign_op.h create mode 100644 paddle/fluid/operators/similarity_focus_op.cc create mode 100644 paddle/fluid/operators/similarity_focus_op.h create mode 100644 paddle/fluid/operators/size_op.cc create mode 100644 paddle/fluid/operators/size_op.cu create mode 100644 paddle/fluid/operators/size_op.h create mode 100644 paddle/fluid/operators/slice_op.cc create mode 100644 paddle/fluid/operators/slice_op.cu create mode 100644 paddle/fluid/operators/slice_op.h create mode 100644 paddle/fluid/operators/smooth_l1_loss_op.cc create mode 100644 paddle/fluid/operators/smooth_l1_loss_op.cu create mode 100644 paddle/fluid/operators/smooth_l1_loss_op.h create mode 100644 paddle/fluid/operators/softmax_cudnn_op.cu.cc create mode 100644 paddle/fluid/operators/softmax_op.cc create mode 100644 paddle/fluid/operators/softmax_op.cu.cc create mode 100644 paddle/fluid/operators/softmax_op.h create mode 100644 paddle/fluid/operators/softmax_with_cross_entropy_op.cc create mode 100644 paddle/fluid/operators/softmax_with_cross_entropy_op.cu create mode 100644 paddle/fluid/operators/softmax_with_cross_entropy_op.h create mode 100644 paddle/fluid/operators/space_to_depth_op.cc create mode 100644 paddle/fluid/operators/space_to_depth_op.cu create mode 100644 paddle/fluid/operators/space_to_depth_op.h create mode 100644 paddle/fluid/operators/spectral_norm_op.cc create mode 100644 paddle/fluid/operators/spectral_norm_op.cu create mode 100644 paddle/fluid/operators/spectral_norm_op.h create mode 100644 paddle/fluid/operators/split_lod_tensor_op.cc create mode 100644 paddle/fluid/operators/split_op.cc create mode 100644 paddle/fluid/operators/split_op.cu.cc create mode 100644 paddle/fluid/operators/split_op.h create mode 100644 paddle/fluid/operators/split_selected_rows_op.cc create mode 100644 paddle/fluid/operators/split_selected_rows_op.cu create mode 100644 paddle/fluid/operators/split_selected_rows_op.h create mode 100644 paddle/fluid/operators/spp_op.cc create mode 100644 paddle/fluid/operators/spp_op.cu.cc create mode 100644 paddle/fluid/operators/spp_op.h create mode 100644 paddle/fluid/operators/squared_l2_distance_op.cc create mode 100644 paddle/fluid/operators/squared_l2_distance_op.cu create mode 100644 paddle/fluid/operators/squared_l2_distance_op.h create mode 100644 paddle/fluid/operators/squared_l2_norm_op.cc create mode 100644 paddle/fluid/operators/squared_l2_norm_op.cu create mode 100644 paddle/fluid/operators/squared_l2_norm_op.h create mode 100644 paddle/fluid/operators/squeeze_op.cc create mode 100644 paddle/fluid/operators/stack_op.cc create mode 100644 paddle/fluid/operators/stack_op.cu create mode 100644 paddle/fluid/operators/stack_op.h create mode 100644 paddle/fluid/operators/strided_memcpy.h create mode 100644 paddle/fluid/operators/strided_memcpy_test.cc create mode 100644 paddle/fluid/operators/sum_op.cc create mode 100644 paddle/fluid/operators/sum_op.cu create mode 100644 paddle/fluid/operators/sum_op.h create mode 100644 paddle/fluid/operators/sync_batch_norm_op.cc create mode 100644 paddle/fluid/operators/sync_batch_norm_op.cu create mode 100644 paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc create mode 100644 paddle/fluid/operators/teacher_student_sigmoid_loss_op.h create mode 100644 paddle/fluid/operators/temporal_shift_op.cc create mode 100644 paddle/fluid/operators/temporal_shift_op.cu create mode 100644 paddle/fluid/operators/temporal_shift_op.h create mode 100644 paddle/fluid/operators/tensor_array_to_tensor_op.cc create mode 100644 paddle/fluid/operators/tensorrt/CMakeLists.txt create mode 100644 paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc create mode 100644 paddle/fluid/operators/tensorrt/tensorrt_engine_op.h create mode 100644 paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc create mode 100644 paddle/fluid/operators/top_k_op.cc create mode 100644 paddle/fluid/operators/top_k_op.cu create mode 100644 paddle/fluid/operators/top_k_op.h create mode 100644 paddle/fluid/operators/transpose_op.cc create mode 100644 paddle/fluid/operators/transpose_op.cu.cc create mode 100644 paddle/fluid/operators/transpose_op.h create mode 100644 paddle/fluid/operators/tree_conv_op.cc create mode 100644 paddle/fluid/operators/tree_conv_op.cu create mode 100644 paddle/fluid/operators/tree_conv_op.h create mode 100644 paddle/fluid/operators/truncated_gaussian_random_op.cc create mode 100644 paddle/fluid/operators/truncated_gaussian_random_op.cu create mode 100644 paddle/fluid/operators/unfold_op.cc create mode 100644 paddle/fluid/operators/unfold_op.cu create mode 100644 paddle/fluid/operators/unfold_op.h create mode 100644 paddle/fluid/operators/uniform_random_batch_size_like_op.cc create mode 100644 paddle/fluid/operators/uniform_random_op.cc create mode 100644 paddle/fluid/operators/uniform_random_op.cu create mode 100644 paddle/fluid/operators/unique_op.cc create mode 100644 paddle/fluid/operators/unique_op.h create mode 100644 paddle/fluid/operators/unpool_op.cc create mode 100644 paddle/fluid/operators/unpool_op.cu.cc create mode 100644 paddle/fluid/operators/unpool_op.h create mode 100644 paddle/fluid/operators/unsqueeze_op.cc create mode 100644 paddle/fluid/operators/unstack_op.cc create mode 100644 paddle/fluid/operators/unstack_op.h create mode 100644 paddle/fluid/operators/warpctc_cudnn_op.cu.cc create mode 100644 paddle/fluid/operators/warpctc_op.cc create mode 100644 paddle/fluid/operators/warpctc_op.cu.cc create mode 100644 paddle/fluid/operators/warpctc_op.h create mode 100644 paddle/fluid/operators/where_op.cc create mode 100644 paddle/fluid/operators/where_op.cu create mode 100644 paddle/fluid/operators/where_op.h create mode 100644 paddle/fluid/platform/CMakeLists.txt create mode 100644 paddle/fluid/platform/assert.h create mode 100644 paddle/fluid/platform/collective_helper.cc create mode 100644 paddle/fluid/platform/collective_helper.h create mode 100644 paddle/fluid/platform/cpu_helper.cc create mode 100644 paddle/fluid/platform/cpu_helper.h create mode 100644 paddle/fluid/platform/cpu_helper_test.cc create mode 100644 paddle/fluid/platform/cpu_info.cc create mode 100644 paddle/fluid/platform/cpu_info.h create mode 100644 paddle/fluid/platform/cpu_info_test.cc create mode 100644 paddle/fluid/platform/cuda_device_function.h create mode 100644 paddle/fluid/platform/cuda_device_guard.cc create mode 100644 paddle/fluid/platform/cuda_device_guard.h create mode 100644 paddle/fluid/platform/cuda_helper.h create mode 100644 paddle/fluid/platform/cuda_helper_test.cu create mode 100644 paddle/fluid/platform/cuda_primitives.h create mode 100644 paddle/fluid/platform/cuda_profiler.h create mode 100644 paddle/fluid/platform/cudnn_desc.h create mode 100644 paddle/fluid/platform/cudnn_desc_test.cc create mode 100644 paddle/fluid/platform/cudnn_helper.h create mode 100644 paddle/fluid/platform/cudnn_helper_test.cc create mode 100644 paddle/fluid/platform/cudnn_workspace_helper.h create mode 100644 paddle/fluid/platform/details/cuda_transform_iterator_cast.h create mode 100644 paddle/fluid/platform/device_context.cc create mode 100644 paddle/fluid/platform/device_context.h create mode 100644 paddle/fluid/platform/device_context_test.cu create mode 100644 paddle/fluid/platform/device_memory_aligment.cc create mode 100644 paddle/fluid/platform/device_memory_aligment.h create mode 100644 paddle/fluid/platform/device_tracer.cc create mode 100644 paddle/fluid/platform/device_tracer.h create mode 100644 paddle/fluid/platform/dynload/CMakeLists.txt create mode 100644 paddle/fluid/platform/dynload/cublas.cc create mode 100644 paddle/fluid/platform/dynload/cublas.h create mode 100644 paddle/fluid/platform/dynload/cudnn.cc create mode 100644 paddle/fluid/platform/dynload/cudnn.h create mode 100644 paddle/fluid/platform/dynload/cupti.cc create mode 100644 paddle/fluid/platform/dynload/cupti.h create mode 100644 paddle/fluid/platform/dynload/cupti_lib_path.h create mode 100644 paddle/fluid/platform/dynload/cupti_lib_path.h.in create mode 100644 paddle/fluid/platform/dynload/curand.cc create mode 100644 paddle/fluid/platform/dynload/curand.h create mode 100644 paddle/fluid/platform/dynload/dynamic_loader.cc create mode 100644 paddle/fluid/platform/dynload/dynamic_loader.h create mode 100644 paddle/fluid/platform/dynload/mklml.cc create mode 100644 paddle/fluid/platform/dynload/mklml.h create mode 100644 paddle/fluid/platform/dynload/nccl.cc create mode 100644 paddle/fluid/platform/dynload/nccl.h create mode 100644 paddle/fluid/platform/dynload/tensorrt.cc create mode 100644 paddle/fluid/platform/dynload/tensorrt.h create mode 100644 paddle/fluid/platform/dynload/warpctc.cc create mode 100644 paddle/fluid/platform/dynload/warpctc.h create mode 100644 paddle/fluid/platform/dynload/warpctc_lib_path.h create mode 100644 paddle/fluid/platform/dynload/warpctc_lib_path.h.in create mode 100644 paddle/fluid/platform/enforce.cc create mode 100644 paddle/fluid/platform/enforce.h create mode 100644 paddle/fluid/platform/enforce_test.cc create mode 100644 paddle/fluid/platform/event.h create mode 100644 paddle/fluid/platform/float16.h create mode 100644 paddle/fluid/platform/float16_test.cc create mode 100644 paddle/fluid/platform/float16_test.cu create mode 100644 paddle/fluid/platform/for_range.h create mode 100644 paddle/fluid/platform/gpu_info.cc create mode 100644 paddle/fluid/platform/gpu_info.h create mode 100644 paddle/fluid/platform/hostdevice.h create mode 100644 paddle/fluid/platform/init.cc create mode 100644 paddle/fluid/platform/init.h create mode 100644 paddle/fluid/platform/init_test.cc create mode 100644 paddle/fluid/platform/lock_guard_ptr.h create mode 100644 paddle/fluid/platform/lodtensor_printer.cc create mode 100644 paddle/fluid/platform/lodtensor_printer.h create mode 100644 paddle/fluid/platform/lodtensor_printer_test.cc create mode 100644 paddle/fluid/platform/macros.h create mode 100644 paddle/fluid/platform/mkldnn_helper.h create mode 100644 paddle/fluid/platform/mkldnn_reuse.h create mode 100644 paddle/fluid/platform/nccl_helper.h create mode 100644 paddle/fluid/platform/ngraph_helper.h create mode 100644 paddle/fluid/platform/place.cc create mode 100644 paddle/fluid/platform/place.h create mode 100644 paddle/fluid/platform/place_test.cc create mode 100644 paddle/fluid/platform/port.h create mode 100644 paddle/fluid/platform/profiler.cc create mode 100644 paddle/fluid/platform/profiler.cu create mode 100644 paddle/fluid/platform/profiler.h create mode 100644 paddle/fluid/platform/profiler.proto create mode 100644 paddle/fluid/platform/profiler_test.cc create mode 100644 paddle/fluid/platform/stream_callback_manager.cc create mode 100644 paddle/fluid/platform/stream_callback_manager.h create mode 100644 paddle/fluid/platform/temporary_allocator.cc create mode 100644 paddle/fluid/platform/temporary_allocator.h create mode 100644 paddle/fluid/platform/temporary_allocator_test.cc create mode 100644 paddle/fluid/platform/timer.cc create mode 100644 paddle/fluid/platform/timer.h create mode 100644 paddle/fluid/platform/timer_test.cc create mode 100644 paddle/fluid/platform/transform.h create mode 100644 paddle/fluid/platform/transform_test.cu create mode 100644 paddle/fluid/platform/variant.h create mode 100644 paddle/fluid/pybind/.gitignore create mode 100644 paddle/fluid/pybind/CMakeLists.txt create mode 100644 paddle/fluid/pybind/communicator_py.cc create mode 100644 paddle/fluid/pybind/communicator_py.h create mode 100644 paddle/fluid/pybind/const_value.cc create mode 100644 paddle/fluid/pybind/const_value.h create mode 100644 paddle/fluid/pybind/data_set_py.cc create mode 100644 paddle/fluid/pybind/data_set_py.h create mode 100644 paddle/fluid/pybind/exception.cc create mode 100644 paddle/fluid/pybind/exception.h create mode 100644 paddle/fluid/pybind/fleet_wrapper_py.cc create mode 100644 paddle/fluid/pybind/fleet_wrapper_py.h create mode 100644 paddle/fluid/pybind/imperative.cc create mode 100644 paddle/fluid/pybind/imperative.h create mode 100644 paddle/fluid/pybind/inference_api.cc create mode 100644 paddle/fluid/pybind/inference_api.h create mode 100644 paddle/fluid/pybind/ir.cc create mode 100644 paddle/fluid/pybind/ir.h create mode 100644 paddle/fluid/pybind/nccl_wrapper_py.cc create mode 100644 paddle/fluid/pybind/nccl_wrapper_py.h create mode 100644 paddle/fluid/pybind/protobuf.cc create mode 100644 paddle/fluid/pybind/protobuf.h create mode 100644 paddle/fluid/pybind/pybind.cc create mode 100644 paddle/fluid/pybind/pybind_boost_headers.h create mode 100644 paddle/fluid/pybind/reader_py.cc create mode 100644 paddle/fluid/pybind/reader_py.h create mode 100644 paddle/fluid/pybind/recordio.cc create mode 100644 paddle/fluid/pybind/recordio.h create mode 100644 paddle/fluid/pybind/tensor_py.h create mode 100644 paddle/fluid/recordio/CMakeLists.txt create mode 100644 paddle/fluid/recordio/README.md create mode 100644 paddle/fluid/recordio/chunk.cc create mode 100644 paddle/fluid/recordio/chunk.h create mode 100644 paddle/fluid/recordio/chunk_test.cc create mode 100644 paddle/fluid/recordio/header.cc create mode 100644 paddle/fluid/recordio/header.h create mode 100644 paddle/fluid/recordio/header_test.cc create mode 100644 paddle/fluid/recordio/scanner.cc create mode 100644 paddle/fluid/recordio/scanner.h create mode 100644 paddle/fluid/recordio/writer.cc create mode 100644 paddle/fluid/recordio/writer.h create mode 100644 paddle/fluid/recordio/writer_scanner_test.cc create mode 100644 paddle/fluid/string/CMakeLists.txt create mode 100644 paddle/fluid/string/piece.cc create mode 100644 paddle/fluid/string/piece.h create mode 100644 paddle/fluid/string/piece_test.cc create mode 100644 paddle/fluid/string/pretty_log.cc create mode 100644 paddle/fluid/string/pretty_log.h create mode 100644 paddle/fluid/string/printf.h create mode 100644 paddle/fluid/string/printf_test.cc create mode 100644 paddle/fluid/string/split.h create mode 100644 paddle/fluid/string/split_test.cc create mode 100644 paddle/fluid/string/string_helper.cc create mode 100644 paddle/fluid/string/string_helper.h create mode 100644 paddle/fluid/string/tinyformat/tinyformat.h create mode 100644 paddle/fluid/string/to_string.h create mode 100644 paddle/fluid/string/to_string_test.cc create mode 100644 paddle/fluid/train/CMakeLists.txt create mode 100644 paddle/fluid/train/demo/CMakeLists.txt create mode 100644 paddle/fluid/train/demo/README.md create mode 100644 paddle/fluid/train/demo/demo_network.py create mode 100644 paddle/fluid/train/demo/demo_trainer.cc create mode 100644 paddle/fluid/train/test_train_recognize_digits.cc create mode 100644 paddle/scripts/CMakeLists.txt create mode 100644 paddle/scripts/Dockerfile.tmp create mode 100644 paddle/scripts/README.md create mode 100644 paddle/scripts/build_docker_images.sh create mode 100644 paddle/scripts/doc/paddle-development-environment-gpu.graffle create mode 100644 paddle/scripts/doc/paddle-development-environment-gpu.png create mode 100644 paddle/scripts/doc/paddle-development-environment.graffle create mode 100644 paddle/scripts/doc/paddle-development-environment.png create mode 100755 paddle/scripts/docker/root/.bashrc create mode 100755 paddle/scripts/docker/root/.gitconfig create mode 100755 paddle/scripts/docker/root/.scripts/git-completion.sh create mode 100755 paddle/scripts/docker/root/.scripts/git-prompt.sh create mode 100644 paddle/scripts/fast_install.sh create mode 100644 paddle/scripts/installation_validate.py create mode 100755 paddle/scripts/paddle_build.sh create mode 100755 paddle/scripts/paddle_docker_build.sh create mode 100755 paddle/scripts/submit_local.sh.in create mode 100644 paddle/testing/CMakeLists.txt create mode 100644 paddle/testing/paddle_gtest_main.cc create mode 100644 patches/grpc/completion_queue.h create mode 100644 patches/grpc/grpc_library.h create mode 100644 python/.gitignore create mode 100644 python/CMakeLists.txt create mode 100644 python/__init__.py create mode 100644 python/paddle/.gitignore create mode 100644 python/paddle/__init__.py create mode 100644 python/paddle/batch.py create mode 100644 python/paddle/compat.py create mode 100644 python/paddle/dataset/__init__.py create mode 100644 python/paddle/dataset/cifar.py create mode 100644 python/paddle/dataset/common.py create mode 100644 python/paddle/dataset/conll05.py create mode 100644 python/paddle/dataset/flowers.py create mode 100644 python/paddle/dataset/image.py create mode 100644 python/paddle/dataset/imdb.py create mode 100644 python/paddle/dataset/imikolov.py create mode 100644 python/paddle/dataset/mnist.py create mode 100644 python/paddle/dataset/movielens.py create mode 100644 python/paddle/dataset/mq2007.py create mode 100644 python/paddle/dataset/sentiment.py create mode 100644 python/paddle/dataset/tests/CMakeLists.txt create mode 100644 python/paddle/dataset/tests/cat.jpg create mode 100644 python/paddle/dataset/tests/cifar_test.py create mode 100644 python/paddle/dataset/tests/common_test.py create mode 100644 python/paddle/dataset/tests/flowers_test.py create mode 100644 python/paddle/dataset/tests/imdb_test.py create mode 100644 python/paddle/dataset/tests/imikolov_test.py create mode 100644 python/paddle/dataset/tests/mnist_test.py create mode 100644 python/paddle/dataset/tests/mq2007_test.py create mode 100644 python/paddle/dataset/tests/test_image.py create mode 100644 python/paddle/dataset/tests/test_sentiment.py create mode 100644 python/paddle/dataset/tests/voc2012_test.py create mode 100644 python/paddle/dataset/tests/wmt16_test.py create mode 100644 python/paddle/dataset/uci_housing.py create mode 100644 python/paddle/dataset/voc2012.py create mode 100644 python/paddle/dataset/wmt14.py create mode 100644 python/paddle/dataset/wmt16.py create mode 100644 python/paddle/distributed/__init__.py create mode 100644 python/paddle/distributed/launch.py create mode 100644 python/paddle/distributed/launch_ps.py create mode 100644 python/paddle/fluid/.gitignore create mode 100644 python/paddle/fluid/__init__.py create mode 100644 python/paddle/fluid/annotations.py create mode 100644 python/paddle/fluid/average.py create mode 100644 python/paddle/fluid/backward.py create mode 100644 python/paddle/fluid/clip.py create mode 100644 python/paddle/fluid/communicator.py create mode 100644 python/paddle/fluid/compiler.py create mode 100644 python/paddle/fluid/contrib/__init__.py create mode 100644 python/paddle/fluid/contrib/decoder/__init__.py create mode 100644 python/paddle/fluid/contrib/decoder/beam_search_decoder.py create mode 100644 python/paddle/fluid/contrib/extend_optimizer/__init__.py create mode 100644 python/paddle/fluid/contrib/extend_optimizer/extend_optimizer_with_weight_decay.py create mode 100644 python/paddle/fluid/contrib/inferencer.py create mode 100644 python/paddle/fluid/contrib/layers/__init__.py create mode 100644 python/paddle/fluid/contrib/layers/nn.py create mode 100644 python/paddle/fluid/contrib/layers/rnn_impl.py create mode 100644 python/paddle/fluid/contrib/memory_usage_calc.py create mode 100644 python/paddle/fluid/contrib/mixed_precision/__init__.py create mode 100644 python/paddle/fluid/contrib/mixed_precision/decorator.py create mode 100644 python/paddle/fluid/contrib/mixed_precision/fp16_lists.py create mode 100644 python/paddle/fluid/contrib/mixed_precision/fp16_utils.py create mode 100644 python/paddle/fluid/contrib/model_stat.py create mode 100644 python/paddle/fluid/contrib/op_frequence.py create mode 100644 python/paddle/fluid/contrib/quantize/__init__.py create mode 100644 python/paddle/fluid/contrib/quantize/quantize_transpiler.py create mode 100644 python/paddle/fluid/contrib/reader/README.md create mode 100644 python/paddle/fluid/contrib/reader/__init__.py create mode 100644 python/paddle/fluid/contrib/reader/distributed_reader.py create mode 100644 python/paddle/fluid/contrib/slim/__init__.py create mode 100644 python/paddle/fluid/contrib/slim/core/__init__.py create mode 100644 python/paddle/fluid/contrib/slim/core/compressor.py create mode 100644 python/paddle/fluid/contrib/slim/core/config.py create mode 100644 python/paddle/fluid/contrib/slim/core/strategy.py create mode 100644 python/paddle/fluid/contrib/slim/distillation/__init__.py create mode 100644 python/paddle/fluid/contrib/slim/distillation/distillation_strategy.py create mode 100644 python/paddle/fluid/contrib/slim/distillation/distiller.py create mode 100644 python/paddle/fluid/contrib/slim/graph/__init__.py create mode 100644 python/paddle/fluid/contrib/slim/graph/executor.py create mode 100644 python/paddle/fluid/contrib/slim/graph/graph_wrapper.py create mode 100644 python/paddle/fluid/contrib/slim/nas/__init__.py create mode 100644 python/paddle/fluid/contrib/slim/nas/controller_server.py create mode 100644 python/paddle/fluid/contrib/slim/nas/light_nas_strategy.py create mode 100644 python/paddle/fluid/contrib/slim/nas/lock.py create mode 100644 python/paddle/fluid/contrib/slim/nas/search_agent.py create mode 100644 python/paddle/fluid/contrib/slim/nas/search_space.py create mode 100644 python/paddle/fluid/contrib/slim/prune/__init__.py create mode 100644 python/paddle/fluid/contrib/slim/prune/auto_prune_strategy.py create mode 100644 python/paddle/fluid/contrib/slim/prune/prune_strategy.py create mode 100644 python/paddle/fluid/contrib/slim/prune/pruner.py create mode 100644 python/paddle/fluid/contrib/slim/quantization/__init__.py create mode 100644 python/paddle/fluid/contrib/slim/quantization/mkldnn_post_training_strategy.py create mode 100644 python/paddle/fluid/contrib/slim/quantization/quantization_mkldnn_pass.py create mode 100644 python/paddle/fluid/contrib/slim/quantization/quantization_pass.py create mode 100644 python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py create mode 100644 python/paddle/fluid/contrib/slim/searcher/__init__.py create mode 100644 python/paddle/fluid/contrib/slim/searcher/controller.py create mode 100644 python/paddle/fluid/contrib/slim/tests/CMakeLists.txt create mode 100644 python/paddle/fluid/contrib/slim/tests/QAT_mkldnn_int8_readme.md create mode 100644 python/paddle/fluid/contrib/slim/tests/__init__.py create mode 100644 python/paddle/fluid/contrib/slim/tests/auto_pruning/compress.yaml create mode 100644 python/paddle/fluid/contrib/slim/tests/configs/filter_pruning.yaml create mode 100644 python/paddle/fluid/contrib/slim/tests/distillation/compress.yaml create mode 100644 python/paddle/fluid/contrib/slim/tests/filter_pruning/compress.yaml create mode 100644 python/paddle/fluid/contrib/slim/tests/light_nas/compress.yaml create mode 100644 python/paddle/fluid/contrib/slim/tests/light_nas/light_nas_space.py create mode 100644 python/paddle/fluid/contrib/slim/tests/light_nas/light_nasnet.py create mode 100644 python/paddle/fluid/contrib/slim/tests/mobilenet.py create mode 100644 python/paddle/fluid/contrib/slim/tests/qat_int8_comparison.py create mode 100644 python/paddle/fluid/contrib/slim/tests/quantization/compress.yaml create mode 100644 python/paddle/fluid/contrib/slim/tests/quantization/config_mkldnn_int8.yaml create mode 100644 python/paddle/fluid/contrib/slim/tests/slim_int8_mkldnn_post_training_quantization.md create mode 100644 python/paddle/fluid/contrib/slim/tests/test_auto_pruning.py create mode 100644 python/paddle/fluid/contrib/slim/tests/test_distillation_strategy.py create mode 100644 python/paddle/fluid/contrib/slim/tests/test_factory.py create mode 100644 python/paddle/fluid/contrib/slim/tests/test_filter_pruning.py create mode 100644 python/paddle/fluid/contrib/slim/tests/test_graph.py create mode 100644 python/paddle/fluid/contrib/slim/tests/test_graph_wrapper.py create mode 100644 python/paddle/fluid/contrib/slim/tests/test_light_nas.py create mode 100644 python/paddle/fluid/contrib/slim/tests/test_mkldnn_int8_quantization_strategy.py create mode 100644 python/paddle/fluid/contrib/slim/tests/test_quantization_mkldnn_pass.py create mode 100644 python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py create mode 100644 python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py create mode 100644 python/paddle/fluid/contrib/slim/tests/test_quantization_strategy.py create mode 100644 python/paddle/fluid/contrib/tests/CMakeLists.txt create mode 100644 python/paddle/fluid/contrib/tests/test_distributed_reader.py create mode 100644 python/paddle/fluid/contrib/tests/test_image_classification_fp16.py create mode 100644 python/paddle/fluid/contrib/tests/test_quantize_transpiler.py create mode 100644 python/paddle/fluid/contrib/tests/test_weight_decay_extend.py create mode 100644 python/paddle/fluid/contrib/trainer.py create mode 100644 python/paddle/fluid/contrib/utils/__init__.py create mode 100644 python/paddle/fluid/contrib/utils/hdfs_utils.py create mode 100644 python/paddle/fluid/contrib/utils/lookup_table_utils.py create mode 100644 python/paddle/fluid/core.py create mode 100644 python/paddle/fluid/data_feed_desc.py create mode 100644 python/paddle/fluid/data_feeder.py create mode 100644 python/paddle/fluid/dataset.py create mode 100644 python/paddle/fluid/debugger.py create mode 100644 python/paddle/fluid/default_scope_funcs.py create mode 100644 python/paddle/fluid/device_worker.py create mode 100644 python/paddle/fluid/distribute_lookup_table.py create mode 100644 python/paddle/fluid/distributed/__init__.py create mode 100644 python/paddle/fluid/distributed/downpour.py create mode 100644 python/paddle/fluid/distributed/fleet.py create mode 100644 python/paddle/fluid/distributed/helper.py create mode 100644 python/paddle/fluid/distributed/node.py create mode 100644 python/paddle/fluid/distributed/ps_instance.py create mode 100644 python/paddle/fluid/distributed/ps_pb2.py create mode 100644 python/paddle/fluid/dygraph/__init__.py create mode 100644 python/paddle/fluid/dygraph/backward_strategy.py create mode 100644 python/paddle/fluid/dygraph/base.py create mode 100644 python/paddle/fluid/dygraph/checkpoint.py create mode 100644 python/paddle/fluid/dygraph/layer_object_helper.py create mode 100644 python/paddle/fluid/dygraph/layers.py create mode 100644 python/paddle/fluid/dygraph/learning_rate_scheduler.py create mode 100644 python/paddle/fluid/dygraph/nn.py create mode 100644 python/paddle/fluid/dygraph/parallel.py create mode 100644 python/paddle/fluid/dygraph/parallel_helper.py create mode 100644 python/paddle/fluid/dygraph/profiler.py create mode 100644 python/paddle/fluid/dygraph/tracer.py create mode 100644 python/paddle/fluid/dygraph_grad_clip.py create mode 100644 python/paddle/fluid/evaluator.py create mode 100644 python/paddle/fluid/executor.py create mode 100644 python/paddle/fluid/framework.py create mode 100644 python/paddle/fluid/graphviz.py create mode 100644 python/paddle/fluid/incubate/__init__.py create mode 100644 python/paddle/fluid/incubate/data_generator/__init__.py create mode 100644 python/paddle/fluid/incubate/data_generator/test_data_generator.py create mode 100644 python/paddle/fluid/incubate/fleet/__init__.py create mode 100644 python/paddle/fluid/incubate/fleet/base/__init__.py create mode 100644 python/paddle/fluid/incubate/fleet/base/fleet_base.py create mode 100644 python/paddle/fluid/incubate/fleet/base/role_maker.py create mode 100644 python/paddle/fluid/incubate/fleet/collective/__init__.py create mode 100644 python/paddle/fluid/incubate/fleet/parameter_server/__init__.py create mode 100644 python/paddle/fluid/incubate/fleet/parameter_server/distribute_transpiler/__init__.py create mode 100644 python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py create mode 100644 python/paddle/fluid/incubate/fleet/parameter_server/pslib/node.py create mode 100644 python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py create mode 100644 python/paddle/fluid/incubate/fleet/parameter_server/pslib/ps_pb2.py create mode 100644 python/paddle/fluid/incubate/fleet/tests/cluster_train.sh create mode 100644 python/paddle/fluid/incubate/fleet/tests/ctr_dataset_reader.py create mode 100644 python/paddle/fluid/incubate/fleet/tests/fleet_deep_ctr.py create mode 100644 python/paddle/fluid/incubate/fleet/utils/__init__.py create mode 100644 python/paddle/fluid/incubate/fleet/utils/hdfs.py create mode 100644 python/paddle/fluid/inferencer.py create mode 100644 python/paddle/fluid/initializer.py create mode 100644 python/paddle/fluid/install_check.py create mode 100644 python/paddle/fluid/io.py create mode 100644 python/paddle/fluid/layer_helper.py create mode 100644 python/paddle/fluid/layer_helper_base.py create mode 100644 python/paddle/fluid/layers/__init__.py create mode 100644 python/paddle/fluid/layers/collective.py create mode 100644 python/paddle/fluid/layers/control_flow.py create mode 100644 python/paddle/fluid/layers/detection.py create mode 100644 python/paddle/fluid/layers/device.py create mode 100644 python/paddle/fluid/layers/distributions.py create mode 100644 python/paddle/fluid/layers/io.py create mode 100644 python/paddle/fluid/layers/layer_function_generator.py create mode 100644 python/paddle/fluid/layers/learning_rate_scheduler.py create mode 100644 python/paddle/fluid/layers/math_op_patch.py create mode 100644 python/paddle/fluid/layers/metric_op.py create mode 100644 python/paddle/fluid/layers/nn.py create mode 100644 python/paddle/fluid/layers/ops.py create mode 100644 python/paddle/fluid/layers/tensor.py create mode 100644 python/paddle/fluid/layers/utils.py create mode 100644 python/paddle/fluid/lod_tensor.py create mode 100644 python/paddle/fluid/log_helper.py create mode 100644 python/paddle/fluid/metrics.py create mode 100644 python/paddle/fluid/net_drawer.py create mode 100644 python/paddle/fluid/nets.py create mode 100644 python/paddle/fluid/op.py create mode 100644 python/paddle/fluid/optimizer.py create mode 100644 python/paddle/fluid/parallel_executor.py create mode 100644 python/paddle/fluid/param_attr.py create mode 100644 python/paddle/fluid/profiler.py create mode 100644 python/paddle/fluid/reader.py create mode 100644 python/paddle/fluid/recordio_writer.py create mode 100644 python/paddle/fluid/regularizer.py create mode 100644 python/paddle/fluid/sampcd_processor.py create mode 100644 python/paddle/fluid/tests/.gitignore create mode 100644 python/paddle/fluid/tests/CMakeLists.txt create mode 100644 python/paddle/fluid/tests/__init__.py create mode 100644 python/paddle/fluid/tests/book/.gitignore create mode 100644 python/paddle/fluid/tests/book/CMakeLists.txt create mode 100644 python/paddle/fluid/tests/book/__init__.py create mode 100644 python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt create mode 100644 python/paddle/fluid/tests/book/high-level-api/cifar10_small_test_set.py create mode 100644 python/paddle/fluid/tests/book/high-level-api/test_fit_a_line_new_api.py create mode 100644 python/paddle/fluid/tests/book/high-level-api/test_image_classification_resnet_new_api.py create mode 100644 python/paddle/fluid/tests/book/high-level-api/test_image_classification_vgg_new_api.py create mode 100755 python/paddle/fluid/tests/book/high-level-api/test_label_semantic_roles_new_api.py create mode 100644 python/paddle/fluid/tests/book/high-level-api/test_machine_translation_new_api.py create mode 100644 python/paddle/fluid/tests/book/high-level-api/test_recognize_digits_conv_new_api.py create mode 100644 python/paddle/fluid/tests/book/high-level-api/test_recognize_digits_mlp_new_api.py create mode 100644 python/paddle/fluid/tests/book/high-level-api/test_recommender_system_new_api.py create mode 100644 python/paddle/fluid/tests/book/high-level-api/test_understand_sentiment_conv_new_api.py create mode 100644 python/paddle/fluid/tests/book/high-level-api/test_understand_sentiment_dynamic_rnn_new_api.py create mode 100644 python/paddle/fluid/tests/book/high-level-api/test_understand_sentiment_stacked_lstm_new_api.py create mode 100644 python/paddle/fluid/tests/book/high-level-api/test_word2vec_new_api.py create mode 100644 python/paddle/fluid/tests/book/notest_understand_sentiment.py create mode 100644 python/paddle/fluid/tests/book/test_fit_a_line.py create mode 100644 python/paddle/fluid/tests/book/test_image_classification.py create mode 100644 python/paddle/fluid/tests/book/test_label_semantic_roles.py create mode 100644 python/paddle/fluid/tests/book/test_machine_translation.py create mode 100644 python/paddle/fluid/tests/book/test_recognize_digits.py create mode 100644 python/paddle/fluid/tests/book/test_recommender_system.py create mode 100644 python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py create mode 100644 python/paddle/fluid/tests/book/test_word2vec.py create mode 100644 python/paddle/fluid/tests/book_memory_optimization/CMakeLists.txt create mode 100644 python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py create mode 100644 python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py create mode 100644 python/paddle/fluid/tests/demo/executor_train_dataset.py create mode 100644 python/paddle/fluid/tests/demo/fc_gan.py create mode 100644 python/paddle/fluid/tests/demo/file_reader/.gitignore create mode 100644 python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py create mode 100644 python/paddle/fluid/tests/demo/file_reader/train.py create mode 100644 python/paddle/fluid/tests/demo/pipeline_train.py create mode 100644 python/paddle/fluid/tests/demo/pyreader.py create mode 100644 python/paddle/fluid/tests/test_beam_search_decoder.py create mode 100644 python/paddle/fluid/tests/test_communicator.py create mode 100644 python/paddle/fluid/tests/test_cpp_reader.py create mode 100644 python/paddle/fluid/tests/test_data_feeder.py create mode 100644 python/paddle/fluid/tests/test_detection.py create mode 100644 python/paddle/fluid/tests/test_error_clip.py create mode 100644 python/paddle/fluid/tests/test_if_else_op.py create mode 100644 python/paddle/fluid/tests/test_lod_tensor.py create mode 100644 python/paddle/fluid/tests/test_python_operator_overriding.py create mode 100644 python/paddle/fluid/tests/unittests/.gitignore create mode 100644 python/paddle/fluid/tests/unittests/CMakeLists.txt create mode 100644 python/paddle/fluid/tests/unittests/__init__.py create mode 100644 python/paddle/fluid/tests/unittests/benchmark.py create mode 100644 python/paddle/fluid/tests/unittests/benchmark_sum_op.py create mode 100644 python/paddle/fluid/tests/unittests/collective_allgather_op.py create mode 100644 python/paddle/fluid/tests/unittests/collective_allreduce_op.py create mode 100644 python/paddle/fluid/tests/unittests/collective_broadcast_op.py create mode 100644 python/paddle/fluid/tests/unittests/collective_reducescatter_op.py create mode 100644 python/paddle/fluid/tests/unittests/ctr_dataset_reader.py create mode 100644 python/paddle/fluid/tests/unittests/decorator_helper.py create mode 100644 python/paddle/fluid/tests/unittests/dist_allreduce_op.py create mode 100644 python/paddle/fluid/tests/unittests/dist_ctr.py create mode 100644 python/paddle/fluid/tests/unittests/dist_ctr_reader.py create mode 100644 python/paddle/fluid/tests/unittests/dist_fleet_ctr.py create mode 100644 python/paddle/fluid/tests/unittests/dist_mnist.py create mode 100644 python/paddle/fluid/tests/unittests/dist_mnist_batch_merge.py create mode 100644 python/paddle/fluid/tests/unittests/dist_mnist_lars.py create mode 100644 python/paddle/fluid/tests/unittests/dist_save_load.py create mode 100644 python/paddle/fluid/tests/unittests/dist_se_resnext.py create mode 100644 python/paddle/fluid/tests/unittests/dist_simnet_bow.py create mode 100644 python/paddle/fluid/tests/unittests/dist_text_classification.py create mode 100644 python/paddle/fluid/tests/unittests/dist_transformer.py create mode 100644 python/paddle/fluid/tests/unittests/dist_word2vec.py create mode 100644 python/paddle/fluid/tests/unittests/fake_reader.py create mode 100644 python/paddle/fluid/tests/unittests/gradient_checker.py create mode 100644 python/paddle/fluid/tests/unittests/ir_memory_optimize_net_base.py create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/__init__.py create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/mkldnn_op_test.py create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_batch_norm_mkldnn_op.py create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_concat_int8_mkldnn_op.py create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_concat_mkldnn_op.py create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_conv3d_mkldnn_op.py create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_dequantize_mkldnn_op.py create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_fc_mkldnn_op.py create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_gaussian_random_mkldnn_op.py create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_lrn_mkldnn_op.py create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_mul_int8_mkldnn_op.py create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_int8_mkldnn_op.py create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_mkldnn_op.py create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_quantize_mkldnn_op.py create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_requantize_mkldnn_op.py create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_softmax_mkldnn_op.py create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_sum_mkldnn_op.py create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_transpose_int8_mkldnn_op.py create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_transpose_mkldnn_op.py create mode 100644 python/paddle/fluid/tests/unittests/multi_process.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/CMakeLists.txt create mode 100644 python/paddle/fluid/tests/unittests/ngraph/__init__.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_accuracy_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_activation_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_adam_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_assign_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_batch_norm_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_cast_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_compare_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_concat_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_conv2d_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_cross_entropy_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_dropout_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_elementwise_add_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_elementwise_div_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_elementwise_max_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_elementwise_min_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_elementwise_mul_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_elementwise_pow_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_elementwise_sub_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_fill_constant_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_fill_zeros_like_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_gather_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_increment_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_layer_norm_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_logical_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_lookup_table_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_lrn_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_matmul_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_momentum_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_mul_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_pool2d_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_reduce_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_reshape_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_slice_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_softmax_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_softmax_with_cross_entropy_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_stack_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_sum_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_top_k_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_transpose_ngraph_op.py create mode 100644 python/paddle/fluid/tests/unittests/op_test.py create mode 100644 python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py create mode 100644 python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py create mode 100644 python/paddle/fluid/tests/unittests/parallel_executor_test_base.py create mode 100644 python/paddle/fluid/tests/unittests/simple_nets.py create mode 100644 python/paddle/fluid/tests/unittests/test_accuracy_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_activation_nn_grad.py create mode 100644 python/paddle/fluid/tests/unittests/test_activation_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_adadelta_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_adagrad_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_adam_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_adamax_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_add_position_encoding_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_affine_channel_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_affine_grid_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_allgather.py create mode 100644 python/paddle/fluid/tests/unittests/test_allreduce.py create mode 100644 python/paddle/fluid/tests/unittests/test_anchor_generator_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_arg_min_max_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_argsort_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_array_read_write_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_assign_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_assign_value_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py create mode 100644 python/paddle/fluid/tests/unittests/test_attention_lstm_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_auc_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_auc_single_pred_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_backward_find_no_grad_vars.py create mode 100644 python/paddle/fluid/tests/unittests/test_base_layer.py create mode 100644 python/paddle/fluid/tests/unittests/test_basic_gru_api.py create mode 100644 python/paddle/fluid/tests/unittests/test_basic_gru_unit_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_basic_lstm_api.py create mode 100644 python/paddle/fluid/tests/unittests/test_basic_lstm_unit_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_batch_norm_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_beam_search_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_bilinear_tensor_product_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_bipartite_match_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_box_clip_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_box_coder_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_box_decoder_and_assign_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_bpr_loss_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_broadcast.py create mode 100644 python/paddle/fluid/tests/unittests/test_buffer_shared_inplace_pass.py create mode 100644 python/paddle/fluid/tests/unittests/test_calc_gradient.py create mode 100644 python/paddle/fluid/tests/unittests/test_cast_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_chunk_eval_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_clip_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_coalesce_tensor_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_collect_fpn_proposals_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_collective_base.py create mode 100644 python/paddle/fluid/tests/unittests/test_compare_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_compat.py create mode 100644 python/paddle/fluid/tests/unittests/test_concat_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_conditional_block.py create mode 100644 python/paddle/fluid/tests/unittests/test_const_value.py create mode 100644 python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_conv2d_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_conv3d_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_conv_shift_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_cos_sim_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_create_op_doc_string.py create mode 100644 python/paddle/fluid/tests/unittests/test_crf_decoding_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_crop_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_cross_entropy2_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_cross_entropy_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_ctc_align.py create mode 100644 python/paddle/fluid/tests/unittests/test_cumsum_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_cvm_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_dataset.py create mode 100644 python/paddle/fluid/tests/unittests/test_debugger.py create mode 100644 python/paddle/fluid/tests/unittests/test_decayed_adagrad_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py create mode 100644 python/paddle/fluid/tests/unittests/test_decoupled_py_reader_data_check.py create mode 100644 python/paddle/fluid/tests/unittests/test_default_scope_funcs.py create mode 100644 python/paddle/fluid/tests/unittests/test_deformable_conv_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_deformable_psroi_pooling.py create mode 100644 python/paddle/fluid/tests/unittests/test_density_prior_box_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_desc_clone.py create mode 100644 python/paddle/fluid/tests/unittests/test_detection_map_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_dgc_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_diag.py create mode 100644 python/paddle/fluid/tests/unittests/test_dist_allreduce_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_dist_base.py create mode 100644 python/paddle/fluid/tests/unittests/test_dist_ctr.py create mode 100644 python/paddle/fluid/tests/unittests/test_dist_fleet_base.py create mode 100644 python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py create mode 100644 python/paddle/fluid/tests/unittests/test_dist_mnist.py create mode 100644 python/paddle/fluid/tests/unittests/test_dist_mnist_backward_deps.py create mode 100644 python/paddle/fluid/tests/unittests/test_dist_mnist_batch_merge.py create mode 100644 python/paddle/fluid/tests/unittests/test_dist_mnist_dgc_nccl.py create mode 100644 python/paddle/fluid/tests/unittests/test_dist_mnist_hallreduce.py create mode 100644 python/paddle/fluid/tests/unittests/test_dist_mnist_lars.py create mode 100644 python/paddle/fluid/tests/unittests/test_dist_mnist_multi_comm.py create mode 100644 python/paddle/fluid/tests/unittests/test_dist_mnist_pg.py create mode 100644 python/paddle/fluid/tests/unittests/test_dist_mnist_ring_allreduce.py create mode 100644 python/paddle/fluid/tests/unittests/test_dist_save_load.py create mode 100644 python/paddle/fluid/tests/unittests/test_dist_se_resnext.py create mode 100644 python/paddle/fluid/tests/unittests/test_dist_se_resnext_nccl.py create mode 100644 python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py create mode 100644 python/paddle/fluid/tests/unittests/test_dist_text_classification.py create mode 100644 python/paddle/fluid/tests/unittests/test_dist_train.py create mode 100644 python/paddle/fluid/tests/unittests/test_dist_transformer.py create mode 100644 python/paddle/fluid/tests/unittests/test_dist_transpiler.py create mode 100644 python/paddle/fluid/tests/unittests/test_dist_word2vec.py create mode 100644 python/paddle/fluid/tests/unittests/test_distribute_fpn_proposals_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_distributions.py create mode 100644 python/paddle/fluid/tests/unittests/test_dropout_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py create mode 100644 python/paddle/fluid/tests/unittests/test_dyn_rnn.py create mode 100644 python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py create mode 100644 python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_conditional_block.py create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_delete_vars.py create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py create mode 100644 python/paddle/fluid/tests/unittests/test_eager_deletion_while_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_edit_distance_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_elementwise_add_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_elementwise_div_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_elementwise_floordiv_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_elementwise_max_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_elementwise_min_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_elementwise_mod_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_elementwise_nn_grad.py create mode 100644 python/paddle/fluid/tests/unittests/test_elementwise_pow_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_exception.py create mode 100644 python/paddle/fluid/tests/unittests/test_executor_and_mul.py create mode 100644 python/paddle/fluid/tests/unittests/test_executor_and_use_program_cache.py create mode 100644 python/paddle/fluid/tests/unittests/test_expand_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_fake_init_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_fake_quantize_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_fc_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_feed_fetch_method.py create mode 100644 python/paddle/fluid/tests/unittests/test_fetch_var.py create mode 100644 python/paddle/fluid/tests/unittests/test_fill_any_like_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_fill_constant_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_fill_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_fill_zeros_like2_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_flatten_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_framework_debug_str.py create mode 100644 python/paddle/fluid/tests/unittests/test_fsp_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_ftrl_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py create mode 100644 python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py create mode 100644 python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py create mode 100644 python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py create mode 100644 python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_fused_embedding_fc_lstm_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_fusion_gru_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_fusion_repeated_fc_relu_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_fusion_seqconv_eltadd_relu_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_fusion_seqexpand_concat_fc_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_fusion_seqpool_concat_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_fusion_squared_mat_sub_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_fusion_transpose_flatten_concat_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_gather_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_gaussian_random_batch_size_like_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_gaussian_random_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_generate_mask_labels_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_generate_proposals_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_get_places_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_grad_clip_minimize.py create mode 100644 python/paddle/fluid/tests/unittests/test_gradient_clip.py create mode 100644 python/paddle/fluid/tests/unittests/test_grid_sampler_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_group_norm_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_gru_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_gru_unit_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_hash_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_hinge_loss_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_hsigmoid_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_huber_loss_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_im2sequence_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_image_classification_layer.py create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_base.py create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_basic.py create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_decorator.py create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_deepcf.py create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_gan.py create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_gnn.py create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_mnist.py create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_mnist_sorted_gradient.py create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_optimizer.py create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_recurrent_usage.py create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_resnet.py create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_save_load_optimizer.py create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_transformer.py create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py create mode 100644 python/paddle/fluid/tests/unittests/test_infer_shape.py create mode 100644 python/paddle/fluid/tests/unittests/test_inference_model_io.py create mode 100644 python/paddle/fluid/tests/unittests/test_initializer.py create mode 100644 python/paddle/fluid/tests/unittests/test_inplace_softmax_with_cross_entropy.py create mode 100644 python/paddle/fluid/tests/unittests/test_install_check.py create mode 100644 python/paddle/fluid/tests/unittests/test_iou_similarity_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_ir_graph.py create mode 100644 python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py create mode 100644 python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_ir_memory_optimize_nlp.py create mode 100644 python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py create mode 100644 python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py create mode 100644 python/paddle/fluid/tests/unittests/test_is_empty_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_isfinite_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_kldiv_loss_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_l1_norm_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_label_smooth_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_lamb_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_launch.sh create mode 100644 python/paddle/fluid/tests/unittests/test_layer_norm_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_layers.py create mode 100644 python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py create mode 100644 python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_linspace.py create mode 100644 python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_lod_array_length_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_lod_rank_table.py create mode 100644 python/paddle/fluid/tests/unittests/test_lod_reset_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_lod_tensor_array.py create mode 100644 python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py create mode 100644 python/paddle/fluid/tests/unittests/test_log_loss_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_logical_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_lookup_remote_table_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_lookup_sparse_table_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_lookup_table_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_lrn_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_lstm_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_lstm_unit_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_lstmp_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_margin_rank_loss_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_math_op_patch.py create mode 100644 python/paddle/fluid/tests/unittests/test_matmul_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_maxout_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_mean_iou.py create mode 100644 python/paddle/fluid/tests/unittests/test_mean_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py create mode 100644 python/paddle/fluid/tests/unittests/test_memory_usage.py create mode 100644 python/paddle/fluid/tests/unittests/test_merge_ids_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_merge_selectedrows_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_metrics.py create mode 100644 python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_minus_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py create mode 100644 python/paddle/fluid/tests/unittests/test_modified_huber_loss_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_momentum_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_mul_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_multi_file_reader.py create mode 100644 python/paddle/fluid/tests/unittests/test_multi_pass_reader.py create mode 100644 python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_multihead_attention.py create mode 100644 python/paddle/fluid/tests/unittests/test_multiplex_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_name_scope.py create mode 100644 python/paddle/fluid/tests/unittests/test_nce.py create mode 100644 python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_nearest_interp_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_network_with_dtype.py create mode 100644 python/paddle/fluid/tests/unittests/test_nn_grad.py create mode 100644 python/paddle/fluid/tests/unittests/test_norm_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_normalization_wrapper.py create mode 100644 python/paddle/fluid/tests/unittests/test_npair_loss_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_nvprof.py create mode 100644 python/paddle/fluid/tests/unittests/test_one_hot_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_op_support_gpu.py create mode 100644 python/paddle/fluid/tests/unittests/test_operator.py create mode 100644 python/paddle/fluid/tests/unittests/test_operator_desc.py create mode 100644 python/paddle/fluid/tests/unittests/test_optimizer.py create mode 100644 python/paddle/fluid/tests/unittests/test_pad2d_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_pad_constant_like.py create mode 100644 python/paddle/fluid/tests/unittests/test_pad_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_dygraph_se_resnext.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_executor_crf_auto_growth.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_executor_drop_scope.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_executor_transformer_auto_growth.py create mode 100644 python/paddle/fluid/tests/unittests/test_parameter.py create mode 100644 python/paddle/fluid/tests/unittests/test_partial_eager_deletion_transformer.py create mode 100644 python/paddle/fluid/tests/unittests/test_pass_builder.py create mode 100644 python/paddle/fluid/tests/unittests/test_pipeline.py create mode 100644 python/paddle/fluid/tests/unittests/test_pixel_shuffle.py create mode 100644 python/paddle/fluid/tests/unittests/test_polygon_box_transform.py create mode 100644 python/paddle/fluid/tests/unittests/test_pool2d_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_pool3d_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_pool_max_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_positive_negative_pair_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_precision_recall_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_prelu_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_preprocessor.py create mode 100644 python/paddle/fluid/tests/unittests/test_print_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_prior_box_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_profiler.py create mode 100644 python/paddle/fluid/tests/unittests/test_program.py create mode 100644 python/paddle/fluid/tests/unittests/test_program_code.py create mode 100644 python/paddle/fluid/tests/unittests/test_protobuf.py create mode 100644 python/paddle/fluid/tests/unittests/test_protobuf_descs.py create mode 100644 python/paddle/fluid/tests/unittests/test_proximal_adagrad_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_proximal_gd_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_psroi_pool_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_py_func_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_py_reader_lod_level_share.py create mode 100644 python/paddle/fluid/tests/unittests/test_py_reader_pin_memory.py create mode 100644 python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py create mode 100644 python/paddle/fluid/tests/unittests/test_py_reader_sample_generator.py create mode 100644 python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py create mode 100644 python/paddle/fluid/tests/unittests/test_pyreader.py create mode 100644 python/paddle/fluid/tests/unittests/test_random_crop_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_range.py create mode 100644 python/paddle/fluid/tests/unittests/test_rank_loss_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_reader_reset.py create mode 100644 python/paddle/fluid/tests/unittests/test_recordio_reader.py create mode 100644 python/paddle/fluid/tests/unittests/test_recurrent_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_reduce_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_reducescatter.py create mode 100644 python/paddle/fluid/tests/unittests/test_ref_by_trainer_id_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_registry.py create mode 100644 python/paddle/fluid/tests/unittests/test_regularizer.py create mode 100644 python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py create mode 100644 python/paddle/fluid/tests/unittests/test_reshape_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_retinanet_detection_output.py create mode 100644 python/paddle/fluid/tests/unittests/test_reverse_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_rmsprop_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_rnn_memory_helper_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_roi_align_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_roi_perspective_transform_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_roi_pool_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_row_conv_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_sampling_id_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_scale_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_scatter_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_scope.py create mode 100644 python/paddle/fluid/tests/unittests/test_selected_rows.py create mode 100644 python/paddle/fluid/tests/unittests/test_selu_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_seq_conv.py create mode 100644 python/paddle/fluid/tests/unittests/test_seq_pool.py create mode 100644 python/paddle/fluid/tests/unittests/test_sequence_concat.py create mode 100644 python/paddle/fluid/tests/unittests/test_sequence_enumerate_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_sequence_erase_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_sequence_expand.py create mode 100644 python/paddle/fluid/tests/unittests/test_sequence_expand_as.py create mode 100644 python/paddle/fluid/tests/unittests/test_sequence_mask.py create mode 100644 python/paddle/fluid/tests/unittests/test_sequence_pad_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_sequence_reshape.py create mode 100644 python/paddle/fluid/tests/unittests/test_sequence_reverse.py create mode 100644 python/paddle/fluid/tests/unittests/test_sequence_scatter_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_sequence_slice_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_sequence_unpad_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_sgd_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_shape_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_shard_index_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py create mode 100644 python/paddle/fluid/tests/unittests/test_shuffle_channel_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_sigmoid_focal_loss_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_sign_op.py create mode 100755 python/paddle/fluid/tests/unittests/test_similarity_focus_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_size_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_slice_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_slice_var.py create mode 100644 python/paddle/fluid/tests/unittests/test_smooth_l1_loss_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_softmax_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_space_to_depth_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_spectral_norm_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_split_ids_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_split_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_spp_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_squared_l2_distance_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_squared_l2_norm_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_squeeze_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_stack_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_sum_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_switch.py create mode 100644 python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_target_assign_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_teacher_student_sigmoid_loss_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_temporal_shift_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_tensor.py create mode 100644 python/paddle/fluid/tests/unittests/test_tensor_array_to_tensor.py create mode 100644 python/paddle/fluid/tests/unittests/test_tensor_to_numpy.py create mode 100644 python/paddle/fluid/tests/unittests/test_top_k_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_transpose_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_tree_conv_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_truncated_gaussian_random_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_unfold_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_uniform_random_batch_size_like_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_uniform_random_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_unique.py create mode 100644 python/paddle/fluid/tests/unittests/test_unique_name.py create mode 100644 python/paddle/fluid/tests/unittests/test_unpool_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_unsqueeze_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_unstack_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_variable.py create mode 100644 python/paddle/fluid/tests/unittests/test_version.py create mode 100644 python/paddle/fluid/tests/unittests/test_warpctc_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_weight_decay.py create mode 100644 python/paddle/fluid/tests/unittests/test_weight_normalization.py create mode 100644 python/paddle/fluid/tests/unittests/test_where.py create mode 100644 python/paddle/fluid/tests/unittests/test_while_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_yolo_box_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py create mode 100644 python/paddle/fluid/tests/unittests/testsuite.py create mode 100644 python/paddle/fluid/tests/unittests/transformer_model.py create mode 100644 python/paddle/fluid/trainer_desc.py create mode 100644 python/paddle/fluid/trainer_factory.py create mode 100644 python/paddle/fluid/transpiler/__init__.py create mode 100644 python/paddle/fluid/transpiler/collective.py create mode 100644 python/paddle/fluid/transpiler/details/__init__.py create mode 100644 python/paddle/fluid/transpiler/details/checkport.py create mode 100644 python/paddle/fluid/transpiler/details/program_utils.py create mode 100644 python/paddle/fluid/transpiler/details/ufind.py create mode 100644 python/paddle/fluid/transpiler/details/vars_distributed.py create mode 100644 python/paddle/fluid/transpiler/distribute_transpiler.py create mode 100644 python/paddle/fluid/transpiler/inference_transpiler.py create mode 100755 python/paddle/fluid/transpiler/memory_optimization_transpiler.py create mode 100644 python/paddle/fluid/transpiler/ps_dispatcher.py create mode 100644 python/paddle/fluid/unique_name.py create mode 100644 python/paddle/fluid/wrapped_decorator.py create mode 100644 python/paddle/libs/__init__.py create mode 100644 python/paddle/reader/__init__.py create mode 100644 python/paddle/reader/creator.py create mode 100644 python/paddle/reader/decorator.py create mode 100644 python/paddle/reader/tests/CMakeLists.txt create mode 100644 python/paddle/reader/tests/__init__.py create mode 100644 python/paddle/reader/tests/creator_test.py create mode 100644 python/paddle/reader/tests/decorator_test.py create mode 100644 python/paddle/reader/tests/test_data_creator.txt create mode 100644 python/paddle/reader/tests/test_reader_recordio.dat create mode 100644 python/paddle/reader/tests/test_recordio_creator.dat create mode 100644 python/paddle/utils/__init__.py create mode 100644 python/paddle/utils/image_util.py create mode 100644 python/paddle/utils/plot.py create mode 100644 python/paddle/utils/plotcurve.py create mode 100644 python/paddle/utils/preprocess_img.py create mode 100644 python/paddle/utils/preprocess_util.py create mode 100644 python/paddle/utils/show_pb.py create mode 100644 python/paddle/utils/torch2paddle.py create mode 100644 python/requirements.txt create mode 100644 python/setup.py.in create mode 100644 tools/__init__.py create mode 100644 tools/aws_benchmarking/README.md create mode 100644 tools/aws_benchmarking/client/Dockerfile create mode 100644 tools/aws_benchmarking/client/cluster_launcher.py create mode 100644 tools/aws_benchmarking/client/requirements.txt create mode 100644 tools/aws_benchmarking/diagram.png create mode 100644 tools/aws_benchmarking/server/Dockerfile create mode 100644 tools/aws_benchmarking/server/cluster_master.py create mode 100644 tools/aws_benchmarking/server/logs/master.log create mode 100644 tools/aws_benchmarking/server/pserver.sh.template create mode 100644 tools/aws_benchmarking/server/requirements.txt create mode 100644 tools/aws_benchmarking/server/trainer.sh.template create mode 100644 tools/check_ctest_hung.py create mode 100644 tools/check_pr_approval.py create mode 100644 tools/codestyle/.gitignore create mode 100755 tools/codestyle/clang_format.hook create mode 100644 tools/codestyle/copyright.hook create mode 100755 tools/codestyle/cpplint_pre_commit.hook create mode 100644 tools/codestyle/docstring_checker.py create mode 100755 tools/codestyle/pylint_pre_commit.hook create mode 100644 tools/codestyle/test_docstring_checker.py create mode 100644 tools/continuous_integration/bisect.py create mode 100644 tools/diff_api.py create mode 100644 tools/diff_use_default_grad_op_maker.py create mode 100755 tools/document_preview.sh create mode 100644 tools/generate_op_use_grad_op_desc_maker_spec.py create mode 100644 tools/manylinux1/Dockerfile.x64 create mode 100644 tools/manylinux1/README.md create mode 100755 tools/manylinux1/build_all.sh create mode 100644 tools/manylinux1/build_scripts/build.sh create mode 100755 tools/manylinux1/build_scripts/build_utils.sh create mode 100644 tools/manylinux1/build_scripts/install_nccl2.sh create mode 100644 tools/manylinux1/build_scripts/manylinux1-check.py create mode 100644 tools/manylinux1/build_scripts/python-tag-abi-tag.py create mode 100644 tools/manylinux1/build_scripts/ssl-check.py create mode 100644 tools/print_signatures.py create mode 100644 tools/test_runner.py create mode 100644 tools/timeline.py diff --git a/AUTHORS.md b/AUTHORS.md new file mode 100644 index 00000000..5be71c9b --- /dev/null +++ b/AUTHORS.md @@ -0,0 +1,70 @@ +| Github account | name | +|---|---| +| abhinavarora | Abhinav Arora | +| backyes | Yan-Fei Wang | +| baiyfbupt | Yi-Fan Bai | +| beckett1124 | Bin Qi | +| ChengduoZH | Cheng-Duo Zhao| +| chengxiaohua1105 | Xiao-Hua Cheng | +| cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang | +| cxysteven | Xing-Yi Cheng | +| dzhwinter | Zhi-Hong Dong | +| dragonwarrior | Long Wang | +| dyning | Yuning Du | +| emailweixu | Wei Xu | +| gangliao | Gang Liao | +| gongweibao | Wei-Bao Gong | +| guru4elephant | Daxiang Dong | +| Guo Sheng | Sheng Guo | +| Haichao-Zhang | Hai-Chao Zhang | +| hedaoyuan | Dao-Yuan He | +| helinwang | He-Lin Wang | +| jacquesqiao | Long-Fei Qiao | +| jczaja | Jacek Czaja | +| JiayiFeng | Jia-Yi Feng | +| kbinias | Krzysztof Binias | +| kexinzhao | Ke-Xin Zhao | +| kuke | Yi-Bing Liu | +| lcy-seso | Ying Cao | +| cjld | Dun Liang | +| lipeng-unisound | Peng Li | +| gavin1332 | Yi Liu | +| liuyuan | Yuan Liu | +| livc | Zhao Li | +| llxxxll | Yong-Feng Liu | +| luotao01 | Tao Luo | +| lzhao4ever | Liang Zhao | +| mozga-intel | Mateusz Ozga | +| NHZlX | Zhao-Long Xing | +| Noplz | Yuan Gao | +| pakchoi | Chuan-Jiang Song | +| panyx0718 | Xin Pan | +| pengli09 | Peng Li | +| pkuyym | Ya-Ming Yang | +| pzelazko-intel | Pawel Zelazko | +| QiJune | Jun Qi | +| qingqing01 | Qing-Qing Dang | +| reyoung | Yang Yu | +| Sand3r- | Michal Gallus | +| sfraczek | Sylwester Fraczek | +| sneaxiy | Jin-Le Zeng | +| Superjom | Chun-Wei Yan | +| tensor-tang | Jian Tang | +| tianbingsz | Tian-Bing Xu | +| tpatejko | Tomasz Patejko | +| typhoonzero | Yi Wu | +| velconia | Qi-Yang Min | +| wanghaoshuang | Hao-Shuang Wang | +| wangyang59 | Yang Wang | +| wangzhen-nlp | Zhen Wang | +| wen-bo-yang | Wen-Bo Yang | +| wojtuss | Wojciech Uss | +| wwhu | Wei-Wei Hu | +| xinghai-sun | Xing-Hai Sun | +| Xreki | Yi-Qun Liu | +| xujun05 | Jun Xu | +| xushaoyong | Shao-Yong Xu | +| Yancey1989 | Xu Yan | +| zhaopu7 | Pu Zhao | +| zhouxiao-coder | Xiao Zhou | +| Zrachel | Rui-Qing Zhang | diff --git a/BCLOUD b/BCLOUD index eb7dc1a2..c0041680 100644 --- a/BCLOUD +++ b/BCLOUD @@ -1,67 +1,65 @@ -#edit-mode: -*- python -*- -#coding:utf-8 - WORKROOT('../../../') - -#platform, if not write PLATFORM('xxx') in BCLOUD file, default is 'centos4u3' -#PLATFORM('centos4u3') - -#gcc version, default 'gcc' COMPILER('gcc482') -#Preprocessor flags. -#CPPFLAGS(r'-D_GNU_SOURCE -D__STDC_LIMIT_MACROS') -#CPPFLAGS(r'-DVERSION=\"%s\"' % REPO_REVISION()) - -#C flags. -#CFLAGS('-g -pipe -W -Wall -fPIC') - -#C++ flags. -#CXXFLAGS('-g -pipe -W -Wall -fPIC') - -#IDL flags. -#IDLFLAGS('--compack') - -#UBRPC flags. -#UBRPCFLAGS('--compack') - -#-I path -INCPATHS('. ./include $INC') - -#libs which need to link with -#LIBS('$OUT/lib/libpaddle-trainer.a') -#LIBS('$OUT/so/libpaddle-trainer.so') - -#link flags -#LDFLAGS('-lpthread -lcrypto -lrt') - -#CONFIGS("baidu/bcloud-demo/hello-svn@hello-svn_1-0-2-1_PD_BL@git_tag") - -#user_sources=GLOB("*.c *.cpp *.cc *.idl") - -#release headers -HEADERS('*.h', '$INC') -HEADERS('*.hpp', '$INC') -HEADERS('include/*.h', '$INC') -HEADERS('include/*.hpp', '$INC') - -#release files except headers -#OUTPUT('conf', '$OUT') - -#bin -#Application('paddle-trainer', Sources(user_sources)) - -#UT -#UTApplication('paddle-trainer', Sources(user_sources), UTArgs(''), UTOnServer(False)) - -#.a -#StaticLibrary('paddle-trainer', Sources(user_sources)) -#StaticLibrary('paddle-trainer', PreBuilt(True)) - -#.so -#SharedLibrary('paddle-trainer', Sources(user_sources)) -#SharedLibrary('paddle-trainer', PreBuilt(True)) - -#sub directory -#Directory('demo') - +INCPATHS('./') +INCPATHS('$OUT/../') +INCPATHS('../../third-party') +INCPATHS('../../third-party/eigen') +INCPATHS('$OUT_ROOT/baidu/third-party/python/output/include/python2.7') +LDFLAGS('-lpthread -lcrypto -lrt -ldl -lssl -lz -lrt') +CONFIGS('baidu/third-party/any@15595d8324be9e8a9a80d9ae442fdd12bd66df5d@git_branch') +CONFIGS('baidu/third-party/boost@v1.41.0@git_branch') +CONFIGS('baidu/third-party/c-ares@v1.13.0@git_branch') +CONFIGS('baidu/third-party/eigen@917060c364181f33a735dc023818d5a54f60e54c@git_branch') +CONFIGS('baidu/third-party/gflags@77592648e3f3be87d6c7123eb81cbad75f9aef5a@git_branch') +CONFIGS('baidu/third-party/glog@v0.3.5@git_branch') +CONFIGS('baidu/third-party/leveldb@v1.18@git_branch') +CONFIGS('baidu/third-party/OpenBLAS@v0.2.20@git_branch') +CONFIGS('baidu/third-party/protobuf@9f75c5aa851cd877fb0d93ccc31b8567a6706546@git_branch') +CONFIGS('baidu/third-party/snappy@v1.1.7@git_branch') +CONFIGS('baidu/third-party/snappy_stream@0.2.8@git_branch') +CONFIGS('baidu/third-party/threadpool@9a42ec1329f259a5f4881a291db1dcb8f2ad9040@git_branch') +CONFIGS('baidu/third-party/warpctc@warp_ctc_head@git_branch') +CONFIGS('baidu/third-party/zlib@v1.2.8@git_branch') +# CONFIGS('baidu/third-party/brpc@7dc04defad1fd4173aae170c3fcbde131b65155a@git_branch') +CONFIGS('baidu/third-party/mklml@v20180406@git_branch') +CONFIGS('baidu/third-party/xbyak@v5.661@git_branch') +CONFIGS('baidu/third-party/xxhash@v0.6.5@git_branch') +CONFIGS('baidu/third-party/dlpack@v0.2@git_branch') +CONFIGS('baidu/third-party/gzstream@master@git_branch') +CONFIGS('baidu/third-party/pybind11@v2.2.4@git_branch') +CONFIGS('baidu/third-party/python@gcc482output@git_branch') + +HEADERS('paddle/fluid/memory/*.h', '$INC/paddle/fluid/memory/') +HEADERS('paddle/fluid/memory/detail/*.h', '$INC/paddle/fluid/memory/detail/') +HEADERS('paddle/fluid/memory/allocation/*.h', '$INC/paddle/fluid/memory/allocation/') +HEADERS('paddle/fluid/inference/*.h', '$INC/paddle/fluid/inference/') +HEADERS('paddle/fluid/platform/*.h', '$INC/paddle/fluid/platform/') +HEADERS('paddle/fluid/platform/dynload/*.h', '$INC/paddle/fluid/platform/dynload/') +HEADERS('paddle/fluid/platform/details/*.h', '$INC/paddle/fluid/platform/details/') +HEADERS('paddle/fluid/string/*.h', '$INC/paddle/fluid/string/') +HEADERS('paddle/fluid/string/tinyformat/*.h', '$INC/paddle/fluid/string/tinyformat/') +HEADERS('paddle/fluid/framework/*.h', '$INC/paddle/fluid/framework/') +HEADERS('paddle/fluid/framework/details/*.h', '$INC/paddle/fluid/framework/details/') +HEADERS('paddle/fluid/framework/ir/memory_optimize_pass/*.h', '$INC/paddle/fluid/framework/ir/memory_optimize_pass/') +HEADERS('paddle/fluid/framework/ir/*.h', '$INC/paddle/fluid/framework/ir/') +HEADERS('paddle/fluid/framework/fleet/*.h', '$INC/paddle/fluid/framework/fleet/') +HEADERS('paddle/fluid/inference/*.h', '$INC/paddle/fluid/inference/') +HEADERS('paddle/fluid/inference/api/*.h', '$INC/paddle/fluid/inference/api/') +HEADERS('paddle/fluid/pybind/pybind.h', '$INC/paddle/fluid/pybind') +HEADERS('paddle/fluid/inference/api/*.h', '$INC/paddle/fluid/inference/api/') +HEADERS(GLOB_GEN_SRCS('paddle/fluid/framework/*pb.h'), '$INC/paddle/fluid/framework') +HEADERS(GLOB_GEN_SRCS('paddle/fluid/platform/*pb.h'), '$INC/paddle/fluid/platform') + +PROTOC('../../third-party/protobuf/bin/protoc') +paddle_fluid_avx_mklml_src = "paddle/fluid/memory/detail/memory_block.cc paddle/fluid/memory/detail/memory_block_desc.cc paddle/fluid/memory/detail/meta_cache.cc paddle/fluid/memory/detail/system_allocator.cc paddle/fluid/memory/detail/buddy_allocator.cc paddle/fluid/memory/allocation/allocator.cc paddle/fluid/memory/allocation/cpu_allocator.cc paddle/fluid/memory/allocation/locked_allocator.cc paddle/fluid/memory/allocation/buffered_allocator.cc paddle/fluid/memory/allocation/best_fit_allocator.cc paddle/fluid/memory/allocation/naive_best_fit_allocator.cc paddle/fluid/memory/allocation/retry_allocator.cc paddle/fluid/memory/allocation/aligned_allocator.cc paddle/fluid/memory/allocation/allocator_strategy.cc paddle/fluid/memory/allocation/allocator_facade.cc paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc paddle/fluid/memory/malloc.cc paddle/fluid/memory/memcpy.cc paddle/fluid/platform/profiler.proto paddle/fluid/platform/enforce.cc paddle/fluid/platform/cpu_info.cc paddle/fluid/platform/place.cc paddle/fluid/platform/dynload/dynamic_loader.cc paddle/fluid/platform/dynload/warpctc.cc paddle/fluid/platform/dynload/mklml.cc paddle/fluid/platform/cpu_helper.cc paddle/fluid/platform/temporary_allocator.cc paddle/fluid/platform/device_context.cc paddle/fluid/platform/init.cc paddle/fluid/platform/timer.cc paddle/fluid/platform/lodtensor_printer.cc paddle/fluid/platform/device_tracer.cc paddle/fluid/platform/profiler.cc paddle/fluid/platform/device_memory_aligment.cc paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc paddle/fluid/framework/ir/memory_optimize_pass/while_op_eager_deletion_pass.cc paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.cc paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass.cc paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.cc paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.cc paddle/fluid/framework/ir/memory_optimize_pass/inplace_op_pass.cc paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc paddle/fluid/framework/ir/memory_optimize_pass/record_skip_memory_opt_vars_pass.cc paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc paddle/fluid/framework/ir/multi_devices_graph_pass/modify_op_lock_and_record_event_pass.cc paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.cc paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_check_pass.cc paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc paddle/fluid/framework/ir/multi_devices_graph_pass/backward_optimizer_op_deps_pass.cc paddle/fluid/framework/ir/node.cc paddle/fluid/framework/ir/graph.cc paddle/fluid/framework/ir/graph_helper.cc paddle/fluid/framework/ir/pass.cc paddle/fluid/framework/ir/graph_traits.cc paddle/fluid/framework/ir/graph_pattern_detector.cc paddle/fluid/framework/ir/fuse_pass_base.cc paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc paddle/fluid/framework/ir/graph_to_program_pass.cc paddle/fluid/framework/ir/graph_viz_pass.cc paddle/fluid/framework/ir/lock_free_optimize_pass.cc paddle/fluid/framework/ir/fc_fuse_pass.cc paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc paddle/fluid/framework/ir/infer_clean_graph_pass.cc paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc paddle/fluid/framework/ir/fc_gru_fuse_pass.cc paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc paddle/fluid/framework/ir/multi_batch_merge_pass.cc paddle/fluid/framework/ir/conv_bn_fuse_pass.cc paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc paddle/fluid/framework/ir/is_test_pass.cc paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc paddle/fluid/framework/ir/sync_batch_norm_pass.cc paddle/fluid/framework/ir/runtime_context_cache_pass.cc paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.cc paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc paddle/fluid/framework/ir/pass_builder.cc paddle/fluid/framework/details/var_handle.cc paddle/fluid/framework/details/op_handle_base.cc paddle/fluid/framework/details/scale_loss_grad_op_handle.cc paddle/fluid/framework/details/fetch_op_handle.cc paddle/fluid/framework/details/computation_op_handle.cc paddle/fluid/framework/details/rpc_op_handle.cc paddle/fluid/framework/details/fetch_barrier_op_handle.cc paddle/fluid/framework/details/multi_devices_helper.cc paddle/fluid/framework/details/variable_visitor.cc paddle/fluid/framework/details/all_reduce_op_handle.cc paddle/fluid/framework/details/fused_all_reduce_op_handle.cc paddle/fluid/framework/details/reduce_op_handle.cc paddle/fluid/framework/details/broadcast_op_handle.cc paddle/fluid/framework/details/fused_broadcast_op_handle.cc paddle/fluid/framework/details/gather_op_handle.cc paddle/fluid/framework/details/eager_deletion_op_handle.cc paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc paddle/fluid/framework/details/ssa_graph_executor.cc paddle/fluid/framework/details/threaded_ssa_graph_executor.cc paddle/fluid/framework/details/parallel_ssa_graph_executor.cc paddle/fluid/framework/details/async_ssa_graph_executor.cc paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc paddle/fluid/framework/details/build_strategy.cc paddle/fluid/framework/fleet/fleet_wrapper.cc paddle/fluid/framework/fleet/nccl_wrapper.cc paddle/fluid/framework/io/fs.cc paddle/fluid/framework/io/shell.cc paddle/fluid/framework/framework.proto paddle/fluid/framework/data_feed.proto paddle/fluid/framework/trainer_desc.proto paddle/fluid/framework/ddim.cc paddle/fluid/framework/data_type.cc paddle/fluid/framework/tensor.cc paddle/fluid/framework/tensor_util.cc paddle/fluid/framework/lod_tensor.cc paddle/fluid/framework/garbage_collector.cc paddle/fluid/framework/reader.cc paddle/fluid/framework/threadpool.cc paddle/fluid/framework/var_type_traits.cc paddle/fluid/framework/scope.cc paddle/fluid/framework/scope_pool.cc paddle/fluid/framework/data_device_transform.cc paddle/fluid/framework/data_type_transform.cc paddle/fluid/framework/data_layout_transform.cc paddle/fluid/framework/data_transform.cc paddle/fluid/framework/attribute.cc paddle/fluid/framework/op_proto_maker.cc paddle/fluid/framework/op_info.cc paddle/fluid/framework/shape_inference.cc paddle/fluid/framework/transfer_scope_cache.cc paddle/fluid/framework/op_kernel_type.cc paddle/fluid/framework/operator.cc paddle/fluid/framework/version.cc paddle/fluid/framework/var_desc.cc paddle/fluid/framework/op_desc.cc paddle/fluid/framework/block_desc.cc paddle/fluid/framework/program_desc.cc paddle/fluid/framework/op_registry.cc paddle/fluid/framework/lod_rank_table.cc paddle/fluid/framework/feed_fetch_method.cc paddle/fluid/framework/variable_helper.cc paddle/fluid/framework/naive_executor.cc paddle/fluid/framework/executor_gc_helper.cc paddle/fluid/framework/executor.cc paddle/fluid/framework/multi_trainer.cc paddle/fluid/framework/pipeline_trainer.cc paddle/fluid/framework/dataset_factory.cc paddle/fluid/framework/dist_multi_trainer.cc paddle/fluid/framework/trainer_factory.cc paddle/fluid/framework/trainer.cc paddle/fluid/framework/data_feed_factory.cc paddle/fluid/framework/data_feed.cc paddle/fluid/framework/device_worker.cc paddle/fluid/framework/hogwild_worker.cc paddle/fluid/framework/downpour_worker.cc paddle/fluid/framework/pull_dense_worker.cc paddle/fluid/framework/section_worker.cc paddle/fluid/framework/device_worker_factory.cc paddle/fluid/framework/data_set.cc paddle/fluid/framework/parallel_executor.cc paddle/fluid/framework/prune.cc paddle/fluid/framework/selected_rows.cc paddle/fluid/framework/dlpack_tensor.cc paddle/fluid/imperative/flags.cc paddle/fluid/operators/math/detail/avx_functions.cc paddle/fluid/operators/math/concat_and_split.cc paddle/fluid/operators/math/context_project.cc paddle/fluid/operators/math/cross_entropy.cc paddle/fluid/operators/math/cos_sim_functor.cc paddle/fluid/operators/math/im2col.cc paddle/fluid/operators/math/sample_prob.cc paddle/fluid/operators/math/sampler.cc paddle/fluid/operators/math/gru_compute.cc paddle/fluid/operators/math/lstm_compute.cc paddle/fluid/operators/math/blas.cc paddle/fluid/operators/math/math_function.cc paddle/fluid/operators/math/maxouting.cc paddle/fluid/operators/math/pooling.cc paddle/fluid/operators/math/selected_rows_functor.cc paddle/fluid/operators/math/sequence2batch.cc paddle/fluid/operators/math/sequence_padding.cc paddle/fluid/operators/math/sequence_pooling.cc paddle/fluid/operators/math/sequence_scale.cc paddle/fluid/operators/math/softmax.cc paddle/fluid/operators/math/beam_search.cc paddle/fluid/operators/math/matrix_bit_code.cc paddle/fluid/operators/math/unpooling.cc paddle/fluid/operators/math/vol2col.cc paddle/fluid/operators/math/tree2col.cc paddle/fluid/operators/controlflow/feed_op.cc paddle/fluid/operators/controlflow/logical_op.cc paddle/fluid/operators/controlflow/while_op.cc paddle/fluid/operators/controlflow/get_places_op.cc paddle/fluid/operators/controlflow/fetch_op.cc paddle/fluid/operators/controlflow/compare_op.cc paddle/fluid/operators/controlflow/conditional_block_infer_op.cc paddle/fluid/operators/controlflow/conditional_block_op.cc paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc paddle/fluid/operators/controlflow/op_variant.cc paddle/fluid/operators/controlflow/recurrent_op_helper.cc paddle/fluid/operators/controlflow/while_op_helper.cc paddle/fluid/operators/detection/bipartite_match_op.cc paddle/fluid/operators/detection/box_coder_op.cc paddle/fluid/operators/detection/iou_similarity_op.cc paddle/fluid/operators/detection/mine_hard_examples_op.cc paddle/fluid/operators/detection/multiclass_nms_op.cc paddle/fluid/operators/detection/poly_util.cc paddle/fluid/operators/detection/gpc.cc paddle/fluid/operators/detection/prior_box_op.cc paddle/fluid/operators/detection/density_prior_box_op.cc paddle/fluid/operators/detection/anchor_generator_op.cc paddle/fluid/operators/detection/target_assign_op.cc paddle/fluid/operators/detection/polygon_box_transform_op.cc paddle/fluid/operators/detection/rpn_target_assign_op.cc paddle/fluid/operators/detection/generate_proposal_labels_op.cc paddle/fluid/operators/detection/box_clip_op.cc paddle/fluid/operators/detection/yolov3_loss_op.cc paddle/fluid/operators/detection/yolo_box_op.cc paddle/fluid/operators/detection/box_decoder_and_assign_op.cc paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc paddle/fluid/operators/detection/retinanet_detection_output_op.cc paddle/fluid/operators/detection/generate_proposals_op.cc paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc paddle/fluid/operators/detection/collect_fpn_proposals_op.cc paddle/fluid/operators/detection/roi_perspective_transform_op.cc paddle/fluid/operators/detection/mask_util.cc paddle/fluid/operators/detection/generate_mask_labels_op.cc paddle/fluid/operators/elementwise/elementwise_mod_op.cc paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc paddle/fluid/operators/elementwise/elementwise_max_op.cc paddle/fluid/operators/elementwise/elementwise_pow_op.cc paddle/fluid/operators/elementwise/elementwise_sub_op.cc paddle/fluid/operators/elementwise/elementwise_add_op.cc paddle/fluid/operators/elementwise/elementwise_min_op.cc paddle/fluid/operators/elementwise/elementwise_div_op.cc paddle/fluid/operators/elementwise/elementwise_mul_op.cc paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc paddle/fluid/operators/fused/fusion_gru_op.cc paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc paddle/fluid/operators/fused/fusion_lstm_op.cc paddle/fluid/operators/fused/fused_elemwise_activation_op.cc paddle/fluid/operators/metrics/accuracy_op.cc paddle/fluid/operators/metrics/precision_recall_op.cc paddle/fluid/operators/metrics/auc_op.cc paddle/fluid/operators/optimizers/adamax_op.cc paddle/fluid/operators/optimizers/sgd_op.cc paddle/fluid/operators/optimizers/lars_momentum_op.cc paddle/fluid/operators/optimizers/adagrad_op.cc paddle/fluid/operators/optimizers/ftrl_op.cc paddle/fluid/operators/optimizers/momentum_op.cc paddle/fluid/operators/optimizers/adadelta_op.cc paddle/fluid/operators/optimizers/rmsprop_op.cc paddle/fluid/operators/optimizers/lamb_op.cc paddle/fluid/operators/optimizers/proximal_gd_op.cc paddle/fluid/operators/optimizers/proximal_adagrad_op.cc paddle/fluid/operators/optimizers/adam_op.cc paddle/fluid/operators/optimizers/decayed_adagrad_op.cc paddle/fluid/operators/reduce_ops/reduce_all_op.cc paddle/fluid/operators/reduce_ops/reduce_min_op.cc paddle/fluid/operators/reduce_ops/reduce_sum_op.cc paddle/fluid/operators/reduce_ops/reduce_any_op.cc paddle/fluid/operators/reduce_ops/reduce_max_op.cc paddle/fluid/operators/reduce_ops/reduce_mean_op.cc paddle/fluid/operators/reduce_ops/reduce_prod_op.cc paddle/fluid/operators/sequence_ops/sequence_erase_op.cc paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc paddle/fluid/operators/sequence_ops/sequence_mask_op.cc paddle/fluid/operators/sequence_ops/sequence_expand_op.cc paddle/fluid/operators/sequence_ops/sequence_pad_op.cc paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc paddle/fluid/operators/sequence_ops/sequence_slice_op.cc paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc paddle/fluid/operators/sequence_ops/sequence_pool_op.cc paddle/fluid/operators/sequence_ops/sequence_reverse_op.cc paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc paddle/fluid/operators/sequence_ops/sequence_conv_op.cc paddle/fluid/operators/sequence_ops/sequence_concat_op.cc paddle/fluid/operators/sequence_ops/sequence_reshape_op.cc paddle/fluid/operators/jit/helper.cc paddle/fluid/operators/jit/kernel_key.cc paddle/fluid/operators/jit/gen_base.cc paddle/fluid/operators/jit/kernel_pool.cc paddle/fluid/operators/jit/refer/refer.cc paddle/fluid/operators/jit/more/mkl/mkl.cc paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc paddle/fluid/operators/jit/more/mix/mix.cc paddle/fluid/operators/jit/gen/sgd.cc paddle/fluid/operators/jit/gen/hopv.cc paddle/fluid/operators/jit/gen/lstm.cc paddle/fluid/operators/jit/gen/gru.cc paddle/fluid/operators/jit/gen/vbroadcast.cc paddle/fluid/operators/jit/gen/matmul.cc paddle/fluid/operators/jit/gen/seqpool.cc paddle/fluid/operators/jit/gen/embseqpool.cc paddle/fluid/operators/jit/gen/act.cc paddle/fluid/operators/jit/gen/blas.cc paddle/fluid/operators/reader/reader_op_registry.cc paddle/fluid/operators/reader/py_reader.cc paddle/fluid/operators/reader/buffered_reader.cc paddle/fluid/operators/reader/open_files_op.cc paddle/fluid/operators/reader/create_random_data_generator_op.cc paddle/fluid/operators/reader/create_shuffle_reader_op.cc paddle/fluid/operators/reader/create_batch_reader_op.cc paddle/fluid/operators/reader/create_recordio_file_reader_op.cc paddle/fluid/operators/reader/create_double_buffer_reader_op.cc paddle/fluid/operators/reader/create_multi_pass_reader_op.cc paddle/fluid/operators/reader/create_custom_reader_op.cc paddle/fluid/operators/reader/create_py_reader_op.cc paddle/fluid/operators/reader/read_op.cc paddle/fluid/operators/increment_op.cc paddle/fluid/operators/stack_op.cc paddle/fluid/operators/fc_op.cc paddle/fluid/operators/assign_op.cc paddle/fluid/operators/load_op.cc paddle/fluid/operators/fill_op.cc paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc paddle/fluid/operators/conv_shift_op.cc paddle/fluid/operators/fill_zeros_like_op.cc paddle/fluid/operators/hash_op.cc paddle/fluid/operators/dequantize_op.cc paddle/fluid/operators/fake_quantize_op.cc paddle/fluid/operators/size_op.cc paddle/fluid/operators/scatter_op.cc paddle/fluid/operators/uniform_random_op.cc paddle/fluid/operators/beam_search_op.cc paddle/fluid/operators/beam_search_decode_op.cc paddle/fluid/operators/dropout_op.cc paddle/fluid/operators/interpolate_op.cc paddle/fluid/operators/sampling_id_op.cc paddle/fluid/operators/lstm_op.cc paddle/fluid/operators/modified_huber_loss_op.cc paddle/fluid/operators/temporal_shift_op.cc paddle/fluid/operators/sum_op.cc paddle/fluid/operators/arg_min_op.cc paddle/fluid/operators/psroi_pool_op.cc paddle/fluid/operators/uniform_random_batch_size_like_op.cc paddle/fluid/operators/rnn_memory_helper_op.cc paddle/fluid/operators/crf_decoding_op.cc paddle/fluid/operators/where_op.cc paddle/fluid/operators/fake_dequantize_op.cc paddle/fluid/operators/mean_iou_op.cc paddle/fluid/operators/roi_align_op.cc paddle/fluid/operators/range_op.cc paddle/fluid/operators/edit_distance_op.cc paddle/fluid/operators/multiplex_op.cc paddle/fluid/operators/clip_op.cc paddle/fluid/operators/gaussian_random_op.cc paddle/fluid/operators/norm_op.cc paddle/fluid/operators/rank_loss_op.cc paddle/fluid/operators/detection_map_op.cc paddle/fluid/operators/lstm_unit_op.cc paddle/fluid/operators/shard_index_op.cc paddle/fluid/operators/shape_op.cc paddle/fluid/operators/arg_max_op.cc paddle/fluid/operators/average_accumulates_op.cc paddle/fluid/operators/requantize_op.cc paddle/fluid/operators/conv_op.cc paddle/fluid/operators/add_position_encoding_op.cc paddle/fluid/operators/gru_unit_op.cc paddle/fluid/operators/batch_norm_op.cc paddle/fluid/operators/chunk_eval_op.cc paddle/fluid/operators/lod_rank_table_op.cc paddle/fluid/operators/unsqueeze_op.cc paddle/fluid/operators/positive_negative_pair_op.cc paddle/fluid/operators/im2sequence_op.cc paddle/fluid/operators/margin_rank_loss_op.cc paddle/fluid/operators/hinge_loss_op.cc paddle/fluid/operators/cvm_op.cc paddle/fluid/operators/huber_loss_op.cc paddle/fluid/operators/crop_op.cc paddle/fluid/operators/activation_op.cc paddle/fluid/operators/hierarchical_sigmoid_op.cc paddle/fluid/operators/unfold_op.cc paddle/fluid/operators/max_sequence_len_op.cc paddle/fluid/operators/mul_op.cc paddle/fluid/operators/attention_lstm_op.cc paddle/fluid/operators/top_k_op.cc paddle/fluid/operators/group_norm_op.cc paddle/fluid/operators/selu_op.cc paddle/fluid/operators/lstmp_op.cc paddle/fluid/operators/merge_lod_tensor_op.cc paddle/fluid/operators/truncated_gaussian_random_op.cc paddle/fluid/operators/label_smooth_op.cc paddle/fluid/operators/matmul_op.cc paddle/fluid/operators/spp_op.cc paddle/fluid/operators/unstack_op.cc paddle/fluid/operators/conv_transpose_op.cc paddle/fluid/operators/diag_op.cc paddle/fluid/operators/unpool_op.cc paddle/fluid/operators/lod_array_length_op.cc paddle/fluid/operators/affine_channel_op.cc paddle/fluid/operators/log_loss_op.cc paddle/fluid/operators/concat_op.cc paddle/fluid/operators/lod_tensor_to_array_op.cc paddle/fluid/operators/gru_op.cc paddle/fluid/operators/coalesce_tensor_op.cc paddle/fluid/operators/fsp_op.cc paddle/fluid/operators/linspace_op.cc paddle/fluid/operators/reverse_op.cc paddle/fluid/operators/recurrent_op.cc paddle/fluid/operators/split_selected_rows_op.cc paddle/fluid/operators/dgc_clip_by_norm_op.cc paddle/fluid/operators/scale_op.cc paddle/fluid/operators/save_op.cc paddle/fluid/operators/load_combine_op.cc paddle/fluid/operators/merge_selected_rows_op.cc paddle/fluid/operators/split_op.cc paddle/fluid/operators/cumsum_op.cc paddle/fluid/operators/deformable_psroi_pooling_op.cc paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc paddle/fluid/operators/transpose_op.cc paddle/fluid/operators/fill_constant_batch_size_like_op.cc paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc paddle/fluid/operators/shuffle_channel_op.cc paddle/fluid/operators/affine_grid_op.cc paddle/fluid/operators/split_lod_tensor_op.cc paddle/fluid/operators/grid_sampler_op.cc paddle/fluid/operators/lookup_table_op.cc paddle/fluid/operators/cos_sim_op.cc paddle/fluid/operators/quantize_op.cc paddle/fluid/operators/spectral_norm_op.cc paddle/fluid/operators/cross_entropy_op.cc paddle/fluid/operators/print_op.cc paddle/fluid/operators/lrn_op.cc paddle/fluid/operators/nce_op.cc paddle/fluid/operators/similarity_focus_op.cc paddle/fluid/operators/get_tensor_from_selected_rows_op.cc paddle/fluid/operators/squared_l2_distance_op.cc paddle/fluid/operators/cudnn_lstm_op.cc paddle/fluid/operators/tree_conv_op.cc paddle/fluid/operators/one_hot_op.cc paddle/fluid/operators/lookup_sparse_table_op.cc paddle/fluid/operators/unique_op.cc paddle/fluid/operators/mean_op.cc paddle/fluid/operators/prelu_op.cc paddle/fluid/operators/delete_var_op.cc paddle/fluid/operators/ctc_align_op.cc paddle/fluid/operators/argsort_op.cc paddle/fluid/operators/data_norm_op.cc paddle/fluid/operators/minus_op.cc paddle/fluid/operators/shrink_rnn_memory_op.cc paddle/fluid/operators/lod_reset_op.cc paddle/fluid/operators/l1_norm_op.cc paddle/fluid/operators/gaussian_random_batch_size_like_op.cc paddle/fluid/operators/is_empty_op.cc paddle/fluid/operators/bilinear_tensor_product_op.cc paddle/fluid/operators/kldiv_loss_op.cc paddle/fluid/operators/squeeze_op.cc paddle/fluid/operators/softmax_op.cc paddle/fluid/operators/clip_by_norm_op.cc paddle/fluid/operators/pool_with_index_op.cc paddle/fluid/operators/linear_chain_crf_op.cc paddle/fluid/operators/reshape_op.cc paddle/fluid/operators/fill_constant_op.cc paddle/fluid/operators/space_to_depth_op.cc paddle/fluid/operators/gather_op.cc paddle/fluid/operators/softmax_with_cross_entropy_op.cc paddle/fluid/operators/slice_op.cc paddle/fluid/operators/sign_op.cc paddle/fluid/operators/expand_op.cc paddle/fluid/operators/smooth_l1_loss_op.cc paddle/fluid/operators/tensor_array_to_tensor_op.cc paddle/fluid/operators/row_conv_op.cc paddle/fluid/operators/pad2d_op.cc paddle/fluid/operators/pixel_shuffle_op.cc paddle/fluid/operators/assign_value_op.cc paddle/fluid/operators/random_crop_op.cc paddle/fluid/operators/squared_l2_norm_op.cc paddle/fluid/operators/save_combine_op.cc paddle/fluid/operators/pool_op.cc paddle/fluid/operators/cast_op.cc paddle/fluid/operators/array_to_lod_tensor_op.cc paddle/fluid/operators/fill_any_like_op.cc paddle/fluid/operators/flatten_op.cc paddle/fluid/operators/sample_logits_op.cc paddle/fluid/operators/pad_op.cc paddle/fluid/operators/bpr_loss_op.cc paddle/fluid/operators/roi_pool_op.cc paddle/fluid/operators/pad_constant_like_op.cc paddle/fluid/operators/isfinite_op.cc paddle/fluid/operators/layer_norm_op.cc paddle/fluid/operators/maxout_op.cc paddle/fluid/operators/warpctc_op.cc paddle/fluid/string/piece.cc paddle/fluid/string/pretty_log.cc paddle/fluid/string/string_helper.cc paddle/fluid/recordio/header.cc paddle/fluid/recordio/chunk.cc paddle/fluid/recordio/writer.cc paddle/fluid/recordio/scanner.cc paddle/fluid/inference/io.cc paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.cc paddle/fluid/inference/analysis/passes/inference_op_replace_pass.cc paddle/fluid/inference/analysis/passes/passes.cc paddle/fluid/inference/analysis/helper.cc paddle/fluid/inference/analysis/ir_pass_manager.cc paddle/fluid/inference/analysis/argument.cc paddle/fluid/inference/analysis/analysis_pass.cc paddle/fluid/inference/analysis/analyzer.cc paddle/fluid/inference/utils/benchmark.cc paddle/fluid/inference/api/api.cc paddle/fluid/inference/api/api_impl.cc paddle/fluid/inference/api/helper.cc paddle/fluid/inference/api/analysis_predictor.cc paddle/fluid/inference/api/details/zero_copy_tensor.cc paddle/fluid/inference/api/details/reset_tensor_array.cc paddle/fluid/inference/api/analysis_config.cc paddle/fluid/inference/api/paddle_pass_builder.cc" +paddle_fluid_avx_mklml_src += ' paddle/fluid/framework/revision.cc' + +StaticLibrary('paddle_fluid_avx_mklml', Sources(paddle_fluid_avx_mklml_src, CppFlags('-DHPPL_STUB_FUNC -DLAPACK_FOUND -DPADDLE_DISABLE_PROFILER -DPADDLE_NO_PYTHON -DPADDLE_ON_INFERENCE -DPADDLE_USE_DSO -DPADDLE_USE_PTHREAD_BARRIER -DPADDLE_USE_PTHREAD_SPINLOCK -DPADDLE_VERSION=0.0.0 -DPADDLE_WITH_AVX -DPADDLE_WITH_MKLML -DPADDLE_WITH_XBYAK -DXBYAK64 -DXBYAK_NO_OP_NAMES -D_GNU_SOURCE -D__STDC_LIMIT_MACROS -DPYBIND_AVX_MKLML' + r" -DPADDLE_REVISION=\"%s@%s@%s\"" % (REPO_URL(), REPO_BRANCH(), REPO_REVISION())), CFlags('-std=c++11 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=sign-compare -Wno-error=unused-local-typedefs -Wno-error=maybe-uninitialized -fopenmp -mavx -O3 -DNDEBUG '), CxxFlags('-std=c++11 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=sign-compare -Wno-error=unused-local-typedefs -Wno-error=maybe-uninitialized -fopenmp -mavx -O3 -DNDEBUG '))) +SharedLibrary('paddle_fluid_avx_mklml', Sources(paddle_fluid_avx_mklml_src, CppFlags('-DHPPL_STUB_FUNC -DLAPACK_FOUND -DPADDLE_DISABLE_PROFILER -DPADDLE_NO_PYTHON -DPADDLE_ON_INFERENCE -DPADDLE_USE_DSO -DPADDLE_USE_PTHREAD_BARRIER -DPADDLE_USE_PTHREAD_SPINLOCK -DPADDLE_VERSION=0.0.0 -DPADDLE_WITH_AVX -DPADDLE_WITH_MKLML -DPADDLE_WITH_XBYAK -DXBYAK64 -DXBYAK_NO_OP_NAMES -D_GNU_SOURCE -D__STDC_LIMIT_MACROS -DPYBIND_AVX_MKLML' + r" -DPADDLE_REVISION=\"%s@%s@%s\"" % (REPO_URL(), REPO_BRANCH(), REPO_REVISION())), CFlags('-std=c++11 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=sign-compare -Wno-error=unused-local-typedefs -Wno-error=maybe-uninitialized -fopenmp -mavx -O3 -DNDEBUG '), CxxFlags('-std=c++11 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=sign-compare -Wno-error=unused-local-typedefs -Wno-error=maybe-uninitialized -fopenmp -mavx -O3 -DNDEBUG ')), LinkDeps(True)) + +paddle_fluid_noavx_openblas_src = "paddle/fluid/memory/detail/memory_block.cc paddle/fluid/memory/detail/memory_block_desc.cc paddle/fluid/memory/detail/meta_cache.cc paddle/fluid/memory/detail/system_allocator.cc paddle/fluid/memory/detail/buddy_allocator.cc paddle/fluid/memory/allocation/allocator.cc paddle/fluid/memory/allocation/cpu_allocator.cc paddle/fluid/memory/allocation/locked_allocator.cc paddle/fluid/memory/allocation/buffered_allocator.cc paddle/fluid/memory/allocation/best_fit_allocator.cc paddle/fluid/memory/allocation/naive_best_fit_allocator.cc paddle/fluid/memory/allocation/retry_allocator.cc paddle/fluid/memory/allocation/aligned_allocator.cc paddle/fluid/memory/allocation/allocator_strategy.cc paddle/fluid/memory/allocation/allocator_facade.cc paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc paddle/fluid/memory/malloc.cc paddle/fluid/memory/memcpy.cc paddle/fluid/platform/profiler.proto paddle/fluid/platform/enforce.cc paddle/fluid/platform/cpu_info.cc paddle/fluid/platform/place.cc paddle/fluid/platform/dynload/dynamic_loader.cc paddle/fluid/platform/dynload/warpctc.cc paddle/fluid/platform/cpu_helper.cc paddle/fluid/platform/temporary_allocator.cc paddle/fluid/platform/device_context.cc paddle/fluid/platform/init.cc paddle/fluid/platform/timer.cc paddle/fluid/platform/lodtensor_printer.cc paddle/fluid/platform/device_tracer.cc paddle/fluid/platform/profiler.cc paddle/fluid/platform/device_memory_aligment.cc paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc paddle/fluid/framework/ir/memory_optimize_pass/while_op_eager_deletion_pass.cc paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.cc paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass.cc paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.cc paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.cc paddle/fluid/framework/ir/memory_optimize_pass/inplace_op_pass.cc paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc paddle/fluid/framework/ir/memory_optimize_pass/record_skip_memory_opt_vars_pass.cc paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc paddle/fluid/framework/ir/multi_devices_graph_pass/modify_op_lock_and_record_event_pass.cc paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.cc paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_check_pass.cc paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc paddle/fluid/framework/ir/multi_devices_graph_pass/backward_optimizer_op_deps_pass.cc paddle/fluid/framework/ir/node.cc paddle/fluid/framework/ir/graph.cc paddle/fluid/framework/ir/graph_helper.cc paddle/fluid/framework/ir/pass.cc paddle/fluid/framework/ir/graph_traits.cc paddle/fluid/framework/ir/graph_pattern_detector.cc paddle/fluid/framework/ir/fuse_pass_base.cc paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc paddle/fluid/framework/ir/graph_to_program_pass.cc paddle/fluid/framework/ir/graph_viz_pass.cc paddle/fluid/framework/ir/lock_free_optimize_pass.cc paddle/fluid/framework/ir/fc_fuse_pass.cc paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc paddle/fluid/framework/ir/infer_clean_graph_pass.cc paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc paddle/fluid/framework/ir/fc_gru_fuse_pass.cc paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc paddle/fluid/framework/ir/multi_batch_merge_pass.cc paddle/fluid/framework/ir/conv_bn_fuse_pass.cc paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc paddle/fluid/framework/ir/is_test_pass.cc paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc paddle/fluid/framework/ir/sync_batch_norm_pass.cc paddle/fluid/framework/ir/runtime_context_cache_pass.cc paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.cc paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc paddle/fluid/framework/ir/pass_builder.cc paddle/fluid/framework/details/var_handle.cc paddle/fluid/framework/details/op_handle_base.cc paddle/fluid/framework/details/scale_loss_grad_op_handle.cc paddle/fluid/framework/details/fetch_op_handle.cc paddle/fluid/framework/details/computation_op_handle.cc paddle/fluid/framework/details/rpc_op_handle.cc paddle/fluid/framework/details/fetch_barrier_op_handle.cc paddle/fluid/framework/details/multi_devices_helper.cc paddle/fluid/framework/details/variable_visitor.cc paddle/fluid/framework/details/all_reduce_op_handle.cc paddle/fluid/framework/details/fused_all_reduce_op_handle.cc paddle/fluid/framework/details/reduce_op_handle.cc paddle/fluid/framework/details/broadcast_op_handle.cc paddle/fluid/framework/details/fused_broadcast_op_handle.cc paddle/fluid/framework/details/gather_op_handle.cc paddle/fluid/framework/details/eager_deletion_op_handle.cc paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc paddle/fluid/framework/details/ssa_graph_executor.cc paddle/fluid/framework/details/threaded_ssa_graph_executor.cc paddle/fluid/framework/details/parallel_ssa_graph_executor.cc paddle/fluid/framework/details/async_ssa_graph_executor.cc paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc paddle/fluid/framework/details/build_strategy.cc paddle/fluid/framework/fleet/fleet_wrapper.cc paddle/fluid/framework/fleet/nccl_wrapper.cc paddle/fluid/framework/io/fs.cc paddle/fluid/framework/io/shell.cc paddle/fluid/framework/framework.proto paddle/fluid/framework/data_feed.proto paddle/fluid/framework/trainer_desc.proto paddle/fluid/framework/ddim.cc paddle/fluid/framework/data_type.cc paddle/fluid/framework/tensor.cc paddle/fluid/framework/tensor_util.cc paddle/fluid/framework/lod_tensor.cc paddle/fluid/framework/garbage_collector.cc paddle/fluid/framework/reader.cc paddle/fluid/framework/threadpool.cc paddle/fluid/framework/var_type_traits.cc paddle/fluid/framework/scope.cc paddle/fluid/framework/scope_pool.cc paddle/fluid/framework/data_device_transform.cc paddle/fluid/framework/data_type_transform.cc paddle/fluid/framework/data_layout_transform.cc paddle/fluid/framework/data_transform.cc paddle/fluid/framework/attribute.cc paddle/fluid/framework/op_proto_maker.cc paddle/fluid/framework/op_info.cc paddle/fluid/framework/shape_inference.cc paddle/fluid/framework/transfer_scope_cache.cc paddle/fluid/framework/op_kernel_type.cc paddle/fluid/framework/operator.cc paddle/fluid/framework/version.cc paddle/fluid/framework/var_desc.cc paddle/fluid/framework/op_desc.cc paddle/fluid/framework/block_desc.cc paddle/fluid/framework/program_desc.cc paddle/fluid/framework/op_registry.cc paddle/fluid/framework/lod_rank_table.cc paddle/fluid/framework/feed_fetch_method.cc paddle/fluid/framework/variable_helper.cc paddle/fluid/framework/naive_executor.cc paddle/fluid/framework/executor_gc_helper.cc paddle/fluid/framework/executor.cc paddle/fluid/framework/multi_trainer.cc paddle/fluid/framework/pipeline_trainer.cc paddle/fluid/framework/dataset_factory.cc paddle/fluid/framework/dist_multi_trainer.cc paddle/fluid/framework/trainer_factory.cc paddle/fluid/framework/trainer.cc paddle/fluid/framework/data_feed_factory.cc paddle/fluid/framework/data_feed.cc paddle/fluid/framework/device_worker.cc paddle/fluid/framework/hogwild_worker.cc paddle/fluid/framework/downpour_worker.cc paddle/fluid/framework/pull_dense_worker.cc paddle/fluid/framework/section_worker.cc paddle/fluid/framework/device_worker_factory.cc paddle/fluid/framework/data_set.cc paddle/fluid/framework/parallel_executor.cc paddle/fluid/framework/prune.cc paddle/fluid/framework/selected_rows.cc paddle/fluid/framework/dlpack_tensor.cc paddle/fluid/imperative/flags.cc paddle/fluid/operators/math/detail/avx_functions.cc paddle/fluid/operators/math/concat_and_split.cc paddle/fluid/operators/math/context_project.cc paddle/fluid/operators/math/cross_entropy.cc paddle/fluid/operators/math/cos_sim_functor.cc paddle/fluid/operators/math/im2col.cc paddle/fluid/operators/math/sample_prob.cc paddle/fluid/operators/math/sampler.cc paddle/fluid/operators/math/gru_compute.cc paddle/fluid/operators/math/lstm_compute.cc paddle/fluid/operators/math/blas.cc paddle/fluid/operators/math/math_function.cc paddle/fluid/operators/math/maxouting.cc paddle/fluid/operators/math/pooling.cc paddle/fluid/operators/math/selected_rows_functor.cc paddle/fluid/operators/math/sequence2batch.cc paddle/fluid/operators/math/sequence_padding.cc paddle/fluid/operators/math/sequence_pooling.cc paddle/fluid/operators/math/sequence_scale.cc paddle/fluid/operators/math/softmax.cc paddle/fluid/operators/math/beam_search.cc paddle/fluid/operators/math/matrix_bit_code.cc paddle/fluid/operators/math/unpooling.cc paddle/fluid/operators/math/vol2col.cc paddle/fluid/operators/math/tree2col.cc paddle/fluid/operators/controlflow/feed_op.cc paddle/fluid/operators/controlflow/logical_op.cc paddle/fluid/operators/controlflow/while_op.cc paddle/fluid/operators/controlflow/get_places_op.cc paddle/fluid/operators/controlflow/fetch_op.cc paddle/fluid/operators/controlflow/compare_op.cc paddle/fluid/operators/controlflow/conditional_block_infer_op.cc paddle/fluid/operators/controlflow/conditional_block_op.cc paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc paddle/fluid/operators/controlflow/op_variant.cc paddle/fluid/operators/controlflow/recurrent_op_helper.cc paddle/fluid/operators/controlflow/while_op_helper.cc paddle/fluid/operators/detection/bipartite_match_op.cc paddle/fluid/operators/detection/box_coder_op.cc paddle/fluid/operators/detection/iou_similarity_op.cc paddle/fluid/operators/detection/mine_hard_examples_op.cc paddle/fluid/operators/detection/multiclass_nms_op.cc paddle/fluid/operators/detection/poly_util.cc paddle/fluid/operators/detection/gpc.cc paddle/fluid/operators/detection/prior_box_op.cc paddle/fluid/operators/detection/density_prior_box_op.cc paddle/fluid/operators/detection/anchor_generator_op.cc paddle/fluid/operators/detection/target_assign_op.cc paddle/fluid/operators/detection/polygon_box_transform_op.cc paddle/fluid/operators/detection/rpn_target_assign_op.cc paddle/fluid/operators/detection/generate_proposal_labels_op.cc paddle/fluid/operators/detection/box_clip_op.cc paddle/fluid/operators/detection/yolov3_loss_op.cc paddle/fluid/operators/detection/yolo_box_op.cc paddle/fluid/operators/detection/box_decoder_and_assign_op.cc paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc paddle/fluid/operators/detection/retinanet_detection_output_op.cc paddle/fluid/operators/detection/generate_proposals_op.cc paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc paddle/fluid/operators/detection/collect_fpn_proposals_op.cc paddle/fluid/operators/detection/roi_perspective_transform_op.cc paddle/fluid/operators/detection/mask_util.cc paddle/fluid/operators/detection/generate_mask_labels_op.cc paddle/fluid/operators/elementwise/elementwise_mod_op.cc paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc paddle/fluid/operators/elementwise/elementwise_max_op.cc paddle/fluid/operators/elementwise/elementwise_pow_op.cc paddle/fluid/operators/elementwise/elementwise_sub_op.cc paddle/fluid/operators/elementwise/elementwise_add_op.cc paddle/fluid/operators/elementwise/elementwise_min_op.cc paddle/fluid/operators/elementwise/elementwise_div_op.cc paddle/fluid/operators/elementwise/elementwise_mul_op.cc paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc paddle/fluid/operators/fused/fusion_gru_op.cc paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc paddle/fluid/operators/fused/fusion_lstm_op.cc paddle/fluid/operators/fused/fused_elemwise_activation_op.cc paddle/fluid/operators/metrics/accuracy_op.cc paddle/fluid/operators/metrics/precision_recall_op.cc paddle/fluid/operators/metrics/auc_op.cc paddle/fluid/operators/optimizers/adamax_op.cc paddle/fluid/operators/optimizers/sgd_op.cc paddle/fluid/operators/optimizers/lars_momentum_op.cc paddle/fluid/operators/optimizers/adagrad_op.cc paddle/fluid/operators/optimizers/ftrl_op.cc paddle/fluid/operators/optimizers/momentum_op.cc paddle/fluid/operators/optimizers/adadelta_op.cc paddle/fluid/operators/optimizers/rmsprop_op.cc paddle/fluid/operators/optimizers/lamb_op.cc paddle/fluid/operators/optimizers/proximal_gd_op.cc paddle/fluid/operators/optimizers/proximal_adagrad_op.cc paddle/fluid/operators/optimizers/adam_op.cc paddle/fluid/operators/optimizers/decayed_adagrad_op.cc paddle/fluid/operators/reduce_ops/reduce_all_op.cc paddle/fluid/operators/reduce_ops/reduce_min_op.cc paddle/fluid/operators/reduce_ops/reduce_sum_op.cc paddle/fluid/operators/reduce_ops/reduce_any_op.cc paddle/fluid/operators/reduce_ops/reduce_max_op.cc paddle/fluid/operators/reduce_ops/reduce_mean_op.cc paddle/fluid/operators/reduce_ops/reduce_prod_op.cc paddle/fluid/operators/sequence_ops/sequence_erase_op.cc paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc paddle/fluid/operators/sequence_ops/sequence_mask_op.cc paddle/fluid/operators/sequence_ops/sequence_expand_op.cc paddle/fluid/operators/sequence_ops/sequence_pad_op.cc paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc paddle/fluid/operators/sequence_ops/sequence_slice_op.cc paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc paddle/fluid/operators/sequence_ops/sequence_pool_op.cc paddle/fluid/operators/sequence_ops/sequence_reverse_op.cc paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc paddle/fluid/operators/sequence_ops/sequence_conv_op.cc paddle/fluid/operators/sequence_ops/sequence_concat_op.cc paddle/fluid/operators/sequence_ops/sequence_reshape_op.cc paddle/fluid/operators/jit/helper.cc paddle/fluid/operators/jit/kernel_key.cc paddle/fluid/operators/jit/gen_base.cc paddle/fluid/operators/jit/kernel_pool.cc paddle/fluid/operators/jit/refer/refer.cc paddle/fluid/operators/jit/more/mix/mix.cc paddle/fluid/operators/jit/gen/sgd.cc paddle/fluid/operators/jit/gen/hopv.cc paddle/fluid/operators/jit/gen/lstm.cc paddle/fluid/operators/jit/gen/gru.cc paddle/fluid/operators/jit/gen/vbroadcast.cc paddle/fluid/operators/jit/gen/matmul.cc paddle/fluid/operators/jit/gen/seqpool.cc paddle/fluid/operators/jit/gen/embseqpool.cc paddle/fluid/operators/jit/gen/act.cc paddle/fluid/operators/jit/gen/blas.cc paddle/fluid/operators/reader/reader_op_registry.cc paddle/fluid/operators/reader/py_reader.cc paddle/fluid/operators/reader/buffered_reader.cc paddle/fluid/operators/reader/open_files_op.cc paddle/fluid/operators/reader/create_random_data_generator_op.cc paddle/fluid/operators/reader/create_shuffle_reader_op.cc paddle/fluid/operators/reader/create_batch_reader_op.cc paddle/fluid/operators/reader/create_recordio_file_reader_op.cc paddle/fluid/operators/reader/create_double_buffer_reader_op.cc paddle/fluid/operators/reader/create_multi_pass_reader_op.cc paddle/fluid/operators/reader/create_custom_reader_op.cc paddle/fluid/operators/reader/create_py_reader_op.cc paddle/fluid/operators/reader/read_op.cc paddle/fluid/operators/increment_op.cc paddle/fluid/operators/stack_op.cc paddle/fluid/operators/fc_op.cc paddle/fluid/operators/assign_op.cc paddle/fluid/operators/load_op.cc paddle/fluid/operators/fill_op.cc paddle/fluid/operators/reorder_lod_tensor_by_rank_op.cc paddle/fluid/operators/conv_shift_op.cc paddle/fluid/operators/fill_zeros_like_op.cc paddle/fluid/operators/hash_op.cc paddle/fluid/operators/dequantize_op.cc paddle/fluid/operators/fake_quantize_op.cc paddle/fluid/operators/size_op.cc paddle/fluid/operators/scatter_op.cc paddle/fluid/operators/uniform_random_op.cc paddle/fluid/operators/beam_search_op.cc paddle/fluid/operators/beam_search_decode_op.cc paddle/fluid/operators/dropout_op.cc paddle/fluid/operators/interpolate_op.cc paddle/fluid/operators/sampling_id_op.cc paddle/fluid/operators/lstm_op.cc paddle/fluid/operators/modified_huber_loss_op.cc paddle/fluid/operators/temporal_shift_op.cc paddle/fluid/operators/sum_op.cc paddle/fluid/operators/arg_min_op.cc paddle/fluid/operators/psroi_pool_op.cc paddle/fluid/operators/uniform_random_batch_size_like_op.cc paddle/fluid/operators/rnn_memory_helper_op.cc paddle/fluid/operators/crf_decoding_op.cc paddle/fluid/operators/where_op.cc paddle/fluid/operators/fake_dequantize_op.cc paddle/fluid/operators/mean_iou_op.cc paddle/fluid/operators/roi_align_op.cc paddle/fluid/operators/range_op.cc paddle/fluid/operators/edit_distance_op.cc paddle/fluid/operators/multiplex_op.cc paddle/fluid/operators/clip_op.cc paddle/fluid/operators/gaussian_random_op.cc paddle/fluid/operators/norm_op.cc paddle/fluid/operators/rank_loss_op.cc paddle/fluid/operators/detection_map_op.cc paddle/fluid/operators/lstm_unit_op.cc paddle/fluid/operators/shard_index_op.cc paddle/fluid/operators/shape_op.cc paddle/fluid/operators/arg_max_op.cc paddle/fluid/operators/average_accumulates_op.cc paddle/fluid/operators/requantize_op.cc paddle/fluid/operators/conv_op.cc paddle/fluid/operators/add_position_encoding_op.cc paddle/fluid/operators/gru_unit_op.cc paddle/fluid/operators/batch_norm_op.cc paddle/fluid/operators/chunk_eval_op.cc paddle/fluid/operators/lod_rank_table_op.cc paddle/fluid/operators/unsqueeze_op.cc paddle/fluid/operators/positive_negative_pair_op.cc paddle/fluid/operators/im2sequence_op.cc paddle/fluid/operators/margin_rank_loss_op.cc paddle/fluid/operators/hinge_loss_op.cc paddle/fluid/operators/cvm_op.cc paddle/fluid/operators/huber_loss_op.cc paddle/fluid/operators/crop_op.cc paddle/fluid/operators/activation_op.cc paddle/fluid/operators/hierarchical_sigmoid_op.cc paddle/fluid/operators/unfold_op.cc paddle/fluid/operators/max_sequence_len_op.cc paddle/fluid/operators/mul_op.cc paddle/fluid/operators/attention_lstm_op.cc paddle/fluid/operators/top_k_op.cc paddle/fluid/operators/group_norm_op.cc paddle/fluid/operators/selu_op.cc paddle/fluid/operators/lstmp_op.cc paddle/fluid/operators/merge_lod_tensor_op.cc paddle/fluid/operators/truncated_gaussian_random_op.cc paddle/fluid/operators/label_smooth_op.cc paddle/fluid/operators/matmul_op.cc paddle/fluid/operators/spp_op.cc paddle/fluid/operators/unstack_op.cc paddle/fluid/operators/conv_transpose_op.cc paddle/fluid/operators/diag_op.cc paddle/fluid/operators/unpool_op.cc paddle/fluid/operators/lod_array_length_op.cc paddle/fluid/operators/affine_channel_op.cc paddle/fluid/operators/log_loss_op.cc paddle/fluid/operators/concat_op.cc paddle/fluid/operators/lod_tensor_to_array_op.cc paddle/fluid/operators/gru_op.cc paddle/fluid/operators/coalesce_tensor_op.cc paddle/fluid/operators/fsp_op.cc paddle/fluid/operators/linspace_op.cc paddle/fluid/operators/reverse_op.cc paddle/fluid/operators/recurrent_op.cc paddle/fluid/operators/split_selected_rows_op.cc paddle/fluid/operators/dgc_clip_by_norm_op.cc paddle/fluid/operators/scale_op.cc paddle/fluid/operators/save_op.cc paddle/fluid/operators/load_combine_op.cc paddle/fluid/operators/merge_selected_rows_op.cc paddle/fluid/operators/split_op.cc paddle/fluid/operators/cumsum_op.cc paddle/fluid/operators/deformable_psroi_pooling_op.cc paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc paddle/fluid/operators/transpose_op.cc paddle/fluid/operators/fill_constant_batch_size_like_op.cc paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc paddle/fluid/operators/shuffle_channel_op.cc paddle/fluid/operators/affine_grid_op.cc paddle/fluid/operators/split_lod_tensor_op.cc paddle/fluid/operators/grid_sampler_op.cc paddle/fluid/operators/lookup_table_op.cc paddle/fluid/operators/cos_sim_op.cc paddle/fluid/operators/quantize_op.cc paddle/fluid/operators/spectral_norm_op.cc paddle/fluid/operators/cross_entropy_op.cc paddle/fluid/operators/print_op.cc paddle/fluid/operators/lrn_op.cc paddle/fluid/operators/nce_op.cc paddle/fluid/operators/similarity_focus_op.cc paddle/fluid/operators/get_tensor_from_selected_rows_op.cc paddle/fluid/operators/squared_l2_distance_op.cc paddle/fluid/operators/cudnn_lstm_op.cc paddle/fluid/operators/tree_conv_op.cc paddle/fluid/operators/one_hot_op.cc paddle/fluid/operators/lookup_sparse_table_op.cc paddle/fluid/operators/unique_op.cc paddle/fluid/operators/mean_op.cc paddle/fluid/operators/prelu_op.cc paddle/fluid/operators/delete_var_op.cc paddle/fluid/operators/ctc_align_op.cc paddle/fluid/operators/argsort_op.cc paddle/fluid/operators/data_norm_op.cc paddle/fluid/operators/minus_op.cc paddle/fluid/operators/shrink_rnn_memory_op.cc paddle/fluid/operators/lod_reset_op.cc paddle/fluid/operators/l1_norm_op.cc paddle/fluid/operators/gaussian_random_batch_size_like_op.cc paddle/fluid/operators/is_empty_op.cc paddle/fluid/operators/bilinear_tensor_product_op.cc paddle/fluid/operators/kldiv_loss_op.cc paddle/fluid/operators/squeeze_op.cc paddle/fluid/operators/softmax_op.cc paddle/fluid/operators/clip_by_norm_op.cc paddle/fluid/operators/pool_with_index_op.cc paddle/fluid/operators/linear_chain_crf_op.cc paddle/fluid/operators/reshape_op.cc paddle/fluid/operators/fill_constant_op.cc paddle/fluid/operators/space_to_depth_op.cc paddle/fluid/operators/gather_op.cc paddle/fluid/operators/softmax_with_cross_entropy_op.cc paddle/fluid/operators/slice_op.cc paddle/fluid/operators/sign_op.cc paddle/fluid/operators/expand_op.cc paddle/fluid/operators/smooth_l1_loss_op.cc paddle/fluid/operators/tensor_array_to_tensor_op.cc paddle/fluid/operators/row_conv_op.cc paddle/fluid/operators/pad2d_op.cc paddle/fluid/operators/pixel_shuffle_op.cc paddle/fluid/operators/assign_value_op.cc paddle/fluid/operators/random_crop_op.cc paddle/fluid/operators/squared_l2_norm_op.cc paddle/fluid/operators/save_combine_op.cc paddle/fluid/operators/pool_op.cc paddle/fluid/operators/cast_op.cc paddle/fluid/operators/array_to_lod_tensor_op.cc paddle/fluid/operators/fill_any_like_op.cc paddle/fluid/operators/flatten_op.cc paddle/fluid/operators/sample_logits_op.cc paddle/fluid/operators/pad_op.cc paddle/fluid/operators/bpr_loss_op.cc paddle/fluid/operators/roi_pool_op.cc paddle/fluid/operators/pad_constant_like_op.cc paddle/fluid/operators/isfinite_op.cc paddle/fluid/operators/layer_norm_op.cc paddle/fluid/operators/maxout_op.cc paddle/fluid/operators/warpctc_op.cc paddle/fluid/string/piece.cc paddle/fluid/string/pretty_log.cc paddle/fluid/string/string_helper.cc paddle/fluid/recordio/header.cc paddle/fluid/recordio/chunk.cc paddle/fluid/recordio/writer.cc paddle/fluid/recordio/scanner.cc paddle/fluid/inference/io.cc paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.cc paddle/fluid/inference/analysis/passes/inference_op_replace_pass.cc paddle/fluid/inference/analysis/passes/passes.cc paddle/fluid/inference/analysis/helper.cc paddle/fluid/inference/analysis/ir_pass_manager.cc paddle/fluid/inference/analysis/argument.cc paddle/fluid/inference/analysis/analysis_pass.cc paddle/fluid/inference/analysis/analyzer.cc paddle/fluid/inference/utils/benchmark.cc paddle/fluid/inference/api/api.cc paddle/fluid/inference/api/api_impl.cc paddle/fluid/inference/api/helper.cc paddle/fluid/inference/api/analysis_predictor.cc paddle/fluid/inference/api/details/zero_copy_tensor.cc paddle/fluid/inference/api/details/reset_tensor_array.cc paddle/fluid/inference/api/analysis_config.cc paddle/fluid/inference/api/paddle_pass_builder.cc" +paddle_fluid_noavx_openblas_src += ' paddle/fluid/framework/revision.cc' + +StaticLibrary('paddle_fluid_noavx_openblas', Sources(paddle_fluid_noavx_openblas_src, CppFlags('-DHPPL_STUB_FUNC -DPADDLE_DISABLE_PROFILER -DPADDLE_NO_PYTHON -DPADDLE_ON_INFERENCE -DPADDLE_USE_DSO -DPADDLE_USE_OPENBLAS -DPADDLE_USE_PTHREAD_BARRIER -DPADDLE_USE_PTHREAD_SPINLOCK -DPADDLE_VERSION=0.0.0 -DPADDLE_WITH_XBYAK -DXBYAK64 -DXBYAK_NO_OP_NAMES -D_GNU_SOURCE -D__STDC_LIMIT_MACROS -DPYBIND_NOAVX_OPENBLAS' + r" -DPADDLE_REVISION=\"%s@%s@%s\"" % (REPO_URL(), REPO_BRANCH(), REPO_REVISION())), CFlags('-std=c++11 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=sign-compare -Wno-error=unused-local-typedefs -Wno-error=maybe-uninitialized -msse3 -O3 -DNDEBUG '), CxxFlags('-std=c++11 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=sign-compare -Wno-error=unused-local-typedefs -Wno-error=maybe-uninitialized -msse3 -O3 -DNDEBUG '))) +SharedLibrary('paddle_fluid_noavx_openblas', Sources(paddle_fluid_noavx_openblas_src, CppFlags('-DHPPL_STUB_FUNC -DPADDLE_DISABLE_PROFILER -DPADDLE_NO_PYTHON -DPADDLE_ON_INFERENCE -DPADDLE_USE_DSO -DPADDLE_USE_OPENBLAS -DPADDLE_USE_PTHREAD_BARRIER -DPADDLE_USE_PTHREAD_SPINLOCK -DPADDLE_VERSION=0.0.0 -DPADDLE_WITH_XBYAK -DXBYAK64 -DXBYAK_NO_OP_NAMES -D_GNU_SOURCE -D__STDC_LIMIT_MACROS -DPYBIND_NOAVX_OPENBLAS' + r" -DPADDLE_REVISION=\"%s@%s@%s\"" % (REPO_URL(), REPO_BRANCH(), REPO_REVISION())), CFlags('-std=c++11 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=sign-compare -Wno-error=unused-local-typedefs -Wno-error=maybe-uninitialized -msse3 -O3 -DNDEBUG '), CxxFlags('-std=c++11 -m64 -fPIC -fno-omit-frame-pointer -Werror -Wall -Wextra -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix -Wno-error=sign-compare -Wno-error=unused-local-typedefs -Wno-error=maybe-uninitialized -msse3 -O3 -DNDEBUG ')), LinkDeps(True)) diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 00000000..8cb178be --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,242 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License + +cmake_minimum_required(VERSION 3.0) +set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") +set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) + +include(system) + +project(paddle CXX C) +message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: " + "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") +message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " + "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") +message(STATUS "AR tools: ${CMAKE_AR}") + +if(WIN32) + set(CMAKE_SUPPRESS_REGENERATION ON) + set(CMAKE_STATIC_LIBRARY_PREFIX lib) + add_definitions("/DGOOGLE_GLOG_DLL_DECL=") + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") + add_compile_options(/wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838) + set(PADDLE_LINK_FLAGS "/IGNORE:4006 /IGNORE:4098 /IGNORE:4217 /IGNORE:4221") + set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") +endif(WIN32) + +find_package(CUDA QUIET) +find_package(Git REQUIRED) +find_package(Threads REQUIRED) + +include(simd) + +################################ Exposed Configurations ####################################### +option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) +option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) +option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) +option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON) +option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF) +option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND}) +option(WITH_SYSTEM_BLAS "Use system blas library" OFF) +option(WITH_DISTRIBUTE "Compile with distributed support" OFF) +option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF) +option(ON_INFER "Turn on inference optimization." OFF) +################################ Internal Configurations ####################################### +option(WITH_AMD_GPU "Compile PaddlePaddle with AMD GPU" OFF) +option(WITH_NGRAPH "Compile PaddlePaddle with nGraph support." OFF) +option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF) +option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) +option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF) +option(WITH_PSLIB "Compile with pslib support" OFF) +option(WITH_CONTRIB "Compile the third-party contributation" OFF) +option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF) +option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) +option(WITH_INFERENCE_API_TEST "Test fluid inference C++ high-level api interface" OFF) +option(WITH_HIGH_LEVEL_API_TEST "Test fluid python high-level api interface" OFF) +option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) +option(WITH_FAST_MATH "Make use of fast math library, might affect the precision to some extent" ON) +option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ON) + +# PY_VERSION +if(NOT PY_VERSION) + set(PY_VERSION 2.7) +endif() +set(PYBIND11_PYTHON_VERSION ${PY_VERSION}) + +# CMAKE_BUILD_TYPE +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING + "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" + FORCE) +endif() + +if (APPLE) + set(WITH_MKL OFF CACHE STRING + "Disable MKL for building on mac" FORCE) +endif() + +if (WIN32) + set(WITH_DISTRIBUTE OFF CACHE STRING + "Disable DISTRIBUTE when compiling for Windows" FORCE) +endif() + +set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING + "A path setting third party libraries download & build directories.") + +set(FLUID_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_install_dir" CACHE STRING + "A path setting fluid shared and static libraries") + +set(FLUID_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_inference_install_dir" CACHE STRING + "A path setting fluid inference shared and static libraries") + +set(THIRD_PARTY_BUILD_TYPE Release) + +set(WITH_MKLML ${WITH_MKL}) +if (NOT DEFINED WITH_MKLDNN) + if (WITH_MKL AND AVX2_FOUND) + set(WITH_MKLDNN ON) + else() + message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN") + set(WITH_MKLDNN OFF) + endif() +endif() + +if (REPLACE_ENFORCE_GLOG) + add_definitions("-DREPLACE_ENFORCE_GLOG") +endif() +######################################################################################## + +include(external/mklml) # download mklml package +include(external/xbyak) # download xbyak package +include(external/libxsmm) # download, build, install libxsmm +include(external/zlib) # download, build, install zlib +include(external/gflags) # download, build, install gflags +include(external/glog) # download, build, install glog +include(external/gtest) # download, build, install gtest +include(external/protobuf) # download, build, install protobuf +include(external/python) # download, build, install python +include(external/openblas) # download, build, install openblas +include(external/mkldnn) # download, build, install mkldnn +include(external/ngraph) # download, build, install nGraph +include(external/boost) # download boost +include(external/eigen) # download eigen3 +include(external/pybind11) # download pybind11 +include(external/cares) +include(external/cub) +include(external/rocprim) +include(external/xxhash) # download xxhash +include(external/dlpack) +include(external/snappy) # download snappy +include(external/snappystream) # download snappystream +include(external/warpctc) # download, build, install warpctc + +if (NOT WIN32) +# there is no official support of nccl, cupti in windows +include(cupti) +endif (NOT WIN32) + +if(WITH_PSLIB) + include(external/libmct) + include(external/pslib_brpc) + include(external/pslib) +endif(WITH_PSLIB) + +if(WITH_DISTRIBUTE) + if(WITH_GRPC) + include(external/grpc) + message(STATUS "Use grpc framework.") + else() + message(STATUS "Use brpc framework.") + include(external/leveldb) + include(external/brpc) + endif() +endif() + +if(WITH_BRPC_RDMA) + message(STATUS "Use brpc with rdma.") + if(WITH_GRPC) + message(FATAL_ERROR "Can't use grpc with brpc rdma.") + endif() + if(NOT WITH_DISTRIBUTE) + message(FATAL_ERROR "Can't use brpc rdma in no distribute env.") + endif() +endif() + +include(anakin_subgraph) + +include(external/threadpool) +include(flags) # set paddle compile flags +include(cudnn) # set cudnn libraries, must before configure +include(configure) # add paddle env configuration + +if(WITH_GPU) + include(cuda) + include(tensorrt) +endif() + +if(WIN32 OR APPLE OR NOT WITH_GPU OR ON_INFER) + set(WITH_DGC OFF) +endif() + +if(WITH_DGC) + message(STATUS "add dgc lib.") + include(external/dgc) + add_definitions(-DPADDLE_WITH_DGC) +endif() + +if (WITH_PROFILER) + find_package(Gperftools REQUIRED) + include_directories(${GPERFTOOLS_INCLUDE_DIR}) + add_definitions(-DWITH_GPERFTOOLS) +endif() + +include(generic) # simplify cmake module +include(package) # set paddle packages +include(ccache) # set ccache for compilation +include(util) # set unittest and link libs +include(version) # set PADDLE_VERSION +include(coveralls) # set code coverage +include(inference_lib) # add paddle fluid inference libraries + + +include_directories("${PADDLE_SOURCE_DIR}") + +if(WITH_AMD_GPU) + find_package(HIP) + include(hip) +endif(WITH_AMD_GPU) + +set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build") + +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") +set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") + +if (ON_INFER) + message(STATUS "On inference mode, will take place some specific optimization.") + add_definitions(-DPADDLE_ON_INFERENCE) +else() + #TODO(luotao), combine this warning with `make inference_lib_dist` command. + message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.") +endif() + +add_subdirectory(paddle) +if(WITH_PYTHON) + add_subdirectory(python) +endif() diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 00000000..54131b48 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,46 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at paddle-dev@baidu.com. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] + +[homepage]: http://contributor-covenant.org +[version]: http://contributor-covenant.org/version/1/4/ diff --git a/CODE_OF_CONDUCT_cn.md b/CODE_OF_CONDUCT_cn.md new file mode 100644 index 00000000..2be794f1 --- /dev/null +++ b/CODE_OF_CONDUCT_cn.md @@ -0,0 +1,50 @@ +# 参与者公约 + +## 我们的保证 + +为了促进一个开放透明且友好的环境,我们作为贡献者和维护者保证:无论年龄、种族、民族、性别认同和表达(方式)、体型、身体健全与否、经验水平、国籍、个人表现、宗教或性别取向,参与者在我们项目和社区中都免于骚扰。 + +## 我们的标准 + +有助于创造正面环境的行为包括但不限于: +* 使用友好和包容性语言 +* 尊重不同的观点和经历 +* 耐心地接受建设性批评 +* 关注对社区最有利的事情 +* 友善对待其他社区成员 + +身为参与者不能接受的行为包括但不限于: +* 使用与性有关的言语或是图像,以及不受欢迎的性骚扰 +* 捣乱/煽动/造谣的行为或进行侮辱/贬损的评论,人身攻击及政治攻击 +* 公开或私下的骚扰 +* 未经许可地发布他人的个人资料,例如住址或是电子地址 +* 其他可以被合理地认定为不恰当或者违反职业操守的行为 + +## 我们的责任 + +项目维护者有责任为「可接受的行为」标准做出诠释,以及对已发生的不被接受的行为采取恰当且公平的纠正措施。 + +项目维护者有权利及责任去删除、编辑、拒绝与本行为标准有所违背的评论(comments)、提交(commits)、代码、wiki 编辑、问题(issues)和其他贡献,以及项目维护者可暂时或永久性的禁止任何他们认为有不适当、威胁、冒犯、有害行为的贡献者。 + +## 使用范围 + +当一个人代表该项目或是其社区时,本行为标准适用于其项目平台和公共平台。 + +代表项目或是社区的情况,举例来说包括使用官方项目的电子邮件地址、通过官方的社区媒体账号发布或线上或线下事件中担任指定代表。 + +该项目的呈现方式可由其项目维护者进行进一步的定义及解释。 + +## 强制执行 + +可以通过paddle-dev@baidu.com,来联系项目团队来举报滥用、骚扰或其他不被接受的行为。 + +任何维护团队认为有必要且适合的所有投诉都将进行审查及调查,并做出相对应的回应。项目小组有对事件回报者有保密的义务。具体执行的方针近一步细节可能会单独公布。 + +没有切实地遵守或是执行本行为标准的项目维护人员,可能会因项目领导人或是其他成员的决定,暂时或是永久地取消其参与资格。 + +## 来源 + +本行为标准改编自[贡献者公约][主页],版本 1.4 +可在此观看https://www.contributor-covenant.org/zh-cn/version/1/4/code-of-conduct.html + +[主页]: https://www.contributor-covenant.org diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 00000000..62b26b99 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,162 @@ +# Contribute Code + +You are welcome to contribute to project PaddlePaddle. To contribute to PaddlePaddle, you have to agree with the +[PaddlePaddle Contributor License Agreement](https://gist.github.com/wangkuiyi/0c22c7b1bd3bb7eb27d76f85c3a3e329). + +We sincerely appreciate your contribution. This document explains our workflow and work style. + +## Workflow + +PaddlePaddle uses this [Git branching model](http://nvie.com/posts/a-successful-git-branching-model/). The following steps guide usual contributions. + +1. Fork + + Our development community has been growing fastly; it doesn't make sense for everyone to write into the official repo. So, please file Pull Requests from your fork. To make a fork, just head over to the GitHub page and click the ["Fork" button](https://help.github.com/articles/fork-a-repo/). + +1. Clone + + To make a copy of your fork to your local computers, please run + + ```bash + git clone https://github.com/your-github-account/paddle + cd paddle + ``` + +1. Create the local feature branch + + For daily works like adding a new feature or fixing a bug, please open your feature branch before coding: + + ```bash + git checkout -b my-cool-stuff + ``` + +1. Commit + + Before issuing your first `git commit` command, please install [`pre-commit`](http://pre-commit.com/) by running the following commands: + + ```bash + pip install pre-commit + pre-commit install + ``` + + Our pre-commit configuration requires clang-format 3.8 for auto-formating C/C++ code and yapf for Python. + + Once installed, `pre-commit` checks the style of code and documentation in every commit. We will see something like the following when you run `git commit`: + + ``` + ➜ git commit + CRLF end-lines remover...............................(no files to check)Skipped + yapf.................................................(no files to check)Skipped + Check for added large files..............................................Passed + Check for merge conflicts................................................Passed + Check for broken symlinks................................................Passed + Detect Private Key...................................(no files to check)Skipped + Fix End of Files.....................................(no files to check)Skipped + clang-formater.......................................(no files to check)Skipped + [my-cool-stuff c703c041] add test file + 1 file changed, 0 insertions(+), 0 deletions(-) + create mode 100644 233 + ``` + + NOTE: The `yapf` installed by `pip install pre-commit` and `conda install -c conda-forge pre-commit` is slightly different. Paddle developers use `pip install pre-commit`. + +1. Build and test + + Users can build PaddlePaddle natively on Linux and Mac OS X. But to unify the building environment and to make it easy for debugging, the recommended way is [using Docker](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/build_en.md). + +1. Keep pulling + + An experienced Git user pulls from the official repo often -- daily or even hourly, so they notice conflicts with others work early, and it's easier to resolve smaller conflicts. + + ```bash + git remote add upstream https://github.com/PaddlePaddle/Paddle + git pull upstream develop + ``` + +1. Push and file a pull request + + You can "push" your local work into your forked repo: + + ```bash + git push origin my-cool-stuff + ``` + + The push allows you to create a pull request, requesting owners of this [official repo](https://github.com/PaddlePaddle/Paddle) to pull your change into the official one. + + To create a pull request, please follow [these steps](https://help.github.com/articles/creating-a-pull-request/). + + If your change is for fixing an issue, please write ["Fixes "](https://help.github.com/articles/closing-issues-using-keywords/) in the description section of your pull request. Github would close the issue when the owners merge your pull request. + + Please remember to specify some reviewers for your pull request. If you don't know who are the right ones, please follow Github's recommendation. + + +1. Delete local and remote branches + + To keep your local workspace and your fork clean, you might want to remove merged branches: + + ```bash + git push origin :my-cool-stuff + git checkout develop + git pull upstream develop + git branch -d my-cool-stuff + ``` + +### Code Review + +- Please feel free to ping your reviewers by sending them the URL of your pull request via IM or email. Please do this after your pull request passes the CI. + +- Please answer reviewers' every comment. If you are to follow the comment, please write "Done"; please give a reason otherwise. + +- If you don't want your reviewers to get overwhelmed by email notifications, you might reply their comments by [in a batch](https://help.github.com/articles/reviewing-proposed-changes-in-a-pull-request/). + +- Reduce the unnecessary commits. Some developers commit often. It is recommended to append a sequence of small changes into one commit by running `git commit --amend` instead of `git commit`. + + +## Coding Standard + +### Code Style + +Our C/C++ code follows the [Google style guide](http://google.github.io/styleguide/cppguide.html). + +Our Python code follows the [PEP8 style guide](https://www.python.org/dev/peps/pep-0008/). + +Our build process helps to check the code style. In [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/paddle/scripts/docker/build.sh#L42), the entry point of our [builder Docker image](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/Dockerfile#L88), the CMake argument `WITH_STYLE_CHECK` is set to `ON` by default. This flag is on + +Please install pre-commit, which automatically reformat the changes to C/C++ and Python code whenever we run `git commit`. To check the whole codebase, we can run the command `pre-commit run -a`, as in the [`check_style.sh` file](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/paddle/scripts/travis/check_style.sh#L30), which is invoked by [our Travis CI configuration](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/.travis.yml#L43). + +### Unit Tests + +Please remember to add related unit tests. + +- For C/C++ code, please follow [`google-test` Primer](https://github.com/google/googletest/blob/master/googletest/docs/primer.md) . + +- For Python code, please use [Python's standard `unittest` package](http://pythontesting.net/framework/unittest/unittest-introduction/). + + +### Writing Logs + +We use [glog](https://github.com/google/glog) for logging in our C/C++ code. + +For general information, please use `LOG`. For debug information, please use [`VLOG`](http://htmlpreview.github.io/?https://github.com/google/glog/blob/master/doc/glog.html#verbose). The reason is at [here](https://groups.google.com/a/chromium.org/d/msg/chromium-dev/3NDNd1KzXeY/AZKMMx37fdQJ). + +`VLOG` requires a *verbose level* parameter. For example: + +```c++ +VLOG(3) << "Operator FC is taking " << num_inputs << "inputs." +``` + +When we run a PaddlePaddle application or test, we can specify a verbose threshold. For example: + +```bash +GLOG_vmodule=buddy_allocator=2 \ +GLOG_v=10 \ +python \ +../python/paddle/v2/framework/tests/test_recurrent_op.py +``` + +This will enable VLOG messages generated by `buddy_allocator.{h,cc}` and in the verbose range of 0 to 3, so you will see above example VLOG message, which is in level 3. This suggests that we output overall messages in lower verbose levels, so they display with higher probability. When coding C++, please follow the verbose level convention as follows: + +- verbose level 1: [framework](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/framework) +- verbose level 3: [operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators) +- verbose level 5: [memory](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/memory), [platform](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/platform) +- verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators/math/) diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..5db7fa50 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,211 @@ +# A image for building paddle binaries +# Use cuda devel base image for both cpu and gpu environment +# When you modify it, please be aware of cudnn-runtime version +FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04 +MAINTAINER PaddlePaddle Authors + +ARG UBUNTU_MIRROR +RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi' + +# ENV variables +ARG WITH_GPU +ARG WITH_AVX + +ENV WOBOQ OFF +ENV WITH_GPU=${WITH_GPU:-ON} +ENV WITH_AVX=${WITH_AVX:-ON} + +ENV HOME /root +# Add bash enhancements +COPY ./paddle/scripts/docker/root/ /root/ + +# Prepare packages for Python +RUN apt-get update && \ + apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \ + libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev \ + xz-utils tk-dev libffi-dev liblzma-dev + +# Downgrade gcc&&g++ +RUN apt-get update +WORKDIR /usr/bin +RUN apt install -y gcc-4.8 g++-4.8 +RUN cp gcc gcc.bak +RUN cp g++ g++.bak +RUN rm gcc +RUN rm g++ +RUN ln -s gcc-4.8 gcc +RUN ln -s g++-4.8 g++ + +# Install Python3.6 +RUN mkdir -p /root/python_build/ && wget -q https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz && \ + tar -zxf sqlite-autoconf-3250300.tar.gz && cd sqlite-autoconf-3250300 && \ + ./configure -prefix=/usr/local && make -j8 && make install && cd ../ && rm sqlite-autoconf-3250300.tar.gz && \ + wget -q https://www.python.org/ftp/python/3.6.0/Python-3.6.0.tgz && \ + tar -xzf Python-3.6.0.tgz && cd Python-3.6.0 && \ + CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \ + make -j8 > /dev/null && make altinstall > /dev/null + +# Install Python3.7 +RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \ + tar -xzf Python-3.7.0.tgz && cd Python-3.7.0 && \ + CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \ + make -j8 > /dev/null && make altinstall > /dev/null + +RUN rm -r /root/python_build + +RUN apt-get update && \ + apt-get install -y --allow-downgrades patchelf \ + python3 python3-dev python3-pip \ + git python-pip python-dev python-opencv openssh-server bison \ + libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \ + wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \ + curl sed grep graphviz libjpeg-dev zlib1g-dev \ + python-matplotlib gcc-4.8 g++-4.8 \ + automake locales clang-format swig cmake \ + liblapack-dev liblapacke-dev \ + clang-3.8 llvm-3.8 libclang-3.8-dev \ + net-tools libtool ccache && \ + apt-get clean -y + +# Install Python2.7.15 to replace original python +WORKDIR /home +ENV version=2.7.15 +RUN wget https://www.python.org/ftp/python/$version/Python-$version.tgz +RUN tar -xvf Python-$version.tgz +WORKDIR /home/Python-$version +RUN ./configure --enable-unicode=ucs4 --enable-shared CFLAGS=-fPIC --prefix=/usr/local/python2.7.15 +RUN make && make install + +RUN echo "export PATH=/usr/local/python2.7.15/include:${PATH}" >> ~/.bashrc +RUN echo "export PATH=/usr/local/python2.7.15/bin:${PATH}" >> ~/.bashrc +RUN echo "export LD_LIBRARY_PATH=/usr/local/python2.7.15/lib:${LD_LIBRARY_PATH}" >> ~/.bashrc +RUN echo "export CPLUS_INCLUDE_PATH=/usr/local/python2.7.15/include/python2.7:$CPLUS_INCLUDE_PATH" >> ~/.bashrc +ENV PATH=/usr/local/python2.7.15/include:${PATH} +ENV PATH=/usr/local/python2.7.15/bin:${PATH} +ENV LD_LIBRARY_PATH=/usr/local/python2.7.15/lib:${LD_LIBRARY_PATH} +ENV CPLUS_INCLUDE_PATH=/usr/local/python2.7.15/include/python2.7:$CPLUS_INCLUDE_PATH +RUN mv /usr/bin/python /usr/bin/python.bak +RUN ln -s /usr/local/python2.7.15/bin/python2.7 /usr/local/bin/python +RUN ln -s /usr/local/python2.7.15/bin/python2.7 /usr/bin/python +WORKDIR /home +RUN wget https://files.pythonhosted.org/packages/b0/d1/8acb42f391cba52e35b131e442e80deffbb8d0676b93261d761b1f0ef8fb/setuptools-40.6.2.zip +RUN apt-get -y install unzip +RUN unzip setuptools-40.6.2.zip +WORKDIR /home/setuptools-40.6.2 +RUN python setup.py build +RUN python setup.py install +WORKDIR /home +RUN wget https://files.pythonhosted.org/packages/69/81/52b68d0a4de760a2f1979b0931ba7889202f302072cc7a0d614211bc7579/pip-18.0.tar.gz +RUN tar -zxvf pip-18.0.tar.gz +WORKDIR pip-18.0 +RUN python setup.py install + +WORKDIR /home +RUN rm Python-$version.tgz setuptools-40.6.2.zip pip-18.0.tar.gz && \ + rm -r Python-$version setuptools-40.6.2 pip-18.0 + +# Install Go and glide +RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \ + tar -xz -C /usr/local && \ + mkdir /root/gopath && \ + mkdir /root/gopath/bin && \ + mkdir /root/gopath/src +ENV GOROOT=/usr/local/go GOPATH=/root/gopath +# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. +ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin +# install glide +RUN curl -s -q https://glide.sh/get | sh + +# Install TensorRT +# following TensorRT.tar.gz is not the default official one, we do two miny changes: +# 1. Remove the unnecessary files to make the library small. TensorRT.tar.gz only contains include and lib now, +# and its size is only one-third of the official one. +# 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle. +# See https://github.com/PaddlePaddle/Paddle/issues/10129 for details. + +RUN wget -q https://paddlepaddledeps.bj.bcebos.com/TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz --no-check-certificate && \ + tar -zxf TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz -C /usr/local && \ + cp -rf /usr/local/TensorRT/include /usr && \ + cp -rf /usr/local/TensorRT/lib /usr + +# git credential to skip password typing +RUN git config --global credential.helper store + +# Fix locales to en_US.UTF-8 +RUN localedef -i en_US -f UTF-8 en_US.UTF-8 + +# FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter +# version util jupyter fixes this issue. + +# specify sphinx version as 1.5.6 and remove -U option for [pip install -U +# sphinx-rtd-theme] since -U option will cause sphinx being updated to newest +# version(1.7.1 for now), which causes building documentation failed. +RUN pip3 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ + pip3 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ + pip3 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ + pip3.6 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ + pip3.6 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ + pip3.6 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ + pip3.7 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ + pip3.7 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ + pip3.7 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ + pip --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ + pip --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ + pip --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark + +RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3 --no-cache-dir install opencv-python && \ + pip3.6 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3.6 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3.6 --no-cache-dir install opencv-python && \ + pip3.7 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3.7 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3.7 --no-cache-dir install opencv-python && \ + pip --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip --no-cache-dir install opencv-python + +#For docstring checker +RUN pip3 --no-cache-dir install pylint pytest astroid isort +RUN pip3.6 --no-cache-dir install pylint pytest astroid isort +RUN pip3.7 --no-cache-dir install pylint pytest astroid isort +RUN pip --no-cache-dir install pylint pytest astroid isort LinkChecker + +COPY ./python/requirements.txt /root/ +RUN pip3 --no-cache-dir install -r /root/requirements.txt +RUN pip3.6 --no-cache-dir install -r /root/requirements.txt +RUN pip3.7 --no-cache-dir install -r /root/requirements.txt +RUN pip --no-cache-dir install -r /root/requirements.txt + +# To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use +# the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 +RUN apt-get install -y libssl-dev libffi-dev && apt-get clean -y +RUN pip3 --no-cache-dir install certifi urllib3[secure] +RUN pip3.6 --no-cache-dir install certifi urllib3[secure] +RUN pip3.7 --no-cache-dir install certifi urllib3[secure] +RUN pip --no-cache-dir install certifi urllib3[secure] + + +# Install woboq_codebrowser to /woboq +RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \ + (cd /woboq \ + cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \ + -DCMAKE_BUILD_TYPE=Release . \ + make) + +# ar mishandles 4GB files +# https://sourceware.org/bugzilla/show_bug.cgi?id=14625 +# remove them when apt-get support 2.27 and higher version +RUN wget -q https://launchpad.net/ubuntu/+archive/primary/+sourcefiles/binutils/2.27-9ubuntu1/binutils_2.27.orig.tar.gz && \ + tar -xzf binutils_2.27.orig.tar.gz && \ + cd binutils-2.27 && \ + ./configure && make -j && make install && cd .. && rm -rf binutils-2.27 binutils_2.27.orig.tar.gz + +# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service +RUN mkdir /var/run/sshd +RUN echo 'root:root' | chpasswd +RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config +RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config +CMD source ~/.bashrc +EXPOSE 22 diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md new file mode 100644 index 00000000..6b2614b1 --- /dev/null +++ b/ISSUE_TEMPLATE.md @@ -0,0 +1,14 @@ +Thank you for contributing to PaddlePaddle. Submitting an issue is a great help for us. +Both Chinese and English issues are welcome. + +It's hard to solve a problem when important details are missing. +Before submitting the issue, look over the following criteria before handing your request in. + +- [ ] Was there a similar issue submitted or resolved before ? You could search issue in the github. +- [ ] Did you retrieve your issue from widespread search engines ? +- [ ] Is my description of the issue clear enough to reproduce this problem? + * If some errors occurred, we need details about `how do you run your code?`, `what system do you use?`, `Are you using GPU or not?`, etc. + * If you use an recording [asciinema](https://asciinema.org/) to show what you are doing to make it happen, that's awesome! We could help you solve the problem more quickly. +- [ ] Is my description of the issue use the github markdown correctly? + * Please use the proper markdown syntaxes for styling all forms of writing, e.g, source code, error information, etc. + * Check out [this page](https://guides.github.com/features/mastering-markdown/) to find out much more about markdown. diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..5fe86943 --- /dev/null +++ b/LICENSE @@ -0,0 +1,203 @@ +Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index e45bbf11..7908d061 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,109 @@ -# paddle-trainer -简要说明 +# PaddlePaddle (clone from /baidu/paddlepaddle/paddle) -## 快速开始 -如何构建、安装、运行 +English | [简体中文](./README_cn.md) -## 测试 -如何执行自动化测试 +[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle) +[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/documentation/docs/en/1.4/beginners_guide/index_en.html) +[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/index_cn.html) +[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases) +[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) -## 如何贡献 -贡献patch流程、质量要求 +Welcome to the PaddlePaddle GitHub. -## 讨论 -百度Hi讨论群:XXXX +PaddlePaddle (PArallel Distributed Deep LEarning) is an easy-to-use, +efficient, flexible and scalable deep learning platform, which is originally +developed by Baidu scientists and engineers for the purpose of applying deep +learning to many products at Baidu. + +Our vision is to enable deep learning for everyone via PaddlePaddle. +Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle. + +### Latest PaddlePaddle Release: [Fluid 1.5.1](https://github.com/PaddlePaddle/Paddle/tree/release/1.5) +### Install Latest Stable Release: +``` +# Linux CPU +pip install paddlepaddle +# Linux GPU cuda9cudnn7 +pip install paddlepaddle-gpu +# Linux GPU cuda10cudnn7 +pip install paddlepaddle-gpu==1.5.1.post107 +# Linux GPU cuda8cudnn7 +pip install paddlepaddle-gpu==1.5.1.post87 + +# For installation on other platform, refer to http://paddlepaddle.org/ +``` +Now our developers could acquire Tesla V100 online computing resources for free. If you create a program by AI Studio, you would obtain 12 hours to train models online per day. If you could insist on that for five consecutive days, then you would own extra 48 hours. [Click here to start](http://ai.baidu.com/support/news?action=detail&id=981). + +## Features + +- **Flexibility** + + PaddlePaddle supports a wide range of neural network architectures and + optimization algorithms. It is easy to configure complex models such as + neural machine translation model with attention mechanism or complex memory + connection. + +- **Efficiency** + + In order to unleash the power of heterogeneous computing resource, + optimization occurs at different levels of PaddlePaddle, including + computing, memory, architecture and communication. The following are some + examples: + + - Optimized math operations through SSE/AVX intrinsics, BLAS libraries + (e.g. MKL, OpenBLAS, cuBLAS) or customized CPU/GPU kernels. + - Optimized CNN networks through MKL-DNN library. + - Highly optimized recurrent networks which can handle **variable-length** + sequence without padding. + - Optimized local and distributed training for models with high dimensional + sparse data. + +- **Scalability** + + With PaddlePaddle, it is easy to use many CPUs/GPUs and machines to speed + up your training. PaddlePaddle can achieve high throughput and performance + via optimized communication. + +- **Connected to Products** + + In addition, PaddlePaddle is also designed to be easily deployable. At Baidu, + PaddlePaddle has been deployed into products and services with a vast number + of users, including ad click-through rate (CTR) prediction, large-scale image + classification, optical character recognition(OCR), search ranking, computer + virus detection, recommendation, etc. It is widely utilized in products at + Baidu and it has achieved a significant impact. We hope you can also explore + the capability of PaddlePaddle to make an impact on your product. + +## Installation + +It is recommended to read [this doc](http://www.paddlepaddle.org/documentation/docs/en/1.4/beginners_guide/index_en.html) on our website. + +## Documentation + +We provide [English](http://www.paddlepaddle.org/documentation/docs/en/1.4/beginners_guide/index_en.html) and +[Chinese](http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/install/index_cn.html) documentation. + +- [Deep Learning 101](https://github.com/PaddlePaddle/book) + + You might want to start from this online interactive book that can run in a Jupyter Notebook. + +- [Distributed Training](http://paddlepaddle.org/documentation/docs/en/1.4/user_guides/howto/training/multi_node_en.html) + + You can run distributed training jobs on MPI clusters. + +- [Python API](http://paddlepaddle.org/documentation/docs/en/1.4/api/index_en.html) + + Our new API enables much shorter programs. + +- [How to Contribute](http://paddlepaddle.org/documentation/docs/en/1.4/advanced_usage/development/contribute_to_paddle/index_en.html) + + We appreciate your contributions! + +## Communication + +- [Github Issues](https://github.com/PaddlePaddle/Paddle/issues): bug reports, feature requests, install issues, usage issues, etc. +- QQ discussion group: 432676488 (PaddlePaddle). +- [Forums](http://ai.baidu.com/forum/topic/list/168?pageNo=1): discuss implementations, research, etc. + +## Copyright and License +PaddlePaddle is provided under the [Apache-2.0 license](LICENSE). diff --git a/README_cn.md b/README_cn.md new file mode 100644 index 00000000..a89b3f05 --- /dev/null +++ b/README_cn.md @@ -0,0 +1,91 @@ +# PaddlePaddle + +[English](./README.md) | 简体中文 + +[![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle) +[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/documentation/docs/en/1.4/beginners_guide/index_en.html) +[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/index_cn.html) +[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases) +[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) + +欢迎来到 PaddlePaddle GitHub + +PaddlePaddle (PArallel Distributed Deep LEarning) 是一个简单易用、高效灵活、可扩展的深度学习平台,最初由百度科学家和工程师共同开发,目的是将深度学习技术应用到百度的众多产品中。 + +我们的愿景是让每个人都能通过PaddlePaddle接触深度学习 + +跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases) + +### PaddlePaddle最新版本: [Fluid 1.5.1](https://github.com/PaddlePaddle/Paddle/tree/release/1.5) +### 安装最新稳定版本: +``` +# Linux CPU +pip install paddlepaddle +# Linux GPU cuda9cudnn7 +pip install paddlepaddle-gpu +# Linux GPU cuda10cudnn7 +pip install paddlepaddle-gpu==1.5.1.post107 +# Linux GPU cuda8cudnn7 +pip install paddlepaddle-gpu==1.5.1.post87 + +# 其他平台上的安装指引请参考 http://paddlepaddle.org/ +``` +PaddlePaddle用户可领取**免费Tesla V100在线算力资源**,训练模型更高效。**每日登陆即送12小时**,**连续五天运行再加送48小时**,[前往使用免费算力](https://ai.baidu.com/support/news?action=detail&id=981)。 + +## 特性 + +- **灵活性** + + PaddlePaddle支持丰富的神经网络架构和优化算法。易于配置复杂模型,例如带有注意力机制或复杂记忆连接的神经网络机器翻译模型。 + +- **高效性** + + 为了高效使用异步计算资源,PaddlePaddle对框架的不同层进行优化,包括计算、存储、架构和通信。下面是一些样例: + + - 通过SSE/AVX 内置函数、BLAS库(例如MKL、OpenBLAS、cuBLAS)或定制的CPU/GPU内核优化数学操作。 + - 通过MKL-DNN库优化CNN网络 + - 高度优化循环网络,无需执行 `padding` 操作即可处理 **变长** 序列 + - 针对高维稀疏数据模型,优化了局部和分布式训练。 + + +- **稳定性** + + 有了 PaddlePaddle,使得利用各种CPU/GPU和机器来加速训练变得简单。PaddlePaddle 通过优化通信可以实现巨大吞吐量和快速执行。 + +- **与产品相连** + + 另外,PaddlePaddle 的设计也易于部署。在百度,PaddlePaddle 已经部署到含有巨大用户量的产品和服务上,包括广告点击率(CTR)预测、大规模图像分类、光学字符识别(OCR)、搜索排序,计算机病毒检测、推荐系统等等。PaddlePaddle广泛应用于百度产品中,产生了非常重要的影响。我们希望您也能探索 PaddlePaddle 的能力,为您的产品创造新的影响力和效果。 + +## 安装 + +推荐阅读官网上的[安装说明](http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/install/index_cn.html) + +## 文档 + +我们提供[英文](http://www.paddlepaddle.org/documentation/docs/en/1.4/beginners_guide/index_en.html)和 +[中文](http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/install/index_cn.html) 文档 + +- [深度学习101](https://github.com/PaddlePaddle/book) + + 或许您想从这个在线交互式书籍开始,可以在Jupyter Notebook中运行 + +- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.4/user_guides/howto/training/multi_node.html) + + 可以在MPI集群上运行分布式训练任务 + +- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.4/api_cn/index_cn.html) + + 新的API支持代码更少更简洁的程序 + +- [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.4/advanced_usage/development/contribute_to_paddle/index_cn.html) + + 欢迎您的贡献! + +## 交流与反馈 + +- 欢迎您通过[Github Issues](https://github.com/PaddlePaddle/Paddle/issues)来提交问题、报告与建议 +- QQ群: 432676488 (PaddlePaddle) +- [论坛](http://ai.baidu.com/forum/topic/list/168): 欢迎大家在PaddlePaddle论坛分享在使用PaddlePaddle中遇到的问题和经验, 营造良好的论坛氛围 + +## 版权和许可证 +PaddlePaddle由[Apache-2.0 license](LICENSE)提供 diff --git a/RELEASE.md b/RELEASE.md new file mode 100644 index 00000000..2c64baaa --- /dev/null +++ b/RELEASE.md @@ -0,0 +1,3 @@ +# Release Note + +Please turn to [here](https://github.com/PaddlePaddle/Paddle/releases) for release note. diff --git a/cmake/FindGperftools.cmake b/cmake/FindGperftools.cmake new file mode 100644 index 00000000..928f573a --- /dev/null +++ b/cmake/FindGperftools.cmake @@ -0,0 +1,63 @@ +# Tries to find Gperftools. +# +# Usage of this module as follows: +# +# find_package(Gperftools) +# +# Variables used by this module, they can change the default behaviour and need +# to be set before calling find_package: +# +# Gperftools_ROOT_DIR Set this variable to the root installation of +# Gperftools if the module has problems finding +# the proper installation path. +# +# Variables defined by this module: +# +# GPERFTOOLS_FOUND System has Gperftools libs/headers +# GPERFTOOLS_LIBRARIES The Gperftools libraries (tcmalloc & profiler) +# GPERFTOOLS_INCLUDE_DIR The location of Gperftools headers + +find_library(GPERFTOOLS_TCMALLOC + NAMES tcmalloc + HINTS ${Gperftools_ROOT_DIR}/lib) + +find_library(GPERFTOOLS_PROFILER + NAMES profiler + HINTS ${Gperftools_ROOT_DIR}/lib) + +find_library(GPERFTOOLS_TCMALLOC_AND_PROFILER + NAMES tcmalloc_and_profiler + HINTS ${Gperftools_ROOT_DIR}/lib) + +find_path(GPERFTOOLS_INCLUDE_DIR + NAMES gperftools/heap-profiler.h + HINTS ${Gperftools_ROOT_DIR}/include) + +set(GPERFTOOLS_LIBRARIES ${GPERFTOOLS_TCMALLOC_AND_PROFILER}) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args( + Gperftools + DEFAULT_MSG + GPERFTOOLS_LIBRARIES + GPERFTOOLS_INCLUDE_DIR) + +mark_as_advanced( + Gperftools_ROOT_DIR + GPERFTOOLS_TCMALLOC + GPERFTOOLS_PROFILER + GPERFTOOLS_TCMALLOC_AND_PROFILER + GPERFTOOLS_LIBRARIES + GPERFTOOLS_INCLUDE_DIR) + +# create IMPORTED targets +if (Gperftools_FOUND AND NOT TARGET gperftools::tcmalloc) + add_library(gperftools::tcmalloc UNKNOWN IMPORTED) + set_target_properties(gperftools::tcmalloc PROPERTIES + IMPORTED_LOCATION ${GPERFTOOLS_TCMALLOC} + INTERFACE_INCLUDE_DIRECTORIES "${GPERFTOOLS_INCLUDE_DIR}") + add_library(gperftools::profiler UNKNOWN IMPORTED) + set_target_properties(gperftools::profiler PROPERTIES + IMPORTED_LOCATION ${GPERFTOOLS_PROFILER} + INTERFACE_INCLUDE_DIRECTORIES "${GPERFTOOLS_INCLUDE_DIR}") +endif() diff --git a/cmake/FindNumPy.cmake b/cmake/FindNumPy.cmake new file mode 100644 index 00000000..8cdd642a --- /dev/null +++ b/cmake/FindNumPy.cmake @@ -0,0 +1,38 @@ +# Find the Python NumPy package +# PYTHON_NUMPY_INCLUDE_DIR +# NUMPY_FOUND +# will be set by this script + +cmake_minimum_required(VERSION 2.6) + +if(NOT PYTHON_EXECUTABLE) + if(NumPy_FIND_QUIETLY) + find_package(PythonInterp QUIET) + else() + find_package(PythonInterp) + set(_numpy_out 1) + endif() +endif() + +if (PYTHON_EXECUTABLE) + # write a python script that finds the numpy path + file(WRITE ${PROJECT_BINARY_DIR}/FindNumpyPath.py + "try: import numpy; print(numpy.get_include())\nexcept:pass\n") + + # execute the find script + exec_program("${PYTHON_EXECUTABLE}" ${PROJECT_BINARY_DIR} + ARGS "FindNumpyPath.py" + OUTPUT_VARIABLE NUMPY_PATH) +elseif(_numpy_out) + message(STATUS "Python executable not found.") +endif(PYTHON_EXECUTABLE) + +find_path(PYTHON_NUMPY_INCLUDE_DIR numpy/arrayobject.h + HINTS "${NUMPY_PATH}" "${PYTHON_INCLUDE_PATH}") + +if(PYTHON_NUMPY_INCLUDE_DIR) + set(PYTHON_NUMPY_FOUND 1 CACHE INTERNAL "Python numpy found") +endif(PYTHON_NUMPY_INCLUDE_DIR) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(NumPy DEFAULT_MSG PYTHON_NUMPY_INCLUDE_DIR) diff --git a/cmake/anakin_subgraph.cmake b/cmake/anakin_subgraph.cmake new file mode 100644 index 00000000..177f3443 --- /dev/null +++ b/cmake/anakin_subgraph.cmake @@ -0,0 +1,45 @@ +set(ANAKIN_ROOT "/usr" CACHE PATH "ANAKIN ROOT") +find_path(ANAKIN_INCLUDE_DIR anakin_config.h + PATHS ${ANAKIN_ROOT} ${ANAKIN_ROOT}/include + $ENV{ANAKIN_ROOT} $ENV{ANAKIN_ROOT}/include + NO_DEFAULT_PATH +) + +find_library(ANAKIN_LIBRARY NAMES libanakin_saber_common.so libanakin.so + PATHS ${ANAKIN_ROOT} + $ENV{ANAKIN_ROOT} $ENV{ANAKIN_ROOT}/lib + NO_DEFAULT_PATH + DOC "Path to ANAKIN library.") + +if(ANAKIN_INCLUDE_DIR AND ANAKIN_LIBRARY) + set(ANAKIN_FOUND ON) +else() + set(ANAKIN_FOUND OFF) +endif() + +if(ANAKIN_FOUND) + message(STATUS "Current ANAKIN header is ${ANAKIN_INCLUDE_DIR}/anakin_config.h. ") + include_directories(${ANAKIN_ROOT}) + include_directories(${ANAKIN_ROOT}/include) + include_directories(${ANAKIN_ROOT}/saber) + link_directories(${ANAKIN_ROOT}) + add_definitions(-DPADDLE_WITH_ANAKIN) +endif() + +if(ANAKIN_FOUND) + if (ANAKIN_MLU AND NOT WITH_GPU AND NOT ANAKIN_X86) + message(STATUS "Compile with anakin mlu place.") + add_definitions(-DANAKIN_MLU_PLACE) + elseif(ANAKIN_BM AND NOT WITH_GPU AND NOT ANAKIN_X86) + message(STATUS "Compile with anakin bm place.") + add_definitions(-DANAKIN_BM_PLACE) + elseif(ANAKIN_X86) + message(STATUS "Compile with anakin x86 place.") + add_definitions(-DANAKIN_X86_PLACE) + endif() +endif() + +if(ANAKIN_FOUND AND WITH_GPU AND WITH_DSO) + message(STATUS "Compile with anakin subgraph.") + set(ANAKIN_SUBGRAPH ON) +endif() diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake new file mode 100644 index 00000000..52ac31d1 --- /dev/null +++ b/cmake/cblas.cmake @@ -0,0 +1,94 @@ +# Find the CBlas and lapack libraries +# +# It will search MKLML, atlas, OpenBlas, reference-cblas in order. +# +# If any cblas implementation found, the following variable will be set. +# CBLAS_PROVIDER # one of MKLML, OPENBLAS, REFERENCE +# CBLAS_INC_DIR # the include directory for cblas. +# CBLAS_LIBS # a list of libraries should be linked by paddle. +# # Each library should be full path to object file. + +set(CBLAS_FOUND OFF) + +## Find MKLML First. +if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB) + set(CBLAS_FOUND ON) + set(CBLAS_PROVIDER MKLML) + set(CBLAS_INC_DIR ${MKLML_INC_DIR}) + set(CBLAS_LIBRARIES ${MKLML_LIB}) + + add_definitions(-DPADDLE_WITH_MKLML) + add_definitions(-DLAPACK_FOUND) + + message(STATUS "Found cblas and lapack in MKLML " + "(include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") + return() +endif() + +## Then find openblas. +set(OPENBLAS_ROOT $ENV{OPENBLAS_ROOT} CACHE PATH "Folder contains Openblas") +set(OPENBLAS_INCLUDE_SEARCH_PATHS + ${OPENBLAS_ROOT}/include + /usr/include + /usr/include/openblas + /usr/local/opt/openblas/include) +set(OPENBLAS_LIB_SEARCH_PATHS + ${OPENBLAS_ROOT}/lib + /usr/lib + /usr/lib/blas/openblas + /usr/lib/openblas + /usr/local/opt/openblas/lib) + +find_path(OPENBLAS_INC_DIR NAMES cblas.h + PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS} NO_DEFAULT_PATH) +find_path(OPENBLAS_LAPACKE_INC_DIR NAMES lapacke.h + PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS}) +find_library(OPENBLAS_LIB NAMES openblas + PATHS ${OPENBLAS_LIB_SEARCH_PATHS}) + +if(OPENBLAS_LAPACKE_INC_DIR AND OPENBLAS_INC_DIR AND OPENBLAS_LIB) + set(CBLAS_FOUND ON) + set(CBLAS_PROVIDER OPENBLAS) + set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR} ${OPENBLAS_LAPACKE_INC_DIR}) + set(CBLAS_LIBRARIES ${OPENBLAS_LIB}) + + add_definitions(-DPADDLE_USE_OPENBLAS) + add_definitions(-DLAPACK_FOUND) + + message(STATUS "Found OpenBLAS (include: ${OPENBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") + message(STATUS "Found lapack in OpenBLAS (include: ${OPENBLAS_LAPACKE_INC_DIR})") + return() +endif() + + +## Then find the reference-cblas. www.netlib.org/blas/ +set(REFERENCE_CBLAS_ROOT $ENV{REFERENCE_CBLAS_ROOT} CACHE PATH + "Folder contains reference-cblas") +set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS + ${REFERENCE_CBLAS_ROOT}/include + /usr/include + /usr/include/cblas +) + +set(REFERENCE_CBLAS_LIB_SEARCH_PATHS + ${REFERENCE_CBLAS_ROOT}/lib + /usr/lib + /usr/lib/blas/reference/ + /usr/lib/reference/ +) + +if(WITH_SYSTEM_BLAS) + find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS + ${REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS}) + find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS + ${REFERENCE_CBLAS_LIB_SEARCH_PATHS}) + + if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY) + set(CBLAS_FOUND ON) + set(CBLAS_PROVIDER REFERENCE) + set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR}) + set(CBLAS_LIBRARIES ${REFERENCE_CBLAS_LIBRARY}) + add_definitions(-DPADDLE_USE_REFERENCE_CBLAS) + message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") + endif() +endif() diff --git a/cmake/ccache.cmake b/cmake/ccache.cmake new file mode 100644 index 00000000..900f59d4 --- /dev/null +++ b/cmake/ccache.cmake @@ -0,0 +1,9 @@ +# Use ccache if found ccache program + +find_program(CCACHE_PATH ccache) + +if(CCACHE_PATH) + message(STATUS "Ccache is founded, use ccache to speed up compile.") + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ${CCACHE_PATH}) + set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ${CCACHE_PATH}) +endif(CCACHE_PATH) diff --git a/cmake/configure.cmake b/cmake/configure.cmake new file mode 100644 index 00000000..5f7b4a46 --- /dev/null +++ b/cmake/configure.cmake @@ -0,0 +1,152 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(NOT WITH_PYTHON) + add_definitions(-DPADDLE_NO_PYTHON) +endif(NOT WITH_PYTHON) + +if(WITH_DSO) + add_definitions(-DPADDLE_USE_DSO) +endif(WITH_DSO) + +if(WITH_TESTING) + add_definitions(-DPADDLE_WITH_TESTING) +endif(WITH_TESTING) + +if(NOT WITH_PROFILER) + add_definitions(-DPADDLE_DISABLE_PROFILER) +endif(NOT WITH_PROFILER) + +if(WITH_AVX AND AVX_FOUND) + set(SIMD_FLAG ${AVX_FLAG}) + add_definitions(-DPADDLE_WITH_AVX) +elseif(SSE3_FOUND) + set(SIMD_FLAG ${SSE3_FLAG}) +endif() + +if(WIN32) + # windows header option for all targets. + add_definitions(-D_XKEYCHECK_H) + # Use symbols instead of absolute path, reduce the cmake link command length. + SET(CMAKE_C_USE_RESPONSE_FILE_FOR_LIBRARIES 1) + SET(CMAKE_CXX_USE_RESPONSE_FILE_FOR_LIBRARIES 1) + SET(CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) + SET(CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS 1) + SET(CMAKE_C_USE_RESPONSE_FILE_FOR_INCLUDES 1) + SET(CMAKE_CXX_USE_RESPONSE_FILE_FOR_INCLUDES 1) + SET(CMAKE_C_RESPONSE_FILE_LINK_FLAG "@") + SET(CMAKE_CXX_RESPONSE_FILE_LINK_FLAG "@") + + # Specify the program to use when building static libraries + SET(CMAKE_C_CREATE_STATIC_LIBRARY " lib ") + SET(CMAKE_CXX_CREATE_STATIC_LIBRARY " lib ") + + # set defination for the dll export + if (NOT MSVC) + message(FATAL "Windows build only support msvc. Which was binded by the nvcc compiler of NVIDIA.") + endif(NOT MSVC) +endif(WIN32) + +if(WITH_PSLIB) + add_definitions(-DPADDLE_WITH_PSLIB) +endif() + +if(WITH_GPU) + add_definitions(-DPADDLE_WITH_CUDA) + add_definitions(-DEIGEN_USE_GPU) + + FIND_PACKAGE(CUDA REQUIRED) + + if(${CUDA_VERSION_MAJOR} VERSION_LESS 7) + message(FATAL_ERROR "Paddle needs CUDA >= 7.0 to compile") + endif() + + if(NOT CUDNN_FOUND) + message(FATAL_ERROR "Paddle needs cudnn to compile") + endif() + if(CUPTI_FOUND) + include_directories(${CUPTI_INCLUDE_DIR}) + add_definitions(-DPADDLE_WITH_CUPTI) + else() + message(STATUS "Cannot find CUPTI, GPU Profiling is incorrect.") + endif() + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}") + + # Include cuda and cudnn + include_directories(${CUDNN_INCLUDE_DIR}) + include_directories(${CUDA_TOOLKIT_INCLUDE}) + + if(TENSORRT_FOUND) + if(${CUDA_VERSION_MAJOR} VERSION_LESS 8) + message(FATAL_ERROR "TensorRT needs CUDA >= 8.0 to compile") + endif() + if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7) + message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile") + endif() + if(${TENSORRT_MAJOR_VERSION} VERSION_LESS 4) + message(FATAL_ERROR "Paddle needs TensorRT >= 4.0 to compile") + endif() + include_directories(${TENSORRT_INCLUDE_DIR}) + endif() + if(ANAKIN_FOUND) + if(${CUDA_VERSION_MAJOR} VERSION_LESS 8) + message(WARNING "Anakin needs CUDA >= 8.0 to compile. Force ANAKIN_FOUND = OFF") + set(ANAKIN_FOUND OFF CACHE STRING "Anakin is valid only when CUDA >= 8.0." FORCE) + endif() + if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7) + message(WARNING "Anakin needs CUDNN >= 7.0 to compile. Force ANAKIN_FOUND = OFF") + set(ANAKIN_FOUND OFF CACHE STRING "Anakin is valid only when CUDNN >= 7.0." FORCE) + endif() + endif() +elseif(WITH_AMD_GPU) + add_definitions(-DPADDLE_WITH_HIP) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__") +else() + add_definitions(-DHPPL_STUB_FUNC) + list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu) +endif() + +if (WITH_MKLML AND MKLML_IOMP_LIB) + message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}") + if(WIN32) + # openmp not support well for now on windows + set(OPENMP_FLAGS "") + else(WIN32) + set(OPENMP_FLAGS "-fopenmp") + endif(WIN32) + set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) + set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS}) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}") +endif() + +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}") + +if(WITH_DISTRIBUTE) + add_definitions(-DPADDLE_WITH_DISTRIBUTE) +endif() + +if(WITH_GRPC) + add_definitions(-DPADDLE_WITH_GRPC) +endif(WITH_GRPC) + +if(WITH_BRPC_RDMA) + add_definitions(-DPADDLE_WITH_BRPC_RDMA) +endif(WITH_BRPC_RDMA) + +if(ON_INFER) + add_definitions(-DPADDLE_ON_INFERENCE) +endif(ON_INFER) diff --git a/cmake/coveralls.cmake b/cmake/coveralls.cmake new file mode 100644 index 00000000..c0e96e28 --- /dev/null +++ b/cmake/coveralls.cmake @@ -0,0 +1,102 @@ +# CMake script for code coverage. +# If _COVERALLS_UPLOAD is ON, it will upload json files to overalls.io automatically. + +# Param _COVERAGE_SRCS A list of coverage source files. +# Param _COVERALLS_UPLOAD Upload the result to coveralls. +# Param _CMAKE_SCRIPT_PATH CMake script path. +function(code_coverage _COVERAGE_SRCS _COVERALLS_UPLOAD _CMAKE_SCRIPT_PATH) + # clean previous gcov data. + file(REMOVE_RECURSE ${PROJECT_BINARY_DIR}/*.gcda) + + # find curl for upload JSON soon. + if (_COVERALLS_UPLOAD) + find_program(CURL_EXECUTABLE curl) + if (NOT CURL_EXECUTABLE) + message(FATAL_ERROR "Coveralls: curl not found!") + endif() + endif() + + # When passing a CMake list to an external process, the list + # will be converted from the format "1;2;3" to "1 2 3". + set(COVERAGE_SRCS "") + foreach (SINGLE_SRC ${_COVERAGE_SRCS}) + set(COVERAGE_SRCS "${COVERAGE_SRCS}*${SINGLE_SRC}") + endforeach() + + # query number of logical cores + cmake_host_system_information(RESULT core_size QUERY NUMBER_OF_LOGICAL_CORES) + # coveralls json file. + set(COVERALLS_FILE ${PROJECT_BINARY_DIR}/coveralls.json) + add_custom_target(coveralls_generate + # Run regress tests. + COMMAND ${CMAKE_CTEST_COMMAND} + -j ${core_size} + --output-on-failure + # Generate Gcov and translate it into coveralls JSON. + COMMAND ${CMAKE_COMMAND} + -DCOVERAGE_SRCS="${COVERAGE_SRCS}" + -DCOVERALLS_OUTPUT_FILE="${COVERALLS_FILE}" + -DCOV_PATH="${PROJECT_BINARY_DIR}" + -DPROJECT_ROOT="${PROJECT_SOURCE_DIR}" + -P "${_CMAKE_SCRIPT_PATH}/coverallsGcovJsons.cmake" + WORKING_DIRECTORY ${PROJECT_BINARY_DIR} + COMMENT "Coveralls: generating coveralls output..." + ) + + if (_COVERALLS_UPLOAD) + message("COVERALLS UPLOAD: ON") + # Upload the JSON to coveralls. + add_custom_target(coveralls_upload + COMMAND ${CURL_EXECUTABLE} + -S -F json_file=@${COVERALLS_FILE} + https://coveralls.io/api/v1/jobs + DEPENDS coveralls_generate + WORKING_DIRECTORY ${PROJECT_BINARY_DIR} + COMMENT "Coveralls: uploading coveralls output...") + + add_custom_target(coveralls DEPENDS coveralls_upload) + else() + message("COVERALLS UPLOAD: OFF") + add_custom_target(coveralls DEPENDS coveralls_generate) + endif() +endfunction() + +if(WITH_COVERAGE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage") + + set(EXCLUDE_DIRS + "demo/" + "build/" + "tests/" + ".test_env/" + ) + + if(WITH_GPU) + file(GLOB_RECURSE PADDLE_SOURCES RELATIVE "${PROJECT_SOURCE_DIR}" "*.cpp" "*.cc" ".c" "*.cu") + else() + file(GLOB_RECURSE PADDLE_SOURCES RELATIVE "${PROJECT_SOURCE_DIR}" "*.cpp" "*.cc" "*.c") + endif() + + # exclude trivial files in PADDLE_SOURCES + foreach(EXCLUDE_DIR ${EXCLUDE_DIRS}) + foreach(TMP_PATH ${PADDLE_SOURCES}) + string(FIND ${TMP_PATH} ${EXCLUDE_DIR} EXCLUDE_DIR_FOUND) + if(NOT ${EXCLUDE_DIR_FOUND} EQUAL -1) + list(REMOVE_ITEM PADDLE_SOURCES ${TMP_PATH}) + endif() + endforeach(TMP_PATH) + endforeach() + + # convert to absolute path + set(PADDLE_SRCS "") + foreach(PADDLE_SRC ${PADDLE_SOURCES}) + set(PADDLE_SRCS "${PADDLE_SRCS};${PROJECT_SOURCE_DIR}/${PADDLE_SRC}") + endforeach() + + code_coverage( + "${PADDLE_SRCS}" + ${COVERALLS_UPLOAD} + "${PROJECT_SOURCE_DIR}/cmake" + ) +endif() diff --git a/cmake/coverallsGcovJsons.cmake b/cmake/coverallsGcovJsons.cmake new file mode 100644 index 00000000..4641184f --- /dev/null +++ b/cmake/coverallsGcovJsons.cmake @@ -0,0 +1,401 @@ +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +# Copyright (C) 2014 Joakim Söderberg +# +# This is intended to be run by a custom target in a CMake project like this. +# 0. Compile program with coverage support. +# 1. Clear coverage data. (Recursively delete *.gcda in build dir) +# 2. Run the unit tests. +# 3. Run this script specifying which source files the coverage should be performed on. +# +# This script will then use gcov to generate .gcov files in the directory specified +# via the COV_PATH var. This should probably be the same as your cmake build dir. +# +# It then parses the .gcov files to convert them into the Coveralls JSON format: +# https://coveralls.io/docs/api +# + +CMAKE_MINIMUM_REQUIRED(VERSION 2.8) + +# Since it's not possible to pass a CMake list properly in the +# "1;2;3" format to an external process, we have replaced the +# ";" with "*", so reverse that here so we get it back into the +# CMake list format. +string(REGEX REPLACE "\\*" ";" COVERAGE_SRCS ${COVERAGE_SRCS}) + +find_program(GCOV_EXECUTABLE gcov) +if (NOT GCOV_EXECUTABLE) + message(FATAL_ERROR "gcov not found! Aborting...") +endif() + +find_package(Git) + +# TODO: Add these git things to the coveralls json. +if (GIT_FOUND) + # Branch. + execute_process( + COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref HEAD + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE GIT_BRANCH + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + + macro (git_log_format FORMAT_CHARS VAR_NAME) + execute_process( + COMMAND ${GIT_EXECUTABLE} log -1 --pretty=format:%${FORMAT_CHARS} + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE ${VAR_NAME} + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + endmacro() + + git_log_format(an GIT_AUTHOR_EMAIL) + git_log_format(ae GIT_AUTHOR_EMAIL) + git_log_format(cn GIT_COMMITTER_NAME) + git_log_format(ce GIT_COMMITTER_EMAIL) + git_log_format(B GIT_COMMIT_MESSAGE) + + message("Git exe: ${GIT_EXECUTABLE}") + message("Git branch: ${GIT_BRANCH}") + message("Git author: ${GIT_AUTHOR_NAME}") + message("Git e-mail: ${GIT_AUTHOR_EMAIL}") + message("Git commiter name: ${GIT_COMMITTER_NAME}") + message("Git commiter e-mail: ${GIT_COMMITTER_EMAIL}") + message("Git commit message: ${GIT_COMMIT_MESSAGE}") + +endif() + +############################# Macros ######################################### + +# +# This macro converts from the full path format gcov outputs: +# +# /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov +# +# to the original source file path the .gcov is for: +# +# /path/to/project/root/subdir/the_file.c +# +macro(get_source_path_from_gcov_filename _SRC_FILENAME _GCOV_FILENAME) + + # /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov + # -> + # #path#to#project#root#subdir#the_file.c.gcov + get_filename_component(_GCOV_FILENAME_WEXT ${_GCOV_FILENAME} NAME) + + # #path#to#project#root#subdir#the_file.c.gcov -> /path/to/project/root/subdir/the_file.c + string(REGEX REPLACE "\\.gcov$" "" SRC_FILENAME_TMP ${_GCOV_FILENAME_WEXT}) + string(REGEX REPLACE "\#" "/" SRC_FILENAME_TMP ${SRC_FILENAME_TMP}) + set(${_SRC_FILENAME} "${SRC_FILENAME_TMP}") +endmacro() + +############################################################################## + +# Get the coverage data. +file(GLOB_RECURSE GCDA_FILES "${COV_PATH}" "*.gcda") +message("Process GCDA files:") +message("===============================") + +# Get a list of all the object directories needed by gcov +# (The directories the .gcda files and .o files are found in) +# and run gcov on those. +foreach(GCDA ${GCDA_FILES}) + get_filename_component(GCDA_DIR ${GCDA} PATH) + + # + # The -p below refers to "Preserve path components", + # This means that the generated gcov filename of a source file will + # keep the original files entire filepath, but / is replaced with #. + # Example: + # + # /path/to/project/root/build/CMakeFiles/the_file.dir/subdir/the_file.c.gcda + # ------------------------------------------------------------------------------ + # File '/path/to/project/root/subdir/the_file.c' + # Lines executed:68.34% of 199 + # /path/to/project/root/subdir/the_file.c:creating '#path#to#project#root#subdir#the_file.c.gcov' + # + # If -p is not specified then the file is named only "the_file.c.gcov" + # + execute_process( + COMMAND ${GCOV_EXECUTABLE} -p -o ${GCDA_DIR} ${GCDA} >/dev/null + WORKING_DIRECTORY ${GCDA_DIR} + ) +endforeach() + +# TODO: Make these be absolute path +file(GLOB_RECURSE ALL_GCOV_FILES "${COV_PATH}" "*.gcov") + +# Get only the filenames to use for filtering. +#set(COVERAGE_SRCS_NAMES "") +#foreach (COVSRC ${COVERAGE_SRCS}) +# get_filename_component(COVSRC_NAME ${COVSRC} NAME) +# message("${COVSRC} -> ${COVSRC_NAME}") +# list(APPEND COVERAGE_SRCS_NAMES "${COVSRC_NAME}") +#endforeach() + +# +# Filter out all but the gcov files we want. +# +# We do this by comparing the list of COVERAGE_SRCS filepaths that the +# user wants the coverage data for with the paths of the generated .gcov files, +# so that we only keep the relevant gcov files. +# +# Example: +# COVERAGE_SRCS = +# /path/to/project/root/subdir/the_file.c +# +# ALL_GCOV_FILES = +# /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov +# /path/to/project/root/build/#path#to#project#root#subdir#other_file.c.gcov +# +# Result should be: +# GCOV_FILES = +# /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov +# +set(GCOV_FILES "") +#message("Look in coverage sources: ${COVERAGE_SRCS}") +message("\nFilter out unwanted GCOV files:") +message("===============================") + +set(COVERAGE_SRCS_REMAINING ${COVERAGE_SRCS}) + +foreach (GCOV_FILE ${ALL_GCOV_FILES}) + + # + # /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov + # -> + # /path/to/project/root/subdir/the_file.c + get_source_path_from_gcov_filename(GCOV_SRC_PATH ${GCOV_FILE}) + + # Is this in the list of source files? + # TODO: We want to match against relative path filenames from the source file root... + list(FIND COVERAGE_SRCS ${GCOV_SRC_PATH} WAS_FOUND) + + if (NOT WAS_FOUND EQUAL -1) + message("YES: ${GCOV_FILE}") + list(APPEND GCOV_FILES ${GCOV_FILE}) + + # We remove it from the list, so we don't bother searching for it again. + # Also files left in COVERAGE_SRCS_REMAINING after this loop ends should + # have coverage data generated from them (no lines are covered). + list(REMOVE_ITEM COVERAGE_SRCS_REMAINING ${GCOV_SRC_PATH}) + else() + message("NO: ${GCOV_FILE}") + endif() +endforeach() + +# TODO: Enable setting these +set(JSON_SERVICE_NAME "travis-ci") +set(JSON_SERVICE_JOB_ID $ENV{TRAVIS_JOB_ID}) + +set(JSON_TEMPLATE +"{ + \"service_name\": \"\@JSON_SERVICE_NAME\@\", + \"service_job_id\": \"\@JSON_SERVICE_JOB_ID\@\", + \"source_files\": \@JSON_GCOV_FILES\@ +}" +) + +set(SRC_FILE_TEMPLATE +"{ + \"name\": \"\@GCOV_SRC_REL_PATH\@\", + \"source_digest\": \"\@GCOV_CONTENTS_MD5\@\", + \"coverage\": \@GCOV_FILE_COVERAGE\@ + }" +) + +message("\nGenerate JSON for files:") +message("=========================") + +set(JSON_GCOV_FILES "[") + +# Read the GCOV files line by line and get the coverage data. +foreach (GCOV_FILE ${GCOV_FILES}) + + get_source_path_from_gcov_filename(GCOV_SRC_PATH ${GCOV_FILE}) + file(RELATIVE_PATH GCOV_SRC_REL_PATH "${PROJECT_ROOT}" "${GCOV_SRC_PATH}") + + # The new coveralls API doesn't need the entire source (Yay!) + # However, still keeping that part for now. Will cleanup in the future. + file(MD5 "${GCOV_SRC_PATH}" GCOV_CONTENTS_MD5) + message("MD5: ${GCOV_SRC_PATH} = ${GCOV_CONTENTS_MD5}") + + # Loads the gcov file as a list of lines. + # (We first open the file and replace all occurences of [] with _ + # because CMake will fail to parse a line containing unmatched brackets... + # also the \ to escaped \n in macros screws up things.) + # https://public.kitware.com/Bug/view.php?id=15369 + file(READ ${GCOV_FILE} GCOV_CONTENTS) + string(REPLACE "[" "_" GCOV_CONTENTS "${GCOV_CONTENTS}") + string(REPLACE "]" "_" GCOV_CONTENTS "${GCOV_CONTENTS}") + string(REPLACE "\\" "_" GCOV_CONTENTS "${GCOV_CONTENTS}") + file(WRITE ${GCOV_FILE}_tmp "${GCOV_CONTENTS}") + + file(STRINGS ${GCOV_FILE}_tmp GCOV_LINES) + list(LENGTH GCOV_LINES LINE_COUNT) + + # Instead of trying to parse the source from the + # gcov file, simply read the file contents from the source file. + # (Parsing it from the gcov is hard because C-code uses ; in many places + # which also happens to be the same as the CMake list delimeter). + file(READ ${GCOV_SRC_PATH} GCOV_FILE_SOURCE) + + string(REPLACE "\\" "\\\\" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") + string(REGEX REPLACE "\"" "\\\\\"" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") + string(REPLACE "\t" "\\\\t" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") + string(REPLACE "\r" "\\\\r" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") + string(REPLACE "\n" "\\\\n" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") + # According to http://json.org/ these should be escaped as well. + # Don't know how to do that in CMake however... + #string(REPLACE "\b" "\\\\b" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") + #string(REPLACE "\f" "\\\\f" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") + #string(REGEX REPLACE "\u([a-fA-F0-9]{4})" "\\\\u\\1" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}") + + # We want a json array of coverage data as a single string + # start building them from the contents of the .gcov + set(GCOV_FILE_COVERAGE "[") + + set(GCOV_LINE_COUNT 1) # Line number for the .gcov. + set(DO_SKIP 0) + foreach (GCOV_LINE ${GCOV_LINES}) + #message("${GCOV_LINE}") + # Example of what we're parsing: + # Hitcount |Line | Source + # " 8: 26: if (!allowed || (strlen(allowed) == 0))" + string(REGEX REPLACE + "^([^:]*):([^:]*):(.*)$" + "\\1;\\2;\\3" + RES + "${GCOV_LINE}") + + # Check if we should exclude lines using the Lcov syntax. + string(REGEX MATCH "LCOV_EXCL_START" START_SKIP "${GCOV_LINE}") + string(REGEX MATCH "LCOV_EXCL_END" END_SKIP "${GCOV_LINE}") + string(REGEX MATCH "LCOV_EXCL_LINE" LINE_SKIP "${GCOV_LINE}") + + set(RESET_SKIP 0) + if (LINE_SKIP AND NOT DO_SKIP) + set(DO_SKIP 1) + set(RESET_SKIP 1) + endif() + + if (START_SKIP) + set(DO_SKIP 1) + message("${GCOV_LINE_COUNT}: Start skip") + endif() + + if (END_SKIP) + set(DO_SKIP 0) + endif() + + list(LENGTH RES RES_COUNT) + + if (RES_COUNT GREATER 2) + list(GET RES 0 HITCOUNT) + list(GET RES 1 LINE) + list(GET RES 2 SOURCE) + + string(STRIP ${HITCOUNT} HITCOUNT) + string(STRIP ${LINE} LINE) + + # Lines with 0 line numbers are metadata and can be ignored. + if (NOT ${LINE} EQUAL 0) + + if (DO_SKIP) + set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}null, ") + else() + # Translate the hitcount into valid JSON values. + if (${HITCOUNT} STREQUAL "#####") + set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}0, ") + elseif (${HITCOUNT} STREQUAL "-") + set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}null, ") + else() + set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}${HITCOUNT}, ") + endif() + endif() + endif() + else() + message(WARNING "Failed to properly parse line (RES_COUNT = ${RES_COUNT}) ${GCOV_FILE}:${GCOV_LINE_COUNT}\n-->${GCOV_LINE}") + endif() + + if (RESET_SKIP) + set(DO_SKIP 0) + endif() + math(EXPR GCOV_LINE_COUNT "${GCOV_LINE_COUNT}+1") + endforeach() + + message("${GCOV_LINE_COUNT} of ${LINE_COUNT} lines read!") + + # Advanced way of removing the trailing comma in the JSON array. + # "[1, 2, 3, " -> "[1, 2, 3" + string(REGEX REPLACE ",[ ]*$" "" GCOV_FILE_COVERAGE ${GCOV_FILE_COVERAGE}) + + # Append the trailing ] to complete the JSON array. + set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}]") + + # Generate the final JSON for this file. + message("Generate JSON for file: ${GCOV_SRC_REL_PATH}...") + string(CONFIGURE ${SRC_FILE_TEMPLATE} FILE_JSON) + + set(JSON_GCOV_FILES "${JSON_GCOV_FILES}${FILE_JSON}, ") +endforeach() + +# Loop through all files we couldn't find any coverage for +# as well, and generate JSON for those as well with 0% coverage. +foreach(NOT_COVERED_SRC ${COVERAGE_SRCS_REMAINING}) + + # Loads the source file as a list of lines. + file(STRINGS ${NOT_COVERED_SRC} SRC_LINES) + + set(GCOV_FILE_COVERAGE "[") + set(GCOV_FILE_SOURCE "") + + foreach (SOURCE ${SRC_LINES}) + set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}0, ") + + string(REPLACE "\\" "\\\\" SOURCE "${SOURCE}") + string(REGEX REPLACE "\"" "\\\\\"" SOURCE "${SOURCE}") + string(REPLACE "\t" "\\\\t" SOURCE "${SOURCE}") + string(REPLACE "\r" "\\\\r" SOURCE "${SOURCE}") + set(GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}${SOURCE}\\n") + endforeach() + + # Remove trailing comma, and complete JSON array with ] + string(REGEX REPLACE ",[ ]*$" "" GCOV_FILE_COVERAGE ${GCOV_FILE_COVERAGE}) + set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}]") + + # Generate the final JSON for this file. + string(CONFIGURE ${SRC_FILE_TEMPLATE} FILE_JSON) + set(JSON_GCOV_FILES "${JSON_GCOV_FILES}${FILE_JSON}, ") +endforeach() + +# Get rid of trailing comma. +string(REGEX REPLACE ",[ ]*$" "" JSON_GCOV_FILES ${JSON_GCOV_FILES}) +set(JSON_GCOV_FILES "${JSON_GCOV_FILES}]") + +# Generate the final complete JSON! +message("Generate final JSON...") +string(CONFIGURE ${JSON_TEMPLATE} JSON) + +file(WRITE "${COVERALLS_OUTPUT_FILE}" "${JSON}") +message("###########################################################################") +message("Generated coveralls JSON containing coverage data:") +message("${COVERALLS_OUTPUT_FILE}") +message("###########################################################################") diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake new file mode 100644 index 00000000..b9c72c04 --- /dev/null +++ b/cmake/cuda.cmake @@ -0,0 +1,224 @@ +if(NOT WITH_GPU) + return() +endif() + +set(paddle_known_gpu_archs "30 35 50 52 60 61 70") +set(paddle_known_gpu_archs7 "30 35 50 52") +set(paddle_known_gpu_archs8 "30 35 50 52 60 61") +set(paddle_known_gpu_archs9 "30 35 50 52 60 61 70") +set(paddle_known_gpu_archs10 "30 35 50 52 60 61 70 75") + +###################################################################################### +# A function for automatic detection of GPUs installed (if autodetection is enabled) +# Usage: +# detect_installed_gpus(out_variable) +function(detect_installed_gpus out_variable) + if(NOT CUDA_gpu_detect_output) + set(cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu) + + file(WRITE ${cufile} "" + "#include \n" + "int main() {\n" + " int count = 0;\n" + " if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n" + " if (count == 0) return -1;\n" + " for (int device = 0; device < count; ++device) {\n" + " cudaDeviceProp prop;\n" + " if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n" + " std::printf(\"%d.%d \", prop.major, prop.minor);\n" + " }\n" + " return 0;\n" + "}\n") + + execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "-ccbin=${CUDA_HOST_COMPILER}" + "--run" "${cufile}" + WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" + RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + + if(nvcc_res EQUAL 0) + # only keep the last line of nvcc_out + STRING(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}") + STRING(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}") + list(GET nvcc_out -1 nvcc_out) + string(REPLACE "2.1" "2.1(2.0)" nvcc_out "${nvcc_out}") + set(CUDA_gpu_detect_output ${nvcc_out} CACHE INTERNAL "Returned GPU architetures from detect_installed_gpus tool" FORCE) + endif() + endif() + + if(NOT CUDA_gpu_detect_output) + message(STATUS "Automatic GPU detection failed. Building for all known architectures.") + set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE) + else() + set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE) + endif() +endfunction() + + +######################################################################## +# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME +# Usage: +# select_nvcc_arch_flags(out_variable) +function(select_nvcc_arch_flags out_variable) + # List of arch names + set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "All" "Manual") + set(archs_name_default "All") + list(APPEND archs_names "Auto") + + # set CUDA_ARCH_NAME strings (so it will be seen as dropbox in CMake-Gui) + set(CUDA_ARCH_NAME ${archs_name_default} CACHE STRING "Select target NVIDIA GPU achitecture.") + set_property( CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${archs_names} ) + mark_as_advanced(CUDA_ARCH_NAME) + + # verify CUDA_ARCH_NAME value + if(NOT ";${archs_names};" MATCHES ";${CUDA_ARCH_NAME};") + string(REPLACE ";" ", " archs_names "${archs_names}") + message(FATAL_ERROR "Only ${archs_names} architeture names are supported.") + endif() + + if(${CUDA_ARCH_NAME} STREQUAL "Manual") + set(CUDA_ARCH_BIN ${paddle_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported") + set(CUDA_ARCH_PTX "50" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for") + mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX) + else() + unset(CUDA_ARCH_BIN CACHE) + unset(CUDA_ARCH_PTX CACHE) + endif() + + if(${CUDA_ARCH_NAME} STREQUAL "Kepler") + set(cuda_arch_bin "30 35") + elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell") + set(cuda_arch_bin "50") + elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal") + set(cuda_arch_bin "60 61") + elseif(${CUDA_ARCH_NAME} STREQUAL "Volta") + set(cuda_arch_bin "70") + elseif(${CUDA_ARCH_NAME} STREQUAL "Turing") + set(cuda_arch_bin "75") + elseif(${CUDA_ARCH_NAME} STREQUAL "All") + set(cuda_arch_bin ${paddle_known_gpu_archs}) + elseif(${CUDA_ARCH_NAME} STREQUAL "Auto") + detect_installed_gpus(cuda_arch_bin) + else() # (${CUDA_ARCH_NAME} STREQUAL "Manual") + set(cuda_arch_bin ${CUDA_ARCH_BIN}) + endif() + + # remove dots and convert to lists + string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}") + string(REGEX REPLACE "\\." "" cuda_arch_ptx "${CUDA_ARCH_PTX}") + string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}") + string(REGEX MATCHALL "[0-9]+" cuda_arch_ptx "${cuda_arch_ptx}") + list(REMOVE_DUPLICATES cuda_arch_bin) + list(REMOVE_DUPLICATES cuda_arch_ptx) + + set(nvcc_flags "") + set(nvcc_archs_readable "") + + # Tell NVCC to add binaries for the specified GPUs + foreach(arch ${cuda_arch_bin}) + if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)") + # User explicitly specified PTX for the concrete BIN + list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1}) + list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1}) + else() + # User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN + list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch}) + list(APPEND nvcc_archs_readable sm_${arch}) + endif() + endforeach() + + # Tell NVCC to add PTX intermediate code for the specified architectures + foreach(arch ${cuda_arch_ptx}) + list(APPEND nvcc_flags -gencode arch=compute_${arch},code=compute_${arch}) + list(APPEND nvcc_archs_readable compute_${arch}) + endforeach() + + string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}") + set(${out_variable} ${nvcc_flags} PARENT_SCOPE) + set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE) +endfunction() + +message(STATUS "CUDA detected: " ${CUDA_VERSION}) +if (${CUDA_VERSION} LESS 7.0) + set(paddle_known_gpu_archs ${paddle_known_gpu_archs}) +elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x + set(paddle_known_gpu_archs ${paddle_known_gpu_archs7}) + list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") + list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") +elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x + set(paddle_known_gpu_archs ${paddle_known_gpu_archs8}) + list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") + list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") + # CUDA 8 may complain that sm_20 is no longer supported. Suppress the + # warning for now. + list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets") +elseif (${CUDA_VERSION} LESS 10.0) # CUDA 9.x + set(paddle_known_gpu_archs ${paddle_known_gpu_archs9}) + list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") + list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") +elseif (${CUDA_VERSION} LESS 11.0) # CUDA 10.x + set(paddle_known_gpu_archs ${paddle_known_gpu_archs10}) + list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED") + list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__") +endif() +add_definitions("-DPADDLE_CUDA_BINVER=\"${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}\"") + +include_directories(${CUDA_INCLUDE_DIRS}) +if(NOT WITH_DSO) + if(WIN32) + set_property(GLOBAL PROPERTY CUDA_MODULES ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY}) + endif(WIN32) +endif(NOT WITH_DSO) + +# setting nvcc arch flags +select_nvcc_arch_flags(NVCC_FLAGS_EXTRA) +list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA}) +message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}") + +# Set C++11 support +set(CUDA_PROPAGATE_HOST_FLAGS OFF) + +# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. +# So, don't set these flags here. +if (NOT WIN32) # windows msvc2015 support c++11 natively. +# -std=c++11 -fPIC not recoginize by msvc, -Xcompiler will be added by cmake. +list(APPEND CUDA_NVCC_FLAGS "-std=c++11") +list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC") +endif(NOT WIN32) + +if(WITH_FAST_MATH) + # Make use of fast math library. https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html + list(APPEND CUDA_NVCC_FLAGS "--use_fast_math") +endif() +# in cuda9, suppress cuda warning on eigen +list(APPEND CUDA_NVCC_FLAGS "-w") +# Set :expt-relaxed-constexpr to suppress Eigen warnings +list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr") + +if (NOT WIN32) + if(CMAKE_BUILD_TYPE STREQUAL "Debug") + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) + elseif(CMAKE_BUILD_TYPE STREQUAL "Release") + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) + elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) + elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") + # nvcc 9 does not support -Os. Use Release flags instead + list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) + endif() +else(NOT WIN32) + list(APPEND CUDA_NVCC_FLAGS "-Xcompiler \"/wd 4244 /wd 4267 /wd 4819\"") + list(APPEND CUDA_NVCC_FLAGS "--compiler-options;/bigobj") + if(CMAKE_BUILD_TYPE STREQUAL "Debug") + list(APPEND CUDA_NVCC_FLAGS "-g -G") + # match the cl's _ITERATOR_DEBUG_LEVEL + list(APPEND CUDA_NVCC_FLAGS "-D_DEBUG") + elseif(CMAKE_BUILD_TYPE STREQUAL "Release") + list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG") + else() + message(FATAL "Windows only support Release or Debug build now. Please set visual studio build type to Release/Debug, x64 build.") +endif() +endif(NOT WIN32) + +mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD) +mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION) diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake new file mode 100644 index 00000000..98466d44 --- /dev/null +++ b/cmake/cudnn.cmake @@ -0,0 +1,102 @@ +if(NOT WITH_GPU) + return() +endif() + +if(WIN32) + set(CUDNN_ROOT ${CUDA_TOOLKIT_ROOT_DIR}) +else(WIN32) + set(CUDNN_ROOT "/usr" CACHE PATH "CUDNN ROOT") +endif(WIN32) + +find_path(CUDNN_INCLUDE_DIR cudnn.h + PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include + $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/include ${CUDA_TOOLKIT_INCLUDE} + NO_DEFAULT_PATH +) + +get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH) + +set(TARGET_ARCH "x86_64") +if(NOT ${CMAKE_SYSTEM_PROCESSOR}) + set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR}) +endif() + +list(APPEND CUDNN_CHECK_LIBRARY_DIRS + ${CUDNN_ROOT} + ${CUDNN_ROOT}/lib64 + ${CUDNN_ROOT}/lib + ${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu + ${CUDNN_ROOT}/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/ + $ENV{CUDNN_ROOT} + $ENV{CUDNN_ROOT}/lib64 + $ENV{CUDNN_ROOT}/lib + /usr/lib + ${CUDA_TOOLKIT_ROOT_DIR} + ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 + ) +set(CUDNN_LIB_NAME "") +if (LINUX) +set(CUDNN_LIB_NAME "libcudnn.so") +endif(LINUX) + +if(WIN32) +# only support cudnn7 +set(CUDNN_LIB_NAME "cudnn.lib" "cudnn64_7.dll") +endif(WIN32) + +if(APPLE) +set(CUDNN_LIB_NAME "libcudnn.dylib" "libcudnn.so") +endif(APPLE) + +find_library(CUDNN_LIBRARY NAMES ${CUDNN_LIB_NAME} # libcudnn_static.a + PATHS ${CUDNN_CHECK_LIBRARY_DIRS} ${CUDNN_INCLUDE_DIR} ${__libpath_hist} + NO_DEFAULT_PATH + DOC "Path to cuDNN library.") + + +if(CUDNN_INCLUDE_DIR AND CUDNN_LIBRARY) + set(CUDNN_FOUND ON) +else() + set(CUDNN_FOUND OFF) +endif() + +if(CUDNN_FOUND) + file(READ ${CUDNN_INCLUDE_DIR}/cudnn.h CUDNN_VERSION_FILE_CONTENTS) + + get_filename_component(CUDNN_LIB_PATH ${CUDNN_LIBRARY} DIRECTORY) + + string(REGEX MATCH "define CUDNN_VERSION +([0-9]+)" + CUDNN_VERSION "${CUDNN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define CUDNN_VERSION +([0-9]+)" "\\1" + CUDNN_VERSION "${CUDNN_VERSION}") + + if("${CUDNN_VERSION}" STREQUAL "2000") + message(STATUS "Current cuDNN version is v2. ") + else() + string(REGEX MATCH "define CUDNN_MAJOR +([0-9]+)" CUDNN_MAJOR_VERSION + "${CUDNN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define CUDNN_MAJOR +([0-9]+)" "\\1" + CUDNN_MAJOR_VERSION "${CUDNN_MAJOR_VERSION}") + string(REGEX MATCH "define CUDNN_MINOR +([0-9]+)" CUDNN_MINOR_VERSION + "${CUDNN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define CUDNN_MINOR +([0-9]+)" "\\1" + CUDNN_MINOR_VERSION "${CUDNN_MINOR_VERSION}") + string(REGEX MATCH "define CUDNN_PATCHLEVEL +([0-9]+)" + CUDNN_PATCHLEVEL_VERSION "${CUDNN_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define CUDNN_PATCHLEVEL +([0-9]+)" "\\1" + CUDNN_PATCHLEVEL_VERSION "${CUDNN_PATCHLEVEL_VERSION}") + + if(NOT CUDNN_MAJOR_VERSION) + set(CUDNN_VERSION "???") + else() + add_definitions("-DPADDLE_CUDNN_BINVER=\"${CUDNN_MAJOR_VERSION}\"") + math(EXPR CUDNN_VERSION + "${CUDNN_MAJOR_VERSION} * 1000 + + ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}") + endif() + + message(STATUS "Current cuDNN header is ${CUDNN_INCLUDE_DIR}/cudnn.h. " + "Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}. ") + + endif() +endif() diff --git a/cmake/cupti.cmake b/cmake/cupti.cmake new file mode 100644 index 00000000..72ed0f1e --- /dev/null +++ b/cmake/cupti.cmake @@ -0,0 +1,41 @@ +if(NOT WITH_GPU) + return() +endif() + + +set(CUPTI_ROOT "/usr" CACHE PATH "CUPTI ROOT") +find_path(CUPTI_INCLUDE_DIR cupti.h + PATHS ${CUPTI_ROOT} ${CUPTI_ROOT}/include + $ENV{CUPTI_ROOT} $ENV{CUPTI_ROOT}/include + ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/include + NO_DEFAULT_PATH + ) + +get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH) + +set(TARGET_ARCH "x86_64") +if(NOT ${CMAKE_SYSTEM_PROCESSOR}) + set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR}) +endif() + +list(APPEND CUPTI_CHECK_LIBRARY_DIRS + ${CUPTI_ROOT} + ${CUPTI_ROOT}/lib64 + ${CUPTI_ROOT}/lib + ${CUPTI_ROOT}/lib/${TARGET_ARCH}-linux-gnu + $ENV{CUPTI_ROOT} + $ENV{CUPTI_ROOT}/lib64 + $ENV{CUPTI_ROOT}/lib + /usr/lib + ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/lib64) +find_library(CUPTI_LIBRARY NAMES libcupti.so libcupti.dylib # libcupti_static.a + PATHS ${CUPTI_CHECK_LIBRARY_DIRS} ${CUPTI_INCLUDE_DIR} ${__libpath_hist} + NO_DEFAULT_PATH + DOC "Path to cuPTI library.") + +get_filename_component(CUPTI_LIBRARY_PATH ${CUPTI_LIBRARY} DIRECTORY) +if(CUPTI_INCLUDE_DIR AND CUPTI_LIBRARY) + set(CUPTI_FOUND ON) +else() + set(CUPTI_FOUND OFF) +endif() diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake new file mode 100644 index 00000000..ba8b5fc6 --- /dev/null +++ b/cmake/external/boost.cmake @@ -0,0 +1,60 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +include(ExternalProject) + +set(BOOST_PROJECT "extern_boost") +# To release PaddlePaddle as a pip package, we have to follow the +# manylinux1 standard, which features as old Linux kernels and +# compilers as possible and recommends CentOS 5. Indeed, the earliest +# CentOS version that works with NVIDIA CUDA is CentOS 6. And a new +# version of boost, say, 1.66.0, doesn't build on CentOS 6. We +# checked that the devtools package of CentOS 6 installs boost 1.41.0. +# So we use 1.41.0 here. +set(BOOST_VER "1.41.0") +set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE) +set(BOOST_URL "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE) + +MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}") + +set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost) +set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}") + +set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}" CACHE PATH "boost include directory." FORCE) +set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1) +include_directories(${BOOST_INCLUDE_DIR}) + +ExternalProject_Add( + ${BOOST_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + DOWNLOAD_DIR ${BOOST_DOWNLOAD_DIR} + URL ${BOOST_URL} + DOWNLOAD_NO_PROGRESS 1 + PREFIX ${BOOST_SOURCES_DIR} + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + UPDATE_COMMAND "" + ) + +if (${CMAKE_VERSION} VERSION_LESS "3.3.0" OR NOT WIN32) + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c) + file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";") + add_library(boost STATIC ${dummyfile}) +else() + add_library(boost INTERFACE) +endif() + +add_dependencies(boost ${BOOST_PROJECT}) +set(Boost_INCLUDE_DIR ${BOOST_INCLUDE_DIR}) diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake new file mode 100644 index 00000000..0dd35c09 --- /dev/null +++ b/cmake/external/brpc.cmake @@ -0,0 +1,70 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +INCLUDE(ExternalProject) + +find_package(OpenSSL REQUIRED) + +message(STATUS "ssl:" ${OPENSSL_SSL_LIBRARY}) +message(STATUS "crypto:" ${OPENSSL_CRYPTO_LIBRARY}) + +ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${OPENSSL_SSL_LIBRARY}) + +ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${OPENSSL_CRYPTO_LIBRARY}) + +SET(BRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/brpc) +SET(BRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/brpc) +SET(BRPC_INCLUDE_DIR "${BRPC_INSTALL_DIR}/include" CACHE PATH "brpc include directory." FORCE) +SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc library." FORCE) + +INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR}) + +# Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args +set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog") + +# If minimal .a is need, you can set WITH_DEBUG_SYMBOLS=OFF +ExternalProject_Add( + extern_brpc + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/apache/incubator-brpc" + GIT_TAG "ad00fe940b4f05225b214131959293bbed8744a0" #rdma branch's head now. + PREFIX ${BRPC_SOURCES_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_INSTALL_PREFIX=${BRPC_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=${BRPC_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_PREFIX_PATH=${prefix_path} + -DWITH_GLOG=ON + -DIOBUF_WITH_HUGE_BLOCK=ON + -DBRPC_WITH_RDMA=${WITH_BRPC_RDMA} + ${EXTERNAL_OPTIONAL_ARGS} + LIST_SEPARATOR | + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${BRPC_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR:PATH=${BRPC_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} +) +ADD_DEPENDENCIES(extern_brpc protobuf ssl crypto leveldb gflags glog gtest snappy) +ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES}) +ADD_DEPENDENCIES(brpc extern_brpc) + +add_definitions(-DBRPC_WITH_GLOG) diff --git a/cmake/external/cares.cmake b/cmake/external/cares.cmake new file mode 100644 index 00000000..52507a6a --- /dev/null +++ b/cmake/external/cares.cmake @@ -0,0 +1,45 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +IF(NOT WITH_DISTRIBUTE) + return() +ENDIF() + +include (ExternalProject) + +# NOTE: c-ares is needed when linking with grpc. + +SET(CARES_SOURCES_DIR ${THIRD_PARTY_PATH}/cares) +SET(CARES_INSTALL_DIR ${THIRD_PARTY_PATH}/install/cares) +SET(CARES_INCLUDE_DIR "${CARES_INSTALL_DIR}/include/" CACHE PATH "cares include directory." FORCE) + +ExternalProject_Add( + extern_cares + GIT_REPOSITORY "https://github.com/c-ares/c-ares.git" + GIT_TAG "cares-1_13_0" + PREFIX ${CARES_SOURCES_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND ./buildconf && ./configure --disable-shared --prefix=${CARES_INSTALL_DIR} + BUILD_IN_SOURCE 1 + BUILD_COMMAND make -j8 + INSTALL_COMMAND make install +) + +ADD_LIBRARY(cares STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET cares PROPERTY IMPORTED_LOCATION + "${CARES_INSTALL_DIR}/lib/libcares.a") + +include_directories(${CARES_INCLUDE_DIR}) +ADD_DEPENDENCIES(cares extern_cares) diff --git a/cmake/external/cub.cmake b/cmake/external/cub.cmake new file mode 100644 index 00000000..41ad8207 --- /dev/null +++ b/cmake/external/cub.cmake @@ -0,0 +1,33 @@ +if(NOT WITH_GPU) + return() +endif() + +include(ExternalProject) + +set(CUB_SOURCE_DIR ${THIRD_PARTY_PATH}/cub) +set(CUB_INCLUDE_DIR ${CUB_SOURCE_DIR}/src/extern_cub) + +include_directories(${CUB_INCLUDE_DIR}) + +ExternalProject_Add( + extern_cub + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/NVlabs/cub.git" + GIT_TAG "v1.8.0" + PREFIX ${CUB_SOURCE_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) + +if(${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cub_dummy.c) + file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";") + add_library(cub STATIC ${dummyfile}) +else() + add_library(cub INTERFACE) +endif() + +add_dependencies(cub extern_cub) diff --git a/cmake/external/dgc.cmake b/cmake/external/dgc.cmake new file mode 100644 index 00000000..05e63bfe --- /dev/null +++ b/cmake/external/dgc.cmake @@ -0,0 +1,40 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +INCLUDE(ExternalProject) + +SET(DGC_SOURCES_DIR "${THIRD_PARTY_PATH}/dgc") +SET(DGC_INSTALL_DIR "${THIRD_PARTY_PATH}/install/dgc") +SET(DGC_INCLUDE_DIR "${DGC_INSTALL_DIR}/include" CACHE PATH "dgc include directory." FORCE) +SET(DGC_LIBRARIES "${DGC_INSTALL_DIR}/lib/libdgc.a" CACHE FILEPATH "dgc library." FORCE) +INCLUDE_DIRECTORIES(${DGC_INCLUDE_DIR}) + +ExternalProject_Add( + extern_dgc + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/PaddlePaddle/Fleet" + GIT_TAG "2d04dc3800cdd0601f1b65d547dabcc60b0cf9dc" + SOURCE_DIR "${DGC_SOURCES_DIR}" + CONFIGURE_COMMAND "" + BUILD_COMMAND cd collective && make -j + INSTALL_COMMAND mkdir -p ${DGC_INSTALL_DIR}/lib/ ${DGC_INCLUDE_DIR}/dgc + && cp ${DGC_SOURCES_DIR}/collective/build/lib/libdgc.a ${DGC_LIBRARIES} + && cp ${DGC_SOURCES_DIR}/collective/build/include/dgc.h ${DGC_INCLUDE_DIR}/dgc/ + BUILD_IN_SOURCE 1 +) + +ADD_LIBRARY(dgc STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET dgc PROPERTY IMPORTED_LOCATION ${DGC_LIBRARIES}) +ADD_DEPENDENCIES(dgc extern_dgc) + diff --git a/cmake/external/dlpack.cmake b/cmake/external/dlpack.cmake new file mode 100644 index 00000000..63dd16b2 --- /dev/null +++ b/cmake/external/dlpack.cmake @@ -0,0 +1,29 @@ +include(ExternalProject) + +set(DLPACK_SOURCE_DIR ${THIRD_PARTY_PATH}/dlpack) +set(DLPACK_INCLUDE_DIR ${DLPACK_SOURCE_DIR}/src/extern_dlpack/include) + +include_directories(${DLPACK_INCLUDE_DIR}) + +ExternalProject_Add( + extern_dlpack + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/dmlc/dlpack.git" + GIT_TAG "v0.2" + PREFIX ${DLPACK_SOURCE_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) + +if(${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/dlpack_dummy.c) + file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";") + add_library(dlpack STATIC ${dummyfile}) +else() + add_library(dlpack INTERFACE) +endif() + +add_dependencies(dlpack extern_dlpack) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake new file mode 100644 index 00000000..d6d4b79c --- /dev/null +++ b/cmake/external/eigen.cmake @@ -0,0 +1,62 @@ +INCLUDE(ExternalProject) + +SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3) +SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3) +INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR}) +if(NOT WITH_FAST_MATH) + # EIGEN_FAST_MATH: https://eigen.tuxfamily.org/dox/TopicPreprocessorDirectives.html + # enables some optimizations which might affect the accuracy of the result. + # This currently enables the SSE vectorization of sin() and cos(), + # and speedups sqrt() for single precision. + # Defined to 1 by default. Define it to 0 to disable. + add_definitions(-DEIGEN_FAST_MATH=0) +endif() + + +if(WIN32) + set(EIGEN_GIT_REPOSITORY https://github.com/wopeizl/eigen-git-mirror) + set(EIGEN_GIT_TAG support_cuda9_win) +else() + set(EIGEN_GIT_REPOSITORY https://github.com/eigenteam/eigen-git-mirror) + set(EIGEN_GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c) +endif() +if(WITH_AMD_GPU) + ExternalProject_Add( + extern_eigen3 + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git" + GIT_TAG 7cb2b6e5a4b4a1efe658abb215cd866c6fb2275e + PREFIX ${EIGEN_SOURCE_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" + ) +else() + ExternalProject_Add( + extern_eigen3 + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "${EIGEN_GIT_REPOSITORY}" + # eigen on cuda9.1 missing header of math_funtions.hpp + # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen + GIT_TAG ${EIGEN_GIT_TAG} + PREFIX ${EIGEN_SOURCE_DIR} + DOWNLOAD_NAME "eigen" + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" + ) +endif() + +if (${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/eigen3_dummy.c) + file(WRITE ${dummyfile} "const char *dummy_eigen3 = \"${dummyfile}\";") + add_library(eigen3 STATIC ${dummyfile}) +else() + add_library(eigen3 INTERFACE) +endif() + +add_dependencies(eigen3 extern_eigen3) diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake new file mode 100644 index 00000000..343b7544 --- /dev/null +++ b/cmake/external/gflags.cmake @@ -0,0 +1,64 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +INCLUDE(ExternalProject) + +SET(GFLAGS_SOURCES_DIR ${THIRD_PARTY_PATH}/gflags) +SET(GFLAGS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gflags) +SET(GFLAGS_INCLUDE_DIR "${GFLAGS_INSTALL_DIR}/include" CACHE PATH "gflags include directory." FORCE) +IF(WIN32) + set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) +ELSE(WIN32) + set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) +ENDIF(WIN32) + +INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR}) + +ExternalProject_Add( + extern_gflags + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/gflags/gflags.git" + GIT_TAG 77592648e3f3be87d6c7123eb81cbad75f9aef5a + PREFIX ${GFLAGS_SOURCES_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DBUILD_STATIC_LIBS=ON + -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DBUILD_TESTING=OFF + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} +) +ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES}) +ADD_DEPENDENCIES(gflags extern_gflags) + +# On Windows (including MinGW), the Shlwapi library is used by gflags if available. +if (WIN32) + include(CheckIncludeFileCXX) + check_include_file_cxx("shlwapi.h" HAVE_SHLWAPI) + if (HAVE_SHLWAPI) + set_property(GLOBAL PROPERTY OS_DEPENDENCY_MODULES shlwapi.lib) + endif(HAVE_SHLWAPI) +endif (WIN32) diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake new file mode 100644 index 00000000..ac629404 --- /dev/null +++ b/cmake/external/glog.cmake @@ -0,0 +1,67 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +INCLUDE(ExternalProject) + +SET(GLOG_SOURCES_DIR ${THIRD_PARTY_PATH}/glog) +SET(GLOG_INSTALL_DIR ${THIRD_PARTY_PATH}/install/glog) +SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include directory." FORCE) + +IF(WIN32) + SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/glog.lib" CACHE FILEPATH "glog library." FORCE) + SET(GLOG_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4267 /wd4530") +ELSE(WIN32) + SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE) + SET(GLOG_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) +ENDIF(WIN32) + +INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR}) + +SET(GLOG_REPOSITORY "https://github.com/google/glog.git") +SET(GLOG_TAG "v0.3.5") + +ExternalProject_Add( + extern_glog + ${EXTERNAL_PROJECT_LOG_ARGS} + DEPENDS gflags + GIT_REPOSITORY ${GLOG_REPOSITORY} + GIT_TAG ${GLOG_TAG} + PREFIX ${GLOG_SOURCES_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${GLOG_CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DWITH_GFLAGS=ON + -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags + -DBUILD_TESTING=OFF + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR:PATH=${GLOG_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} +) + +ADD_LIBRARY(glog STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES}) +ADD_DEPENDENCIES(glog extern_glog gflags) +LINK_LIBRARIES(glog gflags) diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake new file mode 100644 index 00000000..d96da470 --- /dev/null +++ b/cmake/external/grpc.cmake @@ -0,0 +1,78 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +IF(NOT WITH_DISTRIBUTE) + return() +ENDIF() + +include (ExternalProject) + +SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc) +SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc) +SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE) +SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE) + +include(ProcessorCount) +ProcessorCount(NUM_OF_PROCESSOR) + +IF(APPLE) + SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin | sed "s/-Werror//g" | sh) +ELSE() + SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin) +ENDIF() + +# FIXME(wuyi): do not build zlib cares protobuf twice, find a way to build grpc with them +ExternalProject_Add( + extern_grpc + DEPENDS protobuf zlib + # NOTE(wuyi): + # this package is generated by following steps: + # 1. git clone -b v1.8.x https://github.com/grpc/grpc.git + # 2. git submodule update --init + # 3. keep only zlib, cares, protobuf, boringssl under "third_party", + # checkout and clean other dirs under third_party + # 4. remove .git, and package the directory. + URL "http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x.tar.gz" + URL_MD5 "1f268a2aff6759839dccd256adcc91cf" + PREFIX ${GRPC_SOURCES_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_IN_SOURCE 1 + PATCH_COMMAND cp ${PADDLE_SOURCE_DIR}/patches/grpc/grpc_library.h ${GRPC_SOURCES_DIR}/src/extern_grpc/include/grpcpp/impl/codegen/grpc_library.h && cp ${PADDLE_SOURCE_DIR}/patches/grpc/completion_queue.h ${GRPC_SOURCES_DIR}/src/extern_grpc/include/grpcpp/impl/codegen/completion_queue.h + # NOTE(yuyang18): + # Disable -Werror, otherwise the compile will fail in MacOS. + # It seems that we cannot configure that by make command. + # Just dry run make command and remove `-Werror`, then use a shell to run make commands + BUILD_COMMAND ${BUILD_CMD} + INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install +) + +ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION + "${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a") + +ADD_LIBRARY(grpc++ STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET grpc++ PROPERTY IMPORTED_LOCATION + "${GRPC_INSTALL_DIR}/lib/libgrpc++.a") +ADD_LIBRARY(gpr STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET gpr PROPERTY IMPORTED_LOCATION + "${GRPC_INSTALL_DIR}/lib/libgpr.a") + +ADD_LIBRARY(grpc_unsecure STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET grpc_unsecure PROPERTY IMPORTED_LOCATION + "${GRPC_INSTALL_DIR}/lib/libgrpc_unsecure.a") + +include_directories(${GRPC_INCLUDE_DIR}) +ADD_DEPENDENCIES(grpc++_unsecure extern_grpc) diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake new file mode 100644 index 00000000..e4595265 --- /dev/null +++ b/cmake/external/gtest.cmake @@ -0,0 +1,82 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#FIXME:(gongwb) Move brpc's gtest dependency. +IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC)) + IF(WITH_TESTING) + ENABLE_TESTING() + ENDIF(WITH_TESTING) + + INCLUDE(ExternalProject) + + SET(GTEST_SOURCES_DIR ${THIRD_PARTY_PATH}/gtest) + SET(GTEST_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gtest) + SET(GTEST_INCLUDE_DIR "${GTEST_INSTALL_DIR}/include" CACHE PATH "gtest include directory." FORCE) + + INCLUDE_DIRECTORIES(${GTEST_INCLUDE_DIR}) + + IF(WIN32) + set(GTEST_LIBRARIES + "${GTEST_INSTALL_DIR}/lib/gtest.lib" CACHE FILEPATH "gtest libraries." FORCE) + set(GTEST_MAIN_LIBRARIES + "${GTEST_INSTALL_DIR}/lib/gtest_main.lib" CACHE FILEPATH "gtest main libraries." FORCE) + ELSE(WIN32) + set(GTEST_LIBRARIES + "${GTEST_INSTALL_DIR}/lib/libgtest.a" CACHE FILEPATH "gtest libraries." FORCE) + set(GTEST_MAIN_LIBRARIES + "${GTEST_INSTALL_DIR}/lib/libgtest_main.a" CACHE FILEPATH "gtest main libraries." FORCE) + ENDIF(WIN32) + + IF(WITH_MKLML) + # wait for mklml downloading completed + SET(GTEST_DEPENDS ${MKLML_PROJECT}) + ENDIF() + + ExternalProject_Add( + extern_gtest + ${EXTERNAL_PROJECT_LOG_ARGS} + DEPENDS ${GTEST_DEPENDS} + GIT_REPOSITORY "https://github.com/google/googletest.git" + GIT_TAG "release-1.8.0" + PREFIX ${GTEST_SOURCES_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DBUILD_GMOCK=ON + -Dgtest_disable_pthreads=ON + -Dgtest_force_shared_crt=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + ) + + ADD_LIBRARY(gtest STATIC IMPORTED GLOBAL) + SET_PROPERTY(TARGET gtest PROPERTY IMPORTED_LOCATION ${GTEST_LIBRARIES}) + ADD_DEPENDENCIES(gtest extern_gtest) + + ADD_LIBRARY(gtest_main STATIC IMPORTED GLOBAL) + SET_PROPERTY(TARGET gtest_main PROPERTY IMPORTED_LOCATION ${GTEST_MAIN_LIBRARIES}) + ADD_DEPENDENCIES(gtest_main extern_gtest) + +ENDIF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC)) diff --git a/cmake/external/leveldb.cmake b/cmake/external/leveldb.cmake new file mode 100644 index 00000000..ac0febd0 --- /dev/null +++ b/cmake/external/leveldb.cmake @@ -0,0 +1,41 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +INCLUDE(ExternalProject) + +SET(LEVELDB_SOURCES_DIR ${THIRD_PARTY_PATH}/leveldb) +SET(LEVELDB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/leveldb) +SET(LEVELDB_INCLUDE_DIR "${LEVELDB_INSTALL_DIR}/include" CACHE PATH "leveldb include directory." FORCE) +SET(LEVELDB_LIBRARIES "${LEVELDB_INSTALL_DIR}/lib/libleveldb.a" CACHE FILEPATH "leveldb library." FORCE) +INCLUDE_DIRECTORIES(${LEVELDB_INCLUDE_DIR}) + +ExternalProject_Add( + extern_leveldb + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${LEVELDB_SOURCES_DIR} + GIT_REPOSITORY "https://github.com/google/leveldb" + GIT_TAG v1.18 + CONFIGURE_COMMAND "" + BUILD_COMMAND CXXFLAGS=-fPIC make -j ${NUM_OF_PROCESSOR} libleveldb.a + INSTALL_COMMAND mkdir -p ${LEVELDB_INSTALL_DIR}/lib/ + && cp ${LEVELDB_SOURCES_DIR}/src/extern_leveldb/libleveldb.a ${LEVELDB_LIBRARIES} + && cp -r ${LEVELDB_SOURCES_DIR}/src/extern_leveldb/include ${LEVELDB_INSTALL_DIR}/ + BUILD_IN_SOURCE 1 +) + +ADD_DEPENDENCIES(extern_leveldb snappy) + +ADD_LIBRARY(leveldb STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET leveldb PROPERTY IMPORTED_LOCATION ${LEVELDB_LIBRARIES}) +ADD_DEPENDENCIES(leveldb extern_leveldb) diff --git a/cmake/external/libmct.cmake b/cmake/external/libmct.cmake new file mode 100644 index 00000000..717e0213 --- /dev/null +++ b/cmake/external/libmct.cmake @@ -0,0 +1,75 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +IF(NOT ${WITH_LIBMCT}) + return() +ENDIF(NOT ${WITH_LIBMCT}) + +IF(WIN32 OR APPLE) + MESSAGE(WARNING + "Windows or Mac is not supported with LIBMCT in Paddle yet." + "Force WITH_LIBMCT=OFF") + SET(WITH_LIBMCT OFF CACHE STRING "Disable LIBMCT package in Windows and MacOS" FORCE) + return() +ENDIF() + +INCLUDE(ExternalProject) + +SET(LIBMCT_PROJECT "extern_libmct") +IF((NOT DEFINED LIBMCT_VER) OR (NOT DEFINED LIBMCT_URL)) + MESSAGE(STATUS "use pre defined download url") + SET(LIBMCT_VER "0.1.0" CACHE STRING "" FORCE) + SET(LIBMCT_NAME "libmct" CACHE STRING "" FORCE) + SET(LIBMCT_URL "https://pslib.bj.bcebos.com/libmct.tar.gz" CACHE STRING "" FORCE) +ENDIF() +MESSAGE(STATUS "LIBMCT_NAME: ${LIBMCT_NAME}, LIBMCT_URL: ${LIBMCT_URL}") +SET(LIBMCT_SOURCE_DIR "${THIRD_PARTY_PATH}/libmct") +SET(LIBMCT_DOWNLOAD_DIR "${LIBMCT_SOURCE_DIR}/src/${LIBMCT_PROJECT}") +SET(LIBMCT_DST_DIR "libmct") +SET(LIBMCT_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") +SET(LIBMCT_INSTALL_DIR ${LIBMCT_INSTALL_ROOT}/${LIBMCT_DST_DIR}) +SET(LIBMCT_ROOT ${LIBMCT_INSTALL_DIR}) +SET(LIBMCT_INC_DIR ${LIBMCT_ROOT}/include) +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${LIBMCT_ROOT}/lib") + +INCLUDE_DIRECTORIES(${LIBMCT_INC_DIR}) + +FILE(WRITE ${LIBMCT_DOWNLOAD_DIR}/CMakeLists.txt + "PROJECT(LIBMCT)\n" + "cmake_minimum_required(VERSION 3.0)\n" + "install(DIRECTORY ${LIBMCT_NAME}/include ${LIBMCT_NAME}/lib \n" + " DESTINATION ${LIBMCT_DST_DIR})\n") + +ExternalProject_Add( + ${LIBMCT_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${LIBMCT_SOURCE_DIR} + DOWNLOAD_DIR ${LIBMCT_DOWNLOAD_DIR} + DOWNLOAD_COMMAND wget --no-check-certificate ${LIBMCT_URL} -c -q -O ${LIBMCT_NAME}.tar.gz + && tar zxvf ${LIBMCT_NAME}.tar.gz + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${LIBMCT_INSTALL_ROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${LIBMCT_INSTALL_ROOT} +) + +if (${CMAKE_VERSION} VERSION_LESS "3.3.0" OR NOT WIN32) + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c) + file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";") + add_library(libmct STATIC ${dummyfile}) +else() + add_library(libmct INTERFACE) +endif() + +ADD_DEPENDENCIES(libmct ${LIBMCT_PROJECT}) diff --git a/cmake/external/libxsmm.cmake b/cmake/external/libxsmm.cmake new file mode 100644 index 00000000..69cdba7c --- /dev/null +++ b/cmake/external/libxsmm.cmake @@ -0,0 +1,55 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +OPTION(WITH_LIBXSMM "Compile with libxsmm" OFF) + +IF(NOT WITH_LIBXSMM) + return() +ENDIF() + +IF(WIN32 OR APPLE) + MESSAGE(WARNING "Windows, Mac are not supported with libxsmm in Paddle yet.") + SET(WITH_LIBXSMM OFF CACHE STRING "Disable LIBXSMM" FORCE) + return() +ENDIF() + +INCLUDE (ExternalProject) + +SET(LIBXSMM_SOURCES_DIR ${THIRD_PARTY_PATH}/libxsmm) +SET(LIBXSMM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/libxsmm) +SET(LIBXSMM_INCLUDE_DIR "${LIBXSMM_INSTALL_DIR}/include" CACHE PATH "LIBXSMM include directory." FORCE) +SET(LIBXSMM_LIBRARY_DIR "${LIBXSMM_INSTALL_DIR}/lib" CACHE PATH "LIBXSMM library directory." FORCE) +SET(LIBXSMM_LIBS "${LIBXSMM_LIBRARY_DIR}/libxsmm.a" + "${LIBXSMM_LIBRARY_DIR}/libxsmmnoblas.a") + +ExternalProject_Add( + extern_libxsmm + GIT_REPOSITORY "https://github.com/hfp/libxsmm.git" + GIT_TAG "7cc03b5b342fdbc6b6d990b190671c5dbb8489a2" + PREFIX ${LIBXSMM_SOURCES_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_IN_SOURCE 1 + BUILD_COMMAND $(MAKE) --silent PREFIX=${LIBXSMM_INSTALL_DIR} CXX=g++ CC=gcc WARP=0 install + INSTALL_COMMAND "" +) +ADD_LIBRARY(libxsmm STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMM_LIBRARY_DIR}/libxsmm.a") +SET_PROPERTY(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMM_LIBRARY_DIR}/libxsmmnoblas.a") + +MESSAGE(STATUS "Libxsmm library: ${LIBXSMM_LIBS}") +include_directories(${LIBXSMM_INCLUDE_DIR}) +ADD_DEFINITIONS(-DPADDLE_WITH_LIBXSMM) +ADD_DEPENDENCIES(libxsmm extern_libxsmm) diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake new file mode 100644 index 00000000..0ca37f50 --- /dev/null +++ b/cmake/external/mkldnn.cmake @@ -0,0 +1,120 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +IF(NOT ${WITH_MKLDNN}) + return() +ENDIF(NOT ${WITH_MKLDNN}) + +INCLUDE(ExternalProject) + +SET(MKLDNN_PROJECT "extern_mkldnn") +SET(MKLDNN_SOURCES_DIR ${THIRD_PARTY_PATH}/mkldnn) +SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) +SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE) + +IF(APPLE) + MESSAGE(WARNING + "Mac is not supported with MKLDNN in Paddle yet." + "Force WITH_MKLDNN=OFF") + SET(WITH_MKLDNN OFF CACHE STRING "Disable MKLDNN in MacOS" FORCE) + return() +ENDIF() + +# Introduce variables: +# * CMAKE_INSTALL_LIBDIR +INCLUDE(GNUInstallDirs) +SET(LIBDIR "lib") +if(CMAKE_INSTALL_LIBDIR MATCHES ".*lib64$") + SET(LIBDIR "lib64") +endif() + +MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/l${LIBDIR} to runtime path") +SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/${LIBDIR}") + +INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR}) # For MKLDNN code to include internal headers. + +IF(${CBLAS_PROVIDER} STREQUAL "MKLML") + SET(MKLDNN_DEPENDS ${MKLML_PROJECT}) + MESSAGE(STATUS "Build MKLDNN with MKLML ${MKLML_ROOT}") +ELSE() + MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN") +ENDIF() + +IF(NOT WIN32) + SET(MKLDNN_FLAG "-Wno-error=strict-overflow -Wno-error=unused-result -Wno-error=array-bounds") + SET(MKLDNN_FLAG "${MKLDNN_FLAG} -Wno-unused-result -Wno-unused-value") + SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} ${MKLDNN_FLAG}") + SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} ${MKLDNN_FLAG}") +ELSE() + SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} /EHsc") +ENDIF(NOT WIN32) + +ExternalProject_Add( + ${MKLDNN_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + DEPENDS ${MKLDNN_DEPENDS} + GIT_REPOSITORY "https://github.com/intel/mkl-dnn.git" + GIT_TAG "aef88b7c233f48f8b945da310f1b973da31ad033" + PREFIX ${MKLDNN_SOURCES_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + CMAKE_ARGS -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + CMAKE_ARGS -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + CMAKE_ARGS -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + CMAKE_ARGS -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} + CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON + CMAKE_ARGS -DMKLROOT=${MKLML_ROOT} + CMAKE_ARGS -DCMAKE_C_FLAGS=${MKLDNN_CFLAG} + CMAKE_ARGS -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG} + CMAKE_ARGS -DWITH_TEST=OFF -DWITH_EXAMPLE=OFF + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR} + -DMKLROOT:PATH=${MKLML_ROOT} +) +if(WIN32) + SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/${LIBDIR}/mkldnn.lib" CACHE FILEPATH "mkldnn library." FORCE) +else(WIN32) + SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/${LIBDIR}/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE) +endif(WIN32) + +ADD_LIBRARY(shared_mkldnn SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET shared_mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB}) +ADD_DEPENDENCIES(shared_mkldnn ${MKLDNN_PROJECT}) +MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}") +add_definitions(-DPADDLE_WITH_MKLDNN) + +# generate a static dummy target to track mkldnn dependencies +# for cc_library(xxx SRCS xxx.c DEPS mkldnn) +SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/mkldnn_dummy.c) +FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";") +ADD_LIBRARY(mkldnn STATIC ${dummyfile}) +TARGET_LINK_LIBRARIES(mkldnn ${MKLDNN_LIB} ${MKLML_LIB} ${MKLML_IOMP_LIB}) +ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT}) + +# copy the real so.0 lib to install dir +# it can be directly contained in wheel or capi +if(WIN32) + SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/bin/mkldnn.dll) +else(WIN32) + SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0) + ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB} + COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB} + DEPENDS mkldnn shared_mkldnn) +endif(WIN32) +ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB}) +ADD_DEPENDENCIES(mkldnn_shared_lib ${MKLDNN_PROJECT} mkldnn) diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake new file mode 100644 index 00000000..06681129 --- /dev/null +++ b/cmake/external/mklml.cmake @@ -0,0 +1,78 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +IF(NOT ${WITH_MKLML}) + return() +ENDIF(NOT ${WITH_MKLML}) + +IF(APPLE) + MESSAGE(WARNING "Mac is not supported with MKLML in Paddle yet. Force WITH_MKLML=OFF.") + SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in MacOS" FORCE) + return() +ENDIF() + +INCLUDE(ExternalProject) +SET(MKLML_DST_DIR "mklml") +SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") +SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR}) +SET(MKLML_ROOT ${MKLML_INSTALL_DIR}) +SET(MKLML_INC_DIR ${MKLML_ROOT}/include) +SET(MKLML_LIB_DIR ${MKLML_ROOT}/lib) +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib") + +SET(TIME_VERSION "2019.0.1.20181227") +IF(WIN32) + SET(MKLML_VER "mklml_win_${TIME_VERSION}" CACHE STRING "" FORCE) + SET(MKLML_URL "https://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.zip" CACHE STRING "" FORCE) + SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib) + SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib) + SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll) + SET(MKLML_SHARED_LIB_DEPS ${MKLML_LIB_DIR}/msvcr120.dll) + SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll) +ELSE() + #TODO(intel-huying): + # Now enable Erf function in mklml library temporarily, it will be updated as offical version later. + SET(MKLML_VER "Glibc225_vsErf_mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE) + SET(MKLML_URL "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) + SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) + SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) + SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) + SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) +ENDIF() + +SET(MKLML_PROJECT "extern_mklml") +MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}") +SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml") +SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}") + +ExternalProject_Add( + ${MKLML_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${MKLML_SOURCE_DIR} + URL ${MKLML_URL} + DOWNLOAD_DIR ${MKLML_DOWNLOAD_DIR} + DOWNLOAD_NO_PROGRESS 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + UPDATE_COMMAND "" + INSTALL_COMMAND + ${CMAKE_COMMAND} -E copy_directory ${MKLML_DOWNLOAD_DIR}/include ${MKLML_INC_DIR} && + ${CMAKE_COMMAND} -E copy_directory ${MKLML_DOWNLOAD_DIR}/lib ${MKLML_LIB_DIR} +) + +INCLUDE_DIRECTORIES(${MKLML_INC_DIR}) + +ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB}) +ADD_DEPENDENCIES(mklml ${MKLML_PROJECT}) diff --git a/cmake/external/ngraph.cmake b/cmake/external/ngraph.cmake new file mode 100644 index 00000000..cdcbdd46 --- /dev/null +++ b/cmake/external/ngraph.cmake @@ -0,0 +1,84 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +add_library(ngraph INTERFACE) + +IF(WIN32 OR APPLE) + MESSAGE(WARNING + "Windows or Mac is not supported with nGraph in Paddle yet." + "Force WITH_NGRAPH=OFF") + SET(WITH_NGRAPH OFF CACHE STRING "Disable nGraph in Windows and MacOS" FORCE) +ENDIF() + +IF(${WITH_NGRAPH} AND NOT ${WITH_MKLDNN}) + MESSAGE(WARNING + "nGraph needs mkl-dnn to be enabled." + "Force WITH_NGRAPH=OFF") + SET(WITH_NGRAPH OFF CACHE STRING "Disable nGraph if mkl-dnn is disabled" FORCE) +ENDIF() + +IF(NOT ${WITH_NGRAPH}) + return() +ENDIF() + +INCLUDE(GNUInstallDirs) + +INCLUDE(ExternalProject) + +SET(NGRAPH_PROJECT "extern_ngraph") +SET(NGRAPH_GIT_TAG "4ec94acc11084a5d53418f565529310fa584899a") +SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph) +SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph) +SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include) +SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}) +SET(NGRAPH_SHARED_LIB_NAME libngraph.so) +SET(NGRAPH_CPU_LIB_NAME libcpu_backend.so) +if(CMAKE_BUILD_TYPE STREQUAL "Debug") + SET(NGRAPH_TBB_LIB_NAME libtbb_debug.so.2) +else() + SET(NGRAPH_TBB_LIB_NAME libtbb.so.2) +endif() +SET(NGRAPH_GIT_REPO "https://github.com/NervanaSystems/ngraph.git") +SET(NGRAPH_SHARED_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME}) +SET(NGRAPH_CPU_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME}) +SET(NGRAPH_TBB_LIB ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME}) + +ExternalProject_Add( + ${NGRAPH_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + DEPENDS ${MKLDNN_PROJECT} ${MKLML_PROJECT} + GIT_REPOSITORY ${NGRAPH_GIT_REPO} + GIT_TAG ${NGRAPH_GIT_TAG} + PREFIX ${NGRAPH_SOURCES_DIR} + UPDATE_COMMAND "" + CMAKE_GENERATOR ${CMAKE_GENERATOR} + CMAKE_GENERATOR_PLATFORM ${CMAKE_GENERATOR_PLATFORM} + CMAKE_GENERATOR_TOOLSET ${CMAKE_GENERATOR_TOOLSET} + CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${NGRAPH_INSTALL_DIR} + CMAKE_ARGS -DNGRAPH_UNIT_TEST_ENABLE=FALSE + CMAKE_ARGS -DNGRAPH_TOOLS_ENABLE=FALSE + CMAKE_ARGS -DNGRAPH_INTERPRETER_ENABLE=FALSE + CMAKE_ARGS -DNGRAPH_DEX_ONLY=TRUE + CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + CMAKE_ARGS -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR} + CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR} + CMAKE_ARGS -DMKLML_LIB_DIR=${MKLML_INSTALL_DIR}/lib +) + +add_dependencies(ngraph ${NGRAPH_PROJECT}) +target_compile_definitions(ngraph INTERFACE -DPADDLE_WITH_NGRAPH) +target_include_directories(ngraph INTERFACE ${NGRAPH_INC_DIR}) +target_link_libraries(ngraph INTERFACE ${NGRAPH_SHARED_LIB}) diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake new file mode 100644 index 00000000..d8a4a0be --- /dev/null +++ b/cmake/external/openblas.cmake @@ -0,0 +1,93 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +INCLUDE(cblas) + +IF(NOT ${CBLAS_FOUND}) + INCLUDE(ExternalProject) + + SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas) + SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas) + SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE) + + SET(CBLAS_LIBRARIES + "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}" + CACHE FILEPATH "openblas library." FORCE) + + ADD_DEFINITIONS(-DPADDLE_USE_OPENBLAS) + + IF (WIN32) + SET(CBLAS_FOUND true) + MESSAGE(WARNING, "In windows, openblas only support msvc build, please build it manually and put it at " ${CBLAS_INSTALL_DIR}) + ENDIF(WIN32) + + IF (NOT WIN32) + SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable") + SET(OPENBLAS_COMMIT "v0.2.20") + + IF(APPLE) + SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}") + ENDIF() + SET(OPTIONAL_ARGS "") + IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$") + SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64) + ENDIF() + + SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs) + ExternalProject_Add( + extern_openblas + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git + GIT_TAG ${OPENBLAS_COMMIT} + PREFIX ${CBLAS_SOURCES_DIR} + INSTALL_DIR ${CBLAS_INSTALL_DIR} + BUILD_IN_SOURCE 1 + BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} ${COMMON_ARGS} ${OPTIONAL_ARGS} + INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 NO_LAPACK=1 PREFIX= + && rm -r ${CBLAS_INSTALL_DIR}/lib/cmake ${CBLAS_INSTALL_DIR}/lib/pkgconfig + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + ) + ELSE() + ENDIF(NOT WIN32) + SET(CBLAS_PROVIDER openblas) +ENDIF(NOT ${CBLAS_FOUND}) + +MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}") +MESSAGE(STATUS "BLAS Include: ${CBLAS_INC_DIR}") +INCLUDE_DIRECTORIES(${CBLAS_INC_DIR}) + +# FIXME(gangliao): generate cblas target to track all high performance +# linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas) +SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c) +FILE(WRITE ${dummyfile} "const char *dummy_cblas = \"${dummyfile}\";") +ADD_LIBRARY(cblas STATIC ${dummyfile}) + +IF("${CBLAS_PROVIDER}" STREQUAL "MKLML") + TARGET_LINK_LIBRARIES(cblas dynload_mklml) +ELSE() + TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES}) +ENDIF("${CBLAS_PROVIDER}" STREQUAL "MKLML") + +IF(WITH_LIBXSMM) + TARGET_LINK_LIBRARIES(cblas ${LIBXSMM_LIBS}) + ADD_DEPENDENCIES(cblas extern_libxsmm) +ENDIF() + +IF(NOT ${CBLAS_FOUND}) + ADD_DEPENDENCIES(cblas extern_openblas) +ELSE() + IF("${CBLAS_PROVIDER}" STREQUAL "MKLML") + ADD_DEPENDENCIES(cblas mklml) + ENDIF() +ENDIF(NOT ${CBLAS_FOUND}) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake new file mode 100644 index 00000000..09eb437a --- /dev/null +++ b/cmake/external/protobuf.cmake @@ -0,0 +1,251 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +INCLUDE(ExternalProject) +# Always invoke `FIND_PACKAGE(Protobuf)` for importing function protobuf_generate_cpp +IF(NOT WIN32) +FIND_PACKAGE(Protobuf QUIET) +ENDIF(NOT WIN32) +macro(UNSET_VAR VAR_NAME) + UNSET(${VAR_NAME} CACHE) + UNSET(${VAR_NAME}) +endmacro() + +UNSET_VAR(PROTOBUF_INCLUDE_DIR) +UNSET_VAR(PROTOBUF_FOUND) +UNSET_VAR(PROTOBUF_PROTOC_EXECUTABLE) +UNSET_VAR(PROTOBUF_PROTOC_LIBRARY) +UNSET_VAR(PROTOBUF_LITE_LIBRARY) +UNSET_VAR(PROTOBUF_LIBRARY) +UNSET_VAR(PROTOBUF_INCLUDE_DIR) +UNSET_VAR(Protobuf_PROTOC_EXECUTABLE) +function(protobuf_generate_python SRCS) + # shameless copy from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake + if(NOT ARGN) + message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files") + return() + endif() + + if(PROTOBUF_GENERATE_CPP_APPEND_PATH) + # Create an include path for each file specified + foreach(FIL ${ARGN}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(ABS_PATH ${ABS_FIL} PATH) + list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${ABS_PATH}) + endif() + endforeach() + else() + set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR}) + endif() + if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS) + set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}") + endif() + + if(DEFINED Protobuf_IMPORT_DIRS) + foreach(DIR ${Protobuf_IMPORT_DIRS}) + get_filename_component(ABS_PATH ${DIR} ABSOLUTE) + list(FIND _protobuf_include_path ${ABS_PATH} _contains_already) + if(${_contains_already} EQUAL -1) + list(APPEND _protobuf_include_path -I ${ABS_PATH}) + endif() + endforeach() + endif() + + set(${SRCS}) + foreach(FIL ${ARGN}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(FIL_WE ${FIL} NAME_WE) + if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH) + get_filename_component(FIL_DIR ${FIL} DIRECTORY) + if(FIL_DIR) + set(FIL_WE "${FIL_DIR}/${FIL_WE}") + endif() + endif() + list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py") + add_custom_command( + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py" + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL} + DEPENDS ${ABS_FIL} ${PROTOBUF_PROTOC_EXECUTABLE} + COMMENT "Running Python protocol buffer compiler on ${FIL}" + VERBATIM ) + endforeach() + + set(${SRCS} ${${SRCS}} PARENT_SCOPE) +endfunction() + +# Print and set the protobuf library information, +# finish this cmake process and exit from this file. +macro(PROMPT_PROTOBUF_LIB) + SET(protobuf_DEPS ${ARGN}) + + MESSAGE(STATUS "Protobuf protoc executable: ${PROTOBUF_PROTOC_EXECUTABLE}") + MESSAGE(STATUS "Protobuf-lite library: ${PROTOBUF_LITE_LIBRARY}") + MESSAGE(STATUS "Protobuf library: ${PROTOBUF_LIBRARY}") + MESSAGE(STATUS "Protoc library: ${PROTOBUF_PROTOC_LIBRARY}") + MESSAGE(STATUS "Protobuf version: ${PROTOBUF_VERSION}") + INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR}) + + # Assuming that all the protobuf libraries are of the same type. + IF(${PROTOBUF_LIBRARY} MATCHES ${CMAKE_STATIC_LIBRARY_SUFFIX}) + SET(protobuf_LIBTYPE STATIC) + ELSEIF(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_SHARED_LIBRARY_SUFFIX}$") + SET(protobuf_LIBTYPE SHARED) + ELSE() + MESSAGE(FATAL_ERROR "Unknown library type: ${PROTOBUF_LIBRARY}") + ENDIF() + + ADD_LIBRARY(protobuf ${protobuf_LIBTYPE} IMPORTED GLOBAL) + SET_PROPERTY(TARGET protobuf PROPERTY IMPORTED_LOCATION ${PROTOBUF_LIBRARY}) + + ADD_LIBRARY(protobuf_lite ${protobuf_LIBTYPE} IMPORTED GLOBAL) + SET_PROPERTY(TARGET protobuf_lite PROPERTY IMPORTED_LOCATION ${PROTOBUF_LITE_LIBRARY}) + + ADD_LIBRARY(libprotoc ${protobuf_LIBTYPE} IMPORTED GLOBAL) + SET_PROPERTY(TARGET libprotoc PROPERTY IMPORTED_LOCATION ${PROTOC_LIBRARY}) + + ADD_EXECUTABLE(protoc IMPORTED GLOBAL) + SET_PROPERTY(TARGET protoc PROPERTY IMPORTED_LOCATION ${PROTOBUF_PROTOC_EXECUTABLE}) + # FIND_Protobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`. + # make `protobuf_generate_cpp` happy. + SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE}) + + FOREACH(dep ${protobuf_DEPS}) + ADD_DEPENDENCIES(protobuf ${dep}) + ADD_DEPENDENCIES(protobuf_lite ${dep}) + ADD_DEPENDENCIES(libprotoc ${dep}) + ADD_DEPENDENCIES(protoc ${dep}) + ENDFOREACH() + + RETURN() +endmacro() +macro(SET_PROTOBUF_VERSION) + EXEC_PROGRAM(${PROTOBUF_PROTOC_EXECUTABLE} ARGS --version OUTPUT_VARIABLE PROTOBUF_VERSION) + STRING(REGEX MATCH "[0-9]+.[0-9]+" PROTOBUF_VERSION "${PROTOBUF_VERSION}") +endmacro() + +set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf") +IF (WIN32) + SET(PROTOBUF_ROOT ${THIRD_PARTY_PATH}/install/protobuf) +ENDIF(WIN32) + +if (NOT "${PROTOBUF_ROOT}" STREQUAL "") + + find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include NO_DEFAULT_PATH) + find_library(PROTOBUF_LIBRARY protobuf libprotobuf.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) + find_library(PROTOBUF_LITE_LIBRARY protobuf-lite libprotobuf-lite.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) + find_library(PROTOBUF_PROTOC_LIBRARY protoc libprotoc.lib PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) + find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin NO_DEFAULT_PATH) + if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE) + message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.") + SET(PROTOBUF_FOUND true) + SET_PROTOBUF_VERSION() + PROMPT_PROTOBUF_LIB() + else() + message(WARNING "Cannot find protobuf library in ${PROTOBUF_ROOT}") + endif() +endif() + +FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) + STRING(REPLACE "extern_" "" TARGET_DIR_NAME "${TARGET_NAME}") + SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/${TARGET_DIR_NAME}) + SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/${TARGET_DIR_NAME}) + + SET(${TARGET_NAME}_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" PARENT_SCOPE) + SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" PARENT_SCOPE) + SET(${TARGET_NAME}_LITE_LIBRARY + "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${CMAKE_STATIC_LIBRARY_SUFFIX}" + PARENT_SCOPE) + SET(${TARGET_NAME}_LIBRARY + "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${CMAKE_STATIC_LIBRARY_SUFFIX}" + PARENT_SCOPE) + SET(${TARGET_NAME}_PROTOC_LIBRARY + "${PROTOBUF_INSTALL_DIR}/lib/libprotoc${CMAKE_STATIC_LIBRARY_SUFFIX}" + PARENT_SCOPE) + SET(${TARGET_NAME}_PROTOC_EXECUTABLE + "${PROTOBUF_INSTALL_DIR}/bin/protoc${CMAKE_EXECUTABLE_SUFFIX}" + PARENT_SCOPE) + + SET(OPTIONAL_CACHE_ARGS "") + SET(OPTIONAL_ARGS "") + IF(BUILD_FOR_HOST) + SET(OPTIONAL_ARGS "-Dprotobuf_WITH_ZLIB=OFF") + ELSE() + SET(OPTIONAL_ARGS + "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" + "-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}" + "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}" + "-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}" + "-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}" + "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" + "-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}" + "-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}" + "-Dprotobuf_WITH_ZLIB=ON" + "-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}" + ${EXTERNAL_OPTIONAL_ARGS}) + SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}") + ENDIF() + IF(WIN32) + SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-DCMAKE_GENERATOR_PLATFORM=x64") + ENDIF() + + SET(PROTOBUF_REPO "https://github.com/protocolbuffers/protobuf.git") + SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") + + ExternalProject_Add( + ${TARGET_NAME} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${PROTOBUF_SOURCES_DIR} + UPDATE_COMMAND "" + DEPENDS zlib + GIT_REPOSITORY ${PROTOBUF_REPO} + GIT_TAG ${PROTOBUF_TAG} + CONFIGURE_COMMAND + ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/${TARGET_NAME}/cmake + ${OPTIONAL_ARGS} + -Dprotobuf_BUILD_TESTS=OFF + -DCMAKE_SKIP_RPATH=ON + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=lib + -DBUILD_SHARED_LIBS=OFF + CMAKE_CACHE_ARGS + -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR} + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + ${OPTIONAL_CACHE_ARGS} + ) +ENDFUNCTION() + +SET(PROTOBUF_VERSION 3.1.0) + +IF(NOT PROTOBUF_FOUND) + build_protobuf(extern_protobuf FALSE) + + SET(PROTOBUF_INCLUDE_DIR ${extern_protobuf_INCLUDE_DIR} + CACHE PATH "protobuf include directory." FORCE) + SET(PROTOBUF_LITE_LIBRARY ${extern_protobuf_LITE_LIBRARY} + CACHE FILEPATH "protobuf lite library." FORCE) + SET(PROTOBUF_LIBRARY ${extern_protobuf_LIBRARY} + CACHE FILEPATH "protobuf library." FORCE) + SET(PROTOBUF_PROTOC_LIBRARY ${extern_protobuf_PROTOC_LIBRARY} + CACHE FILEPATH "protoc library." FORCE) + + SET(PROTOBUF_PROTOC_EXECUTABLE ${extern_protobuf_PROTOC_EXECUTABLE} + CACHE FILEPATH "protobuf executable." FORCE) + PROMPT_PROTOBUF_LIB(extern_protobuf) +ENDIF(NOT PROTOBUF_FOUND) diff --git a/cmake/external/pslib.cmake b/cmake/external/pslib.cmake new file mode 100644 index 00000000..99a1c23b --- /dev/null +++ b/cmake/external/pslib.cmake @@ -0,0 +1,72 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +IF(NOT ${WITH_PSLIB}) + return() +ENDIF(NOT ${WITH_PSLIB}) + +IF(WIN32 OR APPLE) + MESSAGE(WARNING + "Windows or Mac is not supported with PSLIB in Paddle yet." + "Force WITH_PSLIB=OFF") + SET(WITH_PSLIB OFF CACHE STRING "Disable PSLIB package in Windows and MacOS" FORCE) + return() +ENDIF() + +INCLUDE(ExternalProject) + +SET(PSLIB_PROJECT "extern_pslib") +IF((NOT DEFINED PSLIB_VER) OR (NOT DEFINED PSLIB_URL)) + MESSAGE(STATUS "use pre defined download url") + SET(PSLIB_VER "0.1.1" CACHE STRING "" FORCE) + SET(PSLIB_NAME "pslib" CACHE STRING "" FORCE) + SET(PSLIB_URL "https://pslib.bj.bcebos.com/pslib.tar.gz" CACHE STRING "" FORCE) +ENDIF() +MESSAGE(STATUS "PSLIB_NAME: ${PSLIB_NAME}, PSLIB_URL: ${PSLIB_URL}") +SET(PSLIB_SOURCE_DIR "${THIRD_PARTY_PATH}/pslib") +SET(PSLIB_DOWNLOAD_DIR "${PSLIB_SOURCE_DIR}/src/${PSLIB_PROJECT}") +SET(PSLIB_DST_DIR "pslib") +SET(PSLIB_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") +SET(PSLIB_INSTALL_DIR ${PSLIB_INSTALL_ROOT}/${PSLIB_DST_DIR}) +SET(PSLIB_ROOT ${PSLIB_INSTALL_DIR}) +SET(PSLIB_INC_DIR ${PSLIB_ROOT}/include) +SET(PSLIB_LIB_DIR ${PSLIB_ROOT}/lib) +SET(PSLIB_LIB ${PSLIB_LIB_DIR}/libps.so) +SET(PSLIB_IOMP_LIB ${PSLIB_LIB_DIR}/libiomp5.so) #todo what is this +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PSLIB_ROOT}/lib") + +INCLUDE_DIRECTORIES(${PSLIB_INC_DIR}) + +FILE(WRITE ${PSLIB_DOWNLOAD_DIR}/CMakeLists.txt + "PROJECT(PSLIB)\n" + "cmake_minimum_required(VERSION 3.0)\n" + "install(DIRECTORY ${PSLIB_NAME}/include ${PSLIB_NAME}/lib \n" + " DESTINATION ${PSLIB_DST_DIR})\n") + +ExternalProject_Add( + ${PSLIB_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${PSLIB_SOURCE_DIR} + DOWNLOAD_DIR ${PSLIB_DOWNLOAD_DIR} + DOWNLOAD_COMMAND wget --no-check-certificate ${PSLIB_URL} -c -q -O ${PSLIB_NAME}.tar.gz + && tar zxvf ${PSLIB_NAME}.tar.gz + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${PSLIB_INSTALL_ROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PSLIB_INSTALL_ROOT} +) + +ADD_LIBRARY(pslib SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB}) +ADD_DEPENDENCIES(pslib ${PSLIB_PROJECT}) diff --git a/cmake/external/pslib_brpc.cmake b/cmake/external/pslib_brpc.cmake new file mode 100644 index 00000000..c1d63089 --- /dev/null +++ b/cmake/external/pslib_brpc.cmake @@ -0,0 +1,72 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +IF(NOT ${WITH_PSLIB_BRPC}) + return() +ENDIF(NOT ${WITH_PSLIB_BRPC}) + +IF(WIN32 OR APPLE) + MESSAGE(WARNING + "Windows or Mac is not supported with PSLIB_BRPC in Paddle yet." + "Force WITH_PSLIB_BRPC=OFF") + SET(WITH_PSLIB_BRPC OFF CACHE STRING "Disable PSLIB_BRPC package in Windows and MacOS" FORCE) + return() +ENDIF() + +INCLUDE(ExternalProject) + +SET(PSLIB_BRPC_PROJECT "extern_pslib_brpc") +IF((NOT DEFINED PSLIB_BRPC_NAME) OR (NOT DEFINED PSLIB_BRPC_URL)) + MESSAGE(STATUS "use pre defined download url") + SET(PSLIB_BRPC_VER "0.1.0" CACHE STRING "" FORCE) + SET(PSLIB_BRPC_NAME "pslib_brpc" CACHE STRING "" FORCE) + SET(PSLIB_BRPC_URL "https://pslib.bj.bcebos.com/pslib_brpc.tar.gz" CACHE STRING "" FORCE) +ENDIF() +MESSAGE(STATUS "PSLIB_BRPC_NAME: ${PSLIB_BRPC_NAME}, PSLIB_BRPC_URL: ${PSLIB_BRPC_URL}") +SET(PSLIB_BRPC_SOURCE_DIR "${THIRD_PARTY_PATH}/pslib_brpc") +SET(PSLIB_BRPC_DOWNLOAD_DIR "${PSLIB_BRPC_SOURCE_DIR}/src/${PSLIB_BRPC_PROJECT}") +SET(PSLIB_BRPC_DST_DIR "pslib_brpc") +SET(PSLIB_BRPC_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") +SET(PSLIB_BRPC_INSTALL_DIR ${PSLIB_BRPC_INSTALL_ROOT}/${PSLIB_BRPC_DST_DIR}) +SET(PSLIB_BRPC_ROOT ${PSLIB_BRPC_INSTALL_DIR}) +SET(PSLIB_BRPC_INC_DIR ${PSLIB_BRPC_ROOT}/include) +SET(PSLIB_BRPC_LIB_DIR ${PSLIB_BRPC_ROOT}/lib) +SET(PSLIB_BRPC_LIB ${PSLIB_BRPC_LIB_DIR}/libbrpc.a) +SET(PSLIB_BRPC_IOMP_LIB ${PSLIB_BRPC_LIB_DIR}/libiomp5.so) #todo what is this +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PSLIB_BRPC_ROOT}/lib") + +INCLUDE_DIRECTORIES(${PSLIB_BRPC_INC_DIR}) + +FILE(WRITE ${PSLIB_BRPC_DOWNLOAD_DIR}/CMakeLists.txt + "PROJECT(PSLIB_BRPC)\n" + "cmake_minimum_required(VERSION 3.0)\n" + "install(DIRECTORY ${PSLIB_BRPC_NAME}/include ${PSLIB_BRPC_NAME}/lib \n" + " DESTINATION ${PSLIB_BRPC_DST_DIR})\n") + +ExternalProject_Add( + ${PSLIB_BRPC_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${PSLIB_BRPC_SOURCE_DIR} + DOWNLOAD_DIR ${PSLIB_BRPC_DOWNLOAD_DIR} + DOWNLOAD_COMMAND wget --no-check-certificate ${PSLIB_BRPC_URL} -c -q -O ${PSLIB_BRPC_NAME}.tar.gz + && tar zxvf ${PSLIB_BRPC_NAME}.tar.gz + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${PSLIB_BRPC_INSTALL_ROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PSLIB_BRPC_INSTALL_ROOT} +) + +ADD_LIBRARY(pslib_brpc SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET pslib_brpc PROPERTY IMPORTED_LOCATION ${PSLIB_BRPC_LIB}) +ADD_DEPENDENCIES(pslib_brpc ${PSLIB_BRPC_PROJECT}) diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake new file mode 100644 index 00000000..3a10ea94 --- /dev/null +++ b/cmake/external/pybind11.cmake @@ -0,0 +1,46 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +if(NOT WITH_PYTHON) + return() +endif() + +include(ExternalProject) + +set(PYBIND_SOURCE_DIR ${THIRD_PARTY_PATH}/pybind) + +include_directories(${PYBIND_SOURCE_DIR}/src/extern_pybind/include) + +ExternalProject_Add( + extern_pybind + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/pybind/pybind11.git" + GIT_TAG "v2.2.4" + PREFIX ${PYBIND_SOURCE_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) + +if(${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/pybind_dummy.c) + file(WRITE ${dummyfile} "const char * dummy_pybind = \"${dummyfile}\";") + add_library(pybind STATIC ${dummyfile}) +else() + add_library(pybind INTERFACE) +endif() + +add_dependencies(pybind extern_pybind) diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake new file mode 100644 index 00000000..623c53f4 --- /dev/null +++ b/cmake/external/python.cmake @@ -0,0 +1,83 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +IF(NOT WITH_PYTHON) + return() +ENDIF() + +INCLUDE(python_module) + +FIND_PACKAGE(PythonInterp ${PY_VERSION} REQUIRED) +FIND_PACKAGE(PythonLibs ${PY_VERSION} REQUIRED) + +if(WIN32) + execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c" +"from distutils import sysconfig as s;import sys;import struct; +print(sys.prefix); +print(s.get_config_var('LDVERSION') or s.get_config_var('VERSION')); +" + RESULT_VARIABLE _PYTHON_SUCCESS + OUTPUT_VARIABLE _PYTHON_VALUES + ERROR_VARIABLE _PYTHON_ERROR_VALUE) + + if(NOT _PYTHON_SUCCESS MATCHES 0) + set(PYTHONLIBS_FOUND FALSE) + return() + endif() + + # Convert the process output into a list + string(REGEX REPLACE ";" "\\\\;" _PYTHON_VALUES ${_PYTHON_VALUES}) + string(REGEX REPLACE "\n" ";" _PYTHON_VALUES ${_PYTHON_VALUES}) + list(GET _PYTHON_VALUES 0 PYTHON_PREFIX) + list(GET _PYTHON_VALUES 1 PYTHON_LIBRARY_SUFFIX) + + # Make sure all directory separators are '/' + string(REGEX REPLACE "\\\\" "/" PYTHON_PREFIX ${PYTHON_PREFIX}) + + set(PYTHON_LIBRARY + "${PYTHON_PREFIX}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib") + + # when run in a venv, PYTHON_PREFIX points to it. But the libraries remain in the + # original python installation. They may be found relative to PYTHON_INCLUDE_DIR. + if(NOT EXISTS "${PYTHON_LIBRARY}") + get_filename_component(_PYTHON_ROOT ${PYTHON_INCLUDE_DIR} DIRECTORY) + set(PYTHON_LIBRARY + "${_PYTHON_ROOT}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib") + endif() + + # raise an error if the python libs are still not found. + if(NOT EXISTS "${PYTHON_LIBRARY}") + message(FATAL_ERROR "Python libraries not found") + endif() + SET(PYTHON_LIBRARIES "${PYTHON_LIBRARY}") +endif(WIN32) + +# Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE. +ADD_LIBRARY(python SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES}) + +SET(py_env "") +IF(PYTHONINTERP_FOUND) + find_python_module(pip REQUIRED) + find_python_module(numpy REQUIRED) + find_python_module(wheel REQUIRED) + find_python_module(google.protobuf REQUIRED) + FIND_PACKAGE(NumPy REQUIRED) + IF(${PY_GOOGLE.PROTOBUF_VERSION} AND ${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0") + MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, " + "please use pip to upgrade protobuf. pip install -U protobuf") + ENDIF() +ENDIF(PYTHONINTERP_FOUND) +INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR}) +INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR}) diff --git a/cmake/external/rocprim.cmake b/cmake/external/rocprim.cmake new file mode 100644 index 00000000..914c0649 --- /dev/null +++ b/cmake/external/rocprim.cmake @@ -0,0 +1,44 @@ +if (NOT WITH_AMD_GPU) + return() +endif() + +# rocprim is "ROCm Parallel Primitives" for short. +# It is a header-only library providing HIP and HC parallel primitives +# for developing performant GPU-accelerated code on AMD ROCm platform. + +if("x${HCC_HOME}" STREQUAL "x") + set(HCC_HOME "/opt/rocm/hcc") +endif() + +INCLUDE(ExternalProject) + +SET(ROCPRIM_SOURCE_DIR ${THIRD_PARTY_PATH}/rocprim) +SET(ROCPRIM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/rocprim) +SET(ROCPRIM_INCLUDE_DIR ${ROCPRIM_INSTALL_DIR}/include) + +ExternalProject_Add( + extern_rocprim + GIT_REPOSITORY "https://github.com/ROCmSoftwarePlatform/rocPRIM.git" + GIT_TAG 5bd41b96ab8d8343330fb2c3e1b96775bde3b3fc + PREFIX ${ROCPRIM_SOURCE_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${HCC_HOME}/bin/hcc + CMAKE_ARGS -DONLY_INSTALL=ON + CMAKE_ARGS -DBUILD_TEST=OFF + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ROCPRIM_INSTALL_DIR} + + INSTALL_DIR ${ROCPRIM_INSTALL_DIR} + ${EXTERNAL_PROJECT_LOG_ARGS} +) + +INCLUDE_DIRECTORIES(${ROCPRIM_INCLUDE_DIR}) + +if (${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/rocprim_dummy.c) + file(WRITE ${dummyfile} "const char *dummy_rocprim = \"${dummyfile}\";") + add_library(rocprim STATIC ${dummyfile}) +else() + add_library(rocprim INTERFACE) +endif() + +add_dependencies(rocprim extern_rocprim) diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake new file mode 100644 index 00000000..3fb6b49f --- /dev/null +++ b/cmake/external/snappy.cmake @@ -0,0 +1,65 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +include (ExternalProject) + +# NOTE: snappy is needed when linking with recordio + +set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy) +set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy) +set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE) + +if(WIN32) + SET(SNAPPY_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4267") +else() + SET(SNAPPY_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) +endif() + +ExternalProject_Add( + extern_snappy + GIT_REPOSITORY "https://github.com/google/snappy" + GIT_TAG "1.1.7" + PREFIX ${SNAPPY_SOURCES_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS=${SNAPPY_CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DBUILD_TESTING=OFF + -DSNAPPY_BUILD_TESTS:BOOL=OFF + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPY_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPY_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} +) +IF(WIN32) + set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/snappy.lib") +else(WIN32) + set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a") +endif (WIN32) + +add_library(snappy STATIC IMPORTED GLOBAL) +set_property(TARGET snappy PROPERTY IMPORTED_LOCATION ${SNAPPY_LIBRARIES}) + +include_directories(${SNAPPY_INCLUDE_DIR}) +add_dependencies(snappy extern_snappy) diff --git a/cmake/external/snappystream.cmake b/cmake/external/snappystream.cmake new file mode 100644 index 00000000..392f186b --- /dev/null +++ b/cmake/external/snappystream.cmake @@ -0,0 +1,63 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +include (ExternalProject) + +set(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream) +set(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream) +set(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include" CACHE PATH "snappy stream include directory." FORCE) + +if(WIN32) + # Fix me, VS2015 come without VLA support + set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/snappystream.lib") + MESSAGE(WARNING, "In windows, snappystream has no compile support for windows, + please build it manually and put it at " ${SNAPPYSTREAM_INSTALL_DIR}) +else(WIN32) + set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a") + + ExternalProject_Add( + extern_snappystream + GIT_REPOSITORY "https://github.com/hoxnox/snappystream.git" + GIT_TAG "0.2.8" + PREFIX ${SNAPPYSTREAM_SOURCES_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + -DSNAPPY_ROOT=${SNAPPY_INSTALL_DIR} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS + -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPYSTREAM_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPYSTREAM_INSTALL_DIR}/lib + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + DEPENDS snappy + ) +endif(WIN32) + +add_library(snappystream STATIC IMPORTED GLOBAL) +set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION ${SNAPPYSTREAM_LIBRARIES}) + +include_directories(${SNAPPYSTREAM_INCLUDE_DIR}) # For snappysteam to include its own headers. +include_directories(${THIRD_PARTY_PATH}/install) # For Paddle to include snappy stream headers. + +add_dependencies(snappystream extern_snappystream) diff --git a/cmake/external/threadpool.cmake b/cmake/external/threadpool.cmake new file mode 100644 index 00000000..1f56bc7a --- /dev/null +++ b/cmake/external/threadpool.cmake @@ -0,0 +1,28 @@ +INCLUDE(ExternalProject) + +SET(THREADPOOL_SOURCE_DIR ${THIRD_PARTY_PATH}/threadpool) +SET(THREADPOOL_INCLUDE_DIR ${THREADPOOL_SOURCE_DIR}/src/extern_threadpool) +INCLUDE_DIRECTORIES(${THREADPOOL_INCLUDE_DIR}) + +ExternalProject_Add( + extern_threadpool + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/progschj/ThreadPool.git" + GIT_TAG 9a42ec1329f259a5f4881a291db1dcb8f2ad9040 + PREFIX ${THREADPOOL_SOURCE_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) + +if (${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/threadpool_dummy.c) + file(WRITE ${dummyfile} "const char *dummy_threadpool = \"${dummyfile}\";") + add_library(simple_threadpool STATIC ${dummyfile}) +else() + add_library(simple_threadpool INTERFACE) +endif() + +add_dependencies(simple_threadpool extern_threadpool) diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake new file mode 100644 index 00000000..5fc46ae8 --- /dev/null +++ b/cmake/external/warpctc.cmake @@ -0,0 +1,81 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +INCLUDE(ExternalProject) + +SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc) +SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc) + +SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" + CACHE PATH "Warp-ctc Directory" FORCE) +# Used in unit test test_WarpCTCLayer +SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib" + CACHE PATH "Warp-ctc Library Directory" FORCE) + +IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR WIN32) + SET(USE_OMP OFF) +ELSE() + SET(USE_OMP ON) +ENDIF() + +IF(WIN32) + SET(WARPCTC_REPOSITORY "https://github.com/wopeizl/warp-ctc.git") +ELSE() + SET(WARPCTC_REPOSITORY "https://github.com/dzhwinter/warp-ctc.git") +ENDIF() + +ExternalProject_Add( + extern_warpctc + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY ${WARPCTC_REPOSITORY} + PREFIX ${WARPCTC_SOURCES_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} + -DWITH_GPU=${WITH_GPU} + -DWITH_OMP=${USE_OMP} + -DWITH_TORCH=OFF + -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON + -DBUILD_SHARED=ON + -DBUILD_TESTS=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} +) +IF(WIN32) + SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-ctc Library" FORCE) +else(WIN32) + SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-ctc Library" FORCE) +ENDIF(WIN32) + +MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}") +get_filename_component(WARPCTC_LIBRARY_PATH ${WARPCTC_LIBRARIES} DIRECTORY) +INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its headers. +INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include warpctc headers. + +ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES}) +ADD_DEPENDENCIES(warpctc extern_warpctc) diff --git a/cmake/external/xbyak.cmake b/cmake/external/xbyak.cmake new file mode 100644 index 00000000..1d61154c --- /dev/null +++ b/cmake/external/xbyak.cmake @@ -0,0 +1,57 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set(WITH_XBYAK ON) +if(WIN32 OR APPLE) + SET(WITH_XBYAK OFF CACHE STRING "Disable XBYAK in Windows and MacOS" FORCE) + return() +endif() + +include(ExternalProject) + +set(XBYAK_PROJECT extern_xbyak) +set(XBYAK_PREFIX_DIR ${THIRD_PARTY_PATH}/xbyak) +set(XBYAK_INSTALL_ROOT ${THIRD_PARTY_PATH}/install/xbyak) +set(XBYAK_INC_DIR ${XBYAK_INSTALL_ROOT}/include) + +include_directories(${XBYAK_INC_DIR}) +include_directories(${XBYAK_INC_DIR}/xbyak) + +add_definitions(-DPADDLE_WITH_XBYAK) + +# xbyak options +add_definitions(-DXBYAK64) +add_definitions(-DXBYAK_NO_OP_NAMES) + +ExternalProject_Add( + ${XBYAK_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + DEPENDS "" + GIT_REPOSITORY "https://github.com/herumi/xbyak.git" + GIT_TAG "v5.661" # Jul 26th + PREFIX ${XBYAK_PREFIX_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XBYAK_INSTALL_ROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XBYAK_INSTALL_ROOT} +) + +if (${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/xbyak_dummy.c) + file(WRITE ${dummyfile} "const char *dummy_xbyak = \"${dummyfile}\";") + add_library(xbyak STATIC ${dummyfile}) +else() + add_library(xbyak INTERFACE) +endif() + +add_dependencies(xbyak ${XBYAK_PROJECT}) diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake new file mode 100644 index 00000000..262d47f6 --- /dev/null +++ b/cmake/external/xxhash.cmake @@ -0,0 +1,68 @@ +INCLUDE(ExternalProject) + +set(XXHASH_SOURCE_DIR ${THIRD_PARTY_PATH}/xxhash) +set(XXHASH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/xxhash) +set(XXHASH_INCLUDE_DIR "${XXHASH_INSTALL_DIR}/include") + +IF(WITH_STATIC_LIB) + SET(BUILD_CMD make lib) +ELSE() + IF(APPLE) + SET(BUILD_CMD sed -i \"\" "s/-Wstrict-prototypes -Wundef/-Wstrict-prototypes -Wundef -fPIC/g" ${XXHASH_SOURCE_DIR}/src/extern_xxhash/Makefile && make lib) + ELSE(APPLE) + SET(BUILD_CMD sed -i "s/-Wstrict-prototypes -Wundef/-Wstrict-prototypes -Wundef -fPIC/g" ${XXHASH_SOURCE_DIR}/src/extern_xxhash/Makefile && make lib) + ENDIF(APPLE) +ENDIF() + +if(WIN32) + ExternalProject_Add( + extern_xxhash + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/Cyan4973/xxHash" + GIT_TAG "v0.6.5" + PREFIX ${XXHASH_SOURCE_DIR} + DOWNLOAD_NAME "xxhash" + UPDATE_COMMAND "" + BUILD_IN_SOURCE 1 + PATCH_COMMAND + CONFIGURE_COMMAND + ${CMAKE_COMMAND} ${XXHASH_SOURCE_DIR}/src/extern_xxhash/cmake_unofficial + -DCMAKE_INSTALL_PREFIX:PATH=${XXHASH_INSTALL_DIR} + -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE} + -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DBUILD_XXHSUM=OFF + -DCMAKE_GENERATOR_PLATFORM=x64 + -DBUILD_SHARED_LIBS=OFF + ${OPTIONAL_CACHE_ARGS} + TEST_COMMAND "" + ) +else() + ExternalProject_Add( + extern_xxhash + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/Cyan4973/xxHash" + GIT_TAG "v0.6.5" + PREFIX ${XXHASH_SOURCE_DIR} + DOWNLOAD_NAME "xxhash" + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_IN_SOURCE 1 + PATCH_COMMAND + BUILD_COMMAND ${BUILD_CMD} + INSTALL_COMMAND export PREFIX=${XXHASH_INSTALL_DIR}/ && make install + TEST_COMMAND "" + ) +endif() + +if (WIN32) + set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/xxhash.lib") +else() + set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a") +endif () +INCLUDE_DIRECTORIES(${XXHASH_INCLUDE_DIR}) + +add_library(xxhash STATIC IMPORTED GLOBAL) +set_property(TARGET xxhash PROPERTY IMPORTED_LOCATION ${XXHASH_LIBRARIES}) +include_directories(${XXHASH_INCLUDE_DIR}) +add_dependencies(xxhash extern_xxhash) diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake new file mode 100644 index 00000000..58881ac2 --- /dev/null +++ b/cmake/external/zlib.cmake @@ -0,0 +1,54 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +INCLUDE(ExternalProject) + +SET(ZLIB_SOURCES_DIR ${THIRD_PARTY_PATH}/zlib) +SET(ZLIB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/zlib) +SET(ZLIB_ROOT ${ZLIB_INSTALL_DIR} CACHE FILEPATH "zlib root directory." FORCE) +SET(ZLIB_INCLUDE_DIR "${ZLIB_INSTALL_DIR}/include" CACHE PATH "zlib include directory." FORCE) + +INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR}) # For zlib code to include its own headers. +INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include zlib.h. + +ExternalProject_Add( + extern_zlib + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/madler/zlib.git" + GIT_TAG "v1.2.8" + PREFIX ${ZLIB_SOURCES_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR} + -DBUILD_SHARED_LIBS=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_MACOSX_RPATH=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ZLIB_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} +) +IF(WIN32) + SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib" CACHE FILEPATH "zlib library." FORCE) +ELSE(WIN32) + SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE) +ENDIF(WIN32) + +ADD_LIBRARY(zlib STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES}) +ADD_DEPENDENCIES(zlib extern_zlib) diff --git a/cmake/flags.cmake b/cmake/flags.cmake new file mode 100644 index 00000000..36b533aa --- /dev/null +++ b/cmake/flags.cmake @@ -0,0 +1,194 @@ +# Setting Paddle Compile Flags +include(CheckCXXCompilerFlag) +include(CheckCCompilerFlag) +include(CheckCXXSymbolExists) +include(CheckTypeSize) + +function(CheckCompilerCXX11Flag) + if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8) + message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.") + endif() + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") + # cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang" + # Apple Clang is a different compiler than upstream Clang which havs different version numbers. + # https://gist.github.com/yamaya/2924292 + if(APPLE) # cmake < 3.0 compiler id "Clang" on Mac OS X + if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 5.1) + message(FATAL_ERROR "Unsupported AppleClang version. AppleClang >= 5.1 required.") + endif() + else() + if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.3) + message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.3 required.") + endif() + endif() + endif() +endfunction() + +CheckCompilerCXX11Flag() +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") +# safe_set_flag +# +# Set a compile flag only if compiler is support +# is_c: is C flag or C++ flag, bool type. +# src_list: The list name which the flag name will be append to. +# flag_name: the flag name for compiler, such as '-Werror' '-Wall' etc +# rest arguments: not used. +function(safe_set_flag is_c src_list flag_name) + string(REPLACE "-" "_" safe_name ${flag_name}) + string(REPLACE "=" "_" safe_name ${safe_name}) + if(is_c) + CHECK_C_COMPILER_FLAG(${flag_name} C_COMPILER_SUPPORT_FLAG_${safe_name}) + set(safe_name C_COMPILER_SUPPORT_FLAG_${safe_name}) + else() + CHECK_CXX_COMPILER_FLAG(${flag_name} CXX_COMPILER_SUPPORT_FLAG_${safe_name}) + set(safe_name CXX_COMPILER_SUPPORT_FLAG_${safe_name}) + endif() + if(${safe_name}) + set(${src_list} "${${src_list}} ${flag_name}" PARENT_SCOPE) + endif() +endfunction() + +# helper macro to set cflag +macro(safe_set_cflag src_list flag_name) + safe_set_flag(ON ${src_list} ${flag_name}) +endmacro() + +# helper macro to set cxxflag +macro(safe_set_cxxflag src_list flag_name) + safe_set_flag(OFF ${src_list} ${flag_name}) +endmacro() + +# helper macro to set nvcc flag +macro(safe_set_nvflag flag_name) + string(REPLACE "-" "_" safe_name ${flag_name}) + string(REPLACE "=" "_" safe_name ${safe_name}) + CHECK_C_COMPILER_FLAG(${flag_name} C_COMPILER_SUPPORT_FLAG_${safe_name}) + set(safe_name C_COMPILER_SUPPORT_FLAG_${safe_name}) + if(${safe_name}) + LIST(APPEND CUDA_NVCC_FLAGS -Xcompiler ${flag_name}) + endif() +endmacro() + +macro(safe_set_static_flag) # set c_flags and cxx_flags to static or shared + if (BUILD_SHARED_LIBS) + return() # if build shared libs, the flags keep same with '/MD' + endif(BUILD_SHARED_LIBS) + foreach(flag_var + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO + CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE + CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) + if(${flag_var} MATCHES "/MD") + string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") + endif(${flag_var} MATCHES "/MD") + endforeach(flag_var) +endmacro() + +CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS) +if(NOT UINT64_MAX_EXISTS) + set(CMAKE_REQUIRED_DEFINITIONS -D__STDC_LIMIT_MACROS) + CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS_HERE) + if(UINT64_MAX_EXISTS_HERE) + set(CMAKE_REQUIRED_DEFINITIONS) + add_definitions(-D__STDC_LIMIT_MACROS) + else() + message(FATAL_ERROR "Cannot find symbol UINT64_MAX") + endif() +endif() + +SET(CMAKE_EXTRA_INCLUDE_FILES "pthread.h") +CHECK_TYPE_SIZE(pthread_spinlock_t SPINLOCK_FOUND) +CHECK_TYPE_SIZE(pthread_barrier_t BARRIER_FOUND) +if(SPINLOCK_FOUND) + add_definitions(-DPADDLE_USE_PTHREAD_SPINLOCK) +endif(SPINLOCK_FOUND) +if(BARRIER_FOUND) + add_definitions(-DPADDLE_USE_PTHREAD_BARRIER) +endif(BARRIER_FOUND) +SET(CMAKE_EXTRA_INCLUDE_FILES "") + +# Common flags. the compiler flag used for C/C++ sources whenever release or debug +# Do not care if this flag is support for gcc. + +# https://github.com/PaddlePaddle/Paddle/issues/12773 +if (NOT WIN32) +set(COMMON_FLAGS + -fPIC + -fno-omit-frame-pointer + -Werror + -Wall + -Wextra + -Wnon-virtual-dtor + -Wdelete-non-virtual-dtor + -Wno-unused-parameter + -Wno-unused-function + -Wno-error=literal-suffix + -Wno-error=sign-compare + -Wno-error=unused-local-typedefs + -Wno-error=parentheses-equality # Warnings in pybind11 + -Wno-error=ignored-attributes # Warnings in Eigen, gcc 6.3 + -Wno-error=terminate # Warning in PADDLE_ENFORCE + -Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2 + -Wimplicit-fallthrough=0 # Warning in tinyformat.h + -Wno-error=maybe-uninitialized # Warning in boost gcc 7.2 +) + +set(GPU_COMMON_FLAGS + -fPIC + -fno-omit-frame-pointer + -Wnon-virtual-dtor + -Wdelete-non-virtual-dtor + -Wno-unused-parameter + -Wno-unused-function + -Wno-error=sign-compare + -Wno-error=literal-suffix + -Wno-error=unused-local-typedefs + -Wno-error=unused-function # Warnings in Numpy Header. + -Wno-error=array-bounds # Warnings in Eigen::array +) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64") +endif(NOT WIN32) + +if (APPLE) + # On Mac OS X build fat binaries with x86_64 architectures by default. + set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE) + # On Mac OS X register class specifier is deprecated and will cause warning error on latest clang 10.0 + set (COMMON_FLAGS -Wno-deprecated-register) +endif(APPLE) + +if(LINUX) + set(GPU_COMMON_FLAGS + -Wall + -Wextra + -Werror + ${GPU_COMMON_FLAGS}) +endif(LINUX) + +if(UNIX AND NOT APPLE) + # except apple from nix*Os family + set(LINUX TRUE) +endif(UNIX AND NOT APPLE) + +foreach(flag ${COMMON_FLAGS}) + safe_set_cflag(CMAKE_C_FLAGS ${flag}) + safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag}) + +endforeach() + +foreach(flag ${GPU_COMMON_FLAGS}) + safe_set_nvflag(${flag}) +endforeach() + +if(WIN32) +# windows build turn off warnings. +safe_set_static_flag() + foreach(flag_var + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO + CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE + CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) + string(REGEX REPLACE "(^| )/W[0-9]( |$)" " " ${flag_var} "${${flag_var}}") + set(flag_var "${flag_var} /w") + endforeach(flag_var) +endif(WIN32) diff --git a/cmake/generic.cmake b/cmake/generic.cmake new file mode 100644 index 00000000..d846e08b --- /dev/null +++ b/cmake/generic.cmake @@ -0,0 +1,816 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +# generic.cmake defines CMakes functions that look like Bazel's +# building rules (https://bazel.build/). +# +# +# ------------------------------------------- +# C++ CUDA C++ Go +# ------------------------------------------- +# cc_library nv_library go_library +# cc_binary nv_binary go_binary +# cc_test nv_test go_test +# ------------------------------------------- +# +# To build a static library example.a from example.cc using the system +# compiler (like GCC): +# +# cc_library(example SRCS example.cc) +# +# To build a static library example.a from multiple source files +# example{1,2,3}.cc: +# +# cc_library(example SRCS example1.cc example2.cc example3.cc) +# +# To build a shared library example.so from example.cc: +# +# cc_library(example SHARED SRCS example.cc) +# +# To build a library using Nvidia's NVCC from .cu file(s), use the nv_ +# prefixed version: +# +# nv_library(example SRCS example.cu) +# +# To specify that a library new_example.a depends on other libraies: +# +# cc_library(new_example SRCS new_example.cc DEPS example) +# +# Static libraries can be composed of other static libraries: +# +# cc_library(composed DEPS dependent1 dependent2 dependent3) +# +# To build an executable binary file from some source files and +# dependent libraries: +# +# cc_binary(example SRCS main.cc something.cc DEPS example1 example2) +# +# To build an executable binary file using NVCC, use the nv_ prefixed +# version: +# +# nv_binary(example SRCS main.cc something.cu DEPS example1 example2) +# +# To build a unit test binary, which is an executable binary with +# GoogleTest linked: +# +# cc_test(example_test SRCS example_test.cc DEPS example) +# +# To build a unit test binary using NVCC, use the nv_ prefixed version: +# +# nv_test(example_test SRCS example_test.cu DEPS example) +# +# It is pretty often that executable and test binaries depend on +# pre-defined external libaries like glog and gflags defined in +# /cmake/external/*.cmake: +# +# cc_test(example_test SRCS example_test.cc DEPS example glog gflags) +# +# To build a go static library using Golang, use the go_ prefixed version: +# +# go_library(example STATIC) +# +# To build a go shared library using Golang, use the go_ prefixed version: +# +# go_library(example SHARED) +# + +# including binary directory for generated headers. +include_directories(${CMAKE_CURRENT_BINARY_DIR}) + +if(NOT APPLE) + find_package(Threads REQUIRED) + link_libraries(${CMAKE_THREAD_LIBS_INIT}) + set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt") +endif(NOT APPLE) + +set_property(GLOBAL PROPERTY FLUID_MODULES "") +# find all fluid modules is used for paddle fluid static library +# for building inference libs +function(find_fluid_modules TARGET_NAME) + get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE) + string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path}) + string(FIND "${__target_path}" "fluid" pos) + if(pos GREATER 1) + get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) + set(fluid_modules ${fluid_modules} ${TARGET_NAME}) + set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}") + endif() +endfunction(find_fluid_modules) + + +function(common_link TARGET_NAME) + if (WITH_PROFILER) + target_link_libraries(${TARGET_NAME} gperftools::profiler) + endif() +endfunction() + + +# find all third_party modules is used for paddle static library +# for reduce the dependency when building the inference libs. +set_property(GLOBAL PROPERTY FLUID_THIRD_PARTY) +function(find_fluid_thirdparties TARGET_NAME) + get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE) + string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path}) + string(FIND "${__target_path}" "third_party" pos) + if(pos GREATER 1) + get_property(fluid_ GLOBAL PROPERTY FLUID_THIRD_PARTY) + set(fluid_third_partys ${fluid_third_partys} ${TARGET_NAME}) + set_property(GLOBAL PROPERTY FLUID_THIRD_PARTY "${fluid_third_partys}") + endif() +endfunction(find_fluid_thirdparties) + +function(merge_static_libs TARGET_NAME) + set(libs ${ARGN}) + list(REMOVE_DUPLICATES libs) + + # Get all propagation dependencies from the merged libraries + foreach(lib ${libs}) + list(APPEND libs_deps ${${lib}_LIB_DEPENDS}) + endforeach() + if(libs_deps) + list(REMOVE_DUPLICATES libs_deps) + endif() + + # To produce a library we need at least one source file. + # It is created by add_custom_command below and will helps + # also help to track dependencies. + set(target_SRCS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c) + + if(APPLE) # Use OSX's libtool to merge archives + # Make the generated dummy source file depended on all static input + # libs. If input lib changes,the source file is touched + # which causes the desired effect (relink). + add_custom_command(OUTPUT ${target_SRCS} + COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS} + DEPENDS ${libs}) + + # Generate dummy staic lib + file(WRITE ${target_SRCS} "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";") + add_library(${TARGET_NAME} STATIC ${target_SRCS}) + target_link_libraries(${TARGET_NAME} ${libs_deps}) + + foreach(lib ${libs}) + # Get the file names of the libraries to be merged + set(libfiles ${libfiles} $) + endforeach() + add_custom_command(TARGET ${TARGET_NAME} POST_BUILD + COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" + COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles} + ) + endif(APPLE) + if(LINUX) # general UNIX: use "ar" to extract objects and re-add to a common lib + set(target_DIR ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}.dir) + + foreach(lib ${libs}) + set(objlistfile ${target_DIR}/${lib}.objlist) # list of objects in the input library + set(objdir ${target_DIR}/${lib}.objdir) + + add_custom_command(OUTPUT ${objdir} + COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir} + DEPENDS ${lib}) + + add_custom_command(OUTPUT ${objlistfile} + COMMAND ${CMAKE_AR} -x "$" + COMMAND ${CMAKE_AR} -t "$" > ${objlistfile} + DEPENDS ${lib} ${objdir} + WORKING_DIRECTORY ${objdir}) + + list(APPEND target_OBJS "${objlistfile}") + endforeach() + + # Make the generated dummy source file depended on all static input + # libs. If input lib changes,the source file is touched + # which causes the desired effect (relink). + add_custom_command(OUTPUT ${target_SRCS} + COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS} + DEPENDS ${libs} ${target_OBJS}) + + # Generate dummy staic lib + file(WRITE ${target_SRCS} "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";") + add_library(${TARGET_NAME} STATIC ${target_SRCS}) + target_link_libraries(${TARGET_NAME} ${libs_deps}) + + # Get the file name of the generated library + set(target_LIBNAME "$") + + add_custom_command(TARGET ${TARGET_NAME} POST_BUILD + COMMAND ${CMAKE_AR} crs ${target_LIBNAME} `find ${target_DIR} -name '*.o'` + COMMAND ${CMAKE_RANLIB} ${target_LIBNAME} + WORKING_DIRECTORY ${target_DIR}) + endif(LINUX) + if(WIN32) # windows do not support gcc/nvcc combined compiling. Use msvc lib.exe to merge libs. + # Make the generated dummy source file depended on all static input + # libs. If input lib changes,the source file is touched + # which causes the desired effect (relink). + add_custom_command(OUTPUT ${target_SRCS} + COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS} + DEPENDS ${libs}) + + # Generate dummy staic lib + file(WRITE ${target_SRCS} "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";") + add_library(${TARGET_NAME} STATIC ${target_SRCS}) + target_link_libraries(${TARGET_NAME} ${libs_deps}) + + foreach(lib ${libs}) + # Get the file names of the libraries to be merged + set(libfiles ${libfiles} $) + endforeach() + # msvc will put libarary in directory of "/Release/xxxlib" by default + # COMMAND cmake -E remove "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/${TARGET_NAME}.lib" + add_custom_command(TARGET ${TARGET_NAME} POST_BUILD + COMMAND cmake -E make_directory "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}" + COMMAND lib /OUT:${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/lib${TARGET_NAME}.lib ${libfiles} + ) + endif(WIN32) +endfunction(merge_static_libs) + +function(cc_library TARGET_NAME) + set(options STATIC static SHARED shared) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + if(WIN32) + # add libxxx.lib prefix in windows + set(${TARGET_NAME}_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE STRING "output library name for target ${TARGET_NAME}") + endif(WIN32) + if(cc_library_SRCS) + if(cc_library_SHARED OR cc_library_shared) # build *.so + add_library(${TARGET_NAME} SHARED ${cc_library_SRCS}) + else() + add_library(${TARGET_NAME} STATIC ${cc_library_SRCS}) + find_fluid_modules(${TARGET_NAME}) + endif() + + if(cc_library_DEPS) + # Don't need link libwarpctc.so + if("${cc_library_DEPS};" MATCHES "warpctc;") + list(REMOVE_ITEM cc_library_DEPS warpctc) + add_dependencies(${TARGET_NAME} warpctc) + endif() + # Only deps libmklml.so, not link + if("${cc_library_DEPS};" MATCHES "mklml;") + list(REMOVE_ITEM cc_library_DEPS mklml) + if(NOT "${TARGET_NAME}" MATCHES "dynload_mklml") + list(APPEND cc_library_DEPS dynload_mklml) + endif() + add_dependencies(${TARGET_NAME} mklml) + if(WIN32) + target_link_libraries(${TARGET_NAME} ${MKLML_IOMP_LIB}) + else(WIN32) + target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed") + endif(WIN32) + endif() + # remove link to python, see notes at: + # https://github.com/pybind/pybind11/blob/master/docs/compiling.rst#building-manually + if("${cc_library_DEPS};" MATCHES "python;") + list(REMOVE_ITEM cc_library_DEPS python) + add_dependencies(${TARGET_NAME} python) + if(WIN32) + target_link_libraries(${TARGET_NAME} ${PYTHON_LIBRARIES}) + else() + target_link_libraries(${TARGET_NAME} "-Wl,-undefined,dynamic_lookup") + endif(WIN32) + endif() + target_link_libraries(${TARGET_NAME} ${cc_library_DEPS}) + add_dependencies(${TARGET_NAME} ${cc_library_DEPS}) + common_link(${TARGET_NAME}) + endif() + + # cpplint code style + foreach(source_file ${cc_library_SRCS}) + string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + endif() + endforeach() + else(cc_library_SRCS) + if(cc_library_DEPS) + merge_static_libs(${TARGET_NAME} ${cc_library_DEPS}) + else() + message(FATAL_ERROR "Please specify source files or libraries in cc_library(${TARGET_NAME} ...).") + endif() + endif(cc_library_SRCS) +endfunction(cc_library) + +# The link operation under windows may exceeds the maximum characters limit, simply break the link command +# into multiple link opeartion can fix that, say +# original: +# lib /out:target.lib a.lib b.lib c.lib d.lib +# after: +# 1. lib /out:dummy_lib_1.lib a.lib b.lib +# 2. lib /out:dummy_lib_2.lib c.lib d.lib +# 1. lib /out:target.lib dummy_lib_1.lib dummy_lib_2.lib +function(sep_library TARGET_NAME) + set(options STATIC static SHARED shared) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(sep_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(dummy_index 1) + set(dummy_offset 1) + # the dummy target would be consisted of limit size libraries + set(dummy_limit 50) + list(LENGTH sep_library_DEPS sep_all_len) + foreach(v ${sep_library_DEPS}) + list(APPEND dummy_list ${v}) + list(LENGTH dummy_list listlen ) + if ((${listlen} GREATER ${dummy_limit}) OR (${dummy_offset} EQUAL ${sep_all_len})) + message("create dummy library ${TARGET_NAME}_dummy_lib_${dummy_index} for ${TARGET_NAME}") + cc_library(${TARGET_NAME}_dummy_lib_${dummy_index} STATIC DEPS ${dummy_list}) + foreach(i ${dummy_list}) + list(REMOVE_AT dummy_list 0) + endforeach() + list(APPEND ${TARGET_NAME}_dummy_list ${TARGET_NAME}_dummy_lib_${dummy_index}) + MATH(EXPR dummy_index "${dummy_index}+1") + endif() + MATH(EXPR dummy_offset "${dummy_offset}+1") + endforeach() + if(${sep_library_SHARED}) + cc_library(${TARGET_NAME} SHARED SRCS ${sep_library_SRCS} DEPS ${${TARGET_NAME}_dummy_list}) + else(${sep_library_SHARED}) + cc_library(${TARGET_NAME} STATIC SRCS ${sep_library_SRCS} DEPS ${${TARGET_NAME}_dummy_list}) + endif(${sep_library_SHARED}) +endfunction(sep_library) + +function(cc_binary TARGET_NAME) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(cc_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + add_executable(${TARGET_NAME} ${cc_binary_SRCS}) + if(cc_binary_DEPS) + target_link_libraries(${TARGET_NAME} ${cc_binary_DEPS}) + add_dependencies(${TARGET_NAME} ${cc_binary_DEPS}) + common_link(${TARGET_NAME}) + endif() + get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + target_link_libraries(${TARGET_NAME} ${os_dependency_modules}) +endfunction(cc_binary) + +function(cc_test_build TARGET_NAME) + if(WITH_TESTING) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + add_executable(${TARGET_NAME} ${cc_test_SRCS}) + if(WIN32) + if("${cc_test_DEPS};" MATCHES "python;") + list(REMOVE_ITEM cc_test_DEPS python) + target_link_libraries(${TARGET_NAME} ${PYTHON_LIBRARIES}) + endif() + endif(WIN32) + get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} ${os_dependency_modules} paddle_gtest_main lod_tensor memory gtest gflags glog) + add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) + common_link(${TARGET_NAME}) + endif() +endfunction() + +function(cc_test_run TARGET_NAME) + if(WITH_TESTING) + set(oneValueArgs "") + set(multiValueArgs COMMAND ARGS) + cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + add_test(NAME ${TARGET_NAME} + COMMAND ${cc_test_COMMAND} + ARGS ${cc_test_ARGS} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) + # No unit test should exceed 10 minutes. + set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600) + endif() +endfunction() + +function(cc_test TARGET_NAME) + if(WITH_TESTING) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS ARGS) + cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + cc_test_build(${TARGET_NAME} + SRCS ${cc_test_SRCS} + DEPS ${cc_test_DEPS}) + cc_test_run(${TARGET_NAME} + COMMAND ${TARGET_NAME} + ARGS ${cc_test_ARGS}) + endif() +endfunction(cc_test) + +function(nv_library TARGET_NAME) + if (WITH_GPU) + set(options STATIC static SHARED shared) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(nv_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + if(nv_library_SRCS) + if (nv_library_SHARED OR nv_library_shared) # build *.so + cuda_add_library(${TARGET_NAME} SHARED ${nv_library_SRCS}) + else() + cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS}) + find_fluid_modules(${TARGET_NAME}) + endif() + if (nv_library_DEPS) + add_dependencies(${TARGET_NAME} ${nv_library_DEPS}) + target_link_libraries(${TARGET_NAME} ${nv_library_DEPS}) + endif() + # cpplint code style + foreach(source_file ${nv_library_SRCS}) + string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + list(APPEND nv_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + endif() + endforeach() + else(nv_library_SRCS) + if (nv_library_DEPS) + merge_static_libs(${TARGET_NAME} ${nv_library_DEPS}) + else() + message(FATAL "Please specify source file or library in nv_library.") + endif() + endif(nv_library_SRCS) + endif() +endfunction(nv_library) + +function(nv_binary TARGET_NAME) + if (WITH_GPU) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(nv_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + cuda_add_executable(${TARGET_NAME} ${nv_binary_SRCS}) + if(nv_binary_DEPS) + target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS}) + add_dependencies(${TARGET_NAME} ${nv_binary_DEPS}) + common_link(${TARGET_NAME}) + endif() + endif() +endfunction(nv_binary) + +function(nv_test TARGET_NAME) + if (WITH_GPU AND WITH_TESTING) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS}) + get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog ${os_dependency_modules}) + add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) + common_link(${TARGET_NAME}) + add_test(${TARGET_NAME} ${TARGET_NAME}) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) + endif() +endfunction(nv_test) + +function(hip_library TARGET_NAME) + if (WITH_AMD_GPU) + set(options STATIC static SHARED shared) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(hip_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(_sources ${hip_library_SRCS}) + HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options}) + if(_source_files) + list(REMOVE_ITEM _sources ${_source_files}) + endif() + if(hip_library_SRCS) + if (hip_library_SHARED OR hip_library_shared) # build *.so + add_library(${TARGET_NAME} SHARED ${_cmake_options} ${_generated_files} ${_sources}) + set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP) + else() + add_library(${TARGET_NAME} STATIC ${_cmake_options} ${_generated_files} ${_sources}) + set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE CXX) + target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a /opt/rocm/rccl/lib/librccl.so /opt/rocm/hiprand/lib/libhiprand.so) + find_fluid_modules(${TARGET_NAME}) + endif() + if("${hip_library_DEPS}" MATCHES "ARCHIVE_START") + # Support linking flags: --whole-archive (Linux) / -force_load (MacOS). + # WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries. + target_circle_link_libraries(${TARGET_NAME} ${hip_library_DEPS}) + list(REMOVE_ITEM hip_library_DEPS ARCHIVE_START ARCHIVE_END) + else() + target_link_libraries(${TARGET_NAME} ${hip_library_DEPS}) + endif() + # cpplint code style + foreach(source_file ${hip_library_SRCS}) + string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + endif() + endforeach() + else(hip_library_SRCS) + if (hip_library_DEPS) + merge_static_libs(${TARGET_NAME} ${hip_library_DEPS}) + else() + message(FATAL "Please specify source file or library in nv_library.") + endif() + endif(hip_library_SRCS) + endif() +endfunction(hip_library) + +function(hip_binary TARGET_NAME) + if (WITH_AMD_GPU) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(hip_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + hip_add_executable(${TARGET_NAME} ${hip_binary_SRCS}) + if(hip_binary_DEPS) + target_link_libraries(${TARGET_NAME} ${hip_binary_DEPS}) + add_dependencies(${TARGET_NAME} ${hip_binary_DEPS}) + common_link(${TARGET_NAME}) + endif() + endif() +endfunction(hip_binary) + +function(hip_test TARGET_NAME) + if (WITH_AMD_GPU AND WITH_TESTING) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(hip_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(_sources ${hip_test_SRCS}) + HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options}) + if(_source_files) + list(REMOVE_ITEM _sources ${_source_files}) + endif() + add_executable(${TARGET_NAME} ${_cmake_options} ${_generated_files} ${_sources}) + set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP) + get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags ${os_dependency_modules}) + add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags) + common_link(${TARGET_NAME}) + add_test(${TARGET_NAME} ${TARGET_NAME}) + endif() +endfunction(hip_test) + +function(go_library TARGET_NAME) + set(options STATIC static SHARED shared) + set(oneValueArgs "") + set(multiValueArgs DEPS) + cmake_parse_arguments(go_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if (go_library_SHARED OR go_library_shared) + set(BUILD_MODE "-buildmode=c-shared") + set(${TARGET_NAME}_LIB_NAME "${CMAKE_SHARED_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}" CACHE STRING "output library name for target ${TARGET_NAME}") + else() + set(BUILD_MODE "-buildmode=c-archive") + set(${TARGET_NAME}_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE STRING "output library name for target ${TARGET_NAME}") + endif() + + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c) + + # This custom command will always run since it depends on a not + # existing file. + add_custom_command( + OUTPUT dummy_rebulid_${TARGET_NAME} + COMMAND cmake -E touch ${dummyfile} + ) + # Create a custom target that depends on the custom command output + # file, so the custom command can be referenced as a dependency by + # `add_dependencies`. + add_custom_target(rebuild_${TARGET_NAME} + DEPENDS dummy_rebulid_${TARGET_NAME} + ) + + # Add dummy code to support `make target_name` under Terminal Command + file(WRITE ${dummyfile} "const char *dummy_${TARGET_NAME} = \"${dummyfile}\";") + if (go_library_SHARED OR go_library_shared) + add_library(${TARGET_NAME} SHARED ${dummyfile}) + else() + add_library(${TARGET_NAME} STATIC ${dummyfile}) + endif() + if(go_library_DEPS) + add_dependencies(${TARGET_NAME} ${go_library_DEPS}) + common_link(${TARGET_NAME}) + endif(go_library_DEPS) + + # The "source file" of the library is `${dummyfile}` which never + # change, so the target will never rebuild. Make the target depends + # on the custom command that touches the library "source file", so + # rebuild will always happen. + add_dependencies(${TARGET_NAME} rebuild_${TARGET_NAME}) + + set(${TARGET_NAME}_LIB_PATH "${CMAKE_CURRENT_BINARY_DIR}/${${TARGET_NAME}_LIB_NAME}" CACHE STRING "output library path for target ${TARGET_NAME}") + + file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go") + string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR}) + + add_custom_command(TARGET ${TARGET_NAME} POST_BUILD + COMMAND rm "${${TARGET_NAME}_LIB_PATH}" + # Golang build source code + COMMAND GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE} + -o "${${TARGET_NAME}_LIB_PATH}" + "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${GO_SOURCE}" + # must run under GOPATH + WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go") + add_dependencies(${TARGET_NAME} go_vendor) +endfunction(go_library) + +function(go_binary TARGET_NAME) + set(options OPTIONAL) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(go_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR}) + + add_custom_command(OUTPUT ${TARGET_NAME}_timestamp + COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build + -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}" + "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${go_binary_SRCS}" + WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go") + add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${TARGET_NAME}_timestamp ${go_binary_DEPS}) + install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME} DESTINATION bin) +endfunction(go_binary) + +function(go_test TARGET_NAME) + set(options OPTIONAL) + set(oneValueArgs "") + set(multiValueArgs DEPS) + cmake_parse_arguments(go_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + string(REPLACE "${PADDLE_GO_PATH}" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR}) + add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${go_test_DEPS}) + add_custom_command(TARGET ${TARGET_NAME} POST_BUILD + COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test -race + -c -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}" + ".${CMAKE_CURRENT_SOURCE_REL_DIR}" + WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go") + add_test(NAME ${TARGET_NAME} + COMMAND ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) +endfunction(go_test) + +# Modification of standard 'protobuf_generate_cpp()' with protobuf-lite support +# Usage: +# paddle_protobuf_generate_cpp( ) + +function(paddle_protobuf_generate_cpp SRCS HDRS) + if(NOT ARGN) + message(SEND_ERROR "Error: paddle_protobuf_generate_cpp() called without any proto files") + return() + endif() + + set(${SRCS}) + set(${HDRS}) + + foreach(FIL ${ARGN}) + get_filename_component(ABS_FIL ${FIL} ABSOLUTE) + get_filename_component(FIL_WE ${FIL} NAME_WE) + + set(_protobuf_protoc_src "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.cc") + set(_protobuf_protoc_hdr "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.h") + list(APPEND ${SRCS} "${_protobuf_protoc_src}") + list(APPEND ${HDRS} "${_protobuf_protoc_hdr}") + + add_custom_command( + OUTPUT "${_protobuf_protoc_src}" + "${_protobuf_protoc_hdr}" + + COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}" + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} + -I${CMAKE_CURRENT_SOURCE_DIR} + --cpp_out "${CMAKE_CURRENT_BINARY_DIR}" ${ABS_FIL} + DEPENDS ${ABS_FIL} protoc + COMMENT "Running C++ protocol buffer compiler on ${FIL}" + VERBATIM ) + endforeach() + + set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE) + set(${SRCS} ${${SRCS}} PARENT_SCOPE) + set(${HDRS} ${${HDRS}} PARENT_SCOPE) +endfunction() + + +function(proto_library TARGET_NAME) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(proto_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(proto_srcs) + set(proto_hdrs) + paddle_protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS}) + cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS ${proto_library_DEPS} protobuf) +endfunction() + +function(py_proto_compile TARGET_NAME) + set(oneValueArgs "") + set(multiValueArgs SRCS) + cmake_parse_arguments(py_proto_compile "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(py_srcs) + protobuf_generate_python(py_srcs ${py_proto_compile_SRCS}) + add_custom_target(${TARGET_NAME} ALL DEPENDS ${py_srcs} protobuf) +endfunction() + +function(py_test TARGET_NAME) + if(WITH_TESTING) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS ARGS ENVS) + cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if(WITH_COVERAGE) + add_test(NAME ${TARGET_NAME} + COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true + FLAGS_cpu_deterministic=true FLAGS_limit_of_tmp_allocation=4294967296 # 4G + PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS} + COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data + ${PYTHON_EXECUTABLE} -m coverage run --branch -p ${py_test_SRCS} ${py_test_ARGS} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + else() + add_test(NAME ${TARGET_NAME} + COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true + FLAGS_cpu_deterministic=true FLAGS_limit_of_tmp_allocation=4294967296 # 4G + PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS} + ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + endif() + + # No unit test should exceed 10 minutes. + set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600) + endif() +endfunction() + +# grpc_library generate grpc code using grpc_cpp_plugin and protoc +# then build the generated protobuf code and grpc code with your +# implementation source codes together. Use SRCS argument for your +# implementation source files and PROTO argument for your .proto +# files. +# +# Usage: grpc_library(my_target SRCS my_client.cc PROTO my_target.proto DEPS my_dep) + +function(grpc_library TARGET_NAME) + set(oneValueArgs PROTO) + set(multiValueArgs SRCS DEPS) + set(options "") + cmake_parse_arguments(grpc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + message(STATUS "generating grpc ${grpc_library_PROTO}") + + get_filename_component(ABS_PROTO ${grpc_library_PROTO} ABSOLUTE) + get_filename_component(PROTO_WE ${grpc_library_PROTO} NAME_WE) + get_filename_component(PROTO_PATH ${ABS_PROTO} PATH) + + #FIXME(putcn): the follwoing line is supposed to generate *.pb.h and cc, but + # somehow it didn't. line 602 to 604 is to patching this. Leaving this here + # for now to enable dist CI. + paddle_protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}") + set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc") + set(grpc_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.h") + cc_library("${TARGET_NAME}_proto" SRCS "${grpc_proto_srcs}") + + add_custom_command( + OUTPUT "${grpc_grpc_srcs}" "${grpc_grpc_hdrs}" + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} + ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}" + --plugin=protoc-gen-grpc="${GRPC_CPP_PLUGIN}" "${ABS_PROTO}" + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} + ARGS --cpp_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}" + "${ABS_PROTO}" + DEPENDS "${ABS_PROTO}" ${PROTOBUF_PROTOC_EXECUTABLE} extern_grpc) + + # FIXME(typhoonzero): grpc generated code do not generate virtual-dtor, mark it + # as compiler warnings instead of error. Should try remove the warnings also. + set_source_files_properties( + ${grpc_grpc_srcs} + PROPERTIES + COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + cc_library("${TARGET_NAME}_grpc" SRCS "${grpc_grpc_srcs}") + + set_source_files_properties( + ${grpc_library_SRCS} + PROPERTIES + COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + cc_library("${TARGET_NAME}" SRCS "${grpc_library_SRCS}" DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}") +endfunction() + + +function(brpc_library TARGET_NAME) + set(oneValueArgs PROTO) + set(multiValueArgs SRCS DEPS) + set(options "") + cmake_parse_arguments(brpc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + message(STATUS "generating brpc ${brpc_library_PROTO}") + + get_filename_component(ABS_PROTO ${brpc_library_PROTO} ABSOLUTE) + get_filename_component(PROTO_WE ${brpc_library_PROTO} NAME_WE) + get_filename_component(PROTO_PATH ${ABS_PROTO} PATH) + + paddle_protobuf_generate_cpp(brpc_proto_srcs brpc_proto_hdrs "${ABS_PROTO}") + cc_library("${TARGET_NAME}_proto" SRCS "${brpc_proto_srcs}") + cc_library("${TARGET_NAME}" SRCS "${brpc_library_SRCS}" DEPS "${TARGET_NAME}_proto" "${brpc_library_DEPS}") +endfunction() diff --git a/cmake/hip.cmake b/cmake/hip.cmake new file mode 100644 index 00000000..c3a748db --- /dev/null +++ b/cmake/hip.cmake @@ -0,0 +1,53 @@ +if(NOT WITH_AMD_GPU) + return() +endif() + +include_directories("/opt/rocm/include") +include_directories("/opt/rocm/hip/include") +include_directories("/opt/rocm/miopen/include") +include_directories("/opt/rocm/hipblas/include") +include_directories("/opt/rocm/hiprand/include") +include_directories("/opt/rocm/rocrand/include") +include_directories("/opt/rocm/rccl/include") +include_directories("/opt/rocm/thrust") + +set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++11" ) + +if(WITH_DSO) + set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_USE_DSO") +endif(WITH_DSO) + +if(WITH_TESTING) + set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_TESTING") +endif(WITH_TESTING) + +if(WITH_DISTRIBUTE) + set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_DISTRIBUTE") +endif(WITH_DISTRIBUTE) + +if(WITH_GRPC) + set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_GRPC") +endif(WITH_GRPC) + +if(WITH_MKLDNN) + set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_MKLDNN") +endif(WITH_MKLDNN) + +set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DANY_IMPL_ANY_CAST_MOVEABLE") + +if(CMAKE_BUILD_TYPE STREQUAL "Debug") + list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) +elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") + list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) +elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") + list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL}) +endif() + +if("x${HCC_HOME}" STREQUAL "x") + set(HCC_HOME "/opt/rocm/hcc") +endif() + +set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o ") +set(CMAKE_HIP_CREATE_SHARED_LIBRARY "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o -shared") +set(CMAKE_HIP_CREATE_SHARED_MODULE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o -shared") + diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake new file mode 100644 index 00000000..2a3962b9 --- /dev/null +++ b/cmake/inference_lib.cmake @@ -0,0 +1,296 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# make package for paddle fluid shared and static library +function(copy TARGET) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DSTS DEPS) + cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(fluid_lib_dist_dep ${TARGET} ${fluid_lib_dist_dep} PARENT_SCOPE) + + list(LENGTH copy_lib_SRCS copy_lib_SRCS_len) + list(LENGTH copy_lib_DSTS copy_lib_DSTS_len) + if (NOT ${copy_lib_SRCS_len} EQUAL ${copy_lib_DSTS_len}) + message(FATAL_ERROR "${TARGET} source numbers are not equal to destination numbers") + endif () + math(EXPR len "${copy_lib_SRCS_len} - 1") + + add_custom_target(${TARGET} DEPENDS ${copy_lib_DEPS}) + foreach (index RANGE ${len}) + list(GET copy_lib_SRCS ${index} src) + list(GET copy_lib_DSTS ${index} dst) + if (WIN32) + if(IS_DIRECTORY ${src}) + get_filename_component(last_path ${src} NAME) + string(APPEND dst "/" ${last_path}) + add_custom_command(TARGET ${TARGET} PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}" + ) + if(EXISTS ${src}) + add_custom_command(TARGET ${TARGET} PRE_BUILD + COMMAND cmake -E copy_directory "${src}" "${dst}" + COMMENT "copying ${src} -> ${dst}") + else() + message(WARNING "${src} not exist!") + endif() + else() + # windows cmd shell will not expand wildcard automatically. + # below expand the files, and copy them by rules. + file(GLOB src_files ${src}) + if (NOT "${src_files}" STREQUAL "") + list(REMOVE_DUPLICATES src_files) + endif () + add_custom_command(TARGET ${TARGET} PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}" + ) + foreach (src_file ${src_files}) + add_custom_command(TARGET ${TARGET} PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E copy "${src_file}" "${dst}" + COMMENT "copying ${src_file} -> ${dst}") + endforeach () + endif() + else (WIN32) # not windows + add_custom_command(TARGET ${TARGET} PRE_BUILD + COMMAND mkdir -p "${dst}" + COMMAND cp -r "${src}" "${dst}" + COMMENT "copying ${src} -> ${dst}") + endif (WIN32) # not windows + endforeach () +endfunction() + +# third party +set(dst_dir "${FLUID_INSTALL_DIR}/third_party/eigen3") +copy(eigen3_lib + SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen + DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported + DEPS eigen3 + ) + +set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/gflags") +copy(gflags_lib + SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS gflags + ) + +set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/glog") +copy(glog_lib + SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS glog + ) + +set(dst_dir "${FLUID_INSTALL_DIR}/third_party/boost/") +copy(boost_lib + SRCS ${BOOST_INCLUDE_DIR}/boost + DSTS ${dst_dir} + DEPS boost + ) + +set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/xxhash") +copy(xxhash_lib + SRCS ${XXHASH_INCLUDE_DIR} ${XXHASH_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS xxhash + ) + +if (NOT PROTOBUF_FOUND OR WIN32) + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf") + copy(protobuf_lib + SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS extern_protobuf + ) +endif () + +if (WITH_MKLML) + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mklml") + if(WIN32) + copy(mklml_lib + SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_SHARED_LIB} + ${MKLML_SHARED_LIB_DEPS} ${MKLML_SHARED_IOMP_LIB} ${MKLML_INC_DIR} + DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}/lib + ${dst_dir}/lib ${dst_dir}/lib ${dst_dir} + DEPS mklml + ) + else() + copy(mklml_lib + SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR} + DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir} + DEPS mklml + ) + endif() +elseif (NOT CBLAS_FOUND OR WIN32) + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/openblas") + copy(openblas_lib + SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include + DSTS ${dst_dir} ${dst_dir} + DEPS extern_openblas + ) +endif () + +if (WITH_MKLDNN) + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mkldnn") + if(WIN32) + copy(mkldnn_lib + SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} ${MKLDNN_LIB} + DSTS ${dst_dir} ${dst_dir}/lib ${dst_dir}/lib + DEPS mkldnn_shared_lib + ) + else() + copy(mkldnn_lib + SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS mkldnn_shared_lib + ) + endif() +endif () + +if (WITH_NGRAPH) + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/ngraph") + copy(ngraph_lib + SRCS ${NGRAPH_INC_DIR} ${NGRAPH_LIB_DIR} + DSTS ${dst_dir} ${dst_dir} + DEPS ngraph + ) +endif () + +set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy") +copy(snappy_lib + SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS snappy) + +set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream") +copy(snappystream_lib + SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS snappystream) + +set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib") +copy(zlib_lib + SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS zlib) + +# paddle fluid module +set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") +set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid") +set(module "framework") +if (NOT WIN32) + set(framework_lib_deps framework_py_proto) +endif (NOT WIN32) + +copy(framework_lib DEPS ${framework_lib_deps} + SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/data_feed.pb.h ${src_dir}/${module}/ir/memory_optimize_pass/*.h + ${src_dir}/${module}/ir/*.h ${src_dir}/${module}/fleet/*.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}/ir/memory_optimize_pass ${dst_dir}/${module}/ir ${dst_dir}/${module}/fleet + ) + +set(module "memory") +copy(memory_lib + SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h ${src_dir}/${module}/allocation/*.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail ${dst_dir}/${module}/allocation + ) + +set(inference_deps paddle_fluid_shared paddle_fluid) + +set(module "inference/api") + +if (TENSORRT_FOUND) + copy(tensorrt_lib DEPS ${inference_deps} + SRCS ${TENSORRT_ROOT}/include/Nv*.h ${TENSORRT_ROOT}/lib/libnvinfer* + DSTS ${FLUID_INSTALL_DIR}/third_party/install/tensorrt/include ${FLUID_INSTALL_DIR}/third_party/install/tensorrt/lib) +endif () + +if (ANAKIN_FOUND) + copy(anakin_lib DEPS ${inference_deps} + SRCS ${ANAKIN_ROOT}/* + DSTS ${FLUID_INSTALL_DIR}/third_party/install/anakin) +endif () + +set(module "inference") +if(WIN32) + set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/libpaddle_fluid.*) +else(WIN32) + set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*) +endif(WIN32) +copy(inference_lib DEPS ${inference_deps} + SRCS ${src_dir}/${module}/*.h ${paddle_fluid_lib} + ${src_dir}/${module}/api/paddle_*.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} + ) + +set(module "platform") +copy(platform_lib DEPS profiler_py_proto + SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details + ) + +set(module "string") +copy(string_lib + SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/tinyformat/*.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat + ) + +set(module "pybind") +copy(pybind_lib + SRCS ${CMAKE_CURRENT_BINARY_DIR}/paddle/fluid/${module}/pybind.h + DSTS ${dst_dir}/${module} + ) + +# CMakeCache Info +copy(cmake_cache + SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt + DSTS ${FLUID_INSTALL_DIR}) + +# This command generates a complete fluid library for both train and inference +add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep}) + +# Following commands generate a inference-only fluid library +# third_party, version.txt and CMakeCache.txt are the same position with ${FLUID_INSTALL_DIR} +copy(third_party DEPS fluid_lib_dist + SRCS ${FLUID_INSTALL_DIR}/third_party ${FLUID_INSTALL_DIR}/CMakeCache.txt + DSTS ${FLUID_INFERENCE_INSTALL_DIR} ${FLUID_INFERENCE_INSTALL_DIR} + ) + +# only need libpaddle_fluid.so/a and paddle_*.h for inference-only library +copy(inference_api_lib DEPS fluid_lib_dist + SRCS ${paddle_fluid_lib} + ${FLUID_INSTALL_DIR}/paddle/fluid/inference/paddle_*.h + DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include +) + +add_custom_target(inference_lib_dist DEPENDS third_party inference_api_lib) + +# paddle fluid version +function(version version_file) + execute_process( + COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1 + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} + OUTPUT_VARIABLE PADDLE_GIT_COMMIT) + file(WRITE ${version_file} + "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n" + "WITH_MKL: ${WITH_MKL}\n" + "WITH_MKLDNN: ${WITH_MKLDNN}\n" + "WITH_GPU: ${WITH_GPU}\n") + if (WITH_GPU) + file(APPEND ${version_file} + "CUDA version: ${CUDA_VERSION}\n" + "CUDNN version: v${CUDNN_MAJOR_VERSION}\n") + endif () +endfunction() +version(${FLUID_INSTALL_DIR}/version.txt) +version(${FLUID_INFERENCE_INSTALL_DIR}/version.txt) diff --git a/cmake/make_resource.py b/cmake/make_resource.py new file mode 100644 index 00000000..09a2ca87 --- /dev/null +++ b/cmake/make_resource.py @@ -0,0 +1,25 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re +import sys + +res = sys.argv[1] +out = sys.argv[2] +var = re.sub(r'[ .-]', '_', os.path.basename(res)) + +open(out, "w").write("const unsigned char " + var + "[] = {" + ",".join([ + "0x%02x" % ord(c) for c in open(res).read() +]) + ",0};\n" + "const unsigned " + var + "_size = sizeof(" + var + ");\n") diff --git a/cmake/operators.cmake b/cmake/operators.cmake new file mode 100644 index 00000000..134c8943 --- /dev/null +++ b/cmake/operators.cmake @@ -0,0 +1,227 @@ +set(PART_CUDA_KERNEL_FILES) +function(op_library TARGET) + # op_library is a function to create op library. The interface is same as + # cc_library. But it handle split GPU/CPU code and link some common library + # for ops. + set(cc_srcs) + set(cu_srcs) + set(hip_cu_srcs) + set(miopen_hip_cc_srcs) + set(cu_cc_srcs) + set(cudnn_cu_cc_srcs) + set(CUDNN_FILE) + set(mkldnn_cc_srcs) + set(MKLDNN_FILE) + set(op_common_deps operator op_registry math_function) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + set(pybind_flag 0) + cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + + list(LENGTH op_library_SRCS op_library_SRCS_len) + if (${op_library_SRCS_len} EQUAL 0) + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc) + list(APPEND cc_srcs ${TARGET}.cc) + endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc) + list(APPEND cu_cc_srcs ${TARGET}.cu.cc) + endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) + list(APPEND cu_srcs ${TARGET}.cu) + endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu) + set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu + ${PART_CUDA_KERNEL_FILES} PARENT_SCOPE) + list(APPEND cu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu) + endif() + + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu) + list(APPEND hip_cu_srcs ${TARGET}.hip.cu) + endif() + string(REPLACE "_op" "_cudnn_op" CUDNN_FILE "${TARGET}") + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc) + list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc) + endif() + if(WITH_AMD_GPU) + string(REPLACE "_op" "_miopen_op" MIOPEN_FILE "${TARGET}") + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.hip.cc) + list(APPEND miopen_hip_cc_srcs ${MIOPEN_FILE}.hip.cc) + endif() + endif() + if(WITH_MKLDNN) + string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}") + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/mkldnn/${MKLDNN_FILE}.cc) + list(APPEND mkldnn_cc_srcs mkldnn/${MKLDNN_FILE}.cc) + endif() + endif() + else() + foreach(src ${op_library_SRCS}) + if (${src} MATCHES ".*\\.hip.cu$") + list(APPEND hip_cu_srcs ${src}) + elseif (${src} MATCHES ".*\\.cu$") + list(APPEND cu_srcs ${src}) + elseif(${src} MATCHES ".*_cudnn_op.cu.cc$") + list(APPEND cudnn_cu_cc_srcs ${src}) + elseif(WITH_AMD_GPU AND ${src} MATCHES ".*_miopen_op.hip.cc$") + list(APPEND miopen_hip_cc_srcs ${src}) + elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$") + list(APPEND mkldnn_cc_srcs ${src}) + elseif(${src} MATCHES ".*\\.cu.cc$") + list(APPEND cu_cc_srcs ${src}) + elseif(${src} MATCHES ".*\\.cc$") + list(APPEND cc_srcs ${src}) + else() + message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu") + endif() + endforeach() + endif() + + list(LENGTH cc_srcs cc_srcs_len) + if (${cc_srcs_len} EQUAL 0) + message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file") + endif() + if (WIN32) + # remove windows unsupported op, because windows has no nccl, no warpctc such ops. + foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op") + if ("${TARGET}" STREQUAL "${windows_unsupport_op}") + return() + endif() + endforeach() + endif(WIN32) + set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} CACHE INTERNAL "op libs") + + list(LENGTH op_library_DEPS op_library_DEPS_len) + if (${op_library_DEPS_len} GREATER 0) + set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE) + endif() + if (WITH_GPU) + nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} + ${op_common_deps}) + elseif (WITH_AMD_GPU) + hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS} + ${op_common_deps}) + else() + cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS} + ${op_common_deps}) + endif() + + # Define operators that don't need pybind here. + foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op" +"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op" +"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op" "deformable_conv_op" "dgc_op") + if ("${TARGET}" STREQUAL "${manual_pybind_op}") + set(pybind_flag 1) + endif() + endforeach() + + # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h. + # Note that it's enough to just adding one operator to pybind in a *_op.cc file. + # And for detail pybind information, please see generated paddle/pybind/pybind.h. + file(READ ${TARGET}.cc TARGET_CONTENT) + string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}") + string(REGEX MATCH "REGISTER_OPERATOR\\([a-z0-9_]*," one_register "${multi_register}") + if (one_register STREQUAL "") + string(REPLACE "_op" "" TARGET "${TARGET}") + else () + string(REPLACE "REGISTER_OPERATOR(" "" TARGET "${one_register}") + string(REPLACE "," "" TARGET "${TARGET}") + endif() + + # pybind USE_NO_KERNEL_OP + # HACK: if REGISTER_OP_CPU_KERNEL presents the operator must have kernel + string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}") + string(REPLACE "_op" "" TARGET "${TARGET}") + if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "") + file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(${TARGET});\n") + set(pybind_flag 1) + endif() + + # pybind USE_CPU_ONLY_OP + list(LENGTH cu_srcs cu_srcs_len) + list(LENGTH cu_cc_srcs cu_cc_srcs_len) + list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len) + list(LENGTH hip_cu_srcs hip_cu_srcs_len) + list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len) + if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND + ${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0) + file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n") + set(pybind_flag 1) + endif() + + # pybind USE_OP_DEVICE_KERNEL for CUDNN + list(LENGTH cudnn_cu_cc_srcs cudnn_cu_cc_srcs_len) + if (WITH_GPU AND ${cudnn_cu_cc_srcs_len} GREATER 0) + if(${TARGET} STREQUAL "activation") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, CUDNN);\n") + else() + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") + endif() + endif() + + # pybind USE_OP_DEVICE_KERNEL for MIOPEN + if (WITH_AMD_GPU AND ${miopen_hip_cc_srcs_len} GREATER 0) + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n") + endif() + + # pybind USE_OP_DEVICE_KERNEL for MKLDNN + if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0) + # Append first implemented MKLDNN activation operator + if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n") + elseif(${MKLDNN_FILE} STREQUAL "conv_mkldnn_op") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);\n") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, S8);\n") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, U8);\n") + + else() + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n") + endif() + endif() + + # pybind USE_OP + if (${pybind_flag} EQUAL 0) + # NOTE(*): activation use macro to regist the kernels, set use_op manually. + if(${TARGET} STREQUAL "activation") + file(APPEND ${pybind_file} "USE_OP(relu);\n") + elseif(${TARGET} STREQUAL "fake_dequantize") + file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n") + elseif(${TARGET} STREQUAL "fake_quantize") + file(APPEND ${pybind_file} "USE_OP(fake_quantize_abs_max);\n") + elseif(${TARGET} STREQUAL "tensorrt_engine_op") + message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference") + elseif(${TARGET} STREQUAL "fc") + # HACK: fc only have mkldnn and cpu, which would mismatch the cpu only condition + file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n") + else() + file(APPEND ${pybind_file} "USE_OP(${TARGET});\n") + endif() + endif() +endfunction() + + +function(register_operators) + set(options "") + set(oneValueArgs "") + set(multiValueArgs EXCLUDES DEPS) + cmake_parse_arguments(register_operators "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + + file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc") + string(REPLACE "_mkldnn" "" OPS "${OPS}") + string(REPLACE ".cc" "" OPS "${OPS}") + list(REMOVE_DUPLICATES OPS) + list(LENGTH register_operators_DEPS register_operators_DEPS_len) + + foreach(src ${OPS}) + list(FIND register_operators_EXCLUDES ${src} _index) + if (${_index} EQUAL -1) + if (${register_operators_DEPS_len} GREATER 0) + op_library(${src} DEPS ${register_operators_DEPS}) + else() + op_library(${src}) + endif() + endif() + endforeach() +endfunction() diff --git a/cmake/package.cmake b/cmake/package.cmake new file mode 100644 index 00000000..79e02147 --- /dev/null +++ b/cmake/package.cmake @@ -0,0 +1,21 @@ +set(CPACK_PACKAGE_NAME paddle) +set(CPACK_PACKAGE_VERSION_MAJOR ${PADDLE_MAJOR_VERSION}) +set(CPACK_PACKAGE_VERSION_MINOR ${PADDLE_MINOR_VERSION}) +set(CPACK_PACKAGE_VERSION_PATCH ${PADDLE_PATCH_VERSION}) +set(CPACK_PACKAGE_VERSION ${PADDLE_VERSION}) +## DEB Settings +set(CPACK_DEBIAN_PACKAGE_NAME paddle) +set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE amd64) +set(CPACK_DEBIAN_PACKAGE_MAINTAINER PaddlePaddle Dev ) +set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Paddle") +set(CPACK_PACKAGE_DESCRIPTION "") +set(CPACK_DEBIAN_PACKAGE_DEPENDS "libpython2.7-dev, libstdc++6, python-pip, curl, libgfortran3, python-pip-whl") +set(CPACK_DEBIAN_PACKAGE_SECTION Devel) +set(CPACK_DEBIAN_PACKAGE_VERSION ${PADDLE_VERSION}) +set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PADDLE_SOURCE_DIR}/paddle/scripts/deb/postinst") +#set(CPACK_GENERATOR "DEB") +# Start cpack +include (CMakePackageConfigHelpers) +include (CPack) + + diff --git a/cmake/python_module.cmake b/cmake/python_module.cmake new file mode 100644 index 00000000..1412b7f7 --- /dev/null +++ b/cmake/python_module.cmake @@ -0,0 +1,43 @@ +# Find if a Python module is installed +# Found at http://www.cmake.org/pipermail/cmake/2011-January/041666.html +# To use do: find_python_module(PyQt4 REQUIRED) +function(find_python_module module) + string(TOUPPER ${module} module_upper) + if(NOT PY_${module_upper}) + if(ARGC GREATER 1 AND ARGV1 STREQUAL "REQUIRED") + set(${module}_FIND_REQUIRED TRUE) + else() + set(${module}_FIND_REQUIRED FALSE) + endif() + # A module's location is usually a directory, but for binary modules + # it's a .so file. + execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c" + "import re, ${module}; print(re.compile('/__init__.py.*').sub('',${module}.__file__))" + RESULT_VARIABLE _${module}_status + OUTPUT_VARIABLE _${module}_location + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) + if(NOT _${module}_status) + set(PY_${module_upper} ${_${module}_location} CACHE STRING + "Location of Python module ${module}") + endif(NOT _${module}_status) + endif(NOT PY_${module_upper}) + find_package_handle_standard_args(PY_${module} DEFAULT_MSG PY_${module_upper}) + if(NOT PY_${module_upper}_FOUND AND ${module}_FIND_REQUIRED) + message(FATAL_ERROR "python module ${module} is not found") + endif() + + execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c" + "import sys, ${module}; sys.stdout.write(${module}.__version__)" + OUTPUT_VARIABLE _${module}_version + RESULT_VARIABLE _${module}_status + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) + if(NOT _${module}_status) + set(PY_${module_upper}_VERSION ${_${module}_version} CACHE STRING + "Version of Python module ${module}") + endif(NOT _${module}_status) + + set(PY_${module_upper}_FOUND ${PY_${module_upper}_FOUND} PARENT_SCOPE) + set(PY_${module_upper}_VERSION ${PY_${module_upper}_VERSION} PARENT_SCOPE) +endfunction(find_python_module) diff --git a/cmake/simd.cmake b/cmake/simd.cmake new file mode 100644 index 00000000..566dc75f --- /dev/null +++ b/cmake/simd.cmake @@ -0,0 +1,99 @@ +# This file is use to check all support level of AVX on your machine +# so that PaddlePaddle can unleash the vectorization power of muticore. + +include(CheckCXXSourceRuns) +include(CheckCXXSourceCompiles) + +if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + set(MMX_FLAG "-mmmx") + set(SSE2_FLAG "-msse2") + set(SSE3_FLAG "-msse3") + set(AVX_FLAG "-mavx") + set(AVX2_FLAG "-mavx2") + set(AVX512F_FLAG "-mavx512f") +elseif(MSVC) + set(MMX_FLAG "/arch:MMX") + set(SSE2_FLAG "/arch:SSE2") + set(SSE3_FLAG "/arch:SSE3") + SET(AVX_FLAG "/arch:AVX") + SET(AVX2_FLAG "/arch:AVX2") +endif() + +set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS}) + +# Check MMX +set(CMAKE_REQUIRED_FLAGS ${MMX_FLAG}) +set(MMX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) +CHECK_CXX_SOURCE_RUNS(" +#include +int main() +{ + _mm_setzero_si64(); + return 0; +}" MMX_FOUND) + +# Check SSE2 +set(CMAKE_REQUIRED_FLAGS ${SSE2_FLAG}) +set(SSE2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) +CHECK_CXX_SOURCE_RUNS(" +#include +int main() +{ + _mm_setzero_si128(); + return 0; +}" SSE2_FOUND) + +# Check SSE3 +set(CMAKE_REQUIRED_FLAGS ${SSE3_FLAG}) +set(SSE3_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) +CHECK_CXX_SOURCE_RUNS(" +#include +int main() +{ + __m128d a = _mm_set1_pd(6.28); + __m128d b = _mm_set1_pd(3.14); + __m128d result = _mm_addsub_pd(a, b); + result = _mm_movedup_pd(result); + return 0; +}" SSE3_FOUND) + +# Check AVX +set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) +set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) +CHECK_CXX_SOURCE_RUNS(" +#include +int main() +{ + __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); + __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); + __m256 result = _mm256_add_ps (a, b); + return 0; +}" AVX_FOUND) + +# Check AVX 2 +set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) +set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) +CHECK_CXX_SOURCE_RUNS(" +#include +int main() +{ + __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4); + __m256i result = _mm256_abs_epi32 (a); + return 0; +}" AVX2_FOUND) + +# Check AVX512F +set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) +set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) +CHECK_CXX_SOURCE_RUNS(" +#include +int main() +{ + __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4, + 13, -5, 6, -7, 9, 2, -6, 3); + __m512i result = _mm512_abs_epi32 (a); + return 0; +}" AVX512F_FOUND) + +set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED}) +mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND) diff --git a/cmake/system.cmake b/cmake/system.cmake new file mode 100644 index 00000000..65db05be --- /dev/null +++ b/cmake/system.cmake @@ -0,0 +1,85 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Detects the OS and sets appropriate variables. +# CMAKE_SYSTEM_NAME only give us a coarse-grained name of the OS CMake is +# building for, but the host processor name like centos is necessary +# in some scenes to distinguish system for customization. +# +# for instance, protobuf libs path is /lib64 +# on CentOS, but /lib on other systems. + +IF(WIN32) + SET(HOST_SYSTEM "win32") +ELSE(WIN32) + IF(APPLE) + SET(HOST_SYSTEM "macosx") + EXEC_PROGRAM(sw_vers ARGS -productVersion OUTPUT_VARIABLE HOST_SYSTEM_VERSION) + STRING(REGEX MATCH "[0-9]+.[0-9]+" MACOS_VERSION "${HOST_SYSTEM_VERSION}") + IF(NOT DEFINED $ENV{MACOSX_DEPLOYMENT_TARGET}) + # Set cache variable - end user may change this during ccmake or cmake-gui configure. + SET(CMAKE_OSX_DEPLOYMENT_TARGET ${MACOS_VERSION} CACHE STRING + "Minimum OS X version to target for deployment (at runtime); newer APIs weak linked. Set to empty string for default value.") + ENDIF() + set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security") + ELSE(APPLE) + + IF(EXISTS "/etc/issue") + FILE(READ "/etc/issue" LINUX_ISSUE) + IF(LINUX_ISSUE MATCHES "CentOS") + SET(HOST_SYSTEM "centos") + ELSEIF(LINUX_ISSUE MATCHES "Debian") + SET(HOST_SYSTEM "debian") + ELSEIF(LINUX_ISSUE MATCHES "Ubuntu") + SET(HOST_SYSTEM "ubuntu") + ELSEIF(LINUX_ISSUE MATCHES "Red Hat") + SET(HOST_SYSTEM "redhat") + ELSEIF(LINUX_ISSUE MATCHES "Fedora") + SET(HOST_SYSTEM "fedora") + ENDIF() + + STRING(REGEX MATCH "(([0-9]+)\\.)+([0-9]+)" HOST_SYSTEM_VERSION "${LINUX_ISSUE}") + ENDIF(EXISTS "/etc/issue") + + IF(EXISTS "/etc/redhat-release") + FILE(READ "/etc/redhat-release" LINUX_ISSUE) + IF(LINUX_ISSUE MATCHES "CentOS") + SET(HOST_SYSTEM "centos") + ENDIF() + ENDIF(EXISTS "/etc/redhat-release") + + IF(NOT HOST_SYSTEM) + SET(HOST_SYSTEM ${CMAKE_SYSTEM_NAME}) + ENDIF() + + ENDIF(APPLE) +ENDIF(WIN32) + +# query number of logical cores +CMAKE_HOST_SYSTEM_INFORMATION(RESULT CPU_CORES QUERY NUMBER_OF_LOGICAL_CORES) + +MARK_AS_ADVANCED(HOST_SYSTEM CPU_CORES) + +MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}, version: ${HOST_SYSTEM_VERSION}") +MESSAGE(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores") + +# external dependencies log output +SET(EXTERNAL_PROJECT_LOG_ARGS + LOG_DOWNLOAD 0 # Wrap download in script to log output + LOG_UPDATE 1 # Wrap update in script to log output + LOG_CONFIGURE 1 # Wrap configure in script to log output + LOG_BUILD 0 # Wrap build in script to log output + LOG_TEST 1 # Wrap test in script to log output + LOG_INSTALL 0 # Wrap install in script to log output +) diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake new file mode 100644 index 00000000..3bf12094 --- /dev/null +++ b/cmake/tensorrt.cmake @@ -0,0 +1,38 @@ +if(NOT WITH_GPU) + return() +endif() + +set(TENSORRT_ROOT "/usr" CACHE PATH "TENSORRT ROOT") +find_path(TENSORRT_INCLUDE_DIR NvInfer.h + PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/include + $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/include + NO_DEFAULT_PATH +) + +find_library(TENSORRT_LIBRARY NAMES libnvinfer.so libnvinfer.a + PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/lib + $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/lib + NO_DEFAULT_PATH + DOC "Path to TensorRT library.") + +if(TENSORRT_INCLUDE_DIR AND TENSORRT_LIBRARY) + if(WITH_DSO) + set(TENSORRT_FOUND ON) + endif(WITH_DSO) +else() + set(TENSORRT_FOUND OFF) +endif() + +if(TENSORRT_FOUND) + file(READ ${TENSORRT_INCLUDE_DIR}/NvInfer.h TENSORRT_VERSION_FILE_CONTENTS) + string(REGEX MATCH "define NV_TENSORRT_MAJOR +([0-9]+)" TENSORRT_MAJOR_VERSION + "${TENSORRT_VERSION_FILE_CONTENTS}") + string(REGEX REPLACE "define NV_TENSORRT_MAJOR +([0-9]+)" "\\1" + TENSORRT_MAJOR_VERSION "${TENSORRT_MAJOR_VERSION}") + + message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. " + "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ") + include_directories(${TENSORRT_INCLUDE_DIR}) + link_directories(${TENSORRT_LIBRARY}) + add_definitions(-DPADDLE_WITH_TENSORRT) +endif() diff --git a/cmake/util.cmake b/cmake/util.cmake new file mode 100644 index 00000000..02667dbc --- /dev/null +++ b/cmake/util.cmake @@ -0,0 +1,55 @@ +# Some common routine for paddle compile. + +# target_circle_link_libraries +# Link libraries to target which has circle dependencies. +# +# First Argument: target name want to be linked with libraries +# Rest Arguments: libraries which link together. +function(target_circle_link_libraries TARGET_NAME) + if(APPLE) + set(LIBS) + set(inArchive OFF) + set(libsInArgn) + + foreach(arg ${ARGN}) + if(${arg} STREQUAL "ARCHIVE_START") + set(inArchive ON) + elseif(${arg} STREQUAL "ARCHIVE_END") + set(inArchive OFF) + else() + if(inArchive) + list(APPEND LIBS "-Wl,-force_load") + endif() + list(APPEND LIBS ${arg}) + list(APPEND libsInArgn ${arg}) + endif() + endforeach() + if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang") + if(NOT IOS_ENABLE_BITCODE) + list(APPEND LIBS "-undefined dynamic_lookup") + endif() + endif() + list(REVERSE libsInArgn) + target_link_libraries(${TARGET_NAME} + ${LIBS} + ${libsInArgn}) + + else() # LINUX + set(LIBS) + + foreach(arg ${ARGN}) + if(${arg} STREQUAL "ARCHIVE_START") + list(APPEND LIBS "-Wl,--whole-archive") + elseif(${arg} STREQUAL "ARCHIVE_END") + list(APPEND LIBS "-Wl,--no-whole-archive") + else() + list(APPEND LIBS ${arg}) + endif() + endforeach() + + target_link_libraries(${TARGET_NAME} + "-Wl,--start-group" + ${LIBS} + "-Wl,--end-group") + endif() +endfunction() diff --git a/cmake/version.cmake b/cmake/version.cmake new file mode 100644 index 00000000..dd57d4ab --- /dev/null +++ b/cmake/version.cmake @@ -0,0 +1,63 @@ +# Get the latest git tag. +set(PADDLE_VERSION $ENV{PADDLE_VERSION}) +set(tmp_version "HEAD") +set(TAG_VERSION_REGEX "[0-9]+\\.[0-9]+\\.[0-9]+(\\.(a|b|rc)\\.[0-9]+)?") +set(COMMIT_VERSION_REGEX "[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+") +while ("${PADDLE_VERSION}" STREQUAL "") + # Check current branch name + execute_process( + COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref ${tmp_version} + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} + OUTPUT_VARIABLE GIT_BRANCH_NAME + RESULT_VARIABLE GIT_BRANCH_RESULT + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + if (NOT ${GIT_BRANCH_RESULT}) + execute_process( + COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 --always ${tmp_version} + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} + OUTPUT_VARIABLE GIT_TAG_NAME + RESULT_VARIABLE GIT_RESULT + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + if (NOT ${GIT_RESULT}) + # Check if current branch is release branch + if (${GIT_BRANCH_NAME} MATCHES "release/${TAG_VERSION_REGEX}") + # Check the tag is a correct version + if (${GIT_TAG_NAME} MATCHES "${COMMIT_VERSION_REGEX}") + # if no tag was found, set PADDLE_VERSION to 0.0.0 to represent latest + set(PADDLE_VERSION "0.0.0") + elseif (${GIT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}") + string(REPLACE "v" "" PADDLE_VERSION ${GIT_TAG_NAME}) + else() # otherwise, get the previous git tag name. + set(tmp_version "${GIT_TAG_NAME}~1") + endif() + else() + execute_process( + COMMAND ${GIT_EXECUTABLE} describe --exact-match --tags ${tmp_version} + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} + OUTPUT_VARIABLE GIT_EXACT_TAG_NAME + RESULT_VARIABLE GIT_EXACT_TAG_RESULT + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) + if (NOT ${GIT_EXACT_TAG_NAME}) + # Check if current branch is tag branch + if (${GIT_EXACT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}") + string(REPLACE "v" "" PADDLE_VERSION ${GIT_EXACT_TAG_NAME}) + else() + set(PADDLE_VERSION "0.0.0") + endif() + else() + # otherwise, we always set PADDLE_VERSION to 0.0.0 to represent latest + set(PADDLE_VERSION "0.0.0") + endif() + endif() + else() + set(PADDLE_VERSION "0.0.0") + message(WARNING "Cannot add paddle version from git tag") + endif() + else() + set(PADDLE_VERSION "0.0.0") + message(WARNING "Cannot add paddle version for wrong git branch result") + endif() +endwhile() + +add_definitions(-DPADDLE_VERSION=${PADDLE_VERSION}) +message(STATUS "Paddle version is ${PADDLE_VERSION}") diff --git a/doc/README.md b/doc/README.md new file mode 100644 index 00000000..998a39f1 --- /dev/null +++ b/doc/README.md @@ -0,0 +1,7 @@ +# For Readers and Developers + +Thanks for reading PaddlePaddle documentation. + +Since **September 17th, 2018**, the **0.15.0 and develop** documentation source has been moved to [FluidDoc Repo](https://github.com/PaddlePaddle/FluidDoc) and updated there. + +Please turn to FluidDoc Repo for the latest documentation. diff --git a/go/glide.lock b/go/glide.lock new file mode 100644 index 00000000..d15fc934 --- /dev/null +++ b/go/glide.lock @@ -0,0 +1,233 @@ +hash: 107c058cf5c9163a75d40eef2273a793c36112683c25d72aa8288827fdde3a19 +updated: 2017-10-30T03:46:19.137696069Z +imports: +- name: github.com/alecthomas/gometalinter + version: bae2f1293d092fd8167939d5108d1b025eaef9de +- name: github.com/beorn7/perks + version: 4c0e84591b9aa9e6dcfdf3e020114cd81f89d5f9 + subpackages: + - quantile +- name: github.com/boltdb/bolt + version: 583e8937c61f1af6513608ccc75c97b6abdf4ff9 +- name: github.com/cockroachdb/cmux + version: 112f0506e7743d64a6eb8fedbcff13d9979bbf92 +- name: github.com/coreos/etcd + version: f1d7dd87da3e8feab4aaf675b8e29c6a5ed5f58b + subpackages: + - alarm + - auth + - auth/authpb + - client + - clientv3 + - clientv3/concurrency + - compactor + - discovery + - embed + - error + - etcdserver + - etcdserver/api + - etcdserver/api/etcdhttp + - etcdserver/api/v2http + - etcdserver/api/v2http/httptypes + - etcdserver/api/v3client + - etcdserver/api/v3election + - etcdserver/api/v3election/v3electionpb + - etcdserver/api/v3election/v3electionpb/gw + - etcdserver/api/v3lock + - etcdserver/api/v3lock/v3lockpb + - etcdserver/api/v3lock/v3lockpb/gw + - etcdserver/api/v3rpc + - etcdserver/api/v3rpc/rpctypes + - etcdserver/auth + - etcdserver/etcdserverpb + - etcdserver/etcdserverpb/gw + - etcdserver/membership + - etcdserver/stats + - lease + - lease/leasehttp + - lease/leasepb + - mvcc + - mvcc/backend + - mvcc/mvccpb + - pkg/adt + - pkg/contention + - pkg/cors + - pkg/cpuutil + - pkg/crc + - pkg/debugutil + - pkg/fileutil + - pkg/httputil + - pkg/idutil + - pkg/ioutil + - pkg/logutil + - pkg/monotime + - pkg/netutil + - pkg/pathutil + - pkg/pbutil + - pkg/runtime + - pkg/schedule + - pkg/srv + - pkg/tlsutil + - pkg/transport + - pkg/types + - pkg/wait + - proxy/grpcproxy/adapter + - raft + - raft/raftpb + - rafthttp + - snap + - snap/snappb + - store + - version + - wal + - wal/walpb +- name: github.com/coreos/go-semver + version: 8ab6407b697782a06568d4b7f1db25550ec2e4c6 + subpackages: + - semver +- name: github.com/coreos/go-systemd + version: 48702e0da86bd25e76cfef347e2adeb434a0d0a6 + subpackages: + - daemon + - journal + - util +- name: github.com/coreos/pkg + version: 3ac0863d7acf3bc44daf49afef8919af12f704ef + subpackages: + - capnslog +- name: github.com/dgrijalva/jwt-go + version: d2709f9f1f31ebcda9651b03077758c1f3a0018c +- name: github.com/ghodss/yaml + version: 0ca9ea5df5451ffdf184b4428c902747c2c11cd7 +- name: github.com/go-stack/stack + version: 817915b46b97fd7bb80e8ab6b69f01a53ac3eebf +- name: github.com/gogo/protobuf + version: 909568be09de550ed094403c2bf8a261b5bb730a + subpackages: + - proto +- name: github.com/golang/protobuf + version: 4bd1920723d7b7c925de087aa32e2187708897f7 + subpackages: + - jsonpb + - proto +- name: github.com/golang/snappy + version: 553a641470496b2327abcac10b36396bd98e45c9 +- name: github.com/google/btree + version: 925471ac9e2131377a91e1595defec898166fe49 +- name: github.com/grpc-ecosystem/go-grpc-prometheus + version: 6b7015e65d366bf3f19b2b2a000a831940f0f7e0 +- name: github.com/grpc-ecosystem/grpc-gateway + version: 18d159699f2e83fc5bb9ef2f79465ca3f3122676 + subpackages: + - runtime + - runtime/internal + - utilities +- name: github.com/inconshreveable/log15 + version: 0decfc6c20d9ca0ad143b0e89dcaa20f810b4fb3 +- name: github.com/jonboulle/clockwork + version: 2eee05ed794112d45db504eb05aa693efd2b8b09 +- name: github.com/mattn/go-colorable + version: 5411d3eea5978e6cdc258b30de592b60df6aba96 +- name: github.com/mattn/go-isatty + version: 57fdcb988a5c543893cc61bce354a6e24ab70022 +- name: github.com/matttproud/golang_protobuf_extensions + version: c12348ce28de40eed0136aa2b644d0ee0650e56c + subpackages: + - pbutil +- name: github.com/namsral/flag + version: 71ceffbeb0ba60fccc853971bb3ed4d7d90bfd04 +- name: github.com/PaddlePaddle/recordio + version: 0432dee9fd4b24fb6840fb20a8c055b0c933fb81 +- name: github.com/prometheus/client_golang + version: c5b7fccd204277076155f10851dad72b76a49317 + subpackages: + - prometheus +- name: github.com/prometheus/client_model + version: 6f3806018612930941127f2a7c6c453ba2c527d2 + subpackages: + - go +- name: github.com/prometheus/common + version: 49fee292b27bfff7f354ee0f64e1bc4850462edf + subpackages: + - expfmt + - internal/bitbucket.org/ww/goautoneg + - model +- name: github.com/prometheus/procfs + version: a1dba9ce8baed984a2495b658c82687f8157b98f + subpackages: + - xfs +- name: github.com/satori/go.uuid + version: 879c5887cd475cd7864858769793b2ceb0d44feb +- name: github.com/sirupsen/logrus + version: f006c2ac4710855cf0f916dd6b77acf6b048dc6e +- name: github.com/topicai/candy + version: 1b9030d056fa9f8c4b1f9c91b52fe4b8ab4cd8cc +- name: github.com/ugorji/go + version: ded73eae5db7e7a0ef6f55aace87a2873c5d2b74 + subpackages: + - codec +- name: github.com/xiang90/probing + version: 07dd2e8dfe18522e9c447ba95f2fe95262f63bb2 +- name: golang.org/x/crypto + version: 9419663f5a44be8b34ca85f08abc5fe1be11f8a3 + repo: https://github.com/golang/crypto.git + vcs: git + subpackages: + - bcrypt + - blowfish + - ssh/terminal +- name: golang.org/x/net + version: c8c74377599bd978aee1cf3b9b63a8634051cec2 + subpackages: + - context + - http2 + - http2/hpack + - idna + - internal/timeseries + - lex/httplex + - trace +- name: golang.org/x/sys + version: e48874b42435b4347fc52bdee0424a52abc974d7 + repo: https://github.com/golang/sys.git + vcs: git + subpackages: + - unix + - windows +- name: golang.org/x/text + version: 836efe42bb4aa16aaa17b9c155d8813d336ed720 + repo: https://github.com/golang/text.git + vcs: git + subpackages: + - secure/bidirule + - transform + - unicode/bidi + - unicode/norm +- name: google.golang.org/grpc + version: 8050b9cbc271307e5a716a9d782803d09b0d6f2d + subpackages: + - codes + - credentials + - grpclog + - internal + - keepalive + - metadata + - naming + - peer + - stats + - tap + - transport +- name: gopkg.in/yaml.v2 + version: cd8b52f8269e0feb286dfeef29f8fe4d5b397e0b +testImports: +- name: github.com/davecgh/go-spew + version: 04cdfd42973bb9c8589fd6a731800cf222fde1a9 + subpackages: + - spew +- name: github.com/pmezard/go-difflib + version: d8ed2627bdf02c080bf22230dbb337003b7aba2d + subpackages: + - difflib +- name: github.com/stretchr/testify + version: 05e8a0eda380579888eb53c394909df027f06991 + subpackages: + - assert diff --git a/go/glide.yaml b/go/glide.yaml new file mode 100644 index 00000000..c5d66694 --- /dev/null +++ b/go/glide.yaml @@ -0,0 +1,33 @@ +package: github.com/PaddlePaddle/Paddle/go +import: +- package: github.com/PaddlePaddle/recordio +- package: github.com/coreos/etcd + version: ^3.2.1 + subpackages: + - clientv3 + - clientv3/concurrency + - embed + - etcdserver +- package: github.com/namsral/flag + version: ^1.7.4-pre +- package: github.com/sirupsen/logrus + version: ^1.0.0 +- package: github.com/topicai/candy +- package: golang.org/x/crypto + repo: https://github.com/golang/crypto.git + vcs: git +- package: golang.org/x/sys + repo: https://github.com/golang/sys.git + vcs: git +- package: golang.org/x/text + repo: https://github.com/golang/text.git + vcs: git +- package: github.com/satori/go.uuid + version: v1.1.0 +- package: github.com/alecthomas/gometalinter + version: v1.2.1 +- package: github.com/inconshreveable/log15 + version: v2.13 +- package: github.com/go-stack/stack + version: v1.6.0 +- package: github.com/golang/protobuf diff --git a/paddle/.common_test_util.sh b/paddle/.common_test_util.sh new file mode 100644 index 00000000..4681e49a --- /dev/null +++ b/paddle/.common_test_util.sh @@ -0,0 +1,120 @@ +#!/bin/bash +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +PORT_FILE=/tmp/paddle_test_ports +PORT_LOCK_FILE=/tmp/paddle_test_ports.lock + +# Create flag file, all user can rw, ignore all error here +touch $PORT_FILE $PORT_LOCK_FILE 2>/dev/null +chmod a+rw $PORT_FILE $PORT_LOCK_FILE 2>/dev/null + +# acquire a range of ports that not used by other runtests.sh currentlly. +# return 1 if ports is used by other, otherwise return 0. +# NOTE: the acquire_ports/release_ports is interprocess mutexed. +# +# There are two parameter of this method +# param 1: the begin of port range +# param 2: the length of port range. +# so, the port range is [param1, param1+param2) +acquire_ports(){ + ( + flock -x 200 + let "len=$1+$2" + for((i=$1; i<$len; i++)) + do + grep -q $i $PORT_FILE + if [ $? -eq 0 ] ; then + return 1 # Port already write to $PORT_FILE + fi + done + + for((i=$1; i<$len; i++)) + do + echo $i >> $PORT_FILE # Write to $PORT_FILE + done + return 0 + )200>$PORT_LOCK_FILE +} + +# release a range of ports. Mark these ports is not used by runtests.sh. +# NOTE: the acquire_ports/release_ports is interprocess mutexed. +# +# The parameter is same as acquire_ports, see acquire_ports' comments. +release_ports(){ + ( + flock -x 200 + let "len=$1+$2" + for((i=$1; i<$len; i++)) + do + tmp=`sed "/$i/d" $PORT_FILE` # remove port + echo $tmp > $PORT_FILE + done + )200>$PORT_LOCK_FILE +} + +# use set_port to get a random free port +# such as set_port -p port test_fuc to run test_fuc --port=random +# use -n to set_port test_fuc to get a continuous free port +# such as set_port -n 10 -p port test_fuc to get ten continuous free port to run test_fuc --port=random +set_port() +{ + num=1 + + port_type="port" + unset OPTIND + while getopts "n:p:" opt + do + case "$opt" in + n) echo "get num ${OPTARG}" + num=${OPTARG} + ;; + p) echo "get port_type ${OPTARG}" + port_type=${OPTARG} + ;; + esac + done + shift $((OPTIND-1)) + cmd=$@ + for ((i=1;i<=10000;i++)) + do + declare -i port=$RANDOM+10000 + port_used_total=0 + for((n=0;n<=num-1;n++)) + do + declare -i port_check=$port+$n + port_used_num=`netstat -a |grep $port_check|wc -l` + declare -i port_used_total=$port_used_total+$port_used_num + done + if [ $port_used_total -ne 0 ] + then + continue + fi + # Lock Ports. + acquire_ports $port $num + if [ $? -ne 0 ]; then + continue + fi + $cmd --$port_type=$port + return_val=$? + release_ports $port $num + if [ $return_val -eq 0 ]; then + return 0 + else + echo "$cmd run wrong" + return 1 + fi + done + +} diff --git a/paddle/.gitignore b/paddle/.gitignore new file mode 100644 index 00000000..01904aa6 --- /dev/null +++ b/paddle/.gitignore @@ -0,0 +1,43 @@ +.timestamp +*.o +*.a +.svn +GPATH +GRTAGS +GTAGS +.idl* +*~ +*.pyc +*.pb.cc +*.pb.h +*_pb2.py +output/ +google/ +Makefile +log/ +.pptool_config +hf/ +build +issue.info + +ar +g++ +gcc +ld +ld-linux-x86-64.so.2 +x86_64-scm-linux-gnu/ +.lint.*.md5 + +.idea/ +.test_env +Paddle_wrap.cxx +Paddle_wrap.h +paddle.py +py_paddle-*.whl +py_paddle/paddle.py +.py_paddle_extra_link_flags +HPPL_ERROR_LOG +unittest.list +proto +dist +setup.py diff --git a/paddle/.set_port.sh b/paddle/.set_port.sh new file mode 100755 index 00000000..617ac79a --- /dev/null +++ b/paddle/.set_port.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +DIRNAME=`dirname $0` +source $DIRNAME/.common_test_util.sh +set_port $@ diff --git a/paddle/.set_python_path.sh b/paddle/.set_python_path.sh new file mode 100755 index 00000000..8fd58925 --- /dev/null +++ b/paddle/.set_python_path.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# A simple test driver for cmake. +# set PYTHONPATH before run command. +# Usage: +# ./.set_python_pash.sh -p YOUR_PYTHON_PATH {exec...} +# +# It same as PYTHONPATH=${YOUR_PYTHON_PATH}:$PYTHONPATH {exec...} +# +PYPATH="" +set -x +while getopts "d:" opt; do + case $opt in + d) + PYPATH=$OPTARG + ;; + esac +done +shift $(($OPTIND - 1)) +export PYTHONPATH=$PYPATH:$PYTHONPATH +$@ diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt new file mode 100644 index 00000000..c0c04d47 --- /dev/null +++ b/paddle/CMakeLists.txt @@ -0,0 +1,4 @@ +add_subdirectory(scripts) +add_subdirectory(testing) +set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory") +add_subdirectory(fluid) diff --git a/paddle/contrib/float16/.gitignore b/paddle/contrib/float16/.gitignore new file mode 100644 index 00000000..dd28d354 --- /dev/null +++ b/paddle/contrib/float16/.gitignore @@ -0,0 +1 @@ +*.inference.model diff --git a/paddle/contrib/float16/README.md b/paddle/contrib/float16/README.md new file mode 100644 index 00000000..a1f8cb42 --- /dev/null +++ b/paddle/contrib/float16/README.md @@ -0,0 +1,171 @@ +# Float16 Inference in PaddlePaddle Fluid + +Kexin Zhao + +## Introduction +Deep learning is usually a two-stage work: training and inference. The training stage estimates model parameters (weights) from data. The inference stage loads the weights and uses them to interpret inputs. Typically, weights are 32-bit float values (float32). Some new devices, including NVIDIA Volta GPUs, support higher speed computation using 16-bit float values (float16). + +This article explains our efforts with PaddlePaddle to train using float32 and to inference using float16. We describe a [*transpiler*](https://github.com/PaddlePaddle/Paddle/blob/a4d3de0071e1f3912230c3ab3f9ac74cf06b093a/doc/fluid/design/motivation/fluid_compiler.md), which converts a PaddlePaddle Fluid model, which, to be precise, should be called a [Fluid *program*](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/concepts/program.md), into the inference program, and converts the weights from float32 into float16. + + +## What is float16? +float16 (or FP16) is a half-precision floating-point format that uses 16 bits in memory to represent a value. The advantage over 32-bit single-precision floating-point format (commonly known as float or float32 data type) is that it requires half the storage and bandwidth at the expense of precision and range. Fortunately, DNN inference has a high tolerance for the loss of precision and range when using float16 to represent the weights, and the inference accuracy will only be minimally affected in most cases, which gives us the opportunity to use float16 data type to speed up the inference. + +Interested readers can refer to our [design doc](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/data_type/float16.md) and [code](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/float16.h) for more details on how we implement the float16 data type. + +## Why float16? +The trend in today's deep learning community is to use bigger and deeper model, which translates to larger memory footprint, higher computation demands, and as a result higher energy consumption on computing devices. The advantages of float16 over float32 are correspondingly three-fold: + +1. We only need half the memory size to load the same model using float16 representations. Moreover, most of the intermediate results generated during float16 inference are also of the float16 data type. As a result, the whole memory footprint of float16 inference is roughly half of its float counterpart, which is especially useful when deploying inference on mobile devices with limited available memory. Also given the same available memory, the maximum batch size for float16 inference is about twice that for float inference. + +2. Because float16 occupies less memory than float, in theory, hardware devices can achieve much higher floating point operators per second (FLOPS) for float16 data than float data. Right now, NVIDIA's latest Volta GPUs, including Tesla V100 and Titan V, can deliver significantly higher FLOPS for float16 using Tensor Cores. Moreover, float16 takes less time to read from or write to memory, and hence float16 can make inference more efficient especially in memory-bound applications where the performance is mostly affected by how fast it is to read and write data. + +3. From the energy efficiency perspective, the energy needed to read, write, and compute float16 data is much less than its float counterpart, which can significantly reduce the battery power consumption on mobile devices or the total cost of ownership (TCO) of data centers. + +## Fluid implementation of float16 inference +### Overview +Fluid use [Program](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/modules/python_api.md#program) instead of computation graph to describe a neural network model and the optimization procedure. Fluid program is a python wrapper around a protobuf message called [ProgramDesc](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/concepts/program.md). Similar to programming languages, the basic structure of a Fluid program is some nested [blocks](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/modules/python_api.md#block), where each block consists of some [variable](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/modules/python_api.md#variable) definitions and a sequence of [operators](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/modules/python_api.md#operator). An [executor](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/concepts/executor.md) will run a given program by sequentially executing the operators in the entrance block. + +### Basic requirement +When an executor runs an operator, it uses a kernel to perform computations on tensors contained in the input variables, and then writes the results to the tensors in the output variables. Each operator has multiple kernels for different combinations of data types, devices, and library types, respectively. The operator will select the appropriate kernel to run based on, among other things, the data type of the input tensors. By default, every Fluid operator has a kernel for float data type that takes float inputs and generates float outputs. + +If we provide float input to the first operator in a program, then each operator will use float kernel to compute float output and send it as input to the next operator to trigger its float kernel. This chain effect will make the program run in float mode and gives us a final output of float data type. + +The same principle applies if we want a program to run in float16 mode. We provide input variable of the float16 data type to the first operator, and every subsequent operator will invoke the float16 kernel until we get the final output in float16. So the preliminary requirements for float16 inference are to add float16 kernels to operators that are needed in a specific kind of neural networks. Our current focus is on Convolutional Neural Networks (CNN) and hence we have added float16 kernels to the following operators: convolution, pooling, GEMM, elementwise addition, batch norm, dropout, various activations including relu and tanh, and softmax. + +### float16 transpiler +Furthermore, we need a transpiler to write float16 inference code similar to the following: + +```python +# Get the float32 inference program and load the associated float32 weights +[inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) + +# Prepare the float input data +batch_size = 1 +tensor_img = numpy.random.rand(batch_size, 3, 32, 32).astype(numpy.float32) + +# Running inference_program in float mode +float_results = exe.run(inference_program, + feed={feed_target_names[0]: tensor_img}, + fetch_list=fetch_targets) + +# Use float16 transpiler to speedup +float16_inference_program = float_inference_program.clone() +t = Float16Transpiler() +t.transpile(float16_inference_program, GPUPlace) + +# Running float16_inference_program in float16 mode using the same input data +float16_results = exe.run(float16_inference_program, + feed={feed_target_names[0]: tensor_img}, + fetch_list=fetch_targets) + +# Do some tests to verify the correctness of float16 inference +... +np.testing.assert_almost_equal(float_results, float16_results, ...) +... + +# Save the float16 inference program and float16 weights for future deployment +fluid.io.save_inference_model(fp16_save_dirname, feed_target_names, + fetch_targets, exe, + float16_inference_program) +``` + +In this scenario, we already have a float32 inference program and some associated float32 weights. We can simply use the `transpile` method of the `Float16Transpiler` class to do certain modifications to the existing program and weights so that we have a new float16 program and the associated float16 weights. + +We can then run various inference experiments in float16 mode and save the float16 program and weights on disk for future deployment. To enhance the code usability, we maintain a consistent API so that user can use the same float32 input data to run inference program in either float32 and float16 mode and obtain output data both of float32 data type. Consequently, we need to add cast operators in the float16 inference program for conversions between the float16 tensor and float32 tensor. + +The float16 transpiler is implemented to fulfill the requirements mentioned above. The details of the float16 transpiler can be found [here](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/data_type/float16.md#float16-inference). + +### Experiment results +Simply running the following commands to reproduce the experiment results presented in this section: + +```bash +git clone https://github.com/PaddlePaddle/Paddle.git +cd Paddle +# This line will generate a paddle development docker image with cuda 8 and cudnn 7 +# If you want test on cuda 9 instead, change the line 5 in Paddle/Dockerfile +# from `FROM nvidia/cuda:8.0-cudnn7-devel-ubuntu16.04` +# to `FROM nvidia/cuda:9.0-cudnn7-devel-ubuntu16.04` and similarly for other configurations +nvidia-docker build -t paddle:float16 . +# After running this, different results will be written to different log files in Paddle/contrib/float16/ +nvidia-docker run -it -v $PWD:/paddle paddle:float16 /paddle/paddle/contrib/float16/run_float16_demo.sh +``` + +#### Accuracy +As is mentioned before, DNN inference has been found to be tolerant against the loss of precision and range incurred by float16, and we want to see how good this tolerance is. + +We train a resnet32 model using cifar10 data set, save it when test set accuracy is above 60%, and then test the inference accuracy on the 10000 examples of the cifar10 test set in float16 and float32 mode, respectively. + +We repeat the test ten times and get the following results: + +| | float16 | float32 | +|--------|--------:|--------: | +| # 1 | 62.75% | 62.72% | +| # 2 | 61.27% | 61.28% | +| # 3 | 62.24% | 62.23% | +| # 4 | 64.16% | 64.17% | +| # 5 | 60.75% | 60.77% | +| # 6 | 63.25% | 63.24% | +| # 7 | 62.15% | 62.13% | +| # 8 | 62.05% | 62.02% | +| # 9 | 65.19% | 65.20% | +| #10 | 62.53% | 62.48% | +| average| 62.63% | 62.62% | + +We can see that the accuracy of float16 inference is very close to that of float32 inference in every experiment (within 0.05% difference) and is overall 0.01% better than its float32 counterpart averaged over ten tests. + +#### Performance benchmark +Currently, Fluid only supports float16 inference on NVIDIA GPUs. There is no motivation to support float16 inference on non-ARM CPUs where float16 is not natively supported, and float16 calculation will only be slower than its float32 counterpart. + +NVIDIA started to support its native float16 data type (which has the same internal memory representation as Fluid's float16 class) on CUDA 7.5. Moreover, float16 speedups on computationally intensive tasks including GEMM (general matrix-matrix multiplication) and convolution are supported since cuBLAS 7.5 and cuDNN 5.0. + +Recently, the introduction of [Tensor Core](https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/) in Volta architecture GPUs and the support of Tensor Core computation in CUDA 9.0 and cuDNN 7 make float16 genuinely superior to float in some deep learning applications. + +We thus benchmark the float16 inference performance on a single NVIDIA Tesla V100 GPU (Volta architecture and with Tensor Cores) and compare it with its float32 counterpart. All the following results are in ms (millisecond) averaged over 1000 mini-batches with respective to different mini-batch(mb) sizes. + +Average inference time for one mini-batch on Vgg16 model tested on ImageNet dataset: + +| total | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 | +|-------|-----: |-----: |-----: |-----: |------: |------:|-------:| +|float32| 14.01 | 9.70 | 22.99 | 28.26 | 53.87 | 84.42 | 178.95 | +|float16| 3.32 | 4.11 | 5.88 | 9.41 | 16.54 | 30.47 | 60.23 | +|Speedup| 4.22 | 2.36  | 3.91 | 3.00 | 3.26  | 2.77 | 2.97 | + +We can see that float16 inference provides **2x ~ 4x** speedup on different batch sizes. + +Convolution operation is ususally the computational bottleneck of CNN, so we also check the average time spent on the Fluid convolution operators for one mini-batch as follows: + +|conv op| mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 | +|-------|-----: |-----: |-----: |-----: |------: |------:|-------:| +|float32| 11.95 | 6.96 | 18.65 | 21.42 | 41.35 | 60.58 | 130.11 | +|float16| 1.78 | 2.10 | 2.93 | 4.55 | 7.99 | 14.63 | 28.67 | +|Speedup| 6.71 | 3.31  | 6.37 | 4.71 | 5.18  | 4.14 | 4.54 | + +Fluid convolution operator uses cuDNN 7 to implement the kernel, and we can see that with the help of Tensor Core, float16 convolution is significantly faster than its float32 counterpart, which makes the overall float16 inference performance much better. + +Similarly, we also list the benchmark results of Resnet50 model tested on the ImageNet dataset: + +| total | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 | mb=128 | +|-------|-----: |-----: |-----: |-----: |------: |------:|-------:|-------:| +|float32| 7.03 | 7.41 | 9.16 | 12.55 | 21.13 | 38.27 | 67.93 | 127.02 | +|float16| 6.13 | 6.32 | 6.24 | 7.40 | 10.90 | 18.18 | 33.20 | 64.52 | +|Speedup| 1.15 | 1.17  | 1.47  | 1.70 | 1.94  | 2.11 | 2.05 | 1.97 | + +|conv op| mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 | mb=128 | +|-------|-----: |-----: |-----: |-----: |------: |------:|-------:|-------:| +|float32| 5.43 | 5.46 | 6.50 | 8.36 | 13.80 | 24.45 | 41.21 | 73.44 | +|float16| 4.19 | 4.30 | 3.96 | 4.21 | 5.63 | 8.77 | 15.24 | 28.40 | +|Speedup| 1.30 | 1.27  | 1.64  | 1.99 | 2.45  | 2.79 | 2.70 | 2.59 | + +We find that the speedup provided by float16 inference starts relatively small at 1.15x for batch size 1 and gradually increases to about 2x for larger batch sizes. A similar trend can be found for the time spent on the convolution operator. Note that right now Tensor Cores will only be utilized in the convolution operation when the input data and filter meet specific dimensional requirements. The speedup by float16 inference for Resnet50 is smaller than the Vgg16 counterpart partially because the convolution operation in Resnet is much simpler than its Vgg counterpart and this makes the tensor core less utilized in Resnet than in Vgg. + +We also did the same benchmark on a single NVIDIA GeForce GTX 1080 Ti GPU that does not support Tensor Core. The results show that for Vgg16, float16 inference provides consistent small speedup (around 1.15x) for all mini-batch sizes, while for Resnet50, float16 inference is slower than its float32 counterpart in small batch sizes (mb = 1 and 2) and then delivers around 1.15x speedup for all larger batch sizes. By comparing the benchmarks on 1080 Ti and V100, we find that Tensor Core, which is specialized for float16 computations, is a critical component of high performance float16 inference. + +Please refer to [here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/contrib/float16/float16_benchmark.md) for complete benchmark results. + +### Summary +1. Fluid is now able to run inference in float16 mode via a float16 transpiler. We currently support CNN programs, including Vgg and Resnet, to run in float16 inference mode. +2. The accuracy of float16 inference is verified to be almost identical to its float32 counterpart at least on CNN models. +3. float16 inference provides a significant speedup on large and computationally intensive Vgg16 model on ImageNet dataset. For the much smaller and simpler Resnet50 model, the speedup provided by float16 inference is less significant than for Vgg16 model but still favorable, especially for large batch sizes. +4. We cannot achieve the superior float16 inference performance without the help of the newly introduced Tensor Cores on NVIDIA Volta architecture GPUs. diff --git a/paddle/contrib/float16/float16_benchmark.md b/paddle/contrib/float16/float16_benchmark.md new file mode 100644 index 00000000..b51d6bde --- /dev/null +++ b/paddle/contrib/float16/float16_benchmark.md @@ -0,0 +1,97 @@ +# float16 benchmark + +## Description +We want to compare the inference benchmark of float16 vs float32 on the "image_classification" example on Nvidia Tesla V100 GPU, where we can enable the tensor core computation for float16 mode. We test Vgg16 and Resnet50 on the imagenet data set, and Vgg16 and Resnet32 on the cifar10 data set. For completeness, we also add the inference benchmark of Vgg16 and Resnet50 on imagenet data set tested on Nvidia GeForce GTX 1080 Ti GPU. + +For more details about tensor core, please refer to https://devblogs.nvidia.com/programming-tensor-cores-cuda-9/ + +## Test environment +- GPU: single Nvidia Tesla V100 or single Nvidia GeForce GTX 1080 Ti +- CUDNN: 7.1.1 +- CUDA: 9.0 +- Code: https://github.com/PaddlePaddle/Paddle/pull/10331 (Tensor core is enabled in float16 mode) + +## Benchmark on V100 +All times are in ms (millisecond) averaged over 1000 iterations tested on a single Nvidia V100 GPU with respective to different mini-batch(mb) sizes. + +### Vgg16 on imagenet (flowers data set: image.shape = [3, 224, 224]): + +Total inference time for one batch: + +| | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 | +|-------|-----: |-----: |-----: |-----: |------: |------:|-------:| +|float32| 14.01 | 9.70 | 22.99 | 28.26 | 53.87 | 84.42 | 178.95 | +|float16| 3.32 | 4.11 | 5.88 | 9.41 | 16.54 | 30.47 | 60.23 | +|Speedup| 4.22 | 2.36  | 3.91 | 3.00 | 3.26  | 2.77 | 2.97 | + +Total time spent on conv op for one batch: + +| | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 | +|-------|-----: |-----: |-----: |-----: |------: |------:|-------:| +|float32| 11.95 | 6.96 | 18.65 | 21.42 | 41.35 | 60.58 | 130.11 | +|float16| 1.78 | 2.10 | 2.93 | 4.55 | 7.99 | 14.63 | 28.67 | +|Speedup| 6.71 | 3.31  | 6.37 | 4.71 | 5.18  | 4.14 | 4.54 | + + +### Resnet50 on imagenet (flowers data set: image.shape = [3, 224, 224]): + +Total inference time for one batch: + +|       | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 | mb=128 | +|-------|-----: |-----: |-----: |-----: |------: |------:|-------:|-------:| +|float32| 7.03 | 7.41 | 9.16 | 12.55 | 21.13 | 38.27 | 67.93 | 127.02 | +|float16| 6.13 | 6.32 | 6.24 | 7.40 | 10.90 | 18.18 | 33.20 | 64.52 | +|Speedup| 1.15 | 1.17  | 1.47  | 1.70 | 1.94  | 2.11 | 2.05 | 1.97 | + +Total time spent on conv op for one batch: + +|       | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 | mb=128 | +|-------|-----: |-----: |-----: |-----: |------: |------:|-------:|-------:| +|float32| 5.43 | 5.46 | 6.50 | 8.36 | 13.80 | 24.45 | 41.21 | 73.44 | +|float16| 4.19 | 4.30 | 3.96 | 4.21 | 5.63 | 8.77 | 15.24 | 28.40 | +|Speedup| 1.30 | 1.27  | 1.64  | 1.99 | 2.45  | 2.79 | 2.70 | 2.59 | + + +### Vgg16 on cifar10 (image.shape = [3, 32, 32]): + +Total inference time for one batch: + +| | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 | mb=128 | mb=256 | mb=512 | +|-------|-----:|-----:|-----:|-----:|------:|------:|------:|-------:|-------:|-------:| +|float32| 3.13 | 3.17 | 3.19 | 3.58 | 3.98 | 6.23 | 8.42 | 13.44 | 24.19 | 44.97 | +|float16| 2.72 | 2.77 | 2.76 | 2,88 | 2.96 | 3.24 | 4.01 | 5.78 | 9.65 | 17.37 | +|Speedup| 1.15 | 1.14 | 1.16 | 1.24 | 1.34 | 1.92  | 2.10 | 2.33  | 2.51 | 2.59 | + + +### Resnet32 on cifar10 (image.shape = [3, 32, 32]): + +Total inference time for one batch: + +| | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 | mb=128 | mb=256 | mb=512 | +|-------|-----:|-----:|-----:|-----:|------:|------:|------:|-------:|-------:|-------:| +|float32| 3.11 | 3.14 | 2.99 | 3.04 | 3.10 | 3.28 | 4.47 | 6.86 | 11.63 | 21.16 | +|float16| 3.70 | 3.81 | 3.75 | 3.83 | 3.77 | 3.97 | 3.92 | 4.15 | 6.41 | 11.02 | +|Speedup|     |     |     |     |       | | 1.14  | 1.65 | 1.81 | 1.92 | + + +## Benchmark on 1080 Ti +All times are in ms (millisecond) averaged over 1000 iterations tested on a single Nvidia GeForce GTX 1080 Ti GPU with respective to different mini-batch(mb) sizes. + +### Vgg16 on imagenet (flowers data set: image.shape = [3, 224, 224]): +Total inference time for one batch: + +| | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | +|-------|-----: |-----: |-----: |-----: |------: |-------:| +|float32| 5.60 | 9.38 | 15.86 | 29.79 | 57.60 | 117.73 | +|float16| 4.99 | 7.79 | 13.47 | 26.02 | 52.30 | 102.34 | +|Speedup| 1.12 | 1.20  | 1.18 | 1.15 | 1.10  | 1.15 | + + +### Resnet50 on imagenet (flowers data set: image.shape = [3, 224, 224]): +Total inference time for one batch: + +| | mb=1 | mb=2 | mb=4 | mb=8 | mb=16 | mb=32 | mb=64 | +|-------|-----: |-----: |-----: |-----: |------: |-------:|-------:| +|float32| 5.63 | 6.23 | 8.85 | 14.71 | 26.07 | 52.86 | 108.95 | +|float16| 5.89 | 6.44 | 7.94 | 12.57 | 22.03 | 45.06 | 92.68 | +|Speedup| |  | 1.12  | 1.17 | 1.18  | 1.17 | 1.18 | diff --git a/paddle/contrib/float16/float16_inference_demo.py b/paddle/contrib/float16/float16_inference_demo.py new file mode 100644 index 00000000..063227d5 --- /dev/null +++ b/paddle/contrib/float16/float16_inference_demo.py @@ -0,0 +1,362 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +from float16_transpiler import Float16Transpiler + +import argparse +import paddle +import paddle.fluid as fluid +import contextlib +import math +import sys +import numpy as np +import os + +parser = argparse.ArgumentParser( + 'Float16 inference accuracy test and benchmark.') +parser.add_argument( + '--train_batch_size', type=int, default=16, help="Batch size for training.") +parser.add_argument( + '--inf_batch_size', type=int, default=32, help="Batch size for inference.") +parser.add_argument( + '--repeat', type=int, default=1, help="How many times to run the test.") +parser.add_argument( + '--data_set', + type=str, + default='cifar10', + choices=['cifar10', 'imagenet'], + help="Optional dataset for benchmark.") +parser.add_argument( + '--model', + type=str, + default='vgg', + choices=['vgg', 'resnet'], + help="Optional model for benchmark.") +parser.add_argument( + '--threshold', + type=float, + default=0.005, + help='Save inference model when test accuracy reach this threshold.') +parser.add_argument('--learning_rate', type=float, default=0.001) +args = parser.parse_args() + + +def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'): + conv1 = fluid.layers.conv2d( + input=input, + filter_size=filter_size, + num_filters=ch_out, + stride=stride, + padding=padding, + act=None, + bias_attr=False) + return fluid.layers.batch_norm(input=conv1, act=act) + + +def shortcut(input, ch_out, stride): + ch_in = input.shape[1] + if ch_in != ch_out: + return conv_bn_layer(input, ch_out, 1, stride, 0, None) + else: + return input + + +def basicblock(input, ch_out, stride): + short = shortcut(input, ch_out, stride) + conv1 = conv_bn_layer(input, ch_out, 3, stride, 1) + conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1, act=None) + return fluid.layers.elementwise_add(x=short, y=conv2, act='relu') + + +def bottleneck(input, ch_out, stride): + short = shortcut(input, ch_out * 4, stride) + conv1 = conv_bn_layer(input, ch_out, 1, stride, 0) + conv2 = conv_bn_layer(conv1, ch_out, 3, 1, 1) + conv3 = conv_bn_layer(conv2, ch_out * 4, 1, 1, 0, act=None) + return fluid.layers.elementwise_add(x=short, y=conv3, act='relu') + + +def layer_warp(block_func, input, ch_out, count, stride): + res_out = block_func(input, ch_out, stride) + for i in range(1, count): + res_out = block_func(res_out, ch_out, 1) + return res_out + + +def resnet_imagenet(input, depth=50): + cfg = { + 18: ([2, 2, 2, 1], basicblock), + 34: ([3, 4, 6, 3], basicblock), + 50: ([3, 4, 6, 3], bottleneck), + 101: ([3, 4, 23, 3], bottleneck), + 152: ([3, 8, 36, 3], bottleneck) + } + stages, block_func = cfg[depth] + conv1 = conv_bn_layer(input, ch_out=64, filter_size=7, stride=2, padding=3) + pool1 = fluid.layers.pool2d( + input=conv1, pool_type='avg', pool_size=3, pool_stride=2) + res1 = layer_warp(block_func, pool1, 64, stages[0], 1) + res2 = layer_warp(block_func, res1, 128, stages[1], 2) + res3 = layer_warp(block_func, res2, 256, stages[2], 2) + res4 = layer_warp(block_func, res3, 512, stages[3], 2) + pool2 = fluid.layers.pool2d( + input=res4, + pool_size=7, + pool_type='avg', + pool_stride=1, + global_pooling=True) + return pool2 + + +def resnet_cifar10(input, depth=32): + assert (depth - 2) % 6 == 0 + + n = (depth - 2) // 6 + + conv1 = conv_bn_layer( + input=input, ch_out=16, filter_size=3, stride=1, padding=1) + res1 = layer_warp(basicblock, conv1, 16, n, 1) + res2 = layer_warp(basicblock, res1, 32, n, 2) + res3 = layer_warp(basicblock, res2, 64, n, 2) + pool = fluid.layers.pool2d( + input=res3, pool_size=8, pool_type='avg', pool_stride=1) + return pool + + +def vgg16(input): + def conv_block(input, num_filter, groups, dropouts): + return fluid.nets.img_conv_group( + input=input, + pool_size=2, + pool_stride=2, + conv_num_filter=[num_filter] * groups, + conv_filter_size=3, + conv_act='relu', + conv_with_batchnorm=True, + conv_batchnorm_drop_rate=dropouts, + pool_type='max') + + conv1 = conv_block(input, 64, 2, [0.3, 0]) + conv2 = conv_block(conv1, 128, 2, [0.4, 0]) + conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) + conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) + conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) + + drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) + fc1 = fluid.layers.fc(input=drop, size=4096, act=None) + bn = fluid.layers.batch_norm(input=fc1, act='relu') + drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) + fc2 = fluid.layers.fc(input=drop2, size=4096, act=None) + return fc2 + + +def train(place, save_dirname): + if args.data_set == "cifar10": + class_dim = 10 + data_shape = [3, 32, 32] + elif args.data_set == "imagenet": + class_dim = 102 + data_shape = [3, 224, 224] + else: + raise ValueError("%s dataset is not supported" % data_set) + + images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + if args.model == "vgg": + print("train vgg") + net = vgg16(images) + elif args.model == "resnet": + print("train resnet") + if args.data_set == "cifar10": + net = resnet_cifar10(images) + elif args.data_set == "imagenet": + net = resnet_imagenet(images) + else: + raise ValueError("%s dataset is not supported" % args.data_set) + else: + raise ValueError("%s network is not supported" % args.model) + + predict = fluid.layers.fc(input=net, size=class_dim, act='softmax') + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(x=cost) + acc = fluid.layers.accuracy(input=predict, label=label) + + #Test program + test_program = fluid.default_main_program().clone(for_test=True) + optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) + optimizer.minimize(avg_cost) + + BATCH_SIZE = args.train_batch_size + PASS_NUM = 100 + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.flowers.train() + if args.data_set == 'imagenet' else paddle.dataset.cifar.train10(), + buf_size=128 * 10), + batch_size=args.train_batch_size) + + test_reader = paddle.batch( + paddle.dataset.flowers.test() + if args.data_set == 'imagenet' else paddle.dataset.cifar.test10(), + batch_size=args.inf_batch_size) + + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(place=place, feed_list=[images, label]) + + exe.run(fluid.default_startup_program()) + main_program = fluid.default_main_program() + + for pass_id in range(PASS_NUM): + for batch_id, data in enumerate(train_reader()): + train_image = np.array( + map(lambda x: x[0].reshape(data_shape), data)).astype("float32") + train_label = np.array(map(lambda x: x[1], data)).astype("int64") + train_label = train_label.reshape([-1, 1]) + + exe.run(main_program, + feed={'pixel': train_image, + 'label': train_label}) + + if (batch_id % 100) == 0: + acc_list = [] + avg_loss_list = [] + for tid, test_data in enumerate(test_reader()): + test_image = np.array( + map(lambda x: x[0].reshape(data_shape), + test_data)).astype("float32") + test_label = np.array(map(lambda x: x[1], + test_data)).astype("int64") + test_label = test_label.reshape([-1, 1]) + + loss_t, acc_t = exe.run( + program=test_program, + feed={"pixel": test_image, + "label": test_label}, + fetch_list=[avg_cost, acc]) + if math.isnan(float(loss_t)): + sys.exit("got NaN loss, training failed.") + acc_list.append(float(acc_t)) + avg_loss_list.append(float(loss_t)) + + acc_value = np.array(acc_list).mean() + avg_loss_value = np.array(avg_loss_list).mean() + + print( + 'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Accuracy {3:2.2}'. + format(pass_id, batch_id + 1, + float(avg_loss_value), float(acc_value))) + + if acc_value > args.threshold: + print( + 'Save inference model with test accuracy of {0} at {1}'. + format(float(acc_value), save_dirname)) + fluid.io.save_inference_model(save_dirname, ["pixel"], + [predict], exe) + return + + +def test_accuracy(executor, inference_program, feed_target_names, + fetch_targets): + if args.data_set == "cifar10": + data_shape = [3, 32, 32] + elif args.data_set == "imagenet": + data_shape = [3, 224, 224] + else: + raise ValueError("%s dataset is not supported" % data_set) + + test_reader = paddle.batch( + paddle.dataset.cifar.test10() + if args.data_set == "cifar10" else paddle.dataset.flowers.test(), + batch_size=args.inf_batch_size) + + test_num = 0 + correct_num = 0 + + for test_data in test_reader(): + test_image = np.array( + map(lambda x: x[0].reshape(data_shape), test_data)).astype( + "float32") + test_label = np.array(map(lambda x: x[1], test_data)).astype("int64") + test_label = test_label.reshape([-1, 1]) + + results = executor.run(program=inference_program, + feed={feed_target_names[0]: test_image}, + fetch_list=fetch_targets) + + prediction = np.argmax(results[0], axis=1).reshape([-1, 1]) + correct_num += np.sum(prediction == test_label) + test_num += test_label.size + + print("{0} out of {1} predictions are correct.".format(correct_num, + test_num)) + print("Test accuray is {0}.".format(float(correct_num) / float(test_num))) + + +def infer(place, save_dirname): + exe = fluid.Executor(place) + inference_scope = fluid.core.Scope() + + with fluid.scope_guard(inference_scope): + # Use fluid.io.load_inference_model to obtain the inference program desc, + # the feed_target_names (the names of variables that will be feeded + # data using feed operators), and the fetch_targets (variables that + # we want to obtain data from using fetch operators). + print("Load inference model from {0}".format(save_dirname)) + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model(save_dirname, exe) + + print("The test set accuracy of inference in float mode is:") + test_accuracy(exe, inference_program, feed_target_names, fetch_targets) + + float16_inference_program = inference_program.clone() + t = Float16Transpiler() + t.transpile(float16_inference_program, place) + + print("The test set accuracy of inference in float16 mode is:") + test_accuracy(exe, float16_inference_program, feed_target_names, + fetch_targets) + + fp16_save_dirname = "float16_" + save_dirname + fluid.io.save_inference_model(fp16_save_dirname, feed_target_names, + fetch_targets, exe, + float16_inference_program) + + +@contextlib.contextmanager +def scope_prog_guard(): + prog = fluid.Program() + startup_prog = fluid.Program() + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(prog, startup_prog): + yield + + +if __name__ == "__main__": + if not fluid.core.is_compiled_with_cuda(): + raise Exception("This test requires CUDA GPUs!") + + place = fluid.CUDAPlace(0) + if not fluid.core.is_float16_supported(place): + raise Exception( + "This test requires compute capability of CUDA GPU >= 5.3!") + + for i in range(args.repeat): + with scope_prog_guard(): + save_dirname = "image_classification_" + args.data_set + "_" + args.model + ".inference.model" + train(place, save_dirname) + infer(place, save_dirname) diff --git a/paddle/contrib/float16/float16_transpiler.py b/paddle/contrib/float16/float16_transpiler.py new file mode 100644 index 00000000..500f64be --- /dev/null +++ b/paddle/contrib/float16/float16_transpiler.py @@ -0,0 +1,256 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle.fluid.core as core +from paddle.fluid.framework import Program +from paddle.fluid.executor import global_scope + + +class Float16Transpiler: + def transpile(self, program, place, scope=None): + ''' + Transpile the program desc and cast the weights to float16 data type to + enable float16 inference. + + Since the operator in a program desc will automatically choose the + right compute kernel to run based on the data type of the input tensor. + We actually don't need to change the program desc to run in float16 mode. + + However, in this way, users who are used to feeding and fetching tensors + of float32 data type when running typical inference may find it confusing + and difficult to run inference in float16 mode as they need to convert + input data to float16 dtype and then convert the results back to float32 + dtype to match the rest of code. + + So this function appends cast ops to the program desc where necessary so + that users are able to run inference in float16 mode while providing input + tensor (feed_holder) of float data type and obtaining output tensor + (fetch_holder) of float data type. + + Moreover, it is desired that when we have the scope and program desc to run + inference in float32 mode, we can use a single API to do the necessary + modification and then user can run float16 inference on the fly. To make + this happen, this function also create new parameters in the scope to have the + converted float16 weights and change the operators in program desc to use + these new parameters. + + :param program: program to transpile + :type program: Program + :param place: inference place + :type place: Place + :param scope: inference scope + :type scope: Scope + ''' + if not isinstance(program, Program): + raise TypeError("program should be as Program type") + if not isinstance(place, core.CPUPlace) and not isinstance( + place, core.CUDAPlace): + raise TypeError("place should be as CPUPlace/CUDAPlace type") + if scope is None: + scope = global_scope() + if not isinstance(scope, core._Scope): + raise TypeError("scope should be as Scope type or None") + + self.scope = scope + self.place = place + self.block = program.block(0) + self.input_map = {} # store the input names should be adjusted + + self._modify_feed_fetch() + self._convert_param_to_float16() + self._adjust_input(skip=True) + self._remove_unused_var() + + # TODO(luotao): use clone() method to flush the program.desc in force, + # since some large program.desc will not be flushed immediately. + # And a better solution will be considered later. + program = program.clone() + + # ====================== private transpiler functions ===================== + def _adjust_input(self, skip=False): + ''' + Change the input variable name in operators. + + When we are in the process of modifying a program desc, we usually + replace some variables with some other variables, where we create + a dictionary input_map to record the one-to-one correspondence + between each old variable and the new one. + + After that, this function will search all the operators that use the + old variables and change the info in op to use the new variables. There + maybe some exceptions to this rule when we are using the float16 transpiler + and insert cast ops to cast float32 variable to float16 one. After we + insert the cast op to cast var_1 to var_1_fp16, we don't want to change + the input of cast op to var_1_fp16 after using this function. + ''' + skip_ops = {"cast"} + for i in range(len(self.block.ops)): + current_op = self.block.ops[i] + if skip and current_op.type in skip_ops: + continue + for input_arg in current_op.input_arg_names: + if input_arg in self.input_map: + current_op._rename_input(input_arg, + self.input_map[input_arg]) + + def _remove_unused_var(self): + ''' + remove unused varibles in program + ''' + args = [] + for i in range(len(self.block.ops)): + current_op = self.block.ops[i] + args += current_op.input_arg_names + args += current_op.output_arg_names + args = list(set(args)) # unique the input and output arguments + + for var in self.block.vars.keys(): + if var not in args: + self.block._remove_var(var) + + def _modify_feed_fetch(self): + ''' + Modify feed fetch op/vars for float16 inference. + + For each feed op: + feed_op->feed_target_var + + Change it to: + feed_op->feed_target_var->cast_op(from other dtype to float16)->tmp_var + + For each fetch op: + fetch_target_var->fetch_op + + Change it to: + tmp_var->cast_op(from float16 to other dtype)->fetch_target_var->fetch_op + + :return: None + ''' + + def find_op(var): + # It is possible that var.op is not up to date after some + # modifications to program desc. Here we force to make it up to date. + var.op = None + for op in self.block.ops: + if var.name in op.output_arg_names: + var.op = op + break + + if var.op is None: + raise ValueError("The target variable must have an " + "associated operator that generates it.") + + i = 0 + while i < len(self.block.ops): + cur_op = self.block.ops[i] + if cur_op.type == "feed": + var_name = cur_op.output("Out")[0] + tmp_var_name = var_name + ".fp16" + var = self.block.vars[var_name] + tmp_var = self.block.create_var( + name=tmp_var_name.encode('ascii'), + type=var.type, + dtype=core.VarDesc.VarType.FP16, + shape=var.shape, + persistable=var.persistable) + self.block._insert_op( + i + 1, + type="cast", + inputs={"X": var}, + outputs={"Out": tmp_var}, + attrs={ + 'in_dtype': int(var.dtype), + 'out_dtype': int(tmp_var.dtype) + }) + self.input_map[var_name] = tmp_var_name + i = i + 1 + elif cur_op.type == "fetch": + var_name = cur_op.input("X")[0] + tmp_var_name = var_name + ".fp16" + var = self.block.vars[var_name] + tmp_var = self.block.create_var( + name=tmp_var_name.encode('ascii'), + type=var.type, + dtype=core.VarDesc.VarType.FP16, + shape=var.shape, + persistable=var.persistable) + find_op(var) + var.op._rename_output(var_name, tmp_var_name) + self.block._insert_op( + i, + type="cast", + inputs={"X": tmp_var}, + outputs={"Out": var}, + attrs={ + 'in_dtype': int(tmp_var.dtype), + 'out_dtype': int(var.dtype) + }) + i = i + 1 + i = i + 1 + + def _convert_param_to_float16(self): + def _get_no_fp16_conversion_var_names(): + ''' + Get the set of input variable names that shouldn't be converted to float16. + + When we want to run inference in float16 mode, most parameters need to be + firstly converted to float16. However, there are some parameters that + shouldn't be converted to float16 because the corresponding operator + requires float32 parameters even in float16 mode (when the input data is + of float16 data type). Currently, the only operator that has this exclusion + is the batch norm op. + + :return: set of input variable names + :type var_names: set + ''' + op_names = {'batch_norm'} + var_names = [] + for op in self.block.ops: + if op.type in op_names: + var_names += op.input_arg_names + return set(var_names) + + def _should_be_converted(var): + return var.persistable and \ + var.name not in self.no_conversion_vars and \ + var.type != core.VarDesc.VarType.FEED_MINIBATCH and \ + var.type != core.VarDesc.VarType.FETCH_LIST + + self.no_conversion_vars = _get_no_fp16_conversion_var_names() + conversion_var_list = filter(_should_be_converted, + self.block.vars.values()) + for var in conversion_var_list: + fp16_var_name = var.name + ".fp16" + fp16_var = self.block.create_parameter( + name=fp16_var_name.encode('ascii'), + type=var.type, + dtype=core.VarDesc.VarType.FP16, + shape=var.shape) + + # cast the data in the tensor of the original var to float16 + # data type and store it in the tensor of the new float16 var + self.scope.var(fp16_var_name) + fp16_tensor = self.scope.find_var(fp16_var_name).get_tensor() + tensor = np.array(self.scope.find_var(var.name).get_tensor()) + # After the old tensor data is converted to np.float16, view(np.uint16) + # is used so that the internal memory of the numpy array will be + # reinterpreted to be of np.uint16 data type, which is binded to fluid + # float16 data type via the help of pybind in tensor_py.h. + fp16_tensor.set( + tensor.astype(np.float16).view(np.uint16), self.place) + + # old var will be replaced by the fp16 var in program desc + self.input_map[var.name] = fp16_var_name + self.block._remove_var(var.name) diff --git a/paddle/contrib/float16/run_float16_demo.sh b/paddle/contrib/float16/run_float16_demo.sh new file mode 100755 index 00000000..34cb7a12 --- /dev/null +++ b/paddle/contrib/float16/run_float16_demo.sh @@ -0,0 +1,111 @@ +#!/bin/bash + +BUILD_PATH=/paddle/fp16_build +WHEEL_PATH=$BUILD_PATH/python/dist +INFER_PATH=$BUILD_PATH/paddle/fluid/inference/tests/book +DEMO_PATH=/paddle/paddle/contrib/float16 + +# Use the single most powerful CUDA GPU on your machine +export CUDA_VISIBLE_DEVICES=0 + +# Build the PaddlePaddle Fluid wheel package and install it. +mkdir -p $BUILD_PATH && cd $BUILD_PATH +cmake .. -DWITH_AVX=OFF \ + -DWITH_MKL=OFF \ + -DWITH_GPU=ON \ + -DWITH_TESTING=ON \ + -DWITH_PROFILER=ON \ +make -j `nproc` +pip install -U "$WHEEL_PATH/$(ls $WHEEL_PATH)" + +cd $DEMO_PATH +# Clear previous log results +rm -f *.log + +# Test the float16 inference accuracy of resnet32 on cifar10 data set +stdbuf -oL python float16_inference_demo.py \ + --data_set=cifar10 \ + --model=resnet \ + --threshold=0.6 \ + --repeat=10 \ + 2>&1 | tee -a float16_inference_accuracy.log + +# Sleep to cool down the GPU for consistent benchmarking +sleep 2m + +# benchmarking parameters +REPEAT=1000 +MAXIMUM_BATCH_SIZE=512 + +for ((batch_size = 1; batch_size <= MAXIMUM_BATCH_SIZE; batch_size *= 2)); +do + + # Test inference benchmark of vgg16 on imagenet + stdbuf -oL python float16_inference_demo.py \ + --data_set=imagenet \ + --model=vgg \ + --threshold=0.001 \ + --repeat=1 \ + + $INFER_PATH/test_inference_image_classification_vgg \ + --dirname=$DEMO_PATH/image_classification_imagenet_vgg.inference.model \ + --fp16_dirname=$DEMO_PATH/float16_image_classification_imagenet_vgg.inference.model \ + --repeat=$REPEAT \ + --batch_size=$batch_size \ + --skip_cpu=true \ + 2>&1 | tee -a imagenet_vgg16_benchmark.log + + sleep 2m + + # Test inference benchmark of resnet50 on imagenet + stdbuf -oL python float16_inference_demo.py \ + --data_set=imagenet \ + --model=resnet \ + --threshold=0.001 \ + --repeat=1 \ + + $INFER_PATH/test_inference_image_classification_resnet \ + --dirname=$DEMO_PATH/image_classification_imagenet_resnet.inference.model \ + --fp16_dirname=$DEMO_PATH/float16_image_classification_imagenet_resnet.inference.model \ + --repeat=$REPEAT \ + --batch_size=$batch_size \ + --skip_cpu=true \ + 2>&1 | tee -a imagenet_resnet50_benchmark.log + + sleep 2m + + # Test inference benchmark of vgg16 on cifar10 + stdbuf -oL python float16_inference_demo.py \ + --data_set=cifar10 \ + --model=vgg \ + --threshold=0.001 \ + --repeat=1 \ + + $INFER_PATH/test_inference_image_classification_vgg \ + --dirname=$DEMO_PATH/image_classification_cifar10_vgg.inference.model \ + --fp16_dirname=$DEMO_PATH/float16_image_classification_cifar10_vgg.inference.model \ + --repeat=$REPEAT \ + --batch_size=$batch_size \ + --skip_cpu=true \ + 2>&1 | tee -a cifar10_vgg16_benchmark.log + + sleep 1m + + # Test inference benchmark of resnet32 on cifar10 + stdbuf -oL python float16_inference_demo.py \ + --data_set=cifar10 \ + --model=resnet \ + --threshold=0.001 \ + --repeat=1 \ + + $INFER_PATH/test_inference_image_classification_vgg \ + --dirname=$DEMO_PATH/image_classification_cifar10_resnet.inference.model \ + --fp16_dirname=$DEMO_PATH/float16_image_classification_cifar10_resnet.inference.model \ + --repeat=$REPEAT \ + --batch_size=$batch_size \ + --skip_cpu=true \ + 2>&1 | tee -a cifar10_resnet32_benchmark.log + + sleep 1m + +done diff --git a/paddle/fluid/.clang-format b/paddle/fluid/.clang-format new file mode 100644 index 00000000..29282dc8 --- /dev/null +++ b/paddle/fluid/.clang-format @@ -0,0 +1,5 @@ +--- +Language: Cpp +BasedOnStyle: Google +Standard: Cpp11 +... diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec new file mode 100644 index 00000000..7c31b5b4 --- /dev/null +++ b/paddle/fluid/API.spec @@ -0,0 +1,1043 @@ +paddle.fluid.Program ('paddle.fluid.framework.Program', ('document', '7364a01d7b9132a435e46162c7fbd6c6')) +paddle.fluid.Program.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.Program.block (ArgSpec(args=['self', 'index'], varargs=None, keywords=None, defaults=None), ('document', '86cd9499e226be661a3d686260ee1150')) +paddle.fluid.Program.clone (ArgSpec(args=['self', 'for_test'], varargs=None, keywords=None, defaults=(False,)), ('document', 'a65221387f84c74eee5130d7678ca900')) +paddle.fluid.Program.current_block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'd601c7719e425e3d9cf862ea4ad194ca')) +paddle.fluid.Program.global_block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'd64ea1dc96e9f674499ea3006d470aa4')) +paddle.fluid.Program.list_vars (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '32c14b0f12baae4b352200fa09b5e789')) +paddle.fluid.Program.parse_from_string (ArgSpec(args=['binary_str'], varargs=None, keywords=None, defaults=None), ('document', 'b6a7ffb239a30bf2ce58cfaca8d8b8d5')) +paddle.fluid.Program.to_string (ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,)), ('document', '89acca639baf00f3ad08b9d827e81706')) +paddle.fluid.default_startup_program (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'ba609cb02e4e55e8d626723567ef1778')) +paddle.fluid.default_main_program (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '853718df675e59aea7104f3d61bbf11d')) +paddle.fluid.program_guard (ArgSpec(args=['main_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,)), ('document', '78fb5c7f70ef76bcf4a1862c3f6b8191')) +paddle.fluid.name_scope (ArgSpec(args=['prefix'], varargs=None, keywords=None, defaults=(None,)), ('document', '917d313881ff990de5fb18d98a9c7b42')) +paddle.fluid.cuda_places (ArgSpec(args=['device_ids'], varargs=None, keywords=None, defaults=(None,)), ('document', '1f2bb6ece651e44117652d2d7bedecf5')) +paddle.fluid.cpu_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', '956bab564ebc69ffd17195c08cc8ffa0')) +paddle.fluid.cuda_pinned_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c2562241744aabe3fff1b59af22dd281')) +paddle.fluid.in_dygraph_mode (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '301bae0d8e02cc9eec5be02f052f11c6')) +paddle.fluid.is_compiled_with_cuda (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '60c7f107a5050aeb58bb74eb175672b5')) +paddle.fluid.Executor ('paddle.fluid.executor.Executor', ('document', '34e8c1769313fbeff7817212dda6259e')) +paddle.fluid.Executor.__init__ (ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.Executor.close (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '3a584496aa1343f36eebf3c46b323a74')) +paddle.fluid.Executor.infer_from_dataset (ArgSpec(args=['self', 'program', 'dataset', 'scope', 'thread', 'debug', 'fetch_list', 'fetch_info', 'print_period'], varargs=None, keywords=None, defaults=(None, None, None, 0, False, None, None, 100)), ('document', 'bedc29ad01c1b911e99032ee1e19ac59')) +paddle.fluid.Executor.run (ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)), ('document', '4cfcd9c15b766a51b584cc46d38f1ad8')) +paddle.fluid.Executor.train_from_dataset (ArgSpec(args=['self', 'program', 'dataset', 'scope', 'thread', 'debug', 'fetch_list', 'fetch_info', 'print_period'], varargs=None, keywords=None, defaults=(None, None, None, 0, False, None, None, 100)), ('document', '28f50904a0213f110947a30e0438529c')) +paddle.fluid.global_scope (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'f65788d9ead293ada47551339df12203')) +paddle.fluid.scope_guard (ArgSpec(args=['scope'], varargs=None, keywords=None, defaults=None), ('document', 'e6c073ed237001aaba7bff976b62b122')) +paddle.fluid.DistributeTranspiler ('paddle.fluid.transpiler.distribute_transpiler.DistributeTranspiler', ('document', 'b2b19821c5dffcd11473d6a4eef089af')) +paddle.fluid.DistributeTranspiler.__init__ (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.DistributeTranspiler.get_pserver_program (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', 'b1951949c6d21698290aa8ac69afee32')) +paddle.fluid.DistributeTranspiler.get_pserver_programs (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', 'c89fc350f975ef827f5448d68af388cf')) +paddle.fluid.DistributeTranspiler.get_startup_program (ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)), ('document', '90a40b80e0106f69262cc08b861c3e39')) +paddle.fluid.DistributeTranspiler.get_trainer_program (ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)), ('document', '0e47f020304e2b824e87ff03475c17cd')) +paddle.fluid.DistributeTranspiler.transpile (ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174')), ('document', '418c7e8b268e9be4104f2809e654c2f7')) +paddle.fluid.memory_optimize (ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, True)), ('document', '2348247f684bfd5bb9466470f35be064')) +paddle.fluid.release_memory (ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd38c5b8b2b2e0bb19bcf1b581a80a7e4')) +paddle.fluid.DistributeTranspilerConfig ('paddle.fluid.transpiler.distribute_transpiler.DistributeTranspilerConfig', ('document', '550b8c767a8ae1a2eb74b18924ddc975')) +paddle.fluid.DistributeTranspilerConfig.__init__ +paddle.fluid.ParallelExecutor ('paddle.fluid.parallel_executor.ParallelExecutor', ('document', '2b4d2e859f2e0c6161f4fed995f7956d')) +paddle.fluid.ParallelExecutor.__init__ (ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.ParallelExecutor.drop_local_exe_scopes (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '77c739744ea5708b80fb1b37cc89db40')) +paddle.fluid.ParallelExecutor.run (ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '33ce6ec50f8eeb05d340e6b114b026fd')) +paddle.fluid.create_lod_tensor (ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None), ('document', 'b82ea20e2dc5ff2372e0643169ca47ff')) +paddle.fluid.create_random_int_lodtensor (ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None), ('document', '74dc6d23185d90a7a50fbac19f5b65fb')) +paddle.fluid.DataFeedDesc ('paddle.fluid.data_feed_desc.DataFeedDesc', ('document', '43877a0d9357db94d3dbc7359cbe8c73')) +paddle.fluid.DataFeedDesc.__init__ (ArgSpec(args=['self', 'proto_file'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.DataFeedDesc.desc (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '9c6615854b61caa5f0d3e6ccc5e51338')) +paddle.fluid.DataFeedDesc.set_batch_size (ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', 'a34790bff4a2891713ddd644db56418d')) +paddle.fluid.DataFeedDesc.set_dense_slots (ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None), ('document', 'fdd07ce63e72bed57f2c0db5bec5720f')) +paddle.fluid.DataFeedDesc.set_use_slots (ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None), ('document', 'c23a79dfa04edd014b477bd4b183da06')) +paddle.fluid.CompiledProgram ('paddle.fluid.compiler.CompiledProgram', ('document', '6c45b5ccc24ae62d10115ce8abdc29a5')) +paddle.fluid.CompiledProgram.__init__ (ArgSpec(args=['self', 'program_or_graph'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.CompiledProgram.with_data_parallel (ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from', 'places'], varargs=None, keywords=None, defaults=(None, None, None, None, None)), ('document', '0e17773521634ef798fddd7d2ea3ef96')) +paddle.fluid.CompiledProgram.with_inference_optimize (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=None), ('document', '9e5b009d850191a010e859189c127fd8')) +paddle.fluid.ExecutionStrategy ('paddle.fluid.core_avx.ExecutionStrategy', ('document', '535ce28c4671176386e3cd283a764084')) +paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core_avx.ParallelExecutor.ExecutionStrategy) -> None +paddle.fluid.BuildStrategy ('paddle.fluid.core_avx.BuildStrategy', ('document', 'eec64b9b7cba58b0a63687b4c34ffe56')) +paddle.fluid.BuildStrategy.GradientScaleStrategy ('paddle.fluid.core_avx.GradientScaleStrategy', ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core_avx.ParallelExecutor.BuildStrategy.GradientScaleStrategy, arg0: int) -> None +paddle.fluid.BuildStrategy.ReduceStrategy ('paddle.fluid.core_avx.ReduceStrategy', ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.BuildStrategy.ReduceStrategy.__init__ __init__(self: paddle.fluid.core_avx.ParallelExecutor.BuildStrategy.ReduceStrategy, arg0: int) -> None +paddle.fluid.BuildStrategy.__init__ __init__(self: paddle.fluid.core_avx.ParallelExecutor.BuildStrategy) -> None +paddle.fluid.gradients (ArgSpec(args=['targets', 'inputs', 'target_gradients', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'e2097e1e0ed84ae44951437bfe269a1b')) +paddle.fluid.io.save_vars (ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '869104f47e6fd21d897c3fcc426aa942')) +paddle.fluid.io.save_params (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '046d7c43d67e08c2660bb3bd7e081015')) +paddle.fluid.io.save_persistables (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'ffcee38044975c29f2ab2fec0576f963')) +paddle.fluid.io.load_vars (ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '1bb9454cf09d71f190bb51550c5a3ac9')) +paddle.fluid.io.load_params (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', '116a9ed169e7ff0226faccff3c29364c')) +paddle.fluid.io.load_persistables (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'cfa84ef7c5435625bff4cc132cb8a0e3')) +paddle.fluid.io.save_inference_model (ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment', 'program_only'], varargs=None, keywords=None, defaults=(None, None, None, True, False)), ('document', 'fc82bfd137a9b1ab8ebd1651bd35b6e5')) +paddle.fluid.io.load_inference_model (ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename', 'pserver_endpoints'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '2f54d7c206b62f8c10f4f9d78c731cfd')) +paddle.fluid.io.PyReader ('paddle.fluid.reader.PyReader', ('document', 'e37efae53f3935b32aec37eda9f3d906')) +paddle.fluid.io.PyReader.__init__ (ArgSpec(args=['self', 'feed_list', 'capacity', 'use_double_buffer', 'iterable', 'return_list'], varargs=None, keywords=None, defaults=(None, None, True, True, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.io.PyReader.decorate_batch_generator (ArgSpec(args=['self', 'reader', 'places'], varargs=None, keywords=None, defaults=(None,)), ('document', '4364e836e3cb8ab5e68e411b763c50c7')) +paddle.fluid.io.PyReader.decorate_sample_generator (ArgSpec(args=['self', 'sample_generator', 'batch_size', 'drop_last', 'places'], varargs=None, keywords=None, defaults=(True, None)), ('document', 'efa4c8b90fe6d99dcbda637b70351bb1')) +paddle.fluid.io.PyReader.decorate_sample_list_generator (ArgSpec(args=['self', 'reader', 'places'], varargs=None, keywords=None, defaults=(None,)), ('document', '6c11980092720de304863de98074a64a')) +paddle.fluid.io.PyReader.reset (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '7432197701fdaab1848063860dc0b97e')) +paddle.fluid.io.PyReader.start (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'f6395fd95b025000c5c7a5be31aebc4e')) +paddle.fluid.initializer.ConstantInitializer ('paddle.fluid.initializer.ConstantInitializer', ('document', '798f1fd87cbe9798d001ffb6e616415d')) +paddle.fluid.initializer.ConstantInitializer.__init__ (ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.initializer.UniformInitializer ('paddle.fluid.initializer.UniformInitializer', ('document', 'a8f1177e4ce29766853e801d5b0a3635')) +paddle.fluid.initializer.UniformInitializer.__init__ (ArgSpec(args=['self', 'low', 'high', 'seed'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.initializer.NormalInitializer ('paddle.fluid.initializer.NormalInitializer', ('document', '2171207fb07293603e0fd2ff01234b3e')) +paddle.fluid.initializer.NormalInitializer.__init__ (ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.initializer.TruncatedNormalInitializer ('paddle.fluid.initializer.TruncatedNormalInitializer', ('document', 'b8e90aad6ee5687cb5f2b6fd404370d1')) +paddle.fluid.initializer.TruncatedNormalInitializer.__init__ (ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.initializer.XavierInitializer ('paddle.fluid.initializer.XavierInitializer', ('document', '3d5676f1a5414aa0c815d793a795ccb3')) +paddle.fluid.initializer.XavierInitializer.__init__ (ArgSpec(args=['self', 'uniform', 'fan_in', 'fan_out', 'seed'], varargs=None, keywords=None, defaults=(True, None, None, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.initializer.BilinearInitializer ('paddle.fluid.initializer.BilinearInitializer', ('document', '5646a5cd44f0c9111344d13f46d31169')) +paddle.fluid.initializer.BilinearInitializer.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'd389912dc079cbef432335a00017cec0')) +paddle.fluid.initializer.MSRAInitializer ('paddle.fluid.initializer.MSRAInitializer', ('document', 'ecfadb28c52d01496d107835a69ec3f9')) +paddle.fluid.initializer.MSRAInitializer.__init__ (ArgSpec(args=['self', 'uniform', 'fan_in', 'seed'], varargs=None, keywords=None, defaults=(True, None, 0)), ('document', '53c757bed9345f2ad3361902531e7cf5')) +paddle.fluid.initializer.force_init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '53c01b661feb8e60d0efa2066976c1a8')) +paddle.fluid.initializer.init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '68bebc3963526880a07c98a5d6226794')) +paddle.fluid.initializer.NumpyArrayInitializer ('paddle.fluid.initializer.NumpyArrayInitializer', ('document', '064f134a27c16372967d450f499762ab')) +paddle.fluid.initializer.NumpyArrayInitializer.__init__ (ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.fc (ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None)), ('document', '1c74f52549814235077ecc34856a95eb')) +paddle.fluid.layers.embedding (ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')), ('document', '1b4916f765620374ad0fdefe5a352993')) +paddle.fluid.layers.dynamic_lstm (ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)), ('document', '6d3ee14da70adfa36d85c40b18716ef2')) +paddle.fluid.layers.dynamic_lstmp (ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name', 'h_0', 'c_0', 'cell_clip', 'proj_clip'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None, None, None, None, None)), ('document', 'c37d51aad655c8a9f9b045c64717320a')) +paddle.fluid.layers.dynamic_gru (ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None, False)), ('document', '83617c165827e030636c80486d5de6f3')) +paddle.fluid.layers.gru_unit (ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False)), ('document', '33974b9bfa69f2f1eb85e6f956dff04e')) +paddle.fluid.layers.linear_chain_crf (ArgSpec(args=['input', 'label', 'param_attr'], varargs=None, keywords=None, defaults=(None,)), ('document', '34f96be41684b0959897a9e735997e20')) +paddle.fluid.layers.crf_decoding (ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c469f22029f7b5d41ecd44dfa1e81ffd')) +paddle.fluid.layers.cos_sim (ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None), ('document', '8e6ce424cf9e261ef32ee229c06a6e66')) +paddle.fluid.layers.cross_entropy (ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)), ('document', 'f43c659ca1749a3f0ff2231e6dfda07d')) +paddle.fluid.layers.bpr_loss (ArgSpec(args=['input', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6263dfdeb6c670fa0922c9cbc8fb1bf4')) +paddle.fluid.layers.square_error_cost (ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None), ('document', 'bbb9e708bab250359864fefbdf48e9d9')) +paddle.fluid.layers.chunk_eval (ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types', 'seq_length'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'b02844e0ad4bd713c5fe6802aa13219c')) +paddle.fluid.layers.sequence_conv (ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None)), ('document', '3d8e8f3e0e1cf520156be37605e83ccd')) +paddle.fluid.layers.conv2d (ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)), ('document', '114c7fe6b0adfc6d6371122f9b9f506e')) +paddle.fluid.layers.conv3d (ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)), ('document', '367293b5bada54136a91621078d38334')) +paddle.fluid.layers.sequence_pool (ArgSpec(args=['input', 'pool_type', 'is_test', 'pad_value'], varargs=None, keywords=None, defaults=(False, 0.0)), ('document', 'e90a93251c52dc4e6fb34fb3991b3f82')) +paddle.fluid.layers.sequence_softmax (ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'eaa9d0bbd3d4e017c8bc4ecdac483711')) +paddle.fluid.layers.softmax (ArgSpec(args=['input', 'use_cudnn', 'name', 'axis'], varargs=None, keywords=None, defaults=(False, None, -1)), ('document', 'cee673c79e3ff4582656a24e04f841e5')) +paddle.fluid.layers.pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)), ('document', 'be7e530dcbd603962e25573a63eb145e')) +paddle.fluid.layers.pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)), ('document', '053b1a855f13a066d005759171724bc6')) +paddle.fluid.layers.adaptive_pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '52343203de40afe29607397e13aaf0d2')) +paddle.fluid.layers.adaptive_pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '55db6ae7275fb9678a6814aebab81a9c')) +paddle.fluid.layers.batch_norm (ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)), ('document', '4cc22c3553e73a958e8b9a240d894431')) +paddle.fluid.layers.data_norm (ArgSpec(args=['input', 'act', 'epsilon', 'param_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var'], varargs=None, keywords=None, defaults=(None, 1e-05, None, 'NCHW', False, None, None, None, False)), ('document', '2460b30fb87037555208fa8ac6fc1787')) +paddle.fluid.layers.beam_search_decode (ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '83e08f21af41ac8bac37aeab1f86fdd0')) +paddle.fluid.layers.conv2d_transpose (ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)), ('document', '903ac9a778e0bf1bf649bd71e9d0ba0c')) +paddle.fluid.layers.conv3d_transpose (ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)), ('document', '9b1f13c1fc872f76f8f84cf11e955f53')) +paddle.fluid.layers.sequence_expand (ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '10e122eb755c2bd1f78ef2332b28f1a0')) +paddle.fluid.layers.sequence_expand_as (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '858c432e7cbd8bb952cc2eb555457d50')) +paddle.fluid.layers.sequence_pad (ArgSpec(args=['x', 'pad_value', 'maxlen', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '1ba3ccfe13ed5091e113c09c13dc3a20')) +paddle.fluid.layers.sequence_unpad (ArgSpec(args=['x', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7f5ce36fb0016621e6bc001f4236d978')) +paddle.fluid.layers.lstm_unit (ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None)), ('document', 'fe126c58e4339410e875ab1eba246d21')) +paddle.fluid.layers.reduce_sum (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'dd5f06fb7cf39ca06cbab4abd03e6893')) +paddle.fluid.layers.reduce_mean (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'a3024789eba11a70c2ef27c358173400')) +paddle.fluid.layers.reduce_max (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', '10023caec4d7f78c3b901f023a1feaa7')) +paddle.fluid.layers.reduce_min (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', '1a1c91625ce3c32646f69ca10d4d1da7')) +paddle.fluid.layers.reduce_prod (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'b386471f0476c80c61d8c8672278063d')) +paddle.fluid.layers.reduce_all (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', '8ab17ab51f68a6e76302b27f928cedf3')) +paddle.fluid.layers.reduce_any (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', '0483ac3b7a99e879ccde583ae8d7a60d')) +paddle.fluid.layers.sequence_first_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', 'f2dfd65b859de9844e7261e7a4503f63')) +paddle.fluid.layers.sequence_last_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', '1af2e3a887e4f914f9d6650406186ab6')) +paddle.fluid.layers.sequence_slice (ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '39fbc5437be389f6c0c769f82fc1fba2')) +paddle.fluid.layers.dropout (ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name', 'dropout_implementation'], varargs=None, keywords=None, defaults=(False, None, None, 'downgrade_in_infer')), ('document', '558d13133596209190df9a624264f28f')) +paddle.fluid.layers.split (ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '78cf3a7323d1a7697658242e13f63759')) +paddle.fluid.layers.ctc_greedy_decoder (ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '2bc3a59efa9d52b628a6255422d9f0e8')) +paddle.fluid.layers.edit_distance (ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens', 'input_length', 'label_length'], varargs=None, keywords=None, defaults=(True, None, None, None)), ('document', '77cbfb28cd2fc589f589c7013c5086cd')) +paddle.fluid.layers.l2_normalize (ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None)), ('document', 'c1df110ea65998984f564c5c10abc54a')) +paddle.fluid.layers.matmul (ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None)), ('document', 'fa2081f6e731bb9de7cd535ca07f523a')) +paddle.fluid.layers.topk (ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e50940f3ce5a08cc477b72f517491bf3')) +paddle.fluid.layers.warpctc (ArgSpec(args=['input', 'label', 'blank', 'norm_by_times', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, False, False)), ('document', '4aa9df890b47eb67d5442f04aaf9eeec')) +paddle.fluid.layers.sequence_reshape (ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None), ('document', 'f568714a876425004aca4ea2d4a27701')) +paddle.fluid.layers.transpose (ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '8e72db173d4c082e27cb11f31d8c9bfa')) +paddle.fluid.layers.im2sequence (ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None)), ('document', '33134416fc27dd65a767e5f15116ee16')) +paddle.fluid.layers.nce (ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False)), ('document', '11a544a6e3fd0482509712dd54377fa1')) +paddle.fluid.layers.sampled_softmax_with_cross_entropy (ArgSpec(args=['logits', 'label', 'num_samples', 'num_true', 'remove_accidental_hits', 'use_customized_samples', 'customized_samples', 'customized_probabilities', 'seed'], varargs=None, keywords=None, defaults=(1, True, False, None, None, 0)), ('document', 'd4435a63d34203339831ee6a86ef9242')) +paddle.fluid.layers.hsigmoid (ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False)), ('document', 'b83e7dfa81059b39bb137922dc914f50')) +paddle.fluid.layers.beam_search (ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False)), ('document', '1270395ce97a4e1b556104abbb14f096')) +paddle.fluid.layers.row_conv (ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)), ('document', '17485788fffe4e2d36dc58c2ac8d174e')) +paddle.fluid.layers.multiplex (ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None), ('document', '2c4d1ae83da6ed35e3b36ba1b3b51d23')) +paddle.fluid.layers.layer_norm (ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)), ('document', '79797f827d89ae72c77960e9696883a9')) +paddle.fluid.layers.group_norm (ArgSpec(args=['input', 'groups', 'epsilon', 'param_attr', 'bias_attr', 'act', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None, 'NCHW', None)), ('document', '96b24820e8863d6044d5be4eaaddb9fd')) +paddle.fluid.layers.spectral_norm (ArgSpec(args=['weight', 'dim', 'power_iters', 'eps', 'name'], varargs=None, keywords=None, defaults=(0, 1, 1e-12, None)), ('document', '9461e67095a6fc5d568fb2ce8fef66ff')) +paddle.fluid.layers.softmax_with_cross_entropy (ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode', 'return_softmax', 'axis'], varargs=None, keywords=None, defaults=(False, -100, True, False, -1)), ('document', '54e1675aa0364f4a78fa72804ec0f413')) +paddle.fluid.layers.smooth_l1 (ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'ecb75c1b00c4c76c98b482f633b7a10c')) +paddle.fluid.layers.one_hot (ArgSpec(args=['input', 'depth', 'allow_out_of_range'], varargs=None, keywords=None, defaults=(False,)), ('document', 'ec4115591be842868c86b2e5334245c6')) +paddle.fluid.layers.autoincreased_step_counter (ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1)), ('document', '98e7927f09ee2270535b29f048e481ec')) +paddle.fluid.layers.reshape (ArgSpec(args=['x', 'shape', 'actual_shape', 'act', 'inplace', 'name'], varargs=None, keywords=None, defaults=(None, None, False, None)), ('document', '6196c9ec3075ca5a9c058ea1f8492256')) +paddle.fluid.layers.squeeze (ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ebbac07662a6e22e8e299ced880c7775')) +paddle.fluid.layers.unsqueeze (ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b9bd3129d36a70e7c4385df51ff71c62')) +paddle.fluid.layers.lod_reset (ArgSpec(args=['x', 'y', 'target_lod'], varargs=None, keywords=None, defaults=(None, None)), ('document', '9a72a7c8c80926150ea826e94efd7e9b')) +paddle.fluid.layers.lrn (ArgSpec(args=['input', 'n', 'k', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(5, 1.0, 0.0001, 0.75, None)), ('document', '73d297256da8954617996958d26ee93d')) +paddle.fluid.layers.pad (ArgSpec(args=['x', 'paddings', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None)), ('document', '2f189f8ef61f1c23779e1593b78755c0')) +paddle.fluid.layers.pad_constant_like (ArgSpec(args=['x', 'y', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None)), ('document', '95aa1972983f30fe9b5a3713e523e20f')) +paddle.fluid.layers.label_smooth (ArgSpec(args=['label', 'prior_dist', 'epsilon', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 0.1, 'float32', None)), ('document', '214f1dfbe95a628600bbe99e836319cf')) +paddle.fluid.layers.roi_pool (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0)), ('document', 'ceedc8c22752c623d6e1ea2e8df0f43f')) +paddle.fluid.layers.roi_align (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None)), ('document', '6f65342f646ef04ae705080a7dfee63f')) +paddle.fluid.layers.dice_loss (ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,)), ('document', '7e8e4bf1f0f8612961ed113e8af8f0c5')) +paddle.fluid.layers.image_resize (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None, True, 1)), ('document', 'a29488d94d9a4bc4434d8a3529b4c6fe')) +paddle.fluid.layers.image_resize_short (ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',)), ('document', 'bd97ebfe4bdf5110a5fcb8ecb626a447')) +paddle.fluid.layers.resize_bilinear (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 1)), ('document', '548c7c2ead5771d15abbaad505f901e9')) +paddle.fluid.layers.resize_nearest (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners'], varargs=None, keywords=None, defaults=(None, None, None, None, True)), ('document', 'b7d810d1e251c5957c1efa6aa699d2d0')) +paddle.fluid.layers.gather (ArgSpec(args=['input', 'index', 'overwrite'], varargs=None, keywords=None, defaults=(True,)), ('document', 'f985c9b66e3aec96fa753a8eb44c991c')) +paddle.fluid.layers.scatter (ArgSpec(args=['input', 'index', 'updates', 'name', 'overwrite'], varargs=None, keywords=None, defaults=(None, True)), ('document', '69b22affd4a6326502af166f04c095ab')) +paddle.fluid.layers.sequence_scatter (ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'abe3f714120117a5a3d3e639853932bf')) +paddle.fluid.layers.random_crop (ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,)), ('document', '042af0b8abea96b40c22f6e70d99e042')) +paddle.fluid.layers.mean_iou (ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None), ('document', 'e3b6630ba43cb13dfeeb1601cb64d671')) +paddle.fluid.layers.relu (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0942c174f4f6fb274976d4357356f6a2')) +paddle.fluid.layers.selu (ArgSpec(args=['x', 'scale', 'alpha', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'f93c61f5b0bf933cd425a64dca2c4fdd')) +paddle.fluid.layers.log (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '02f668664e3bfc4df6c00d7363467140')) +paddle.fluid.layers.crop (ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'ddf9837ee83e549119210a3d714d5f44')) +paddle.fluid.layers.rank_loss (ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '8eb36596bb43d7a907d3397c7aedbdb3')) +paddle.fluid.layers.margin_rank_loss (ArgSpec(args=['label', 'left', 'right', 'margin', 'name'], varargs=None, keywords=None, defaults=(0.1, None)), ('document', '6fc86ed23b420c8a0f6c043563cf3937')) +paddle.fluid.layers.elu (ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(1.0, None)), ('document', '9af1926c06711eacef9e82d7a9e4d308')) +paddle.fluid.layers.relu6 (ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(6.0, None)), ('document', '538fc860b2a1734e118b94e4a1a3ee67')) +paddle.fluid.layers.pow (ArgSpec(args=['x', 'factor', 'name'], varargs=None, keywords=None, defaults=(1.0, None)), ('document', '35fa2b79b1ae6968d4a69788051c1d27')) +paddle.fluid.layers.stanh (ArgSpec(args=['x', 'scale_a', 'scale_b', 'name'], varargs=None, keywords=None, defaults=(0.6666666666666666, 1.7159, None)), ('document', '1e1efad868714425da15c785dfb533a1')) +paddle.fluid.layers.hard_sigmoid (ArgSpec(args=['x', 'slope', 'offset', 'name'], varargs=None, keywords=None, defaults=(0.2, 0.5, None)), ('document', '607d79ca873bee40eed1c79a96611591')) +paddle.fluid.layers.swish (ArgSpec(args=['x', 'beta', 'name'], varargs=None, keywords=None, defaults=(1.0, None)), ('document', 'b511609e3e0e8b636bf19f8b98249897')) +paddle.fluid.layers.prelu (ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '2da40e447716338affebfe058d05d9a9')) +paddle.fluid.layers.brelu (ArgSpec(args=['x', 't_min', 't_max', 'name'], varargs=None, keywords=None, defaults=(0.0, 24.0, None)), ('document', '49580538249a52c857fce75c94ad8af7')) +paddle.fluid.layers.leaky_relu (ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(0.02, None)), ('document', '1eb3009c69060299ec87949ee0d4b9ae')) +paddle.fluid.layers.soft_relu (ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(40.0, None)), ('document', '6455afd2498b00198f53f83d63d6c6a4')) +paddle.fluid.layers.flatten (ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', 'b52306659a21e6b118eed49fe2c155a1')) +paddle.fluid.layers.sequence_mask (ArgSpec(args=['x', 'maxlen', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 'int64', None)), ('document', '6c3f916921b24edaad220f1fcbf039de')) +paddle.fluid.layers.stack (ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,)), ('document', 'a76f347bf27ffe21b990340d5d9524d5')) +paddle.fluid.layers.pad2d (ArgSpec(args=['input', 'paddings', 'mode', 'pad_value', 'data_format', 'name'], varargs=None, keywords=None, defaults=([0, 0, 0, 0], 'constant', 0.0, 'NCHW', None)), ('document', '3f3abdb795a5c2aad8c2312249551ce5')) +paddle.fluid.layers.unstack (ArgSpec(args=['x', 'axis', 'num'], varargs=None, keywords=None, defaults=(0, None)), ('document', 'b0c4ca08d4eb295189e1b107c920d093')) +paddle.fluid.layers.sequence_enumerate (ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0, None)), ('document', 'b870fed41abd2aecf929ece65f555fa1')) +paddle.fluid.layers.unique (ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=('int32',)), ('document', 'cab0b06e5683875f12f0efc62fa230a9')) +paddle.fluid.layers.expand (ArgSpec(args=['x', 'expand_times', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '33bc4f6010282ffe044d77be7ba7c275')) +paddle.fluid.layers.sequence_concat (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b992616c1afbd6b0c2a897ac23036381')) +paddle.fluid.layers.scale (ArgSpec(args=['x', 'scale', 'bias', 'bias_after_scale', 'act', 'name'], varargs=None, keywords=None, defaults=(1.0, 0.0, True, None, None)), ('document', '463e4713806e5adaa4d20a41e2218453')) +paddle.fluid.layers.elementwise_add (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '5c0fb7298aec32525f96d451ae4c2851')) +paddle.fluid.layers.elementwise_div (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '1da49b7cda887dd84087ef8c060fcf6a')) +paddle.fluid.layers.elementwise_sub (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '992559c8327c61babd2ed25fc9047fbf')) +paddle.fluid.layers.elementwise_mul (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '213db11a61dcb0f31159d343cc35e2f5')) +paddle.fluid.layers.elementwise_max (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '409167a1409ec31b0d3a2f8852a7943f')) +paddle.fluid.layers.elementwise_min (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '4e1322836eb69473d5606bfe346c5375')) +paddle.fluid.layers.elementwise_pow (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', 'b9e7e9fa1ca28d8b6f07cc59eadb4a02')) +paddle.fluid.layers.elementwise_mod (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '614984304f810f3ddae6b489ec01296b')) +paddle.fluid.layers.elementwise_floordiv (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', 'a8c4b26d899246378e878f169582c7a4')) +paddle.fluid.layers.uniform_random_batch_size_like (ArgSpec(args=['input', 'shape', 'dtype', 'input_dim_idx', 'output_dim_idx', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', 0, 0, -1.0, 1.0, 0)), ('document', 'cfa120e583cd4a5bfa120c8a26f98a28')) +paddle.fluid.layers.gaussian_random (ArgSpec(args=['shape', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32')), ('document', 'ebbf399d4e03190ce5dc9488f05c92f4')) +paddle.fluid.layers.sampling_id (ArgSpec(args=['x', 'min', 'max', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32')), ('document', 'c39b647b6cf08e058d96ee503d5284fe')) +paddle.fluid.layers.gaussian_random_batch_size_like (ArgSpec(args=['input', 'shape', 'input_dim_idx', 'output_dim_idx', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0, 0, 0.0, 1.0, 0, 'float32')), ('document', 'b24d0b21361c4bb8ef2cec8c26fb12b2')) +paddle.fluid.layers.sum (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', 'f4b60847cb0f1ae00823ba6fb1b11310')) +paddle.fluid.layers.slice (ArgSpec(args=['input', 'axes', 'starts', 'ends'], varargs=None, keywords=None, defaults=None), ('document', '3ca6a761570d86e303e473afba99bb49')) +paddle.fluid.layers.shape (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', 'bf61c8f79d795a8371bdb3b5468aa82b')) +paddle.fluid.layers.rank (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', '096df0e0273145ab80ed119a4c294db3')) +paddle.fluid.layers.size (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', 'cf2e156beae36378722666c4c33bebfe')) +paddle.fluid.layers.logical_and (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '1d6777f61831c54bea3a0029e2118448')) +paddle.fluid.layers.logical_or (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '4d51a5a453755e0eb8c5ff6910a00dca')) +paddle.fluid.layers.logical_xor (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '1840f54c5bd5338bdf854980d47bf771')) +paddle.fluid.layers.logical_not (ArgSpec(args=['x', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'd8fc1c5a5535736d4cd44c893a9701c9')) +paddle.fluid.layers.clip (ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0ce33756573c572da67302499455dbcd')) +paddle.fluid.layers.clip_by_norm (ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '99a1b9012d9c4495efc89d69958c3be7')) +paddle.fluid.layers.mean (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '597257fb94d0597c404a6a5c91ab5258')) +paddle.fluid.layers.mul (ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None)), ('document', '784b7e36cea88493f9e37a41b10fbf4d')) +paddle.fluid.layers.sigmoid_cross_entropy_with_logits (ArgSpec(args=['x', 'label', 'ignore_index', 'name', 'normalize'], varargs=None, keywords=None, defaults=(-100, None, False)), ('document', '7637c974f2d749d359acae9062c4d96f')) +paddle.fluid.layers.maxout (ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '22df6542f3f9aa3f34c0c2dab5dc1d80')) +paddle.fluid.layers.space_to_depth (ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '26decdea9376b6b9a0d3432d82ca207b')) +paddle.fluid.layers.affine_grid (ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'f85b263b7b6698d000977529a28f202b')) +paddle.fluid.layers.sequence_reverse (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '65c8362e48810b8226e311c5d046db51')) +paddle.fluid.layers.affine_channel (ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name', 'act'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None, None)), ('document', '9f303c67538e468a36c5904a0a3aa110')) +paddle.fluid.layers.similarity_focus (ArgSpec(args=['input', 'axis', 'indexes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '18ec2e3afeb90e70c8b73d2b71c40fdb')) +paddle.fluid.layers.hash (ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', 'a0b73c21be618cec0281e7903039e5e3')) +paddle.fluid.layers.grid_sampler (ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5d16663e096d7f04954c70ce1cc5e195')) +paddle.fluid.layers.log_loss (ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)), ('document', 'e3993a477c94729526040ff65d95728e')) +paddle.fluid.layers.add_position_encoding (ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e399f9436fed5f7ff480d8532e42c937')) +paddle.fluid.layers.bilinear_tensor_product (ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '45fc3652a8e1aeffbe4eba371c54f756')) +paddle.fluid.layers.merge_selected_rows (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b2b0e5d5c155ce24bafc38b78cd0b164')) +paddle.fluid.layers.get_tensor_from_selected_rows (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '3e60aec040a6f740a130353323580bff')) +paddle.fluid.layers.lstm (ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)), ('document', 'baa7327ed89df6b7bdd32f9ffdb62f63')) +paddle.fluid.layers.shuffle_channel (ArgSpec(args=['x', 'group', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '276a1213dd431228cefa33c3146df34a')) +paddle.fluid.layers.temporal_shift (ArgSpec(args=['x', 'seg_num', 'shift_ratio', 'name'], varargs=None, keywords=None, defaults=(0.25, None)), ('document', '13b1cdcb01f5ffdc26591ff9a2ec4669')) +paddle.fluid.layers.py_func (ArgSpec(args=['func', 'x', 'out', 'backward_func', 'skip_vars_in_backward_input'], varargs=None, keywords=None, defaults=(None, None)), ('document', '8404e472ac12b4a30a505d3d3a3e5fdb')) +paddle.fluid.layers.psroi_pool (ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '42d5155374f69786300d90d751956998')) +paddle.fluid.layers.teacher_student_sigmoid_loss (ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, keywords=None, defaults=(15.0, -15.0)), ('document', '07cb0d95a646dba1b9cc7cdce89e59f0')) +paddle.fluid.layers.huber_loss (ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None), ('document', '11bb8e62cc9256958eff3991fe4834da')) +paddle.fluid.layers.kldiv_loss (ArgSpec(args=['x', 'target', 'reduction', 'name'], varargs=None, keywords=None, defaults=('mean', None)), ('document', '18bc95c62d3300456c3c7da5278b47bb')) +paddle.fluid.layers.tree_conv (ArgSpec(args=['nodes_vector', 'edge_set', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None)), ('document', '864f3cdc5e0c6152e2a39b136171644f')) +paddle.fluid.layers.npair_loss (ArgSpec(args=['anchor', 'positive', 'labels', 'l2_reg'], varargs=None, keywords=None, defaults=(0.002,)), ('document', '6b6ee1170fe20a79cf0631a1f49b0df2')) +paddle.fluid.layers.pixel_shuffle (ArgSpec(args=['x', 'upscale_factor'], varargs=None, keywords=None, defaults=None), ('document', '7e5cac851fd9bad344230e1044b6a565')) +paddle.fluid.layers.fsp_matrix (ArgSpec(args=['x', 'y'], varargs=None, keywords=None, defaults=None), ('document', '20992b20d19c2e5983f366150827b4a6')) +paddle.fluid.layers.continuous_value_model (ArgSpec(args=['input', 'cvm', 'use_cvm'], varargs=None, keywords=None, defaults=(True,)), ('document', 'c03490ffaa1b78258747157c313db4cd')) +paddle.fluid.layers.where (ArgSpec(args=['condition'], varargs=None, keywords=None, defaults=None), ('document', 'b1e1487760295e1ff55307b880a99e18')) +paddle.fluid.layers.sign (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', 'fa2f457a81714430c5677c2d68744728')) +paddle.fluid.layers.deformable_conv (ArgSpec(args=['input', 'offset', 'mask', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'deformable_groups', 'im2col_step', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, None, None, None)), ('document', '4d83ba6b971cfd590493b0925b3e081e')) +paddle.fluid.layers.unfold (ArgSpec(args=['x', 'kernel_sizes', 'strides', 'paddings', 'dilations', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None)), ('document', '3f884662ad443d9ecc2b3734b4f61ad6')) +paddle.fluid.layers.deformable_roi_pooling (ArgSpec(args=['input', 'rois', 'trans', 'no_trans', 'spatial_scale', 'group_size', 'pooled_height', 'pooled_width', 'part_size', 'sample_per_part', 'trans_std', 'position_sensitive', 'name'], varargs=None, keywords=None, defaults=(False, 1.0, [1, 1], 1, 1, None, 1, 0.1, False, None)), ('document', '99c03e3f249e36854f87dedaa17c8f35')) +paddle.fluid.layers.shard_index (ArgSpec(args=['input', 'index_num', 'nshards', 'shard_id', 'ignore_value'], varargs=None, keywords=None, defaults=(-1,)), ('document', '5786fdbba6753ecd6cbce5e6b0889924')) +paddle.fluid.layers.data (ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)), ('document', '9d7806e31bdf727c1a23b8782a09b545')) +paddle.fluid.layers.open_files (ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)), ('document', 'cccb6eb5410c822e5307c947aca2c899')) +paddle.fluid.layers.read_file (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', '32181f6037e387fb6e68a5beaafe33b6')) +paddle.fluid.layers.shuffle (ArgSpec(args=['reader', 'buffer_size'], varargs=None, keywords=None, defaults=None), ('document', 'aa5803d1eccdaef03cdfb0b7ca088071')) +paddle.fluid.layers.batch (ArgSpec(args=['reader', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', '3007211c84c5c77eda8dc83619a6eaf8')) +paddle.fluid.layers.double_buffer (ArgSpec(args=['reader', 'place', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '7241dd1c142f4c65c8d7f66948140aa7')) +paddle.fluid.layers.random_data_generator (ArgSpec(args=['low', 'high', 'shapes', 'lod_levels', 'for_parallel'], varargs=None, keywords=None, defaults=(True,)), ('document', '290f5b97f24f0022e195f7228dd56fd9')) +paddle.fluid.layers.py_reader (ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', 'd78a1c7344955c5caed8dc13adb7beb6')) +paddle.fluid.layers.create_py_reader_by_data (ArgSpec(args=['capacity', 'feed_list', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, True)), ('document', '2edf37d57862b24a7a26aa19a3573f73')) +paddle.fluid.layers.Preprocessor ('paddle.fluid.layers.io.Preprocessor', ('document', '1c2efbbc1197b44941a95b9ec4e737ae')) +paddle.fluid.layers.Preprocessor.__init__ (ArgSpec(args=['self', 'reader', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.Preprocessor.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.Preprocessor.inputs (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.Preprocessor.outputs (ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.load (ArgSpec(args=['out', 'file_path', 'load_as_fp16'], varargs=None, keywords=None, defaults=(None,)), ('document', '9d1a4bc97bbce9fa1d4f7a4200a771ff')) +paddle.fluid.layers.create_tensor (ArgSpec(args=['dtype', 'name', 'persistable'], varargs=None, keywords=None, defaults=(None, False)), ('document', 'aaf0176c743c43e9bc684dd7dfac25c5')) +paddle.fluid.layers.create_parameter (ArgSpec(args=['shape', 'dtype', 'name', 'attr', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(None, None, False, None)), ('document', '021272f30e0cdf7503586815378abfb8')) +paddle.fluid.layers.create_global_var (ArgSpec(args=['shape', 'value', 'dtype', 'persistable', 'force_cpu', 'name'], varargs=None, keywords=None, defaults=(False, False, None)), ('document', '47ea8b8c91879e50c9036e418b00ef4a')) +paddle.fluid.layers.cast (ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '1e44a534cf7d26ab230aa9f5e4e0525a')) +paddle.fluid.layers.tensor_array_to_tensor (ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None)), ('document', '764c095ba4562ae740f979e970152d6e')) +paddle.fluid.layers.concat (ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(0, None)), ('document', 'b3f30feb5dec8f110d7393ffeb30dbd9')) +paddle.fluid.layers.sums (ArgSpec(args=['input', 'out'], varargs=None, keywords=None, defaults=(None,)), ('document', '5df743d578638cd2bbb9369499b44af4')) +paddle.fluid.layers.assign (ArgSpec(args=['input', 'output'], varargs=None, keywords=None, defaults=(None,)), ('document', '8bd94aef4e123986d9a8c29f67b5532b')) +paddle.fluid.layers.fill_constant_batch_size_like (ArgSpec(args=['input', 'shape', 'dtype', 'value', 'input_dim_idx', 'output_dim_idx'], varargs=None, keywords=None, defaults=(0, 0)), ('document', '3551aa494e88d0f271e40cd45d6e3020')) +paddle.fluid.layers.fill_constant (ArgSpec(args=['shape', 'dtype', 'value', 'force_cpu', 'out'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'd6b76c7d2c7129f8d713ca74f1c2c287')) +paddle.fluid.layers.argmin (ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,)), ('document', '3dd54487232d05df4d70fba94b7d0b79')) +paddle.fluid.layers.argmax (ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,)), ('document', '7f47cc9aa7531b6bd37c5c96bc7f0469')) +paddle.fluid.layers.argsort (ArgSpec(args=['input', 'axis', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '9792371e3b66258531225a5551de8961')) +paddle.fluid.layers.ones (ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,)), ('document', '812c623ed52610b9773f9fc05413bc34')) +paddle.fluid.layers.zeros (ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,)), ('document', '95379f9288c2d05356ec0e2375c6bc57')) +paddle.fluid.layers.reverse (ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None), ('document', '628135603692137d52bcf5a8d8d6816d')) +paddle.fluid.layers.has_inf (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '51a0fa1cfaf2507c00a215adacdb8a63')) +paddle.fluid.layers.has_nan (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '129cf426e71452fe8276d616a6dc21ae')) +paddle.fluid.layers.isfinite (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '23c66e5918040fcc11c8fa8c5da1b38e')) +paddle.fluid.layers.range (ArgSpec(args=['start', 'end', 'step', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', 'a45b42f21bc5a4e84b60981a3d629ab3')) +paddle.fluid.layers.linspace (ArgSpec(args=['start', 'stop', 'num', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '3663d1148946eed4c1c34c81be586b9e')) +paddle.fluid.layers.zeros_like (ArgSpec(args=['x', 'out'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd88a23bcdc443719b3953593f7cef14a')) +paddle.fluid.layers.ones_like (ArgSpec(args=['x', 'out'], varargs=None, keywords=None, defaults=(None,)), ('document', '642afd126553337d6796600e886a6525')) +paddle.fluid.layers.diag (ArgSpec(args=['diagonal'], varargs=None, keywords=None, defaults=None), ('document', '88a15e15f0098d549f07a01eaebf9ce3')) +paddle.fluid.layers.While ('paddle.fluid.layers.control_flow.While', ('document', '50110155608a00f43d3d3fd1be41dcb4')) +paddle.fluid.layers.While.__init__ (ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.While.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.Switch ('paddle.fluid.layers.control_flow.Switch', ('document', 'a1c5ef8ff117d7d6ba8940ec104f02ce')) +paddle.fluid.layers.Switch.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.Switch.case (ArgSpec(args=['self', 'condition'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.Switch.default (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.increment (ArgSpec(args=['x', 'value', 'in_place'], varargs=None, keywords=None, defaults=(1.0, True)), ('document', 'f88b5787bb80ae6b8bf513a70dabbdc1')) +paddle.fluid.layers.array_write (ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,)), ('document', '3f913b5069ad40bd85d89b33e4aa5939')) +paddle.fluid.layers.create_array (ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None), ('document', '556de793fdf24d515f3fc91260e2c048')) +paddle.fluid.layers.less_than (ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords=None, defaults=(None, None)), ('document', '04af32422c3a3d8f6040aeb406c82768')) +paddle.fluid.layers.less_equal (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '7b6d952a9f6340a044cfb91c16aad842')) +paddle.fluid.layers.greater_than (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '55710e2fafeda70cd1b53d7509712499')) +paddle.fluid.layers.greater_equal (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '14bff27b2be5e60eaa30e41925265beb')) +paddle.fluid.layers.equal (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '788aa651e8b9fec79d16931ef3a33e90')) +paddle.fluid.layers.not_equal (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '57adebb8858ffab6be2d86d0522b85dc')) +paddle.fluid.layers.array_read (ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None), ('document', 'caf0d94349cdc28e1bda3b8a19411ac0')) +paddle.fluid.layers.array_length (ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None), ('document', '6f24a9b872027634ad758ea2826c9727')) +paddle.fluid.layers.IfElse ('paddle.fluid.layers.control_flow.IfElse', ('document', 'a389f88e19c3b332c3afcbf7df4488a5')) +paddle.fluid.layers.IfElse.__init__ (ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.IfElse.false_block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.IfElse.input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.IfElse.output (ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.IfElse.true_block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.DynamicRNN ('paddle.fluid.layers.control_flow.DynamicRNN', ('document', 'b71e87285dbd4a43a6cc4f8a473245e6')) +paddle.fluid.layers.DynamicRNN.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.DynamicRNN.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6d3e0a5d9aa519a9773a36e1620ea9b7')) +paddle.fluid.layers.DynamicRNN.memory (ArgSpec(args=['self', 'init', 'shape', 'value', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, False, 'float32')), ('document', '57cdd0a63747f4c670cdb9d250ceb7e1')) +paddle.fluid.layers.DynamicRNN.output (ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None), ('document', 'b439a176a3328de8a75bdc5c08eece4a')) +paddle.fluid.layers.DynamicRNN.static_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '55ab9c562edd7dabec0bd6fd6c1a28cc')) +paddle.fluid.layers.DynamicRNN.step_input (ArgSpec(args=['self', 'x', 'level'], varargs=None, keywords=None, defaults=(0,)), ('document', '4b300851b5201891d0e11c406e4c7d07')) +paddle.fluid.layers.DynamicRNN.update_memory (ArgSpec(args=['self', 'ex_mem', 'new_mem'], varargs=None, keywords=None, defaults=None), ('document', '5d83987da13b98363d6a807a52d8024f')) +paddle.fluid.layers.StaticRNN ('paddle.fluid.layers.control_flow.StaticRNN', ('document', 'f73671cb98696a1962bf5deaf49dc2e9')) +paddle.fluid.layers.StaticRNN.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.StaticRNN.memory (ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1)), ('document', 'f1b60dc4194d0bb714d6c6f5921b227f')) +paddle.fluid.layers.StaticRNN.output (ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None), ('document', 'df6ceab6e6c9bd31e97914d7e7538137')) +paddle.fluid.layers.StaticRNN.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6d3e0a5d9aa519a9773a36e1620ea9b7')) +paddle.fluid.layers.StaticRNN.step_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '903387ec11f3d0bf46821d31a68cffa5')) +paddle.fluid.layers.StaticRNN.step_output (ArgSpec(args=['self', 'o'], varargs=None, keywords=None, defaults=None), ('document', '252890d4c3199a7623ab8667e13fd837')) +paddle.fluid.layers.StaticRNN.update_memory (ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None), ('document', '7a0000520f179f35239956a5ba55119f')) +paddle.fluid.layers.reorder_lod_tensor_by_rank (ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None), ('document', '5b552a1f0f7eb4dacb768a975ba15d08')) +paddle.fluid.layers.Print (ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, -1, True, True, True, True, 'both')), ('document', 'ee6c70867d317b0a87094ed23546215f')) +paddle.fluid.layers.is_empty (ArgSpec(args=['x', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '3011dc695f490afdf504dc24f628319a')) +paddle.fluid.layers.sigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd894323f31a913c4a5bd4cc764f6a76a')) +paddle.fluid.layers.logsigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd083538e3439ed6b28b00207e0f321d5')) +paddle.fluid.layers.exp (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '9ef0909adb4d8c9430fcd595bab72dc1')) +paddle.fluid.layers.tanh (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'f820eeaf81dfbdd1c360122cd5795cc8')) +paddle.fluid.layers.atan (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '2dde114018cbcaff9b24c566bf6704a5')) +paddle.fluid.layers.tanh_shrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '63d198e36e1d85dcfb454c1a3cb3b38e')) +paddle.fluid.layers.softshrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '4f53a5e7f50c55ea516375ef8f46316b')) +paddle.fluid.layers.sqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '893ec81a025f3c82f1c8fca6aa84d39f')) +paddle.fluid.layers.rsqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c629f5163fa04f80abb3d0240c462fa6')) +paddle.fluid.layers.abs (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'f6d5642f52e357f3cec89cc9c15dc66c')) +paddle.fluid.layers.ceil (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e5ccc5339056e947272c1921d11e6cfe')) +paddle.fluid.layers.floor (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b9c1474e5d0f83e4a15a5cd827abbf9c')) +paddle.fluid.layers.cos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd7d3af92e8c1d93aeeb4d6bc2e0fc9b6')) +paddle.fluid.layers.acos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5c9a00178c5c28bb824f7d6c25060d3b')) +paddle.fluid.layers.asin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '20d1d49fe4d13430a63c57fc4b29a677')) +paddle.fluid.layers.sin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0da9ea1a725c3d91ca0c37cea951ba29')) +paddle.fluid.layers.round (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e91cb3422c0ffdc04375752143179b47')) +paddle.fluid.layers.reciprocal (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '30bd7174c21294230616a22cd87b0035')) +paddle.fluid.layers.square (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '200916f013bad0b052b13dc43901f0b8')) +paddle.fluid.layers.softplus (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'be4533a4cd97c84424512dca76142083')) +paddle.fluid.layers.softsign (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '410f27a44b7365cc60d5d5ff5a53407e')) +paddle.fluid.layers.uniform_random (ArgSpec(args=['shape', 'dtype', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', -1.0, 1.0, 0)), ('document', '6de6775d9e9ed885056e764982130cfd')) +paddle.fluid.layers.hard_shrink (ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', '386a4103d2884b2f1312ebc1e8ee6486')) +paddle.fluid.layers.cumsum (ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '5ab9d5721a6734fe127069e4314e1309')) +paddle.fluid.layers.thresholded_relu (ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', '9a0464425426a9b9c1b7500ede2836c1')) +paddle.fluid.layers.prior_box (ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False)), ('document', '0fdf82762fd0a5acb2578a72771b5b44')) +paddle.fluid.layers.density_prior_box (ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'flatten_to_2d', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, False, None)), ('document', '7a484a0da5e993a7734867a3dfa86571')) +paddle.fluid.layers.multi_box_head (ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False)), ('document', 'fd58078fdfffd899b91f992ba224628f')) +paddle.fluid.layers.bipartite_match (ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '080ce0d54d3f1950ad5a3a8e5ae529e9')) +paddle.fluid.layers.target_assign (ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'e9685f32d21bec8c013626c0254502c5')) +paddle.fluid.layers.detection_output (ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0)), ('document', 'efae414c1137c7944d6174dd08c5347a')) +paddle.fluid.layers.ssd_loss (ArgSpec(args=['location', 'confidence', 'gt_box', 'gt_label', 'prior_box', 'prior_box_var', 'background_label', 'overlap_threshold', 'neg_pos_ratio', 'neg_overlap', 'loc_loss_weight', 'conf_loss_weight', 'match_type', 'mining_type', 'normalize', 'sample_size'], varargs=None, keywords=None, defaults=(None, 0, 0.5, 3.0, 0.5, 1.0, 1.0, 'per_prediction', 'max_negative', True, None)), ('document', '8edacd4b9bd02dd68931b9fa6bfe0cbd')) +paddle.fluid.layers.rpn_target_assign (ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'is_crowd', 'im_info', 'rpn_batch_size_per_im', 'rpn_straddle_thresh', 'rpn_fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.0, 0.5, 0.7, 0.3, True)), ('document', '651d98d51879dfa1bc1cd40391786a41')) +paddle.fluid.layers.retinanet_target_assign (ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'gt_labels', 'is_crowd', 'im_info', 'num_classes', 'positive_overlap', 'negative_overlap'], varargs=None, keywords=None, defaults=(1, 0.5, 0.4)), ('document', 'fa1d1c9d5e0111684c0db705f86a2595')) +paddle.fluid.layers.sigmoid_focal_loss (ArgSpec(args=['x', 'label', 'fg_num', 'gamma', 'alpha'], varargs=None, keywords=None, defaults=(2, 0.25)), ('document', 'aeac6aae100173b3fc7f102cf3023a3d')) +paddle.fluid.layers.anchor_generator (ArgSpec(args=['input', 'anchor_sizes', 'aspect_ratios', 'variance', 'stride', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, [0.1, 0.1, 0.2, 0.2], None, 0.5, None)), ('document', '0aaacaf9858b8270a8ab5b0aacdd94b7')) +paddle.fluid.layers.roi_perspective_transform (ArgSpec(args=['input', 'rois', 'transformed_height', 'transformed_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1.0,)), ('document', 'a82016342789ba9d85737e405f824ff1')) +paddle.fluid.layers.generate_proposal_labels (ArgSpec(args=['rpn_rois', 'gt_classes', 'is_crowd', 'gt_boxes', 'im_info', 'batch_size_per_im', 'fg_fraction', 'fg_thresh', 'bg_thresh_hi', 'bg_thresh_lo', 'bbox_reg_weights', 'class_nums', 'use_random', 'is_cls_agnostic', 'is_cascade_rcnn'], varargs=None, keywords=None, defaults=(256, 0.25, 0.25, 0.5, 0.0, [0.1, 0.1, 0.2, 0.2], None, True, False, False)), ('document', '69def376b42ef0681d0cc7f53a2dac4b')) +paddle.fluid.layers.generate_proposals (ArgSpec(args=['scores', 'bbox_deltas', 'im_info', 'anchors', 'variances', 'pre_nms_top_n', 'post_nms_top_n', 'nms_thresh', 'min_size', 'eta', 'name'], varargs=None, keywords=None, defaults=(6000, 1000, 0.5, 0.1, 1.0, None)), ('document', 'b7d707822b6af2a586bce608040235b1')) +paddle.fluid.layers.generate_mask_labels (ArgSpec(args=['im_info', 'gt_classes', 'is_crowd', 'gt_segms', 'rois', 'labels_int32', 'num_classes', 'resolution'], varargs=None, keywords=None, defaults=None), ('document', 'b319b10ddaf17fb4ddf03518685a17ef')) +paddle.fluid.layers.iou_similarity (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '72fca4a39ccf82d5c746ae62d1868a99')) +paddle.fluid.layers.box_coder (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)), ('document', '4c6225fc1a1c0b84955a8f0013008243')) +paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e308ce1661cb722b220a6f482f85b9e4')) +paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gt_box', 'gt_label', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'gt_score', 'use_label_smooth', 'name'], varargs=None, keywords=None, defaults=(None, True, None)), ('document', '400403175718d5a632402cdae88b01b8')) +paddle.fluid.layers.yolo_box (ArgSpec(args=['x', 'img_size', 'anchors', 'class_num', 'conf_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '11b463ae2ad4c797fb91b3ee9864c4b4')) +paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '9ddee76cb808db83768bf68010e39b2b')) +paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', '76d74056e9eedcacf013d8e3b115cbd3')) +paddle.fluid.layers.retinanet_detection_output (ArgSpec(args=['bboxes', 'scores', 'anchors', 'im_info', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0.05, 1000, 100, 0.3, 1.0)), ('document', '078d28607ce261a0cba2b965a79f6bb8')) +paddle.fluid.layers.distribute_fpn_proposals (ArgSpec(args=['fpn_rois', 'min_level', 'max_level', 'refer_level', 'refer_scale', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6c023b9401214ae387a8b2d92638e5e4')) +paddle.fluid.layers.box_decoder_and_assign (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'box_score', 'box_clip', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '3619a7847709f5868f5e929065947b38')) +paddle.fluid.layers.collect_fpn_proposals (ArgSpec(args=['multi_rois', 'multi_scores', 'min_level', 'max_level', 'post_nms_top_n', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '80a75103e001ca1ba056fbbe0c6a19f3')) +paddle.fluid.layers.accuracy (ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)), ('document', 'ef799022a6040597462ae2b3d2f1c407')) +paddle.fluid.layers.auc (ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)), ('document', '300537e259bba86fdefa13a133a0587d')) +paddle.fluid.layers.exponential_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)), ('document', 'eaf430c5a0380fb11bfe9a8922cd6295')) +paddle.fluid.layers.natural_exp_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)), ('document', '63a9e96d446d7de1289f30b832bce36a')) +paddle.fluid.layers.inverse_time_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)), ('document', 'ea37a3a8a0b3ce2254e7bc49a0951dbe')) +paddle.fluid.layers.polynomial_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'end_learning_rate', 'power', 'cycle'], varargs=None, keywords=None, defaults=(0.0001, 1.0, False)), ('document', 'a343254c36c2e89512cd8cd8a1960ead')) +paddle.fluid.layers.piecewise_decay (ArgSpec(args=['boundaries', 'values'], varargs=None, keywords=None, defaults=None), ('document', 'd9f654117542c6b702963dda107a247f')) +paddle.fluid.layers.noam_decay (ArgSpec(args=['d_model', 'warmup_steps'], varargs=None, keywords=None, defaults=None), ('document', 'fd57228fb76195e66bbcc8d8e42c494d')) +paddle.fluid.layers.cosine_decay (ArgSpec(args=['learning_rate', 'step_each_epoch', 'epochs'], varargs=None, keywords=None, defaults=None), ('document', 'f0d65d8c89d0fe78051ca689daa15e35')) +paddle.fluid.layers.linear_lr_warmup (ArgSpec(args=['learning_rate', 'warmup_steps', 'start_lr', 'end_lr'], varargs=None, keywords=None, defaults=None), ('document', 'dc7292c456847ba41cfd318e9f7f4363')) +paddle.fluid.layers.Uniform ('paddle.fluid.layers.distributions.Uniform', ('document', 'af70e7003f437e7a8a9e28cded35c433')) +paddle.fluid.layers.Uniform.__init__ (ArgSpec(args=['self', 'low', 'high'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.Uniform.entropy (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'ba59f9ce77af3c93e2b4c8af1801a24e')) +paddle.fluid.layers.Uniform.kl_divergence (ArgSpec(args=['self', 'other'], varargs=None, keywords=None, defaults=None), ('document', '3baee52abbed82d47e9588d9dfe2f42f')) +paddle.fluid.layers.Uniform.log_prob (ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None), ('document', 'b79091014ceaffb6a7372a198a341c23')) +paddle.fluid.layers.Uniform.sample (ArgSpec(args=['self', 'shape', 'seed'], varargs=None, keywords=None, defaults=(0,)), ('document', 'adac334af13f6984e991b3ecf12b8cb7')) +paddle.fluid.layers.Normal ('paddle.fluid.layers.distributions.Normal', ('document', '3265262d0d8b3b32c6245979a5cdced9')) +paddle.fluid.layers.Normal.__init__ (ArgSpec(args=['self', 'loc', 'scale'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.Normal.entropy (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'd2db47b1e62c037a2570fc526b93f518')) +paddle.fluid.layers.Normal.kl_divergence (ArgSpec(args=['self', 'other'], varargs=None, keywords=None, defaults=None), ('document', '2e8845cdf1129647e6fa6e816876cd3b')) +paddle.fluid.layers.Normal.log_prob (ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None), ('document', 'b79091014ceaffb6a7372a198a341c23')) +paddle.fluid.layers.Normal.sample (ArgSpec(args=['self', 'shape', 'seed'], varargs=None, keywords=None, defaults=(0,)), ('document', 'adac334af13f6984e991b3ecf12b8cb7')) +paddle.fluid.contrib.InitState ('paddle.fluid.contrib.decoder.beam_search_decoder.InitState', ('document', '3afd1f84232718e628e9e566941c5f05')) +paddle.fluid.contrib.InitState.__init__ (ArgSpec(args=['self', 'init', 'shape', 'value', 'init_boot', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, None, False, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.StateCell ('paddle.fluid.contrib.decoder.beam_search_decoder.StateCell', ('document', 'ecd0066c02867d445d7b461e28220c50')) +paddle.fluid.contrib.StateCell.__init__ (ArgSpec(args=['self', 'inputs', 'states', 'out_state', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.StateCell.compute_state (ArgSpec(args=['self', 'inputs'], varargs=None, keywords=None, defaults=None), ('document', '92973b3f222081a1d17069c683cf4a99')) +paddle.fluid.contrib.StateCell.get_input (ArgSpec(args=['self', 'input_name'], varargs=None, keywords=None, defaults=None), ('document', '6f24a007cfa184e32f01a960703bfd70')) +paddle.fluid.contrib.StateCell.get_state (ArgSpec(args=['self', 'state_name'], varargs=None, keywords=None, defaults=None), ('document', '630a4945cfe659ea4f307598fbbce5d2')) +paddle.fluid.contrib.StateCell.out_state (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '7ad681dff0393ddf13a724194e720f28')) +paddle.fluid.contrib.StateCell.set_state (ArgSpec(args=['self', 'state_name', 'state_value'], varargs=None, keywords=None, defaults=None), ('document', 'd4e0e08cd5d9d9a571cbc52d114f5ae9')) +paddle.fluid.contrib.StateCell.state_updater (ArgSpec(args=['self', 'updater'], varargs=None, keywords=None, defaults=None), ('document', 'd5afe1b7665d94fb023b15cf913ca510')) +paddle.fluid.contrib.StateCell.update_states (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'fe0b0f1338723516a35a30247899c81b')) +paddle.fluid.contrib.TrainingDecoder ('paddle.fluid.contrib.decoder.beam_search_decoder.TrainingDecoder', ('document', 'cec7e190c2bdd3b17e8178bc3f177799')) +paddle.fluid.contrib.TrainingDecoder.__init__ (ArgSpec(args=['self', 'state_cell', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.TrainingDecoder.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '98d88fa1c989748410a12517c6a585bf')) +paddle.fluid.contrib.TrainingDecoder.output (ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None), ('document', 'f0a457dee586559036202087ce2eff69')) +paddle.fluid.contrib.TrainingDecoder.static_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', 'a024c72664fe815068423ba630b7658a')) +paddle.fluid.contrib.TrainingDecoder.step_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '4659db7a888a2495e71c1838a0483909')) +paddle.fluid.contrib.BeamSearchDecoder ('paddle.fluid.contrib.decoder.beam_search_decoder.BeamSearchDecoder', ('document', '102da4a2d2002fbb12d44b8ea36121ed')) +paddle.fluid.contrib.BeamSearchDecoder.__init__ (ArgSpec(args=['self', 'state_cell', 'init_ids', 'init_scores', 'target_dict_dim', 'word_dim', 'input_var_dict', 'topk_size', 'sparse_emb', 'max_len', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=({}, 50, True, 100, 1, 1, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.BeamSearchDecoder.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '98d88fa1c989748410a12517c6a585bf')) +paddle.fluid.contrib.BeamSearchDecoder.decode (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '1e47c60f080c1343ebb6ceaef89656b2')) +paddle.fluid.contrib.BeamSearchDecoder.early_stop (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '3a84a7454ed6707f79b9e954d92a7575')) +paddle.fluid.contrib.BeamSearchDecoder.read_array (ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False)), ('document', 'aa89eb8fd5e4cabaf5cc1bcae14665a4')) +paddle.fluid.contrib.BeamSearchDecoder.update_array (ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None), ('document', '5754e9b3212b7c09497151516a0de5a7')) +paddle.fluid.contrib.memory_usage (ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', '8fcb2f93bb743693baa8d4860a5ccc47')) +paddle.fluid.contrib.op_freq_statistic (ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None), ('document', '4d43687113c4bf5b29d15aee2f4e4afa')) +paddle.fluid.contrib.QuantizeTranspiler ('paddle.fluid.contrib.quantize.quantize_transpiler.QuantizeTranspiler', ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.QuantizeTranspiler.__init__ (ArgSpec(args=['self', 'weight_bits', 'activation_bits', 'activation_quantize_type', 'weight_quantize_type', 'window_size', 'moving_rate'], varargs=None, keywords=None, defaults=(8, 8, 'abs_max', 'abs_max', 10000, 0.9)), ('document', '14b39f1fcd5667ff556b1aad94357d1d')) +paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 (ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.QuantizeTranspiler.freeze_program (ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)), ('document', '909675a1ab055c69b436a7893fcae4fd')) +paddle.fluid.contrib.QuantizeTranspiler.training_transpile (ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6dd9909f10b283ba2892a99058a72884')) +paddle.fluid.contrib.distributed_batch_reader (ArgSpec(args=['batch_reader'], varargs=None, keywords=None, defaults=None), ('document', 'b60796eb0a481484dd34e345f0eaa4d5')) +paddle.fluid.contrib.Compressor ('paddle.fluid.contrib.slim.core.compressor.Compressor', ('document', 'a5417774a94aa9ae5560a42b96527e7d')) +paddle.fluid.contrib.Compressor.__init__ (ArgSpec(args=['self', 'place', 'scope', 'train_program', 'train_reader', 'train_feed_list', 'train_fetch_list', 'eval_program', 'eval_reader', 'eval_feed_list', 'eval_fetch_list', 'teacher_programs', 'checkpoint_path', 'train_optimizer', 'distiller_optimizer', 'search_space'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, [], None, None, None, None)), ('document', 'c195b3bba26169cff9439e8c467557c0')) +paddle.fluid.contrib.Compressor.config (ArgSpec(args=['self', 'config_file'], varargs=None, keywords=None, defaults=None), ('document', '780d9c007276ccbb95b292400d7807b0')) +paddle.fluid.contrib.Compressor.run (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'c6e43d6a078d307672283c1f36e04fe9')) +paddle.fluid.contrib.load_persistables_for_increment (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None), ('document', '2ab36d4f7a564f5f65e455807ad06c67')) +paddle.fluid.contrib.load_persistables_for_inference (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None), ('document', '59066bac9db0ac6ce414d05780b7333f')) +paddle.fluid.contrib.convert_dist_to_sparse_program (ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None), ('document', '74c39c595dc70d6be2f16d8e462d282b')) +paddle.fluid.contrib.HDFSClient ('paddle.fluid.contrib.utils.hdfs_utils.HDFSClient', ('document', '31207aa18424eab2249c54fe11724798')) +paddle.fluid.contrib.HDFSClient.__init__ (ArgSpec(args=['self', 'hadoop_home', 'configs'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.HDFSClient.delete (ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None), ('document', 'c3721aa2d4d9ef5a857dd47b2681c03e')) +paddle.fluid.contrib.HDFSClient.download (ArgSpec(args=['self', 'hdfs_path', 'local_path', 'overwrite', 'unzip'], varargs=None, keywords=None, defaults=(False, False)), ('document', 'ca55bde92184d3fd0f9f5c963b25e634')) +paddle.fluid.contrib.HDFSClient.is_dir (ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=(None,)), ('document', '45bde1bae02605a205c8245b58b9156d')) +paddle.fluid.contrib.HDFSClient.is_exist (ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=(None,)), ('document', 'be9c94bccff7ba0c1d95883ac62b5864')) +paddle.fluid.contrib.HDFSClient.ls (ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None), ('document', '808acac504870c7e46594b95674f8a86')) +paddle.fluid.contrib.HDFSClient.lsr (ArgSpec(args=['self', 'hdfs_path', 'only_file', 'sort'], varargs=None, keywords=None, defaults=(True, True)), ('document', 'fae835aa3354eb6a0434c0f9ba3c2747')) +paddle.fluid.contrib.HDFSClient.make_local_dirs (ArgSpec(args=['local_path'], varargs=None, keywords=None, defaults=None), ('document', 'e76b89c8e7f019b5da576c0026fcf689')) +paddle.fluid.contrib.HDFSClient.makedirs (ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None), ('document', '44d9972aae390aedf40aaea731a37e4b')) +paddle.fluid.contrib.HDFSClient.rename (ArgSpec(args=['self', 'hdfs_src_path', 'hdfs_dst_path', 'overwrite'], varargs=None, keywords=None, defaults=(False,)), ('document', '0eb133644d9a9f4da45bb39261ff0955')) +paddle.fluid.contrib.HDFSClient.upload (ArgSpec(args=['self', 'hdfs_path', 'local_path', 'overwrite', 'retry_times'], varargs=None, keywords=None, defaults=(False, 5)), ('document', '7d053b4bfd6dcfdd2c9dda0e0dbd9665')) +paddle.fluid.contrib.multi_download (ArgSpec(args=['client', 'hdfs_path', 'local_path', 'trainer_id', 'trainers', 'multi_processes'], varargs=None, keywords=None, defaults=(5,)), ('document', '100927be598ed8f9eaa1f3ef1b23568a')) +paddle.fluid.contrib.multi_upload (ArgSpec(args=['client', 'hdfs_path', 'local_path', 'multi_processes', 'overwrite', 'sync'], varargs=None, keywords=None, defaults=(5, False, True)), ('document', '183f34c83d30dbe16e09e8716c41958a')) +paddle.fluid.contrib.extend_with_decoupled_weight_decay (ArgSpec(args=['base_optimizer'], varargs=None, keywords=None, defaults=None), ('document', 'a1095dfd4ec725747f662d69cd7659d4')) +paddle.fluid.contrib.mixed_precision.decorate (ArgSpec(args=['optimizer', 'amp_lists', 'init_loss_scaling', 'incr_every_n_steps', 'decr_every_n_nan_or_inf', 'incr_ratio', 'decr_ratio', 'use_dynamic_loss_scaling'], varargs=None, keywords=None, defaults=(None, 1.0, 1000, 2, 2.0, 0.8, False)), ('document', 'd05e71f5b0bd6d92bb94e70e00b3f9cf')) +paddle.fluid.contrib.mixed_precision.AutoMixedPrecisionLists ('paddle.fluid.contrib.mixed_precision.fp16_lists.AutoMixedPrecisionLists', ('document', 'c116ec6bb5d30998792daea8db21ee40')) +paddle.fluid.contrib.mixed_precision.AutoMixedPrecisionLists.__init__ (ArgSpec(args=['self', 'custom_white_list', 'custom_black_list'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.fused_elemwise_activation (ArgSpec(args=['x', 'y', 'functor_list', 'axis', 'scale', 'save_intermediate_out'], varargs=None, keywords=None, defaults=(-1, 0.0, True)), ('document', '1c4b247a2858cea8d9d8750693688270')) +paddle.fluid.contrib.BasicGRUUnit ('paddle.fluid.contrib.layers.rnn_impl.BasicGRUUnit', ('document', '2aed2540ed1540f081be9f4d08f2a65e')) +paddle.fluid.contrib.BasicGRUUnit.__init__ (ArgSpec(args=['self', 'name_scope', 'hidden_size', 'param_attr', 'bias_attr', 'gate_activation', 'activation', 'dtype'], varargs=None, keywords=None, defaults=(None, None, None, None, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.BasicGRUUnit.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1')) +paddle.fluid.contrib.BasicGRUUnit.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995')) +paddle.fluid.contrib.BasicGRUUnit.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.BasicGRUUnit.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.BasicGRUUnit.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6')) +paddle.fluid.contrib.BasicGRUUnit.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8')) +paddle.fluid.contrib.BasicGRUUnit.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.BasicGRUUnit.forward (ArgSpec(args=['self', 'input', 'pre_hidden'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.BasicGRUUnit.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2')) +paddle.fluid.contrib.BasicGRUUnit.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.BasicGRUUnit.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5')) +paddle.fluid.contrib.BasicGRUUnit.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.BasicGRUUnit.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62')) +paddle.fluid.contrib.BasicGRUUnit.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.basic_gru (ArgSpec(args=['input', 'init_hidden', 'hidden_size', 'num_layers', 'sequence_length', 'dropout_prob', 'bidirectional', 'batch_first', 'param_attr', 'bias_attr', 'gate_activation', 'activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(1, None, 0.0, False, True, None, None, None, None, 'float32', 'basic_gru')), ('document', '0afcbe4fbe1b8c35eda58b4efe48f9fd')) +paddle.fluid.contrib.BasicLSTMUnit ('paddle.fluid.contrib.layers.rnn_impl.BasicLSTMUnit', ('document', '3d0b2e3172ce58e1304199efee066c99')) +paddle.fluid.contrib.BasicLSTMUnit.__init__ (ArgSpec(args=['self', 'name_scope', 'hidden_size', 'param_attr', 'bias_attr', 'gate_activation', 'activation', 'forget_bias', 'dtype'], varargs=None, keywords=None, defaults=(None, None, None, None, 1.0, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.BasicLSTMUnit.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1')) +paddle.fluid.contrib.BasicLSTMUnit.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995')) +paddle.fluid.contrib.BasicLSTMUnit.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.BasicLSTMUnit.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.BasicLSTMUnit.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6')) +paddle.fluid.contrib.BasicLSTMUnit.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8')) +paddle.fluid.contrib.BasicLSTMUnit.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.BasicLSTMUnit.forward (ArgSpec(args=['self', 'input', 'pre_hidden', 'pre_cell'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.BasicLSTMUnit.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2')) +paddle.fluid.contrib.BasicLSTMUnit.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.BasicLSTMUnit.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5')) +paddle.fluid.contrib.BasicLSTMUnit.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.BasicLSTMUnit.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62')) +paddle.fluid.contrib.BasicLSTMUnit.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.contrib.basic_lstm (ArgSpec(args=['input', 'init_hidden', 'init_cell', 'hidden_size', 'num_layers', 'sequence_length', 'dropout_prob', 'bidirectional', 'batch_first', 'param_attr', 'bias_attr', 'gate_activation', 'activation', 'forget_bias', 'dtype', 'name'], varargs=None, keywords=None, defaults=(1, None, 0.0, False, True, None, None, None, None, 1.0, 'float32', 'basic_lstm')), ('document', 'fe4d0c3c55a162b8cfe10b05fabb7ce4')) +paddle.fluid.dygraph.Layer ('paddle.fluid.dygraph.layers.Layer', ('document', 'a889d5affd734ede273e94d4257163ab')) +paddle.fluid.dygraph.Layer.__init__ (ArgSpec(args=['self', 'name_scope', 'dtype'], varargs=None, keywords=None, defaults=(VarType.FP32,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Layer.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1')) +paddle.fluid.dygraph.Layer.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995')) +paddle.fluid.dygraph.Layer.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Layer.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Layer.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6')) +paddle.fluid.dygraph.Layer.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8')) +paddle.fluid.dygraph.Layer.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Layer.forward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Layer.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2')) +paddle.fluid.dygraph.Layer.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Layer.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5')) +paddle.fluid.dygraph.Layer.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Layer.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62')) +paddle.fluid.dygraph.Layer.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.__impl__ (ArgSpec(args=['func'], varargs=None, keywords=None, defaults=()), ('document', 'fa71ad4e6c2b5bf2b5258bd1959f9b2a')) +paddle.fluid.dygraph.guard (ArgSpec(args=['place'], varargs=None, keywords=None, defaults=(None,)), ('document', '7071320ffe2eec9aacdae574951278c6')) +paddle.fluid.dygraph.to_variable (ArgSpec(args=['value', 'block', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '9a65d87163a2c6b00fb78f4e61fb3300')) +paddle.fluid.dygraph.Conv2D ('paddle.fluid.dygraph.nn.Conv2D', ('document', 'baafe7ae0d3a61ae79cf4c7443e2c37c')) +paddle.fluid.dygraph.Conv2D.__init__ (ArgSpec(args=['self', 'name_scope', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'dtype'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Conv2D.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1')) +paddle.fluid.dygraph.Conv2D.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995')) +paddle.fluid.dygraph.Conv2D.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Conv2D.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Conv2D.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6')) +paddle.fluid.dygraph.Conv2D.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8')) +paddle.fluid.dygraph.Conv2D.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Conv2D.forward (ArgSpec(args=['self', 'input'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Conv2D.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2')) +paddle.fluid.dygraph.Conv2D.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Conv2D.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5')) +paddle.fluid.dygraph.Conv2D.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Conv2D.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62')) +paddle.fluid.dygraph.Conv2D.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Conv3D ('paddle.fluid.dygraph.nn.Conv3D', ('document', '8b756aaca65af9594cc574d9a5d2b055')) +paddle.fluid.dygraph.Conv3D.__init__ (ArgSpec(args=['self', 'name_scope', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Conv3D.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1')) +paddle.fluid.dygraph.Conv3D.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995')) +paddle.fluid.dygraph.Conv3D.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Conv3D.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Conv3D.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6')) +paddle.fluid.dygraph.Conv3D.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8')) +paddle.fluid.dygraph.Conv3D.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Conv3D.forward (ArgSpec(args=['self', 'input'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Conv3D.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2')) +paddle.fluid.dygraph.Conv3D.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Conv3D.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5')) +paddle.fluid.dygraph.Conv3D.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Conv3D.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62')) +paddle.fluid.dygraph.Conv3D.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Pool2D ('paddle.fluid.dygraph.nn.Pool2D', ('document', 'e9331666e47a38586c8809a23cbaf7de')) +paddle.fluid.dygraph.Pool2D.__init__ (ArgSpec(args=['self', 'name_scope', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'exclusive', 'dtype'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, True, VarType.FP32)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Pool2D.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1')) +paddle.fluid.dygraph.Pool2D.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995')) +paddle.fluid.dygraph.Pool2D.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Pool2D.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Pool2D.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6')) +paddle.fluid.dygraph.Pool2D.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8')) +paddle.fluid.dygraph.Pool2D.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Pool2D.forward (ArgSpec(args=['self', 'input'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Pool2D.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2')) +paddle.fluid.dygraph.Pool2D.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Pool2D.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5')) +paddle.fluid.dygraph.Pool2D.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Pool2D.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62')) +paddle.fluid.dygraph.Pool2D.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.FC ('paddle.fluid.dygraph.nn.FC', ('document', '1d64242f03f2aca2307e94590b552430')) +paddle.fluid.dygraph.FC.__init__ (ArgSpec(args=['self', 'name_scope', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'dtype'], varargs=None, keywords=None, defaults=(1, None, None, None, False, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.FC.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1')) +paddle.fluid.dygraph.FC.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995')) +paddle.fluid.dygraph.FC.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.FC.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.FC.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6')) +paddle.fluid.dygraph.FC.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8')) +paddle.fluid.dygraph.FC.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.FC.forward (ArgSpec(args=['self', 'input'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.FC.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2')) +paddle.fluid.dygraph.FC.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.FC.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5')) +paddle.fluid.dygraph.FC.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.FC.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62')) +paddle.fluid.dygraph.FC.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.BatchNorm ('paddle.fluid.dygraph.nn.BatchNorm', ('document', '0b609e10e4d417c91d346f887d16771c')) +paddle.fluid.dygraph.BatchNorm.__init__ (ArgSpec(args=['self', 'name_scope', 'num_channels', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'dtype', 'data_layout', 'in_place', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats', 'trainable_statistics'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'float32', 'NCHW', False, None, None, False, False, False, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.BatchNorm.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1')) +paddle.fluid.dygraph.BatchNorm.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995')) +paddle.fluid.dygraph.BatchNorm.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.BatchNorm.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.BatchNorm.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6')) +paddle.fluid.dygraph.BatchNorm.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8')) +paddle.fluid.dygraph.BatchNorm.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.BatchNorm.forward (ArgSpec(args=['self', 'input'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.BatchNorm.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2')) +paddle.fluid.dygraph.BatchNorm.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.BatchNorm.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5')) +paddle.fluid.dygraph.BatchNorm.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.BatchNorm.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62')) +paddle.fluid.dygraph.BatchNorm.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Embedding ('paddle.fluid.dygraph.nn.Embedding', ('document', 'ecf8dc4149f005cd30eddc0dd343454f')) +paddle.fluid.dygraph.Embedding.__init__ (ArgSpec(args=['self', 'name_scope', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Embedding.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1')) +paddle.fluid.dygraph.Embedding.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995')) +paddle.fluid.dygraph.Embedding.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Embedding.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Embedding.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6')) +paddle.fluid.dygraph.Embedding.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8')) +paddle.fluid.dygraph.Embedding.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Embedding.forward (ArgSpec(args=['self', 'input'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Embedding.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2')) +paddle.fluid.dygraph.Embedding.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Embedding.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5')) +paddle.fluid.dygraph.Embedding.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Embedding.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62')) +paddle.fluid.dygraph.Embedding.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.GRUUnit ('paddle.fluid.dygraph.nn.GRUUnit', ('document', '5308e42b6a6db4681ce5ee9e94983986')) +paddle.fluid.dygraph.GRUUnit.__init__ (ArgSpec(args=['self', 'name_scope', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation', 'origin_mode', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.GRUUnit.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1')) +paddle.fluid.dygraph.GRUUnit.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995')) +paddle.fluid.dygraph.GRUUnit.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.GRUUnit.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.GRUUnit.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6')) +paddle.fluid.dygraph.GRUUnit.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8')) +paddle.fluid.dygraph.GRUUnit.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.GRUUnit.forward (ArgSpec(args=['self', 'input', 'hidden'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.GRUUnit.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2')) +paddle.fluid.dygraph.GRUUnit.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.GRUUnit.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5')) +paddle.fluid.dygraph.GRUUnit.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.GRUUnit.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62')) +paddle.fluid.dygraph.GRUUnit.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.LayerNorm ('paddle.fluid.dygraph.nn.LayerNorm', ('document', '85ea3ae0e470704546cabcafd61192e1')) +paddle.fluid.dygraph.LayerNorm.__init__ (ArgSpec(args=['self', 'name_scope', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.LayerNorm.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1')) +paddle.fluid.dygraph.LayerNorm.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995')) +paddle.fluid.dygraph.LayerNorm.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.LayerNorm.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.LayerNorm.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6')) +paddle.fluid.dygraph.LayerNorm.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8')) +paddle.fluid.dygraph.LayerNorm.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.LayerNorm.forward (ArgSpec(args=['self', 'input'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.LayerNorm.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2')) +paddle.fluid.dygraph.LayerNorm.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.LayerNorm.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5')) +paddle.fluid.dygraph.LayerNorm.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.LayerNorm.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62')) +paddle.fluid.dygraph.LayerNorm.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.NCE ('paddle.fluid.dygraph.nn.NCE', ('document', '47eb439a5568468fad70235f1e61ead9')) +paddle.fluid.dygraph.NCE.__init__ (ArgSpec(args=['self', 'name_scope', 'num_total_classes', 'param_attr', 'bias_attr', 'num_neg_samples', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, 'uniform', None, 0, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.NCE.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1')) +paddle.fluid.dygraph.NCE.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995')) +paddle.fluid.dygraph.NCE.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.NCE.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.NCE.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6')) +paddle.fluid.dygraph.NCE.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8')) +paddle.fluid.dygraph.NCE.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.NCE.forward (ArgSpec(args=['self', 'input', 'label', 'sample_weight'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.NCE.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2')) +paddle.fluid.dygraph.NCE.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.NCE.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5')) +paddle.fluid.dygraph.NCE.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.NCE.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62')) +paddle.fluid.dygraph.NCE.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.PRelu ('paddle.fluid.dygraph.nn.PRelu', ('document', 'd395ed163b4cf86e7207968f27bc1c11')) +paddle.fluid.dygraph.PRelu.__init__ (ArgSpec(args=['self', 'name_scope', 'mode', 'param_attr'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.PRelu.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1')) +paddle.fluid.dygraph.PRelu.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995')) +paddle.fluid.dygraph.PRelu.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.PRelu.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.PRelu.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6')) +paddle.fluid.dygraph.PRelu.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8')) +paddle.fluid.dygraph.PRelu.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.PRelu.forward (ArgSpec(args=['self', 'input'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.PRelu.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2')) +paddle.fluid.dygraph.PRelu.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.PRelu.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5')) +paddle.fluid.dygraph.PRelu.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.PRelu.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62')) +paddle.fluid.dygraph.PRelu.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.BilinearTensorProduct ('paddle.fluid.dygraph.nn.BilinearTensorProduct', ('document', '310140d784933928a27db9a7af4761e8')) +paddle.fluid.dygraph.BilinearTensorProduct.__init__ (ArgSpec(args=['self', 'name_scope', 'size', 'name', 'act', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.BilinearTensorProduct.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1')) +paddle.fluid.dygraph.BilinearTensorProduct.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995')) +paddle.fluid.dygraph.BilinearTensorProduct.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.BilinearTensorProduct.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.BilinearTensorProduct.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6')) +paddle.fluid.dygraph.BilinearTensorProduct.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8')) +paddle.fluid.dygraph.BilinearTensorProduct.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.BilinearTensorProduct.forward (ArgSpec(args=['self', 'x', 'y'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.BilinearTensorProduct.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2')) +paddle.fluid.dygraph.BilinearTensorProduct.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.BilinearTensorProduct.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5')) +paddle.fluid.dygraph.BilinearTensorProduct.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.BilinearTensorProduct.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62')) +paddle.fluid.dygraph.BilinearTensorProduct.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Conv2DTranspose ('paddle.fluid.dygraph.nn.Conv2DTranspose', ('document', '918fa8ad8a62ff424c842fb8a840bf7a')) +paddle.fluid.dygraph.Conv2DTranspose.__init__ (ArgSpec(args=['self', 'name_scope', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Conv2DTranspose.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1')) +paddle.fluid.dygraph.Conv2DTranspose.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995')) +paddle.fluid.dygraph.Conv2DTranspose.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Conv2DTranspose.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Conv2DTranspose.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6')) +paddle.fluid.dygraph.Conv2DTranspose.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8')) +paddle.fluid.dygraph.Conv2DTranspose.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Conv2DTranspose.forward (ArgSpec(args=['self', 'input'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Conv2DTranspose.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2')) +paddle.fluid.dygraph.Conv2DTranspose.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Conv2DTranspose.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5')) +paddle.fluid.dygraph.Conv2DTranspose.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Conv2DTranspose.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62')) +paddle.fluid.dygraph.Conv2DTranspose.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Conv3DTranspose ('paddle.fluid.dygraph.nn.Conv3DTranspose', ('document', 'cd99906d0813609ddea3fb6a2ac900dc')) +paddle.fluid.dygraph.Conv3DTranspose.__init__ (ArgSpec(args=['self', 'name_scope', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Conv3DTranspose.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1')) +paddle.fluid.dygraph.Conv3DTranspose.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995')) +paddle.fluid.dygraph.Conv3DTranspose.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Conv3DTranspose.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Conv3DTranspose.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6')) +paddle.fluid.dygraph.Conv3DTranspose.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8')) +paddle.fluid.dygraph.Conv3DTranspose.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Conv3DTranspose.forward (ArgSpec(args=['self', 'input'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Conv3DTranspose.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2')) +paddle.fluid.dygraph.Conv3DTranspose.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Conv3DTranspose.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5')) +paddle.fluid.dygraph.Conv3DTranspose.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Conv3DTranspose.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62')) +paddle.fluid.dygraph.Conv3DTranspose.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.GroupNorm ('paddle.fluid.dygraph.nn.GroupNorm', ('document', '4d65fc6b00970e3b5c5dd0abeacd47cb')) +paddle.fluid.dygraph.GroupNorm.__init__ (ArgSpec(args=['self', 'name_scope', 'groups', 'epsilon', 'param_attr', 'bias_attr', 'act', 'data_layout'], varargs=None, keywords=None, defaults=(1e-05, None, None, None, 'NCHW')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.GroupNorm.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1')) +paddle.fluid.dygraph.GroupNorm.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995')) +paddle.fluid.dygraph.GroupNorm.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.GroupNorm.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.GroupNorm.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6')) +paddle.fluid.dygraph.GroupNorm.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8')) +paddle.fluid.dygraph.GroupNorm.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.GroupNorm.forward (ArgSpec(args=['self', 'input'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.GroupNorm.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2')) +paddle.fluid.dygraph.GroupNorm.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.GroupNorm.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5')) +paddle.fluid.dygraph.GroupNorm.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.GroupNorm.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62')) +paddle.fluid.dygraph.GroupNorm.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.SpectralNorm ('paddle.fluid.dygraph.nn.SpectralNorm', ('document', 'f400a29393aa95fff829b4a6111e2952')) +paddle.fluid.dygraph.SpectralNorm.__init__ (ArgSpec(args=['self', 'name_scope', 'dim', 'power_iters', 'eps', 'name'], varargs=None, keywords=None, defaults=(0, 1, 1e-12, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.SpectralNorm.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1')) +paddle.fluid.dygraph.SpectralNorm.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995')) +paddle.fluid.dygraph.SpectralNorm.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.SpectralNorm.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.SpectralNorm.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6')) +paddle.fluid.dygraph.SpectralNorm.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8')) +paddle.fluid.dygraph.SpectralNorm.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.SpectralNorm.forward (ArgSpec(args=['self', 'weight'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.SpectralNorm.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2')) +paddle.fluid.dygraph.SpectralNorm.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.SpectralNorm.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5')) +paddle.fluid.dygraph.SpectralNorm.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.SpectralNorm.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62')) +paddle.fluid.dygraph.SpectralNorm.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.TreeConv ('paddle.fluid.dygraph.nn.TreeConv', ('document', '1e3104dea2482f6b79cf7a7ac9a343ab')) +paddle.fluid.dygraph.TreeConv.__init__ (ArgSpec(args=['self', 'name_scope', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.TreeConv.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1')) +paddle.fluid.dygraph.TreeConv.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995')) +paddle.fluid.dygraph.TreeConv.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.TreeConv.clear_gradients (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.TreeConv.create_parameter (ArgSpec(args=['self', 'attr', 'shape', 'dtype', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'a6420ca1455366eaaf972191612de0b6')) +paddle.fluid.dygraph.TreeConv.create_variable (ArgSpec(args=['self', 'name', 'persistable', 'dtype', 'type'], varargs=None, keywords=None, defaults=(None, None, None, VarType.LOD_TENSOR)), ('document', '171cccfceba636d5bbf7bbae672945d8')) +paddle.fluid.dygraph.TreeConv.eval (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.TreeConv.forward (ArgSpec(args=['self', 'nodes_vector', 'edge_set'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.TreeConv.full_name (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '23ce4f961f48ed0f79cadf93a3938ed2')) +paddle.fluid.dygraph.TreeConv.load_dict (ArgSpec(args=['self', 'stat_dict', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.TreeConv.parameters (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '5aec25a854eb57abc798dccccbb507d5')) +paddle.fluid.dygraph.TreeConv.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.TreeConv.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62')) +paddle.fluid.dygraph.TreeConv.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Tracer ('paddle.fluid.dygraph.tracer.Tracer', ('document', '28d72409112111274c33e1f07229d5da')) +paddle.fluid.dygraph.Tracer.__init__ (ArgSpec(args=['self', 'block'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Tracer.all_parameters (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Tracer.eval_mode (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Tracer.trace 1. trace(self: paddle.fluid.core_avx.Tracer, arg0: paddle.fluid.core_avx.OpBase, arg1: Dict[unicode, handle], arg2: Dict[unicode, handle], arg3: Dict[unicode, Variant], arg4: paddle::platform::CPUPlace, arg5: bool) -> None 2. trace(self: paddle.fluid.core_avx.Tracer, arg0: paddle.fluid.core_avx.OpBase, arg1: Dict[unicode, handle], arg2: Dict[unicode, handle], arg3: Dict[unicode, Variant], arg4: paddle::platform::CUDAPlace, arg5: bool) -> None +paddle.fluid.dygraph.Tracer.trace_op (ArgSpec(args=['self', 'op', 'inputs', 'outputs', 'stop_gradient'], varargs=None, keywords=None, defaults=(False,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Tracer.trace_var (ArgSpec(args=['self', 'name', 'var'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Tracer.train_mode (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.prepare_context (ArgSpec(args=['strategy'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.save_persistables (ArgSpec(args=['model_dict', 'dirname', 'optimizers'], varargs=None, keywords=None, defaults=('save_dir', None)), ('document', '7f526f879139a14cda8e0b5a9171f264')) +paddle.fluid.dygraph.load_persistables (ArgSpec(args=['dirname'], varargs=None, keywords=None, defaults=('save_dir',)), ('document', '2574d50a7a9f89fb0d74ddf73d8128f0')) +paddle.fluid.dygraph.NoamDecay ('paddle.fluid.dygraph.learning_rate_scheduler.NoamDecay', ('document', 'e45b81ab71653cb8ad7384671e6238e4')) +paddle.fluid.dygraph.NoamDecay.__init__ (ArgSpec(args=['self', 'd_model', 'warmup_steps', 'begin', 'step', 'dtype'], varargs=None, keywords=None, defaults=(1, 1, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.NoamDecay.create_lr_var (ArgSpec(args=['self', 'lr'], varargs=None, keywords=None, defaults=None), ('document', '013bc233558149d0757b3df57845b866')) +paddle.fluid.dygraph.NoamDecay.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.PiecewiseDecay ('paddle.fluid.dygraph.learning_rate_scheduler.PiecewiseDecay', ('document', '139b30620ffd26ed3f4da24b954a4022')) +paddle.fluid.dygraph.PiecewiseDecay.__init__ (ArgSpec(args=['self', 'boundaries', 'values', 'begin', 'step', 'dtype'], varargs=None, keywords=None, defaults=(1, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.PiecewiseDecay.create_lr_var (ArgSpec(args=['self', 'lr'], varargs=None, keywords=None, defaults=None), ('document', '013bc233558149d0757b3df57845b866')) +paddle.fluid.dygraph.PiecewiseDecay.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.NaturalExpDecay ('paddle.fluid.dygraph.learning_rate_scheduler.NaturalExpDecay', ('document', 'ed584947bab492fb5263d1474dcab709')) +paddle.fluid.dygraph.NaturalExpDecay.__init__ (ArgSpec(args=['self', 'learning_rate', 'decay_steps', 'decay_rate', 'staircase', 'begin', 'step', 'dtype'], varargs=None, keywords=None, defaults=(False, 0, 1, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.NaturalExpDecay.create_lr_var (ArgSpec(args=['self', 'lr'], varargs=None, keywords=None, defaults=None), ('document', '013bc233558149d0757b3df57845b866')) +paddle.fluid.dygraph.NaturalExpDecay.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.ExponentialDecay ('paddle.fluid.dygraph.learning_rate_scheduler.ExponentialDecay', ('document', '2d620b5c4ae70cf64c6d710647ef48c6')) +paddle.fluid.dygraph.ExponentialDecay.__init__ (ArgSpec(args=['self', 'learning_rate', 'decay_steps', 'decay_rate', 'staircase', 'begin', 'step', 'dtype'], varargs=None, keywords=None, defaults=(False, 0, 1, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.ExponentialDecay.create_lr_var (ArgSpec(args=['self', 'lr'], varargs=None, keywords=None, defaults=None), ('document', '013bc233558149d0757b3df57845b866')) +paddle.fluid.dygraph.ExponentialDecay.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.InverseTimeDecay ('paddle.fluid.dygraph.learning_rate_scheduler.InverseTimeDecay', ('document', '599c7c42b0a27b83acfd648c705ac622')) +paddle.fluid.dygraph.InverseTimeDecay.__init__ (ArgSpec(args=['self', 'learning_rate', 'decay_steps', 'decay_rate', 'staircase', 'begin', 'step', 'dtype'], varargs=None, keywords=None, defaults=(False, 0, 1, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.InverseTimeDecay.create_lr_var (ArgSpec(args=['self', 'lr'], varargs=None, keywords=None, defaults=None), ('document', '013bc233558149d0757b3df57845b866')) +paddle.fluid.dygraph.InverseTimeDecay.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.PolynomialDecay ('paddle.fluid.dygraph.learning_rate_scheduler.PolynomialDecay', ('document', '19080eb899a7102ce33b43c17b5e8043')) +paddle.fluid.dygraph.PolynomialDecay.__init__ (ArgSpec(args=['self', 'learning_rate', 'decay_steps', 'end_learning_rate', 'power', 'cycle', 'begin', 'step', 'dtype'], varargs=None, keywords=None, defaults=(0.0001, 1.0, False, 0, 1, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.PolynomialDecay.create_lr_var (ArgSpec(args=['self', 'lr'], varargs=None, keywords=None, defaults=None), ('document', '013bc233558149d0757b3df57845b866')) +paddle.fluid.dygraph.PolynomialDecay.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.CosineDecay ('paddle.fluid.dygraph.learning_rate_scheduler.CosineDecay', ('document', 'd21fe863218f9bcc4a7216c628cc041f')) +paddle.fluid.dygraph.CosineDecay.__init__ (ArgSpec(args=['self', 'learning_rate', 'step_each_epoch', 'epochs', 'begin', 'step', 'dtype'], varargs=None, keywords=None, defaults=(0, 1, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.CosineDecay.create_lr_var (ArgSpec(args=['self', 'lr'], varargs=None, keywords=None, defaults=None), ('document', '013bc233558149d0757b3df57845b866')) +paddle.fluid.dygraph.CosineDecay.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.BackwardStrategy ('paddle.fluid.core_avx.BackwardStrategy', ('document', '5d9496052ec793810c9f12ffad5c73ce')) +paddle.fluid.dygraph.BackwardStrategy.__init__ __init__(self: paddle.fluid.core_avx.BackwardStrategy) -> None +paddle.fluid.transpiler.DistributeTranspiler ('paddle.fluid.transpiler.distribute_transpiler.DistributeTranspiler', ('document', 'b2b19821c5dffcd11473d6a4eef089af')) +paddle.fluid.transpiler.DistributeTranspiler.__init__ (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', 'b1951949c6d21698290aa8ac69afee32')) +paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs (ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None), ('document', 'c89fc350f975ef827f5448d68af388cf')) +paddle.fluid.transpiler.DistributeTranspiler.get_startup_program (ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)), ('document', '90a40b80e0106f69262cc08b861c3e39')) +paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program (ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)), ('document', '0e47f020304e2b824e87ff03475c17cd')) +paddle.fluid.transpiler.DistributeTranspiler.transpile (ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174')), ('document', '418c7e8b268e9be4104f2809e654c2f7')) +paddle.fluid.transpiler.memory_optimize (ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, True)), ('document', '2348247f684bfd5bb9466470f35be064')) +paddle.fluid.transpiler.release_memory (ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd38c5b8b2b2e0bb19bcf1b581a80a7e4')) +paddle.fluid.transpiler.HashName ('paddle.fluid.transpiler.ps_dispatcher.HashName', ('document', '8190ddc66ee412441f5d97fd3f702bdd')) +paddle.fluid.transpiler.HashName.__init__ (ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.transpiler.HashName.dispatch (ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.transpiler.HashName.reset (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.transpiler.RoundRobin ('paddle.fluid.transpiler.ps_dispatcher.RoundRobin', ('document', 'c124359054923c614758c3fbbf666290')) +paddle.fluid.transpiler.RoundRobin.__init__ (ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.transpiler.RoundRobin.dispatch (ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.transpiler.RoundRobin.reset (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.transpiler.DistributeTranspilerConfig ('paddle.fluid.transpiler.distribute_transpiler.DistributeTranspilerConfig', ('document', '550b8c767a8ae1a2eb74b18924ddc975')) +paddle.fluid.transpiler.DistributeTranspilerConfig.__init__ +paddle.fluid.nets.simple_img_conv_pool (ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True)), ('document', '13f01ff80e8dfbd3427d90cf49bc62eb')) +paddle.fluid.nets.sequence_conv_pool (ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type', 'bias_attr'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max', None)), ('document', 'd6a1e527b53f5cc15594fee307dfc5cf')) +paddle.fluid.nets.glu (ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,)), ('document', 'b87bacfc70dd3477ed25ef14aa01389a')) +paddle.fluid.nets.scaled_dot_product_attention (ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0)), ('document', 'b1a07a0000eb9103e3a143ca8c13de5b')) +paddle.fluid.nets.img_conv_group (ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True)), ('document', '4913d846264f17112bf7bc04273388cc')) +paddle.fluid.optimizer.SGDOptimizer ('paddle.fluid.optimizer.SGDOptimizer', ('document', 'c3c8dd3193d991adf8bda505560371d6')) +paddle.fluid.optimizer.SGDOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.SGDOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610')) +paddle.fluid.optimizer.SGDOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) +paddle.fluid.optimizer.SGDOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.SGDOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.SGDOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) +paddle.fluid.optimizer.SGDOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) +paddle.fluid.optimizer.MomentumOptimizer ('paddle.fluid.optimizer.MomentumOptimizer', ('document', 'a72bd02e5459e64596897d190413d449')) +paddle.fluid.optimizer.MomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.MomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610')) +paddle.fluid.optimizer.MomentumOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) +paddle.fluid.optimizer.MomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.MomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.MomentumOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) +paddle.fluid.optimizer.MomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) +paddle.fluid.optimizer.AdagradOptimizer ('paddle.fluid.optimizer.AdagradOptimizer', ('document', 'a1d4f0682cde43ad34432b1338aadf04')) +paddle.fluid.optimizer.AdagradOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name', 'initial_accumulator_value'], varargs=None, keywords=None, defaults=(1e-06, None, None, 0.0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.AdagradOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610')) +paddle.fluid.optimizer.AdagradOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) +paddle.fluid.optimizer.AdagradOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.AdagradOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.AdagradOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) +paddle.fluid.optimizer.AdagradOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) +paddle.fluid.optimizer.AdamOptimizer ('paddle.fluid.optimizer.AdamOptimizer', ('document', '6fe871b955cab6e267422d5af666dafa')) +paddle.fluid.optimizer.AdamOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.AdamOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610')) +paddle.fluid.optimizer.AdamOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) +paddle.fluid.optimizer.AdamOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.AdamOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.AdamOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) +paddle.fluid.optimizer.AdamOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) +paddle.fluid.optimizer.AdamaxOptimizer ('paddle.fluid.optimizer.AdamaxOptimizer', ('document', '883fc4541214e8343d3a89711936e15d')) +paddle.fluid.optimizer.AdamaxOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.AdamaxOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610')) +paddle.fluid.optimizer.AdamaxOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) +paddle.fluid.optimizer.AdamaxOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.AdamaxOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.AdamaxOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) +paddle.fluid.optimizer.AdamaxOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) +paddle.fluid.optimizer.DecayedAdagradOptimizer ('paddle.fluid.optimizer.DecayedAdagradOptimizer', ('document', 'e76838a8586bf2e58e6b5cdd2f67f780')) +paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.DecayedAdagradOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610')) +paddle.fluid.optimizer.DecayedAdagradOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) +paddle.fluid.optimizer.DecayedAdagradOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.DecayedAdagradOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.DecayedAdagradOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) +paddle.fluid.optimizer.DecayedAdagradOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) +paddle.fluid.optimizer.FtrlOptimizer ('paddle.fluid.optimizer.FtrlOptimizer', ('document', 'cba8aae0a267b9a4d8833ae79a00fc55')) +paddle.fluid.optimizer.FtrlOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'l1', 'l2', 'lr_power', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.0, 0.0, -0.5, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.FtrlOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610')) +paddle.fluid.optimizer.FtrlOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) +paddle.fluid.optimizer.FtrlOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.FtrlOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.FtrlOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) +paddle.fluid.optimizer.FtrlOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) +paddle.fluid.optimizer.RMSPropOptimizer ('paddle.fluid.optimizer.RMSPropOptimizer', ('document', '5217bc4fc399010021d6b70541005780')) +paddle.fluid.optimizer.RMSPropOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'rho', 'epsilon', 'momentum', 'centered', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, 0.0, False, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.RMSPropOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610')) +paddle.fluid.optimizer.RMSPropOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) +paddle.fluid.optimizer.RMSPropOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.RMSPropOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.RMSPropOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) +paddle.fluid.optimizer.RMSPropOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) +paddle.fluid.optimizer.AdadeltaOptimizer ('paddle.fluid.optimizer.AdadeltaOptimizer', ('document', 'b5e33fa8aca6cfbcaebfc6cd7742908a')) +paddle.fluid.optimizer.AdadeltaOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, 0.95, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.AdadeltaOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610')) +paddle.fluid.optimizer.AdadeltaOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) +paddle.fluid.optimizer.AdadeltaOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.AdadeltaOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.AdadeltaOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) +paddle.fluid.optimizer.AdadeltaOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) +paddle.fluid.optimizer.ModelAverage ('paddle.fluid.optimizer.ModelAverage', ('document', '0a0adcd60230630e21fe1ef46362dbc0')) +paddle.fluid.optimizer.ModelAverage.__init__ (ArgSpec(args=['self', 'average_window_rate', 'min_average_window', 'max_average_window', 'regularization', 'name'], varargs=None, keywords=None, defaults=(10000, 10000, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.ModelAverage.apply (ArgSpec(args=['self', 'executor', 'need_restore'], varargs=None, keywords=None, defaults=(True,)), ('document', '648010d0ac1fa707dac0b89f74b0e35c')) +paddle.fluid.optimizer.ModelAverage.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610')) +paddle.fluid.optimizer.ModelAverage.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) +paddle.fluid.optimizer.ModelAverage.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.ModelAverage.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.ModelAverage.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) +paddle.fluid.optimizer.ModelAverage.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) +paddle.fluid.optimizer.ModelAverage.restore (ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None), ('document', '5f14ea4adda2791e1c3b37ff327f6a83')) +paddle.fluid.optimizer.LarsMomentumOptimizer ('paddle.fluid.optimizer.LarsMomentumOptimizer', ('document', '030b9092a96a409b1bf5446bf45d0659')) +paddle.fluid.optimizer.LarsMomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'lars_coeff', 'lars_weight_decay', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.0005, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.LarsMomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610')) +paddle.fluid.optimizer.LarsMomentumOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) +paddle.fluid.optimizer.LarsMomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.LarsMomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.LarsMomentumOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) +paddle.fluid.optimizer.LarsMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) +paddle.fluid.optimizer.DGCMomentumOptimizer ('paddle.fluid.optimizer.DGCMomentumOptimizer', ('document', 'c0384e036f5c78c569f0e2b266812c0f')) +paddle.fluid.optimizer.DGCMomentumOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'momentum', 'rampup_begin_step', 'rampup_step', 'sparsity', 'use_nesterov', 'local_grad_clip_norm', 'num_trainers', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1, [0.999], False, None, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.DGCMomentumOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610')) +paddle.fluid.optimizer.DGCMomentumOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) +paddle.fluid.optimizer.DGCMomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.DGCMomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.DGCMomentumOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) +paddle.fluid.optimizer.DGCMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) +paddle.fluid.optimizer.LambOptimizer ('paddle.fluid.optimizer.LambOptimizer', ('document', '7dd8b270156a52f1f6b4663336960893')) +paddle.fluid.optimizer.LambOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'lamb_weight_decay', 'beta1', 'beta2', 'epsilon', 'regularization', 'exclude_from_weight_decay_fn', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.01, 0.9, 0.999, 1e-06, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.LambOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610')) +paddle.fluid.optimizer.LambOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) +paddle.fluid.optimizer.LambOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.LambOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.LambOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) +paddle.fluid.optimizer.LambOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) +paddle.fluid.optimizer.ExponentialMovingAverage ('paddle.fluid.optimizer.ExponentialMovingAverage', ('document', 'a38b7d5b9f17a295ed15d4c1b9ab4cd0')) +paddle.fluid.optimizer.ExponentialMovingAverage.__init__ (ArgSpec(args=['self', 'decay', 'thres_steps', 'name'], varargs=None, keywords=None, defaults=(0.999, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.ExponentialMovingAverage.apply (ArgSpec(args=['self', 'executor', 'need_restore'], varargs=None, keywords=None, defaults=(True,)), ('document', '30f494752ac8921dc5835a63637f453a')) +paddle.fluid.optimizer.ExponentialMovingAverage.restore (ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None), ('document', '8c8a1791608b02a1ede53d6dd3a4fcec')) +paddle.fluid.optimizer.ExponentialMovingAverage.update (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'ea10f08af6d7aac3b7974aa976e4085f')) +paddle.fluid.optimizer.PipelineOptimizer ('paddle.fluid.optimizer.PipelineOptimizer', ('document', '6f85382abedb922387b08d98e8d0b69c')) +paddle.fluid.optimizer.PipelineOptimizer.__init__ (ArgSpec(args=['self', 'optimizer', 'cut_list', 'place_list', 'concurrency_list', 'queue_size', 'sync_steps', 'start_cpu_core_id'], varargs=None, keywords=None, defaults=(None, None, None, 30, 1, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.PipelineOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.backward.append_backward (ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '1b7b2bfb986e93048e75ba69f2f490ab')) +paddle.fluid.backward.gradients (ArgSpec(args=['targets', 'inputs', 'target_gradients', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'e2097e1e0ed84ae44951437bfe269a1b')) +paddle.fluid.regularizer.L1DecayRegularizer ('paddle.fluid.regularizer.L1DecayRegularizer', ('document', '34603757e70974d2fcc730643b382925')) +paddle.fluid.regularizer.L1DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.regularizer.L2DecayRegularizer ('paddle.fluid.regularizer.L2DecayRegularizer', ('document', 'b94371c3434d7f695bc5b2d6fb5531fd')) +paddle.fluid.regularizer.L2DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.LoDTensor ('paddle.fluid.core_avx.LoDTensor', ('document', '25e8432ed1b9a375868bc8911359aa0d')) +paddle.fluid.LoDTensor.__init__ 1. __init__(self: paddle.fluid.core_avx.LoDTensor, arg0: List[List[int]]) -> None 2. __init__(self: paddle.fluid.core_avx.LoDTensor) -> None +paddle.fluid.LoDTensor.has_valid_recursive_sequence_lengths has_valid_recursive_sequence_lengths(self: paddle.fluid.core_avx.LoDTensor) -> bool +paddle.fluid.LoDTensor.lod lod(self: paddle.fluid.core_avx.LoDTensor) -> List[List[int]] +paddle.fluid.LoDTensor.recursive_sequence_lengths recursive_sequence_lengths(self: paddle.fluid.core_avx.LoDTensor) -> List[List[int]] +paddle.fluid.LoDTensor.set_lod set_lod(self: paddle.fluid.core_avx.LoDTensor, lod: List[List[int]]) -> None +paddle.fluid.LoDTensor.set_recursive_sequence_lengths set_recursive_sequence_lengths(self: paddle.fluid.core_avx.LoDTensor, recursive_sequence_lengths: List[List[int]]) -> None +paddle.fluid.LoDTensor.shape shape(self: paddle.fluid.core_avx.Tensor) -> List[int] +paddle.fluid.LoDTensorArray ('paddle.fluid.core_avx.LoDTensorArray', ('document', 'e9895b67ba54438b9c0f7053e18966f5')) +paddle.fluid.LoDTensorArray.__init__ __init__(self: paddle.fluid.core_avx.LoDTensorArray) -> None +paddle.fluid.LoDTensorArray.append append(self: paddle.fluid.core_avx.LoDTensorArray, tensor: paddle.fluid.core_avx.LoDTensor) -> None +paddle.fluid.CPUPlace ('paddle.fluid.core_avx.CPUPlace', ('document', '6014005ef2649045b77d502aeb6cd7f9')) +paddle.fluid.CPUPlace.__init__ __init__(self: paddle.fluid.core_avx.CPUPlace) -> None +paddle.fluid.CUDAPlace ('paddle.fluid.core_avx.CUDAPlace', ('document', '6a6cd8ed607beb951692c4b066d08c94')) +paddle.fluid.CUDAPlace.__init__ __init__(self: paddle.fluid.core_avx.CUDAPlace, arg0: int) -> None +paddle.fluid.CUDAPinnedPlace ('paddle.fluid.core_avx.CUDAPinnedPlace', ('document', 'afd58ea5d390b5ea06ca70291a266d45')) +paddle.fluid.CUDAPinnedPlace.__init__ __init__(self: paddle.fluid.core_avx.CUDAPinnedPlace) -> None +paddle.fluid.ParamAttr ('paddle.fluid.param_attr.ParamAttr', ('document', 'fa47fa251f727c4a4f638d61e3c7c141')) +paddle.fluid.ParamAttr.__init__ (ArgSpec(args=['self', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, 1.0, None, True, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.WeightNormParamAttr ('paddle.fluid.param_attr.WeightNormParamAttr', ('document', '48ab4f49c7eeeade5958b731b6a96aa0')) +paddle.fluid.WeightNormParamAttr.__init__ (ArgSpec(args=['self', 'dim', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, None, 1.0, None, True, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.DataFeeder ('paddle.fluid.data_feeder.DataFeeder', ('document', 'a39802654f20692ad49c340cef7c6556')) +paddle.fluid.DataFeeder.__init__ (ArgSpec(args=['self', 'feed_list', 'place', 'program'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.DataFeeder.decorate_reader (ArgSpec(args=['self', 'reader', 'multi_devices', 'num_places', 'drop_last'], varargs=None, keywords=None, defaults=(None, True)), ('document', '449ec75d35b3498091908714e35e6686')) +paddle.fluid.DataFeeder.feed (ArgSpec(args=['self', 'iterable'], varargs=None, keywords=None, defaults=None), ('document', 'ce65fe1d81dcd7067d5092a5667f35cc')) +paddle.fluid.DataFeeder.feed_parallel (ArgSpec(args=['self', 'iterable', 'num_places'], varargs=None, keywords=None, defaults=(None,)), ('document', '334c6af750941a4397a2dd2ea8a4d76f')) +paddle.fluid.clip.ErrorClipByValue ('paddle.fluid.clip.ErrorClipByValue', ('document', 'e6f815a03be88dee2537707d9e6b9209')) +paddle.fluid.clip.ErrorClipByValue.__init__ (ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.clip.GradientClipByValue ('paddle.fluid.clip.GradientClipByValue', ('document', 'b7a22f687269cae0c338ef3866322db7')) +paddle.fluid.clip.GradientClipByValue.__init__ (ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.clip.GradientClipByNorm ('paddle.fluid.clip.GradientClipByNorm', ('document', 'a5c23d96a3d8c8c1183e9469a5d0d52e')) +paddle.fluid.clip.GradientClipByNorm.__init__ (ArgSpec(args=['self', 'clip_norm'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.clip.GradientClipByGlobalNorm ('paddle.fluid.clip.GradientClipByGlobalNorm', ('document', 'ef50acbe212101121d4b82f693ec1733')) +paddle.fluid.clip.GradientClipByGlobalNorm.__init__ (ArgSpec(args=['self', 'clip_norm', 'group_name'], varargs=None, keywords=None, defaults=('default_group',)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph_grad_clip.GradClipByValue ('paddle.fluid.dygraph_grad_clip.GradClipByValue', ('document', '6971a42222de0387a7ee9c59671dd2e3')) +paddle.fluid.dygraph_grad_clip.GradClipByValue.__init__ (ArgSpec(args=['self', 'min_value', 'max_value'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph_grad_clip.GradClipByNorm ('paddle.fluid.dygraph_grad_clip.GradClipByNorm', ('document', '2039274ea09987ba48eded67999dc280')) +paddle.fluid.dygraph_grad_clip.GradClipByNorm.__init__ (ArgSpec(args=['self', 'clip_norm'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph_grad_clip.GradClipByGlobalNorm ('paddle.fluid.dygraph_grad_clip.GradClipByGlobalNorm', ('document', 'd1872377e7d7a5fe0dd2e8c42e4c9656')) +paddle.fluid.dygraph_grad_clip.GradClipByGlobalNorm.__init__ (ArgSpec(args=['self', 'max_global_norm'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.profiler.cuda_profiler (ArgSpec(args=['output_file', 'output_mode', 'config'], varargs=None, keywords=None, defaults=(None, None)), ('document', '49f5db5da13cfd8c069754dd11be3901')) +paddle.fluid.profiler.reset_profiler (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'fd1f25a7a06516ca9a1f4ab0783a4d70')) +paddle.fluid.profiler.profiler (ArgSpec(args=['state', 'sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')), ('document', 'a2be24e028dffa06ab28cc55a27c59e4')) +paddle.fluid.profiler.start_profiler (ArgSpec(args=['state'], varargs=None, keywords=None, defaults=None), ('document', '4c192ea399e6e80b1ab47a8265b022a5')) +paddle.fluid.profiler.stop_profiler (ArgSpec(args=['sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')), ('document', 'bc8628b859b04242200e48a458c971c4')) +paddle.fluid.unique_name.generate (ArgSpec(args=['key'], varargs=None, keywords=None, defaults=None), ('document', '4d68cde4c4df8f1b8018620b4dc19b42')) +paddle.fluid.unique_name.switch (ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)), ('document', '695a6e91afbcdbafac69a069038811be')) +paddle.fluid.unique_name.guard (ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ead717d6d440a1eb11971695cd1727f4')) +paddle.fluid.recordio_writer.convert_reader_to_recordio_file (ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)), ('document', '65c7523e86f0c50bb729b01667f36310')) +paddle.fluid.recordio_writer.convert_reader_to_recordio_files (ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)), ('document', 'bc643f0f5f1b9db57ff0d8a57d379bd7')) +paddle.fluid.Scope Scope() -> paddle.fluid.core_avx._Scope +paddle.fluid.install_check.run_check (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '66b7c84a17ed32fec2df9628367be2b9')) +paddle.reader.cache (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', '1676886070eb607cb608f7ba47be0d3c')) +paddle.reader.map_readers (ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None), ('document', '77cbadb09df588e21e5cc0819b69c87d')) +paddle.reader.buffered (ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None), ('document', '0d6186f109feceb99f60ec50a0a624cb')) +paddle.reader.compose (ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None), ('document', '884291104e1c3f37f33aae44b7deeb0d')) +paddle.reader.chain (ArgSpec(args=[], varargs='readers', keywords=None, defaults=None), ('document', 'd22c34e379a53901ae67a6bca7f4def4')) +paddle.reader.shuffle (ArgSpec(args=['reader', 'buf_size'], varargs=None, keywords=None, defaults=None), ('document', 'e42ea6fee23ce26b23cb142cd1d6522d')) +paddle.reader.firstn (ArgSpec(args=['reader', 'n'], varargs=None, keywords=None, defaults=None), ('document', 'c5bb8f7dd4f917f1569a368aab5b8aad')) +paddle.reader.xmap_readers (ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,)), ('document', '9c804a42f8a4dbaa76b3c98e0ab7f796')) +paddle.reader.PipeReader ('paddle.reader.decorator.PipeReader', ('document', 'd3c250618f98c1a5fb646f869016a98e')) +paddle.reader.PipeReader.__init__ (ArgSpec(args=['self', 'command', 'bufsize', 'file_type'], varargs=None, keywords=None, defaults=(8192, 'plain')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.reader.PipeReader.get_line (ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n')), ('document', '9621ae612e595b6c34eb3bb5f3eb1a45')) +paddle.reader.multiprocess_reader (ArgSpec(args=['readers', 'use_pipe', 'queue_size'], varargs=None, keywords=None, defaults=(True, 1000)), ('document', '7d8b3a96e592107c893d5d51ce968ba0')) +paddle.reader.Fake ('paddle.reader.decorator.Fake', ('document', '0d8f4847b99bed6d456ade0d903202e1')) +paddle.reader.Fake.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.reader.creator.np_array (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '28d457fbc9a71efa4ac91a3be179cada')) +paddle.reader.creator.text_file (ArgSpec(args=['path'], varargs=None, keywords=None, defaults=None), ('document', 'f45fcb7add066c8e042c6774fc7c3db2')) +paddle.reader.creator.recordio (ArgSpec(args=['paths', 'buf_size'], varargs=None, keywords=None, defaults=(100,)), ('document', 'b4a94ee0e2cefb495619275c2f8c61d2')) diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt new file mode 100644 index 00000000..595454e9 --- /dev/null +++ b/paddle/fluid/CMakeLists.txt @@ -0,0 +1,12 @@ +add_subdirectory(memory) +add_subdirectory(platform) +add_subdirectory(framework) +add_subdirectory(imperative) +add_subdirectory(operators) +add_subdirectory(string) +add_subdirectory(recordio) +add_subdirectory(pybind) + +# NOTE: please add subdirectory inference at last. +add_subdirectory(inference) +add_subdirectory(train) diff --git a/paddle/fluid/framework/.gitignore b/paddle/fluid/framework/.gitignore new file mode 100644 index 00000000..5132131e --- /dev/null +++ b/paddle/fluid/framework/.gitignore @@ -0,0 +1,2 @@ +.tensor_util.cu +.data_type_transform.cu \ No newline at end of file diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt new file mode 100644 index 00000000..5dc6e74b --- /dev/null +++ b/paddle/fluid/framework/CMakeLists.txt @@ -0,0 +1,244 @@ + +#windows treat symbolic file as a real file, which is different with unix +#We create a hidden file and compile it instead of origin source file. +function(windows_symbolic TARGET) + set(oneValueArgs "") + set(multiValueArgs SRCS PATH) + cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH}) + foreach(src ${windows_symbolic_SRCS}) + get_filename_component(src ${src} NAME_WE) + if (NOT EXISTS ${final_path}/${src}.cc OR NOT EXISTS ${final_path}/${src}.cu) + message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.") + endif() + + file(GENERATE OUTPUT ${final_path}/.${src}.cu INPUT ${final_path}/${src}.cc) + + add_custom_command(OUTPUT ${final_path}/.${src}.cu + COMMAND ${CMAKE_COMMAND} -E copy_if_different "${final_path}/${src}.cc" "${final_path}/.${src}.cu" + COMMENT "create hidden file of ${src}.cu") + add_custom_target(${TARGET} ALL DEPENDS .${src}.cu) + endforeach() +endfunction() + +add_subdirectory(ir) +add_subdirectory(details) +add_subdirectory(fleet) +add_subdirectory(io) +#ddim lib +proto_library(framework_proto SRCS framework.proto) +proto_library(data_feed_proto SRCS data_feed.proto) +proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto + data_feed_proto) + +cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce) +cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) +nv_test(dim_test SRCS dim_test.cu DEPS ddim) +cc_test(unroll_array_ops_test SRCS unroll_array_ops_test.cc) +cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context) +cc_test(data_type_test SRCS data_type_test.cc DEPS data_type place tensor) +if(WITH_GPU) + if (WIN32) + windows_symbolic(tensor_util SRCS tensor_util.cu) + nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context) + add_dependencies(tensor tensor_util) + else() + nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler) + endif(WIN32) +else() + cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context profiler) +endif() + +cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) +if(WITH_GPU) + nv_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor) +else() + cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor) +endif() + +cc_test(eigen_test SRCS eigen_test.cc DEPS tensor) + +if(WITH_GPU) + nv_test(mixed_vector_test SRCS mixed_vector_test.cc mixed_vector_test.cu DEPS place memory device_context tensor) +else() + cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor) +endif() +cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version) + +cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory) +nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor) + +cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory gflags glog) + +cc_library(reader SRCS reader.cc DEPS lod_tensor ddim) +cc_test(reader_test SRCS reader_test.cc DEPS reader) + +cc_library(threadpool SRCS threadpool.cc DEPS enforce) +cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool) + +cc_library(var_type_traits SRCS var_type_traits DEPS lod_tensor selected_rows framework_proto) +if (WITH_GPU) + target_link_libraries(var_type_traits dynload_cuda) +endif() +cc_test(var_type_traits_test SRCS var_type_traits_test.cc DEPS var_type_traits) + +cc_library(scope SRCS scope.cc DEPS glog threadpool xxhash var_type_traits) +cc_library(scope_pool SRCS scope_pool.cc DEPS scope) +cc_test(scope_test SRCS scope_test.cc DEPS scope) +cc_test(variable_test SRCS variable_test.cc DEPS tensor var_type_traits) + +cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor) +nv_test(data_device_transform_test SRCS data_device_transform_test.cu + DEPS operator op_registry device_context math_function scope) + +if(WITH_GPU) + if (WIN32) +#windows treat symbolic file as a real file, which is different with unix +#We create a hidden file and compile it instead of origin source file. + windows_symbolic(hidden_file SRCS data_type_transform.cu) + nv_library(data_type_transform SRCS .data_type_transform.cu DEPS tensor) + add_dependencies(data_type_transform hidden_file) + else() + nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor) + endif(WIN32) + nv_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform) +else() + cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor) + cc_test(data_type_transform_test SRCS data_type_transform_test.cc DEPS data_type_transform) +endif() + +cc_library(data_layout_transform SRCS data_layout_transform.cc DEPS tensor math_function) +cc_test(data_layout_transform_test SRCS data_layout_transform_test.cc DEPS data_layout_transform) + +cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor + framework_proto selected_rows data_device_transform data_type_transform data_layout_transform) + +cc_library(attribute SRCS attribute.cc DEPS framework_proto boost) +cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc +device_context) +cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute glog) +cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker) +cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto) +cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context) + +cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto device_context) +cc_library(op_kernel_type SRCS op_kernel_type.cc DEPS device_context place) +cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog + shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type) + +cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context) + +cc_library(version SRCS version.cc) +cc_test(version_test SRCS version_test.cc DEPS version) + +cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version) + +cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc memory_optimize_helper) + +nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) + +py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto) +py_proto_compile(trainer_py_proto SRCS trainer_desc.proto data_feed.proto) +#Generate an empty \ + #__init__.py to make framework_py_proto as a valid python module. +add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) +add_dependencies(framework_py_proto framework_py_proto_init) +if (NOT WIN32) + add_custom_command(TARGET framework_py_proto POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto + COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/ + COMMENT "Copy generated python proto into directory paddle/fluid/proto." + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) +else(NOT WIN32) + string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/") + add_custom_command(TARGET framework_py_proto POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto + COMMAND copy /Y *.py ${proto_dstpath} + COMMENT "Copy generated python proto into directory paddle/fluid/proto." + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) +endif(NOT WIN32) + +cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) + +cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog) +cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor) + +cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) + +if(WITH_NGRAPH) + set(NGRAPH_EXE_DEPS ngraph_engine) +else() + set(NGRAPH_EXE_DEPS) +endif() + +cc_library(executor_gc_helper SRCS executor_gc_helper.cc DEPS scope proto_desc operator garbage_collector) +if(WITH_DISTRIBUTE) + cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc + dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc + data_feed.cc device_worker.cc hogwild_worker.cc downpour_worker.cc + pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry + device_context scope framework_proto trainer_desc_proto glog fs shell fleet_wrapper lodtensor_printer + lod_rank_table feed_fetch_method sendrecvop_rpc collective_helper ${GLOB_DISTRIBUTE_DEPS} + graph_to_program_pass variable_helper data_feed_proto ${NGRAPH_EXE_DEPS} timer) +set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") +set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +else() + cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc + dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc + data_feed.cc device_worker.cc hogwild_worker.cc downpour_worker.cc + pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry + device_context scope framework_proto data_feed_proto trainer_desc_proto glog + lod_rank_table fs shell fleet_wrapper lodtensor_printer feed_fetch_method + graph_to_program_pass variable_helper ${NGRAPH_EXE_DEPS} timer) + cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) +endif() + +target_link_libraries(executor while_op_helper executor_gc_helper recurrent_op_helper) + +cc_library(parallel_executor SRCS parallel_executor.cc DEPS + threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor async_ssa_graph_executor + graph build_strategy + fast_threaded_ssa_graph_executor variable_helper) + +cc_library(prune SRCS prune.cc DEPS framework_proto) +cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) +cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry + proto_desc) +cc_test(inplace_op_inference_test SRCS inplace_op_inference_test.cc DEPS inplace_op_pass op_registry proto_desc op_info memory_optimize_helper pass_builder) +cc_library(selected_rows SRCS selected_rows.cc DEPS tensor) +cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) + +cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto op_kernel_type) +cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc) + +cc_test(tuple_test SRCS tuple_test.cc ) + +cc_test(inlined_vector_test SRCS inlined_vector_test.cc) + +if (NOT WIN32) +cc_test(rw_lock_test SRCS rw_lock_test.cc) +endif (NOT WIN32) + +cc_library(dlpack_tensor SRCS dlpack_tensor.cc DEPS tensor dlpack) +cc_test(dlpack_tensor_test SRCS dlpack_tensor_test.cc DEPS dlpack_tensor glog) + +# Get the current working branch +execute_process( + COMMAND git rev-parse --abbrev-ref HEAD + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE PADDLE_BRANCH + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + +# Get the latest abbreviated commit hash of the working branch +execute_process( + COMMAND git log -1 --format=%h + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE PADDLE_COMMIT + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + +message(STATUS "commit: ${PADDLE_COMMIT}") +message(STATUS "branch: ${PADDLE_BRANCH}") + +configure_file(commit.h.in commit.h) diff --git a/paddle/fluid/framework/archive.h b/paddle/fluid/framework/archive.h new file mode 100644 index 00000000..100eb951 --- /dev/null +++ b/paddle/fluid/framework/archive.h @@ -0,0 +1,609 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined _WIN32 || defined __APPLE__ +#else +#define _LINUX +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/expect.h" + +namespace paddle { +namespace framework { + +// not a virtual class +class ArchiveBase { + protected: + ArchiveBase() {} + + // Archive is not copyable. But to allow move capture by function objects, + // check it at runtime rather than at compile time. + ArchiveBase(const ArchiveBase&) { LOG(FATAL) << "Not supported"; } + + ArchiveBase(ArchiveBase&& other) + : buffer_(other.buffer_), + cursor_(other.cursor_), + finish_(other.finish_), + limit_(other.limit_), + deleter_(std::move(other.deleter_)) { + other.buffer_ = NULL; + other.cursor_ = NULL; + other.finish_ = NULL; + other.limit_ = NULL; + other.deleter_ = nullptr; + } + + ~ArchiveBase() { FreeBuffer(); } + + public: + ArchiveBase& operator=(const ArchiveBase&) { + LOG(FATAL) << "Not supported"; + return *this; + } + + ArchiveBase& operator=(ArchiveBase&& other) { + if (this != &other) { + FreeBuffer(); + buffer_ = other.buffer_; + cursor_ = other.cursor_; + finish_ = other.finish_; + limit_ = other.limit_; + deleter_ = std::move(other.deleter_); + other.buffer_ = NULL; + other.cursor_ = NULL; + other.finish_ = NULL; + other.limit_ = NULL; + other.deleter_ = nullptr; + } + return *this; + } + + char* Buffer() { return buffer_; } + + void SetReadBuffer(char* buffer, size_t length, + std::function&& deleter) { + SetBuffer(buffer, length, length, std::move(deleter)); + } + + void SetWriteBuffer(char* buffer, size_t capacity, + std::function&& deleter) { + SetBuffer(buffer, 0, capacity, std::move(deleter)); + } + + void SetBuffer(char* buffer, size_t length, size_t capacity, + std::function&& deleter) { + CHECK(length <= capacity); + FreeBuffer(); + buffer_ = buffer; + cursor_ = buffer_; + finish_ = buffer + length; + limit_ = buffer + capacity; + deleter_ = std::move(deleter); + } + + char* Cursor() { return cursor_; } + + void SetCursor(char* cursor) { + CHECK(cursor >= buffer_ && cursor <= finish_); + cursor_ = cursor; + } + + void AdvanceCursor(size_t offset) { + CHECK(offset <= size_t(finish_ - cursor_)); + cursor_ += offset; + } + + char* Finish() { return finish_; } + + void SetFinish(char* finish) { + CHECK(finish >= cursor_ && finish <= limit_); + finish_ = finish; + } + + void AdvanceFinish(size_t offset) { + CHECK(offset <= size_t(limit_ - finish_)); + finish_ += offset; + } + + char* Limit() { return limit_; } + + size_t Position() { return cursor_ - buffer_; } + + size_t Length() { return finish_ - buffer_; } + + size_t Capacity() { return limit_ - buffer_; } + + bool Empty() { return finish_ == buffer_; } + + void Reset() { + FreeBuffer(); + buffer_ = NULL; + cursor_ = NULL; + finish_ = NULL; + limit_ = NULL; + } + + void Clear() { + cursor_ = buffer_; + finish_ = buffer_; + } + + char* Release() { + char* buf = buffer_; + buffer_ = NULL; + cursor_ = NULL; + finish_ = NULL; + deleter_ = nullptr; + return buf; + } + + void Resize(size_t newsize) { +#ifdef _LINUX + if (unlikely(newsize > Capacity())) { +#else + if (newsize > Capacity()) { +#endif + Reserve(std::max(Capacity() * 2, newsize)); + } + finish_ = buffer_ + newsize; + cursor_ = std::min(cursor_, finish_); + } + + void Reserve(size_t newcap) { + if (newcap > Capacity()) { + char* newbuf = NULL; + newbuf = new char[newcap]; + CHECK(newbuf != nullptr) << "Reserve failed, out of memory"; + if (Length() > 0) { + memcpy(newbuf, buffer_, Length()); + } + cursor_ = newbuf + (cursor_ - buffer_); + finish_ = newbuf + (finish_ - buffer_); + limit_ = newbuf + newcap; + FreeBuffer(); + buffer_ = newbuf; + deleter_ = std::default_delete(); + } + } + + void PrepareRead(size_t size) { +#ifdef _LINUX + if (unlikely(!(size <= size_t(finish_ - cursor_)))) { +#else + if (!(size <= size_t(finish_ - cursor_))) { +#endif + CHECK(size <= size_t(finish_ - cursor_)); + } + } + + void PrepareWrite(size_t size) { +#ifdef _LINUX + if (unlikely(size > size_t(limit_ - finish_))) { +#else + if (size > size_t(limit_ - finish_)) { +#endif + Reserve(std::max(Capacity() * 2, Length() + size)); + } + } + + void Read(void* data, size_t size) { + if (size > 0) { + PrepareRead(size); + memcpy(data, cursor_, size); + AdvanceCursor(size); + } + } + + void ReadBack(void* data, size_t size) { + if (size > 0) { + CHECK(size <= size_t(finish_ - cursor_)); + memcpy(data, finish_ - size, size); + finish_ -= size; + } + } + + void Write(const void* data, size_t size) { + if (size > 0) { + PrepareWrite(size); + memcpy(finish_, data, size); + AdvanceFinish(size); + } + } + + template + void GetRaw(T& x) { // NOLINT + PrepareRead(sizeof(T)); + memcpy(&x, cursor_, sizeof(T)); + AdvanceCursor(sizeof(T)); + } + + template + T GetRaw() { + T x; + GetRaw(x); + return x; + } + + template + void PutRaw(const T& x) { + PrepareWrite(sizeof(T)); + memcpy(finish_, &x, sizeof(T)); + AdvanceFinish(sizeof(T)); + } + + protected: + char* buffer_ = NULL; + char* cursor_ = NULL; + char* finish_ = NULL; + char* limit_ = NULL; + std::function deleter_ = nullptr; + + void FreeBuffer() { + if (deleter_) { + deleter_(buffer_); + } + deleter_ = nullptr; + } +}; // NOLINT + +template +class Archive {}; + +class BinaryArchiveType {}; + +typedef Archive BinaryArchive; + +template <> +class Archive : public ArchiveBase { + public: +#define ARCHIVE_REPEAT(T) \ + BinaryArchive& operator>>(T& x) { \ + GetRaw(x); \ + return *this; \ + } \ + BinaryArchive& operator<<(const T& x) { \ + PutRaw(x); \ + return *this; \ + } + + ARCHIVE_REPEAT(int16_t) + ARCHIVE_REPEAT(uint16_t) + ARCHIVE_REPEAT(int32_t) + ARCHIVE_REPEAT(uint32_t) + ARCHIVE_REPEAT(int64_t) + ARCHIVE_REPEAT(uint64_t) + ARCHIVE_REPEAT(float) + ARCHIVE_REPEAT(double) + ARCHIVE_REPEAT(signed char) + ARCHIVE_REPEAT(unsigned char) + ARCHIVE_REPEAT(bool) + +#undef ARCHIVE_REPEAT + + template + T Get() { + T x; + *this >> x; + return x; + } +}; + +template +Archive& operator<<(Archive& ar, const T (&p)[N]) { + for (size_t i = 0; i < N; i++) { + ar << p[i]; + } + return ar; +} + +template +Archive& operator>>(Archive& ar, T (&p)[N]) { + for (size_t i = 0; i < N; i++) { + ar >> p[i]; + } + return ar; +} + +template +Archive& operator<<(Archive& ar, const std::vector& p) { +#ifdef _LINUX + ar << (size_t)p.size(); +#else + ar << (uint64_t)p.size(); +#endif + for (const auto& x : p) { + ar << x; + } + return ar; +} + +template +Archive& operator>>(Archive& ar, std::vector& p) { +#ifdef _LINUX + p.resize(ar.template Get()); +#else + p.resize(ar.template Get()); +#endif + for (auto& x : p) { + ar >> x; + } + return ar; +} + +template +Archive& operator<<(Archive& ar, const std::valarray& p) { +#ifdef _LINUX + ar << (size_t)p.size(); +#else + ar << (uint64_t)p.size(); +#endif + for (const auto& x : p) { + ar << x; + } + return ar; +} + +template +Archive& operator>>(Archive& ar, std::valarray& p) { +#ifdef _LINUX + p.resize(ar.template Get()); +#else + p.resize(ar.template Get()); +#endif + for (auto& x : p) { + ar >> x; + } + return ar; +} + +inline BinaryArchive& operator<<(BinaryArchive& ar, const std::string& s) { +#ifdef _LINUX + ar << (size_t)s.length(); +#else + ar << (uint64_t)s.length(); +#endif + ar.Write(&s[0], s.length()); + return ar; +} + +inline BinaryArchive& operator>>(BinaryArchive& ar, std::string& s) { +#ifdef _LINUX + size_t len = ar.template Get(); +#else + size_t len = ar.template Get(); +#endif + ar.PrepareRead(len); + s.assign(ar.Cursor(), len); + ar.AdvanceCursor(len); + return ar; +} + +template +Archive& operator<<(Archive& ar, const std::pair& x) { + return ar << x.first << x.second; +} + +template +Archive& operator>>(Archive& ar, std::pair& x) { // NOLINT + return ar >> x.first >> x.second; +} + +#ifdef _LINUX +template +Archive& SerializeTuple(Archive& ar, // NOLINT + const std::tuple& x, // NOLINT + std::integral_constant n) { // NOLINT + return ar; +} +#else +template +Archive& SerializeTuple(Archive& ar, // NOLINT + const std::tuple& x, // NOLINT + std::integral_constant n) { // NOLINT + return ar; +} +#endif + +#ifdef _LINUX +template +Archive& serialize_tuple(Archive& ar, // NOLINT + const std::tuple& x, // NOLINT + std::integral_constant n) { // NOLINT + return SerializeTuple(ar, x, std::integral_constant()) + << std::get(x); +} +#else +template +Archive& serialize_tuple(Archive& ar, // NOLINT + const std::tuple& x, // NOLINT + std::integral_constant n) { // NOLINT + return SerializeTuple(ar, x, std::integral_constant()) + << std::get(x); +} +#endif + +#ifdef _LINUX +template +Archive& operator<<(Archive& ar, const std::tuple& x) { + const size_t size = std::tuple_size>::value; + return SerializeTuple(ar, x, std::integral_constant()); +} +#else +template +Archive& operator<<(Archive& ar, const std::tuple& x) { + const uint64_t size = std::tuple_size>::value; + return SerializeTuple(ar, x, std::integral_constant()); +} +#endif + +#ifdef _LINUX +template +Archive& DeserializeTuple(Archive& ar, std::tuple& x, // NOLINT + std::integral_constant n) { + return ar; +} +#else +template +Archive& DeserializeTuple(Archive& ar, std::tuple& x, // NOLINT + std::integral_constant n) { + return ar; +} +#endif + +#ifdef _LINUX +template +Archive& DeserializeTuple(Archive& ar, std::tuple& x, // NOLINT + std::integral_constant n) { + return DeserializeTuple(ar, x, std::integral_constant()) >> + std::get(x); +} +#else +template +Archive& DeserializeTuple(Archive& ar, std::tuple& x, // NOLINT + std::integral_constant n) { + return DeserializeTuple(ar, x, std::integral_constant()) >> + std::get(x); +} +#endif + +#ifdef _LINUX +template +Archive& operator>>(Archive& ar, std::tuple& x) { + const size_t size = std::tuple_size>::value; + return DeserializeTuple(ar, x, std::integral_constant()); +} +#else +template +Archive& operator>>(Archive& ar, std::tuple& x) { + const uint64_t size = std::tuple_size>::value; + return DeserializeTuple(ar, x, std::integral_constant()); +} +#endif + +#ifdef _LINUX +#define ARCHIVE_REPEAT(MAP_TYPE, RESERVE_STATEMENT) \ + template \ + Archive& operator<<(Archive& ar, \ + const MAP_TYPE& p) { \ + ar << (size_t)p.size(); \ + for (auto it = p.begin(); it != p.end(); ++it) { \ + ar << *it; \ + } \ + return ar; \ + } \ + template \ + Archive& operator>>(Archive& ar, MAP_TYPE& p) { \ + size_t size = ar.template get(); \ + p.clear(); \ + RESERVE_STATEMENT; \ + for (size_t i = 0; i < size; i++) { \ + p.insert(ar.template get>()); \ + } \ + return ar; \ + } +#else +#define ARCHIVE_REPEAT(MAP_TYPE, RESERVE_STATEMENT) \ + template \ + Archive& operator<<(Archive& ar, \ + const MAP_TYPE& p) { \ + ar << (uint64_t)p.size(); \ + for (auto it = p.begin(); it != p.end(); ++it) { \ + ar << *it; \ + } \ + return ar; \ + } \ + template \ + Archive& operator>>(Archive& ar, MAP_TYPE& p) { \ + size_t size = ar.template get(); \ + p.clear(); \ + RESERVE_STATEMENT; \ + for (size_t i = 0; i < size; i++) { \ + p.insert(ar.template get>()); \ + } \ + return ar; \ + } +#endif + +ARCHIVE_REPEAT(std::map, ) +ARCHIVE_REPEAT(std::multimap, ) +ARCHIVE_REPEAT(std::unordered_map, p.reserve(size)) +ARCHIVE_REPEAT(std::unordered_multimap, p.reserve(size)) + +#undef ARCHIVE_REPEAT + +#ifdef _LINUX +#define ARCHIVE_REPEAT(SET_TYPE, RESERVE_STATEMENT) \ + template \ + Archive& operator<<(Archive& ar, const SET_TYPE& p) { \ + ar << (size_t)p.size(); \ + for (auto it = p.begin(); it != p.end(); ++it) { \ + ar << *it; \ + } \ + return ar; \ + } \ + template \ + Archive& operator>>(Archive& ar, SET_TYPE& p) { \ + size_t size = ar.template get(); \ + p.clear(); \ + RESERVE_STATEMENT; \ + for (size_t i = 0; i < size; i++) { \ + p.insert(ar.template get()); \ + } \ + return ar; \ + } +#else +#define ARCHIVE_REPEAT(SET_TYPE, RESERVE_STATEMENT) \ + template \ + Archive& operator<<(Archive& ar, const SET_TYPE& p) { \ + ar << (uint64_t)p.size(); \ + for (auto it = p.begin(); it != p.end(); ++it) { \ + ar << *it; \ + } \ + return ar; \ + } \ + template \ + Archive& operator>>(Archive& ar, SET_TYPE& p) { \ + size_t size = ar.template get(); \ + p.clear(); \ + RESERVE_STATEMENT; \ + for (size_t i = 0; i < size; i++) { \ + p.insert(ar.template get()); \ + } \ + return ar; \ + } +#endif + +ARCHIVE_REPEAT(std::set, ) +ARCHIVE_REPEAT(std::multiset, ) +ARCHIVE_REPEAT(std::unordered_set, p.reserve(size)) +ARCHIVE_REPEAT(std::unordered_multiset, p.reserve(size)) + +#undef ARCHIVE_REPEAT + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/array.h b/paddle/fluid/framework/array.h new file mode 100644 index 00000000..b5308298 --- /dev/null +++ b/paddle/fluid/framework/array.h @@ -0,0 +1,137 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/unroll_array_ops.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +template +class Array { + public: + static constexpr size_t kSize = N; + + HOSTDEVICE inline Array() {} + + template + HOSTDEVICE inline explicit Array(const T &val, Args... args) { + static_assert(N == sizeof...(Args) + 1, "Invalid argument"); + UnrollVarArgsAssign::Run(data_, val, args...); + } + + HOSTDEVICE inline void Fill(const T &val) { + UnrollFillConstant::Run(data_, val); + } + + HOSTDEVICE inline const T *Get() const { return data_; } + + HOSTDEVICE inline T *GetMutable() { return data_; } + + HOSTDEVICE inline T &operator[](size_t i) { return *advance(data_, i); } + + // Writing "return data_[i]" would cause compilation warning/error: + // "array subscript is above array bound" in Python 35 CI. + // It seems that it is a false warning of GCC if we do not check the bounds + // of array index. But for better performance, we do not check in operator[] + // like what is in STL. If users want to check the bounds, use at() instead + HOSTDEVICE inline const T &operator[](size_t i) const { + return *advance(data_, i); + } + + HOSTDEVICE inline T &at(size_t i) { +#ifndef __CUDA_ARCH__ + PADDLE_ENFORCE_LT(i, N, "Array index out of bounds"); +#endif + return (*this)[i]; + } + + HOSTDEVICE inline const T &at(size_t i) const { +#ifndef __CUDA_ARCH__ + PADDLE_ENFORCE_LT(i, N, "Array index out of bounds"); +#endif + return (*this)[i]; + } + + HOSTDEVICE constexpr size_t size() const { return N; } + + HOSTDEVICE inline bool operator==(const Array &other) const { + return UnrollCompare::Run(data_, other.data_); + } + + HOSTDEVICE inline bool operator!=(const Array &other) const { + return !(*this == other); + } + + private: + template + HOSTDEVICE static inline U *advance(U *ptr, size_t i) { + return ptr + i; + } + + T data_[N]; +}; + +template +class Array { + public: + static constexpr size_t kSize = 0; + + HOSTDEVICE inline Array() {} + + HOSTDEVICE inline void Fill(const T &val) {} + + HOSTDEVICE inline constexpr T *Get() const { return nullptr; } + + // Add constexpr to GetMutable() cause warning in MAC + HOSTDEVICE inline T *GetMutable() { return nullptr; } + + HOSTDEVICE inline T &operator[](size_t) { +#ifdef __CUDA_ARCH__ + static T obj(); + return obj; +#else + PADDLE_THROW("Array has no element"); +#endif + } + + HOSTDEVICE inline const T &operator[](size_t) const { +#ifdef __CUDA_ARCH__ + static const T obj(); + return obj; +#else + PADDLE_THROW("Array has no element"); +#endif + } + + HOSTDEVICE inline T &at(size_t i) { return (*this)[i]; } + + HOSTDEVICE inline const T &at(size_t i) const { return (*this)[i]; } + + HOSTDEVICE constexpr size_t size() const { return 0; } + + HOSTDEVICE constexpr bool operator==(const Array &other) const { + return true; + } + + HOSTDEVICE constexpr bool operator!=(const Array &other) const { + return false; + } +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc new file mode 100644 index 00000000..7eb80a46 --- /dev/null +++ b/paddle/fluid/framework/async_executor.cc @@ -0,0 +1,172 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/async_executor.h" +#include "google/protobuf/io/zero_copy_stream_impl.h" +#include "google/protobuf/message.h" +#include "google/protobuf/text_format.h" + +#include "gflags/gflags.h" +#include "paddle/fluid/framework/data_feed_factory.h" +#include "paddle/fluid/framework/executor_thread_worker.h" +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/trainer.h" +#include "paddle/fluid/framework/trainer_desc.pb.h" +#include "paddle/fluid/framework/trainer_factory.h" +#include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/pybind/pybind.h" + +namespace paddle { +namespace framework { +AsyncExecutor::AsyncExecutor(Scope* scope, const platform::Place& place) + : root_scope_(scope), place_(place) {} + +void AsyncExecutor::InitServer(const std::string& dist_desc, int index) { + fleet_ptr_ = FleetWrapper::GetInstance(); + fleet_ptr_->InitServer(dist_desc, index); +} + +void AsyncExecutor::InitWorker(const std::string& dist_desc, + const std::vector& host_sign_list, + int node_num, int index) { + fleet_ptr_ = FleetWrapper::GetInstance(); + fleet_ptr_->InitWorker(dist_desc, host_sign_list, node_num, index); +} + +uint64_t AsyncExecutor::StartServer() { return fleet_ptr_->RunServer(); } + +void AsyncExecutor::StopServer() { fleet_ptr_->StopServer(); } + +void AsyncExecutor::GatherServers(const std::vector& host_sign_list, + int node_num) { + fleet_ptr_->GatherServers(host_sign_list, node_num); +} + +// todo InitModel +void AsyncExecutor::InitModel() {} + +// todo SaveModel +void AsyncExecutor::SaveModel(const std::string& path) {} + +void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, + const std::string& data_feed_desc_str, + const std::vector& filelist, + const int thread_num, + const std::vector& fetch_var_names, + const std::string& mode, const bool debug) { + std::vector threads; + + auto& block = main_program.Block(0); + for (auto var_name : fetch_var_names) { + auto var_desc = block.FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL(var_desc, "%s is not found.", var_name); + auto shapes = var_desc->GetShape(); + PADDLE_ENFORCE(shapes[shapes.size() - 1] == 1, + "var %s: Fetched var has wrong shape, " + "only variables with the last dimension size 1 supported", + var_name); + } + + DataFeedDesc data_feed_desc; + bool success = data_feed_desc.ParseFromString(data_feed_desc_str); + PADDLE_ENFORCE(success, "Fail to parse DataFeedDesc from string:\n%s", + data_feed_desc_str.c_str()); + + actual_thread_num_ = thread_num; + int file_cnt = filelist.size(); + PADDLE_ENFORCE(file_cnt > 0, "File list cannot be empty"); + + if (actual_thread_num_ > file_cnt) { + VLOG(1) << "Thread num = " << thread_num << ", file num = " << file_cnt + << ". Changing thread_num = " << file_cnt; + actual_thread_num_ = file_cnt; + } + + /* + readerDesc: protobuf description for reader initlization + argument: class_name, batch_size, use_slot, queue_size, buffer_size, + padding_index + + reader: + 1) each thread has a reader, reader will read input data and + put it into input queue + 2) each reader has a Next() iterface, that can fetch an instance + from the input queue + */ + // todo: should be factory method for creating datafeed + std::vector> readers; + /* + PrepareReaders(readers, actual_thread_num_, data_feed_desc, filelist); +#ifdef PADDLE_WITH_PSLIB + PrepareDenseThread(mode); +#endif + */ + std::vector> workers; + workers.resize(actual_thread_num_); + for (auto& worker : workers) { +#ifdef PADDLE_WITH_PSLIB + if (mode == "mpi") { + worker.reset(new AsyncExecutorThreadWorker); + } else { + worker.reset(new ExecutorThreadWorker); + } +#else + worker.reset(new ExecutorThreadWorker); +#endif + } + + // prepare thread resource here + /* + for (int thidx = 0; thidx < actual_thread_num_; ++thidx) { + CreateThreads(workers[thidx].get(), main_program, readers[thidx], + fetch_var_names, root_scope_, thidx, debug); + } + */ + + // start executing ops in multiple threads + for (int thidx = 0; thidx < actual_thread_num_; ++thidx) { + if (debug) { + threads.push_back(std::thread(&ExecutorThreadWorker::TrainFilesWithTimer, + workers[thidx].get())); + } else { + threads.push_back( + std::thread(&ExecutorThreadWorker::TrainFiles, workers[thidx].get())); + } + } + + for (auto& th : threads) { + th.join(); + } + // TODO(guru4elephant): we don't need this + /* +#ifdef PADDLE_WITH_PSLIB + if (mode == "mpi") { + _pull_dense_thread->stop(); + } +#endif + */ + VLOG(3) << "start to run from files in async_executor"; + VLOG(3) << "Drop current scope kids"; + root_scope_->DropKids(); + return; +} + +} // end namespace framework +} // end namespace paddle diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h new file mode 100644 index 00000000..7b59e1b1 --- /dev/null +++ b/paddle/fluid/framework/async_executor.h @@ -0,0 +1,94 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include // NOLINT +#include // local_random_engine +#include +#include +#include // NOLINT +#include +#include +#include "paddle/fluid/framework/data_feed.pb.h" +#include "paddle/fluid/framework/data_set.h" +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/executor_thread_worker.h" +#include "paddle/fluid/framework/fleet/fleet_wrapper.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { + +inline double current_realtime() { +#if !defined(_WIN32) + struct timespec tp; + clock_gettime(CLOCK_REALTIME, &tp); + return tp.tv_sec + tp.tv_nsec * 1e-9; +#else + return 0.0; +#endif +} + +inline std::default_random_engine& local_random_engine() { + struct engine_wrapper_t { + std::default_random_engine engine; + engine_wrapper_t() { + static std::atomic x(0); + std::seed_seq sseq = {x++, x++, x++, + static_cast(current_realtime() * 1000)}; + engine.seed(sseq); + } + }; + thread_local engine_wrapper_t r; + return r.engine; +} + +class AsyncExecutor { + public: + AsyncExecutor(Scope* scope, const platform::Place& place); + virtual ~AsyncExecutor() {} + void RunFromFile(const ProgramDesc& main_program, + const std::string& data_feed_desc_str, + const std::vector& filelist, + const int thread_num, + const std::vector& fetch_var_names, + const std::string& mode, const bool debug); + + // TODO(guru4elephant): make init server decoupled from executor + void InitServer(const std::string& dist_desc, int index); + void InitWorker(const std::string& dist_desc, + const std::vector& host_sign_list, int node_num, + int index); + uint64_t StartServer(); + void StopServer(); + void GatherServers(const std::vector& host_sign_list, int node_num); + void InitModel(); + void SaveModel(const std::string& path); + + public: + std::shared_ptr fleet_ptr_; + Scope* root_scope_; + platform::Place place_; + + private: + int actual_thread_num_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/attribute.cc b/paddle/fluid/framework/attribute.cc new file mode 100644 index 00000000..fabf2abf --- /dev/null +++ b/paddle/fluid/framework/attribute.cc @@ -0,0 +1,81 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/attribute.h" + +#include + +namespace paddle { +namespace framework { + +Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc) { + switch (attr_desc.type()) { + case proto::AttrType::BOOLEAN: { + return attr_desc.b(); + } + case proto::AttrType::INT: { + return attr_desc.i(); + } + case proto::AttrType::FLOAT: { + return attr_desc.f(); + } + case proto::AttrType::STRING: { + return attr_desc.s(); + } + case proto::AttrType::BOOLEANS: { + std::vector val(attr_desc.bools_size()); + for (int i = 0; i < attr_desc.bools_size(); ++i) { + val[i] = attr_desc.bools(i); + } + return val; + } + case proto::AttrType::INTS: { + std::vector val(attr_desc.ints_size()); + for (int i = 0; i < attr_desc.ints_size(); ++i) { + val[i] = attr_desc.ints(i); + } + return val; + } + case proto::AttrType::FLOATS: { + std::vector val(attr_desc.floats_size()); + for (int i = 0; i < attr_desc.floats_size(); ++i) { + val[i] = attr_desc.floats(i); + } + return val; + } + case proto::AttrType::STRINGS: { + std::vector val(attr_desc.strings_size()); + for (int i = 0; i < attr_desc.strings_size(); ++i) { + val[i] = attr_desc.strings(i); + } + return val; + } + case proto::AttrType::LONG: { + return attr_desc.l(); + } + case proto::AttrType::LONGS: { + std::vector val(attr_desc.longs_size()); + for (int i = 0; i < attr_desc.longs_size(); ++i) { + val[i] = attr_desc.longs(i); + } + return val; + } + default: + PADDLE_THROW("Unsupport attr type %d", attr_desc.type()); + } + return boost::blank(); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/attribute.h b/paddle/fluid/framework/attribute.h new file mode 100644 index 00000000..aa452ac2 --- /dev/null +++ b/paddle/fluid/framework/attribute.h @@ -0,0 +1,345 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +template +struct ExtractAttribute { + explicit ExtractAttribute(const std::string& attr_name) + : attr_name_(attr_name) {} + + T* operator()(Attribute& attr) const { + T* attr_value = nullptr; + try { + attr_value = &boost::get(attr); + } catch (boost::bad_get& bad_get) { + PADDLE_THROW("Cannot get attribute %s by type %s, its type is %s", + attr_name_, paddle::platform::demangle(typeid(T).name()), + paddle::platform::demangle(attr.type().name())); + } + return attr_value; + } + + const std::string& attr_name_; +}; + +// special handle bool +// FIXME(yuyang18): Currently we cast bool into int in python binding. It is +// hard to change the logic there. In another way, we should correct handle +// if the user set `some_flag=1`. +// +// FIX ME anytime if there is a better solution. +template <> +struct ExtractAttribute { + explicit ExtractAttribute(const std::string& attr_name) + : attr_name_(attr_name) {} + + bool* operator()(Attribute& attr) const { + if (attr.type() == typeid(int)) { // NOLINT + int val = boost::get(attr); + attr = static_cast(val); + } else if (attr.type() == typeid(float)) { // NOLINT + float val = boost::get(attr); + attr = static_cast(val); + } + bool* attr_value = nullptr; + try { + attr_value = &boost::get(attr); + } catch (boost::bad_get& bad_get) { + PADDLE_THROW("Cannot get attribute %s by type bool, its type is %s", + attr_name_, paddle::platform::demangle(attr.type().name())); + } + return attr_value; + } + + const std::string& attr_name_; +}; + +template <> +struct ExtractAttribute { + explicit ExtractAttribute(const std::string& attr_name) + : attr_name_(attr_name) {} + + int64_t* operator()(Attribute& attr) const { + if (attr.type() == typeid(int)) { // NOLINT + int val = boost::get(attr); + attr = static_cast(val); + } else if (attr.type() == typeid(float)) { // NOLINT + int val = boost::get(attr); + attr = static_cast(val); + } + int64_t* attr_value = nullptr; + try { + attr_value = &boost::get(attr); + } catch (boost::bad_get& bad_get) { + PADDLE_THROW("Cannot get attribute %s by type int64_t, its type is %s", + attr_name_, paddle::platform::demangle(attr.type().name())); + } + return attr_value; + } + + const std::string& attr_name_; +}; + +template <> +struct ExtractAttribute> { + explicit ExtractAttribute(const std::string& attr_name) + : attr_name_(attr_name) {} + + std::vector* operator()(Attribute& attr) const { + if (attr.type() == typeid(std::vector)) { // NOLINT + std::vector val = boost::get>(attr); + std::vector vec(val.begin(), val.end()); + attr = vec; + } else if (attr.type() == typeid(std::vector)) { // NOLINT + std::vector val = boost::get>(attr); + std::vector vec(val.begin(), val.end()); + attr = vec; + } + std::vector* attr_value = nullptr; + try { + attr_value = &boost::get>(attr); + } catch (boost::bad_get& bad_get) { + PADDLE_THROW("Cannot get attribute %s by type int64_t, its type is %s", + attr_name_, paddle::platform::demangle(attr.type().name())); + } + return attr_value; + } + + const std::string& attr_name_; +}; + +template <> +struct ExtractAttribute { + explicit ExtractAttribute(const std::string& attr_name) + : attr_name_(attr_name) {} + + float* operator()(Attribute& attr) const { + if (attr.type() == typeid(int)) { // NOLINT + int val = boost::get(attr); + attr = static_cast(val); + } else if (attr.type() == typeid(int64_t)) { // NOLINT + int64_t val = boost::get(attr); + attr = static_cast(val); + } + float* attr_value = nullptr; + try { + attr_value = &boost::get(attr); + } catch (boost::bad_get& bad_get) { + PADDLE_THROW("Cannot get attribute %s by type float, its type is %s", + attr_name_, paddle::platform::demangle(attr.type().name())); + } + return attr_value; + } + + const std::string& attr_name_; +}; + +template +inline proto::AttrType AttrTypeID() { + Attribute tmp = T(); + return static_cast(tmp.which() - 1); +} + +Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc); + +class AttrReader { + public: + explicit AttrReader(const AttributeMap& attrs) : attrs_(attrs) {} + + template + inline const T& Get(const std::string& name) const { + PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap", + name); + + Attribute& attr = const_cast(attrs_.at(name)); + ExtractAttribute extract_attr(name); + T* attr_value = extract_attr(attr); + return *attr_value; + } + + private: + const AttributeMap& attrs_; +}; + +// check whether a value(attribute) fit a certain limit +template +class GreaterThanChecker { + public: + explicit GreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {} + void operator()(const T& value) const { + PADDLE_ENFORCE(value > lower_bound_, "larger_than check fails."); + } + + private: + T lower_bound_; +}; + +template +class EqualGreaterThanChecker { + public: + explicit EqualGreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {} + void operator()(const T& value) const { + PADDLE_ENFORCE_GE(value, lower_bound_, "equal_larger_than check fails."); + } + + private: + T lower_bound_; +}; + +// we can provide users more common Checker, like 'LessThanChecker', +// 'BetweenChecker'... + +template +class DefaultValueSetter { + public: + explicit DefaultValueSetter(T default_value) + : default_value_(default_value) {} + void operator()(T* value) const { *value = default_value_; } + + private: + T default_value_; +}; + +template +class EnumInContainer { + public: + explicit EnumInContainer(const std::unordered_set& c) : container_(c) {} + void operator()(const T& val) const { + PADDLE_ENFORCE(container_.find(val) != container_.end(), + "Value %s is not in enum container %s", val, + ContainerDebugString()); + } + + private: + std::string ContainerDebugString() const { + std::ostringstream sout; + sout << "["; + size_t cnt = 0; + for (auto& v : container_) { + sout << v; + ++cnt; + if (cnt != container_.size()) { + sout << " ,"; + } + } + sout << "]"; + return sout.str(); + } + + std::unordered_set container_; +}; + +// check whether a certain attribute fit its limits +// an attribute can have more than one limits +template +class TypedAttrChecker { + typedef std::function DefaultValueChecker; + typedef std::function ValueChecker; + + public: + explicit TypedAttrChecker(const std::string& attr_name) + : attr_name_(attr_name) {} + + TypedAttrChecker& InEnum(const std::unordered_set& range) { + value_checkers_.push_back(EnumInContainer(range)); + return *this; + } + + TypedAttrChecker& GreaterThan(const T& lower_bound) { + value_checkers_.push_back(GreaterThanChecker(lower_bound)); + return *this; + } + + TypedAttrChecker& EqualGreaterThan(const T& lower_bound) { + value_checkers_.push_back(EqualGreaterThanChecker(lower_bound)); + return *this; + } + + // we can add more common limits, like LessThan(), Between()... + + TypedAttrChecker& SetDefault(const T& default_value) { + PADDLE_ENFORCE(default_value_setter_.empty(), + "%s can't have more than one default value!", attr_name_); + default_value_setter_.push_back(DefaultValueSetter(default_value)); + return *this; + } + + // allow users provide their own checker + TypedAttrChecker& AddCustomChecker(const ValueChecker& checker) { + value_checkers_.push_back(checker); + return *this; + } + + void operator()(AttributeMap* attr_map) const { + if (!attr_map->count(attr_name_)) { + // user do not set this attr + PADDLE_ENFORCE(!default_value_setter_.empty(), + "Attribute '%s' is required!", attr_name_); + // default_value_setter_ has no more than one element + T val; + (default_value_setter_[0])(&val); + (*attr_map)[attr_name_] = val; + } + Attribute& attr = attr_map->at(attr_name_); + ExtractAttribute extract_attr(attr_name_); + T* attr_value = extract_attr(attr); + for (const auto& checker : value_checkers_) { + checker(*attr_value); + } + } + + private: + std::string attr_name_; + std::vector value_checkers_; + std::vector default_value_setter_; +}; + +// check whether op's all attributes fit their own limits +class OpAttrChecker { + typedef std::function AttrChecker; + + public: + template + TypedAttrChecker& AddAttrChecker(const std::string& attr_name) { + attr_checkers_.push_back(TypedAttrChecker(attr_name)); + AttrChecker& checker = attr_checkers_.back(); + return *(checker.target>()); + } + + void Check(AttributeMap* attr_map) const { + for (const auto& checker : attr_checkers_) { + checker(attr_map); + } + } + + private: + std::vector attr_checkers_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/block_desc.cc b/paddle/fluid/framework/block_desc.cc new file mode 100644 index 00000000..0b7aaf11 --- /dev/null +++ b/paddle/fluid/framework/block_desc.cc @@ -0,0 +1,242 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/block_desc.h" + +#include +#include +#include + +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { + +VarDesc *BlockDesc::Var(const std::string &name) { + auto it = vars_.find(name); + if (it != vars_.end()) { + return it->second.get(); + } + need_update_ = true; + auto *var = new VarDesc(name); + vars_[name].reset(var); + return var; +} + +VarDesc *BlockDesc::FindVar(const std::string &name) const { + auto it = vars_.find(name); + if (it == vars_.end()) { + return nullptr; + } + return it->second.get(); +} + +bool BlockDesc::HasVar(const std::string &name) const { + return vars_.find(name) != vars_.end(); +} + +VarDesc *BlockDesc::RenameVar(const std::string &old_name, + const std::string &new_name) { + if (!this->HasVar(old_name)) { + return nullptr; + } + need_update_ = true; + auto *var = this->Var(old_name); + VarDesc *new_var = new VarDesc(*(var->Proto())); + new_var->SetName(new_name); + vars_[new_name].reset(new_var); + // rename inputs and outputs + for (const auto &op : ops_) { + auto *it = op.get(); + it->Rename(old_name, new_name); + } + vars_.erase(old_name); + return new_var; +} + +VarDesc *BlockDesc::FindVarRecursive(const std::string &name) const { + if (name == kEmptyVarName) return nullptr; + + std::queue frontier; + std::unordered_set visited; + + frontier.push(this); + + while (!frontier.empty()) { // BFS + auto cur = frontier.front(); + frontier.pop(); + if (visited.count(cur) != 0) { + continue; + } + auto var = cur->FindVar(name); + if (var != nullptr) { + return var; + } + + auto fwd = cur->ForwardBlock(); + auto parent = cur->ParentBlock(); + + if (fwd != nullptr) { + frontier.push(fwd); + } + if (parent != nullptr) { + frontier.push(parent); + } + + visited.insert(cur); + } + + return nullptr; +} + +VarDesc &BlockDesc::FindRecursiveOrCreateVar(const std::string &name_bytes) { + VarDesc *res = FindVarRecursive(name_bytes); + if (res == nullptr) { + res = Var(name_bytes); + } + return *res; +} + +bool BlockDesc::HasVarRecursive(const std::string &name) const { + return FindVarRecursive(name) != nullptr; +} + +std::vector BlockDesc::AllVars() const { + std::vector res; + for (const auto &p : vars_) { + res.push_back(p.second.get()); + } + return res; +} + +OpDesc *BlockDesc::AppendOp() { + need_update_ = true; + ops_.emplace_back(new OpDesc(this)); + return ops_.back().get(); +} + +void BlockDesc::AppendAllocatedOp(std::unique_ptr &&op_desc) { + need_update_ = true; + ops_.emplace_back(std::move(op_desc)); +} + +OpDesc *BlockDesc::PrependOp() { + need_update_ = true; + ops_.emplace_front(new OpDesc(this)); + return ops_.front().get(); +} + +void BlockDesc::PrependAllocatedOp(std::unique_ptr &&op_desc) { + need_update_ = true; + ops_.emplace_front(std::move(op_desc)); +} + +OpDesc *BlockDesc::InsertOp(size_t index) { + need_update_ = true; + auto it = ops_.begin() + index; + std::unique_ptr new_op(new OpDesc(this)); + it = ops_.insert(it, std::move(new_op)); + return (*it).get(); +} + +void BlockDesc::RemoveOp(size_t s, size_t e) { + if (ops_.begin() + s >= ops_.end() || ops_.begin() + e > ops_.end()) { + return; + } + need_update_ = true; + ops_.erase(ops_.begin() + s, ops_.begin() + e); +} + +void BlockDesc::RemoveOpInternal(const OpDesc *op_desc) { + // TODO(minqiyang): make this faster + for (auto it = ops_.begin(); it != ops_.end(); ++it) { + if (it->get() == op_desc) { + ops_.erase(it); + break; + } + } +} + +std::vector BlockDesc::AllOps() const { + std::vector res; + for (const auto &op : ops_) { + res.push_back(op.get()); + } + return res; +} + +void BlockDesc::Flush() { + for (auto &op_desc : ops_) { + op_desc->Flush(); + } + + if (need_update_) { + this->desc_->mutable_ops()->Clear(); + for (auto &op_desc : ops_) { + this->desc_->mutable_ops()->Add()->CopyFrom(*op_desc->Proto()); + } + this->desc_->mutable_vars()->Clear(); + for (auto &var_desc : vars_) { + this->desc_->mutable_vars()->Add()->CopyFrom(*var_desc.second->Proto()); + } + need_update_ = false; + } +} + +BlockDesc *BlockDesc::ParentBlock() const { + return prog_->MutableBlock(static_cast(desc_->parent_idx())); +} + +proto::BlockDesc *BlockDesc::Proto() { + Flush(); + return desc_; +} + +BlockDesc::BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc) + : prog_(prog), desc_(desc), need_update_(false) { + for (const proto::VarDesc &var_desc : desc_->vars()) { + vars_[var_desc.name()].reset(new VarDesc(var_desc)); + } + for (const proto::OpDesc &op_desc : desc_->ops()) { + ops_.emplace_back(new OpDesc(op_desc, this)); + } +} + +BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc, + ProgramDesc *prog) + : prog_(prog), desc_(desc) { + need_update_ = true; + for (auto &op : other.ops_) { + ops_.emplace_back(new OpDesc(*op, this)); + } + for (auto &it : other.vars_) { + auto *var = new VarDesc(*it.second); + vars_[it.first].reset(var); + } +} + +void BlockDesc::SetForwardBlockID(int32_t forward_block_id) { + PADDLE_ENFORCE(!desc_->has_forward_block_idx(), + "Parent block ID has been set to %d. Cannot set to %d", + desc_->forward_block_idx(), forward_block_id); + desc_->set_forward_block_idx(forward_block_id); +} + +BlockDesc *BlockDesc::ForwardBlock() const { + return prog_->MutableBlock(static_cast(desc_->forward_block_idx())); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h new file mode 100644 index 00000000..5c6e4215 --- /dev/null +++ b/paddle/fluid/framework/block_desc.h @@ -0,0 +1,123 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/proto_desc.h" +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace framework { + +class ProgramDesc; + +// Each Protobuf Message, we provide a XXXBind class. In that class, we optimize +// read/write speed. Only when we want the protobuf message, the local changes +// will be synchronized (by `Sync` method). + +class BlockDesc { + public: + BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc); + + BlockDesc(const BlockDesc &other, proto::BlockDesc *desc, ProgramDesc *prog); + + int32_t ID() const { return desc_->idx(); } + + int32_t Parent() const { return desc_->parent_idx(); } + + int32_t ForwardBlockID() const { return desc_->forward_block_idx(); } + + VarDesc *Var(const std::string &name_bytes); + + VarDesc *FindVar(const std::string &name_bytes) const; + + bool HasVar(const std::string &var_name) const; + + VarDesc *RenameVar(const std::string &old_name, const std::string &new_name); + + VarDesc *FindVarRecursive(const std::string &name_bytes) const; + + VarDesc &FindRecursiveOrCreateVar(const std::string &name_bytes); + + bool HasVarRecursive(const std::string &var_name) const; + + std::set LocalVarNames() const { + std::set var_names; + for (auto &var : vars_) { + var_names.insert(var.first); + } + return var_names; + } + + std::vector AllVars() const; + + BlockDesc *ParentBlock() const; + + BlockDesc *ForwardBlock() const; + + void SetForwardBlockID(int32_t forward_block_id); + + OpDesc *AppendOp(); + + void AppendAllocatedOp(std::unique_ptr &&op_desc); + + OpDesc *PrependOp(); + + void PrependAllocatedOp(std::unique_ptr &&op_desc); + + OpDesc *InsertOp(size_t index); + + /* + * Only remove op itself, + * do nothing to its input and output variables + */ + void RemoveOp(size_t s, size_t e); + + void RemoveOpInternal(const OpDesc *op_desc); + + void RemoveVar(const std::string &name) { vars_.erase(name); } + + std::vector AllOps() const; + + size_t OpSize() const { return ops_.size(); } + + OpDesc *Op(int idx) const { return ops_.at(idx).get(); } + + void Flush(); + + proto::BlockDesc *Proto(); + + ProgramDesc *Program() const { return this->prog_; } + + private: + ProgramDesc *prog_; // not_own + proto::BlockDesc *desc_; // not_own + bool need_update_; + + std::deque> ops_; + std::unordered_map> vars_; + + DISABLE_COPY_AND_ASSIGN(BlockDesc); +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/blocking_queue.h b/paddle/fluid/framework/blocking_queue.h new file mode 100644 index 00000000..4f35da40 --- /dev/null +++ b/paddle/fluid/framework/blocking_queue.h @@ -0,0 +1,110 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include // NOLINT +#include +#include // NOLINT +#include + +namespace paddle { +namespace framework { + +template +class BlockingQueue { + public: + void Push(const T &item) { + { + std::lock_guard g(mutex_); + q_.emplace_back(item); + } + cv_.notify_one(); + } + + void Push(T &&item) { + { + std::lock_guard g(mutex_); + q_.emplace_back(std::move(item)); + } + cv_.notify_one(); + } + + template + void Extend(const U &items) { + { + std::lock_guard g(mutex_); + for (auto &item : items) { + q_.emplace_back(item); + } + } + cv_.notify_all(); + } + + template + void Extend(U &&items) { + { + std::lock_guard g(mutex_); + for (auto &item : items) { + q_.emplace_back(std::move(item)); + } + } + cv_.notify_all(); + } + + std::deque PopAll(size_t ms, bool *timeout) { + auto time = + std::chrono::system_clock::now() + std::chrono::milliseconds(ms); + std::unique_lock lock(mutex_); + *timeout = !cv_.wait_until(lock, time, [this] { return !q_.empty(); }); + std::deque ret; + if (!*timeout) { + std::swap(ret, q_); + } + return ret; + } + + T Pop() { + std::unique_lock lock(mutex_); + cv_.wait(lock, [=] { return !q_.empty(); }); + T rc(std::move(q_.front())); + q_.pop_front(); + return rc; + } + + void Pop(T *t) { + std::unique_lock lock(mutex_); + cv_.wait(lock, [=] { return !q_.empty(); }); + *t = std::move(q_.front()); + q_.pop_front(); + } + + size_t Size() { + std::lock_guard lock(mutex_); + return q_.size(); + } + + void Clear() { + std::lock_guard lock(mutex_); + std::deque().swap(q_); + } + + private: + std::mutex mutex_; + std::condition_variable cv_; + std::deque q_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h new file mode 100644 index 00000000..644f60db --- /dev/null +++ b/paddle/fluid/framework/channel.h @@ -0,0 +1,460 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined _WIN32 || defined __APPLE__ +#else +#define _LINUX +#endif + +#include +#include +#include // NOLINT +#include +#include +#include +#include // NOLINT +#include +#include +#include "paddle/fluid/framework/expect.h" + +namespace paddle { +namespace framework { + +template +class ChannelObject { + public: + ChannelObject() {} + + // capacity can be zero + explicit ChannelObject(size_t capacity) { + capacity_ = std::min(MaxCapacity(), capacity); + } + + void Clear() { + std::unique_lock lock(mutex_); + data_.clear(); + data_.shrink_to_fit(); + } + + size_t Capacity() { + return capacity_; // atomic + } + + void SetCapacity(size_t x) { // capacity can be zero + std::lock_guard lock(mutex_); + capacity_ = std::min(MaxCapacity(), x); + Notify(); + } + + size_t BlockSize() { + return block_size_; // atomic + } + + void SetBlockSize(size_t x) { + CHECK(x >= 1) << "block size must be >= 1"; + std::lock_guard lock(mutex_); + block_size_ = x; + } + + template + void InheritFrom(const std::shared_ptr>& other) { + std::lock_guard lock(mutex_); + capacity_ = other->Capacity(); + block_size_ = other->BlockSize(); + } + + bool Closed() { + return closed_; // atomic + } + + // open channel, then data can be write() to channel + void Open() { + std::lock_guard lock(mutex_); + closed_ = false; + Notify(); + } + + // close channel, then no more data can be write() to channel + void Close() { + std::lock_guard lock(mutex_); + closed_ = true; + Notify(); + } + + size_t Size() { + std::lock_guard lock(mutex_); + return data_.size(); + } + + bool Empty() { + std::lock_guard lock(mutex_); + return EmptyUnlocked(); + } + + // blocking operation + bool Get(T& val) { return Read(1, &val) != 0; } // NOLINT + + // blocking operation + // returns 0 if the channel is closed and empty + size_t Read(size_t n, T* p) { + if (n == 0) { + return 0; + } + + std::unique_lock lock(mutex_); + size_t finished = Read(n, p, lock); + Notify(); + return finished; + } + + // blocking operation + bool Put(T&& val) { return WriteMove(1, &val) != 0; } + + // blocking operation + bool Put(const T& val) { return Write(1, &val) != 0; } + + // blocking operation + // returns value less than n if the channel is closed + size_t Write(size_t n, const T* p) { + if (n == 0) { + return 0; + } + std::unique_lock lock(mutex_); + size_t finished = Write(n, p, lock); + Notify(); + return finished; + } + + // WriteMove() will clear original contents of input array + size_t WriteMove(size_t n, T* p) { + if (n == 0) { + return 0; + } + std::unique_lock lock(mutex_); + size_t finished = WriteMove(n, p, lock); + Notify(); + return finished; + } + + // read data of block size from channel to vector + size_t Read(std::vector& p) { // NOLINT + p.resize(block_size_); + size_t finished = Read(p.size(), &p[0]); + p.resize(finished); + return finished; + } + + size_t ReadAll(std::vector& p) { // NOLINT + p.clear(); + size_t finished = 0; + size_t n = 0; + do { + // _block_size may change anytime + n = block_size_; + p.resize(finished + n); + n = Read(n, &p[finished]); + finished += n; + } while (n != 0); + p.resize(finished); + return finished; + } + + // write data from vector to channel + size_t Write(const std::vector& p) { return Write(p.size(), &p[0]); } + + // write data from vector to channel + size_t Write(std::vector&& p) { return WriteMove(p.size(), &p[0]); } + + private: + size_t capacity_ = MaxCapacity(); + size_t block_size_ = 1024; + bool closed_ = false; + std::mutex mutex_; + // use deque to store data + std::deque data_; + size_t reading_count_ = 0; + int empty_waiters_ = 0; + int full_waiters_ = 0; + std::condition_variable empty_cond_; + std::condition_variable full_cond_; + + static constexpr size_t MaxCapacity() { + return std::numeric_limits::max() / 2; + } + + void Notify() { + if (empty_waiters_ != 0 && (!EmptyUnlocked() || closed_)) { + empty_cond_.notify_one(); + } + if (full_waiters_ != 0 && (!FullUnlocked() || closed_)) { + full_cond_.notify_one(); + } + } + + bool EmptyUnlocked() { return data_.empty(); } + + bool FullUnlocked() { return data_.size() >= capacity_ + reading_count_; } + + bool WaitForRead(std::unique_lock& lock) { // NOLINT +#ifdef _LINUX + while (unlikely(EmptyUnlocked() && !closed_)) { +#else + while (EmptyUnlocked() && !closed_) { +#endif + if (full_waiters_ != 0) { + full_cond_.notify_one(); + } + empty_waiters_++; + empty_cond_.wait(lock); + empty_waiters_--; + } + return !EmptyUnlocked(); + } + + bool WaitForWrite(std::unique_lock& lock) { // NOLINT +#ifdef _LINUX + while (unlikely(FullUnlocked() && !closed_)) { +#else + while (FullUnlocked() && !closed_) { +#endif + if (empty_waiters_ != 0) { + empty_cond_.notify_one(); + } + full_waiters_++; + full_cond_.wait(lock); + full_waiters_--; + } + return !closed_; + } + + size_t Read(size_t n, T* p, std::unique_lock& lock) { // NOLINT + size_t finished = 0; + CHECK(n <= MaxCapacity() - reading_count_); + reading_count_ += n; + while (finished < n && WaitForRead(lock)) { + size_t m = std::min(n - finished, data_.size()); + for (size_t i = 0; i < m; i++) { + p[finished++] = std::move(data_.front()); + data_.pop_front(); + } + reading_count_ -= m; + } + reading_count_ -= n - finished; + return finished; + } + + size_t Write(size_t n, + const T* p, // NOLINT + std::unique_lock& lock) { // NOLINT + size_t finished = 0; + while (finished < n && WaitForWrite(lock)) { + size_t m = + std::min(n - finished, capacity_ + reading_count_ - data_.size()); + for (size_t i = 0; i < m; i++) { + data_.push_back(p[finished++]); + } + } + return finished; + } + + size_t WriteMove(size_t n, + T* p, // NOLINT + std::unique_lock& lock) { // NOLINT + size_t finished = 0; + while (finished < n && WaitForWrite(lock)) { + size_t m = + std::min(n - finished, capacity_ + reading_count_ - data_.size()); + for (size_t i = 0; i < m; i++) { + data_.push_back(std::move(p[finished++])); + } + } + return finished; + } +}; // NOLINT + +template +using Channel = std::shared_ptr>; + +template +Channel MakeChannel(size_t capacity = std::numeric_limits::max()) { + return std::make_shared>(capacity); +} + +template +Channel MakeChannel(const Channel& other) { + CHECK(other != nullptr) << "channel can not be NULL"; + Channel chan = std::make_shared>(); + chan->InheritFrom(other); + return chan; +} + +// NOTE: ChannelReader is a wrapper for quick read channel with a buffer. It +// will read a block data from channel, but user can get data one by one. So it +// is important to notice that user must call operator>> until false, or call +// get_buffer_remain until false to make sure the buffered data all readed. +template +class ChannelReader { + public: + explicit ChannelReader(ChannelObject* channel = nullptr) { + Reset(channel); + } + + ~ChannelReader() { CHECK(cursor_ == 0) << "Forgot to read buffer data"; } + + ChannelObject* channel() { return channel_; } + + void Reset(ChannelObject* channel) { + CHECK(channel != nullptr) << "Channel can not be nullptr"; + channel_ = channel; + cursor_ = 0; + failed_ = !channel; + } + + // whether there were read failed + operator bool() { return !failed_; } + + ChannelReader& operator>>(T& val) { + if (failed_) { + return *this; + } + if (cursor_ >= buffer_.size()) { + cursor_ = 0; + if (channel_->read(buffer_) == 0) { + failed_ = true; + return *this; + } + } + val = std::move(buffer_[cursor_++]); + return *this; + } + + bool GetBufferRemain(T& val) { // NOLINT + if (cursor_ >= buffer_.size()) { + cursor_ = 0; + return false; + } + val = std::move(buffer_[cursor_++]); + return true; + } + + private: + ChannelObject* channel_ = nullptr; + std::vector buffer_; + size_t cursor_ = 0; + bool failed_ = true; +}; // NOLINT + +template +class ChannelWriter { + public: + explicit ChannelWriter(ChannelObject* channel = nullptr) { + Reset(channel); + } + + ~ChannelWriter() { CHECK(buffer_.empty()) << "Forgot to flush"; } + + ChannelObject* channel() { return channel_; } + + void Reset(ChannelObject* channel) { + CHECK(buffer_.empty()) << "Forgot to flush"; + CHECK(channel != nullptr) << "Channel can not be nullptr"; + channel_ = channel; + buffer_.clear(); + failed_ = !channel; + } + + // whether there were write failed + operator bool() { return !failed_; } + + ChannelWriter& operator<<(T&& val) { + if (failed_) { + return *this; + } + buffer_.push_back(std::move(val)); + if (buffer_.size() >= channel_->BlockSize()) { + Flush(); + } + return *this; + } + + ChannelWriter& operator<<(const T& val) { + if (failed_) { + return *this; + } + buffer_.push_back(val); + if (buffer_.size() >= channel_->BlockSize()) { + Flush(); + } + return *this; + } + + void Flush() { + if (failed_ || buffer_.empty()) { + buffer_.clear(); + return; + } + failed_ |= + channel_->WriteMove(buffer_.size(), &buffer_[0]) != buffer_.size(); + buffer_.clear(); + } + + private: + ChannelObject* channel_ = nullptr; + std::vector buffer_; + bool failed_ = true; +}; // NOLINT + +// only used for range-for loop +// for (auto& x : chan) {...} +template +struct ChannelIterator { + std::shared_ptr> reader_; + T data_; + + void operator++() { + CHECK(reader_ != nullptr) << "reader can not be NULL"; + if (!(*reader_ >> data_)) { + reader_ = nullptr; + } + } + + T& operator*() { return data_; } + + friend bool operator==(const ChannelIterator& a, + const ChannelIterator& b) { + return a.reader_ == b.reader_; + } + + friend bool operator!=(const ChannelIterator& a, + const ChannelIterator& b) { + return a.reader_ != b.reader_; + } +}; // NOLINT + +template +ChannelIterator begin(ChannelObject* chan) { + ChannelIterator it{std::make_shared>(chan), T()}; + ++it; + return it; +} + +template +ChannelIterator end(ChannelObject* chan) { + return {nullptr, T()}; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/commit.h b/paddle/fluid/framework/commit.h new file mode 100644 index 00000000..343bf82f --- /dev/null +++ b/paddle/fluid/framework/commit.h @@ -0,0 +1,21 @@ +#pragma once + +#include + +namespace paddle { +namespace framework { + +static std::string paddle_commit() { + return "95c1816ec0"; +} + +static std::string paddle_compile_branch() { + return "develop"; +} + +static std::string paddle_version() { + return "0.0.0"; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/commit.h.in b/paddle/fluid/framework/commit.h.in new file mode 100644 index 00000000..3a33ece6 --- /dev/null +++ b/paddle/fluid/framework/commit.h.in @@ -0,0 +1,21 @@ +#pragma once + +#include + +namespace paddle { +namespace framework { + +static std::string paddle_commit() { + return "@PADDLE_COMMIT@"; +} + +static std::string paddle_compile_branch() { + return "@PADDLE_BRANCH@"; +} + +static std::string paddle_version() { + return "@PADDLE_VERSION@"; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_device_transform.cc b/paddle/fluid/framework/data_device_transform.cc new file mode 100644 index 00000000..fee6ba40 --- /dev/null +++ b/paddle/fluid/framework/data_device_transform.cc @@ -0,0 +1,45 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/data_device_transform.h" + +namespace paddle { +namespace framework { + +void TransDataDevice(const Tensor &in, const platform::Place &dst_place, + Tensor *out) { + VLOG(3) << "DeviceTransform in, src_place " << in.place() + << " dst_place: " << dst_place; + + PADDLE_ENFORCE_NE( + in.place().which(), dst_place.which(), + "Currently, model parallelism is only supported between CPU and CUDA"); + + // NOTE(yy): TransDataDevice should wait for computation of input. + platform::DeviceContextPool::Instance().Get(in.place())->Wait(); + platform::DeviceContextPool::Instance().Get(dst_place)->Wait(); + + // FIXME(zcd): TransDataDevice is used to transform data from GPU to CPU and + // the enforced checkings have been done in GetDeviceContext, so the + // `dev_ctx->Wait()` is necessary. But `dev_ctx->Wait()` will make the program + // slow, especially when the number of elements is little, for example, + // the elements of learning rate are one and it's CPU side. + // One solution is to use a CUDA kernel to complete the copy operation when + // the transforming is from CPU to GPU and the number of elements is little. + // But the embarrassment is that this solution this solution makes training + // slower. + TensorCopySync(in, dst_place, out); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_device_transform.h b/paddle/fluid/framework/data_device_transform.h new file mode 100644 index 00000000..8ff97646 --- /dev/null +++ b/paddle/fluid/framework/data_device_transform.h @@ -0,0 +1,28 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { + +void TransDataDevice(const Tensor& in, const platform::Place& dst_place, + Tensor* out); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu new file mode 100644 index 00000000..96a2f925 --- /dev/null +++ b/paddle/fluid/framework/data_device_transform_test.cu @@ -0,0 +1,169 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/init.h" + +namespace paddle { +namespace framework { + +template +struct AddFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a + b; } +}; + +class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("input", "input1 of test op"); + AddOutput("output", "output of test op"); + AddAttr("use_gpu", "force to use gpu kernel").SetDefault(false); + AddComment("This is test op"); + } +}; + +class TestOpWithKernel : public OperatorWithKernel { + public: + using OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override {} + OpKernelType GetExpectedKernelType( + const ExecutionContext& ctx) const override { + if (Attr("use_gpu")) { + VLOG(3) << "force use gpu kernel"; + return OpKernelType(proto::VarType::FP32, platform::CUDAPlace(0)); + } else { + VLOG(3) << "use default kernel"; + return OpKernelType(proto::VarType::FP32, + ctx.Input("input")->place()); + } + } +}; + +template +class TestKernel : public OpKernel { + public: + void Compute(const ExecutionContext& ctx) const { + std::cout << ctx.op().DebugString() << std::endl; + + const Tensor* input = ctx.Input("input"); + + std::cout << "input place:" << input->place() << std::endl; + auto* output = ctx.Output("output"); + output->Resize(input->dims()); + output->mutable_data(ctx.GetPlace()); + + operators::TransformFunctor, T, DeviceContext> functor( + input, input, output, ctx.template device_context(), + AddFunctor()); + functor.Run(); + } +}; + +} // namespace framework +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT( + test_op, paddle::framework::TestOpWithKernel, + paddle::framework::OpKernelTestProtoAndCheckerMaker); +REGISTER_OP_CPU_KERNEL( + test_op, + paddle::framework::TestKernel); +REGISTER_OP_CUDA_KERNEL( + test_op, + paddle::framework::TestKernel); + +static void BuildVar(const std::string& param_name, + std::initializer_list arguments, + paddle::framework::proto::OpDesc::Var* var) { + var->set_parameter(param_name); + for (auto& arg_name : arguments) { + *var->mutable_arguments()->Add() = arg_name; + } +} + +TEST(Operator, CPUtoGPU) { + paddle::framework::InitDevices(true); + + paddle::framework::Scope scope; + paddle::platform::CPUPlace cpu_place; + + // create an op to run on CPU + paddle::framework::proto::OpDesc cpu_op_desc; + cpu_op_desc.set_type("test_op"); + BuildVar("input", {"IN1"}, cpu_op_desc.add_inputs()); + BuildVar("output", {"OUT1"}, cpu_op_desc.add_outputs()); + + auto cpu_op = paddle::framework::OpRegistry::CreateOp(cpu_op_desc); + // prepare input + auto* in_t = scope.Var("IN1")->GetMutable(); + auto* src_ptr = + in_t->mutable_data({2, 3}, paddle::platform::CPUPlace()); + for (int i = 0; i < 2 * 3; ++i) { + src_ptr[i] = static_cast(i); + } + + // get output + auto* output = scope.Var("OUT1"); + cpu_op->Run(scope, cpu_place); + + auto* output_ptr = output->Get().data(); + for (int i = 0; i < 2 * 3; ++i) { + ASSERT_EQ(output_ptr[i], static_cast(i) * 2); + } + + // create an op to run on GPU + paddle::framework::proto::OpDesc gpu_op_desc; + gpu_op_desc.set_type("test_op"); + BuildVar("input", {"OUT1"}, gpu_op_desc.add_inputs()); + BuildVar("output", {"OUT2"}, gpu_op_desc.add_outputs()); + + auto attr = gpu_op_desc.mutable_attrs()->Add(); + attr->set_name("use_gpu"); + attr->set_type(paddle::framework::proto::AttrType::BOOLEAN); + attr->set_b(true); + + auto gpu_op = paddle::framework::OpRegistry::CreateOp(gpu_op_desc); + + paddle::platform::CUDAPlace cuda_place(0); + // get output + auto* output2 = scope.Var("OUT2"); + gpu_op->Run(scope, cuda_place); + VLOG(3) << "after gpu_op run"; + + // auto* output2_ptr = output2->Get().data(); + paddle::platform::DeviceContextPool& pool = + paddle::platform::DeviceContextPool::Instance(); + auto dev_ctx = pool.Get(cuda_place); + + paddle::framework::Tensor output_tensor; + paddle::framework::TensorCopy(output2->Get(), + paddle::platform::CPUPlace(), *dev_ctx, + &output_tensor); + + dev_ctx->Wait(); + float* output2_ptr = output_tensor.data(); + for (int i = 0; i < 2 * 3; ++i) { + ASSERT_EQ(output2_ptr[i], static_cast(i) * 4); + } +} diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc new file mode 100644 index 00000000..ed94e30e --- /dev/null +++ b/paddle/fluid/framework/data_feed.cc @@ -0,0 +1,1100 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#if defined _WIN32 || defined __APPLE__ +#else +#define _LINUX +#endif + +#include "paddle/fluid/framework/data_feed.h" +#ifdef _LINUX +#include +#include +#include +#include +#endif +#include +#include "gflags/gflags.h" +#include "google/protobuf/io/zero_copy_stream_impl.h" +#include "google/protobuf/message.h" +#include "google/protobuf/text_format.h" +#include "io/fs.h" +#include "io/shell.h" +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/platform/timer.h" + +namespace paddle { +namespace framework { + +void DataFeed::AddFeedVar(Variable* var, const std::string& name) { + CheckInit(); + for (size_t i = 0; i < use_slots_.size(); ++i) { + if (name == use_slots_[i]) { + if (var == nullptr) { + feed_vec_[i] = nullptr; + } else { + feed_vec_[i] = var->GetMutable(); + } + } + } +} + +bool DataFeed::SetFileList(const std::vector& files) { + std::unique_lock lock(*mutex_for_pick_file_); + CheckInit(); + // Do not set finish_set_filelist_ flag, + // since a user may set file many times after init reader + filelist_.assign(files.begin(), files.end()); + + finish_set_filelist_ = true; + return true; +} + +void DataFeed::SetBatchSize(int batch_size) { + PADDLE_ENFORCE(batch_size > 0, "Illegal batch size: %d.", batch_size); + default_batch_size_ = batch_size; +} + +bool DataFeed::PickOneFile(std::string* filename) { + PADDLE_ENFORCE(mutex_for_pick_file_ != nullptr, + "should call SetFileListMutex before PickOneFile"); + PADDLE_ENFORCE(file_idx_ != nullptr, + "should call SetFileListIndex before PickOneFile"); + std::unique_lock lock(*mutex_for_pick_file_); + if (*file_idx_ == filelist_.size()) { + VLOG(3) << "DataFeed::PickOneFile no more file to pick"; + return false; + } + VLOG(3) << "file_idx_=" << *file_idx_; + *filename = filelist_[(*file_idx_)++]; + return true; +} + +void DataFeed::CheckInit() { + PADDLE_ENFORCE(finish_init_, "Initialization did not succeed."); +} + +void DataFeed::CheckSetFileList() { + PADDLE_ENFORCE(finish_set_filelist_, "Set filelist did not succeed."); +} + +void DataFeed::CheckStart() { + PADDLE_ENFORCE(finish_start_, "Datafeed has not started running yet."); +} + +void DataFeed::AssignFeedVar(const Scope& scope) { + CheckInit(); + for (size_t i = 0; i < use_slots_.size(); ++i) { + feed_vec_[i] = scope.FindVar(use_slots_[i])->GetMutable(); + } +} + +template +void PrivateQueueDataFeed::SetQueueSize(int queue_size) { + PADDLE_ENFORCE(queue_size > 0, "Illegal queue size: %d.", queue_size); + queue_size_ = queue_size; + queue_ = paddle::framework::MakeChannel(); +} + +template +bool PrivateQueueDataFeed::Start() { + CheckSetFileList(); + read_thread_ = std::thread(&PrivateQueueDataFeed::ReadThread, this); + read_thread_.detach(); + + finish_start_ = true; + return true; +} + +template +void PrivateQueueDataFeed::ReadThread() { +#ifdef _LINUX + std::string filename; + while (PickOneFile(&filename)) { + int err_no = 0; + fp_ = fs_open_read(filename, &err_no, pipe_command_); + __fsetlocking(&*fp_, FSETLOCKING_BYCALLER); + T instance; + while (ParseOneInstanceFromPipe(&instance)) { + queue_->Put(instance); + } + } + queue_->Close(); +#endif +} + +template +int PrivateQueueDataFeed::Next() { +#ifdef _LINUX + CheckStart(); + int index = 0; + T ins_vec; + while (index < default_batch_size_) { + T instance; + if (!queue_->Get(instance)) { + break; + } + AddInstanceToInsVec(&ins_vec, instance, index++); + } + batch_size_ = index; + if (batch_size_ != 0) { + PutToFeedVec(ins_vec); + } + return batch_size_; +#else + return 0; +#endif +} + +// explicit instantiation +template class PrivateQueueDataFeed>; + +template +InMemoryDataFeed::InMemoryDataFeed() { + this->file_idx_ = nullptr; + this->mutex_for_pick_file_ = nullptr; + this->fp_ = nullptr; + this->thread_id_ = 0; + this->thread_num_ = 1; + this->parse_ins_id_ = false; + this->input_channel_ = nullptr; + this->output_channel_ = nullptr; + this->consume_channel_ = nullptr; +} + +template +bool InMemoryDataFeed::Start() { +#ifdef _LINUX + this->CheckSetFileList(); + if (output_channel_->Size() == 0 && input_channel_->Size() != 0) { + std::vector data; + input_channel_->Read(data); + output_channel_->Write(std::move(data)); + } +#endif + this->finish_start_ = true; + return true; +} + +template +int InMemoryDataFeed::Next() { +#ifdef _LINUX + this->CheckStart(); + CHECK(output_channel_ != nullptr); + CHECK(consume_channel_ != nullptr); + VLOG(3) << "output_channel_ size=" << output_channel_->Size() + << ", consume_channel_ size=" << consume_channel_->Size() + << ", thread_id=" << thread_id_; + int index = 0; + T instance; + std::vector ins_vec; + ins_vec.reserve(this->default_batch_size_); + while (index < this->default_batch_size_) { + if (output_channel_->Size() == 0) { + break; + } + output_channel_->Get(instance); + ins_vec.push_back(instance); + ++index; + consume_channel_->Put(std::move(instance)); + } + this->batch_size_ = index; + VLOG(3) << "batch_size_=" << this->batch_size_ + << ", thread_id=" << thread_id_; + if (this->batch_size_ != 0) { + PutToFeedVec(ins_vec); + } else { + VLOG(3) << "finish reading, output_channel_ size=" + << output_channel_->Size() + << ", consume_channel_ size=" << consume_channel_->Size() + << ", thread_id=" << thread_id_; + } + return this->batch_size_; +#else + return 0; +#endif +} + +template +void InMemoryDataFeed::SetInputChannel(void* channel) { + input_channel_ = static_cast*>(channel); +} + +template +void InMemoryDataFeed::SetOutputChannel(void* channel) { + output_channel_ = static_cast*>(channel); +} + +template +void InMemoryDataFeed::SetConsumeChannel(void* channel) { + consume_channel_ = static_cast*>(channel); +} + +template +void InMemoryDataFeed::SetThreadId(int thread_id) { + thread_id_ = thread_id; +} + +template +void InMemoryDataFeed::SetThreadNum(int thread_num) { + thread_num_ = thread_num; +} + +template +void InMemoryDataFeed::SetParseInsId(bool parse_ins_id) { + parse_ins_id_ = parse_ins_id; +} + +template +void InMemoryDataFeed::LoadIntoMemory() { +#ifdef _LINUX + VLOG(3) << "LoadIntoMemory() begin, thread_id=" << thread_id_; + std::string filename; + while (this->PickOneFile(&filename)) { + VLOG(3) << "PickOneFile, filename=" << filename + << ", thread_id=" << thread_id_; + int err_no = 0; + this->fp_ = fs_open_read(filename, &err_no, this->pipe_command_); + CHECK(this->fp_ != nullptr); + __fsetlocking(&*(this->fp_), FSETLOCKING_BYCALLER); + paddle::framework::ChannelWriter writer(input_channel_); + T instance; + platform::Timer timeline; + timeline.Start(); + while (ParseOneInstanceFromPipe(&instance)) { + writer << std::move(instance); + instance = T(); + } + writer.Flush(); + timeline.Pause(); + VLOG(3) << "LoadIntoMemory() read all lines, file=" << filename + << ", cost time=" << timeline.ElapsedSec() + << " seconds, thread_id=" << thread_id_; + } + VLOG(3) << "LoadIntoMemory() end, thread_id=" << thread_id_; +#endif +} + +// explicit instantiation +template class InMemoryDataFeed; + +void MultiSlotDataFeed::Init( + const paddle::framework::DataFeedDesc& data_feed_desc) { + finish_init_ = false; + finish_set_filelist_ = false; + finish_start_ = false; + + PADDLE_ENFORCE(data_feed_desc.has_multi_slot_desc(), + "Multi_slot_desc has not been set."); + paddle::framework::MultiSlotDesc multi_slot_desc = + data_feed_desc.multi_slot_desc(); + SetBatchSize(data_feed_desc.batch_size()); + SetQueueSize(data_feed_desc.batch_size()); + size_t all_slot_num = multi_slot_desc.slots_size(); + all_slots_.resize(all_slot_num); + all_slots_type_.resize(all_slot_num); + use_slots_index_.resize(all_slot_num); + total_dims_without_inductive_.resize(all_slot_num); + inductive_shape_index_.resize(all_slot_num); + use_slots_.clear(); + use_slots_is_dense_.clear(); + for (size_t i = 0; i < all_slot_num; ++i) { + const auto& slot = multi_slot_desc.slots(i); + all_slots_[i] = slot.name(); + all_slots_type_[i] = slot.type(); + use_slots_index_[i] = slot.is_used() ? use_slots_.size() : -1; + total_dims_without_inductive_[i] = 1; + inductive_shape_index_[i] = -1; + if (slot.is_used()) { + use_slots_.push_back(all_slots_[i]); + use_slots_is_dense_.push_back(slot.is_dense()); + std::vector local_shape; + if (slot.is_dense()) { + for (size_t j = 0; j < slot.shape_size(); ++j) { + if (slot.shape(j) > 0) { + total_dims_without_inductive_[i] *= slot.shape(j); + } + if (slot.shape(j) == -1) { + inductive_shape_index_[i] = j; + } + } + } + for (size_t j = 0; j < slot.shape_size(); ++j) { + local_shape.push_back(slot.shape(j)); + } + use_slots_shape_.push_back(local_shape); + } + } + feed_vec_.resize(use_slots_.size()); + pipe_command_ = data_feed_desc.pipe_command(); + finish_init_ = true; +} + +void MultiSlotDataFeed::ReadThread() { +#ifdef _LINUX + std::string filename; + while (PickOneFile(&filename)) { + int err_no = 0; + fp_ = fs_open_read(filename, &err_no, pipe_command_); + CHECK(fp_ != nullptr); + __fsetlocking(&*fp_, FSETLOCKING_BYCALLER); + std::vector instance; + int ins_num = 0; + while (ParseOneInstanceFromPipe(&instance)) { + ins_num++; + queue_->Put(instance); + } + VLOG(3) << "filename: " << filename << " inst num: " << ins_num; + } + queue_->Close(); +#endif +} + +bool MultiSlotDataFeed::CheckFile(const char* filename) { +#ifdef _LINUX + CheckInit(); // get info of slots + std::ifstream fin(filename); + if (!fin.good()) { + VLOG(1) << "error: open file<" << filename << "> fail"; + return false; + } + std::string line; + int instance_cout = 0; + std::string all_slots_alias = ""; + for (const auto& alias : all_slots_) { + all_slots_alias += alias + " "; + } + std::string use_slots_alias = ""; + for (const auto& alias : use_slots_) { + use_slots_alias += alias + " "; + } + VLOG(3) << "total slots num: " << all_slots_.size(); + VLOG(3) << "total slots alias: " << all_slots_alias; + VLOG(3) << "used slots num: " << use_slots_.size(); + VLOG(3) << "used slots alias: " << use_slots_alias; + while (getline(fin, line)) { + ++instance_cout; + const char* str = line.c_str(); + char* endptr = const_cast(str); + int len = line.length(); + for (size_t i = 0; i < all_slots_.size(); ++i) { + auto num = strtol(endptr, &endptr, 10); + if (num < 0) { + VLOG(0) << "error: the number of ids is a negative number: " << num; + VLOG(0) << "please check line<" << instance_cout << "> in file<" + << filename << ">"; + return false; + } else if (num == 0) { + VLOG(0) + << "error: the number of ids can not be zero, you need " + "padding it in data generator; or if there is something wrong" + " with the data, please check if the data contains unresolvable " + "characters."; + VLOG(0) << "please check line<" << instance_cout << "> in file<" + << filename << ">"; + return false; + } else if (errno == ERANGE || num > INT_MAX) { + VLOG(0) << "error: the number of ids greater than INT_MAX"; + VLOG(0) << "please check line<" << instance_cout << "> in file<" + << filename << ">"; + return false; + } + if (all_slots_type_[i] == "float") { + for (int i = 0; i < num; ++i) { + strtof(endptr, &endptr); + if (errno == ERANGE) { + VLOG(0) << "error: the value is out of the range of " + "representable values for float"; + VLOG(0) << "please check line<" << instance_cout << "> in file<" + << filename << ">"; + return false; + } + if (i + 1 != num && endptr - str == len) { + VLOG(0) << "error: there is a wrong with the number of ids."; + VLOG(0) << "please check line<" << instance_cout << "> in file<" + << filename << ">"; + return false; + } + } + } else if (all_slots_type_[i] == "uint64") { + for (int i = 0; i < num; ++i) { + strtoull(endptr, &endptr, 10); + if (errno == ERANGE) { + VLOG(0) << "error: the value is out of the range of " + "representable values for uint64_t"; + VLOG(0) << "please check line<" << instance_cout << "> in file<" + << filename << ">"; + return false; + } + if (i + 1 != num && endptr - str == len) { + VLOG(0) << "error: there is a wrong with the number of ids."; + VLOG(0) << "please check line<" << instance_cout << "> in file<" + << filename << ">"; + return false; + } + } + } else { + VLOG(0) << "error: this type<" << all_slots_type_[i] + << "> is not supported"; + return false; + } + } + // It may be added '\t' character to the end of the output of reduce + // task when processes data by Hadoop(when the output of the reduce + // task of Hadoop has only one field, it will add a '\t' at the end + // of the line by default, and you can use this option to avoid it: + // `-D mapred.textoutputformat.ignoreseparator=true`), which does + // not affect the correctness of the data. Therefore, it should be + // judged that the data is not normal when the end of each line of + // data contains characters which are not spaces. + while (endptr - str != len) { + if (!isspace(*(endptr++))) { + VLOG(0) + << "error: there is some extra characters at the end of the line."; + VLOG(0) << "please check line<" << instance_cout << "> in file<" + << filename << ">"; + return false; + } + } + } + VLOG(3) << "instances cout: " << instance_cout; + VLOG(3) << "The file format is correct"; +#endif + return true; +} + +bool MultiSlotDataFeed::ParseOneInstanceFromPipe( + std::vector* instance) { +#ifdef _LINUX + thread_local string::LineFileReader reader; + + if (!reader.getline(&*(fp_.get()))) { + return false; + } else { + int use_slots_num = use_slots_.size(); + instance->resize(use_slots_num); + + const char* str = reader.get(); + std::string line = std::string(str); + // VLOG(3) << line; + char* endptr = const_cast(str); + int pos = 0; + for (size_t i = 0; i < use_slots_index_.size(); ++i) { + int idx = use_slots_index_[i]; + int num = strtol(&str[pos], &endptr, 10); + PADDLE_ENFORCE( + num, + "The number of ids can not be zero, you need padding " + "it in data generator; or if there is something wrong with " + "the data, please check if the data contains unresolvable " + "characters.\nplease check this error line: %s", + str); + if (idx != -1) { + (*instance)[idx].Init(all_slots_type_[i]); + if ((*instance)[idx].GetType()[0] == 'f') { // float + for (int j = 0; j < num; ++j) { + float feasign = strtof(endptr, &endptr); + (*instance)[idx].AddValue(feasign); + } + } else if ((*instance)[idx].GetType()[0] == 'u') { // uint64 + for (int j = 0; j < num; ++j) { + uint64_t feasign = (uint64_t)strtoull(endptr, &endptr, 10); + (*instance)[idx].AddValue(feasign); + } + } + pos = endptr - str; + } else { + for (int j = 0; j <= num; ++j) { + // pos = line.find_first_of(' ', pos + 1); + while (line[pos + 1] != ' ') { + pos++; + } + } + } + } + return true; + } +#else + return true; +#endif +} + +bool MultiSlotDataFeed::ParseOneInstance(std::vector* instance) { +#ifdef _LINUX + std::string line; + if (getline(file_, line)) { + int use_slots_num = use_slots_.size(); + instance->resize(use_slots_num); + // parse line + const char* str = line.c_str(); + char* endptr = const_cast(str); + int pos = 0; + for (size_t i = 0; i < use_slots_index_.size(); ++i) { + int idx = use_slots_index_[i]; + int num = strtol(&str[pos], &endptr, 10); + PADDLE_ENFORCE( + num, + "The number of ids can not be zero, you need padding " + "it in data generator; or if there is something wrong with " + "the data, please check if the data contains unresolvable " + "characters.\nplease check this error line: %s", + str); + + if (idx != -1) { + (*instance)[idx].Init(all_slots_type_[i]); + if ((*instance)[idx].GetType()[0] == 'f') { // float + for (int j = 0; j < num; ++j) { + float feasign = strtof(endptr, &endptr); + (*instance)[idx].AddValue(feasign); + } + } else if ((*instance)[idx].GetType()[0] == 'u') { // uint64 + for (int j = 0; j < num; ++j) { + uint64_t feasign = (uint64_t)strtoull(endptr, &endptr, 10); + (*instance)[idx].AddValue(feasign); + } + } + pos = endptr - str; + } else { + for (int j = 0; j <= num; ++j) { + pos = line.find_first_of(' ', pos + 1); + } + } + } + } else { + return false; + } +#endif + return false; +} + +void MultiSlotDataFeed::AddInstanceToInsVec( + std::vector* ins_vec, + const std::vector& instance, int index) { +#ifdef _LINUX + if (index == 0) { + ins_vec->resize(instance.size()); + for (size_t i = 0; i < instance.size(); ++i) { + (*ins_vec)[i].Init(instance[i].GetType()); + (*ins_vec)[i].InitOffset(); + } + } + + for (size_t i = 0; i < instance.size(); ++i) { + (*ins_vec)[i].AddIns(instance[i]); + } +#endif +} + +void MultiSlotDataFeed::PutToFeedVec( + const std::vector& ins_vec) { +#ifdef _LINUX + for (size_t i = 0; i < use_slots_.size(); ++i) { + if (feed_vec_[i] == nullptr) { + continue; + } + const auto& type = ins_vec[i].GetType(); + const auto& offset = ins_vec[i].GetOffset(); + int total_instance = static_cast(offset.back()); + + if (type[0] == 'f') { // float + const auto& feasign = ins_vec[i].GetFloatData(); + float* tensor_ptr = feed_vec_[i]->mutable_data( + {total_instance, 1}, platform::CPUPlace()); + memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float)); + } else if (type[0] == 'u') { // uint64 + // no uint64_t type in paddlepaddle + const auto& feasign = ins_vec[i].GetUint64Data(); + int64_t* tensor_ptr = feed_vec_[i]->mutable_data( + {total_instance, 1}, platform::CPUPlace()); + memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t)); + } + + LoD data_lod{offset}; + feed_vec_[i]->set_lod(data_lod); + if (use_slots_is_dense_[i]) { + if (inductive_shape_index_[i] != -1) { + use_slots_shape_[i][inductive_shape_index_[i]] = + total_instance / total_dims_without_inductive_[i]; + } + feed_vec_[i]->Resize(framework::make_ddim(use_slots_shape_[i])); + } + } +#endif +} + +void MultiSlotInMemoryDataFeed::Init( + const paddle::framework::DataFeedDesc& data_feed_desc) { + finish_init_ = false; + finish_set_filelist_ = false; + finish_start_ = false; + + PADDLE_ENFORCE(data_feed_desc.has_multi_slot_desc(), + "Multi_slot_desc has not been set."); + paddle::framework::MultiSlotDesc multi_slot_desc = + data_feed_desc.multi_slot_desc(); + SetBatchSize(data_feed_desc.batch_size()); + size_t all_slot_num = multi_slot_desc.slots_size(); + all_slots_.resize(all_slot_num); + all_slots_type_.resize(all_slot_num); + use_slots_index_.resize(all_slot_num); + total_dims_without_inductive_.resize(all_slot_num); + inductive_shape_index_.resize(all_slot_num); + use_slots_.clear(); + use_slots_is_dense_.clear(); + for (size_t i = 0; i < all_slot_num; ++i) { + const auto& slot = multi_slot_desc.slots(i); + all_slots_[i] = slot.name(); + all_slots_type_[i] = slot.type(); + use_slots_index_[i] = slot.is_used() ? use_slots_.size() : -1; + total_dims_without_inductive_[i] = 1; + inductive_shape_index_[i] = -1; + if (slot.is_used()) { + use_slots_.push_back(all_slots_[i]); + use_slots_is_dense_.push_back(slot.is_dense()); + std::vector local_shape; + if (slot.is_dense()) { + for (size_t j = 0; j < slot.shape_size(); ++j) { + if (slot.shape(j) > 0) { + total_dims_without_inductive_[i] *= slot.shape(j); + } + if (slot.shape(j) == -1) { + inductive_shape_index_[i] = j; + } + } + } + for (size_t j = 0; j < slot.shape_size(); ++j) { + local_shape.push_back(slot.shape(j)); + } + use_slots_shape_.push_back(local_shape); + } + } + feed_vec_.resize(use_slots_.size()); + pipe_command_ = data_feed_desc.pipe_command(); + finish_init_ = true; +} + +bool MultiSlotInMemoryDataFeed::ParseOneInstanceFromPipe(Record* instance) { +#ifdef _LINUX + thread_local string::LineFileReader reader; + + if (!reader.getline(&*(fp_.get()))) { + return false; + } else { + const char* str = reader.get(); + std::string line = std::string(str); + // VLOG(3) << line; + char* endptr = const_cast(str); + int pos = 0; + if (parse_ins_id_) { + int num = strtol(&str[pos], &endptr, 10); + CHECK(num == 1); // NOLINT + pos = endptr - str + 1; + size_t len = 0; + while (str[pos + len] != ' ') { + ++len; + } + instance->ins_id_ = std::string(str + pos, len); + pos += len + 1; + VLOG(3) << "ins_id " << instance->ins_id_; + } + for (size_t i = 0; i < use_slots_index_.size(); ++i) { + int idx = use_slots_index_[i]; + int num = strtol(&str[pos], &endptr, 10); + PADDLE_ENFORCE( + num, + "The number of ids can not be zero, you need padding " + "it in data generator; or if there is something wrong with " + "the data, please check if the data contains unresolvable " + "characters.\nplease check this error line: %s", + str); + if (idx != -1) { + if (all_slots_type_[i][0] == 'f') { // float + for (int j = 0; j < num; ++j) { + float feasign = strtof(endptr, &endptr); + // if float feasign is equal to zero, ignore it + // except when slot is dense + if (fabs(feasign) < 1e-6 && !use_slots_is_dense_[i]) { + continue; + } + FeatureKey f; + f.float_feasign_ = feasign; + instance->float_feasigns_.push_back(FeatureItem(f, idx)); + } + } else if (all_slots_type_[i][0] == 'u') { // uint64 + for (int j = 0; j < num; ++j) { + uint64_t feasign = (uint64_t)strtoull(endptr, &endptr, 10); + // if uint64 feasign is equal to zero, ignore it + // except when slot is dense + if (feasign == 0 && !use_slots_is_dense_[i]) { + continue; + } + FeatureKey f; + f.uint64_feasign_ = feasign; + instance->uint64_feasigns_.push_back(FeatureItem(f, idx)); + } + } + pos = endptr - str; + } else { + for (int j = 0; j <= num; ++j) { + // pos = line.find_first_of(' ', pos + 1); + while (line[pos + 1] != ' ') { + pos++; + } + } + } + } + instance->float_feasigns_.shrink_to_fit(); + instance->uint64_feasigns_.shrink_to_fit(); + return true; + } +#else + return false; +#endif +} + +bool MultiSlotInMemoryDataFeed::ParseOneInstance(Record* instance) { +#ifdef _LINUX + std::string line; + if (getline(file_, line)) { + VLOG(3) << line; + // parse line + const char* str = line.c_str(); + char* endptr = const_cast(str); + int pos = 0; + for (size_t i = 0; i < use_slots_index_.size(); ++i) { + int idx = use_slots_index_[i]; + int num = strtol(&str[pos], &endptr, 10); + PADDLE_ENFORCE( + num, + "The number of ids can not be zero, you need padding " + "it in data generator; or if there is something wrong with " + "the data, please check if the data contains unresolvable " + "characters.\nplease check this error line: %s", + str); + + if (idx != -1) { + if (all_slots_type_[i][0] == 'f') { // float + for (int j = 0; j < num; ++j) { + float feasign = strtof(endptr, &endptr); + if (fabs(feasign) < 1e-6) { + continue; + } + FeatureKey f; + f.float_feasign_ = feasign; + instance->float_feasigns_.push_back(FeatureItem(f, idx)); + } + } else if (all_slots_type_[i][0] == 'u') { // uint64 + for (int j = 0; j < num; ++j) { + uint64_t feasign = (uint64_t)strtoull(endptr, &endptr, 10); + if (feasign == 0) { + continue; + } + FeatureKey f; + f.uint64_feasign_ = feasign; + instance->uint64_feasigns_.push_back(FeatureItem(f, idx)); + } + } + pos = endptr - str; + } else { + for (int j = 0; j <= num; ++j) { + pos = line.find_first_of(' ', pos + 1); + } + } + } + instance->float_feasigns_.shrink_to_fit(); + instance->uint64_feasigns_.shrink_to_fit(); + return true; + } else { + return false; + } +#endif + return false; +} + +void MultiSlotInMemoryDataFeed::PutToFeedVec( + const std::vector& ins_vec) { +#ifdef _LINUX + std::vector> batch_float_feasigns(use_slots_.size(), + std::vector()); + std::vector> batch_uint64_feasigns( + use_slots_.size(), std::vector()); + std::vector> offset(use_slots_.size(), + std::vector{0}); + std::vector visit(use_slots_.size(), false); + for (size_t i = 0; i < ins_vec.size(); ++i) { + auto& r = ins_vec[i]; + for (auto& item : r.float_feasigns_) { + batch_float_feasigns[item.slot()].push_back(item.sign().float_feasign_); + visit[item.slot()] = true; + } + for (auto& item : r.uint64_feasigns_) { + batch_uint64_feasigns[item.slot()].push_back(item.sign().uint64_feasign_); + visit[item.slot()] = true; + } + for (size_t j = 0; j < use_slots_.size(); ++j) { + const auto& type = all_slots_type_[j]; + if (visit[j]) { + visit[j] = false; + } else { + // fill slot value with default value 0 + if (type[0] == 'f') { // float + batch_float_feasigns[j].push_back(0.0); + } else if (type[0] == 'u') { // uint64 + batch_uint64_feasigns[j].push_back(0); + } + } + // get offset of this ins in this slot + if (type[0] == 'f') { // float + offset[j].push_back(batch_float_feasigns[j].size()); + } else if (type[0] == 'u') { // uint64 + offset[j].push_back(batch_uint64_feasigns[j].size()); + } + } + } + + for (size_t i = 0; i < use_slots_.size(); ++i) { + if (feed_vec_[i] == nullptr) { + continue; + } + int total_instance = offset[i].back(); + const auto& type = all_slots_type_[i]; + if (type[0] == 'f') { // float + float* feasign = batch_float_feasigns[i].data(); + float* tensor_ptr = feed_vec_[i]->mutable_data( + {total_instance, 1}, platform::CPUPlace()); + memcpy(tensor_ptr, feasign, total_instance * sizeof(float)); + } else if (type[0] == 'u') { // uint64 + // no uint64_t type in paddlepaddle + uint64_t* feasign = batch_uint64_feasigns[i].data(); + int64_t* tensor_ptr = feed_vec_[i]->mutable_data( + {total_instance, 1}, platform::CPUPlace()); + memcpy(tensor_ptr, feasign, total_instance * sizeof(int64_t)); + } + auto& slot_offset = offset[i]; + LoD data_lod{slot_offset}; + feed_vec_[i]->set_lod(data_lod); + if (use_slots_is_dense_[i]) { + if (inductive_shape_index_[i] != -1) { + use_slots_shape_[i][inductive_shape_index_[i]] = + total_instance / total_dims_without_inductive_[i]; + } + feed_vec_[i]->Resize(framework::make_ddim(use_slots_shape_[i])); + } + } +#endif +} + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +template +void PrivateInstantDataFeed::PutToFeedVec() { + for (size_t i = 0; i < use_slots_.size(); ++i) { + const auto& type = ins_vec_[i].GetType(); + const auto& offset = ins_vec_[i].GetOffset(); + int total_instance = static_cast(offset.back()); + + if (type[0] == 'f') { // float + const auto& feasign = ins_vec_[i].GetFloatData(); + float* tensor_ptr = feed_vec_[i]->mutable_data( + {total_instance, 1}, platform::CPUPlace()); + memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float)); + } else if (type[0] == 'u') { // uint64 + // no uint64_t type in paddlepaddle + const auto& feasign = ins_vec_[i].GetUint64Data(); + int64_t* tensor_ptr = feed_vec_[i]->mutable_data( + {total_instance, 1}, platform::CPUPlace()); + memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t)); + } + + LoD data_lod{offset}; + feed_vec_[i]->set_lod(data_lod); + if (use_slots_is_dense_[i]) { + int64_t total_dims = 1; + for (const auto e : use_slots_shape_[i]) { + total_dims *= e; + } + PADDLE_ENFORCE( + total_dims == total_instance, + "The actual data size of slot[%s] doesn't match its declaration", + use_slots_[i].c_str()); + feed_vec_[i]->Resize(framework::make_ddim(use_slots_shape_[i])); + } + } +} + +template +int PrivateInstantDataFeed::Next() { + if (ParseOneMiniBatch()) { + PutToFeedVec(); + return ins_vec_[0].GetBatchSize(); + } + Postprocess(); + + std::string filename; + if (!PickOneFile(&filename)) { + return -1; + } + if (!Preprocess(filename)) { + return -1; + } + + PADDLE_ENFORCE(true == ParseOneMiniBatch(), "Fail to parse mini-batch data"); + PutToFeedVec(); + return ins_vec_[0].GetBatchSize(); +} + +template +void PrivateInstantDataFeed::Init(const DataFeedDesc& data_feed_desc) { + finish_init_ = false; + finish_set_filelist_ = false; + finish_start_ = false; + + PADDLE_ENFORCE(data_feed_desc.has_multi_slot_desc(), + "Multi_slot_desc has not been set."); + paddle::framework::MultiSlotDesc multi_slot_desc = + data_feed_desc.multi_slot_desc(); + SetBatchSize(data_feed_desc.batch_size()); + size_t all_slot_num = multi_slot_desc.slots_size(); + all_slots_.resize(all_slot_num); + all_slots_type_.resize(all_slot_num); + use_slots_index_.resize(all_slot_num); + multi_inductive_shape_index_.resize(all_slot_num); + use_slots_.clear(); + use_slots_is_dense_.clear(); + for (size_t i = 0; i < all_slot_num; ++i) { + const auto& slot = multi_slot_desc.slots(i); + all_slots_[i] = slot.name(); + all_slots_type_[i] = slot.type(); + use_slots_index_[i] = slot.is_used() ? use_slots_.size() : -1; + if (slot.is_used()) { + use_slots_.push_back(all_slots_[i]); + use_slots_is_dense_.push_back(slot.is_dense()); + std::vector local_shape; + if (slot.is_dense()) { + for (size_t j = 0; j < slot.shape_size(); ++j) { + if (slot.shape(j) == -1) { + multi_inductive_shape_index_[i].push_back(j); + } + } + } + for (size_t j = 0; j < slot.shape_size(); ++j) { + local_shape.push_back(slot.shape(j)); + } + use_slots_shape_.push_back(local_shape); + } + } + feed_vec_.resize(use_slots_.size()); + ins_vec_.resize(use_slots_.size()); + + finish_init_ = true; +} + +template class PrivateInstantDataFeed>; + +bool MultiSlotFileInstantDataFeed::Preprocess(const std::string& filename) { + fd_ = open(filename.c_str(), O_RDONLY); + PADDLE_ENFORCE(fd_ != -1, "Fail to open file: %s", filename.c_str()); + + struct stat sb; + fstat(fd_, &sb); + end_ = static_cast(sb.st_size); + + buffer_ = + reinterpret_cast(mmap(NULL, end_, PROT_READ, MAP_PRIVATE, fd_, 0)); + PADDLE_ENFORCE(buffer_ != MAP_FAILED, strerror(errno)); + + offset_ = 0; + return true; +} + +bool MultiSlotFileInstantDataFeed::Postprocess() { + if (buffer_ != nullptr) { + munmap(buffer_, end_); + buffer_ = nullptr; + } + if (fd_ != -1) { + close(fd_); + fd_ = -1; + end_ = 0; + offset_ = 0; + } + return true; +} + +bool MultiSlotFileInstantDataFeed::ParseOneMiniBatch() { + if (offset_ == end_) { + return false; + } + + batch_size_ = 0; + while (batch_size_ < default_batch_size_ && offset_ < end_) { + for (size_t i = 0; i < use_slots_index_.size(); ++i) { + int idx = use_slots_index_[i]; + char type = all_slots_type_[i][0]; + + uint16_t num = *reinterpret_cast(buffer_ + offset_); + PADDLE_ENFORCE( + num, + "The number of ids can not be zero, you need padding " + "it in data generator; or if there is something wrong with " + "the data, please check if the data contains unresolvable " + "characters."); + offset_ += sizeof(uint16_t); + + if (idx != -1) { + int inductive_size = multi_inductive_shape_index_[i].size(); + if (UNLIKELY(batch_size_ == 0)) { + ins_vec_[idx].Init(all_slots_type_[i], default_batch_size_ * num); + ins_vec_[idx].InitOffset(default_batch_size_); + uint64_t* inductive_shape = + reinterpret_cast(buffer_ + offset_); + for (int inductive_id = 0; inductive_id < inductive_size; + ++inductive_id) { + use_slots_shape_[i][multi_inductive_shape_index_[i][inductive_id]] = + static_cast(*(inductive_shape + inductive_id)); + } + } + num -= inductive_size; + offset_ += sizeof(uint64_t) * inductive_size; + + if (type == 'f') { + ins_vec_[idx].AppendValues( + reinterpret_cast(buffer_ + offset_), num); + offset_ += num * sizeof(float); + } else if (type == 'u') { + ins_vec_[idx].AppendValues( + reinterpret_cast(buffer_ + offset_), num); + offset_ += num * sizeof(uint64_t); + } + } else { + if (type == 'f') { + offset_ += num * sizeof(float); + } else if (type == 'u') { + offset_ += num * sizeof(uint64_t); + } + } + } + ++batch_size_; + // OPTIMIZE: It is better to insert check codes between instances for format + // checking + } + + PADDLE_ENFORCE(batch_size_ == default_batch_size_ || offset_ == end_, + "offset_ != end_"); + return true; +} +#endif + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h new file mode 100644 index 00000000..7164834c --- /dev/null +++ b/paddle/fluid/framework/data_feed.h @@ -0,0 +1,557 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#if defined _WIN32 || defined __APPLE__ +#else +#define _LINUX +#endif + +#include +#include // NOLINT +#include +#include // NOLINT +#include +#include +#include // NOLINT +#include +#include + +#include "paddle/fluid/framework/archive.h" +#include "paddle/fluid/framework/blocking_queue.h" +#include "paddle/fluid/framework/channel.h" +#include "paddle/fluid/framework/data_feed.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/string/string_helper.h" + +namespace paddle { +namespace framework { + +// DataFeed is the base virtual class for all ohther DataFeeds. +// It is used to read files and parse the data for subsequent trainer. +// Example: +// DataFeed* reader = +// paddle::framework::DataFeedFactory::CreateDataFeed(data_feed_name); +// reader->Init(data_feed_desc); // data_feed_desc is a protobuf object +// reader->SetFileList(filelist); +// const std::vector & use_slot_alias = +// reader->GetUseSlotAlias(); +// for (auto name: use_slot_alias){ // for binding memory +// reader->AddFeedVar(scope->Var(name), name); +// } +// reader->Start(); +// while (reader->Next()) { +// // trainer do something +// } +class DataFeed { + public: + DataFeed() { + mutex_for_pick_file_ = nullptr; + file_idx_ = nullptr; + } + virtual ~DataFeed() {} + virtual void Init(const DataFeedDesc& data_feed_desc) = 0; + virtual bool CheckFile(const char* filename) { + PADDLE_THROW("This function(CheckFile) is not implemented."); + } + // Set filelist for DataFeed. + // Pay attention that it must init all readers before call this function. + // Otherwise, Init() function will init finish_set_filelist_ flag. + virtual bool SetFileList(const std::vector& files); + virtual bool Start() = 0; + + // The trainer calls the Next() function, and the DataFeed will load a new + // batch to the feed_vec. The return value of this function is the batch + // size of the current batch. + virtual int Next() = 0; + // Get all slots' alias which defined in protofile + virtual const std::vector& GetAllSlotAlias() { + return all_slots_; + } + // Get used slots' alias which defined in protofile + virtual const std::vector& GetUseSlotAlias() { + return use_slots_; + } + // This function is used for binding feed_vec memory + virtual void AddFeedVar(Variable* var, const std::string& name); + + // This function is used for binding feed_vec memory in a given scope + virtual void AssignFeedVar(const Scope& scope); + + // This function will do nothing at default + virtual void SetInputChannel(void* channel) {} + // This function will do nothing at default + virtual void SetOutputChannel(void* channel) {} + // This function will do nothing at default + virtual void SetConsumeChannel(void* channel) {} + // This function will do nothing at default + virtual void SetThreadId(int thread_id) {} + // This function will do nothing at default + virtual void SetThreadNum(int thread_num) {} + // This function will do nothing at default + virtual void SetParseInsId(bool parse_ins_id) {} + virtual void SetFileListMutex(std::mutex* mutex) { + mutex_for_pick_file_ = mutex; + } + virtual void SetFileListIndex(size_t* file_index) { file_idx_ = file_index; } + virtual void LoadIntoMemory() { + PADDLE_THROW("This function(LoadIntoMemory) is not implemented."); + } + + protected: + // The following three functions are used to check if it is executed in this + // order: + // Init() -> SetFileList() -> Start() -> Next() + virtual void CheckInit(); + virtual void CheckSetFileList(); + virtual void CheckStart(); + virtual void SetBatchSize( + int batch); // batch size will be set in Init() function + // This function is used to pick one file from the global filelist(thread + // safe). + virtual bool PickOneFile(std::string* filename); + + std::vector filelist_; + size_t* file_idx_; + std::mutex* mutex_for_pick_file_; + + // the alias of used slots, and its order is determined by + // data_feed_desc(proto object) + std::vector use_slots_; + std::vector use_slots_is_dense_; + + // the alias of all slots, and its order is determined by data_feed_desc(proto + // object) + std::vector all_slots_; + std::vector all_slots_type_; + std::vector> use_slots_shape_; + std::vector inductive_shape_index_; + std::vector total_dims_without_inductive_; + // For the inductive shape passed within data + std::vector> multi_inductive_shape_index_; + std::vector + use_slots_index_; // -1: not used; >=0: the index of use_slots_ + + // The data read by DataFeed will be stored here + std::vector feed_vec_; + + // the batch size defined by user + int default_batch_size_; + // current batch size + int batch_size_; + + bool finish_init_; + bool finish_set_filelist_; + bool finish_start_; + std::string pipe_command_; +}; + +// PrivateQueueDataFeed is the base virtual class for ohther DataFeeds. +// It use a read-thread to read file and parse data to a private-queue +// (thread level), and get data from this queue when trainer call Next(). +template +class PrivateQueueDataFeed : public DataFeed { + public: + PrivateQueueDataFeed() {} + virtual ~PrivateQueueDataFeed() {} + virtual bool Start(); + virtual int Next(); + + protected: + // The thread implementation function for reading file and parse. + virtual void ReadThread(); + // This function is used to set private-queue size, and the most + // efficient when the queue size is close to the batch size. + virtual void SetQueueSize(int queue_size); + // The reading and parsing method called in the ReadThread. + virtual bool ParseOneInstance(T* instance) = 0; + virtual bool ParseOneInstanceFromPipe(T* instance) = 0; + // This function is used to put instance to vec_ins + virtual void AddInstanceToInsVec(T* vec_ins, const T& instance, + int index) = 0; + // This function is used to put ins_vec to feed_vec + virtual void PutToFeedVec(const T& ins_vec) = 0; + + // The thread for read files + std::thread read_thread_; + // using ifstream one line and one line parse is faster + // than using fread one buffer and one buffer parse. + // for a 601M real data: + // ifstream one line and one line parse: 6034 ms + // fread one buffer and one buffer parse: 7097 ms + std::ifstream file_; + std::shared_ptr fp_; + size_t queue_size_; + string::LineFileReader reader_; + // The queue for store parsed data + std::shared_ptr> queue_; +}; + +template +class InMemoryDataFeed : public DataFeed { + public: + InMemoryDataFeed(); + virtual ~InMemoryDataFeed() {} + virtual void Init(const DataFeedDesc& data_feed_desc) = 0; + virtual bool Start(); + virtual int Next(); + virtual void SetInputChannel(void* channel); + virtual void SetOutputChannel(void* channel); + virtual void SetConsumeChannel(void* channel); + virtual void SetThreadId(int thread_id); + virtual void SetThreadNum(int thread_num); + virtual void SetParseInsId(bool parse_ins_id); + virtual void LoadIntoMemory(); + + protected: + virtual bool ParseOneInstance(T* instance) = 0; + virtual bool ParseOneInstanceFromPipe(T* instance) = 0; + virtual void PutToFeedVec(const std::vector& ins_vec) = 0; + + int thread_id_; + int thread_num_; + bool parse_ins_id_; + std::ifstream file_; + std::shared_ptr fp_; + paddle::framework::ChannelObject* input_channel_; + paddle::framework::ChannelObject* output_channel_; + paddle::framework::ChannelObject* consume_channel_; +}; + +// This class define the data type of instance(ins_vec) in MultiSlotDataFeed +class MultiSlotType { + public: + MultiSlotType() {} + ~MultiSlotType() {} + void Init(const std::string& type, size_t reserved_size = 0) { + CheckType(type); + if (type_[0] == 'f') { + float_feasign_.clear(); + if (reserved_size) { + float_feasign_.reserve(reserved_size); + } + } else if (type_[0] == 'u') { + uint64_feasign_.clear(); + if (reserved_size) { + uint64_feasign_.reserve(reserved_size); + } + } + type_ = type; + } + void InitOffset(size_t max_batch_size = 0) { + if (max_batch_size > 0) { + offset_.reserve(max_batch_size + 1); + } + offset_.resize(1); + // LoDTensor' lod is counted from 0, the size of lod + // is one size larger than the size of data. + offset_[0] = 0; + } + const std::vector& GetOffset() const { return offset_; } + std::vector& MutableOffset() { return offset_; } + void AddValue(const float v) { + CheckFloat(); + float_feasign_.push_back(v); + } + void AddValue(const uint64_t v) { + CheckUint64(); + uint64_feasign_.push_back(v); + } + void CopyValues(const float* input, size_t size) { + CheckFloat(); + float_feasign_.resize(size); + memcpy(float_feasign_.data(), input, size * sizeof(float)); + } + void CopyValues(const uint64_t* input, size_t size) { + CheckUint64(); + uint64_feasign_.resize(size); + memcpy(uint64_feasign_.data(), input, size * sizeof(uint64_t)); + } + void AddIns(const MultiSlotType& ins) { + if (ins.GetType()[0] == 'f') { // float + CheckFloat(); + auto& vec = ins.GetFloatData(); + offset_.push_back(offset_.back() + vec.size()); + float_feasign_.insert(float_feasign_.end(), vec.begin(), vec.end()); + } else if (ins.GetType()[0] == 'u') { // uint64 + CheckUint64(); + auto& vec = ins.GetUint64Data(); + offset_.push_back(offset_.back() + vec.size()); + uint64_feasign_.insert(uint64_feasign_.end(), vec.begin(), vec.end()); + } + } + void AppendValues(const uint64_t* input, size_t size) { + CheckUint64(); + offset_.push_back(offset_.back() + size); + uint64_feasign_.insert(uint64_feasign_.end(), input, input + size); + } + void AppendValues(const float* input, size_t size) { + CheckFloat(); + offset_.push_back(offset_.back() + size); + float_feasign_.insert(float_feasign_.end(), input, input + size); + } + const std::vector& GetFloatData() const { return float_feasign_; } + std::vector& MutableFloatData() { return float_feasign_; } + const std::vector& GetUint64Data() const { return uint64_feasign_; } + std::vector& MutableUint64Data() { return uint64_feasign_; } + const std::string& GetType() const { return type_; } + size_t GetBatchSize() { return offset_.size() - 1; } + std::string& MutableType() { return type_; } + + std::string DebugString() { + std::stringstream ss; + ss << "\ntype: " << type_ << "\n"; + ss << "offset: "; + ss << "["; + for (const size_t& i : offset_) { + ss << offset_[i] << ","; + } + ss << "]\ndata: ["; + if (type_[0] == 'f') { + for (const float& i : float_feasign_) { + ss << i << ","; + } + } else { + for (const uint64_t& i : uint64_feasign_) { + ss << i << ","; + } + } + ss << "]\n"; + return ss.str(); + } + + private: + void CheckType(const std::string& type) const { + PADDLE_ENFORCE((type == "uint64") || (type == "float"), + "There is no this type<%s>.", type); + } + void CheckFloat() const { + PADDLE_ENFORCE(type_[0] == 'f', "Add %s value to float slot.", type_); + } + void CheckUint64() const { + PADDLE_ENFORCE(type_[0] == 'u', "Add %s value to uint64 slot.", type_); + } + std::vector float_feasign_; + std::vector uint64_feasign_; + std::string type_; + std::vector offset_; +}; + +template +paddle::framework::Archive& operator<<(paddle::framework::Archive& ar, + const MultiSlotType& ins) { + ar << ins.GetType(); +#ifdef _LINUX + ar << ins.GetOffset(); +#else + const auto& offset = ins.GetOffset(); + ar << (uint64_t)offset.size(); + for (const size_t& x : offset) { + ar << (const uint64_t)x; + } +#endif + ar << ins.GetFloatData(); + ar << ins.GetUint64Data(); + return ar; +} + +template +paddle::framework::Archive& operator>>(paddle::framework::Archive& ar, + MultiSlotType& ins) { + ar >> ins.MutableType(); +#ifdef _LINUX + ar >> ins.MutableOffset(); +#else + auto& offset = ins.MutableOffset(); + offset.resize(ar.template Get()); + for (size_t& x : offset) { + uint64_t t; + ar >> t; + x = (size_t)t; + } +#endif + ar >> ins.MutableFloatData(); + ar >> ins.MutableUint64Data(); + return ar; +} + +union FeatureKey { + uint64_t uint64_feasign_; + float float_feasign_; +}; + +struct FeatureItem { + FeatureItem() {} + FeatureItem(FeatureKey sign, uint16_t slot) { + this->sign() = sign; + this->slot() = slot; + } + FeatureKey& sign() { return *(reinterpret_cast(sign_buffer())); } + const FeatureKey& sign() const { + const FeatureKey* ret = reinterpret_cast(sign_buffer()); + return *ret; + } + uint16_t& slot() { return slot_; } + const uint16_t& slot() const { return slot_; } + + private: + char* sign_buffer() const { return const_cast(sign_); } + char sign_[sizeof(FeatureKey)]; + uint16_t slot_; +}; + +// sizeof Record is much less than std::vector +struct Record { + std::vector uint64_feasigns_; + std::vector float_feasigns_; + std::string ins_id_; +}; + +template +paddle::framework::Archive& operator<<(paddle::framework::Archive& ar, + const FeatureKey& fk) { + ar << fk.uint64_feasign_; + ar << fk.float_feasign_; + return ar; +} + +template +paddle::framework::Archive& operator>>(paddle::framework::Archive& ar, + FeatureKey& fk) { + ar >> fk.uint64_feasign_; + ar >> fk.float_feasign_; + return ar; +} + +template +paddle::framework::Archive& operator<<(paddle::framework::Archive& ar, + const FeatureItem& fi) { + ar << fi.sign(); + ar << fi.slot(); + return ar; +} + +template +paddle::framework::Archive& operator>>(paddle::framework::Archive& ar, + FeatureItem& fi) { + ar >> fi.sign(); + ar >> fi.slot(); + return ar; +} + +template +paddle::framework::Archive& operator<<(paddle::framework::Archive& ar, + const Record& r) { + ar << r.uint64_feasigns_; + ar << r.float_feasigns_; + ar << r.ins_id_; + return ar; +} + +template +paddle::framework::Archive& operator>>(paddle::framework::Archive& ar, + Record& r) { + ar >> r.uint64_feasigns_; + ar >> r.float_feasigns_; + ar >> r.ins_id_; + return ar; +} + +// This DataFeed is used to feed multi-slot type data. +// The format of multi-slot type data: +// [n feasign_0 feasign_1 ... feasign_n]* +class MultiSlotDataFeed + : public PrivateQueueDataFeed> { + public: + MultiSlotDataFeed() {} + virtual ~MultiSlotDataFeed() {} + virtual void Init(const DataFeedDesc& data_feed_desc); + virtual bool CheckFile(const char* filename); + + protected: + virtual void ReadThread(); + virtual void AddInstanceToInsVec(std::vector* vec_ins, + const std::vector& instance, + int index); + virtual bool ParseOneInstance(std::vector* instance); + virtual bool ParseOneInstanceFromPipe(std::vector* instance); + virtual void PutToFeedVec(const std::vector& ins_vec); +}; + +class MultiSlotInMemoryDataFeed : public InMemoryDataFeed { + public: + MultiSlotInMemoryDataFeed() {} + virtual ~MultiSlotInMemoryDataFeed() {} + virtual void Init(const DataFeedDesc& data_feed_desc); + + protected: + virtual bool ParseOneInstance(Record* instance); + virtual bool ParseOneInstanceFromPipe(Record* instance); + virtual void PutToFeedVec(const std::vector& ins_vec); +}; + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +template +class PrivateInstantDataFeed : public DataFeed { + public: + PrivateInstantDataFeed() {} + virtual ~PrivateInstantDataFeed() {} + void Init(const DataFeedDesc& data_feed_desc) override; + bool Start() override { return true; } + int Next() override; + + protected: + // The batched data buffer + std::vector ins_vec_; + + // This function is used to preprocess with a given filename, e.g. open it or + // mmap + virtual bool Preprocess(const std::string& filename) = 0; + + // This function is used to postprocess system resource such as closing file + // NOTICE: Ensure that it is safe to call before Preprocess + virtual bool Postprocess() = 0; + + // The reading and parsing method. + virtual bool ParseOneMiniBatch() = 0; + + // This function is used to put ins_vec to feed_vec + virtual void PutToFeedVec(); +}; + +class MultiSlotFileInstantDataFeed + : public PrivateInstantDataFeed> { + public: + MultiSlotFileInstantDataFeed() {} + virtual ~MultiSlotFileInstantDataFeed() {} + + protected: + int fd_{-1}; + char* buffer_{nullptr}; + size_t end_{0}; + size_t offset_{0}; + + bool Preprocess(const std::string& filename) override; + + bool Postprocess() override; + + bool ParseOneMiniBatch() override; +}; +#endif + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_feed.proto b/paddle/fluid/framework/data_feed.proto new file mode 100644 index 00000000..03996e0e --- /dev/null +++ b/paddle/fluid/framework/data_feed.proto @@ -0,0 +1,33 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +syntax = "proto2"; +package paddle.framework; + +message Slot { + required string name = 1; + required string type = 2; + optional bool is_dense = 3 [ default = false ]; + optional bool is_used = 4 [ default = false ]; + repeated int32 shape = 5; // we can define N-D Tensor +} + +message MultiSlotDesc { repeated Slot slots = 1; } + +message DataFeedDesc { + optional string name = 1; + optional int32 batch_size = 2 [ default = 32 ]; + optional MultiSlotDesc multi_slot_desc = 3; + optional string pipe_command = 4; + optional int32 thread_num = 5; +} diff --git a/paddle/fluid/framework/data_feed_factory.cc b/paddle/fluid/framework/data_feed_factory.cc new file mode 100644 index 00000000..ec1acad9 --- /dev/null +++ b/paddle/fluid/framework/data_feed_factory.cc @@ -0,0 +1,71 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/data_feed_factory.h" +#include +#include +#include + +#include "paddle/fluid/framework/data_feed.h" + +namespace paddle { +namespace framework { +typedef std::shared_ptr (*Createdata_feedFunction)(); +typedef std::unordered_map data_feedMap; +data_feedMap g_data_feed_map; + +#define REGISTER_DATAFEED_CLASS(data_feed_class) \ + namespace { \ + std::shared_ptr Creator_##data_feed_class() { \ + return std::shared_ptr(new data_feed_class); \ + } \ + class __Registerer_##data_feed_class { \ + public: \ + __Registerer_##data_feed_class() { \ + g_data_feed_map[#data_feed_class] = &Creator_##data_feed_class; \ + } \ + }; \ + __Registerer_##data_feed_class g_registerer_##data_feed_class; \ + } // namespace + +std::string DataFeedFactory::DataFeedTypeList() { + std::string data_feed_types; + for (auto iter = g_data_feed_map.begin(); iter != g_data_feed_map.end(); + ++iter) { + if (iter != g_data_feed_map.begin()) { + data_feed_types += ", "; + } + data_feed_types += iter->first; + } + return data_feed_types; +} + +std::shared_ptr DataFeedFactory::CreateDataFeed( + std::string data_feed_class) { + if (g_data_feed_map.count(data_feed_class) < 1) { + LOG(WARNING) << "Your DataFeed " << data_feed_class + << "is not supported currently"; + LOG(WARNING) << "Supported DataFeed: " << DataFeedTypeList(); + exit(-1); + } + return g_data_feed_map[data_feed_class](); +} + +REGISTER_DATAFEED_CLASS(MultiSlotDataFeed); +REGISTER_DATAFEED_CLASS(MultiSlotInMemoryDataFeed); +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +REGISTER_DATAFEED_CLASS(MultiSlotFileInstantDataFeed); +#endif +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_feed_factory.h b/paddle/fluid/framework/data_feed_factory.h new file mode 100644 index 00000000..13678edb --- /dev/null +++ b/paddle/fluid/framework/data_feed_factory.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/data_feed.h" + +namespace paddle { +namespace framework { +class DataFeedFactory { + public: + static std::string DataFeedTypeList(); + static std::shared_ptr CreateDataFeed(std::string data_feed_class); +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_feed_test.cc b/paddle/fluid/framework/data_feed_test.cc new file mode 100644 index 00000000..e1d62468 --- /dev/null +++ b/paddle/fluid/framework/data_feed_test.cc @@ -0,0 +1,330 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/data_feed.h" +#include +#include // NOLINT +#include +#include +#include +#include // NOLINT +#include +#include // NOLINT +#include +#include +#include "google/protobuf/io/zero_copy_stream_impl.h" +#include "google/protobuf/text_format.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/data_feed_factory.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" + +paddle::framework::DataFeedDesc load_datafeed_param_from_file( + const char* filename) { + paddle::framework::DataFeedDesc data_feed_desc; + int file_descriptor = open(filename, O_RDONLY); + PADDLE_ENFORCE(file_descriptor != -1, "Can not open %s.", filename); + google::protobuf::io::FileInputStream fileInput(file_descriptor); + google::protobuf::TextFormat::Parse(&fileInput, &data_feed_desc); + close(file_descriptor); + return data_feed_desc; +} + +const std::vector load_filelist_from_file(const char* filename) { + std::vector filelist; + std::ifstream fin(filename); + PADDLE_ENFORCE(fin.good(), "Can not open %s.", filename); + std::string line; + while (getline(fin, line)) { + filelist.push_back(line); + } + fin.close(); + return filelist; +} + +void GenerateFileForTest(const char* protofile, const char* filelist) { + std::ofstream w_protofile(protofile); + w_protofile << "name: \"MultiSlotDataFeed\"\n" + "batch_size: 2\n" + "multi_slot_desc {\n" + " slots {\n" + " name: \"uint64_sparse_slot\"\n" + " type: \"uint64\"\n" + " is_dense: false\n" + " is_used: true\n" + " }\n" + " slots {\n" + " name: \"float_sparse_slot\"\n" + " type: \"float\"\n" + " is_dense: false\n" + " is_used: true\n" + " }\n" + " slots {\n" + " name: \"uint64_dense_slot\"\n" + " type: \"uint64\"\n" + " is_dense: true\n" + " is_used: true\n" + " }\n" + " slots {\n" + " name: \"float_dense_slot\"\n" + " type: \"float\"\n" + " is_dense: true\n" + " is_used: true\n" + " }\n" + " slots {\n" + " name: \"not_used_slot\"\n" + " type: \"uint64\"\n" + " is_dense: false\n" + " is_used: false\n" + " }\n" + "}"; + w_protofile.close(); + std::ofstream w_filelist(filelist); + int total_file = 4; + for (int i = 0; i < total_file; ++i) { + std::string filename = "TestMultiSlotDataFeed.data." + std::to_string(i); + w_filelist << filename; + if (i + 1 != total_file) { + w_filelist << std::endl; + } + std::ofstream w_datafile(filename.c_str()); + w_datafile << "3 3978 620 82 1 1926.08 1 1926 1 6.02 1 1996\n" + "2 1300 2983353 1 985.211 1 8 1 0.618 1 12\n" + "1 19260827 2 3.14 2.718 1 27 1 2.236 1 28\n"; + w_datafile.close(); + } + w_filelist.close(); +} + +class MultiTypeSet { + public: + MultiTypeSet() { + uint64_set_.clear(); + float_set_.clear(); + } + ~MultiTypeSet() {} + void AddValue(uint64_t v) { uint64_set_.insert(v); } + void AddValue(float v) { float_set_.insert(v); } + const std::set& GetUint64Set() const { return uint64_set_; } + const std::set& GetFloatSet() const { return float_set_; } + + private: + std::set uint64_set_; + std::set float_set_; +}; + +void GetElemSetFromReader(std::vector* reader_elem_set, + const paddle::framework::DataFeedDesc& data_feed_desc, + const std::vector& filelist, + const int thread_num) { + int used_slot_num = 0; + for (auto i = 0; i < data_feed_desc.multi_slot_desc().slots_size(); ++i) { + if (data_feed_desc.multi_slot_desc().slots(i).is_used()) { + ++used_slot_num; + } + } + reader_elem_set->resize(used_slot_num); + std::vector threads; + std::vector> readers; + readers.resize(thread_num); + for (int i = 0; i < thread_num; ++i) { + readers[i] = paddle::framework::DataFeedFactory::CreateDataFeed( + data_feed_desc.name()); + readers[i]->Init(data_feed_desc); + } + readers[0]->SetFileList(filelist); + std::mutex mu; + for (int idx = 0; idx < thread_num; ++idx) { + threads.emplace_back(std::thread([&, idx] { + std::unique_ptr scope( + new paddle::framework::Scope()); + const auto& multi_slot_desc = data_feed_desc.multi_slot_desc(); + std::map + lodtensor_targets; + for (int i = 0; i < multi_slot_desc.slots_size(); ++i) { + const auto& slot = multi_slot_desc.slots(i); + if (slot.is_used()) { + const auto& name = slot.name(); + readers[idx]->AddFeedVar(scope->Var(name), name); + lodtensor_targets[name] = + &scope->FindVar(name)->Get(); + } + } + readers[idx]->Start(); + while (readers[idx]->Next()) { + int index = 0; + for (int k = 0; k < multi_slot_desc.slots_size(); ++k) { + const auto& slot = multi_slot_desc.slots(k); + if (!slot.is_used()) { + continue; + } + const paddle::framework::LoDTensor* tens = + lodtensor_targets[slot.name()]; + if (slot.is_dense()) { // dense branch + if (slot.type() == "uint64") { + const int64_t* data = tens->data(); + int batch_size = tens->dims()[0]; + int dim = tens->dims()[1]; + for (int i = 0; i < batch_size; ++i) { + for (int j = 0; j < dim; ++j) { + std::lock_guard lock(mu); + (*reader_elem_set)[index].AddValue( + (uint64_t)data[i * dim + j]); + } + } + } else if (slot.type() == "float") { + const float* data = tens->data(); + int batch_size = tens->dims()[0]; + int dim = tens->dims()[1]; + for (int i = 0; i < batch_size; ++i) { + for (int j = 0; j < dim; ++j) { + std::lock_guard lock(mu); + (*reader_elem_set)[index].AddValue(data[i * dim + j]); + } + } + } else { + PADDLE_THROW("Error type in proto file."); + } + } else { // sparse branch + if (slot.type() == "uint64") { + const int64_t* data = tens->data(); + for (size_t i = 0; i < tens->NumElements(); ++i) { + std::pair element = tens->lod_element(0, i); + for (size_t j = element.first; j < element.second; ++j) { + std::lock_guard lock(mu); + (*reader_elem_set)[index].AddValue((uint64_t)data[j]); + } + } + } else if (slot.type() == "float") { + const float* data = tens->data(); + for (size_t i = 0; i < tens->NumElements(); ++i) { + std::pair element = tens->lod_element(0, i); + for (size_t j = element.first; j < element.second; ++j) { + std::lock_guard lock(mu); + (*reader_elem_set)[index].AddValue(data[j]); + } + } + } else { + PADDLE_THROW("Error type in proto file."); + } + } // end sparse branch + ++index; + } // end slots loop + } // end while Next() + })); // end anonymous function + } + for (auto& th : threads) { + th.join(); + } +} + +void CheckIsUnorderedSame(const std::vector& s1, + const std::vector& s2) { + EXPECT_EQ(s1.size(), s2.size()); + for (size_t i = 0; i < s1.size(); ++i) { + // check for uint64 + const std::set& uint64_s1 = s1[i].GetUint64Set(); + const std::set& uint64_s2 = s2[i].GetUint64Set(); + EXPECT_EQ(uint64_s1.size(), uint64_s2.size()); + auto uint64_it1 = uint64_s1.begin(); + auto uint64_it2 = uint64_s2.begin(); + while (uint64_it1 != uint64_s1.end()) { + EXPECT_EQ(*uint64_it1, *uint64_it2); + ++uint64_it1; + ++uint64_it2; + } + // check for float + const std::set& float_s1 = s1[i].GetFloatSet(); + const std::set& float_s2 = s2[i].GetFloatSet(); + EXPECT_EQ(float_s1.size(), float_s2.size()); + auto float_it1 = float_s1.begin(); + auto float_it2 = float_s2.begin(); + while (float_it1 != float_s1.end()) { + EXPECT_EQ(*float_it1, *float_it2); + ++float_it1; + ++float_it2; + } + } +} + +void GetElemSetFromFile(std::vector* file_elem_set, + const paddle::framework::DataFeedDesc& data_feed_desc, + const std::vector& filelist) { + int used_slot_num = 0; + for (auto i = 0; i < data_feed_desc.multi_slot_desc().slots_size(); ++i) { + if (data_feed_desc.multi_slot_desc().slots(i).is_used()) { + ++used_slot_num; + } + } + file_elem_set->resize(used_slot_num); + for (const auto& file : filelist) { + std::ifstream fin(file.c_str()); + PADDLE_ENFORCE(fin.good(), "Can not open %s.", file.c_str()); + while (1) { + bool end_flag = false; + int index = 0; + for (auto i = 0; i < data_feed_desc.multi_slot_desc().slots_size(); ++i) { + int num; + if (fin >> num) { + auto slot = data_feed_desc.multi_slot_desc().slots(i); + auto type = slot.type(); + if (type == "uint64") { + while (num--) { + uint64_t feasign; + fin >> feasign; + if (slot.is_used()) { + (*file_elem_set)[index].AddValue(feasign); + } + } + } else if (type == "float") { + while (num--) { + float feasign; + fin >> feasign; + if (slot.is_used()) { + (*file_elem_set)[index].AddValue(feasign); + } + } + } else { + PADDLE_THROW("Error type in proto file."); + } + if (slot.is_used()) { + ++index; + } + } else { + end_flag = true; + break; + } + } + if (end_flag) { + break; + } + } + fin.close(); + } +} + +TEST(DataFeed, MultiSlotUnitTest) { + const char* protofile = "data_feed_desc.prototxt"; + const char* filelist_name = "filelist.txt"; + GenerateFileForTest(protofile, filelist_name); + const std::vector filelist = + load_filelist_from_file(filelist_name); + paddle::framework::DataFeedDesc data_feed_desc = + load_datafeed_param_from_file(protofile); + std::vector reader_elem_set; + std::vector file_elem_set; + // GetElemSetFromReader(&reader_elem_set, data_feed_desc, filelist, 4); + // GetElemSetFromFile(&file_elem_set, data_feed_desc, filelist); + // CheckIsUnorderedSame(reader_elem_set, file_elem_set); +} diff --git a/paddle/fluid/framework/data_layout.h b/paddle/fluid/framework/data_layout.h new file mode 100644 index 00000000..b611bb77 --- /dev/null +++ b/paddle/fluid/framework/data_layout.h @@ -0,0 +1,73 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +enum class DataLayout { + kNHWC = 0, + kNCHW = 1, + kAnyLayout = 2, + kMKLDNN = 3, // all layouts supported by MKLDNN internally +}; + +inline DataLayout StringToDataLayout(const std::string& str) { + std::string s(str); + for (size_t i = 0; i < s.size(); ++i) { + s[i] = toupper(s[i]); + } + + if (s == "NHWC") { + return DataLayout::kNHWC; + } else if (s == "NCHW") { + return DataLayout::kNCHW; + } else if (s == "ANYLAYOUT") { + return DataLayout::kAnyLayout; + } else if (s == "MKLDNNLAYOUT") { + return DataLayout::kMKLDNN; + } else { + PADDLE_THROW("Unknown storage order string: %s", s); + } +} + +inline std::string DataLayoutToString(const DataLayout& data_layout) { + switch (data_layout) { + case DataLayout::kNHWC: + return "NHWC"; + case DataLayout::kNCHW: + return "NCHW"; + case DataLayout::kAnyLayout: + return "ANY_LAYOUT"; + case DataLayout::kMKLDNN: + return "MKLDNNLAYOUT"; + default: + PADDLE_THROW("unknown DataLayout %d", data_layout); + } +} + +inline std::ostream& operator<<(std::ostream& out, const DataLayout& l) { + out << DataLayoutToString(l); + return out; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc new file mode 100644 index 00000000..bbcd3426 --- /dev/null +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -0,0 +1,185 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/data_layout_transform.h" +#include +#include + +#include "paddle/fluid/operators/math/math_function.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" +#endif + +namespace paddle { +namespace framework { + +std::vector GetAxis(const DataLayout& from, const DataLayout& to) { + PADDLE_ENFORCE_NE(from, to, + "layout transform should transform different layout"); + if (from == DataLayout::kNCHW && to == DataLayout::kNHWC) { + return {0, 2, 3, 1}; + } else if (from == DataLayout::kNHWC && to == DataLayout::kNCHW) { + return {0, 3, 1, 2}; + } else { + PADDLE_THROW("unsupported transform"); + } +} + +struct CastDataLayout { + CastDataLayout(const platform::DeviceContext* ctx, + const std::vector& axis, const framework::Tensor& in, + framework::Tensor* out) + : in_(in), out_(out), ctx_(ctx), axis_(axis) {} + const framework::Tensor in_; + framework::Tensor* out_; + const platform::DeviceContext* ctx_; + const std::vector axis_; + + template + void apply() { + auto place = ctx_->GetPlace(); + + if (platform::is_cpu_place(place)) { + operators::math::Transpose trans4; + auto* context = static_cast(ctx_); + trans4(*context, in_, out_, axis_); + } else { + PADDLE_THROW("Unsupport CPU <-> GPU!"); + } + } +}; + +void TransDataLayout(const OpKernelType& kernel_type_for_var, + const OpKernelType& expected_kernel_type, const Tensor& in, + Tensor* out) { + PADDLE_ENFORCE( + platform::places_are_same_class(kernel_type_for_var.place_, + expected_kernel_type.place_), + "TransDataLayout only support DataLayout transform on same place!"); + + PADDLE_ENFORCE(arity(in.dims()) == 4, "Input Arity only support 4!"); + + auto& pool = platform::DeviceContextPool::Instance(); + + auto src_dim = in.dims(); + std::vector dst_dim; + + auto axis = GetAxis(kernel_type_for_var.data_layout_, + expected_kernel_type.data_layout_); + dst_dim.resize(axis.size()); + for (size_t i = 0; i < axis.size(); i++) { + dst_dim[i] = src_dim[axis[i]]; + } + + out->Resize(make_ddim(dst_dim)); + out->mutable_data(expected_kernel_type.place_, in.type()); + + framework::VisitDataType( + in.type(), + CastDataLayout(pool.Get(expected_kernel_type.place_), axis, in, out)); + + out->set_layout(expected_kernel_type.data_layout_); +} + +#ifdef PADDLE_WITH_MKLDNN +using mkldnn::memory; +using mkldnn::primitive; +using mkldnn::reorder; + +void* GetDataFromTensor(const Tensor& tensor, mkldnn::memory::data_type type) { + switch (type) { + case mkldnn::memory::data_type::f32: + return platform::to_void_cast(tensor.data()); + case mkldnn::memory::data_type::s8: + return platform::to_void_cast(tensor.data()); + case mkldnn::memory::data_type::u8: + return platform::to_void_cast(tensor.data()); + case mkldnn::memory::data_type::s16: + return platform::to_void_cast(tensor.data()); + case mkldnn::memory::data_type::s32: + return platform::to_void_cast(tensor.data()); + default: + PADDLE_THROW("wrong mkldnn type provided"); + } +} +#endif + +void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, + const OpKernelType& expected_kernel_type, + const Tensor& in, Tensor* out) { + auto in_layout = kernel_type_for_var.data_layout_; + auto out_layout = expected_kernel_type.data_layout_; + + PADDLE_ENFORCE( + in_layout == DataLayout::kMKLDNN && out_layout != DataLayout::kMKLDNN, + "TransDataLayoutFromMKLDNN only supports transform from MKLDNN to " + "non-MKLDNN"); + +#ifdef PADDLE_WITH_MKLDNN + PADDLE_ENFORCE(in.format() != memory::format::format_undef && + in.format() != memory::format::any, + "Input tensor should have specified memory format"); + + // Set default as NCHW in case not specified + out_layout = + out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout; + + auto& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = dynamic_cast( + pool.Get(expected_kernel_type.place_)); + auto& cpu_engine = dev_ctx->GetEngine(); + + std::vector in_tz = paddle::framework::vectorize2int(in.dims()); + std::vector out_tz = in_tz; + + memory::data_type in_type = ToMKLDNNDataType(in.type()); + PADDLE_ENFORCE(in_type != memory::data_type::data_undef, + "Input tensor type is not supported: %s", in.type()); + + auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format()); + auto out_format = + platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout)); + + // output tensor has the same dims as input. Reorder don't change dims + out->Resize(in.dims()); + + if (in_format != out_format) { + void* in_data = GetDataFromTensor(in, in_type); + const std::string key = platform::ReorderMKLDNNHandler::GetHash( + in_tz, in_format, out_format, std::to_string(in_type)); + + platform::ReorderMKLDNNHandler handler(in_tz, in.type(), in_type, *dev_ctx, + cpu_engine, key); + + auto reorder_src_memory_p = handler.AcquireSrcMemory(in_format, in_data); + auto reorder_dst_memory_p = + handler.AcquireDstMemory(out, out_format, expected_kernel_type.place_); + auto reorder_p = + handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); + + std::vector pipeline; + pipeline.push_back(*reorder_p); + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + } else { + out->ShareDataWith(in); + } + out->set_layout(out_layout); + // reset format since the out tensor will be feed to non-MKLDNN OPkernel + out->set_format(memory::format::format_undef); +#endif +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h new file mode 100644 index 00000000..2c0a34b8 --- /dev/null +++ b/paddle/fluid/framework/data_layout_transform.h @@ -0,0 +1,79 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/op_kernel_type.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/variable.h" + +namespace paddle { +namespace framework { + +#ifdef PADDLE_WITH_MKLDNN +using MKLDNNFormat = mkldnn::memory::format; +using MKLDNNDataType = mkldnn::memory::data_type; + +inline MKLDNNFormat ToMKLDNNFormat(const DataLayout& layout) { + switch (layout) { + case DataLayout::kNHWC: + return MKLDNNFormat::nhwc; + case DataLayout::kNCHW: + return MKLDNNFormat::nchw; + default: + PADDLE_THROW("Fail to convert layout %s to MKLDNN format", + DataLayoutToString(layout)); + } +} + +inline DataLayout ToPaddleLayout(const MKLDNNFormat& format) { + switch (format) { + case MKLDNNFormat::nhwc: + return DataLayout::kNHWC; + case MKLDNNFormat::nchw: + return DataLayout::kNCHW; + default: + PADDLE_THROW("Fail to convert MKLDNN format to paddle layout"); + } +} + +inline MKLDNNDataType ToMKLDNNDataType(proto::VarType::Type type) { + static std::unordered_map dict{ + {DataTypeTrait::DataType(), MKLDNNDataType::f32}, + {DataTypeTrait::DataType(), MKLDNNDataType::s8}, + {DataTypeTrait::DataType(), MKLDNNDataType::u8}, + {DataTypeTrait::DataType(), MKLDNNDataType::s16}, + {DataTypeTrait::DataType(), MKLDNNDataType::s32}}; + auto iter = dict.find(static_cast(type)); + if (iter != dict.end()) return iter->second; + return MKLDNNDataType::data_undef; +} + +#endif + +void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, + const OpKernelType& expected_kernel_type, + const Tensor& in, Tensor* out); + +std::vector GetAxis(const DataLayout& from, const DataLayout& to); + +void TransDataLayout(const OpKernelType& kernel_type_for_var, + const OpKernelType& expected_kernel_type, const Tensor& in, + Tensor* out); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_layout_transform_test.cc b/paddle/fluid/framework/data_layout_transform_test.cc new file mode 100644 index 00000000..a0d08826 --- /dev/null +++ b/paddle/fluid/framework/data_layout_transform_test.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/data_layout_transform.h" + +#include "gtest/gtest.h" +#include "paddle/fluid/platform/device_context.h" + +TEST(DataTransform, DataLayoutFunction) { + auto place = paddle::platform::CPUPlace(); + paddle::framework::Tensor in = paddle::framework::Tensor(); + paddle::framework::Tensor out = paddle::framework::Tensor(); + in.mutable_data(paddle::framework::make_ddim({2, 3, 1, 2}), place); + in.set_layout(paddle::framework::DataLayout::kNHWC); + + auto kernel_nhwc = paddle::framework::OpKernelType( + paddle::framework::proto::VarType::FP32, place, + paddle::framework::DataLayout::kNHWC, + paddle::framework::LibraryType::kPlain); + auto kernel_ncwh = paddle::framework::OpKernelType( + paddle::framework::proto::VarType::FP32, place, + paddle::framework::DataLayout::kNCHW, + paddle::framework::LibraryType::kPlain); + + paddle::framework::TransDataLayout(kernel_nhwc, kernel_ncwh, in, &out); + + EXPECT_TRUE(out.layout() == paddle::framework::DataLayout::kNCHW); + EXPECT_TRUE(out.dims() == paddle::framework::make_ddim({2, 2, 3, 1})); + + TransDataLayout(kernel_ncwh, kernel_nhwc, in, &out); + + EXPECT_TRUE(in.layout() == paddle::framework::DataLayout::kNHWC); + EXPECT_TRUE(in.dims() == paddle::framework::make_ddim({2, 3, 1, 2})); +} diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc new file mode 100644 index 00000000..f0c8ccc2 --- /dev/null +++ b/paddle/fluid/framework/data_set.cc @@ -0,0 +1,652 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/framework/data_set.h" +#include +#include +#include +#include "google/protobuf/io/zero_copy_stream_impl.h" +#include "google/protobuf/message.h" +#include "google/protobuf/text_format.h" +#include "paddle/fluid/framework/data_feed_factory.h" +#include "paddle/fluid/framework/fleet/fleet_wrapper.h" +#include "paddle/fluid/framework/io/fs.h" +#include "paddle/fluid/platform/timer.h" +#include "xxhash.h" // NOLINT + +#if defined _WIN32 || defined __APPLE__ +#else +#define _LINUX +#endif + +namespace paddle { +namespace framework { + +// constructor +template +DatasetImpl::DatasetImpl() { + VLOG(3) << "DatasetImpl::DatasetImpl() constructor"; + thread_num_ = 1; + trainer_num_ = 1; + channel_num_ = 1; + file_idx_ = 0; + cur_channel_ = 0; + fleet_send_batch_size_ = 80000; + fleet_send_sleep_seconds_ = 2; + merge_by_insid_ = false; + erase_duplicate_feas_ = true; + keep_unmerged_ins_ = true; + min_merge_size_ = 2; +} + +// set filelist, file_idx_ will reset to zero. +template +void DatasetImpl::SetFileList(const std::vector& filelist) { + VLOG(3) << "filelist size: " << filelist.size(); + filelist_ = filelist; + file_idx_ = 0; +} + +// set expect thread num. actually it may change +template +void DatasetImpl::SetThreadNum(int thread_num) { + VLOG(3) << "SetThreadNum thread_num=" << thread_num; + thread_num_ = thread_num; +} + +// if you run distributed, and want to do global shuffle, +// set this before global shuffle. +// be sure you call CreateReaders before SetTrainerNum +template +void DatasetImpl::SetTrainerNum(int trainer_num) { + trainer_num_ = trainer_num; +} + +// if you run distributed, and want to do global shuffle, +// set this before global shuffle. +// be sure you call CreateReaders before SetFleetSendBatchSize +template +void DatasetImpl::SetFleetSendBatchSize(int64_t size) { + fleet_send_batch_size_ = size; +} + +template +void DatasetImpl::SetHdfsConfig(const std::string& fs_name, + const std::string& fs_ugi) { + fs_name_ = fs_name; + fs_ugi_ = fs_ugi; + std::string cmd = std::string("hadoop fs"); + cmd += " -D fs.default.name=" + fs_name; + cmd += " -D hadoop.job.ugi=" + fs_ugi; + paddle::framework::hdfs_set_command(cmd); +} + +template +void DatasetImpl::SetDataFeedDesc(const std::string& data_feed_desc_str) { + google::protobuf::TextFormat::ParseFromString(data_feed_desc_str, + &data_feed_desc_); +} + +template +void DatasetImpl::SetChannelNum(int channel_num) { + channel_num_ = channel_num; +} + +template +void DatasetImpl::SetMergeByInsId( + const std::vector& merge_slot_list, bool erase_duplicate_feas, + int min_merge_size, bool keep_unmerged_ins) { + merge_by_insid_ = true; + merge_slots_list_ = merge_slot_list; + erase_duplicate_feas_ = erase_duplicate_feas; + min_merge_size_ = min_merge_size; + keep_unmerged_ins_ = keep_unmerged_ins; +} + +template +std::vector DatasetImpl::GetReaders() { + std::vector ret; + ret.reserve(readers_.size()); + for (auto i : readers_) { + ret.push_back(i.get()); + } + return ret; +} + +template +void DatasetImpl::CreateChannel() { + if (input_channel_ == nullptr) { + input_channel_ = paddle::framework::MakeChannel(); + } + if (multi_output_channel_.size() == 0) { + multi_output_channel_.reserve(channel_num_); + for (int i = 0; i < channel_num_; ++i) { + multi_output_channel_.push_back(paddle::framework::MakeChannel()); + } + } + if (multi_consume_channel_.size() == 0) { + multi_consume_channel_.reserve(channel_num_); + for (int i = 0; i < channel_num_; ++i) { + multi_consume_channel_.push_back(paddle::framework::MakeChannel()); + } + } +} + +// if sent message between workers, should first call this function +template +void DatasetImpl::RegisterClientToClientMsgHandler() { + auto fleet_ptr = FleetWrapper::GetInstance(); + VLOG(3) << "RegisterClientToClientMsgHandler"; + fleet_ptr->RegisterClientToClientMsgHandler( + 0, [this](int msg_type, int client_id, const std::string& msg) -> int { + return this->ReceiveFromClient(msg_type, client_id, msg); + }); + VLOG(3) << "RegisterClientToClientMsgHandler done"; +} + +// load data into memory, Dataset hold this memory, +// which will later be fed into readers' channel +template +void DatasetImpl::LoadIntoMemory() { + VLOG(3) << "DatasetImpl::LoadIntoMemory() begin"; + platform::Timer timeline; + timeline.Start(); + std::vector load_threads; + for (int64_t i = 0; i < thread_num_; ++i) { + load_threads.push_back(std::thread( + &paddle::framework::DataFeed::LoadIntoMemory, readers_[i].get())); + } + for (std::thread& t : load_threads) { + t.join(); + } + input_channel_->Close(); + int64_t in_chan_size = input_channel_->Size(); + input_channel_->SetBlockSize(in_chan_size / thread_num_ + 1); + timeline.Pause(); + VLOG(3) << "DatasetImpl::LoadIntoMemory() end" + << ", memory data size=" << input_channel_->Size() + << ", cost time=" << timeline.ElapsedSec() << " seconds"; +} + +template +void DatasetImpl::PreLoadIntoMemory() { + VLOG(3) << "DatasetImpl::PreLoadIntoMemory() begin"; + preload_threads_.clear(); + for (int64_t i = 0; i < thread_num_; ++i) { + preload_threads_.push_back(std::thread( + &paddle::framework::DataFeed::LoadIntoMemory, readers_[i].get())); + } + VLOG(3) << "DatasetImpl::PreLoadIntoMemory() end"; +} + +template +void DatasetImpl::WaitPreLoadDone() { + VLOG(3) << "DatasetImpl::WaitPreLoadDone() begin"; + for (std::thread& t : preload_threads_) { + t.join(); + } + input_channel_->Close(); + int64_t in_chan_size = input_channel_->Size(); + input_channel_->SetBlockSize(in_chan_size / thread_num_ + 1); + VLOG(3) << "DatasetImpl::WaitPreLoadDone() end"; +} + +// release memory data +template +void DatasetImpl::ReleaseMemory() { + VLOG(3) << "DatasetImpl::ReleaseMemory() begin"; + if (input_channel_) { + input_channel_->Clear(); + input_channel_ = nullptr; + } + for (size_t i = 0; i < multi_output_channel_.size(); ++i) { + if (!multi_output_channel_[i]) { + continue; + } + multi_output_channel_[i]->Clear(); + multi_output_channel_[i] = nullptr; + } + std::vector>().swap(multi_output_channel_); + for (size_t i = 0; i < multi_consume_channel_.size(); ++i) { + if (!multi_consume_channel_[i]) { + continue; + } + multi_consume_channel_[i]->Clear(); + multi_consume_channel_[i] = nullptr; + } + std::vector>().swap(multi_consume_channel_); + std::vector>().swap(readers_); + VLOG(3) << "DatasetImpl::ReleaseMemory() end"; +} + +// do local shuffle +template +void DatasetImpl::LocalShuffle() { + VLOG(3) << "DatasetImpl::LocalShuffle() begin"; + platform::Timer timeline; + timeline.Start(); + + if (!input_channel_ || input_channel_->Size() == 0) { + VLOG(3) << "DatasetImpl::LocalShuffle() end, no data to shuffle"; + return; + } + auto fleet_ptr = FleetWrapper::GetInstance(); + input_channel_->Close(); + std::vector data; + input_channel_->ReadAll(data); + std::shuffle(data.begin(), data.end(), fleet_ptr->LocalRandomEngine()); + input_channel_->Open(); + input_channel_->Write(std::move(data)); + data.clear(); + data.shrink_to_fit(); + input_channel_->Close(); + + timeline.Pause(); + VLOG(3) << "DatasetImpl::LocalShuffle() end, cost time=" + << timeline.ElapsedSec() << " seconds"; +} + +template +void DatasetImpl::GlobalShuffle() { + VLOG(3) << "DatasetImpl::GlobalShuffle() begin"; + platform::Timer timeline; + timeline.Start(); + auto fleet_ptr = FleetWrapper::GetInstance(); + + if (!input_channel_ || input_channel_->Size() == 0) { + VLOG(3) << "DatasetImpl::GlobalShuffle() end, no data to shuffle"; + return; + } + + // local shuffle + input_channel_->Close(); + std::vector data; + input_channel_->ReadAll(data); + std::shuffle(data.begin(), data.end(), fleet_ptr->LocalRandomEngine()); + input_channel_->Open(); + input_channel_->Write(std::move(data)); + data.clear(); + data.shrink_to_fit(); + + input_channel_->Close(); + input_channel_->SetBlockSize(fleet_send_batch_size_); + VLOG(3) << "DatasetImpl::GlobalShuffle() input_channel_ size " + << input_channel_->Size(); + + auto get_client_id = [this, fleet_ptr](const T& data) -> size_t { + if (!this->merge_by_insid_) { + return fleet_ptr->LocalRandomEngine()() % this->trainer_num_; + } else { + return XXH64(data.ins_id_.data(), data.ins_id_.length(), 0) % + this->trainer_num_; + } + }; + + auto global_shuffle_func = [this, get_client_id]() { + auto fleet_ptr = FleetWrapper::GetInstance(); + std::vector data; + while (this->input_channel_->Read(data)) { + std::vector ars(this->trainer_num_); + for (auto& t : data) { + auto client_id = get_client_id(t); + ars[client_id] << t; + } + std::vector> total_status; + std::vector send_index(this->trainer_num_); + for (int i = 0; i < this->trainer_num_; ++i) { + send_index[i] = i; + } + std::shuffle(send_index.begin(), send_index.end(), + fleet_ptr->LocalRandomEngine()); + for (auto index = 0u; index < this->trainer_num_; ++index) { + int i = send_index[index]; + if (ars[i].Length() == 0) { + continue; + } + std::string msg(ars[i].Buffer(), ars[i].Length()); + auto ret = fleet_ptr->SendClientToClientMsg(0, i, msg); + total_status.push_back(std::move(ret)); + } + for (auto& t : total_status) { + t.wait(); + } + ars.clear(); + ars.shrink_to_fit(); + data.clear(); + data.shrink_to_fit(); + sleep(this->fleet_send_sleep_seconds_); + } + }; + + VLOG(3) << "start global shuffle threads"; + std::vector global_shuffle_threads; + for (int i = 0; i < thread_num_; ++i) { + global_shuffle_threads.push_back(std::thread(global_shuffle_func)); + } + for (std::thread& t : global_shuffle_threads) { + t.join(); + } + global_shuffle_threads.clear(); + global_shuffle_threads.shrink_to_fit(); + input_channel_->Clear(); + timeline.Pause(); + VLOG(3) << "DatasetImpl::GlobalShuffle() end, cost time=" + << timeline.ElapsedSec() << " seconds"; +} + +template +void DatasetImpl::CreateReaders() { + VLOG(3) << "Calling CreateReaders()"; + VLOG(3) << "thread num in Dataset: " << thread_num_; + VLOG(3) << "Filelist size in Dataset: " << filelist_.size(); + VLOG(3) << "channel num in Dataset: " << channel_num_; + CHECK(thread_num_ > 0) << "thread num should > 0"; + CHECK(thread_num_ <= filelist_.size()) + << "thread num should <= filelist size"; + CHECK(channel_num_ > 0) << "channel num should > 0"; + CHECK(channel_num_ <= thread_num_) << "channel num should <= thread num"; + VLOG(3) << "readers size: " << readers_.size(); + if (readers_.size() != 0) { + VLOG(3) << "readers_.size() = " << readers_.size() + << ", will not create again"; + return; + } + VLOG(3) << "data feed class name: " << data_feed_desc_.name(); + int channel_idx = 0; + for (int i = 0; i < thread_num_; ++i) { + readers_.push_back(DataFeedFactory::CreateDataFeed(data_feed_desc_.name())); + readers_[i]->Init(data_feed_desc_); + readers_[i]->SetThreadId(i); + readers_[i]->SetThreadNum(thread_num_); + readers_[i]->SetFileListMutex(&mutex_for_pick_file_); + readers_[i]->SetFileListIndex(&file_idx_); + readers_[i]->SetFileList(filelist_); + readers_[i]->SetParseInsId(merge_by_insid_); + if (input_channel_ != nullptr) { + readers_[i]->SetInputChannel(input_channel_.get()); + } + if (cur_channel_ == 0 && channel_idx < multi_output_channel_.size()) { + readers_[i]->SetOutputChannel(multi_output_channel_[channel_idx].get()); + readers_[i]->SetConsumeChannel(multi_consume_channel_[channel_idx].get()); + } else if (channel_idx < multi_output_channel_.size()) { + readers_[i]->SetOutputChannel(multi_consume_channel_[channel_idx].get()); + readers_[i]->SetConsumeChannel(multi_output_channel_[channel_idx].get()); + } + ++channel_idx; + if (channel_idx >= channel_num_) { + channel_idx = 0; + } + } + VLOG(3) << "readers size: " << readers_.size(); +} + +template +void DatasetImpl::DestroyReaders() { + VLOG(3) << "Calling DestroyReaders()"; + VLOG(3) << "readers size1: " << readers_.size(); + std::vector>().swap(readers_); + VLOG(3) << "readers size: " << readers_.size(); + file_idx_ = 0; + cur_channel_ = 1 - cur_channel_; +} + +template +int64_t DatasetImpl::GetMemoryDataSize() { + return input_channel_->Size(); +} + +template +int64_t DatasetImpl::GetShuffleDataSize() { + int64_t sum = 0; + for (size_t i = 0; i < multi_output_channel_.size(); ++i) { + sum += multi_output_channel_[i]->Size() + multi_consume_channel_[i]->Size(); + } + return sum; +} + +template +int DatasetImpl::ReceiveFromClient(int msg_type, int client_id, + const std::string& msg) { +#ifdef _LINUX + VLOG(3) << "ReceiveFromClient msg_type=" << msg_type + << ", client_id=" << client_id << ", msg length=" << msg.length(); + if (msg.length() == 0) { + return 0; + } + paddle::framework::BinaryArchive ar; + ar.SetReadBuffer(const_cast(msg.c_str()), msg.length(), nullptr); + if (ar.Cursor() == ar.Finish()) { + return 0; + } + std::vector data; + while (ar.Cursor() < ar.Finish()) { + data.push_back(ar.Get()); + } + CHECK(ar.Cursor() == ar.Finish()); + + auto fleet_ptr = FleetWrapper::GetInstance(); + int64_t index = fleet_ptr->LocalRandomEngine()() % channel_num_; + VLOG(3) << "ramdom index=" << index; + multi_output_channel_[index]->Write(std::move(data)); + + data.clear(); + data.shrink_to_fit(); +#endif + return 0; +} + +// explicit instantiation +template class DatasetImpl; + +void MultiSlotDataset::MergeByInsId() { + VLOG(3) << "MultiSlotDataset::MergeByInsId begin"; + if (!merge_by_insid_) { + VLOG(3) << "merge_by_insid=false, will not MergeByInsId"; + return; + } + auto multi_slot_desc = data_feed_desc_.multi_slot_desc(); + std::unordered_map merge_slots; + std::vector use_slots; + std::vector use_slots_is_dense; + for (size_t i = 0; i < multi_slot_desc.slots_size(); ++i) { + const auto& slot = multi_slot_desc.slots(i); + if (slot.is_used()) { + use_slots.push_back(slot.name()); + use_slots_is_dense.push_back(slot.is_dense()); + } + } + for (size_t i = 0; i < use_slots.size(); ++i) { + // currently, we don't merge dense slots + if (std::find(merge_slots_list_.begin(), merge_slots_list_.end(), + use_slots[i]) != merge_slots_list_.end() && + !use_slots_is_dense[i]) { + merge_slots[i] = true; + } + } + CHECK(multi_output_channel_.size() != 0); // NOLINT + auto channel_data = paddle::framework::MakeChannel(); + VLOG(3) << "multi_output_channel_.size() " << multi_output_channel_.size(); + for (size_t i = 0; i < multi_output_channel_.size(); ++i) { + std::vector vec_data; + multi_output_channel_[i]->Close(); + multi_output_channel_[i]->ReadAll(vec_data); + channel_data->Write(std::move(vec_data)); + vec_data.clear(); + vec_data.shrink_to_fit(); + multi_output_channel_[i]->Clear(); + } + channel_data->Close(); + std::vector recs; + recs.reserve(channel_data->Size()); + channel_data->ReadAll(recs); + channel_data->Clear(); + std::sort(recs.begin(), recs.end(), [](const Record& a, const Record& b) { + return a.ins_id_ < b.ins_id_; + }); + + auto sort_cmp_uint64 = [&merge_slots](const FeatureItem& a, + const FeatureItem& b) { + auto& a_sign = a.sign().uint64_feasign_; + auto& b_sign = b.sign().uint64_feasign_; + return a_sign < b_sign || (a_sign == b_sign && a.slot() < b.slot()); + }; + auto sort_cmp_float = [&merge_slots](const FeatureItem& a, + const FeatureItem& b) { + auto& a_sign = a.sign().float_feasign_; + auto& b_sign = b.sign().float_feasign_; + return a_sign < b_sign || (a_sign == b_sign && a.slot() < b.slot()); + }; + auto unique_eq_uint64 = [&merge_slots](const FeatureItem& a, + const FeatureItem& b) { + if (a.slot() == b.slot() && + merge_slots.find(a.slot()) == merge_slots.end()) { + return true; + } + auto& a_sign = a.sign().uint64_feasign_; + auto& b_sign = b.sign().uint64_feasign_; + return a_sign == b_sign && a.slot() == b.slot(); + }; + auto unique_eq_float = [&merge_slots](const FeatureItem& a, + const FeatureItem& b) { + if (a.slot() == b.slot() && + merge_slots.find(a.slot()) == merge_slots.end()) { + return true; + } + auto& a_sign = a.sign().float_feasign_; + auto& b_sign = b.sign().float_feasign_; + return a_sign == b_sign && a.slot() == b.slot(); + }; + + std::vector results; + VLOG(3) << "recs.size() " << recs.size(); + for (size_t i = 0; i < recs.size();) { + size_t j = i + 1; + while (j < recs.size() && recs[j].ins_id_ == recs[i].ins_id_) { + j++; + } + if (j - i < min_merge_size_) { + if (keep_unmerged_ins_) { + for (size_t k = i; k < j; ++k) { + results.push_back(std::move(recs[k])); + } + } + i = j; + continue; + } + + std::vector merge_uint64_feasigns; + std::vector merge_float_feasigns; + Record rec = std::move(recs[i]); + + for (size_t k = i + 1; k < j; k++) { + for (auto& feature : recs[k].uint64_feasigns_) { + if (merge_slots.find(feature.slot()) != merge_slots.end()) { + merge_uint64_feasigns.push_back(std::move(feature)); + } + } + for (auto& feature : recs[k].float_feasigns_) { + if (merge_slots.find(feature.slot()) != merge_slots.end()) { + merge_float_feasigns.push_back(std::move(feature)); + } + } + recs[k] = Record(); + } + i = j; + + if (!erase_duplicate_feas_) { + rec.uint64_feasigns_.insert(rec.uint64_feasigns_.end(), + merge_uint64_feasigns.begin(), + merge_uint64_feasigns.end()); + rec.float_feasigns_.insert(rec.float_feasigns_.end(), + merge_float_feasigns.begin(), + merge_float_feasigns.end()); + } else { + std::vector not_merge_uint64_feasigns; + std::vector not_merge_float_feasigns; + + for (auto& feature : rec.uint64_feasigns_) { + if (merge_slots.find(feature.slot()) != merge_slots.end()) { + merge_uint64_feasigns.push_back(std::move(feature)); + } else { + not_merge_uint64_feasigns.push_back(std::move(feature)); + } + } + for (auto& feature : rec.float_feasigns_) { + if (merge_slots.find(feature.slot()) != merge_slots.end()) { + merge_float_feasigns.push_back(std::move(feature)); + } else { + not_merge_float_feasigns.push_back(std::move(feature)); + } + } + rec.uint64_feasigns_.clear(); + rec.float_feasigns_.clear(); + + // erase duplicate uint64 feasigns + std::sort(merge_uint64_feasigns.begin(), merge_uint64_feasigns.end(), + sort_cmp_uint64); + merge_uint64_feasigns.erase( + std::unique(merge_uint64_feasigns.begin(), + merge_uint64_feasigns.end(), unique_eq_uint64), + merge_uint64_feasigns.end()); + rec.uint64_feasigns_.insert(rec.uint64_feasigns_.end(), + merge_uint64_feasigns.begin(), + merge_uint64_feasigns.end()); + rec.uint64_feasigns_.insert(rec.uint64_feasigns_.end(), + not_merge_uint64_feasigns.begin(), + not_merge_uint64_feasigns.end()); + + // erase duplicate float feasigns + std::sort(merge_float_feasigns.begin(), merge_float_feasigns.end(), + sort_cmp_float); + merge_float_feasigns.erase( + std::unique(merge_float_feasigns.begin(), merge_float_feasigns.end(), + unique_eq_float), + merge_float_feasigns.end()); + rec.float_feasigns_.insert(rec.float_feasigns_.end(), + merge_float_feasigns.begin(), + merge_float_feasigns.end()); + rec.float_feasigns_.insert(rec.float_feasigns_.end(), + not_merge_float_feasigns.begin(), + not_merge_float_feasigns.end()); + } + results.push_back(rec); + } + VLOG(3) << "results size " << results.size(); + results.shrink_to_fit(); + + auto fleet_ptr = FleetWrapper::GetInstance(); + std::shuffle(results.begin(), results.end(), fleet_ptr->LocalRandomEngine()); + channel_data->Open(); + channel_data->Write(std::move(results)); + channel_data->Close(); + results.clear(); + results.shrink_to_fit(); + VLOG(3) << "channel data size " << channel_data->Size(); + channel_data->SetBlockSize(channel_data->Size() / channel_num_ + 1); + VLOG(3) << "channel data block size " << channel_data->BlockSize(); + for (size_t i = 0; i < multi_output_channel_.size(); ++i) { + std::vector vec_data; + channel_data->Read(vec_data); + multi_output_channel_[i]->Open(); + multi_output_channel_[i]->Write(std::move(vec_data)); + vec_data.clear(); + vec_data.shrink_to_fit(); + } + CHECK(channel_data->Size() == 0); // NOLINT + channel_data->Clear(); + VLOG(3) << "MultiSlotDataset::MergeByInsId end"; +} + +} // end namespace framework +} // end namespace paddle diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h new file mode 100644 index 00000000..3c40a7c0 --- /dev/null +++ b/paddle/fluid/framework/data_set.h @@ -0,0 +1,198 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include +#include +#include // NOLINT +#include +#include // NOLINT +#include +#include + +#include "paddle/fluid/framework/data_feed.h" + +namespace paddle { +namespace framework { + +// Dataset is a abstract class, which defines user interfaces +// Example Usage: +// Dataset* dataset = DatasetFactory::CreateDataset("InMemoryDataset") +// dataset->SetFileList(std::vector{"a.txt", "b.txt"}) +// dataset->SetThreadNum(1) +// dataset->CreateReaders(); +// dataset->SetDataFeedDesc(your_data_feed_desc); +// dataset->LoadIntoMemory(); +// dataset->SetTrainerNum(2); +// dataset->GlobalShuffle(); +class Dataset { + public: + Dataset() {} + virtual ~Dataset() {} + // set file list + virtual void SetFileList(const std::vector& filelist) = 0; + // set readers' num + virtual void SetThreadNum(int thread_num) = 0; + // set workers' num + virtual void SetTrainerNum(int trainer_num) = 0; + // set fleet send batch size + virtual void SetFleetSendBatchSize(int64_t size) = 0; + // set fs name and ugi + virtual void SetHdfsConfig(const std::string& fs_name, + const std::string& fs_ugi) = 0; + // set data fedd desc, which contains: + // data feed name, batch size, slots + virtual void SetDataFeedDesc(const std::string& data_feed_desc_str) = 0; + // set channel num + virtual void SetChannelNum(int channel_num) = 0; + // set merge by ins id + virtual void SetMergeByInsId(const std::vector& merge_slot_list, + bool erase_duplicate_feas, int min_merge_size, + bool keep_unmerged_ins) = 0; + // get file list + virtual const std::vector& GetFileList() = 0; + // get thread num + virtual int GetThreadNum() = 0; + // get worker num + virtual int GetTrainerNum() = 0; + // get fleet send batch size + virtual int64_t GetFleetSendBatchSize() = 0; + // get hdfs config + virtual std::pair GetHdfsConfig() = 0; + // get data fedd desc + virtual const paddle::framework::DataFeedDesc& GetDataFeedDesc() = 0; + // get channel num + virtual int GetChannelNum() = 0; + // get readers, the reader num depend both on thread num + // and filelist size + virtual std::vector GetReaders() = 0; + // create input channel and output channel + virtual void CreateChannel() = 0; + // register message handler between workers + virtual void RegisterClientToClientMsgHandler() = 0; + // load all data into memory + virtual void LoadIntoMemory() = 0; + // load all data into memory in async mode + virtual void PreLoadIntoMemory() = 0; + // wait async load done + virtual void WaitPreLoadDone() = 0; + // release all memory data + virtual void ReleaseMemory() = 0; + // local shuffle data + virtual void LocalShuffle() = 0; + // global shuffle data + virtual void GlobalShuffle() = 0; + // create readers + virtual void CreateReaders() = 0; + // destroy readers + virtual void DestroyReaders() = 0; + // get memory data size + virtual int64_t GetMemoryDataSize() = 0; + // get shuffle data size + virtual int64_t GetShuffleDataSize() = 0; + // merge by ins id + virtual void MergeByInsId() = 0; + + protected: + virtual int ReceiveFromClient(int msg_type, int client_id, + const std::string& msg) = 0; +}; + +// DatasetImpl is the implementation of Dataset, +// it holds memory data if user calls load_into_memory +template +class DatasetImpl : public Dataset { + public: + DatasetImpl(); + virtual ~DatasetImpl() {} + + virtual void SetFileList(const std::vector& filelist); + virtual void SetThreadNum(int thread_num); + virtual void SetTrainerNum(int trainer_num); + virtual void SetFleetSendBatchSize(int64_t size); + virtual void SetHdfsConfig(const std::string& fs_name, + const std::string& fs_ugi); + virtual void SetDataFeedDesc(const std::string& data_feed_desc_str); + virtual void SetChannelNum(int channel_num); + virtual void SetMergeByInsId(const std::vector& merge_slot_list, + bool erase_duplicate_feas, int min_merge_size, + bool keep_unmerged_ins); + + virtual const std::vector& GetFileList() { return filelist_; } + virtual int GetThreadNum() { return thread_num_; } + virtual int GetTrainerNum() { return trainer_num_; } + virtual int64_t GetFleetSendBatchSize() { return fleet_send_batch_size_; } + virtual std::pair GetHdfsConfig() { + return std::make_pair(fs_name_, fs_ugi_); + } + virtual const paddle::framework::DataFeedDesc& GetDataFeedDesc() { + return data_feed_desc_; + } + virtual int GetChannelNum() { return channel_num_; } + virtual std::vector GetReaders(); + virtual void CreateChannel(); + virtual void RegisterClientToClientMsgHandler(); + virtual void LoadIntoMemory(); + virtual void PreLoadIntoMemory(); + virtual void WaitPreLoadDone(); + virtual void ReleaseMemory(); + virtual void LocalShuffle(); + virtual void GlobalShuffle(); + virtual void CreateReaders(); + virtual void DestroyReaders(); + virtual int64_t GetMemoryDataSize(); + virtual int64_t GetShuffleDataSize(); + virtual void MergeByInsId() {} + + protected: + virtual int ReceiveFromClient(int msg_type, int client_id, + const std::string& msg); + std::vector> readers_; + paddle::framework::Channel input_channel_; + int channel_num_; + std::vector> multi_output_channel_; + std::vector> multi_consume_channel_; + // when read ins, we put ins from one channel to the other, + // and when finish reading, we set cur_channel = 1 - cur_channel, + // so if cur_channel=0, all data are in output_channel, else consume_channel + int cur_channel_; + int thread_num_; + paddle::framework::DataFeedDesc data_feed_desc_; + int trainer_num_; + std::vector filelist_; + size_t file_idx_; + std::mutex mutex_for_pick_file_; + std::string fs_name_; + std::string fs_ugi_; + int64_t fleet_send_batch_size_; + int64_t fleet_send_sleep_seconds_; + std::vector preload_threads_; + bool merge_by_insid_; + bool erase_duplicate_feas_; + bool keep_unmerged_ins_; + int min_merge_size_; + std::vector merge_slots_list_; +}; + +// use std::vector or Record as data type +class MultiSlotDataset : public DatasetImpl { + public: + MultiSlotDataset() {} + virtual void MergeByInsId(); + virtual ~MultiSlotDataset() {} +}; + +} // end namespace framework +} // end namespace paddle diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc new file mode 100644 index 00000000..82872224 --- /dev/null +++ b/paddle/fluid/framework/data_transform.cc @@ -0,0 +1,116 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/data_transform.h" + +#include "paddle/fluid/framework/data_device_transform.h" +#include "paddle/fluid/framework/data_layout_transform.h" +#include "paddle/fluid/framework/data_type_transform.h" + +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + +namespace paddle { +namespace framework { + +static void PassTensorData(Tensor *from, Tensor *to) { + to->ShareDataWith(*from); + *from = Tensor(); +} + +void TransformData(const OpKernelType &expected_kernel_type, + const OpKernelType &kernel_type_for_var, + const Tensor &input_tensor, Tensor *output_tensor) { + bool transformed = false; + Tensor in; + in.ShareDataWith(input_tensor); + Tensor out; + DataLayout lin = kernel_type_for_var.data_layout_; + DataLayout lout = expected_kernel_type.data_layout_; + + // do layout transform + if (NeedTransformLayout(lout, lin)) { + if (lin == DataLayout::kMKLDNN || lout == DataLayout::kMKLDNN) { + PADDLE_ENFORCE( + !(lin == DataLayout::kMKLDNN && lout == DataLayout::kMKLDNN), + "No layout transform needed between two MKLDNN OPKernels"); + + if (lin != DataLayout::kMKLDNN && lout == DataLayout::kMKLDNN) { +#ifdef PADDLE_WITH_MKLDNN + // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel + // Just set layout/format. No real transform occur + + auto out_format = platform::MKLDNNFormatForSize(in.dims().size(), + ToMKLDNNFormat(lin)); + + out.ShareDataWith(input_tensor); + out.set_layout(DataLayout::kMKLDNN); + out.set_format(out_format); +#endif + } else { + // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel + // Do transform via MKLDNN lib + TransDataLayoutFromMKLDNN(kernel_type_for_var, expected_kernel_type, in, + &out); + } + } else { + // Case3 - transfrom between Non-MKLDNN OPKernels + TransDataLayout(kernel_type_for_var, expected_kernel_type, in, &out); + } + transformed = true; + PassTensorData(&out, &in); + } + + // do data type transform + if (expected_kernel_type.data_type_ != kernel_type_for_var.data_type_) { + TransDataType(kernel_type_for_var, expected_kernel_type, in, &out); + transformed = true; + PassTensorData(&out, &in); + } + + // do device transform + if (!platform::is_same_place(kernel_type_for_var.place_, + expected_kernel_type.place_)) { + TransDataDevice(in, expected_kernel_type.place_, &out); + transformed = true; + PassTensorData(&out, &in); + } + + PADDLE_ENFORCE(transformed, "No transform is applied, please check!"); + // get output data + output_tensor->ShareDataWith(in); +} + +void SetTensorToVariable(const Variable &in_var, const Tensor &tensor, + Variable *out_var) { + if (in_var.IsType()) { + auto &in_lod_tensor = in_var.Get(); + auto *tran_lod_tensor = out_var->GetMutable(); + tran_lod_tensor->set_lod(in_lod_tensor.lod()); + tran_lod_tensor->set_layout(in_lod_tensor.layout()); + tran_lod_tensor->ShareDataWith(tensor); + } else if (in_var.IsType()) { + auto &in_selected_rows = in_var.Get(); + auto *trans_selected_rows = out_var->GetMutable(); + trans_selected_rows->set_height(in_selected_rows.height()); + trans_selected_rows->set_rows(in_selected_rows.rows()); + trans_selected_rows->mutable_value()->ShareDataWith(tensor); + } else { + PADDLE_THROW("unknown var type"); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_transform.h b/paddle/fluid/framework/data_transform.h new file mode 100644 index 00000000..ae3ab051 --- /dev/null +++ b/paddle/fluid/framework/data_transform.h @@ -0,0 +1,44 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/op_kernel_type.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace framework { + +void TransformData(const OpKernelType &expected_kernel_type, + const OpKernelType &kernel_type_for_var, + const Tensor &input_tensor, Tensor *out); + +/** + * Set OutVar from InVar, except the tensor is shared with `tensor` + */ +void SetTensorToVariable(const Variable &in_var, const Tensor &tensor, + Variable *out_var); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc new file mode 100644 index 00000000..a0248cf3 --- /dev/null +++ b/paddle/fluid/framework/data_type.cc @@ -0,0 +1,98 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/data_type.h" +#include +#include +#include + +using float16 = paddle::platform::float16; + +namespace paddle { +namespace framework { + +struct DataTypeMap { + std::unordered_map cpp_to_proto_; + std::unordered_map proto_to_cpp_; + std::unordered_map proto_to_str_; + std::unordered_map proto_to_size_; +}; + +static DataTypeMap* InitDataTypeMap(); +// C++11 removes the need for manual locking. Concurrent execution shall wait if +// a static local variable is already being initialized. +// https://stackoverflow.com/questions/11711920/how-to-implement-multithread-safe-singleton-in-c11-without-using-mutex +static DataTypeMap& gDataTypeMap() { + static DataTypeMap* g_data_type_map_ = InitDataTypeMap(); + return *g_data_type_map_; +} + +template +static inline void RegisterType(DataTypeMap* map, + proto::VarType::Type proto_type, + const std::string& name) { + map->proto_to_cpp_.emplace(static_cast(proto_type), typeid(T)); + map->cpp_to_proto_.emplace(typeid(T), proto_type); + map->proto_to_str_.emplace(static_cast(proto_type), name); + map->proto_to_size_.emplace(static_cast(proto_type), sizeof(T)); +} + +static DataTypeMap* InitDataTypeMap() { + auto retv = new DataTypeMap(); + +#define RegType(cc_type, proto_type) \ + RegisterType(retv, proto_type, #cc_type) + + _ForEachDataType_(RegType); + +#undef RegType + return retv; +} + +proto::VarType::Type ToDataType(std::type_index type) { + auto it = gDataTypeMap().cpp_to_proto_.find(type); + if (it != gDataTypeMap().cpp_to_proto_.end()) { + return it->second; + } + PADDLE_THROW("Not support %s as tensor type", type.name()); +} + +std::type_index ToTypeIndex(proto::VarType::Type type) { + auto it = gDataTypeMap().proto_to_cpp_.find(static_cast(type)); + if (it != gDataTypeMap().proto_to_cpp_.end()) { + return it->second; + } + PADDLE_THROW("Not support proto::VarType::Type(%d) as tensor type", + static_cast(type)); +} + +std::string DataTypeToString(const proto::VarType::Type type) { + auto it = gDataTypeMap().proto_to_str_.find(static_cast(type)); + if (it != gDataTypeMap().proto_to_str_.end()) { + return it->second; + } + PADDLE_THROW("Not support proto::VarType::Type(%d) as tensor type", + static_cast(type)); +} + +size_t SizeOfType(proto::VarType::Type type) { + auto it = gDataTypeMap().proto_to_size_.find(static_cast(type)); + if (it != gDataTypeMap().proto_to_size_.end()) { + return it->second; + } + PADDLE_THROW("Not support %s as tensor type", DataTypeToString(type)); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h new file mode 100644 index 00000000..60644820 --- /dev/null +++ b/paddle/fluid/framework/data_type.h @@ -0,0 +1,86 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace framework { + +template +struct DataTypeTrait {}; + +// Stub handle for void +template <> +struct DataTypeTrait { + constexpr static proto::VarType::Type DataType() { + return proto::VarType::RAW; + } +}; + +#define _ForEachDataTypeHelper_(callback, cpp_type, proto_type) \ + callback(cpp_type, ::paddle::framework::proto::VarType::proto_type); + +#define _ForEachDataType_(callback) \ + _ForEachDataTypeHelper_(callback, float, FP32); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::float16, FP16); \ + _ForEachDataTypeHelper_(callback, double, FP64); \ + _ForEachDataTypeHelper_(callback, int, INT32); \ + _ForEachDataTypeHelper_(callback, int64_t, INT64); \ + _ForEachDataTypeHelper_(callback, bool, BOOL); \ + _ForEachDataTypeHelper_(callback, uint8_t, UINT8); \ + _ForEachDataTypeHelper_(callback, int16_t, INT16); \ + _ForEachDataTypeHelper_(callback, int8_t, INT8) + +#define DefineDataTypeTrait(cpp_type, proto_type) \ + template <> \ + struct DataTypeTrait { \ + constexpr static proto::VarType::Type DataType() { return proto_type; } \ + } + +_ForEachDataType_(DefineDataTypeTrait); + +#undef DefineDataTypeTrait + +extern proto::VarType::Type ToDataType(std::type_index type); +extern std::type_index ToTypeIndex(proto::VarType::Type type); + +template +inline void VisitDataType(proto::VarType::Type type, Visitor visitor) { +#define VisitDataTypeCallback(cpp_type, proto_type) \ + do { \ + if (type == proto_type) { \ + visitor.template apply(); \ + return; \ + } \ + } while (0) + + _ForEachDataType_(VisitDataTypeCallback); +#undef VisitDataTypeCallback + PADDLE_THROW("Not supported %d", type); +} + +extern std::string DataTypeToString(const proto::VarType::Type type); +extern size_t SizeOfType(proto::VarType::Type type); +inline std::ostream& operator<<(std::ostream& out, + const proto::VarType::Type& type) { + out << DataTypeToString(type); + return out; +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_type_test.cc b/paddle/fluid/framework/data_type_test.cc new file mode 100644 index 00000000..2a380201 --- /dev/null +++ b/paddle/fluid/framework/data_type_test.cc @@ -0,0 +1,40 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/fluid/framework/data_type.h" + +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/tensor.h" + +TEST(DataType, float16) { + using paddle::framework::Tensor; + using paddle::platform::CPUPlace; + using paddle::platform::float16; + namespace f = paddle::framework; + f::proto::VarType::Type dtype = f::proto::VarType::FP16; + + Tensor tensor; + CPUPlace cpu; + tensor.mutable_data(cpu, dtype); + + // test fp16 tensor + EXPECT_EQ(tensor.type(), f::ToDataType(typeid(float16))); + + // test fp16 size + EXPECT_EQ(f::SizeOfType(dtype), 2u); + + // test debug info + std::string type = "::paddle::platform::float16"; + EXPECT_STREQ(f::DataTypeToString(dtype).c_str(), type.c_str()); +} diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc new file mode 100644 index 00000000..d79f8cac --- /dev/null +++ b/paddle/fluid/framework/data_type_transform.cc @@ -0,0 +1,106 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/data_type_transform.h" + +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace framework { + +template +struct CastDataTypeFunctor { + HOSTDEVICE inline OutType operator()(InType in) const { + return static_cast(in); + } +}; + +template +struct CastDataType { + CastDataType(const framework::Tensor& in, framework::Tensor* out, + const platform::DeviceContext* ctx) + : in_(in), out_(out), ctx_(ctx) {} + const framework::Tensor in_; + framework::Tensor* out_; + const platform::DeviceContext* ctx_; + + template + void apply() { + auto* in_begin = in_.data(); + auto* in_end = in_begin + in_.numel(); + auto* out_begin = out_->mutable_data(in_.place()); + + if (platform::is_cpu_place(in_.place())) { + platform::Transform trans; + auto* context = static_cast(ctx_); + trans(*context, in_begin, in_end, out_begin, + CastDataTypeFunctor()); +#ifdef __NVCC__ + } else if (platform::is_gpu_place(in_.place())) { + platform::Transform trans; + auto* context = static_cast(ctx_); + trans(*context, in_begin, in_end, out_begin, + CastDataTypeFunctor()); + context->Wait(); +#endif + } else { + PADDLE_THROW("Unsupported place!"); + } + } +}; + +void TransDataType(const OpKernelType& kernel_type_for_var, + const OpKernelType& expected_kernel_type, const Tensor& in, + Tensor* out) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + + out->Resize(in.dims()); + auto src_type = kernel_type_for_var.data_type_; + auto dst_type = expected_kernel_type.data_type_; + auto ctx = pool.Get(in.place()); + + switch (src_type) { + case proto::VarType::FP16: + framework::VisitDataType(dst_type, + CastDataType(in, out, ctx)); + break; + case proto::VarType::FP32: + framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); + break; + case proto::VarType::FP64: + framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); + break; + case proto::VarType::INT32: + framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); + break; + case proto::VarType::INT64: + framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); + break; + case proto::VarType::BOOL: + framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); + break; + case proto::VarType::INT16: + framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); + break; + case proto::VarType::UINT8: + framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); + break; + default: + PADDLE_THROW("Not support type %d", src_type); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_type_transform.cu b/paddle/fluid/framework/data_type_transform.cu new file mode 120000 index 00000000..f4649129 --- /dev/null +++ b/paddle/fluid/framework/data_type_transform.cu @@ -0,0 +1 @@ +data_type_transform.cc \ No newline at end of file diff --git a/paddle/fluid/framework/data_type_transform.h b/paddle/fluid/framework/data_type_transform.h new file mode 100644 index 00000000..1c281b03 --- /dev/null +++ b/paddle/fluid/framework/data_type_transform.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/op_kernel_type.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { + +using KernelTypePair = std::pair; + +void TransDataType(const OpKernelType& kernel_type_for_var, + const OpKernelType& expected_kernel_type, const Tensor& in, + Tensor* out); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/data_type_transform_test.cc b/paddle/fluid/framework/data_type_transform_test.cc new file mode 100644 index 00000000..bbebea9f --- /dev/null +++ b/paddle/fluid/framework/data_type_transform_test.cc @@ -0,0 +1,192 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/data_type_transform.h" + +#include "gtest/gtest.h" + +TEST(DataTypeTransform, CPUTransform) { + auto place = paddle::platform::CPUPlace(); + + auto kernel_fp16 = paddle::framework::OpKernelType( + paddle::framework::proto::VarType::FP16, place, + paddle::framework::DataLayout::kAnyLayout, + paddle::framework::LibraryType::kPlain); + + auto kernel_fp32 = paddle::framework::OpKernelType( + paddle::framework::proto::VarType::FP32, place, + paddle::framework::DataLayout::kAnyLayout, + paddle::framework::LibraryType::kPlain); + + auto kernel_fp64 = paddle::framework::OpKernelType( + paddle::framework::proto::VarType::FP64, place, + paddle::framework::DataLayout::kAnyLayout, + paddle::framework::LibraryType::kPlain); + + auto kernel_int32 = paddle::framework::OpKernelType( + paddle::framework::proto::VarType::INT32, place, + paddle::framework::DataLayout::kAnyLayout, + paddle::framework::LibraryType::kPlain); + + auto kernel_int64 = paddle::framework::OpKernelType( + paddle::framework::proto::VarType::INT64, place, + paddle::framework::DataLayout::kAnyLayout, + paddle::framework::LibraryType::kPlain); + + auto kernel_bool = paddle::framework::OpKernelType( + paddle::framework::proto::VarType::BOOL, place, + paddle::framework::DataLayout::kAnyLayout, + paddle::framework::LibraryType::kPlain); + + // data type transform from float32 + { + paddle::framework::Tensor in; + paddle::framework::Tensor out; + + float* ptr = + in.mutable_data(paddle::framework::make_ddim({2, 3}), place); + int data_number = 2 * 3; + + for (int i = 0; i < data_number; ++i) { + ptr[i] = i / 3; + } + + paddle::framework::TransDataType(kernel_fp32, kernel_fp64, in, &out); + double* out_data_double = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_double[i], static_cast(i / 3)); + } + + paddle::framework::TransDataType(kernel_fp32, kernel_int32, in, &out); + int* out_data_int = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_int[i], static_cast(i / 3)); + } + } + + // data type transform from/to float16 + { + paddle::framework::Tensor in; + paddle::framework::Tensor out; + + paddle::platform::float16* ptr = in.mutable_data( + paddle::framework::make_ddim({2, 3}), place); + int data_number = 2 * 3; + + for (int i = 0; i < data_number; ++i) { + ptr[i] = i; + } + + // transform from float16 to other data types + paddle::framework::TransDataType(kernel_fp16, kernel_fp32, in, &out); + float* out_data_float = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_float[i], static_cast(ptr[i])); + } + + paddle::framework::TransDataType(kernel_fp16, kernel_fp64, in, &out); + double* out_data_double = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_double[i], static_cast(ptr[i])); + } + + paddle::framework::TransDataType(kernel_fp16, kernel_int32, in, &out); + int* out_data_int = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_int[i], static_cast(ptr[i])); + } + + paddle::framework::TransDataType(kernel_fp16, kernel_int64, in, &out); + int64_t* out_data_int64 = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_int64[i], static_cast(ptr[i])); + } + + paddle::framework::TransDataType(kernel_fp16, kernel_bool, in, &out); + bool* out_data_bool = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_bool[i], static_cast(ptr[i])); + } + + // transform float to float16 + float* in_data_float = + in.mutable_data(paddle::framework::make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_float[i] = i; + } + + paddle::framework::TransDataType(kernel_fp32, kernel_fp16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, + static_cast(in_data_float[i]).x); + } + + // transform double to float16 + double* in_data_double = + in.mutable_data(paddle::framework::make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_double[i] = i; + } + + paddle::framework::TransDataType(kernel_fp64, kernel_fp16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, + static_cast(in_data_double[i]).x); + } + + // transform int to float16 + int* in_data_int = + in.mutable_data(paddle::framework::make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_int[i] = i; + } + + paddle::framework::TransDataType(kernel_int32, kernel_fp16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, + static_cast(in_data_int[i]).x); + } + + // transform int64 to float16 + int64_t* in_data_int64 = + in.mutable_data(paddle::framework::make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_int64[i] = i; + } + + paddle::framework::TransDataType(kernel_int64, kernel_fp16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, + static_cast(in_data_int64[i]).x); + } + + // transform bool to float16 + bool* in_data_bool = + in.mutable_data(paddle::framework::make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_bool[i] = i; + } + + paddle::framework::TransDataType(kernel_bool, kernel_fp16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, + static_cast(in_data_bool[i]).x); + } + } +} diff --git a/paddle/fluid/framework/data_type_transform_test.cu b/paddle/fluid/framework/data_type_transform_test.cu new file mode 100644 index 00000000..0874509a --- /dev/null +++ b/paddle/fluid/framework/data_type_transform_test.cu @@ -0,0 +1,261 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/data_type_transform.h" +#include "paddle/fluid/framework/tensor_util.h" + +#include "gtest/gtest.h" + +TEST(DataTypeTransform, GPUTransform) { + auto cpu_place = paddle::platform::CPUPlace(); + auto gpu_place = paddle::platform::CUDAPlace(0); + paddle::platform::CUDADeviceContext context(gpu_place); + + auto kernel_fp16 = paddle::framework::OpKernelType( + paddle::framework::proto::VarType::FP16, gpu_place, + paddle::framework::DataLayout::kAnyLayout, + paddle::framework::LibraryType::kPlain); + + auto kernel_fp32 = paddle::framework::OpKernelType( + paddle::framework::proto::VarType::FP32, gpu_place, + paddle::framework::DataLayout::kAnyLayout, + paddle::framework::LibraryType::kPlain); + + auto kernel_fp64 = paddle::framework::OpKernelType( + paddle::framework::proto::VarType::FP64, gpu_place, + paddle::framework::DataLayout::kAnyLayout, + paddle::framework::LibraryType::kPlain); + + auto kernel_int32 = paddle::framework::OpKernelType( + paddle::framework::proto::VarType::INT32, gpu_place, + paddle::framework::DataLayout::kAnyLayout, + paddle::framework::LibraryType::kPlain); + + auto kernel_int64 = paddle::framework::OpKernelType( + paddle::framework::proto::VarType::INT64, gpu_place, + paddle::framework::DataLayout::kAnyLayout, + paddle::framework::LibraryType::kPlain); + + auto kernel_bool = paddle::framework::OpKernelType( + paddle::framework::proto::VarType::BOOL, gpu_place, + paddle::framework::DataLayout::kAnyLayout, + paddle::framework::LibraryType::kPlain); + + // data type transform from float32 + { + paddle::framework::Tensor in; + paddle::framework::Tensor in_gpu; + paddle::framework::Tensor out_gpu; + paddle::framework::Tensor out; + + float* in_ptr = + in.mutable_data(paddle::framework::make_ddim({2, 3}), cpu_place); + float arr[6] = {0, 1, 2, 3, 4, 5}; + int data_number = sizeof(arr) / sizeof(arr[0]); + memcpy(in_ptr, arr, sizeof(arr)); + + paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu); + context.Wait(); + paddle::framework::TransDataType(kernel_fp32, kernel_fp64, in_gpu, + &out_gpu); + paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out); + context.Wait(); + + double* out_data_double = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_double[i], static_cast(arr[i])); + } + + paddle::framework::TransDataType(kernel_fp32, kernel_int32, in_gpu, + &out_gpu); + paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out); + context.Wait(); + + int* out_data_int = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_int[i], static_cast(arr[i])); + } + } + + // data type transform from/to float16 + { + paddle::framework::Tensor in; + paddle::framework::Tensor in_gpu; + paddle::framework::Tensor out_gpu; + paddle::framework::Tensor out; + + paddle::platform::float16* ptr = in.mutable_data( + paddle::framework::make_ddim({2, 3}), cpu_place); + paddle::platform::float16 arr[6] = { + paddle::platform::float16(0), paddle::platform::float16(1), + paddle::platform::float16(2), paddle::platform::float16(3), + paddle::platform::float16(4), paddle::platform::float16(5)}; + + int data_number = sizeof(arr) / sizeof(arr[0]); + memcpy(ptr, arr, sizeof(arr)); + paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu); + context.Wait(); + + // transform from float16 to other data types + paddle::framework::TransDataType(kernel_fp16, kernel_fp32, in_gpu, + &out_gpu); + paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out); + context.Wait(); + + float* out_data_float = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_float[i], static_cast(ptr[i])); + } + + paddle::framework::TransDataType(kernel_fp16, kernel_fp64, in_gpu, + &out_gpu); + paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out); + context.Wait(); + + double* out_data_double = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_double[i], static_cast(ptr[i])); + } + + paddle::framework::TransDataType(kernel_fp16, kernel_int32, in_gpu, + &out_gpu); + paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out); + context.Wait(); + + int* out_data_int = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_int[i], static_cast(ptr[i])); + } + + paddle::framework::TransDataType(kernel_fp16, kernel_int64, in_gpu, + &out_gpu); + paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out); + context.Wait(); + + int64_t* out_data_int64 = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_int64[i], static_cast(ptr[i])); + } + + paddle::framework::TransDataType(kernel_fp16, kernel_bool, in_gpu, + &out_gpu); + paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out); + context.Wait(); + + bool* out_data_bool = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_bool[i], static_cast(ptr[i])); + } + + // transform float to float16 + float* in_data_float = + in.mutable_data(paddle::framework::make_ddim({2, 3}), cpu_place); + for (int i = 0; i < data_number; ++i) { + in_data_float[i] = i; + } + + paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu); + context.Wait(); + paddle::framework::TransDataType(kernel_fp32, kernel_fp16, in_gpu, + &out_gpu); + paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out); + context.Wait(); + + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, + static_cast(in_data_float[i]).x); + } + + // transform double to float16 + double* in_data_double = in.mutable_data( + paddle::framework::make_ddim({2, 3}), cpu_place); + for (int i = 0; i < data_number; ++i) { + in_data_double[i] = i; + } + + paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu); + context.Wait(); + paddle::framework::TransDataType(kernel_fp64, kernel_fp16, in_gpu, + &out_gpu); + paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out); + context.Wait(); + + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, + static_cast(in_data_double[i]).x); + } + + // transform int to float16 + int* in_data_int = + in.mutable_data(paddle::framework::make_ddim({2, 3}), cpu_place); + for (int i = 0; i < data_number; ++i) { + in_data_int[i] = i; + } + + paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu); + context.Wait(); + paddle::framework::TransDataType(kernel_int32, kernel_fp16, in_gpu, + &out_gpu); + paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out); + context.Wait(); + + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, + static_cast(in_data_int[i]).x); + } + + // transform int64 to float16 + int64_t* in_data_int64 = in.mutable_data( + paddle::framework::make_ddim({2, 3}), cpu_place); + for (int i = 0; i < data_number; ++i) { + in_data_int64[i] = i; + } + + paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu); + context.Wait(); + paddle::framework::TransDataType(kernel_int64, kernel_fp16, in_gpu, + &out_gpu); + paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out); + context.Wait(); + + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, + static_cast(in_data_int64[i]).x); + } + + // transform bool to float16 + bool* in_data_bool = + in.mutable_data(paddle::framework::make_ddim({2, 3}), cpu_place); + for (int i = 0; i < data_number; ++i) { + in_data_bool[i] = i; + } + + paddle::framework::TensorCopy(in, gpu_place, context, &in_gpu); + context.Wait(); + paddle::framework::TransDataType(kernel_bool, kernel_fp16, in_gpu, + &out_gpu); + paddle::framework::TensorCopy(out_gpu, cpu_place, context, &out); + context.Wait(); + + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, + static_cast(in_data_bool[i]).x); + } + } +} diff --git a/paddle/fluid/framework/dataset_factory.cc b/paddle/fluid/framework/dataset_factory.cc new file mode 100644 index 00000000..3a28c101 --- /dev/null +++ b/paddle/fluid/framework/dataset_factory.cc @@ -0,0 +1,66 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/dataset_factory.h" +#include +#include +#include + +#include "paddle/fluid/framework/data_set.h" + +namespace paddle { +namespace framework { +typedef std::unique_ptr (*CreateDatasetFunction)(); +typedef std::unordered_map datasetMap; +datasetMap g_dataset_map; + +#define REGISTER_DATASET_CLASS(dataset_class) \ + namespace { \ + std::unique_ptr Creator_##dataset_class() { \ + return std::unique_ptr(new dataset_class); \ + } \ + class __Registerer_##dataset_class { \ + public: \ + __Registerer_##dataset_class() { \ + g_dataset_map[#dataset_class] = &Creator_##dataset_class; \ + } \ + }; \ + __Registerer_##dataset_class g_registerer_##dataset_class; \ + } // namespace + +std::string DatasetFactory::DatasetTypeList() { + std::string dataset_types; + for (auto iter = g_dataset_map.begin(); iter != g_dataset_map.end(); ++iter) { + if (iter != g_dataset_map.begin()) { + dataset_types += ", "; + } + dataset_types += iter->first; + } + return dataset_types; +} + +std::unique_ptr DatasetFactory::CreateDataset( + std::string dataset_class) { + if (g_dataset_map.count(dataset_class) < 1) { + LOG(WARNING) << "Your Dataset " << dataset_class + << "is not supported currently"; + LOG(WARNING) << "Supported Dataset: " << DatasetTypeList(); + exit(-1); + } + return g_dataset_map[dataset_class](); +} + +REGISTER_DATASET_CLASS(MultiSlotDataset); +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/dataset_factory.h b/paddle/fluid/framework/dataset_factory.h new file mode 100644 index 00000000..d4a36cec --- /dev/null +++ b/paddle/fluid/framework/dataset_factory.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/data_set.h" + +namespace paddle { +namespace framework { +class DatasetFactory { + public: + static std::string DatasetTypeList(); + static std::unique_ptr CreateDataset(std::string dataset_class); +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ddim.cc b/paddle/fluid/framework/ddim.cc new file mode 100644 index 00000000..d4e46924 --- /dev/null +++ b/paddle/fluid/framework/ddim.cc @@ -0,0 +1,141 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +DDim make_ddim(std::initializer_list dims) { + return DDim(dims.begin(), dims.size()); +} + +DDim make_ddim(const std::vector& dims) { + return DDim(dims.data(), dims.size()); +} + +DDim make_ddim(const std::vector& dims) { + return DDim(dims.data(), dims.size()); +} + +struct DDimEqualityVisitor { + explicit DDimEqualityVisitor(const int64_t* d) : d_(d) {} + + template + inline bool operator()(const Dim& self) const { + return UnrollCompare::Run(self.Get(), d_); + } + + const int64_t* d_; +}; + +bool DDim::operator==(const DDim& d) const { + return size() == d.size() && + this->apply_visitor(DDimEqualityVisitor(d.Get())); +} + +bool DDim::operator!=(const DDim& d) const { return !(*this == d); } + +std::vector vectorize(const DDim& ddim) { + std::vector result(DDim::kMaxRank); + dynamic_dim_assign(ddim.Get(), result.data(), ddim.size()); + result.resize(ddim.size()); + return result; +} + +// NOTE: framework::vectorize converts to type int64_t +// which does not fit cudnn inputs. +std::vector vectorize2int(const DDim& ddim) { + std::vector result(DDim::kMaxRank); + dynamic_dim_assign(ddim.Get(), result.data(), ddim.size()); + result.resize(ddim.size()); + return result; +} + +struct ProductVisitor { + template + inline int64_t operator()(const Dim& dim) { + return product(dim); + } +}; + +int64_t product(const DDim& ddim) { + return ddim.apply_visitor(ProductVisitor()); +} + +bool contain_unknown_dim(const DDim& ddim) { + for (int i = 0; i < ddim.size(); ++i) { + if (ddim[i] < 0) { + return true; + } + } + + return false; +} + +DDim slice_ddim(const DDim& dim, int begin, int end) { + PADDLE_ENFORCE(begin >= 0 && end <= dim.size(), + "[begin(%d), end(%d)) must be inside [0, %d) in ddim slice.", + begin, end, dim.size()); + // Constructor of DDim would check whether end - begin is valid + return DDim(dim.Get() + begin, end - begin); +} + +int arity(const DDim& d) { return d.size(); } + +struct DDimPrinter { + std::ostream& os; + explicit DDimPrinter(std::ostream& os_) : os(os_) {} + + template + void operator()(const Dim& t) { + os << t; + } +}; + +std::ostream& operator<<(std::ostream& os, const DDim& ddim) { + ddim.apply_visitor(DDimPrinter(os)); + return os; +} + +DDim flatten_to_2d(const DDim& src, int num_col_dims) { + return DDim({product(slice_ddim(src, 0, num_col_dims)), + product(slice_ddim(src, num_col_dims, src.size()))}); +} + +DDim flatten_to_1d(const DDim& src) { return DDim({product(src)}); } + +DDim stride(const DDim& ddim) { + DDim strides; + strides.rank_ = ddim.size(); + strides[ddim.size() - 1] = 1; + for (int i = ddim.size() - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * ddim[i + 1]; + } + return strides; +} + +DDim stride_numel(const DDim& ddim) { + DDim strides; + strides.rank_ = ddim.size(); + strides[ddim.size() - 1] = ddim[ddim.size() - 1]; + for (int i = ddim.size() - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * ddim[i]; + } + return strides; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ddim.h b/paddle/fluid/framework/ddim.h new file mode 100644 index 00000000..bfe3e55a --- /dev/null +++ b/paddle/fluid/framework/ddim.h @@ -0,0 +1,209 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/dim.h" + +namespace paddle { +namespace framework { + +#define PADDLE_VISIT_DDIM_BASE(rank, callback) \ + case (rank): { \ + constexpr auto kRank = (rank); \ + return (callback); \ + } + +#define PADDLE_VISIT_DDIM(rank, callback) \ + switch (rank) { \ + PADDLE_VISIT_DDIM_BASE(0, callback); \ + PADDLE_VISIT_DDIM_BASE(1, callback); \ + PADDLE_VISIT_DDIM_BASE(2, callback); \ + PADDLE_VISIT_DDIM_BASE(3, callback); \ + PADDLE_VISIT_DDIM_BASE(4, callback); \ + PADDLE_VISIT_DDIM_BASE(5, callback); \ + PADDLE_VISIT_DDIM_BASE(6, callback); \ + PADDLE_VISIT_DDIM_BASE(7, callback); \ + PADDLE_VISIT_DDIM_BASE(8, callback); \ + PADDLE_VISIT_DDIM_BASE(9, callback); \ + default: \ + PADDLE_THROW("Invalid rank %d", rank); \ + } + +template +inline void dynamic_dim_assign(const T1* in, T2* out, int n) { + PADDLE_VISIT_DDIM(n, (static_dim_assign(in, out))); +} + +/** + * \brief A dynamically sized dimension. + * + * The number of dimensions must be between [1, 9]. + */ +class DDim { + public: + constexpr static int kMaxRank = 9; + + DDim() : rank_(1) { dim_[0] = 0; } + + DDim(const DDim& ddim) : dim_() { CopyFrom(ddim); } + + DDim(const int* d, int n) : rank_(n) { + dynamic_dim_assign(d, dim_.GetMutable(), n); + } + + DDim(const int64_t* d, int n) : rank_(n) { + dynamic_dim_assign(d, dim_.GetMutable(), n); + } + + template + /*implicit*/ DDim(const Dim& in) : rank_(D) { // NOLINT + UnsafeCast() = in; + } + + /*implicit*/ DDim(std::initializer_list init_list) + : DDim(init_list.begin(), init_list.size()) {} + + inline DDim& operator=(const DDim& ddim) { return CopyFrom(ddim); } + + template + inline DDim& operator=(const Dim& dim) { + rank_ = D; + UnsafeCast() = dim; + return *this; + } + + inline int64_t& operator[](int idx) { return dim_[idx]; } + + inline int64_t operator[](int idx) const { return dim_[idx]; } + + inline int64_t& at(int idx) { + PADDLE_ENFORCE(idx >= 0 && idx < rank_, "Invalid idx %d", idx); + return dim_[idx]; + } + + inline int64_t at(int idx) const { + PADDLE_ENFORCE(idx >= 0 && idx < rank_, "Invalid idx %d", idx); + return dim_[idx]; + } + + template + typename std::result_of&)>::type apply_visitor( + Visitor&& visitor) { + PADDLE_VISIT_DDIM(rank_, visitor(UnsafeCast())); + } + + template + typename std::result_of&)>::type apply_visitor( + Visitor&& visitor) const { + PADDLE_VISIT_DDIM(rank_, visitor(UnsafeCast())); + } + + bool operator==(const DDim& d) const; + + bool operator!=(const DDim& d) const; + + inline const int64_t* Get() const { return dim_.Get(); } + + inline int64_t* GetMutable() { return dim_.GetMutable(); } + + inline int size() const { return rank_; } + + private: + template + inline Dim& UnsafeCast() { + static_assert(D >= 0 && D <= kMaxRank, "Invalid rank"); + auto* p = static_cast(&dim_); + return *reinterpret_cast*>(p); + } + + template + inline const Dim& UnsafeCast() const { + static_assert(D >= 0 && D <= kMaxRank, "Invalid rank"); + auto* p = static_cast(&dim_); + return *reinterpret_cast*>(p); + } + + inline DDim& CopyFrom(const DDim& ddim) { + PADDLE_VISIT_DDIM(ddim.rank_, (*this = ddim.UnsafeCast())); + } + + friend DDim stride(const DDim& ddim); + friend DDim stride_numel(const DDim& ddim); + + private: + Dim dim_; + int rank_; +}; + +#undef PADDLE_VISIT_DDIM_BASE +#undef PADDLE_VISIT_DDIM + +/** + * \brief Make a DDim from std::vector + * + * \param dims An vector of ints. Must be sized between [1, 9] + */ +DDim make_ddim(const std::vector& dims); + +DDim make_ddim(const std::vector& dims); + +/** + * \brief Make a DDim from an initializer list + * + * \param dims An initializer list of ints. Must be sized between [1, 9] + * + */ +DDim make_ddim(std::initializer_list dims); + +std::vector vectorize(const DDim& ddim); +std::vector vectorize2int(const DDim& ddim); + +int64_t product(const DDim& ddim); + +bool contain_unknown_dim(const DDim& ddim); + +/** + * \brief Slice a ddim + * + * Slice dim with [begin, end). + * e.g. DDim d = make_ddim({1,2,3,4,5}); + * slice_ddim(d, 1, 3); ====> {2,3} + */ +DDim slice_ddim(const DDim& dim, int begin, int end); + +/** + * \brief What is the length of this dimension? + * + * \param Dynamic dimension to inspect + */ + +int arity(const DDim& ddim); + +std::ostream& operator<<(std::ostream&, const DDim&); + +// Reshape a tensor to a matrix. The matrix's first dimension(column length) +// will be the product of tensor's first `num_col_dims` dimensions. +DDim flatten_to_2d(const DDim& src, int num_col_dims); + +DDim flatten_to_1d(const DDim& src); + +DDim stride(const DDim& ddim); + +DDim stride_numel(const DDim& ddim); +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ddim_test.cc b/paddle/fluid/framework/ddim_test.cc new file mode 100644 index 00000000..b7b42fa0 --- /dev/null +++ b/paddle/fluid/framework/ddim_test.cc @@ -0,0 +1,85 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/ddim.h" + +TEST(DDim, Equality) { + // construct a DDim from an initialization list + paddle::framework::DDim ddim = paddle::framework::make_ddim({9, 1, 5}); + EXPECT_EQ(ddim[0], 9); + EXPECT_EQ(ddim[1], 1); + EXPECT_EQ(ddim[2], 5); + + // construct a DDim from a vector + std::vector vec({9, 1, 5}); + paddle::framework::DDim vddim = paddle::framework::make_ddim(vec); + EXPECT_EQ(ddim[0], 9); + EXPECT_EQ(ddim[1], 1); + EXPECT_EQ(ddim[2], 5); + + // mutate a DDim + ddim[1] = 2; + EXPECT_EQ(ddim[1], 2); + ddim[0] = 6; + EXPECT_EQ(ddim[0], 6); + + // vectorize a DDim + std::vector res_vec = paddle::framework::vectorize(vddim); + EXPECT_EQ(res_vec[0], 9); + EXPECT_EQ(res_vec[1], 1); + EXPECT_EQ(res_vec[2], 5); + paddle::framework::Dim<3> d(3, 2, 1); + res_vec = paddle::framework::vectorize(paddle::framework::DDim(d)); + EXPECT_EQ(res_vec[0], 3); + EXPECT_EQ(res_vec[1], 2); + EXPECT_EQ(res_vec[2], 1); + + // arity of a DDim + EXPECT_EQ(paddle::framework::arity(ddim), 3); + EXPECT_EQ(ddim.size(), 3); + + // product of a DDim + EXPECT_EQ(paddle::framework::product(vddim), 45); + EXPECT_EQ( + paddle::framework::product(paddle::framework::make_ddim({3, 2, 5, 3})), + 90); + + // slice a DDim + paddle::framework::DDim ddim2 = + paddle::framework::make_ddim({1, 2, 3, 4, 5, 6}); + paddle::framework::DDim ss = paddle::framework::slice_ddim(ddim2, 2, 5); + EXPECT_EQ(arity(ss), 3); + EXPECT_EQ(ss[0], 3); + EXPECT_EQ(ss[1], 4); + EXPECT_EQ(ss[2], 5); + paddle::framework::DDim ss2 = paddle::framework::slice_ddim(ddim2, 0, 6); + EXPECT_EQ(arity(ss2), 6); + EXPECT_EQ(ss2[0], 1); + EXPECT_EQ(ss2[1], 2); + EXPECT_EQ(ss2[2], 3); + EXPECT_EQ(ss2[3], 4); + EXPECT_EQ(ss2[4], 5); + EXPECT_EQ(ss2[5], 6); +} + +TEST(DDim, Print) { + // print a DDim + std::stringstream ss; + paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 3, 4}); + ss << ddim; + EXPECT_EQ("2, 3, 4", ss.str()); +} diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt new file mode 100644 index 00000000..c566f0a5 --- /dev/null +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -0,0 +1,99 @@ +cc_library(var_handle SRCS var_handle.cc DEPS place framework_proto node) +cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor) + +cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) +cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) +cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry) +cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place operator op_registry) +cc_library(fetch_barrier_op_handle SRCS fetch_barrier_op_handle.cc DEPS framework_proto scope place operator op_registry) +cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper) + +cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows) + +if(WITH_DISTRIBUTE) + if(NOT WITH_GRPC) + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + set_source_files_properties(reduce_op_handle.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + endif() +endif() + + +if(WITH_GPU) + nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory + dynload_cuda variable_visitor) + nv_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory + dynload_cuda variable_visitor) + + if(WITH_DGC) + nv_library(sparse_all_reduce_op_handle SRCS sparse_all_reduce_op_handle.cc DEPS op_handle_base scope + lod_tensor ddim memory dynload_cuda variable_visitor dgc all_reduce_op_handle) + endif() + + if(WITH_DISTRIBUTE) + nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope + ddim dynload_cuda selected_rows_functor sendrecvop_rpc) + else() + nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope + ddim dynload_cuda selected_rows_functor) + endif() + nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda) + nv_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle) + +else() + cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory + variable_visitor) + cc_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory + variable_visitor) + if(WITH_DISTRIBUTE) + cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope + ddim selected_rows_functor sendrecvop_rpc) + else() + cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope + ddim selected_rows_functor) + endif() + cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) + cc_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle) +endif() + +cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) + +cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper) + +cc_library(share_tensor_buffer_op_handle SRCS share_tensor_buffer_op_handle.cc DEPS op_handle_base scope) + +set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass inplace_op_pass buffer_shared_inplace_op_pass) +if (WITH_GPU) + list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass) +endif() +cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS}) + +cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope + simple_threadpool device_context) + +cc_library(parallel_ssa_graph_executor SRCS parallel_ssa_graph_executor.cc DEPS threaded_ssa_graph_executor) + +set(ASYNC_SSA_GRAPH_EXECUTOR_DEPS threaded_ssa_graph_executor) +if(WITH_DISTRIBUTE) + list(APPEND ASYNC_SSA_GRAPH_EXECUTOR_DEPS communicator) +endif() +cc_library(async_ssa_graph_executor SRCS async_ssa_graph_executor.cc DEPS ${ASYNC_SSA_GRAPH_EXECUTOR_DEPS}) + +cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory + device_context broadcast_op_handle) +cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory + device_context gather_op_handle) +cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_executor.cc DEPS ssa_graph_executor) +#cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory +# device_context reduce_op_handle ) +cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc + DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context) +cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fused_broadcast_op_handle) + +cc_library(build_strategy SRCS build_strategy.cc DEPS + graph_viz_pass multi_devices_graph_pass + multi_devices_graph_print_pass multi_devices_graph_check_pass + fuse_elewise_add_act_pass multi_batch_merge_pass + fuse_relu_depthwise_conv_pass + memory_optimize_pass lock_free_optimize_pass + coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass + fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass record_skip_memory_opt_vars_pass) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc new file mode 100644 index 00000000..f806a4fa --- /dev/null +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -0,0 +1,168 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/fluid/framework/details/all_reduce_op_handle.h" +#include +#include "paddle/fluid/framework/details/container_cast.h" +#include "paddle/fluid/framework/details/reduce_and_gather.h" +#include "paddle/fluid/framework/details/variable_visitor.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/profiler.h" + +// asynchronous nccl allreduce or synchronous issue: +// https://github.com/PaddlePaddle/Paddle/issues/15049 +DEFINE_bool( + sync_nccl_allreduce, true, + "If set true, will call `cudaStreamSynchronize(nccl_stream)`" + "after allreduce, this mode can get better performance in some scenarios."); + +namespace paddle { +namespace framework { +namespace details { + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, + const std::vector &local_scopes, + const std::vector &places, + const platform::NCCLCommunicator *ctxs) + : NCCLOpHandleBase(node, places, ctxs), local_scopes_(local_scopes) { + PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); +} +#else +AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, + const std::vector &local_scopes, + const std::vector &places) + : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {} +#endif + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +void AllReduceOpHandle::RunAllReduceFuncs( + const std::vector> &all_reduce_calls) { + this->RunAndRecordEvent([&] { + if (all_reduce_calls.size() == 1UL) { + // Do not use NCCLGroup when manage NCCL by per thread per device + all_reduce_calls[0](); + } else { + platform::NCCLGroupGuard guard; + for (auto &call : all_reduce_calls) { + call(); + } + } + }); + + if (FLAGS_sync_nccl_allreduce) { + for (auto &p : places_) { + int dev_id = boost::get(p).device; + auto *nccl_ctxs = + nccl_ctxs_->GetRunEnvNCCLCtx(run_order_, use_hierarchical_allreduce_); + auto &nccl_ctx = nccl_ctxs->at(dev_id); + auto stream = nccl_ctx.stream(); + cudaError_t e_sync = cudaStreamSynchronize(stream); + if (e_sync != 0) { + LOG(FATAL) << "cudaStreamSynchronize " << cudaGetErrorString(e_sync); + } + + cudaError_t e_get = cudaGetLastError(); + if (e_get != 0) { + LOG(FATAL) << "cudaGetLastError " << cudaGetErrorString(e_get) + << " errno:" << e_get; + } + } + } +} +#endif + +void AllReduceOpHandle::RunImpl() { + platform::RecordEvent record_event(Name()); + + WaitInputVarGenerated(); + + auto in_var_handles = DynamicCast(this->Inputs()); + auto out_var_handles = DynamicCast(this->Outputs()); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), places_.size(), + "The NoDummyInputSize should be equal to the number of places."); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), out_var_handles.size(), + "The NoDummyInputSize and NoDummyOutputSize should be equal."); + + std::vector lod_tensors; + for (size_t i = 0; i < local_scopes_.size(); ++i) { + auto &local_scope = local_exec_scopes_[i]; + auto &lod_tensor = + local_scope->FindVar(in_var_handles[i]->name())->Get(); + lod_tensors.emplace_back(&lod_tensor); + VLOG(10) << "place:" << i << ", input_name:" << in_var_handles[i]->name() + << ", out_name:" << out_var_handles[i]->name(); + PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(), + "The name of input and output should be equal."); + } + + if (platform::is_gpu_place(lod_tensors[0]->place())) { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr."); + int dtype = -1; + size_t numel = 0; + std::vector> all_reduce_calls; + for (size_t i = 0; i < local_scopes_.size(); ++i) { + auto &p = places_[i]; + auto &lod_tensor = *lod_tensors[i]; + void *buffer = const_cast(lod_tensor.data()); + + if (dtype == -1) { + dtype = platform::ToNCCLDataType(lod_tensor.type()); + } + + if (numel == 0) { + numel = static_cast(lod_tensor.numel()); + } + + all_reduce_calls.emplace_back([=] { + NCCLAllReduce(p, buffer, buffer, numel, + static_cast(dtype), ncclSum); + }); + } + VLOG(10) << "allreduce size:" << numel * SizeOfType(lod_tensors[0]->type()); + RunAllReduceFuncs(all_reduce_calls); +#else + PADDLE_THROW("Not compiled with CUDA"); +#endif + } else { // Special handle CPU only Operator's gradient. Like CRF + auto &trg = *this->local_exec_scopes_[0] + ->FindVar(out_var_handles[0]->name()) + ->GetMutable(); + + // Reduce All Tensor to trg in CPU + ReduceLoDTensor func(lod_tensors, &trg); + VisitDataType(lod_tensors[0]->type(), func); + + for (size_t i = 1; i < local_scopes_.size(); ++i) { + auto &scope = local_exec_scopes_[i]; + auto &p = places_[i]; + auto *var = scope->FindVar(out_var_handles[i]->name()); + auto *dev_ctx = dev_ctxes_.at(p); + + RunAndRecordEvent(p, [&trg, var, dev_ctx, p] { + auto &tensor_gpu = *var->GetMutable(); + auto &tensor_cpu = trg; + TensorCopy(tensor_cpu, p, *dev_ctx, &tensor_gpu); + }); + } + } +} + +std::string AllReduceOpHandle::Name() const { return "all_reduce"; } +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.h b/paddle/fluid/framework/details/all_reduce_op_handle.h new file mode 100644 index 00000000..ed5e475a --- /dev/null +++ b/paddle/fluid/framework/details/all_reduce_op_handle.h @@ -0,0 +1,71 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#include "paddle/fluid/framework/details/nccl_op_handle.h" +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace framework { +namespace details { + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +class AllReduceOpHandle : public NCCLOpHandleBase { + public: + AllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, + const std::vector &places, + const platform::NCCLCommunicator *ctxs); +#else +class AllReduceOpHandle : public OpHandleBase { + public: + AllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, + const std::vector &places); +#endif + std::string Name() const override; + + // Delay and buffer nccl_all_reduce together can significantly increase + // performance. Disable this feature by returning false. + bool IsMultiDeviceTransfer() override { return true; }; + + protected: + void RunImpl() override; + + std::vector GetLocalScopes() override { return local_scopes_; } + + std::vector local_scopes_; + +#if !(defined(PADDLE_WITH_CUDA) && !defined(_WIN32)) + // NCCLOpHandleBase already have these attributes. + // Will polish it by class inheritance framework. + std::vector places_; +#endif + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + void RunAllReduceFuncs( + const std::vector> &all_reduce_calls); +#endif +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc new file mode 100644 index 00000000..2e247075 --- /dev/null +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -0,0 +1,210 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/async_ssa_graph_executor.h" + +#include "paddle/fluid/framework/variable_helper.h" + +#ifdef PADDLE_WITH_DISTRIBUTE +#include "paddle/fluid/operators/distributed/communicator.h" +#endif + +namespace paddle { +namespace framework { +namespace details { + +inline void InitVarsInScope(const std::vector &var_infos, Scope *scope, + Scope *local_scope) { + VLOG(3) << "InitVarsInScope"; + for (auto &info : var_infos) { + if (info.persistable_) { // Persistable + auto *var = scope->FindVar(info.name_); + if (var != nullptr) { + VLOG(2) << info.name_ + << " has been initialized beforehand in global scope, skipped"; + continue; + } + InitializeVariable(scope->Var(info.name_), info.type_); + } else { + InitializeVariable(local_scope->Var(info.name_), info.type_); + } + } +} + +// get RpcContext and remote send and recv op +void ProcessGraph(std::vector graphs, Scope *scope) { +#ifdef PADDLE_WITH_DISTRIBUTE + using RpcCtxMap = operators::distributed::RpcCtxMap; + VLOG(3) << "ProcessGraph"; + RpcCtxMap send_varname_to_ctx; + RpcCtxMap recv_varname_to_ctx; + for (auto &node : graphs[0]->Nodes()) { + VLOG(3) << "node name " << node->Name(); + if (node && node->IsOp()) { + if (node->Name() == "send") { + auto send_var_name = node->Op()->Input("X")[0]; + auto send_varnames = boost::get>( + node->Op()->GetNullableAttr("send_varnames")); + auto epmap = boost::get>( + node->Op()->GetNullableAttr("epmap")); + auto height_section = boost::get>( + node->Op()->GetNullableAttr("sections")); + auto trainer_id = + boost::get(node->Op()->GetNullableAttr("trainer_id")); + send_varname_to_ctx[send_var_name] = operators::distributed::RpcContext( + send_var_name, send_varnames, epmap, height_section, trainer_id); + VLOG(3) << "find and init an send op: " + << send_varname_to_ctx[send_var_name]; + } else if (node->Name() == "recv") { + auto recv_var_name = node->Op()->Output("Out")[0]; + auto recv_varnames = boost::get>( + node->Op()->GetNullableAttr("recv_varnames")); + auto epmap = boost::get>( + node->Op()->GetNullableAttr("epmap")); + auto trainer_id = + boost::get(node->Op()->GetNullableAttr("trainer_id")); + recv_varname_to_ctx[recv_var_name] = operators::distributed::RpcContext( + recv_var_name, recv_varnames, epmap, {}, trainer_id); + VLOG(3) << "find and remove an recv op: " + << recv_varname_to_ctx[recv_var_name]; + } + } + } + + // init communicator here + if (send_varname_to_ctx.size() > 0) { + VLOG(3) << "this is distribute mode, will use communicator"; + + if (operators::distributed::Communicator::GetInstance() == nullptr) { + operators::distributed::Communicator::Init(send_varname_to_ctx, + recv_varname_to_ctx, scope); + operators::distributed::Communicator::GetInstance()->Start(); + } else { + VLOG(3) << "communicator has been initialized, skip"; + } + } +#endif +} + +AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( + const ExecutionStrategy &strategy, const std::vector &local_scopes, + const std::vector &local_exec_scopes, + const std::vector &places, std::vector graphs) + : strategy_(std::move(strategy)), + local_scopes_(std::move(local_scopes)), + local_exec_scopes_(local_exec_scopes), + pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr), + places_(std::move(places)), + graphs_(std::move(graphs)) { + VLOG(3) << "build AsyncSSAGraphExecutor"; + PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); + PADDLE_ENFORCE_EQ(local_scopes_.size(), local_exec_scopes_.size()); + + // set the correct size of thread pool to each device. + strategy_.num_threads_ = strategy_.num_threads_ < places_.size() + ? 1UL + : strategy_.num_threads_ / places_.size(); + VLOG(1) << "set num_threads: " << strategy_.num_threads_ + << " to run the operators of the graph on each device."; + for (size_t i = 0; i < places.size(); ++i) { + executors_.emplace_back(new details::ThreadedSSAGraphExecutor( + strategy_, {local_scopes_[i]}, {local_exec_scopes_[i]}, {places_[i]}, + graphs_[i])); + } + + for (auto &node : graphs_[0]->Nodes()) { + if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { + var_infos_.emplace_back(); + var_infos_.back().name_ = node->Var()->Name(); + var_infos_.back().type_ = node->Var()->GetType(); + var_infos_.back().persistable_ = node->Var()->Persistable(); + } + } + + for (size_t i = 0; i < local_scopes_.size(); ++i) { + InitVarsInScope(var_infos_, local_scopes_[i], local_exec_scopes_[i]); + } + ProcessGraph(graphs_, local_scopes_[0]); +} + +void AsyncSSAGraphExecutor::StartOffPythonTrainLoop() { + VLOG(3) << "StartOffPythonTrainLoop size = " << places_.size(); + for (size_t i = 1; i < places_.size(); ++i) { + auto call = [this, i]() -> void { + VLOG(3) << "start off python thread " << i; + try { + while (true) { + executors_[i]->Run({}); + } + } catch (...) { + exception_holder_.Catch(std::current_exception()); + VLOG(3) << "get exception type = " << exception_holder_.Type(); + } + VLOG(3) << "thread " << i << " exited!"; + }; + run_futures_.emplace_back(pool_->enqueue(std::move(call))); + } +} + +void AsyncSSAGraphExecutor::HandleException() { + if (exception_holder_.IsCaught()) { + for (auto &f : run_futures_) { + VLOG(3) << "wait future"; + f.wait(); + } + VLOG(3) << "caught exception " << exception_holder_.Type() + << ", rethrow it"; + run_futures_.clear(); + exception_holder_.ReThrow(); + } +} + +FeedFetchList AsyncSSAGraphExecutor::Run( + const std::vector &fetch_tensors) { + // init once + if (run_futures_.size() == 0 && places_.size() > 1) { + exception_holder_.Clear(); + StartOffPythonTrainLoop(); + } + + if (places_.size() == 1) { + exception_holder_.Clear(); + } else { + HandleException(); + } + + FeedFetchList fetch_data; + fetch_data.reserve(fetch_tensors.size()); + + try { + fetch_data = executors_[0]->Run(fetch_tensors); + } catch (...) { + exception_holder_.Catch(std::current_exception()); + } + + HandleException(); + + FeedFetchList ret; + for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) { + std::vector lodtensor_ptrs; + lodtensor_ptrs.push_back(&fetch_data.at(fetch_idx)); + ret.emplace_back(); + ret.back().MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace()); + } + return ret; +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.h b/paddle/fluid/framework/details/async_ssa_graph_executor.h new file mode 100644 index 00000000..97472674 --- /dev/null +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.h @@ -0,0 +1,67 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include + +#include "ThreadPool.h" +#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" + +namespace paddle { +namespace framework { +namespace details { + +struct VarInfo { + std::string name_; + proto::VarType::Type type_; + bool persistable_; +}; + +class AsyncSSAGraphExecutor : public SSAGraphExecutor { + public: + AsyncSSAGraphExecutor(const ExecutionStrategy &strategy, + const std::vector &local_scopes, + const std::vector &local_exec_scopes, + const std::vector &places, + std::vector graphs); + ~AsyncSSAGraphExecutor() final = default; + const ir::Graph &Graph() const override { return *graphs_[0]; } + + FeedFetchList Run(const std::vector &fetch_tensors) override; + + private: + void StartOffPythonTrainLoop(); + void HandleException(); + + private: + ExecutionStrategy strategy_; + std::vector local_scopes_; + std::vector local_exec_scopes_; + std::unique_ptr<::ThreadPool> pool_{nullptr}; + std::vector places_; + std::vector graphs_; + + std::vector> executors_; + ExceptionHolder exception_holder_; + std::vector> run_futures_; + std::vector var_infos_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc new file mode 100644 index 00000000..75143b9a --- /dev/null +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -0,0 +1,169 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/broadcast_op_handle.h" +#include "paddle/fluid/framework/details/container_cast.h" +#include "paddle/fluid/framework/details/variable_visitor.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace framework { +namespace details { + +void BroadcastOpHandle::RunImpl() { + platform::RecordEvent record_event(Name()); + + if (places_.size() == 1) return; + + // The input and output may have dummy vars. + auto in_var_handles = DynamicCast(inputs_); + auto out_var_handles = DynamicCast(outputs_); + + PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL, + "The number of input should be one."); + PADDLE_ENFORCE_EQ( + out_var_handles.size(), places_.size(), + "The number of output should equal to the number of places."); + + VarHandle *in_var_handle = in_var_handles[0]; + + WaitInputVarGenerated(); + + BroadcastOneVar(*in_var_handle, out_var_handles, local_exec_scopes_); +} + +void BroadcastOpHandle::BroadcastOneVar( + const VarHandle &in_var_handle, + const std::vector &out_var_handles, + const std::vector &var_scopes) { + auto *in_var = + var_scopes.at(in_var_handle.scope_idx())->FindVar(in_var_handle.name()); + PADDLE_ENFORCE_NOT_NULL(in_var); + Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var); + if (UNLIKELY(!in_tensor.IsInitialized())) { + VLOG(3) << "in var " << in_var_handle.name() << "not inited, return!"; + return; + } + + InitOutputValue(in_var_handle, out_var_handles); + + if (platform::is_cpu_place(in_tensor.place())) { + for (auto *out_var_handle : out_var_handles) { + if (out_var_handle->IsTheSameVar(in_var_handle)) { + continue; + } + auto &out_p = out_var_handle->place(); + auto *out_var = var_scopes.at(out_var_handle->scope_idx()) + ->FindVar(out_var_handle->name()); + + RunAndRecordEvent(out_p, [in_tensor, out_var] { + paddle::framework::TensorCopy( + in_tensor, platform::CPUPlace(), + &VariableVisitor::GetMutableTensor(out_var)); + }); + } + } else { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + VarHandle *out_handle = nullptr; + int root_id = boost::get(in_tensor.place()).device; + std::vector> broadcast_calls; + + int type = platform::ToNCCLDataType(in_tensor.type()); + size_t numel = static_cast(in_tensor.numel()); + + for (auto out_var_handle : out_var_handles) { + Variable *out_var = var_scopes.at(out_var_handle->scope_idx()) + ->FindVar(out_var_handle->name()); + + int dst_id = + boost::get(out_var_handle->place()).device; + + auto &nccl_ctx = nccl_ctxs_->at(dst_id); + + void *send_recv_buffer = nullptr; + if (root_id == dst_id) { + send_recv_buffer = const_cast(in_tensor.data()); + out_handle = out_var_handle; + } else { + send_recv_buffer = VariableVisitor::GetMutableTensor(out_var) + .Resize(in_tensor.dims()) + .mutable_data(out_var_handle->place()); + } + + broadcast_calls.emplace_back( + [send_recv_buffer, numel, type, root_id, &nccl_ctx] { + PADDLE_ENFORCE(platform::dynload::ncclBcast( + send_recv_buffer, numel, static_cast(type), + root_id, nccl_ctx.comm_, nccl_ctx.stream())); + }); + } + + this->RunAndRecordEvent([&] { + { + platform::NCCLGroupGuard guard; + for (auto &call : broadcast_calls) { + call(); + } + } + + if (!out_handle->IsTheSameVar(in_var_handle)) { + auto out_var = var_scopes.at(in_var_handle.scope_idx()) + ->FindVar(out_var_handles[0]->name()); + paddle::framework::TensorCopy( + in_tensor, in_var_handle.place(), + *(dev_ctxes_.at(in_var_handle.place())), + &VariableVisitor::GetMutableTensor(out_var)); + } + }); +#else + PADDLE_THROW("CUDA is not enabled."); +#endif + } +} + +void BroadcastOpHandle::InitOutputValue( + const VarHandle &in_var_handle, + const std::vector &out_var_handles) const { + auto &var_scopes = local_exec_scopes_; + auto *in_var = + var_scopes.at(in_var_handle.scope_idx())->FindVar(in_var_handle.name()); + + Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var); + + // NOTE: The tensors' Place of input and output must be all on GPU or all on + // CPU. + for (auto *out_var_handle : out_var_handles) { + if (out_var_handle->IsTheSameVar(in_var_handle)) { + continue; + } + auto t_out_p = out_var_handle->place(); + auto *out_var = var_scopes.at(out_var_handle->scope_idx()) + ->FindVar(out_var_handle->name()); + PADDLE_ENFORCE_NOT_NULL(out_var); + if (is_gpu_place(in_tensor.place())) { + PADDLE_ENFORCE(platform::is_gpu_place(t_out_p), + "Places of input and output must be all on GPU."); + } else { + t_out_p = platform::CPUPlace(); + } + VariableVisitor::ShareDimsAndLoD(*in_var, out_var); + VariableVisitor::GetMutableTensor(out_var).mutable_data(t_out_p, + in_tensor.type()); + } +} + +std::string BroadcastOpHandle::Name() const { return "broadcast"; } +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h new file mode 100644 index 00000000..45ccbb41 --- /dev/null +++ b/paddle/fluid/framework/details/broadcast_op_handle.h @@ -0,0 +1,82 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/device_context.h" + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace framework { +namespace details { + +struct BroadcastOpHandle : public OpHandleBase { + public: +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + BroadcastOpHandle(ir::Node *node, const std::vector &local_scopes, + const std::vector &places, + const platform::NCCLContextMap *nccl_ctxs) + : OpHandleBase(node), + local_scopes_(local_scopes), + places_(places), + nccl_ctxs_(nccl_ctxs) { + if (nccl_ctxs_) { + for (auto &p_ctx : nccl_ctxs_->contexts_) { + this->SetDeviceContext(platform::CUDAPlace(p_ctx.first), + p_ctx.second.ctx_.get()); + } + } + } +#else + BroadcastOpHandle(ir::Node *node, const std::vector &local_scopes, + const std::vector &places) + : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {} +#endif + + std::string Name() const override; + + bool IsMultiDeviceTransfer() override { return true; }; + + protected: + void RunImpl() override; + + std::vector GetLocalScopes() override { return local_scopes_; } + + void BroadcastOneVar(const VarHandle &in_var_handle, + const std::vector &out_var_handles, + const std::vector &var_scopes); + + std::vector local_scopes_; + std::vector places_; +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + const platform::NCCLContextMap *nccl_ctxs_; +#endif + + void InitOutputValue(const VarHandle &in_var_handle, + const std::vector &out_var_handles) const; +}; +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.cc b/paddle/fluid/framework/details/broadcast_op_handle_test.cc new file mode 100644 index 00000000..650de5a4 --- /dev/null +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc @@ -0,0 +1,57 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/broadcast_op_handle_test.h" + +namespace paddle { +namespace framework { +namespace details { + +TEST(BroadcastTester, TestCPUBroadcastTestLodTensor) { + TestBroadcastOpHandle test_op; + size_t input_scope_idx = 0; + test_op.InitCtxOnGpu(false); + test_op.InitBroadcastOp(input_scope_idx); + test_op.TestBroadcastLodTensor(input_scope_idx); +} + +TEST(BroadcastTester, TestCPUBroadcastTestSelectedRows) { + TestBroadcastOpHandle test_op; + size_t input_scope_idx = 0; + test_op.InitCtxOnGpu(false); + test_op.InitBroadcastOp(input_scope_idx); + test_op.TestBroadcastSelectedRows(input_scope_idx); +} + +#ifdef PADDLE_WITH_CUDA +TEST(BroadcastTester, TestGPUBroadcastTestLodTensor) { + TestBroadcastOpHandle test_op; + size_t input_scope_idx = 0; + test_op.InitCtxOnGpu(true); + test_op.InitBroadcastOp(input_scope_idx); + test_op.TestBroadcastLodTensor(input_scope_idx); +} + +TEST(BroadcastTester, TestGPUBroadcastTestSelectedRows) { + TestBroadcastOpHandle test_op; + size_t input_scope_idx = 0; + test_op.InitCtxOnGpu(true); + test_op.InitBroadcastOp(input_scope_idx); + test_op.TestBroadcastSelectedRows(input_scope_idx); +} +#endif + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h new file mode 100644 index 00000000..abc3f39e --- /dev/null +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h @@ -0,0 +1,276 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/details/broadcast_op_handle.h" + +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { +namespace details { + +namespace f = paddle::framework; +namespace p = paddle::platform; + +// test data amount +const f::DDim kDims = {20, 20}; + +struct TestBroadcastOpHandle { + std::vector> ctxs_; + std::vector local_scopes_; + std::vector param_scopes_; + Scope g_scope_; + OpHandleBase* op_handle_; + std::vector vars_; + std::vector> nodes_; + std::vector place_list_; + bool use_gpu_; +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + std::unique_ptr nccl_ctxs_; +#endif + + void WaitAll() { + for (size_t j = 0; j < ctxs_.size(); ++j) { + ctxs_[j]->Wait(); + } +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + if (nccl_ctxs_) { + nccl_ctxs_->WaitAll(); + } +#endif + } + + void InitCtxOnGpu(bool use_gpu) { + use_gpu_ = use_gpu; + if (use_gpu_) { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + int count = p::GetCUDADeviceCount(); + if (count <= 1) { + LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA " + "device count is " + << count; + exit(0); + } + for (int i = 0; i < count; ++i) { + auto p = p::CUDAPlace(i); + place_list_.push_back(p); + ctxs_.emplace_back(new p::CUDADeviceContext(p)); + } + nccl_ctxs_.reset(new platform::NCCLContextMap(place_list_)); +#else + PADDLE_THROW("CUDA is not support."); +#endif + } else { + int count = 8; + for (int i = 0; i < count; ++i) { + auto p = p::CPUPlace(); + place_list_.push_back(p); + ctxs_.emplace_back(new p::CPUDeviceContext(p)); + } +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + nccl_ctxs_.reset(nullptr); +#endif + } + } + + void InitBroadcastOp(size_t input_scope_idx) { + nodes_.clear(); + std::unordered_map scope_map; + for (size_t j = 0; j < place_list_.size(); ++j) { + local_scopes_.push_back(&(g_scope_.NewScope())); + Scope& local_scope = local_scopes_.back()->NewScope(); + local_scope.Var("out"); + param_scopes_.emplace_back(&local_scope); + scope_map.emplace(local_scopes_.back(), param_scopes_.back()); + } + param_scopes_[input_scope_idx]->Var("input"); + + nodes_.emplace_back( + ir::CreateNodeForTest("node0", ir::Node::Type::kOperation)); + if (use_gpu_) { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_, + place_list_, nccl_ctxs_.get()); +#else + PADDLE_THROW("CUDA is not support."); +#endif + } else { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_, + place_list_, nccl_ctxs_.get()); +#else + op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_, + place_list_); +#endif + } + + op_handle_->SetLocalExecScopes(scope_map); + + nodes_.emplace_back( + ir::CreateNodeForTest("node1", ir::Node::Type::kVariable)); + auto* in_var_handle = new VarHandle(nodes_.back().get(), 1, input_scope_idx, + "input", place_list_[input_scope_idx]); + vars_.emplace_back(in_var_handle); + op_handle_->AddInput(in_var_handle); + + // add dummy var + + nodes_.emplace_back( + ir::CreateNodeForTest("node2", ir::Node::Type::kVariable)); + vars_.emplace_back(new DummyVarHandle(nodes_.back().get())); + DummyVarHandle* dummy_var_handle = + static_cast(vars_.back()); + dummy_var_handle->ClearGeneratedOp(); + op_handle_->AddInput(dummy_var_handle); + + for (size_t j = 0; j < place_list_.size(); ++j) { + if (!use_gpu_) { + op_handle_->SetDeviceContext(place_list_[j], ctxs_[j].get()); + } + nodes_.emplace_back( + ir::CreateNodeForTest("node3", ir::Node::Type::kVariable)); + VarHandle* out_var_handle = + new VarHandle(nodes_.back().get(), 2, j, "out", place_list_[j]); + vars_.emplace_back(out_var_handle); + op_handle_->AddOutput(out_var_handle); + } + + // add dummy var + nodes_.emplace_back( + ir::CreateNodeForTest("node4", ir::Node::Type::kVariable)); + vars_.emplace_back(new DummyVarHandle(nodes_.back().get())); + DummyVarHandle* out_dummy_var_handle = + static_cast(vars_.back()); + out_dummy_var_handle->ClearGeneratedOp(); + op_handle_->AddOutput(out_dummy_var_handle); + } + + std::vector InitLoDTensor(const std::string& varname, + size_t input_scope_idx, const f::LoD& lod, + float val_scalar = 0.0) { + auto var = param_scopes_[input_scope_idx]->FindVar(varname); + + PADDLE_ENFORCE_NOT_NULL(var); + auto lod_tensor = var->GetMutable(); + std::vector send_vector(static_cast(f::product(kDims))); + for (size_t k = 0; k < send_vector.size(); ++k) { + send_vector[k] = k + val_scalar; + } + paddle::framework::TensorFromVector( + send_vector, *(ctxs_[input_scope_idx]), lod_tensor); + lod_tensor->set_lod(lod); + lod_tensor->Resize(kDims); + return send_vector; + } + + std::vector InitSelectedRows(const std::string& varname, + size_t input_scope_idx, + const std::vector& rows, + int height, float value_scalar = 0.0) { + std::vector send_vector(static_cast(f::product(kDims))); + for (size_t k = 0; k < send_vector.size(); ++k) { + send_vector[k] = k + value_scalar; + } + + auto var = param_scopes_[input_scope_idx]->FindVar(varname); + PADDLE_ENFORCE_NOT_NULL(var); + auto selected_rows = var->GetMutable(); + auto value = selected_rows->mutable_value(); + value->mutable_data(kDims, place_list_[input_scope_idx]); + selected_rows->set_height(height); + selected_rows->set_rows(rows); + + paddle::framework::TensorFromVector( + send_vector, *(ctxs_[input_scope_idx]), value); + + return send_vector; + } + + void SelectedRowsEqual(const std::string& varname, int input_scope_idx, + const std::vector& send_vector, + const std::vector& rows, int height) { + auto var = param_scopes_[input_scope_idx]->FindVar(varname); + PADDLE_ENFORCE_NOT_NULL(var); + auto& selected_rows = var->Get(); + auto rt = selected_rows.value(); + PADDLE_ENFORCE_EQ(selected_rows.height(), height, "height is not equal."); + + for (size_t k = 0; k < selected_rows.rows().size(); ++k) { + PADDLE_ENFORCE_EQ(selected_rows.rows()[k], rows[k]); + } + + p::CPUPlace cpu_place; + f::Tensor result_tensor; + f::TensorCopySync(rt, cpu_place, &result_tensor); + float* ct = result_tensor.data(); + + for (int64_t i = 0; i < f::product(kDims); ++i) { + ASSERT_NEAR(ct[i], send_vector[i], 1e-5); + } + } + + void LoDTensorEqual(const std::string& varname, + const std::vector& send_vec, const f::LoD& lod, + framework::Scope* scope) { + p::CPUPlace cpu_place; + auto var = scope->FindVar(varname); + PADDLE_ENFORCE_NOT_NULL(var); + auto tensor = var->Get(); + PADDLE_ENFORCE_EQ(tensor.lod(), lod, "lod is not equal."); + f::Tensor result_tensor; + f::TensorCopySync(tensor, cpu_place, &result_tensor); + float* ct = result_tensor.mutable_data(cpu_place); + for (int64_t k = 0; k < f::product(kDims); ++k) { + ASSERT_NEAR(ct[k], send_vec[k], 1e-5); + } + } + + void TestBroadcastLodTensor(size_t input_scope_idx) { + f::LoD lod{{0, 10, 20}}; + auto send_vector = InitLoDTensor("input", input_scope_idx, lod); + + op_handle_->Run(false); + + WaitAll(); + for (size_t j = 0; j < place_list_.size(); ++j) { + LoDTensorEqual("out", send_vector, lod, param_scopes_[j]); + } + } + + void TestBroadcastSelectedRows(size_t input_scope_idx) { + std::vector rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1, + 2, 4, 6, 3, 1, 1, 1, 1, 3, 7}; + int height = static_cast(kDims[0] * 2); + auto send_vector = InitSelectedRows("input", input_scope_idx, rows, height); + + op_handle_->Run(false); + + WaitAll(); + for (size_t j = 0; j < place_list_.size(); ++j) { + SelectedRowsEqual("out", input_scope_idx, send_vector, rows, height); + } + } +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc new file mode 100644 index 00000000..326a4631 --- /dev/null +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -0,0 +1,407 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/details/build_strategy.h" + +#include +#include +#include +#include +#include "paddle/fluid/framework/details/reduce_op_handle.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/graph_to_program_pass.h" +#include "paddle/fluid/framework/ir/graph_viz_pass.h" +#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h" +#include "paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h" +#include "paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.h" + +DECLARE_bool(use_mkldnn); + +namespace paddle { +namespace framework { +namespace details { + +static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) { + // Should fix the allreduce op order if scheduling + // them in multiple threads or processes to avoid hang. + // NOTE: ParallelGraph would execute this pass on each graph, so + // don't need to append it here. + return (!strategy.enable_sequential_execution_ && + strategy.num_trainers_ > 1) && + !strategy.enable_parallel_graph_; +} + +class ParallelExecutorPassBuilder : public ir::PassBuilder { + public: + explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy) + : ir::PassBuilder(), strategy_(strategy) { + // Add a graph viz pass to record a graph. + if (!strategy_.debug_graphviz_path_.empty()) { + VLOG(1) << "Add graph_viz_pass"; + auto viz_pass = AppendPass("graph_viz_pass"); + const std::string graph_path = string::Sprintf( + "%s%s", strategy_.debug_graphviz_path_.c_str(), "_original_graph"); + viz_pass->Set("graph_viz_path", new std::string(graph_path)); + } + + // Note(zcd): record_skip_memory_opt_vars_pass should be the first pass. + VLOG(1) << "Add record_skip_memory_opt_vars_pass"; + AppendPass("record_skip_memory_opt_vars_pass"); + +#ifdef PADDLE_WITH_MKLDNN + if (FLAGS_use_mkldnn) { + VLOG(1) << "Add mkldnn_placement_pass"; + AppendPass("mkldnn_placement_pass"); + } else if (!strategy_.mkldnn_enabled_op_types_.empty()) { + LOG(WARNING) + << "mkldnn_enabled_op_types specify the operator type list to " + "use MKLDNN acceleration. It is null in default, means " + "that all the operators supported by MKLDNN will be " + "accelerated. And it should not be set when " + "FLAGS_use_mkldnn=false."; + } +#else + PADDLE_ENFORCE(!FLAGS_use_mkldnn, + "Please compile with MKLDNN first to use MKLDNN"); +#endif + + if (strategy_.enable_sequential_execution_) { + VLOG(1) << "Add sequential_execution_pass"; + AppendPass("sequential_execution_pass"); + } + + // Add op fusion. + if (strategy.sync_batch_norm_) { + AppendPass("sync_batch_norm_pass"); + } + + // Add op fusion. + if (strategy.fuse_relu_depthwise_conv_) { + VLOG(1) << "Add fuse_relu_depthwise_conv_pass"; + AppendPass("fuse_relu_depthwise_conv_pass"); + } + + // TODO(zjl): refactor MemoryOptimizePass to fit + // new strategy, which does not need to set + // var.persistable = True + if (strategy_.use_legacy_memory_optimize_strategy_) { + if (strategy_.enable_inplace_) { + VLOG(5) << "Add inplace_pass"; + AppendPass("inplace_pass"); + } + } + + if (strategy_.fuse_elewise_add_act_ops_) { + VLOG(1) << "Add fuse_elewise_add_act_pass"; + AppendPass("fuse_elewise_add_act_pass"); + } + + // for single card training, fuse_all_reduce_ops is unnecessary. + // coalesce_grad_tensor_pass should be before of MultiDevPass. + if (strategy_.fuse_all_reduce_ops_) { + VLOG(1) << "Add coalesce_grad_tensor_pass"; + AppendPass("coalesce_grad_tensor_pass"); + } + + // Fuse all the optimization operators. + if (strategy_.is_distribution_) { + VLOG(3) << "Currently, fuse_all_optimizer_ops only works under " + "Non-distributed mode."; + strategy_.fuse_all_optimizer_ops_ = false; + } + if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce || + strategy_.is_distribution_) { + VLOG(3) << "Currently, fuse_all_optimizer_ops only works under AllReduce " + "mode."; + strategy_.fuse_all_optimizer_ops_ = false; + } + if (strategy_.fuse_all_optimizer_ops_) { + // NOTE: fuse_all_xx_ops will count the number of xx operator first, + // if the number is zero, fuse_all_reduce_ops will do nothing. + // Currently, only one type of optimization algorithm can be fused. + VLOG(1) << "Add fuse_adam_op_pass"; + AppendPass("fuse_adam_op_pass"); + VLOG(1) << "Add fuse_sgd_op_pass"; + AppendPass("fuse_sgd_op_pass"); + VLOG(1) << "Add fuse_momentum_op_pass"; + AppendPass("fuse_momentum_op_pass"); + } + + // Add a graph viz pass to record a graph. + if (!strategy.debug_graphviz_path_.empty()) { + auto viz_pass = AppendPass("graph_viz_pass"); + const std::string graph_path = string::Sprintf( + "%s%s", strategy_.debug_graphviz_path_.c_str(), "_fused_graph"); + viz_pass->Set("graph_viz_path", new std::string(graph_path)); + } + + CollectiveContext *context = CollectiveContext::GetInstance(); + context->endpoints_ = strategy_.trainers_endpoints_; + context->trainer_id_ = strategy_.trainer_id_; + PADDLE_ENFORCE(strategy_.trainer_id_ >= 0, "trainer_id_ >= 0"); + if (strategy_.trainer_id_ > 0 && strategy_.trainers_endpoints_.size() > 0) { + PADDLE_ENFORCE((unsigned)(strategy_.trainer_id_) < + strategy_.trainers_endpoints_.size(), + "trainer_id_ < endpoints_ size"); + } + VLOG(1) << "CollectiveContext:" << context->String(); + + // NOTE(dzh): memory optimize should be a runtime pass. + // However, after multi_devices_pass, VarHandle, OpHandle is + // the de-fact IR, any reuse on Graph is meaningless. + // A side-effect of that, memory optimize cannot forsee the fetched vars + // , so fetchlist should be set persistable before call the Run interface. + if (strategy_.use_legacy_memory_optimize_strategy_) { + if (strategy_.memory_optimize_) { + VLOG(5) << "Add memory_optimize_pass"; + AppendPass("memory_optimize_pass"); + } + } + + // runtime_context_cache pass should be the last pass to enable the attr of + // all original and fused operators. But no operators can be enabled this + // attr if putting it after MultiDevPass. + if (strategy_.cache_runtime_context_) { + VLOG(1) << "Add runtime_context_cache_pass"; + AppendPass("runtime_context_cache_pass"); + } + + AppendMultiDevPass(strategy_); + + if (strategy_.fuse_all_reduce_ops_) { + // NOTE: fuse_all_reduce_ops will count the number of all_reduce operator + // first, if the number is zero, fuse_all_reduce_ops will do nothing. + VLOG(1) << "Add fuse_all_reduce_op_pass"; + AppendPass("fuse_all_reduce_op_pass"); + } + + // Add a graph print pass to record a graph with device info. + if (!strategy_.debug_graphviz_path_.empty()) { + VLOG(1) << "Add multi_devices_print_pass"; + auto multi_devices_print_pass = AppendPass("multi_devices_print_pass"); + const std::string graph_path = + string::Sprintf("%s%s", strategy_.debug_graphviz_path_.c_str(), + "_multi_devices_graph"); + multi_devices_print_pass->Set(ir::kGraphvizPath, + new std::string(graph_path)); + multi_devices_print_pass->Set( + "graph_printer", new ir::GraphvizSSAGraphPrinter); + } + + // experimental shows that the program will be faster if append + // all_reduce_deps_pass here. + if (!strategy_.enable_parallel_graph_ && + (SeqOnlyAllReduceOps(strategy_) || + strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce)) { + VLOG(1) << "Add all_reduce_deps_pass"; + AppendPass("all_reduce_deps_pass"); + } + + if (strategy_.num_trainers_ > 1 && !strategy_.async_mode_ && + !strategy_.is_distribution_ && + strategy_.enable_backward_optimizer_op_deps_) { + VLOG(1) << "Add backward_op_deps_pass"; + AppendPass("backward_optimizer_op_deps_pass"); + } + + if (strategy_.remove_unnecessary_lock_) { + VLOG(1) << "Add modify_op_lock_and_record_event_pass"; + AppendPass("modify_op_lock_and_record_event_pass"); + } + + // Verify that the graph is correct for multi-device executor. + VLOG(1) << "Add multi_devices_check_pass"; + AppendPass("multi_devices_check_pass"); + } + + // Convert graph to run on multi-devices. + void AppendMultiDevPass(const BuildStrategy &strategy) { + ir::Pass *multi_devices_pass = nullptr; + + if (strategy_.async_mode_) { + VLOG(1) << "Add async_multi_devices_pass"; + multi_devices_pass = AppendPass("async_multi_devices_pass").get(); + } else if (strategy_.is_distribution_) { + VLOG(1) + << "Add dist_multi_devices_pass, multi device parameter server mode"; + multi_devices_pass = AppendPass("dist_multi_devices_pass").get(); + } else { + if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { + VLOG(1) << "Add all_reduce_mode_multi_devices_pass"; + multi_devices_pass = + AppendPass("all_reduce_mode_multi_devices_pass").get(); + } else if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kReduce) { + VLOG(1) << "Add reduce_mode_multi_devices_pass"; + multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get(); + } else { + PADDLE_THROW("Unknown reduce strategy."); + } + } + multi_devices_pass->SetNotOwned("strategy", + &strategy_); + } + + private: + BuildStrategy strategy_; +}; + +std::shared_ptr BuildStrategy::CreatePassesFromStrategy( + bool finalize_strategy) const { + if (is_finalized_) { + return pass_builder_; + } + pass_builder_.reset(new ParallelExecutorPassBuilder(*this)); + if (finalize_strategy) { + is_finalized_ = true; + } + return pass_builder_; +} + +bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const { + return framework::ir::MultiDevSSAGraphBuilder().count(pass_name) > 0; +} + +ir::Graph *BuildStrategy::Apply(ir::Graph *graph, + const std::vector &places, + const std::string &loss_var_name, + const std::vector &local_scopes, + const size_t &nranks, +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + const bool use_cuda, + platform::NCCLCommunicator *nccl_ctxs) const { +#else + const bool use_cuda) const { +#endif + VLOG(3) << "apply all passes"; + // Create a default one if not finalized by user. + CreatePassesFromStrategy(false); + + for (std::shared_ptr &pass : pass_builder_->AllPasses()) { + VLOG(3) << "BuildStrategy::Apply pass:" << pass->Type(); + if (IsMultiDevPass(pass->Type())) { + pass->Erase(kPlaces); + pass->SetNotOwned>(kPlaces, &places); + pass->Erase(ir::kLossVarName); + pass->SetNotOwned(ir::kLossVarName, &loss_var_name); + pass->Erase(kLocalScopes); + pass->SetNotOwned>(kLocalScopes, + &local_scopes); + pass->Erase(ir::kNRanks); + pass->Set(ir::kNRanks, new size_t(nranks)); + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr; + pass->Erase(kNCCLCtxs); + pass->SetNotOwned(kNCCLCtxs, nctx); +#endif + } else if (pass->Type() == "coalesce_grad_tensor_pass" || + pass->Type() == "fuse_adam_op_pass" || + pass->Type() == "fuse_sgd_op_pass" || + pass->Type() == "fuse_momentum_op_pass" || + pass->Type() == "fuse_all_reduce_op_pass") { + pass->Erase(kPlaces); + pass->SetNotOwned>(kPlaces, &places); + pass->Erase(kLocalScopes); + pass->SetNotOwned>(kLocalScopes, + &local_scopes); + if (pass->Type() == "fuse_all_reduce_op_pass") { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr; + pass->Erase(kNCCLCtxs); + pass->SetNotOwned(kNCCLCtxs, nctx); + pass->Erase(kUseHierarchicalAllReduce); + pass->Set(kUseHierarchicalAllReduce, + new bool(use_hierarchical_allreduce_)); +#endif + } + } else if (pass->Type() == "coalesce_grad_tensor_pass") { + pass->Erase(kPlaces); + pass->SetNotOwned>(kPlaces, &places); + pass->Erase(kLocalScopes); + pass->SetNotOwned>(kLocalScopes, + &local_scopes); + } else if (pass->Type() == "sequential_execution_pass") { + LOG(INFO) << "set enable_sequential_execution:" + << enable_sequential_execution_; + } else if (pass->Type() == "all_reduce_deps_pass") { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr; + pass->Erase(kNCCLCtxs); + pass->SetNotOwned(kNCCLCtxs, nctx); + pass->Erase(kUseHierarchicalAllReduce); + pass->Set(kUseHierarchicalAllReduce, + new bool(use_hierarchical_allreduce_)); +#endif + LOG(INFO) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this) + << ", num_trainers:" << num_trainers_; + } else if (pass->Type() == "fuse_relu_depthwise_conv_pass") { + if (!use_cuda) { + LOG(WARNING) << "fuse_relu_depthwise_conv_pass is only supported on " + "GPU, skipped."; + continue; + } + } else if (pass->Type() == "inplace_pass") { + pass->Erase(ir::kUseCuda); + pass->Set(ir::kUseCuda, new bool(use_cuda)); + } else if (pass->Type() == "mkldnn_placement_pass") { + pass->Set("mkldnn_enabled_op_types", + new std::unordered_set(mkldnn_enabled_op_types_)); + } else if (pass->Type() == "backward_optimizer_op_deps_pass") { + if (!use_cuda) { + VLOG(1) << "backward_optimizer_op_deps_pass is only supported on " + "GPU, skipped."; + continue; + } + } + VLOG(3) << "Start Apply Pass " << pass->Type(); + graph = pass->Apply(graph); + VLOG(3) << "Finish Apply Pass " << pass->Type(); + } + VLOG(3) << "All Passes Applied"; + return graph; +} + +} // namespace details +} // namespace framework +} // namespace paddle + +USE_PASS(sync_batch_norm_pass); +USE_PASS(fuse_relu_depthwise_conv_pass); +USE_PASS(fuse_elewise_add_act_pass); +USE_PASS(graph_viz_pass); +USE_PASS(multi_batch_merge_pass); +USE_PASS(reduce_mode_multi_devices_pass); +USE_PASS(all_reduce_mode_multi_devices_pass); +USE_PASS(dist_multi_devices_pass); +USE_PASS(multi_devices_check_pass); +USE_PASS(multi_devices_print_pass); +USE_PASS(memory_optimize_pass); +USE_PASS(sequential_execution_pass); +USE_PASS(all_reduce_deps_pass); +USE_PASS(backward_optimizer_op_deps_pass); +USE_PASS(modify_op_lock_and_record_event_pass); +USE_PASS(inplace_pass); +USE_PASS(lock_free_optimize_pass); +USE_PASS(coalesce_grad_tensor_pass); +USE_PASS(graph_to_program_pass); +USE_PASS(fuse_adam_op_pass); +USE_PASS(fuse_sgd_op_pass); +USE_PASS(fuse_momentum_op_pass); +USE_PASS(fuse_all_reduce_op_pass); +USE_PASS(runtime_context_cache_pass); +USE_PASS(record_skip_memory_opt_vars_pass); +#ifdef PADDLE_WITH_MKLDNN +USE_PASS(mkldnn_placement_pass); +#endif diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h new file mode 100644 index 00000000..14fb1783 --- /dev/null +++ b/paddle/fluid/framework/details/build_strategy.h @@ -0,0 +1,189 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/ir/pass_builder.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace framework { +namespace details { + +struct BuildStrategy { + // ParallelExecutor supports two modes of ReduceStrategy, kAllReduce and + // kReduce, for CPU and GPU. If you use kAllReduce, different threads + // optimize their parameters separately. If you use kReduce, the optimizations + // of parameters are distributed to different threads. + // For example, a model has 100 parameters and is running with four threads, + // if you choose kAllReduce, every thread is to optimize 100 parameters + // separately, if you choose kReduce, every thread is to optimize 25 + // parameters. + // Of particular note is, if you use kReduce when using CPU training, + // all the parameters are shared between different threads. This feature will + // save memory. + // FIXME(zcd): The result of the two modes(kAllReduce and kReduce) maybe not + // equal for GPU. Because, the result of the different order of summing maybe + // different, for example, the result of `a+b+c+d` may be different with the + // result of `c+a+b+d`. + // For GPU, the implementation of kAllReduce and kReduce is adopted NCCL, + // so the result of kAllReduce and kReduce maybe not equal. + // For CPU, if you want to fix the order of summing to make the result + // of kAllReduce and kReduce no diff, you can add + // `FLAGS_cpu_deterministic=true` to env. + enum class ReduceStrategy { kAllReduce = 0, kReduce = 1 }; + + enum class GradientScaleStrategy { + kCoeffNumDevice = 0, + kOne = 1, + // user can customize gradient scale to use, and just feed + // it into exe.run(). + kCustomized = 2, + }; + + ReduceStrategy reduce_{ReduceStrategy::kAllReduce}; + GradientScaleStrategy gradient_scale_{GradientScaleStrategy::kCoeffNumDevice}; + + std::string debug_graphviz_path_{""}; + + // Add dependency between backward ops and optimization ops, make sure that + // all the backward ops are finished before running the optimization ops. + // It might make the training speed of data parallelism faster. + bool enable_backward_optimizer_op_deps_{true}; + // TODO(dev-paddle): enable_sequential_execution depends on + // kStaleProgramOpDescs, it is not appropriate, because kStaleProgramOpDescs + // will be removed in the near future. + bool enable_sequential_execution_{false}; + bool remove_unnecessary_lock_{true}; + // TODO(dev-paddle): cache_runtime_context may cause some models to hang up + // while running. + bool cache_runtime_context_{false}; + + // Operator fusion + // TODO(dev-paddle): fuse_elewise_add_act_ops may cause some models have + // cycle. + bool fuse_elewise_add_act_ops_{false}; + // Fuse_all_optimizer_ops and fuse_all_reduce_ops require that gradients + // should not be sparse types + bool fuse_all_optimizer_ops_{false}; + bool fuse_all_reduce_ops_{false}; + // fuse_relu_depthwise_conv can fuse the `relu -> + // depthwise_conv` + bool fuse_relu_depthwise_conv_{false}; + // NOTE(zcd): In reduce mode, fusing broadcast ops may make the program + // faster. Because fusing broadcast OP equals delaying the execution of all + // broadcast Ops, in this case, all nccl streams are used only for reduce + // operations for a period of time. + bool fuse_broadcast_ops_{false}; + // replace batch_norm with sync_batch_norm. + bool sync_batch_norm_{false}; + + // mkldnn_enabled_op_types specify the operator type list to + // use MKLDNN acceleration. It is null in default, means + // that all the operators supported by MKLDNN will be + // accelerated. And it should not be set when + // FLAGS_use_mkldnn=false + std::unordered_set mkldnn_enabled_op_types_; + + // FIXME(liuwei1031) disable memory_optimzie and enable_inplace in 1.4 + // to open them by default, we need to solve the fetch variable issue + // TODO(liuwei1031): memory_optimize depends on kStaleProgramOpDescs, + // it is not appropriate, because kStaleProgramOpDescs will be removed in the + // near future. + bool memory_optimize_{false}; + + // Turn on inplace by default. + bool enable_inplace_{true}; + + // TODO(zjl): Remove this flag when MemoryOptimizePass is refactored + bool use_legacy_memory_optimize_strategy_{false}; + + // FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode, + // num_trainers is 1, so the current fields of build_strategy doesn't tell if + // it's distributed model. + bool is_distribution_{false}; + bool async_mode_{false}; + int num_trainers_{1}; + int trainer_id_{0}; + std::vector trainers_endpoints_; + + // NCCL config + size_t nccl_comm_num_{1}; + // The picture is here: + // https://github.com/PaddlePaddle/Paddle/pull/17263#discussion_r285411396 + bool use_hierarchical_allreduce_{false}; + // Nccl ranks in a node when use hierarchical allreduce, it's setted to gpu + // cards' number in most cases. + size_t hierarchical_allreduce_inter_nranks_{0}; + // Nccl ranks bewteen nodes when use hierarchical allreduce, it's setted to + // nodes number. + size_t hierarchical_allreduce_exter_nranks_{0}; + + // NOTE: + // Before you add new options, think if it's a general strategy that works + // with other strategy. If not, the strategy should be created through + // CreatePassesFromStrategy and the pass can be managed separately. + + // User normally doesn't need to call this API. + // The PassBuilder allows for more customized insert, remove of passes + // from python side. + // A new PassBuilder is created based on configs defined above and + // passes are owned by the PassBuilder. + std::shared_ptr CreatePassesFromStrategy( + bool finalize_strategy) const; + + bool IsFinalized() const { return is_finalized_; } + + bool IsMultiDevPass(const std::string &pass_name) const; + + // Apply the passes built by the pass_builder_. The passes will be + // applied to the Program and output an ir::Graph. + ir::Graph *Apply(ir::Graph *graph, const std::vector &places, + const std::string &loss_var_name, + const std::vector &local_scopes, + const size_t &nranks, +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + const bool use_cuda, + platform::NCCLCommunicator *nccl_ctxs) const; +#else + const bool use_cuda) const; +#endif + + // If set true, ParallelExecutor would build the main_program into multiple + // graphs, + // each of the graphs would run with one device. This approach can achieve + // better performance + // on some scenarios. + mutable bool enable_parallel_graph_ = false; + + private: + mutable bool is_finalized_ = false; + mutable std::shared_ptr pass_builder_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc new file mode 100644 index 00000000..0b653e57 --- /dev/null +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -0,0 +1,53 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/computation_op_handle.h" + +#include + +namespace paddle { +namespace framework { +namespace details { +ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope, + platform::Place place, + size_t scope_idx) + : OpHandleBase(node), + op_(framework::OpRegistry::CreateOp(*node->Op())), + scope_(scope), + place_(place), + scope_idx_(scope_idx) {} + +void ComputationOpHandle::RunImpl() { + WaitInputVarGenerated(place_); + + auto run_func = [this]() { op_->Run(*local_exec_scopes_[0], place_); }; + + if (is_lock_and_record_event_free_) { + run_func(); + } else { + this->RunAndRecordEvent(run_func); + } +} + +bool ComputationOpHandle::NeedWait(VarHandleBase *in_var) { + bool need_wait = + in_var && in_var->GeneratedOp() && + in_var->GeneratedOp()->DeviceContext(place_) != dev_ctxes_.at(place_); + return need_wait; +} + +std::string ComputationOpHandle::Name() const { return op_->Type(); } +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h new file mode 100644 index 00000000..5a65aaf0 --- /dev/null +++ b/paddle/fluid/framework/details/computation_op_handle.h @@ -0,0 +1,65 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { +namespace details { +class ComputationOpHandle : public OpHandleBase { + public: + ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place, + size_t scope_idx); + + OperatorBase *GetOp() { return op_.get(); } + + std::string Name() const override; + + const Scope *GetScope() const { return scope_; } + + Scope *GetScope() { return scope_; } + + const platform::Place &GetPlace() const { return place_; } + + void SetLockAndRecordEventFree(bool b) { is_lock_and_record_event_free_ = b; } + + size_t GetScopeIdx() const { return scope_idx_; } + + protected: + void RunImpl() override; + + bool NeedWait(VarHandleBase *in_var) override; + + std::vector GetLocalScopes() override { return {scope_}; } + + private: + std::unique_ptr op_; + Scope *scope_; + platform::Place place_; + size_t scope_idx_; + bool is_lock_and_record_event_free_{false}; +}; +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/container_cast.h b/paddle/fluid/framework/details/container_cast.h new file mode 100644 index 00000000..a42ae78d --- /dev/null +++ b/paddle/fluid/framework/details/container_cast.h @@ -0,0 +1,40 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +namespace paddle { +namespace framework { +namespace details { + +template +std::vector DynamicCast(const std::vector& container) { + static_assert(std::is_base_of::value, + "ElementType must be a base class of ResultType"); + std::vector res; + for (auto* ptr : container) { + auto* derived = dynamic_cast(ptr); + if (derived) { + res.emplace_back(derived); + } + } + return res; +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/cow_ptr.h b/paddle/fluid/framework/details/cow_ptr.h new file mode 100644 index 00000000..090517ff --- /dev/null +++ b/paddle/fluid/framework/details/cow_ptr.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once +#include +#include // NOLINT + +namespace paddle { +namespace framework { +namespace details { + +template +class COWPtr { + public: + typedef std::shared_ptr RefPtr; + + private: + RefPtr m_sp; + + public: + COWPtr() : m_sp(nullptr) {} + explicit COWPtr(T* t) : m_sp(t) {} + + const T& Data() const { return *m_sp; } + + T* MutableData() { + DetachIfNotUnique(); + return m_sp.get(); + } + + void DetachIfNotUnique() { + T* tmp = m_sp.get(); + if (!(tmp == nullptr || m_sp.unique())) { + Detach(); + } + } + + void Detach() { + T* tmp = m_sp.get(); + m_sp = RefPtr(new T(*tmp)); + } +}; +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/cow_ptr_test.cc b/paddle/fluid/framework/details/cow_ptr_test.cc new file mode 100644 index 00000000..5b055d7c --- /dev/null +++ b/paddle/fluid/framework/details/cow_ptr_test.cc @@ -0,0 +1,43 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/fluid/framework/details/cow_ptr.h" +#include "gtest/gtest.h" + +namespace paddle { +namespace framework { +namespace details { + +TEST(COWPtr, all) { + COWPtr ptr(new int{0}); + ASSERT_EQ(ptr.Data(), 0); + COWPtr ptr2 = ptr; + ASSERT_EQ(ptr2.Data(), 0); + ASSERT_EQ(&ptr2.Data(), &ptr.Data()); + *ptr2.MutableData() = 10; + ASSERT_EQ(ptr.Data(), 0); + ASSERT_EQ(ptr2.Data(), 10); +} + +TEST(COWPtr, change_old) { + COWPtr ptr(new int{0}); + COWPtr ptr2 = ptr; + *ptr.MutableData() = 10; + ASSERT_EQ(ptr2.Data(), 0); + ASSERT_EQ(ptr.Data(), 10); +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/dgc_const_values.h b/paddle/fluid/framework/details/dgc_const_values.h new file mode 100644 index 00000000..fbe50dc9 --- /dev/null +++ b/paddle/fluid/framework/details/dgc_const_values.h @@ -0,0 +1,32 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace paddle { +namespace framework { +namespace details { + +constexpr char g_dgc_counter_name[] = "__g_dgc_counter__"; +constexpr char g_dgc_rampup_begin_step[] = "__g_rampup_begin_step__"; +constexpr char g_dgc_u[] = "__dgc_u__"; +constexpr char g_dgc_v[] = "__dgc_v__"; +constexpr char g_dgc_k[] = "__dgc_k__"; +constexpr char g_dgc_encoded[] = "__dgc_encoded__"; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc new file mode 100644 index 00000000..817fe03c --- /dev/null +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -0,0 +1,150 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "paddle/fluid/framework/details/eager_deletion_op_handle.h" +#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/profiler.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cuda_device_guard.h" +#endif + +namespace paddle { +namespace framework { +namespace details { + +EagerDeletionOpHandle::EagerDeletionOpHandle( + ir::Node *node, Scope *scope, const platform::Place &place, + const std::unordered_set &vars, GarbageCollector *gc) + : OpHandleBase(node), + scope_(scope), + place_(place), + var_infos_(vars.begin(), vars.end()), + gc_(gc) { +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(place)) { + dev_ctx_ = reinterpret_cast( + platform::DeviceContextPool::Instance().Get(place)); + if (dynamic_cast(gc_)) { + platform::CUDADeviceGuard guard( + boost::get(place).device); + PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); + PADDLE_ENFORCE_NOT_NULL(event_); + } + } +#endif + PADDLE_ENFORCE(!vars.empty(), "Var names cannot be empty"); + for (auto *var : var_infos_) { + PADDLE_ENFORCE_NOT_NULL(var); + } +} + +EagerDeletionOpHandle::~EagerDeletionOpHandle() { +#ifdef PADDLE_WITH_CUDA + if (event_) { + auto gpu_place = boost::get(dev_ctx_->GetPlace()); + platform::CUDADeviceGuard guard(gpu_place.device); + PADDLE_ENFORCE(cudaEventDestroy(event_)); + } +#endif +} + +void EagerDeletionOpHandle::InitCUDA() { +#ifdef PADDLE_WITH_CUDA + int dev_id = + boost::get(dev_ctxes_.begin()->first).device; + events_[dev_id] = nullptr; +#endif +} + +void EagerDeletionOpHandle::CallOnce() { + PADDLE_ENFORCE(vars_.empty(), "vars_ must be initialized here"); + Scope *exec_scope = local_exec_scopes_[0]; + for (auto *var_info : var_infos_) { + auto *var = exec_scope->FindVar(var_info->Name()); + PADDLE_ENFORCE_NOT_NULL(var, "Variable %s should not be nullptr", + var_info->Name()); + vars_.emplace_back(var); + } +} + +std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; } + +void EagerDeletionOpHandle::RunImpl() { + if (vars_.size() != var_infos_.size()) { + CallOnce(); + } + + platform::RecordEvent record_event(Name()); + std::deque> garbages; + for (size_t i = 0; i < var_infos_.size(); ++i) { + auto *var_info = var_infos_[i]; + if (var_info->IsSkipped() || !var_info->DecreaseRefCnt()) { + continue; + } + + VLOG(2) << "Erase variable " << var_info->Name() << " on " << place_; + + Variable *var = vars_[i]; + + if (var->IsType()) { + garbages.emplace_back(var->GetMutable()->MoveMemoryHolder()); + } else if (var->IsType()) { + garbages.emplace_back( + var->GetMutable()->mutable_value()->MoveMemoryHolder()); + } else if (var->IsType()) { + auto *tensor_arr = var->GetMutable(); + for (auto &t : *tensor_arr) { + garbages.emplace_back(t.MoveMemoryHolder()); + } + } else { + PADDLE_THROW("Type %s of %s is not supported eager deletion", + framework::ToTypeName(var->Type()), var_info->Name()); + } + } + + if (!garbages.empty()) { + ClearGarbages(&garbages); + } +} + +void EagerDeletionOpHandle::ClearGarbages( + std::deque> *garbages) { +#ifdef PADDLE_WITH_CUDA + if (event_) { + auto compute_stream = dev_ctx_->stream(); + auto callback_stream = + reinterpret_cast(gc_)->stream(); + auto callback_func = [=]() { + PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream)); + PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0)); + }; + gc_->Add(std::move(*garbages), callback_func); + } else { +#endif + gc_->Add(std::move(*garbages)); +#ifdef PADDLE_WITH_CUDA + } +#endif +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h new file mode 100644 index 00000000..4b2d4a83 --- /dev/null +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h @@ -0,0 +1,78 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace ir { +class MemOptVarInfo; +} // namespace ir + +namespace details { + +class EagerDeletionOpHandle : public OpHandleBase { + public: + EagerDeletionOpHandle(ir::Node *node, Scope *scope, + const platform::Place &place, + const std::unordered_set &vars, + GarbageCollector *gc); + + ~EagerDeletionOpHandle(); + + std::string Name() const override; + + /** + * Currently, EagerDeletionOpHandle has the highest priority. + * This priority settings speed up gc 15% in Transformer + * V100 8-GPU model. + */ + Priority GetPriority() const override { return kHighest; } + + protected: + void RunImpl() override; + + void InitCUDA() override; + + std::vector GetLocalScopes() override { return {scope_}; } + + private: + void ClearGarbages(std::deque> *garbages); + + void CallOnce(); + + Scope *scope_; + platform::Place place_; + std::vector var_infos_; // not own + GarbageCollector *gc_; // not own + std::vector vars_; +#ifdef PADDLE_WITH_CUDA + platform::CUDADeviceContext *dev_ctx_{nullptr}; + cudaEvent_t event_{nullptr}; +#endif +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/exception_holder.h b/paddle/fluid/framework/details/exception_holder.h new file mode 100644 index 00000000..f8fd395b --- /dev/null +++ b/paddle/fluid/framework/details/exception_holder.h @@ -0,0 +1,115 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "glog/logging.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace details { + +class ExceptionHolder { + public: + void Catch(std::exception_ptr eptr) { + try { + std::rethrow_exception(eptr); + } catch (platform::EOFException exp) { + Catch(exp); + } catch (platform::EnforceNotMet exp) { + Catch(exp); + } catch (std::exception& ex) { + LOG(FATAL) << "std::exception caught, " << ex.what(); + } catch (...) { + LOG(FATAL) << "Unknown exception caught"; + } + } + + bool IsCaught() const { + std::lock_guard lock(mu_); + return exception_.get() != nullptr; + } + + void ReThrow() { + std::lock_guard lock(mu_); + switch (type_) { + case kNone: + break; + case kEnforceNotMet: { + auto e = *static_cast(exception_.get()); + throw e; + } + case kEOF: { + auto e = *static_cast(exception_.get()); + throw e; + } + } + ClearImpl(); + } + + void Clear() { + std::lock_guard lock(mu_); + ClearImpl(); + } + + std::string Type() { + std::lock_guard lock(mu_); + switch (type_) { + case kNone: + return "None"; + case kEnforceNotMet: { + return "EnforceNotMet"; + } + case kEOF: { + return "EOF"; + } + } + return "unknown"; + } + + private: + void ClearImpl() { + exception_.reset(); + type_ = kNone; + } + + void Catch(const platform::EnforceNotMet& exp) { + std::lock_guard lock(mu_); + exception_.reset(new platform::EnforceNotMet(exp)); + type_ = kEnforceNotMet; + } + + void Catch(const platform::EOFException& exp) { + std::lock_guard lock(mu_); + // EOFException will not cover up existing EnforceNotMet. + if (exception_.get() == nullptr) { + exception_.reset(new platform::EOFException(exp)); + type_ = kEOF; + } + } + + enum ExceptionType { kNone, kEnforceNotMet, kEOF }; + ExceptionType type_{kNone}; + + std::unique_ptr exception_; + mutable std::mutex mu_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h new file mode 100644 index 00000000..68de1580 --- /dev/null +++ b/paddle/fluid/framework/details/execution_strategy.h @@ -0,0 +1,47 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include // for size_t + +namespace paddle { +namespace framework { +namespace details { + +struct ExecutionStrategy { + enum ExecutorType { kDefault = 0, kExperimental = 1 }; + + // num_threads indicates the size of thread pool. + size_t num_threads_{0}; + bool use_cuda_{true}; + // Note that allow_op_delay is invalid now. + bool allow_op_delay_{false}; + // num_iteration_per_drop_scope indicates how many + // iterations the framework cleans up a local execution scope. + // In some models, the value of this parameter has a great + // influence on the performance(about 15%) of the program. + size_t num_iteration_per_drop_scope_{1}; + // At present, the kExperimental executor is the fastest in most models. + ExecutorType type_{kExperimental}; + // This debug option. + bool dry_run_{false}; + + // only use with async_ssa_graph_executor + // and pyreader with data queue + size_t num_iteration_per_run_{1}; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc new file mode 100644 index 00000000..7daab6da --- /dev/null +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -0,0 +1,286 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/details/fetch_op_handle.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace framework { +namespace details { + +FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor( + const ExecutionStrategy &strategy, const std::vector &local_scopes, + const std::vector &local_exec_scopes, + const std::vector &places, ir::Graph *graph) + : strategy_(strategy), + local_scopes_(local_scopes), + local_exec_scopes_(local_exec_scopes), + places_(places), + graph_(graph), + fetch_ctxs_(places), + pool_(strategy.num_threads_), + // add one more thread for generate op_deps + prepare_pool_(1) { + for (auto &op : ir::FilterByNodeWrapper(*graph_)) { + int dep = static_cast(op->NotReadyInputSize()); + op_deps_.emplace(op, dep); + if (dep == 0) { + bootstrap_ops_.emplace_back(op); + } + } + PADDLE_ENFORCE_GT(op_deps_.size(), 0, "The graph doesn't have operators."); + PrepareAtomicOpDeps(); +} + +FeedFetchList FastThreadedSSAGraphExecutor::Run( + const std::vector &fetch_tensors) { + VLOG(3) << "enter FastThreadedSSAGraphExecutor Run"; + std::unique_ptr event( + new platform::RecordEvent("FastThreadedSSAGraphExecutorPrepare")); + std::unique_ptr>> + op_deps = atomic_op_deps_.get(); + PrepareAtomicOpDeps(); + size_t num_ops = op_deps->size(); + + paddle::framework::FeedFetchList fetches; + fetches.resize(fetch_tensors.size()); + std::unordered_map> fetched_vars; + std::vector fetch_ops; + std::vector ready_fetch_ops; + exception_.Clear(); + + InsertFetchOps(fetch_tensors, &fetches, &fetched_vars, op_deps.get(), + &fetch_ops, &ready_fetch_ops); + event.reset(nullptr); + if (strategy_.num_threads_ == 1 && traced_ops_.size() == num_ops) { + // If the num_threads is 1, we can record the order of operator's + // execution in the first iteration, and in subsequent iterations, + // run the recorded operators directly. This strategy could make the + // execution faster. + VLOG(3) << "Run the traced ops."; + RunTracedOps(traced_ops_); + RunTracedOps(fetch_ops); + if (exception_.IsCaught()) { + ExecutionFinal(&fetch_ops); + } + } else { + traced_ops_.clear(); + remaining_ = 0; + auto complete_q = std::make_shared>(); + for (auto op : bootstrap_ops_) { + RunOpAsync(op_deps.get(), op, complete_q); + } + for (auto op : ready_fetch_ops) { + RunOpAsync(op_deps.get(), op, complete_q); + } + + size_t num_complete = 0; + while (num_complete != op_deps->size()) { + size_t num_comp = complete_q->Pop(); + if (num_comp == -1UL) { + int remaining = 0; + while (true) { + remaining = remaining_; + if (remaining == 0) { + break; + } + for (int i = 0; i < remaining; ++i) { + complete_q->Pop(); + } + } + if (exception_.IsCaught()) { + ExecutionFinal(&fetch_ops); + } + } + num_complete += num_comp; + } + } + // Wait FetchOps. + ClearFetchOp(graph_, &fetch_ops); + return fetches; +} + +void FastThreadedSSAGraphExecutor::InsertFetchOps( + const std::vector &fetch_tensors, FeedFetchList *fetches, + std::unordered_map> *fetched_vars, + std::unordered_map> *op_deps, + std::vector *fetch_ops, + std::vector *ready_fetch_ops) { + std::unordered_set fetch_tensor_set(fetch_tensors.begin(), + fetch_tensors.end()); + for (auto &fetch_var_name : fetch_tensor_set) { + for (auto &var_map : graph_->Get(kGraphVars)) { + auto it = var_map.find(fetch_var_name); + if (it != var_map.end()) { + (*fetched_vars)[fetch_var_name].push_back(*it->second.rbegin()); + } + } + } + + for (size_t i = 0; i < fetch_tensors.size(); ++i) { + auto &var_name = fetch_tensors.at(i); + auto fetched_var_it = fetched_vars->find(var_name); + PADDLE_ENFORCE(fetched_var_it != fetched_vars->end(), + "Cannot find fetched variable(%s).(Perhaps the main_program " + "is not set to ParallelExecutor)", + var_name); + + auto &vars = fetched_var_it->second; + + ir::Node *fetch_node = + graph_->CreateEmptyNode("fetch", ir::Node::Type::kOperation); + auto *op = new FetchOpHandle(fetch_node, fetches, i, &local_scopes_, + &local_exec_scopes_); + fetch_ops->emplace_back(op); + + for (auto &p : places_) { + op->SetDeviceContext(p, fetch_ctxs_.Get(p)); + } + + for (auto *var : vars) { + op->AddInput(var); + } + + int dep = static_cast(op->NotReadyInputSize()); + (*op_deps)[op] = dep; + if (dep == 0) { + ready_fetch_ops->emplace_back(op); + } + } +} + +bool FastThreadedSSAGraphExecutor::RunOp( + OpHandleBase *op, const std::shared_ptr> &complete_q, + size_t *complete) { + RunOpSync(op); + if (LIKELY(!exception_.IsCaught())) { + if (LIKELY(!strategy_.dry_run_)) { + RecordOps(op); + } + ++(*complete); + return true; + } else { + --remaining_; + complete_q->Push(-1UL); + return false; + } +} + +void FastThreadedSSAGraphExecutor::RunOpAsync( + std::unordered_map> *op_deps, + OpHandleBase *op, + const std::shared_ptr> &complete_q) { + ++remaining_; + this->pool_.enqueue([=] { + std::queue op_queue; + op_queue.push(op); + + size_t complete = 0; + while (!op_queue.empty()) { + OpHandleBase *op_to_run = op_queue.front(); + op_queue.pop(); + + if (!RunOp(op_to_run, complete_q, &complete)) { + return; + } + + auto &outputs = op_to_run->Outputs(); + op_to_run = nullptr; + for (auto &output : outputs) { + for (auto &pending_op : output->PendingOps()) { + std::atomic &deps = op_deps->at(pending_op); + if (deps.fetch_sub(1) != 1) continue; + + // NOTE(zjl): op with highest priority should run + // first without switching to another thread. + if (pending_op->GetPriority() == OpHandleBase::Priority::kHighest) { + op_queue.push(pending_op); + } else { + if (op_to_run == nullptr) { + op_to_run = pending_op; + } else { + RunOpAsync(op_deps, pending_op, complete_q); + } + } + } + } + + if (op_to_run != nullptr) op_queue.push(op_to_run); + } + --remaining_; + complete_q->Push(complete); + }); +} + +void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() { + atomic_op_deps_ = prepare_pool_.enqueue([&] { + auto *op_deps = new std::unordered_map>; + for (auto &pair : op_deps_) { + (*op_deps)[pair.first] = pair.second; + } + return std::unique_ptr< + std::unordered_map>>(op_deps); + }); +} + +const ir::Graph &FastThreadedSSAGraphExecutor::Graph() const { return *graph_; } + +void FastThreadedSSAGraphExecutor::RecordOps(OpHandleBase *op) { + if (strategy_.num_threads_ == 1 && !dynamic_cast(op)) { + traced_ops_.emplace_back(op); + } +} + +void FastThreadedSSAGraphExecutor::ExecutionFinal( + std::vector *fetch_ops) { + VLOG(3) << "caught exception " << exception_.Type() << ", rethrow it"; + ClearFetchOp(graph_, fetch_ops); + exception_.ReThrow(); +} + +void FastThreadedSSAGraphExecutor::RunTracedOps( + const std::vector &traced_ops) { + for (auto &op : traced_ops) { + if (exception_.IsCaught()) { + return; + } + RunOpSync(op); + } +} + +void FastThreadedSSAGraphExecutor::RunOpSync(OpHandleBase *op) { + try { + if (VLOG_IS_ON(10)) { + VLOG(10) << op << " " << op->Name() << " : " << op->DebugString(); + } + if (LIKELY(!strategy_.dry_run_)) { + op->Run(strategy_.use_cuda_); + } + VLOG(10) << op << " " << op->Name() << " Done "; + } catch (...) { + exception_.Catch(std::current_exception()); + } +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h new file mode 100644 index 00000000..5d11c2cf --- /dev/null +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h @@ -0,0 +1,95 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/blocking_queue.h" +#include "paddle/fluid/framework/details/exception_holder.h" +#include "paddle/fluid/framework/details/execution_strategy.h" +#include "paddle/fluid/framework/details/ssa_graph_executor.h" + +namespace paddle { +namespace framework { +class Scope; +namespace details { + +class OpHandleBase; +class FastThreadedSSAGraphExecutor : public SSAGraphExecutor { + public: + FastThreadedSSAGraphExecutor(const ExecutionStrategy &strategy, + const std::vector &local_scopes, + const std::vector &local_exec_scopes, + const std::vector &places, + ir::Graph *graph); + FeedFetchList Run(const std::vector &fetch_tensors) override; + const ir::Graph &Graph() const override; + + private: + // Note(zcd): the ThreadPool should be placed last so that ThreadPool should + // be destroyed first. + ExecutionStrategy strategy_; + std::vector local_scopes_; + std::vector local_exec_scopes_; + std::vector places_; + ir::Graph *graph_; + + std::unordered_map op_deps_; + std::vector bootstrap_ops_; + + platform::DeviceContextPool fetch_ctxs_; + std::atomic remaining_; + + std::future< + std::unique_ptr>>> + atomic_op_deps_; + ExceptionHolder exception_; + + ::ThreadPool pool_; + ::ThreadPool prepare_pool_; + + std::vector traced_ops_; + + bool RunOp(OpHandleBase *op, + const std::shared_ptr> &complete_q, + size_t *complete); + + void RunOpAsync(std::unordered_map> *op_deps, + OpHandleBase *op, + const std::shared_ptr> &complete_q); + + void PrepareAtomicOpDeps(); + + inline void RecordOps(OpHandleBase *op); + + inline void ExecutionFinal(std::vector *fetch_ops); + + inline void RunOpSync(OpHandleBase *op); + + void RunTracedOps(const std::vector &traced_ops); + + void InsertFetchOps( + const std::vector &fetch_tensors, FeedFetchList *fetches, + std::unordered_map> + *fetched_vars, + std::unordered_map> *op_deps, + std::vector *fetch_ops, + std::vector *ready_fetch_ops); +}; +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fetch_barrier_op_handle.cc b/paddle/fluid/framework/details/fetch_barrier_op_handle.cc new file mode 100644 index 00000000..127183a3 --- /dev/null +++ b/paddle/fluid/framework/details/fetch_barrier_op_handle.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/fetch_barrier_op_handle.h" + +#include + +namespace paddle { +namespace framework { +namespace details { +FetchBarrierOpHandle::FetchBarrierOpHandle( + ir::Node *node, const std::vector &local_scopes, + const std::vector &places) + // fetch_barrier op always run on place0, but output on all places. + : OpHandleBase(node), + op_(framework::OpRegistry::CreateOp(*node->Op())), + local_scopes_(local_scopes), + places_(places), + run_scope_(local_scopes[0]), + place_(places[0]) { + for (auto &p : places) { + this->SetDeviceContext(p, platform::DeviceContextPool::Instance().Get(p)); + } +} + +bool FetchBarrierOpHandle::IsMultiDeviceTransfer() { + // override IsMultiDeviceTransfer to return true + return true; +} + +void FetchBarrierOpHandle::RunImpl() { + WaitInputVarGenerated(place_); + + auto run_func = [this]() { op_->Run(*local_exec_scopes_[0], place_); }; + + if (is_lock_and_record_event_free_) { + run_func(); + } else { + this->RunAndRecordEvent(run_func); + } +} + +bool FetchBarrierOpHandle::NeedWait(VarHandleBase *in_var) { + bool need_wait = + in_var && in_var->GeneratedOp() && + in_var->GeneratedOp()->DeviceContext(place_) != dev_ctxes_.at(place_); + return need_wait; +} + +std::string FetchBarrierOpHandle::Name() const { return op_->Type(); } +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fetch_barrier_op_handle.h b/paddle/fluid/framework/details/fetch_barrier_op_handle.h new file mode 100644 index 00000000..d1f7e08b --- /dev/null +++ b/paddle/fluid/framework/details/fetch_barrier_op_handle.h @@ -0,0 +1,63 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { +namespace details { + +// **NOTE**: fetch_barrier op is special it outputs all recved variables on +// all places if there are multiple places, must init with +// multiple dev_ctxes_ !!!! + +struct FetchBarrierOpHandle : public OpHandleBase { + public: + FetchBarrierOpHandle(ir::Node *node, const std::vector &local_scopes, + const std::vector &places); + + bool IsMultiDeviceTransfer() override; + + std::string Name() const override; + + protected: + void RunImpl() override; + + std::vector GetLocalScopes() override { return local_scopes_; } + + bool NeedWait(VarHandleBase *in_var) override; + + private: + std::unique_ptr op_; + std::vector local_scopes_; + std::vector places_; + Scope *run_scope_; + platform::Place place_; + + bool is_lock_and_record_event_free_{false}; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc new file mode 100644 index 00000000..1ac32ca9 --- /dev/null +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/fetch_op_handle.h" +#include +#include +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace framework { +namespace details { + +FetchOpHandle::FetchOpHandle(ir::Node *node, FeedFetchList *data, size_t offset, + std::vector *local_scopes, + std::vector *local_exec_scopes) + : OpHandleBase(node), + data_(data), + offset_(offset), + local_scopes_(local_scopes), + local_exec_scopes_(local_exec_scopes) {} + +FetchOpHandle::~FetchOpHandle() {} + +void FetchOpHandle::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) { + PADDLE_THROW("Nobody should wait FetchOp. Unexpceted Error"); +} + +void FetchOpHandle::WaitAndMergeCPUTensors() const { + std::vector tensors_ptr; + tensors_ptr.reserve(tensors_.size()); + for (auto &t : tensors_) { + tensors_ptr.emplace_back(&t); + } + data_->at(offset_).MergeLoDTensor(tensors_ptr, platform::CPUPlace()); +} + +void FetchOpHandle::RunImpl() { + platform::RecordEvent record_event(Name()); + WaitInputVarGenerated(platform::CPUPlace()); + + tensors_.resize(inputs_.size()); + platform::CPUPlace cpu; + auto &scopes = *local_exec_scopes_; + + for (size_t i = 0; i < inputs_.size(); ++i) { + auto *var_handle = static_cast(inputs_[i]); + auto &scope = scopes.at(var_handle->scope_idx()); + auto *var = scope->FindVar(var_handle->name()); + PADDLE_ENFORCE_NOT_NULL(var, "Cannot find variable %s in execution scope", + var_handle->name()); + + auto &t = var->Get(); + if (platform::is_gpu_place(t.place())) { +#ifdef PADDLE_WITH_CUDA + TensorCopy(t, cpu, &tensors_[i]); +#endif + } else { + tensors_[i].ShareDataWith(t); + } + tensors_[i].set_lod(t.lod()); + } + + this->WaitAndMergeCPUTensors(); +} + +void FetchOpHandle::WaitInputVarGenerated(const platform::Place &place) { + auto cpu_ctx = platform::DeviceContextPool::Instance().Get(place); + for (auto *input : inputs_) { + if (input->GeneratedOp()) { + input->GeneratedOp()->RecordWaitEventOnCtx(cpu_ctx); + } + } +} + +bool FetchOpHandle::IsMultiDeviceTransfer() { return true; } + +std::string FetchOpHandle::Name() const { return "Fetch"; } + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fetch_op_handle.h b/paddle/fluid/framework/details/fetch_op_handle.h new file mode 100644 index 00000000..f3af4e61 --- /dev/null +++ b/paddle/fluid/framework/details/fetch_op_handle.h @@ -0,0 +1,62 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { +namespace details { + +struct FetchOpHandle : public OpHandleBase { + public: + FetchOpHandle(ir::Node *node, FeedFetchList *data, size_t offset, + std::vector *local_scopes, + std::vector *local_exec_scopes); + + ~FetchOpHandle(); + + void RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) override; + + void WaitAndMergeCPUTensors() const; + + std::string Name() const override; + + bool IsMultiDeviceTransfer() override; + + protected: + void RunImpl() override; + + std::vector GetLocalScopes() override { return *local_scopes_; } + + void WaitInputVarGenerated(const platform::Place &place) override; + + private: + FeedFetchList *data_; + size_t offset_; + std::vector *local_scopes_; + std::vector *local_exec_scopes_; + std::vector tensors_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc new file mode 100644 index 00000000..23f0b439 --- /dev/null +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -0,0 +1,240 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h" +#include +#include +#include "paddle/fluid/framework/details/container_cast.h" +#include "paddle/fluid/framework/details/reduce_and_gather.h" +#include "paddle/fluid/framework/details/variable_visitor.h" +#include "paddle/fluid/platform/device_memory_aligment.h" +#include "paddle/fluid/platform/profiler.h" + +DEFINE_bool(skip_fused_all_reduce_check, false, ""); +namespace paddle { +namespace framework { +namespace details { + +typedef std::vector>> + GradientAndLoDTensor; + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +FusedAllReduceOpHandle::FusedAllReduceOpHandle( + ir::Node *node, const std::vector &local_scopes, + const std::vector &places, const size_t num_of_all_reduce, + const platform::NCCLCommunicator *ctxs) + : NCCLOpHandleBase(node, places, ctxs), + local_scopes_(local_scopes), + num_of_all_reduce_(num_of_all_reduce) { + PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); +} +#else + +FusedAllReduceOpHandle::FusedAllReduceOpHandle( + ir::Node *node, const std::vector &local_scopes, + const std::vector &places, const size_t num_of_all_reduce) + : OpHandleBase(node), + local_scopes_(local_scopes), + places_(places), + num_of_all_reduce_(num_of_all_reduce) { + PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); +} + +#endif + +void FusedAllReduceOpHandle::RunImpl() { + platform::RecordEvent record_event(Name()); + + VLOG(4) << this->DebugString(); + + WaitInputVarGenerated(); + // The input: grad0(dev0), grad0(dev1), grad1(dev0), grad1(dev1)... + // The output: grad0(dev0), grad0(dev1), grad1(dev0), grad1(dev1)... + auto in_var_handles = DynamicCast(this->Inputs()); + auto out_var_handles = DynamicCast(this->Outputs()); + + size_t place_num = places_.size(); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), place_num * num_of_all_reduce_, + "The NoDummyInputSize should be equal to the number of places."); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), out_var_handles.size(), + "The NoDummyInputSize and NoDummyOutputSize should be equal."); + + GradientAndLoDTensor grads_tensor; + grads_tensor.resize(place_num); + + int64_t numel = -1; + auto dtype = static_cast(0); + for (size_t scope_idx = 0; scope_idx < local_scopes_.size(); ++scope_idx) { + auto &g_tensor = grads_tensor.at(scope_idx); + g_tensor.reserve(num_of_all_reduce_); + + GetGradLoDTensor(scope_idx, in_var_handles, out_var_handles, &g_tensor); + + int64_t element_num = 0; + framework::proto::VarType::Type ele_dtype = + static_cast(0); + GetDTypeAndNumel(g_tensor, &ele_dtype, &element_num); + + if (numel == -1) { + numel = element_num; + } + if (dtype == static_cast(0)) { + dtype = ele_dtype; + PADDLE_ENFORCE_NE(ele_dtype, + static_cast(0)); + } + PADDLE_ENFORCE_EQ(ele_dtype, dtype); + + // Check whether the address space is contiguous. + std::sort( + g_tensor.begin(), g_tensor.end(), + [](const std::pair &grad1, + const std::pair &grad2) -> bool { + return grad1.second->data() < grad2.second->data(); + }); + + size_t size_of_dtype = framework::SizeOfType(dtype); + for (size_t k = 1; k < g_tensor.size(); ++k) { + const void *cur_address = g_tensor.at(k - 1).second->data(); + int64_t len = g_tensor.at(k - 1).second->numel(); + auto offset = platform::Alignment(len * size_of_dtype, places_[0]); + void *infer_next_address = reinterpret_cast( + reinterpret_cast(cur_address) + offset); + const void *next_address = g_tensor.at(k).second->data(); + + VLOG(10) << string::Sprintf( + "Input[%d](%s) address: 0X%02x, Input[%d](%s) address: 0X%02x, Infer " + "input[%d] address: 0X%02x. The offset: %d", + k - 1, g_tensor.at(k - 1).first, cur_address, g_tensor.at(k).first, k, + next_address, k, infer_next_address, offset); + PADDLE_ENFORCE_EQ(infer_next_address, next_address, + "The address is not consistent."); + } + } + + if (!FLAGS_skip_fused_all_reduce_check) { + for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) { + for (size_t j = 1; j < num_of_all_reduce_; ++j) { + PADDLE_ENFORCE_EQ(grads_tensor.at(0).at(j).first, + grads_tensor.at(scope_idx).at(j).first); + } + } + } + + std::vector lod_tensor_data; + for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) { + auto data = grads_tensor.at(scope_idx).at(0).second->data(); + lod_tensor_data.emplace_back(data); + } + + if (platform::is_gpu_place(places_[0])) { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr."); + int nccl_dtype = platform::ToNCCLDataType(dtype); + std::vector> all_reduce_calls; + for (size_t i = 0; i < local_scopes_.size(); ++i) { + auto &p = places_[i]; + void *buffer = const_cast(lod_tensor_data.at(i)); + + all_reduce_calls.emplace_back([=] { + NCCLAllReduce(p, buffer, buffer, numel, + static_cast(nccl_dtype), ncclSum); + }); + } + + VLOG(10) << "fusedallreduce size:" << numel * SizeOfType(dtype); + + this->RunAndRecordEvent([&] { + if (all_reduce_calls.size() == 1UL) { + // Do not use NCCLGroup when manage NCCL by per thread per device + all_reduce_calls[0](); + } else { + platform::NCCLGroupGuard guard; + for (auto &call : all_reduce_calls) { + call(); + } + } + }); +#else + PADDLE_THROW("Not compiled with CUDA"); +#endif + } else { + // Special handle CPU only Operator's gradient. Like CRF + auto grad_name = grads_tensor.at(0).at(0).first; + auto &trg = *this->local_exec_scopes_[0] + ->FindVar(grad_name) + ->GetMutable(); + + // Reduce All data to trg in CPU + ReduceBufferData func(lod_tensor_data, trg.data(), numel); + VisitDataType(trg.type(), func); + + for (size_t i = 1; i < local_exec_scopes_.size(); ++i) { + auto &scope = *local_exec_scopes_[i]; + auto &p = places_[i]; + auto *var = scope.FindVar(grad_name); + auto *dev_ctx = dev_ctxes_.at(p); + size_t size = numel * SizeOfType(trg.type()); + RunAndRecordEvent(p, [&trg, var, dev_ctx, p, size] { + auto dst_ptr = var->GetMutable()->data(); + platform::CPUPlace cpu_place; + memory::Copy(cpu_place, dst_ptr, cpu_place, trg.data(), size); + }); + } + } +} + +void FusedAllReduceOpHandle::GetGradLoDTensor( + const size_t &scope_idx, const std::vector &in_var_handles, + const std::vector &out_var_handles, + std::vector> *grad_tensor) const { + auto *local_scope = local_exec_scopes_[scope_idx]; + size_t place_num = places_.size(); + + for (size_t j = 0; j < in_var_handles.size(); j += place_num) { + auto var_name = in_var_handles[j]->name(); + PADDLE_ENFORCE_EQ(var_name, out_var_handles[j]->name()); + auto &lod_tensor = local_scope->FindVar(var_name)->Get(); + PADDLE_ENFORCE_EQ(lod_tensor.place(), places_.at(scope_idx)); + grad_tensor->emplace_back(std::make_pair(var_name, &lod_tensor)); + } +} + +void FusedAllReduceOpHandle::GetDTypeAndNumel( + const std::vector> &grad_tensor, + proto::VarType::Type *dtype, int64_t *numel) const { + *numel = 0; + size_t size_of_dtype = 0; + for (size_t i = 0; i < grad_tensor.size(); ++i) { + // Get dtype + auto ele_type = grad_tensor.at(i).second->type(); + if (i == 0) { + *dtype = ele_type; + size_of_dtype = framework::SizeOfType(ele_type); + } + PADDLE_ENFORCE_EQ(ele_type, *dtype); + + // Get element number + int64_t len = grad_tensor.at(i).second->numel(); + PADDLE_ENFORCE_GT(len, 0); + *numel += + platform::Alignment(len * size_of_dtype, places_[0]) / size_of_dtype; + } +} + +std::string FusedAllReduceOpHandle::Name() const { return "fused_all_reduce"; } +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h new file mode 100644 index 00000000..fccbd772 --- /dev/null +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h @@ -0,0 +1,81 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#include "paddle/fluid/framework/details/nccl_op_handle.h" +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace framework { +namespace details { + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +struct FusedAllReduceOpHandle : public NCCLOpHandleBase { + FusedAllReduceOpHandle(ir::Node *node, + const std::vector &local_scopes, + const std::vector &places, + const size_t num_of_all_reduce, + const platform::NCCLCommunicator *ctxs); +#else +struct FusedAllReduceOpHandle : public OpHandleBase { + FusedAllReduceOpHandle(ir::Node *node, + const std::vector &local_scopes, + const std::vector &places, + const size_t num_of_all_reduce); +#endif + std::string Name() const override; + + // Delay and buffer nccl_all_reduce together can significantly increase + // performance. Disable this feature by returning false. + bool IsMultiDeviceTransfer() override { return true; }; + + protected: + void RunImpl() override; + + std::vector GetLocalScopes() override { return local_scopes_; } + + private: + std::vector local_scopes_; +#if !(defined(PADDLE_WITH_CUDA) && !defined(_WIN32)) + // NCCLOpHandleBase already have these attributes. + // Will polish it by class inheritance framework. + std::vector places_; +#endif + size_t num_of_all_reduce_; + + // Check the dtype of the input + void GetDTypeAndNumel( + const std::vector> &g_tensor, + proto::VarType::Type *dtype, int64_t *total_num) const; + + // Get gradient's name and LoDTensor + void GetGradLoDTensor(const size_t &scope_idx, + const std::vector &in_var_handles, + const std::vector &out_var_handles, + std::vector> + *grad_tensor) const; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle.cc new file mode 100644 index 00000000..59c5da0d --- /dev/null +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h" +#include "paddle/fluid/framework/details/container_cast.h" +#include "paddle/fluid/framework/details/variable_visitor.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace framework { +namespace details { + +void FusedBroadcastOpHandle::RunImpl() { + platform::RecordEvent record_event(Name()); + + if (places_.size() == 1UL) return; + + auto in_var_handles = DynamicCast(inputs_); + auto out_var_handles = DynamicCast(outputs_); + + WaitInputVarGenerated(); + + size_t place_num = places_.size(); + PADDLE_ENFORCE_EQ(in_var_handles.size() * place_num, out_var_handles.size()); + + for (size_t i = 0; i < in_var_handles.size(); ++i) { + BroadcastOneVar( + *in_var_handles[i], + std::vector(out_var_handles.begin() + i * place_num, + out_var_handles.begin() + (i + 1) * place_num), + local_exec_scopes_); + } +} + +std::string FusedBroadcastOpHandle::Name() const { return "fused_broadcast"; } + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.h b/paddle/fluid/framework/details/fused_broadcast_op_handle.h new file mode 100644 index 00000000..e43d545c --- /dev/null +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.h @@ -0,0 +1,57 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/details/broadcast_op_handle.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/device_context.h" + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace framework { +namespace details { + +struct FusedBroadcastOpHandle : public BroadcastOpHandle { + public: +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + FusedBroadcastOpHandle(ir::Node *node, + const std::vector local_scopes, + const std::vector &places, + const platform::NCCLContextMap *nccl_ctx) + : BroadcastOpHandle(node, local_scopes, places, nccl_ctx) {} +#else + FusedBroadcastOpHandle(ir::Node* node, const std::vector local_scopes, + const std::vector& places) + : BroadcastOpHandle(node, local_scopes, places) {} +#endif + std::string Name() const override; + + protected: + void RunImpl() override; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc new file mode 100644 index 00000000..49404509 --- /dev/null +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc @@ -0,0 +1,171 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h" +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/details/broadcast_op_handle_test.h" + +namespace paddle { +namespace framework { +namespace details { + +struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { + std::vector out_varnames_; + std::vector> nodes_; + + void InitFusedBroadcastOp(std::vector input_scope_idxes) { + nodes_.clear(); + // initialize scope and var + std::unordered_map scope_map; + for (size_t i = 0; i < place_list_.size(); ++i) { + local_scopes_.push_back(&(g_scope_.NewScope())); + Scope& local_scope = local_scopes_.back()->NewScope(); + for (size_t j = 0; j < input_scope_idxes.size(); ++j) { + local_scope.Var("out_var" + std::to_string(j)); + if (i == j) local_scope.Var("in_var" + std::to_string(j)); + } + param_scopes_.emplace_back(&local_scope); + scope_map.emplace(local_scopes_.back(), param_scopes_.back()); + } + + // create op handle node + nodes_.emplace_back( + ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation)); + if (use_gpu_) { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + op_handle_ = new FusedBroadcastOpHandle( + nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get()); +#else + PADDLE_THROW("CUDA is not supported."); +#endif + } else { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + op_handle_ = new FusedBroadcastOpHandle( + nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get()); +#else + op_handle_ = new FusedBroadcastOpHandle(nodes_.back().get(), + local_scopes_, place_list_); +#endif + } + + op_handle_->SetLocalExecScopes(scope_map); + + for (size_t i = 0; i < input_scope_idxes.size(); ++i) { + // add input var handle + nodes_.emplace_back(ir::CreateNodeForTest("in_node" + std::to_string(i), + ir::Node::Type::kVariable)); + VarHandle* in_var_handle = new VarHandle( + nodes_.back().get(), 1, input_scope_idxes[i], + "in_var" + std::to_string(i), place_list_[input_scope_idxes[i]]); + vars_.emplace_back(in_var_handle); + op_handle_->AddInput(in_var_handle); + + // add output var handle + for (size_t j = 0; j < place_list_.size(); ++j) { + nodes_.emplace_back(ir::CreateNodeForTest( + "out_node" + std::to_string(i), ir::Node::Type::kVariable)); + VarHandle* out_var_handle = + new VarHandle(nodes_.back().get(), 2, j, + "out_var" + std::to_string(i), place_list_[j]); + vars_.emplace_back(out_var_handle); + op_handle_->AddOutput(out_var_handle); + } + } + } + + void TestFusedBroadcastLoDTensor(std::vector input_scope_idxes) { + std::vector> send_vec; + f::LoD lod{{0, 10, 20}}; + for (size_t i = 0; i < input_scope_idxes.size(); ++i) { + const std::string varname("in_var" + std::to_string(i)); + float val_scalar = static_cast(i); + send_vec.push_back( + InitLoDTensor(varname, input_scope_idxes[i], lod, val_scalar)); + } + + op_handle_->Run(false); + + WaitAll(); + for (size_t i = 0; i < input_scope_idxes.size(); ++i) { + const std::string& varname("out_var" + std::to_string(i)); + for (size_t j = 0; j < place_list_.size(); ++j) { + LoDTensorEqual(varname, send_vec[i], lod, param_scopes_[j]); + } + } + } + + void TestFusedBroadcastSelectedRows(std::vector input_scope_idxes) { + std::vector> send_vector; + std::vector rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1, + 2, 4, 6, 3, 1, 1, 1, 1, 3, 7}; + int height = static_cast(kDims[0] * 2); + for (size_t i = 0; i < input_scope_idxes.size(); ++i) { + const std::string varname("in_var" + std::to_string(i)); + float val_scalar = static_cast(i); + send_vector.push_back(InitSelectedRows(varname, input_scope_idxes[i], + rows, height, val_scalar)); + } + + op_handle_->Run(false); + + WaitAll(); + for (size_t i = 0; i < input_scope_idxes.size(); ++i) { + const std::string& varname("out_var" + std::to_string(i)); + for (size_t j = 0; j < place_list_.size(); ++j) { + SelectedRowsEqual(varname, input_scope_idxes[i], send_vector[i], rows, + height); + } + } + } +}; + +TEST(FusedBroadcastTester, CPULodTensor) { + TestFusedBroadcastOpHandle test_op; + std::vector input_scope_idxes = {0, 1}; + test_op.InitCtxOnGpu(false); + test_op.InitFusedBroadcastOp(input_scope_idxes); + test_op.TestFusedBroadcastLoDTensor(input_scope_idxes); +} + +TEST(FusedBroadcastTester, CPUSelectedRows) { + TestFusedBroadcastOpHandle test_op; + std::vector input_scope_idxes = {0, 1}; + test_op.InitCtxOnGpu(false); + test_op.InitFusedBroadcastOp(input_scope_idxes); + test_op.TestFusedBroadcastSelectedRows(input_scope_idxes); +} + +#ifdef PADDLE_WITH_CUDA +TEST(FusedBroadcastTester, GPULodTensor) { + TestFusedBroadcastOpHandle test_op; + std::vector input_scope_idxes = {0, 1}; + test_op.InitCtxOnGpu(true); + test_op.InitFusedBroadcastOp(input_scope_idxes); + test_op.TestFusedBroadcastLoDTensor(input_scope_idxes); +} + +TEST(FusedBroadcastTester, GPUSelectedRows) { + TestFusedBroadcastOpHandle test_op; + std::vector input_scope_idxes = {0, 1}; + test_op.InitCtxOnGpu(true); + test_op.InitFusedBroadcastOp(input_scope_idxes); + test_op.TestFusedBroadcastSelectedRows(input_scope_idxes); +} +#endif + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/gather_op_handle.cc b/paddle/fluid/framework/details/gather_op_handle.cc new file mode 100644 index 00000000..a039c620 --- /dev/null +++ b/paddle/fluid/framework/details/gather_op_handle.cc @@ -0,0 +1,115 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/gather_op_handle.h" +#include "paddle/fluid/framework/details/container_cast.h" +#include "paddle/fluid/framework/details/variable_visitor.h" + +namespace paddle { +namespace framework { +namespace details { + +GatherOpHandle::GatherOpHandle(ir::Node *node, + const std::vector &local_scopes, + const std::vector &places) + : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {} + +void GatherOpHandle::RunImpl() { + if (places_.size() == 1) return; + // the input and output may have dummy var. + auto in_var_handles = DynamicCast(inputs_); + + PADDLE_ENFORCE_EQ( + in_var_handles.size(), places_.size(), + "The number of output should equal to the number of places."); + + VarHandle *out_var_handle; + { + auto out_var_handles = DynamicCast(this->Outputs()); + PADDLE_ENFORCE_EQ(out_var_handles.size(), 1, + "The number of output should be one."); + out_var_handle = out_var_handles.front(); + } + + auto &var_scopes = local_exec_scopes_; + + auto in_0_handle = in_var_handles[0]; + auto pre_in_var = + var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name()); + PADDLE_ENFORCE_NOT_NULL(pre_in_var); + + PADDLE_ENFORCE(pre_in_var->IsType(), + "Currently, gather_op only can gather SelectedRows."); + + // Wait input done, this Wait is asynchronous operation + WaitInputVarGenerated(); + + auto &pre_in_value = pre_in_var->Get(); + std::vector out_rows; + std::vector in_tensors; + + // Gather the inputs + for (auto *in_handle : in_var_handles) { + auto *in_var = + var_scopes.at(in_handle->scope_idx())->FindVar(in_handle->name()); + PADDLE_ENFORCE_NOT_NULL(in_var); + VariableVisitor::EnforceShapeAndDTypeEQ(*in_var, *pre_in_var); + + auto &in_sr_value = in_var->Get(); + + auto &in_sr_rows = in_sr_value.rows(); + out_rows.insert(out_rows.end(), in_sr_rows.begin(), in_sr_rows.end()); + in_tensors.emplace_back(in_sr_value.value()); + } + + // NOTE: The Places of all input tensor must be all on CPU or all on GPU. + platform::Place t_out_p = out_var_handle->place(); + if (platform::is_gpu_place(pre_in_value.place())) { + PADDLE_ENFORCE(platform::is_gpu_place(t_out_p), + "Places of input and output must be all on GPU."); + } else { + t_out_p = platform::CPUPlace(); + } + + auto out_var = var_scopes.at(out_var_handle->scope_idx()) + ->FindVar(out_var_handle->name()); + PADDLE_ENFORCE_NOT_NULL(out_var); + auto out_value = out_var->GetMutable(); + out_value->set_height(pre_in_value.height()); + out_value->set_rows(out_rows); + size_t rows = out_rows.size(); + DDim out_dim = pre_in_value.GetCompleteDims(); + out_dim[0] = static_cast(rows); + out_value->mutable_value()->Resize(out_dim).mutable_data( + t_out_p, pre_in_value.value().type()); + Tensor *out_tensor = out_value->mutable_value(); + + // copy + auto dev_ctx = dev_ctxes_.at(out_var_handle->place()); + RunAndRecordEvent(out_var_handle->place(), [in_tensors, out_tensor, &dev_ctx, + t_out_p] { + int s = 0, e = 0; + for (size_t j = 0; j < in_tensors.size(); ++j) { + e += in_tensors[j].dims()[0]; + auto sub_out = out_tensor->Slice(s, e); + paddle::framework::TensorCopy(in_tensors[j], t_out_p, *dev_ctx, &sub_out); + s = e; + } + }); +} + +std::string GatherOpHandle::Name() const { return "gather"; } +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/gather_op_handle.h b/paddle/fluid/framework/details/gather_op_handle.h new file mode 100644 index 00000000..ac87b246 --- /dev/null +++ b/paddle/fluid/framework/details/gather_op_handle.h @@ -0,0 +1,52 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { +namespace details { + +struct GatherOpHandle : public OpHandleBase { + public: + GatherOpHandle(ir::Node *node, const std::vector &local_scopes, + const std::vector &places); + + std::string Name() const override; + + bool IsMultiDeviceTransfer() override { return false; }; + + protected: + void RunImpl() override; + + std::vector GetLocalScopes() override { return local_scopes_; } + + private: + const std::vector &local_scopes_; + const std::vector &places_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc new file mode 100644 index 00000000..5d8562e7 --- /dev/null +++ b/paddle/fluid/framework/details/gather_op_handle_test.cc @@ -0,0 +1,212 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/gather_op_handle.h" +#include +#include +#include "gtest/gtest.h" + +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { +namespace details { +namespace f = paddle::framework; +namespace p = paddle::platform; + +// test data amount +const f::DDim kDims = {20, 20}; + +struct TestGatherOpHandle { + std::vector> ctxs_; + std::vector local_scopes_; + std::vector param_scopes_; + Scope g_scope_; + OpHandleBase* op_handle_; + std::vector vars_; + std::vector gpu_list_; + std::vector> nodes_; + + void WaitAll() { + for (size_t j = 0; j < ctxs_.size(); ++j) { + ctxs_[j]->Wait(); + } + } + + void InitCtxOnGpu(bool use_gpu) { + if (use_gpu) { +#ifdef PADDLE_WITH_CUDA + int count = p::GetCUDADeviceCount(); + if (count <= 1) { + LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA " + "device count is " + << count; + exit(0); + } + for (int i = 0; i < count; ++i) { + auto p = p::CUDAPlace(i); + gpu_list_.push_back(p); + ctxs_.emplace_back(new p::CUDADeviceContext(p)); + } +#else + PADDLE_THROW("CUDA is not support."); +#endif + } else { + int count = 8; + for (int i = 0; i < count; ++i) { + auto p = p::CPUPlace(); + gpu_list_.push_back(p); + ctxs_.emplace_back(new p::CPUDeviceContext(p)); + } + } + } + + void InitGatherOp(size_t input_scope_idx) { + nodes_.clear(); + std::unordered_map scope_map; + for (size_t j = 0; j < gpu_list_.size(); ++j) { + local_scopes_.push_back(&(g_scope_.NewScope())); + Scope& local_scope = local_scopes_.back()->NewScope(); + local_scope.Var("input"); + param_scopes_.emplace_back(&local_scope); + scope_map.emplace(local_scopes_.back(), param_scopes_.back()); + } + param_scopes_[input_scope_idx]->Var("out"); + + nodes_.emplace_back( + ir::CreateNodeForTest("node", ir::Node::Type::kOperation).release()); + op_handle_ = + new GatherOpHandle(nodes_.back().get(), local_scopes_, gpu_list_); + + op_handle_->SetLocalExecScopes(scope_map); + + // add input + for (size_t j = 0; j < gpu_list_.size(); ++j) { + op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get()); + nodes_.emplace_back( + ir::CreateNodeForTest("node1", ir::Node::Type::kVariable).release()); + auto* in_var_handle = + new VarHandle(nodes_.back().get(), 1, j, "input", gpu_list_[j]); + vars_.emplace_back(in_var_handle); + op_handle_->AddInput(in_var_handle); + } + + // add dummy var + nodes_.emplace_back( + ir::CreateNodeForTest("node2", ir::Node::Type::kVariable).release()); + vars_.emplace_back(new DummyVarHandle(nodes_.back().get())); + DummyVarHandle* in_dummy_var_handle = + static_cast(vars_.back()); + in_dummy_var_handle->ClearGeneratedOp(); + op_handle_->AddInput(in_dummy_var_handle); + + // add output + nodes_.emplace_back( + ir::CreateNodeForTest("node3", ir::Node::Type::kVariable).release()); + auto* out_var_handle = + new VarHandle(nodes_.back().get(), 2, input_scope_idx, "out", + gpu_list_[input_scope_idx]); + vars_.emplace_back(out_var_handle); + op_handle_->AddOutput(out_var_handle); + + // add dummy var + nodes_.emplace_back( + ir::CreateNodeForTest("node4", ir::Node::Type::kVariable).release()); + vars_.emplace_back(new DummyVarHandle(nodes_.back().get())); + DummyVarHandle* dummy_var_handle = + static_cast(vars_.back()); + op_handle_->AddOutput(dummy_var_handle); + } + + void TestGatherSelectedRows(size_t output_scope_idx) { + int height = kDims[0] * 2; + std::vector rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1, + 2, 4, 6, 3, 1, 1, 1, 1, 3, 7}; + std::vector send_vector(f::product(kDims)); + for (size_t k = 0; k < send_vector.size(); ++k) { + send_vector[k] = k; + } + + for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size(); + ++input_scope_idx) { + auto in_var = param_scopes_.at(input_scope_idx)->FindVar("input"); + PADDLE_ENFORCE_NOT_NULL(in_var); + auto in_selected_rows = in_var->GetMutable(); + auto value = in_selected_rows->mutable_value(); + value->mutable_data(kDims, gpu_list_[input_scope_idx]); + + in_selected_rows->set_height(height); + in_selected_rows->set_rows(rows); + + paddle::framework::TensorFromVector( + send_vector, *(ctxs_[input_scope_idx]), value); + value->Resize(kDims); + } + + auto out_var = param_scopes_.at(output_scope_idx)->FindVar("out"); + PADDLE_ENFORCE_NOT_NULL(out_var); + auto out_selected_rows = out_var->GetMutable(); + + auto in_var = param_scopes_.at(output_scope_idx)->FindVar("input"); + auto in_selected_rows = in_var->GetMutable(); + + out_selected_rows->mutable_value()->ShareDataWith( + in_selected_rows->value()); + + op_handle_->Run(false); + + WaitAll(); + + p::CPUPlace cpu_place; + + auto& out_select_rows = out_var->Get(); + auto rt = out_select_rows.value(); + + PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal."); + for (size_t k = 0; k < out_select_rows.rows().size(); ++k) { + PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]); + } + + f::Tensor result_tensor; + f::TensorCopy(rt, cpu_place, *(ctxs_[output_scope_idx]), &result_tensor); + float* ct = result_tensor.data(); + + for (int64_t j = 0; + j < f::product(kDims) * static_cast(gpu_list_.size()); ++j) { + ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5); + } + } +}; + +TEST(GatherTester, TestCPUGatherTestSelectedRows) { + TestGatherOpHandle test_op; + size_t input_scope_idx = 0; + test_op.InitCtxOnGpu(false); + test_op.InitGatherOp(input_scope_idx); + test_op.TestGatherSelectedRows(input_scope_idx); +} + +#ifdef PADDLE_WITH_CUDA + +TEST(GatherTester, TestGPUGatherTestSelectedRows) { + TestGatherOpHandle test_op; + size_t input_scope_idx = 0; + test_op.InitCtxOnGpu(false); + test_op.InitGatherOp(input_scope_idx); + test_op.TestGatherSelectedRows(input_scope_idx); +} +#endif +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/graph_test_base.h b/paddle/fluid/framework/details/graph_test_base.h new file mode 100644 index 00000000..d139f848 --- /dev/null +++ b/paddle/fluid/framework/details/graph_test_base.h @@ -0,0 +1,80 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "glog/logging.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { + +class DummyOp : public OperatorBase { + public: + DummyOp(const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, const AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const Scope& scope, + const platform::Place& place) const override {} +}; + +class SumOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "").AsDuplicable(); + AddOutput("Out", ""); + AddComment(""); + } +}; + +class AssignOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "").AsDuplicable(); + AddOutput("Out", ""); + AddComment(""); + } +}; + +class SplitOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", ""); + AddOutput("Out", "").AsDuplicable(); + AddComment(""); + } +}; + +class DummyVarTypeInference : public VarTypeInference { + public: + void operator()(framework::InferVarTypeContext* ctx) const override { + auto& inputs = ctx->Input("X"); + auto type = ctx->GetType(inputs.front()); + auto out_var_name = ctx->Output("Out").front(); + ctx->SetType(out_var_name, type); + } +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_helper.cc b/paddle/fluid/framework/details/multi_devices_helper.cc new file mode 100644 index 00000000..0242274a --- /dev/null +++ b/paddle/fluid/framework/details/multi_devices_helper.cc @@ -0,0 +1,20 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/fluid/framework/details/multi_devices_helper.h" + +namespace paddle { +namespace framework { +namespace details {} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_helper.h b/paddle/fluid/framework/details/multi_devices_helper.h new file mode 100644 index 00000000..8cd41954 --- /dev/null +++ b/paddle/fluid/framework/details/multi_devices_helper.h @@ -0,0 +1,77 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/details/var_handle.h" + +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/platform/place.h" + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace details { + +// all variable in each devices. +// The outside vector is the device vector. Each element of this vector is a +// map from variable name to variables. The variables, who have the same name, +// will have a differsent version. The offset in the +// `std::vector` is the version of varaibles. +typedef std::vector>> + GraphVars; +constexpr char kGraphVars[] = "vars"; + +constexpr char kPlaces[] = "places"; +constexpr char kLocalScopes[] = "local_scopes"; +constexpr char kNCCLCtxs[] = "nccl_ctxs"; +constexpr char kUseHierarchicalAllReduce[] = "use_hierarchical_allreduce"; + +// aux variables to represent dependency. Useful to resolve data hazard. +typedef std::unordered_set GraphDepVars; +constexpr char kGraphDepVars[] = "dep_vars"; + +typedef std::unordered_set FusedVars; +constexpr char kFusedVars[] = "fused_vars"; +constexpr char kFusedVarNamePrefix[] = "@FUSEDVAR@"; + +typedef std::string FusedOptType; +constexpr char kFusedOptType[] = "fused_opt_type"; + +typedef std::vector FusedGrads; +constexpr char kFusedGrads[] = "fused_gradients"; + +typedef std::vector> ParamsAndGrads; +constexpr char kParamsAndDenseGrads[] = "params_and_dense_grads"; +constexpr char kParamsAndSparseGrads[] = "params_and_sparse_grads"; + +typedef std::vector ProgramDescs; +constexpr char kProgramDescs[] = "program_descs"; + +typedef std::vector>> + GroupParamsAndGrads; +constexpr char kGroupParamsAndDenseGrads[] = "group_params_dense_grads"; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/nccl_op_handle.h b/paddle/fluid/framework/details/nccl_op_handle.h new file mode 100644 index 00000000..56daccca --- /dev/null +++ b/paddle/fluid/framework/details/nccl_op_handle.h @@ -0,0 +1,235 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/dynload/nccl.h" +#include "paddle/fluid/platform/nccl_helper.h" + +DECLARE_bool(sync_nccl_allreduce); + +namespace paddle { +namespace framework { +namespace details { + +class NCCLOpHandleBase : public OpHandleBase { + public: + NCCLOpHandleBase(ir::Node* node, const std::vector& places, + const platform::NCCLCommunicator* nccl_ctxs) + : OpHandleBase(node), places_(places), nccl_ctxs_(nccl_ctxs) { + if (nccl_ctxs == nullptr) { + return; + } + // init device context + auto default_nccl_ctxs = nccl_ctxs_->DefaultFlatCtx(); + for (auto& p : places_) { + this->SetDeviceContext(p, default_nccl_ctxs->DevCtx(p)); + } + } + virtual ~NCCLOpHandleBase() { + for (auto& ev : inter_events_) { + PADDLE_ENFORCE(cudaEventDestroy(ev.second)); + } + for (auto& ev : exter_events_) { + PADDLE_ENFORCE(cudaEventDestroy(ev.second)); + } + } + void SetRunEnv(int run_order, bool use_hierarchical_allreduce) { + PADDLE_ENFORCE(run_order >= 0, "run_order must >= 0"); + run_order_ = run_order; + use_hierarchical_allreduce_ = use_hierarchical_allreduce; + + VLOG(10) << "SetRunEnv " + << " run_order:" << run_order + << ", use_hierarchical_allreduce:" << use_hierarchical_allreduce + << ", nccl_ctx_:" << nccl_ctxs_; + + if (nccl_ctxs_ == nullptr) { + return; + } + + if (!use_hierarchical_allreduce_) { + auto ctxs = nccl_ctxs_->GetFlatCtx(run_order); + for (auto& p : places_) { + this->SetDeviceContext(p, ctxs->DevCtx(p)); + } + return; + } + + PADDLE_ENFORCE(places_.size() == 1, + "HierarchicalAllReduce run one proc with one card mode."); + + for (auto& p : places_) { + auto ctxs = nccl_ctxs_->GetHierarchicalInterCtx(run_order); + this->SetDeviceContext(p, ctxs->DevCtx(p)); + } + + for (auto& p : dev_ctxes_) { + int dev_id = boost::get(p.first).device; + if (inter_events_.find(dev_id) != inter_events_.end()) { + continue; + } + + PADDLE_ENFORCE(cudaSetDevice(dev_id)); + PADDLE_ENFORCE(cudaEventCreateWithFlags(&inter_events_[dev_id], + cudaEventDisableTiming)); + PADDLE_ENFORCE(cudaEventCreateWithFlags(&exter_events_[dev_id], + cudaEventDisableTiming)); + VLOG(10) << "Create events on dev_id:" << dev_id + << ", inter_event:" << &inter_events_[dev_id] + << ", exter_event:" << &exter_events_[dev_id]; + } + } + + void FlatNCCLAllReduce(platform::Place place, const void* sendbuff, + void* recvbuff, size_t count, ncclDataType_t datatype, + ncclRedOp_t op) { + PADDLE_ENFORCE(run_order_ >= 0, "run_order must > 0"); + auto flat_nccl_ctxs = nccl_ctxs_->GetFlatCtx(run_order_); + int dev_id = boost::get(place).device; + auto& nccl_ctx = flat_nccl_ctxs->at(dev_id); + auto stream = nccl_ctx.stream(); + auto comm = nccl_ctx.comm_; + + VLOG(10) << "before all reduce buffer:" << sendbuff << ", numel:" << count + << ", dev_id:" << dev_id << ", dtype:" << datatype + << ", place:" << place; + + PADDLE_ENFORCE(platform::dynload::ncclAllReduce( + sendbuff, recvbuff, count, datatype, op, comm, stream)); + } + + void NCCLAllReduce(platform::Place place, const void* sendbuff, + void* recvbuff, size_t count, ncclDataType_t datatype, + ncclRedOp_t op) { + PADDLE_ENFORCE(run_order_ >= 0, "run_order must > 0"); + if (!use_hierarchical_allreduce_) { + FlatNCCLAllReduce(place, sendbuff, recvbuff, count, datatype, op); + return; + } + + HierarchicalAllReduce(place, sendbuff, recvbuff, count, datatype, op); + } + + void HierarchicalAllReduce(platform::Place place, const void* sendbuff, + void* recvbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op) { + PADDLE_ENFORCE(run_order_ >= 0, "run_order must > 0"); + InterReduce(place, sendbuff, recvbuff, count, datatype, op); + // When a trainer is not in exter allreduce ring + // they need not to call this. + if (nccl_ctxs_->NeedExterAllReduce()) { + ExterAllReduce(place, recvbuff, recvbuff, count, datatype, op); + } + InterBroadCast(place, recvbuff, count, datatype, op); + } + + protected: + void InterReduce(platform::Place place, const void* sendbuff, void* recvbuff, + size_t count, ncclDataType_t datatype, ncclRedOp_t op) { + auto nccl_ctxs = nccl_ctxs_->GetHierarchicalInterCtx(run_order_); + int dev_id = boost::get(place).device; + auto& nccl_ctx = nccl_ctxs->at(dev_id); + auto stream = nccl_ctx.stream(); + auto comm = nccl_ctx.comm_; + + VLOG(10) << "before all reduce" + << " run_order:" << run_order_ << ", buffer:" << sendbuff + << ", numel:" << count << ", dev_id:" << dev_id + << ", dtype:" << datatype << ", place:" << place + << ", stream:" << stream; + + PADDLE_ENFORCE(platform::dynload::ncclReduce( + sendbuff, recvbuff, count, datatype, ncclSum, 0, comm, stream)); + + cudaEventRecord(inter_events_.at(dev_id), stream); + + if (FLAGS_sync_nccl_allreduce) { + PADDLE_ENFORCE(cudaStreamSynchronize(stream), + "sync HierarchicalAllReduce inter stream error"); + } + } + + void ExterAllReduce(platform::Place place, const void* sendbuff, + void* recvbuff, size_t count, ncclDataType_t datatype, + ncclRedOp_t op) { + auto nccl_ctxs = nccl_ctxs_->GetHierarchicalExterCtx(run_order_); + PADDLE_ENFORCE(nccl_ctxs_, "can't get exter %d nccl_ctxs", run_order_); + int dev_id = boost::get(place).device; + auto& nccl_ctx = nccl_ctxs->at(dev_id); + auto stream = nccl_ctx.stream(); + auto comm = nccl_ctx.comm_; + + VLOG(10) << "before all reduce run_order:" << run_order_ + << "buffer:" << sendbuff << ", numel:" << count + << ", dev_id:" << dev_id << ", dtype:" << datatype + << ", place:" << place << ", stream:" << stream; + + cudaStreamWaitEvent(stream, inter_events_.at(dev_id), 0); + + PADDLE_ENFORCE(platform::dynload::ncclAllReduce( + sendbuff, recvbuff, count, datatype, op, comm, stream)); + + cudaEventRecord(exter_events_.at(dev_id), stream); + + if (FLAGS_sync_nccl_allreduce) { + PADDLE_ENFORCE(cudaStreamSynchronize(stream), + "sync HierarchicalAllReduce exter stream error"); + } + } + + void InterBroadCast(platform::Place place, void* sendbuff, size_t count, + ncclDataType_t datatype, ncclRedOp_t op) { + auto nccl_ctxs = nccl_ctxs_->GetHierarchicalInterCtx(run_order_); + int dev_id = boost::get(place).device; + auto& nccl_ctx = nccl_ctxs->at(dev_id); + auto stream = nccl_ctx.stream(); + auto comm = nccl_ctx.comm_; + + VLOG(10) << "before InterBroadCast buffer:" << sendbuff + << ", numel:" << count << ", dev_id:" << dev_id + << ", dtype:" << datatype << ", place:" << place + << ", stream:" << stream; + + cudaStreamWaitEvent(stream, exter_events_.at(dev_id), 0); + PADDLE_ENFORCE(platform::dynload::ncclBcast(sendbuff, count, datatype, 0, + comm, stream)); + } + + protected: + std::vector places_; + const platform::NCCLCommunicator* nccl_ctxs_{nullptr}; + // When multi trainer call collective function, they need run the same order. + // Or the program will hang.So we use allreduce_deps_pass to set this + // run_order_. + int run_order_{0}; + // Use 2d allreduce or not. + bool use_hierarchical_allreduce_{false}; + + private: + // hierarchical needed events + std::unordered_map inter_events_; + std::unordered_map exter_events_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc new file mode 100644 index 00000000..b2fa31f7 --- /dev/null +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -0,0 +1,254 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/fluid/framework/details/op_handle_base.h" +#include +#include + +namespace paddle { +namespace framework { +namespace details { +std::string OpHandleBase::DebugString() const { + std::stringstream ss; + ss << Name() << "("; + for (auto *var : inputs_) { + ss << var->DebugString() << ", "; + } + ss << ") --> ("; + for (auto *var : outputs_) { + ss << var->DebugString() << ", "; + } + ss << ")\n"; + return ss.str(); +} + +OpHandleBase::~OpHandleBase() { +#ifdef PADDLE_WITH_CUDA + for (auto &ev : events_) { + if (ev.second) { + PADDLE_ENFORCE(cudaEventDestroy(ev.second)); + } + } +#endif +} + +void OpHandleBase::InitCUDA() { +#ifdef PADDLE_WITH_CUDA + for (auto &p : dev_ctxes_) { + int dev_id = boost::get(p.first).device; + PADDLE_ENFORCE(cudaSetDevice(dev_id)); + PADDLE_ENFORCE( + cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming)); + } + if (IsMultiDeviceTransfer() && dev_ctxes_.size() > 0) { + for (auto &out_var : outputs_) { + auto *out_var_handle = dynamic_cast(out_var); + if (out_var_handle) { + int dev_id = + boost::get(out_var_handle->place()).device; + out_var_handle->SetGenerateEvent(events_.at(dev_id)); + } + } + } else { + PADDLE_ENFORCE_EQ(dev_ctxes_.size(), 1UL, + "%s should have only one dev_ctx.", Name()); + auto &place = dev_ctxes_.begin()->first; + int dev_id = boost::get(place).device; + for (auto &out_var : outputs_) { + auto *out_var_handle = dynamic_cast(out_var); + if (out_var_handle) { + PADDLE_ENFORCE(platform::is_same_place(place, out_var_handle->place()), + "The place of output(%s) is not consistent with the " + "place of current op(%s).", + out_var_handle->Name(), Name()); + out_var_handle->SetGenerateEvent(events_.at(dev_id)); + } + } + } +#endif +} + +void OpHandleBase::Run(bool use_cuda) { +#ifdef PADDLE_WITH_CUDA + if (events_.empty() && use_cuda && dev_ctxes_.size() > 0) { + InitCUDA(); + } +#else + PADDLE_ENFORCE(!use_cuda); +#endif + + RunImpl(); +} + +void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) { +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_NOT_NULL(waited_ctx); + if (platform::is_cpu_place(waited_ctx->GetPlace()) || events_.empty()) { + for (auto &dev_ctx : dev_ctxes_) { + PADDLE_ENFORCE_NOT_NULL(dev_ctx.second); + dev_ctx.second->Wait(); + } + } else { + auto stream = + static_cast(waited_ctx)->stream(); + for (auto &ev : events_) { + PADDLE_ENFORCE(cudaStreamWaitEvent(stream, ev.second, 0)); + } + } +#else + for (auto &dev_ctx : dev_ctxes_) { + dev_ctx.second->Wait(); + } +#endif +} + +void OpHandleBase::AddInput(VarHandleBase *in) { + this->inputs_.emplace_back(in); + node_->inputs.push_back(in->Node()); + in->AddOutput(this, this->Node()); +} + +void OpHandleBase::AddOutput(VarHandleBase *out) { + outputs_.emplace_back(out); + node_->outputs.push_back(out->Node()); + out->AddInput(this, this->Node()); +} + +void OpHandleBase::WaitInputVarGenerated() { + for (auto in_var : inputs_) { + if (NeedWait(in_var)) { + // Dummy Variable is used to represent dependencies between operators, so + // there doesn't add event for it. + auto *in_var_handle = dynamic_cast(in_var); + if (in_var_handle) { + auto &place = in_var_handle->place(); + if (platform::is_gpu_place(place)) { +#ifdef PADDLE_WITH_CUDA + auto stream = + static_cast(dev_ctxes_.at(place)) + ->stream(); + PADDLE_ENFORCE( + cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); +#else + PADDLE_THROW("Doesn't compile the GPU."); +#endif + } + // There are nothing to do when the place is CPUPlace. + } + } + } +} + +void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) { + for (auto in_var : inputs_) { + if (NeedWait(in_var)) { + // Dummy Variable is used to represent dependencies between operators, so + // there doesn't add event for it. + auto *in_var_handle = dynamic_cast(in_var); + if (in_var_handle) { + if (platform::is_gpu_place(in_var_handle->place())) { +#ifdef PADDLE_WITH_CUDA + auto stream = static_cast( + dev_ctxes_.at(in_var_handle->place())) + ->stream(); + PADDLE_ENFORCE( + cudaStreamWaitEvent(stream, in_var_handle->GetEvent(), 0)); +#else + PADDLE_THROW("Doesn't compile the GPU."); +#endif + } + // There are nothing to do when the place is CPUPlace. + } + } + } +} + +size_t OpHandleBase::NoDummyInputSize() const { + size_t cnt = 0; + for (auto *in : inputs_) { + if (dynamic_cast(in) == nullptr) { + ++cnt; + } + } + return cnt; +} + +bool OpHandleBase::NeedWait(VarHandleBase *in_var) { + return in_var && in_var->GeneratedOp(); +} + +void OpHandleBase::RunAndRecordEvent(const std::function &callback) { +#ifdef PADDLE_WITH_CUDA + if (!events_.empty()) { // Use event + std::function method = callback; + for (auto &p : dev_ctxes_) { + method = [method, p, this]() { + VLOG(10) << "cudadevicecontext:" + << static_cast(p.second) + << ", dev_id:" + << boost::get(p.first).device; + + static_cast(p.second)->RecordEvent( + events_.at(boost::get(p.first).device), + method); + }; + } + method(); + } else { +#endif + callback(); +#ifdef PADDLE_WITH_CUDA + } +#endif +} + +void OpHandleBase::RunAndRecordEvent(platform::Place p, + const std::function &callback) { +#ifdef PADDLE_WITH_CUDA + if (platform::is_cpu_place(p) || events_.empty()) { + callback(); + } else { + auto *ctx = dev_ctxes_.at(p); + auto *cuda_ctx = static_cast(ctx); + cuda_ctx->RecordEvent(events_.at(boost::get(p).device), + callback); + } +#else + callback(); +#endif +} + +size_t OpHandleBase::NotReadyInputSize() const { + std::unordered_set res; + for (auto *var : inputs_) { + if (var->GeneratedOp() != nullptr) { + res.emplace(var); + } + } + return res.size(); +} + +void OpHandleBase::SetLocalExecScopes( + const std::unordered_map &scope_map) { + local_exec_scopes_.clear(); + auto scopes = GetLocalScopes(); + for (auto *scope : scopes) { + auto iter = scope_map.find(scope); + PADDLE_ENFORCE(iter != scope_map.end(), "Local scope not found"); + local_exec_scopes_.emplace_back(iter->second); + } +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h new file mode 100644 index 00000000..4c708691 --- /dev/null +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -0,0 +1,142 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/details/var_handle.h" +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace framework { + +class Scope; + +namespace details { + +// Wraps ir::Node and provide helper utilities. +// It's responsible for populating necessary fields of ir::Node. +class OpHandleBase { + public: + /** + * NOTE(zjl): Some op should have higher priority than others. + * The higher priority op would run first without switching + * threads in Executor. + */ + enum Priority { kHighest = 0, kNormal = 1 }; + + // Owned by `node`. No need to be deleted explicitly. + explicit OpHandleBase(ir::Node *node) : node_(node) { + node_->WrappedBy(this); + } + + virtual ~OpHandleBase(); + + std::string DebugString() const; + + virtual Priority GetPriority() const { return kNormal; } + + virtual std::string Name() const = 0; + + void Run(bool use_cuda); + + virtual void RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx); + + void AddInput(VarHandleBase *in); + + void AddOutput(VarHandleBase *out); + + // This method adds the wait events of all the input on all the device + // context. + // NODE: This Wait is asynchronous operation. + virtual void WaitInputVarGenerated(); + + // This method adds the wait events of all the input on the specified device + // context. + // NODE: This Wait is asynchronous operation. + virtual void WaitInputVarGenerated(const platform::Place &place); + + virtual bool NeedWait(VarHandleBase *in_var); + + // If the Op involves data transfer of multiple devices that + // will likely block other computations. + virtual bool IsMultiDeviceTransfer() { return false; } + + const platform::DeviceContext *DeviceContext(platform::Place place) { + auto it = dev_ctxes_.find(place); + return it != dev_ctxes_.end() ? it->second : nullptr; + } + const std::map &DeviceContext() { + return dev_ctxes_; + } + + void SetDeviceContext(platform::Place place, platform::DeviceContext *ctx_) { + dev_ctxes_[place] = ctx_; + } + + const std::vector &Inputs() const { return inputs_; } + + size_t NoDupInputSize() const { + std::unordered_set res; + for (auto *var : inputs_) { + res.emplace(var); + } + return res.size(); + } + + size_t NotReadyInputSize() const; + + const std::vector &Outputs() const { return outputs_; } + + size_t NoDummyInputSize() const; + + ir::Node *Node() { return node_; } + + void SetLocalExecScopes( + const std::unordered_map &scope_map); + + protected: + virtual std::vector GetLocalScopes() = 0; + + void RunAndRecordEvent(const std::function &callback); + + void RunAndRecordEvent(platform::Place p, + const std::function &callback); + + virtual void RunImpl() = 0; + + virtual void InitCUDA(); + + ir::Node *node_; + std::vector inputs_; + std::vector outputs_; + std::map dev_ctxes_; + + std::vector local_exec_scopes_; + +#ifdef PADDLE_WITH_CUDA + std::unordered_map events_; +#endif + + DISABLE_COPY_AND_ASSIGN(OpHandleBase); +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h new file mode 100644 index 00000000..0f03ca51 --- /dev/null +++ b/paddle/fluid/framework/details/op_registry.h @@ -0,0 +1,245 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/grad_op_desc_maker.h" +#include "paddle/fluid/framework/inplace_op_inference.h" +#include "paddle/fluid/framework/no_need_buffer_vars_inference.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/var_type_inference.h" + +namespace paddle { +namespace framework { +namespace details { + +enum OpInfoFillType { + kOperator = 0, + kOpProtoAndCheckerMaker = 1, + kGradOpDescMaker = 2, + kVarTypeInference = 3, + kShapeInference = 4, + kInplaceOpInference = 5, + kNoNeedBufferVarsInference = 6, + kUnknown = -1 +}; + +namespace internal { +template +struct TypePair { + using Type = T; + static constexpr OpInfoFillType kFillType = kType; +}; + +using OpRegistryClasses = std::tuple< // NOLINT + TypePair, // NOLINT + TypePair, // NOLINT + TypePair, // NOLINT + TypePair, // NOLINT + TypePair, // NOLINT + TypePair, // NOLINT + TypePair // NOLINT + >; + +static constexpr int kOpRegistryClassNumber = + std::tuple_size::value; + +template +struct IsMatchedBaseTypeImpl { + using PairType = typename std::tuple_element::type; + static constexpr bool kValue = + std::is_base_of::value; +}; + +template +struct IsMatchedBaseTypeImpl { + static constexpr bool kValue = false; +}; + +template +static inline constexpr bool IsMatchedBaseType() { + return IsMatchedBaseTypeImpl< + T, kPos, (kPos >= 0 && kPos < kOpRegistryClassNumber)>::kValue; +} + +template +struct OpInfoFillTypeGetterImpl {}; + +// This case should not happen +template +struct OpInfoFillTypeGetterImpl {}; + +template +struct OpInfoFillTypeGetterImpl { + static constexpr OpInfoFillType kType = kUnknown; +}; + +template +struct OpInfoFillTypeGetterImpl { + static constexpr OpInfoFillType kType = + OpInfoFillTypeGetterImpl()>::kType; +}; + +template +struct OpInfoFillTypeGetterImpl { + using PairType = typename std::tuple_element::type; + static constexpr OpInfoFillType kType = PairType::kFillType; +}; + +template +using OpInfoFillTypeGetter = + OpInfoFillTypeGetterImpl()>; + +} // namespace internal + +template +struct OpInfoFillTypeID { + static constexpr OpInfoFillType ID() { + return internal::OpInfoFillTypeGetter::kType; + } +}; + +template ::ID()> +struct OpInfoFiller; + +template +class OperatorRegistrarRecursive; + +template +class OperatorRegistrarRecursive { + public: + using T = typename std::tuple_element>::type; + OperatorRegistrarRecursive(const char* op_type, OpInfo* info) { + OpInfoFiller fill; + fill(op_type, info); + constexpr auto size = sizeof...(ARGS); + OperatorRegistrarRecursive reg(op_type, + info); + (void)(reg); + } +}; + +template +class OperatorRegistrarRecursive { + public: + OperatorRegistrarRecursive(const char* op_type, OpInfo* info) {} +}; + +template +struct OpInfoFiller { + void operator()(const char* op_type, OpInfo* info) const { + info->creator_ = [](const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, + const AttributeMap& attrs) { + return new T(type, inputs, outputs, attrs); + }; + } +}; + +template +struct OpInfoFiller { + void operator()(const char* op_type, OpInfo* info) const { + info->proto_ = new proto::OpProto; + info->checker_ = new OpAttrChecker(); + T maker; + maker(info->proto_, info->checker_); + info->proto_->set_type(op_type); + PADDLE_ENFORCE( + info->proto_->IsInitialized(), + "Fail to initialize %s's OpProto, because %s is not initialized", + op_type, info->proto_->InitializationErrorString()); + } +}; + +template +struct OpInfoFiller { + void operator()(const char* op_type, OpInfo* info) const { + info->grad_op_maker_ = []( + const OpDesc& fwd_op, + const std::unordered_set& no_grad_set, + std::unordered_map* grad_to_var, + const std::vector& grad_block) { + T maker(fwd_op, no_grad_set, grad_to_var, grad_block); + return maker(); + }; + + info->use_default_grad_op_desc_maker_ = + std::is_base_of, T>::value || + std::is_base_of, T>::value; + } +}; + +template +struct OpInfoFiller { + void operator()(const char* op_type, OpInfo* info) const { + info->infer_var_type_ = [](InferVarTypeContext* context) { + T inference; + inference(context); + }; + } +}; + +template +struct OpInfoFiller { + void operator()(const char* op_type, OpInfo* info) const { + info->infer_shape_ = [](InferShapeContext* ctx) { + T inference; + inference(ctx); + }; + } +}; + +template +struct OpInfoFiller { + void operator()(const char* op_type, OpInfo* info) const { + info->infer_inplace_ = [](const OpDesc& op_desc, bool use_cuda) { + T infer; + return infer(op_desc, use_cuda); + }; + } +}; + +template +struct OpInfoFiller { + void operator()(const char* op_type, OpInfo* info) const { + info->infer_no_need_buffer_vars_ = [](const VariableNameMap& inputs, + const VariableNameMap& outputs, + const AttributeMap& attrs) { + T infer(inputs, outputs, attrs); + return infer(); + }; + } +}; + +// A fake OpInfoFiller of void +template <> +struct OpInfoFiller { + void operator()(const char* op_type, OpInfo* info) const {} +}; + +} // namespace details + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc new file mode 100644 index 00000000..1a3c753e --- /dev/null +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -0,0 +1,181 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" +#include +#include +#include "paddle/fluid/framework/ir/graph_helper.h" + +namespace paddle { +namespace framework { +namespace details { + +std::vector> +ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(ir::Graph *graph) { + std::vector> graphs; + graphs.reserve(places_.size()); + for (size_t i = 0; i < places_.size(); ++i) { + ProgramDesc empty; + graphs.emplace_back(std::unique_ptr(new ir::Graph(empty))); + auto &g = graphs.back(); + g->Set(kGraphVars, new GraphVars(1UL)); + g->Set(kGraphDepVars, new GraphDepVars); + auto &stale_ops = + graph->Get>(details::kStaleProgramOpDescs); + g->Erase(details::kStaleProgramOpDescs); + g->Set>(details::kStaleProgramOpDescs, + new std::vector(stale_ops)); + } + auto op_handles = ir::FilterByNodeWrapper(*graph); + + for (auto &op : op_handles) { + auto &dev_ctx = op->DeviceContext(); + auto &p = dev_ctx.begin()->first; + int dev_id = boost::get(p).device; + auto &dev_dummys = graphs[dev_id]->Get(kGraphDepVars); + graphs[dev_id]->AddNode(graph->RemoveNode(op->Node()).release()); + + for (auto &var : op->Inputs()) { + auto dummy_ptr = dynamic_cast(var); + if (dummy_ptr) { + dev_dummys.insert(var); + if (graph->Nodes().count(var->Node())) + graphs[dev_id]->AddNode(graph->RemoveNode(var->Node()).release()); + } + } + for (auto &var : op->Outputs()) { + auto dummy_ptr = dynamic_cast(var); + if (dummy_ptr) { + dev_dummys.insert(var); + if (graph->Nodes().count(var->Node())) + graphs[dev_id]->AddNode(graph->RemoveNode(var->Node()).release()); + } + } + } + + for (size_t dev_id = 0; dev_id < places_.size(); ++dev_id) { + auto &dev_vars = graphs[dev_id]->Get(kGraphVars)[0]; + auto &origin_vars = graph->Get(kGraphVars)[dev_id]; + for (auto &name_pair : origin_vars) { + dev_vars.emplace(name_pair.first, name_pair.second); + for (auto &version_pair : name_pair.second) { + if (graph->Nodes().count(version_pair->Node())) { + graphs[dev_id]->AddNode( + graph->RemoveNode(version_pair->Node()).release()); + } + } + } + } + + return graphs; +} + +ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( + const ExecutionStrategy &strategy, const std::vector &local_scopes, + const std::vector &local_exec_scopes, + const std::vector &places, ir::Graph *graph) + : strategy_(std::move(strategy)), + local_scopes_(std::move(local_scopes)), + pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr), + places_(std::move(places)), + // TODO(Yancey1989): Copying graphs is not safely since it deleted the + // attrs. + graphs_(SeparateMultiDevicesGraph(graph)) { + PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); + + auto seq_allreduce_pass = + ir::PassRegistry::Instance().Get("all_reduce_deps_pass"); + seq_allreduce_pass->Set(kUseHierarchicalAllReduce, new bool(false)); + for (size_t i = 0; i < graphs_.size(); ++i) { + graphs_[i].reset(seq_allreduce_pass->Apply(graphs_[i].release())); + } + + // set the correct size of thread pool to each device. + strategy_.num_threads_ = strategy_.num_threads_ < places_.size() + ? 1UL + : strategy_.num_threads_ / places_.size(); + VLOG(1) << "set num_threads: " << strategy_.num_threads_ + << " to run the operators of the graph on each device."; + for (size_t i = 0; i < places.size(); ++i) { + executors_.emplace_back(new details::FastThreadedSSAGraphExecutor( + strategy_, local_scopes_, local_exec_scopes, {places_[i]}, + graphs_.at(i).get())); + } +} + +std::vector ParallelSSAGraphExecutor::Graphs() { + std::vector result; + result.reserve(graphs_.size()); + for (auto &g : graphs_) { + result.emplace_back(g.get()); + } + return result; +} + +FeedFetchList ParallelSSAGraphExecutor::Run( + const std::vector &fetch_tensors) { + std::vector> run_futures; + + std::vector fetch_data; + FeedFetchList ret; + + fetch_data.reserve(places_.size()); + ret.reserve(fetch_tensors.size()); + exception_holder_.Clear(); + + for (size_t i = 0; i < places_.size(); ++i) { + auto call = [this, i, &fetch_tensors]() -> FeedFetchList { + try { + return executors_[i]->Run(fetch_tensors); + } catch (...) { + exception_holder_.Catch(std::current_exception()); + } + return FeedFetchList(); + }; + + if (pool_) { + run_futures.emplace_back(pool_->enqueue(std::move(call))); + } else { + fetch_data.emplace_back(call()); + } + } + + if (pool_) { + for (auto &f : run_futures) { + if (exception_holder_.IsCaught()) { + f.wait(); + } else { + fetch_data.emplace_back(f.get()); + } + } + } + if (exception_holder_.IsCaught()) { + exception_holder_.ReThrow(); + } + + for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) { + std::vector lodtensor_ptrs; + lodtensor_ptrs.reserve(local_scopes_.size()); + for (size_t scope_idx = 0; scope_idx < local_scopes_.size(); ++scope_idx) { + lodtensor_ptrs.push_back(&fetch_data.at(scope_idx).at(fetch_idx)); + } + ret.emplace_back(); + ret.back().MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace()); + } + return ret; +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h new file mode 100644 index 00000000..6889c54d --- /dev/null +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h @@ -0,0 +1,61 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "ThreadPool.h" +#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace framework { +namespace details { + +class ParallelSSAGraphExecutor : public SSAGraphExecutor { + public: + ParallelSSAGraphExecutor(const ExecutionStrategy &strategy, + const std::vector &local_scopes, + const std::vector &local_exec_scopes, + const std::vector &places, + ir::Graph *graph); + ~ParallelSSAGraphExecutor() final = default; + + const ir::Graph &Graph() const override { return *graphs_[0]; } + + std::vector Graphs(); + + FeedFetchList Run(const std::vector &fetch_tensors) override; + + private: + std::vector> SeparateMultiDevicesGraph( + ir::Graph *graph); + + ExecutionStrategy strategy_; + std::vector local_scopes_; + std::unique_ptr<::ThreadPool> pool_{nullptr}; + std::vector places_; + std::vector> graphs_; + + std::vector> + executors_; + ExceptionHolder exception_holder_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h new file mode 100644 index 00000000..0de8e436 --- /dev/null +++ b/paddle/fluid/framework/details/reduce_and_gather.h @@ -0,0 +1,122 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "paddle/fluid/framework/details/reduce_and_gather.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/selected_rows.h" +namespace paddle { +namespace framework { +namespace details { + +struct ReduceLoDTensor { + const std::vector &src_tensors_; + LoDTensor &dst_tensor_; + + ReduceLoDTensor(const std::vector &src, LoDTensor *dst) + : src_tensors_(src), dst_tensor_(*dst) {} + + template + void apply() const { + PADDLE_ENFORCE(!src_tensors_.empty()); + auto &t0 = *src_tensors_[0]; + PADDLE_ENFORCE_NE(t0.numel(), 0); + + dst_tensor_.Resize(t0.dims()); + T *dst = dst_tensor_.mutable_data(platform::CPUPlace()); + + for (size_t i = 0; i < src_tensors_.size(); ++i) { + auto &t = *src_tensors_[i]; + if (dst == t.data()) { + continue; + } + + PADDLE_ENFORCE_EQ(t.dims(), t0.dims()); + PADDLE_ENFORCE_EQ(t.type(), t0.type()); + std::transform(t.data(), t.data() + t.numel(), dst, dst, + [](T a, T b) -> T { return a + b; }); + } + } +}; + +struct ReduceBufferData { + const std::vector &src_data_; + void *dst_data_; + int64_t numel_; + + ReduceBufferData(const std::vector &src, void *dst, + int64_t numel) + : src_data_(src), dst_data_(dst), numel_(numel) {} + + template + void apply() const { + T *dst_data = reinterpret_cast(dst_data_); + for (size_t i = 0; i < src_data_.size(); ++i) { + auto srd_data = reinterpret_cast(src_data_[i]); + VLOG(10) << "dst: " << dst_data_ << ", " << srd_data; + if (srd_data == dst_data_) { + continue; + } + + std::transform(srd_data, srd_data + numel_, dst_data, dst_data, + [](T a, T b) -> T { return a + b; }); + } + } +}; + +inline void GatherLocalSelectedRows( + const std::vector &src_selecte_rows_, + const std::vector &in_places, + const std::map &dev_ctxes, + const platform::Place &out_place, SelectedRows *dst_selecte_rows) { + PADDLE_ENFORCE(!src_selecte_rows_.empty()); + + std::vector in_tensors; + std::vector out_rows; + + for (auto in_sr_ptr : src_selecte_rows_) { + auto &in_sr = *in_sr_ptr; + in_tensors.emplace_back(in_sr.value()); + out_rows.insert(out_rows.end(), in_sr.rows().begin(), in_sr.rows().end()); + } + + auto &pre_in = src_selecte_rows_[0]; + + auto &dst_tensor = *dst_selecte_rows; + dst_tensor.set_height(pre_in->height()); + dst_tensor.set_rows(out_rows); + size_t rows = out_rows.size(); + DDim out_dim = pre_in->GetCompleteDims(); + out_dim[0] = static_cast(rows); + dst_tensor.mutable_value()->Resize(out_dim); + dst_tensor.mutable_value()->mutable_data(out_place, pre_in->value().type()); + Tensor *out_tensor = dst_tensor.mutable_value(); + + // copy + int s = 0, e = 0; + for (size_t j = 0; j < in_tensors.size(); ++j) { + e += in_tensors[j].dims()[0]; + auto sub_out = out_tensor->Slice(s, e); + paddle::framework::TensorCopy(in_tensors[j], out_place, + *(dev_ctxes.at(in_places[j])), &sub_out); + s = e; + } +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc new file mode 100644 index 00000000..26153b7d --- /dev/null +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -0,0 +1,330 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/reduce_op_handle.h" +#include +#include "paddle/fluid/framework/details/container_cast.h" +#include "paddle/fluid/framework/details/reduce_and_gather.h" +#include "paddle/fluid/framework/details/variable_visitor.h" +#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE +#include "paddle/fluid/operators/distributed/collective_client.h" +#include "paddle/fluid/operators/distributed/collective_server.h" +#include "paddle/fluid/operators/distributed/request_handler.h" +#endif +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/platform/profiler.h" + +DEFINE_bool( + cpu_deterministic, false, + "Whether to make the result of computation deterministic in CPU side."); + +namespace paddle { +namespace framework { +namespace details { + +std::once_flag CollectiveContext::init_flag_; +std::unique_ptr CollectiveContext::context_; + +static inline std::string GetRemoteVarName(const std::string &var_name, + int trainer_id) { + return string::Sprintf("%s_merged_tmp@trainer_%d", var_name, trainer_id); +} + +void ReduceOpHandle::Wait( + const std::map &dev_ctxes) { + // TODO(gongwb): use event wait? + for (auto &dev_ctx : dev_ctxes) { + dev_ctx.second->Wait(); + } +} + +#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE +template +void ReduceOpHandle::GatherSelectedRows( + const std::vector &src_selected_rows, + const std::vector &in_places, + const std::map &dev_ctxes, + VarHandle *out_var_handle, const platform::Place &out_place, + SelectedRows *dst_selected_rows) { + const CollectiveContext &collective_context = + *CollectiveContext::GetInstance(); + + // 1. gather local selected rows, merge them + std::string gathered_var_name = out_var_handle->name() + "_gathered_tmp"; + auto scope = local_scopes_.at(out_var_handle->scope_idx()); + auto gathered_var_mid = scope->Var(gathered_var_name); + auto gathered_select_rows = + gathered_var_mid->GetMutable(); + GatherLocalSelectedRows(src_selected_rows, in_places, dev_ctxes, out_place, + gathered_select_rows); + // FIXME(gongwb): remove this Wait. + Wait(dev_ctxes); + + // merge them + auto merged_dev_ctx = dynamic_cast(dev_ctxes.at(out_place)); + std::string merged_var_name = + GetRemoteVarName(out_var_handle->name(), collective_context.trainer_id_); + auto merged_select_rows = + scope->Var(merged_var_name)->GetMutable(); + operators::math::scatter::MergeAdd merge_func; + merge_func(*merged_dev_ctx, *gathered_select_rows, merged_select_rows); + + // 2. start collective server if it doesn't exist + operators::distributed::CollectiveServer *server = + operators::distributed::CollectiveServer::GetInstance( + collective_context.endpoints_[collective_context.trainer_id_], + collective_context.endpoints_.size() - 1); + + auto rpc_server = server->GetRPCServer(); + rpc_server->RegisterVar(merged_var_name, + operators::distributed::kRequestGetMonomerVariable, + scope, merged_dev_ctx); + + // 3. gather them from all remote nodes. + std::vector remote; + operators::distributed::CollectiveClient *client = + operators::distributed::CollectiveClient::GetInstance(); + + std::vector vars; + for (unsigned int i = 0; i < collective_context.endpoints_.size(); i++) { + if (i == (unsigned)collective_context.trainer_id_) continue; + + operators::distributed::RemoteVar var; + var.trainer_id_ = i; + var.var_name_ = GetRemoteVarName(out_var_handle->name(), i); + var.ep_ = collective_context.endpoints_[i]; + + vars.push_back(var); + VLOG(4) << "gather from:" << var.String(); + } + + // erase gathered vars + merged_dev_ctx->Wait(); + scope->EraseVars(std::vector{gathered_var_name}); + + PADDLE_ENFORCE(client->Gather(vars, &remote, *merged_dev_ctx, scope)); + PADDLE_ENFORCE(remote.size() == vars.size()); + + // 4. merged local selected rows. + std::vector all; + all.resize(collective_context.endpoints_.size()); + for (auto v : vars) { + all[v.trainer_id_] = + scope->FindVar(v.var_name_)->GetMutable(); + } + all[collective_context.trainer_id_] = merged_select_rows; + + merge_func(*merged_dev_ctx, all, dst_selected_rows); + + rpc_server->WaitVarBarrier(merged_var_name); + rpc_server->ClearVar(merged_var_name); + + // 5. clear mid vars + std::vector tmp_vars{merged_var_name}; + for (auto r : vars) { + tmp_vars.push_back(r.var_name_); + } + scope->EraseVars(tmp_vars); +} +#endif + +void ReduceOpHandle::RunImpl() { + platform::RecordEvent record_event(Name()); + + if (places_.size() == 1) return; + // the input and output may have dummy var. + auto in_var_handles = DynamicCast(inputs_); + + PADDLE_ENFORCE_EQ( + in_var_handles.size(), places_.size(), + "The number of output should equal to the number of places."); + + VarHandle *out_var_handle; + { + auto out_var_handles = DynamicCast(outputs_); + + PADDLE_ENFORCE_EQ(out_var_handles.size(), 1UL, + "The number of output should be one."); + out_var_handle = out_var_handles.front(); + } + + auto in_0_handle = in_var_handles[0]; + + auto &var_scopes = local_exec_scopes_; + + auto pre_in_var = + var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name()); + PADDLE_ENFORCE_NOT_NULL(pre_in_var); + + // Wait input done, this Wait is asynchronous operation + WaitInputVarGenerated(); + + // NOTE: The Places of all input tensor must be all on CPU or all on GPU. + std::vector in_places; // used to get dev_ctx + for (auto *in_handle : in_var_handles) { + in_places.emplace_back(in_handle->place()); + auto in_var = + var_scopes.at(in_handle->scope_idx())->FindVar(in_handle->name()); + PADDLE_ENFORCE_NOT_NULL(in_var); + VariableVisitor::EnforceShapeAndDTypeEQ(*pre_in_var, *in_var); + } + + auto out_var = var_scopes.at(out_var_handle->scope_idx()) + ->FindVar(out_var_handle->name()); + PADDLE_ENFORCE_NOT_NULL(out_var); + + // NOTE: The tensors' Place of input and output must be all on GPU or all on + // CPU. + auto in_p = VariableVisitor::GetMutableTensor(pre_in_var).place(); + platform::Place t_out_p; + if (platform::is_gpu_place(in_p)) { + PADDLE_ENFORCE(platform::is_gpu_place(out_var_handle->place()), + "Places of input and output must be all on GPU."); + t_out_p = out_var_handle->place(); + } else { + t_out_p = platform::CPUPlace(); + } + + if (pre_in_var->IsType()) { + this->RunAndRecordEvent([&] { + std::vector in_selected_rows = + GetInputValues(in_var_handles, var_scopes); + + const CollectiveContext &collective_context = + *CollectiveContext::GetInstance(); + VLOG(10) << "GatherSelectedRows CollectiveContext:" + << collective_context.String(); + + // TODO(gongwb): add cpu support + if (collective_context.endpoints_.size() <= 1 || + is_cpu_place(in_places[0]) || is_cpu_place(t_out_p)) { + GatherLocalSelectedRows(in_selected_rows, in_places, dev_ctxes_, + t_out_p, + out_var->GetMutable()); + return; + } + +#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE + if (in_selected_rows[0]->value().type() == + framework::proto::VarType::FP32) { + GatherSelectedRows( + in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p, + out_var->GetMutable()); + } else if (in_selected_rows[0]->value().type() == + framework::proto::VarType::FP64) { + GatherSelectedRows( + in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p, + out_var->GetMutable()); + } else { + PADDLE_THROW("only support double or float when gather SelectedRows"); + } +#endif + }); + } else { + std::vector lod_tensors = + GetInputValues(in_var_handles, var_scopes); + + if (paddle::platform::is_cpu_place(lod_tensors[0]->place())) { + this->RunAndRecordEvent([&] { + // FIXME(zcd): The order of summing is important, + // especially when the type of data is float or double. + // For example, the result of `a+b+c+d` may be different + // with the result of `c+a+b+d`, so the summing order should be fixed. + if (!FLAGS_cpu_deterministic) { + ReduceLoDTensor func(lod_tensors, + out_var->GetMutable()); + VisitDataType(lod_tensors[0]->type(), func); + } else { + // We sum lod_tensors to reduce_sum_trg which is in local_scopes_0 + // here, but it doesn't mean reduce_sum_trg must be in local_scopes_0. + auto &reduce_sum_trg = *this->local_exec_scopes_[0] + ->FindVar(out_var_handle->name()) + ->GetMutable(); + ReduceLoDTensor func(lod_tensors, &reduce_sum_trg); + VisitDataType(lod_tensors[0]->type(), func); + + auto trg = out_var->GetMutable(); + if (reduce_sum_trg.data() != trg->data()) { + TensorCopy(reduce_sum_trg, platform::CPUPlace(), trg); + } + } + }); + } else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + auto pre_in = pre_in_var->Get(); + VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var); + VariableVisitor::GetMutableTensor(out_var).mutable_data( + out_var_handle->place(), pre_in.type()); + + auto out_p = out_var_handle->place(); + int root_id = boost::get(out_p).device; + std::vector> all_reduce_calls; + for (size_t i = 0; i < var_scopes.size(); ++i) { + auto &p = in_places[i]; + auto &lod_tensor = *lod_tensors[i]; + + int dev_id = boost::get(p).device; + auto &nccl_ctx = nccl_ctxs_->at(dev_id); + + void *buffer = const_cast(lod_tensor.data()); + void *recvbuffer = nullptr; + if (root_id == dev_id) { + recvbuffer = + out_var->GetMutable()->mutable_data( + out_var_handle->place()); + } + + int type = platform::ToNCCLDataType(lod_tensor.type()); + size_t numel = static_cast(lod_tensor.numel()); + all_reduce_calls.emplace_back( + [buffer, recvbuffer, type, numel, root_id, &nccl_ctx] { + PADDLE_ENFORCE(platform::dynload::ncclReduce( + buffer, recvbuffer, numel, static_cast(type), + ncclSum, root_id, nccl_ctx.comm_, nccl_ctx.stream())); + }); + } + + this->RunAndRecordEvent([&] { + platform::NCCLGroupGuard guard; + for (auto &call : all_reduce_calls) { + call(); + } + }); +#else + PADDLE_THROW("CUDA is not enabled."); +#endif + } else { + PADDLE_THROW("Place should be CPUPlace or CUDAPlace."); + } + } +} + +template +std::vector ReduceOpHandle::GetInputValues( + const std::vector &in_var_handles, + const std::vector &var_scopes) const { + std::vector in_selected_rows; + for (auto *in_handle : in_var_handles) { + auto &in_sr = var_scopes.at(in_handle->scope_idx()) + ->FindVar(in_handle->name()) + ->Get(); + in_selected_rows.emplace_back(&in_sr); + } + return in_selected_rows; +} + +std::string ReduceOpHandle::Name() const { return "reduce"; } +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h new file mode 100644 index 00000000..15064a10 --- /dev/null +++ b/paddle/fluid/framework/details/reduce_op_handle.h @@ -0,0 +1,117 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include + +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/device_context.h" +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace framework { +namespace details { +struct CollectiveContext { + std::vector endpoints_; + int trainer_id_{0}; + + std::string String() const { + std::stringstream ss; + ss << "endpoints_:"; + for (auto e : endpoints_) { + ss << e << ","; + } + + ss << "trainer_id_:" << trainer_id_; + + return ss.str(); + } + + static CollectiveContext *GetInstance() { + std::call_once(init_flag_, + [&]() { context_.reset(new CollectiveContext()); }); + return context_.get(); + } + + private: + static std::once_flag init_flag_; + static std::unique_ptr context_; +}; + +struct ReduceOpHandle : public OpHandleBase { + std::vector local_scopes_; + std::vector places_; + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + const platform::NCCLContextMap *nccl_ctxs_; + ReduceOpHandle(ir::Node *node, const std::vector &local_scopes, + const std::vector &places, + const platform::NCCLContextMap *nccl_ctxs) + : OpHandleBase(node), + local_scopes_(local_scopes), + places_(places), + nccl_ctxs_(nccl_ctxs) { + if (nccl_ctxs_) { + for (auto &p_ctx : nccl_ctxs_->contexts_) { + this->SetDeviceContext(platform::CUDAPlace(p_ctx.first), + p_ctx.second.ctx_.get()); + } + } + } +#else + ReduceOpHandle(ir::Node *node, const std::vector &local_scopes, + const std::vector &places) + : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {} +#endif + + std::string Name() const override; + + bool IsMultiDeviceTransfer() override { return true; }; + + protected: + void RunImpl() override; + + std::vector GetLocalScopes() override { return local_scopes_; } + +#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE + template + void GatherSelectedRows( + const std::vector &src_selecte_rows_, + const std::vector &in_places, + const std::map &dev_ctxes, + VarHandle *out_var_handle, const platform::Place &out_place, + SelectedRows *dst_selecte_rows); +#endif + + void Wait( + const std::map &dev_ctxes); + + template + std::vector GetInputValues( + const std::vector &in_var_handles, + const std::vector &var_scopes) const; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc new file mode 100644 index 00000000..664bd00f --- /dev/null +++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc @@ -0,0 +1,294 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/reduce_op_handle.h" +#include +#include "gtest/gtest.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { +namespace details { +namespace f = paddle::framework; +namespace p = paddle::platform; + +// test data amount +const f::DDim kDims = {20, 20}; + +struct TestReduceOpHandle { + bool use_gpu_; + Scope g_scope_; + std::vector local_scopes_; + std::vector param_scopes_; + OpHandleBase *op_handle_; + std::vector vars_; + std::vector gpu_list_; + std::vector> ctxs_; + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + std::unique_ptr nccl_ctxs_; +#endif + + void WaitAll() { + for (size_t j = 0; j < ctxs_.size(); ++j) { + ctxs_[j]->Wait(); + } +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + if (nccl_ctxs_) { + nccl_ctxs_->WaitAll(); + } +#endif + } + + void InitCtxOnGpu(bool use_gpu) { + use_gpu_ = use_gpu; + if (use_gpu) { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + int count = p::GetCUDADeviceCount(); + if (count <= 1) { + LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA " + "device count is " + << count; + exit(0); + } + for (int i = 0; i < count; ++i) { + auto p = p::CUDAPlace(i); + gpu_list_.push_back(p); + ctxs_.emplace_back(new p::CUDADeviceContext(p)); + } + nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_)); +#else + PADDLE_THROW("CUDA is not support."); +#endif + } else { + int count = 8; + for (int i = 0; i < count; ++i) { + auto p = p::CPUPlace(); + gpu_list_.push_back(p); + ctxs_.emplace_back(new p::CPUDeviceContext(p)); + } +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + nccl_ctxs_.reset(nullptr); +#endif + } + } + + void InitReduceOp(size_t out_scope_idx) { + std::vector> nodes; + // init scope + std::unordered_map scope_map; + for (size_t j = 0; j < gpu_list_.size(); ++j) { + local_scopes_.push_back(&(g_scope_.NewScope())); + Scope &local_scope = local_scopes_.back()->NewScope(); + local_scope.Var("input"); + param_scopes_.emplace_back(&local_scope); + scope_map.emplace(local_scopes_.back(), param_scopes_.back()); + } + param_scopes_[out_scope_idx]->Var("out"); + + nodes.emplace_back(new ir::Node("node")); + if (use_gpu_) { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_, + gpu_list_, nccl_ctxs_.get())); +#else + PADDLE_THROW("CUDA is not support."); +#endif + } else { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_, + gpu_list_, nccl_ctxs_.get())); +#else + op_handle_.reset( + new ReduceOpHandle(nodes.back().get(), local_scopes_, gpu_list_)); +#endif + } + + op_handle_->SetLocalExecScopes(scope_map); + + // init op handle + // add input + for (size_t j = 0; j < gpu_list_.size(); ++j) { + if (!use_gpu_) { + op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get()); + } + nodes.emplace_back(new ir::Node("node1")); + auto *in_var_handle = + new VarHandle(nodes.back().get(), 1, j, "input", gpu_list_[j]); + in_var_handle->ClearGeneratedOp(); + vars_.emplace_back(in_var_handle); + op_handle_->AddInput(in_var_handle); + } + + // add dummy var + vars_.emplace_back(new DummyVarHandle()); + DummyVarHandle *in_dummy_var_handle = + static_cast(vars_.back().get()); + in_dummy_var_handle->ClearGeneratedOp(); + op_handle_->AddInput(in_dummy_var_handle); + + // add output + nodes.emplace_back(new ir::Node("node2")); + auto *out_var_handle = new VarHandle(nodes.back().get(), 2, out_scope_idx, + "out", gpu_list_[out_scope_idx]); + vars_.emplace_back(out_var_handle); + op_handle_->AddOutput(out_var_handle); + + // add dummy var + vars_.emplace_back(new DummyVarHandle()); + DummyVarHandle *dummy_var_handle = + static_cast(vars_.back().get()); + op_handle_->AddOutput(dummy_var_handle); + } + + void TestReduceSelectedRows(size_t output_scope_idx) { + int height = kDims[0] * 2; + std::vector rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1, + 2, 4, 6, 3, 1, 1, 1, 1, 3, 7}; + std::vector send_vector(f::product(kDims)); + for (size_t k = 0; k < send_vector.size(); ++k) { + send_vector[k] = k; + } + + for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size(); + ++input_scope_idx) { + auto in_var = param_scopes_[input_scope_idx]->FindVar("input"); + PADDLE_ENFORCE_NOT_NULL(in_var); + auto in_selected_rows = in_var->GetMutable(); + auto value = in_selected_rows->mutable_value(); + value->mutable_data(kDims, gpu_list_[input_scope_idx]); + + in_selected_rows->set_height(height); + in_selected_rows->set_rows(rows); + + paddle::framework::TensorFromVector( + send_vector, *(ctxs_[input_scope_idx]), value); + value->Resize(kDims); + } + + auto out_var = param_scopes_[output_scope_idx]->FindVar("out"); + PADDLE_ENFORCE_NOT_NULL(out_var); + auto out_selected_rows = out_var->GetMutable(); + + auto in_var = param_scopes_[output_scope_idx]->FindVar("input"); + auto in_selected_rows = in_var->GetMutable(); + + out_selected_rows->mutable_value()->ShareDataWith( + in_selected_rows->value()); + + op_handle_->Run(false); + + WaitAll(); + + p::CPUPlace cpu_place; + + auto &out_select_rows = out_var->Get(); + auto rt = out_select_rows.value(); + + PADDLE_ENFORCE_EQ(out_select_rows.height(), height, "height is not equal."); + for (size_t k = 0; k < out_select_rows.rows().size(); ++k) { + PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k % rows.size()]); + } + + f::Tensor result_tensor; + f::TensorCopySync(rt, cpu_place, &result_tensor); + float *ct = result_tensor.data(); + + for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) { + ASSERT_NEAR(ct[j], send_vector[j % send_vector.size()], 1e-5); + } + } + + void TestReduceLodTensors(size_t output_scope_idx) { + std::vector send_vector(static_cast(f::product(kDims))); + for (size_t k = 0; k < send_vector.size(); ++k) { + send_vector[k] = k; + } + f::LoD lod{{0, 10, 20}}; + + for (size_t input_scope_idx = 0; input_scope_idx < gpu_list_.size(); + ++input_scope_idx) { + auto in_var = param_scopes_[input_scope_idx]->FindVar("input"); + PADDLE_ENFORCE_NOT_NULL(in_var); + auto in_lod_tensor = in_var->GetMutable(); + in_lod_tensor->mutable_data(kDims, gpu_list_[input_scope_idx]); + in_lod_tensor->set_lod(lod); + + paddle::framework::TensorFromVector( + send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor); + } + + auto out_var = param_scopes_[output_scope_idx]->FindVar("out"); + PADDLE_ENFORCE_NOT_NULL(out_var); + auto out_lodtensor = out_var->GetMutable(); + + auto in_var = param_scopes_[output_scope_idx]->FindVar("input"); + auto in_lodtensor = in_var->Get(); + + out_lodtensor->ShareDataWith(in_lodtensor); + + op_handle_->Run(false); + + WaitAll(); + + p::CPUPlace cpu_place; + + auto &rt = out_var->Get(); + + f::Tensor result_tensor; + f::TensorCopySync(rt, cpu_place, &result_tensor); + float *ct = result_tensor.data(); + + for (int64_t j = 0; j < f::product(result_tensor.dims()); ++j) { + ASSERT_NEAR(ct[j], send_vector[j] * gpu_list_.size(), 1e-5); + } + } +}; + +TEST(ReduceTester, TestCPUReduceTestSelectedRows) { + TestReduceOpHandle test_op; + size_t out_scope_idx = 0; + test_op.InitCtxOnGpu(false); + test_op.InitReduceOp(out_scope_idx); + test_op.TestReduceSelectedRows(out_scope_idx); +} +TEST(ReduceTester, TestCPUReduceTestLodTensor) { + TestReduceOpHandle test_op; + size_t out_scope_idx = 0; + test_op.InitCtxOnGpu(false); + test_op.InitReduceOp(out_scope_idx); + test_op.TestReduceLodTensors(out_scope_idx); +} +#ifdef PADDLE_WITH_CUDA + +TEST(ReduceTester, TestGPUReduceTestSelectedRows) { + TestReduceOpHandle test_op; + size_t out_scope_idx = 0; + test_op.InitCtxOnGpu(true); + test_op.InitReduceOp(out_scope_idx); + test_op.TestReduceSelectedRows(out_scope_idx); +} + +TEST(ReduceTester, TestGPUReduceTestLodTensor) { + TestReduceOpHandle test_op; + size_t out_scope_idx = 0; + test_op.InitCtxOnGpu(true); + test_op.InitReduceOp(out_scope_idx); + test_op.TestReduceLodTensors(out_scope_idx); +} +#endif + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/rpc_op_handle.cc b/paddle/fluid/framework/details/rpc_op_handle.cc new file mode 100644 index 00000000..8d61a103 --- /dev/null +++ b/paddle/fluid/framework/details/rpc_op_handle.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/rpc_op_handle.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace framework { +namespace details { + +RPCOpHandle::RPCOpHandle(ir::Node *node, const framework::OpDesc &op_desc, + Scope *local_scope, const std::string &name, + const platform::Place &place) + : OpHandleBase(node), + op_(framework::OpRegistry::CreateOp(op_desc)), + local_scope_(local_scope), + name_(name), + place_(place) {} + +void RPCOpHandle::RunImpl() { + platform::RecordEvent record_event(Name()); + + for (auto *in : inputs_) { + auto &p = static_cast(in)->place(); + if (ir::IsControlDepVar(*in->Node())) { + continue; + } + if (in->GeneratedOp()) { + in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_.at(p)); + } + } + this->RunAndRecordEvent([this] { op_->Run(*local_exec_scopes_[0], place_); }); +} + +std::string RPCOpHandle::Name() const { return name_; } +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/rpc_op_handle.h b/paddle/fluid/framework/details/rpc_op_handle.h new file mode 100644 index 00000000..d86d33dd --- /dev/null +++ b/paddle/fluid/framework/details/rpc_op_handle.h @@ -0,0 +1,56 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { +namespace details { + +struct RPCOpHandle : public OpHandleBase { + RPCOpHandle(ir::Node* node, const framework::OpDesc& op_desc, + Scope* local_scope, const std::string& name, + const platform::Place& place); + + std::string Name() const override; + + // Delay and buffer nccl_all_reduce together can significantly increase + // performance. Disable this feature by returning false. + bool IsMultiDeviceTransfer() override { return false; }; + + protected: + void RunImpl() override; + + std::vector GetLocalScopes() override { return {local_scope_}; } + + private: + std::unique_ptr op_; + Scope* local_scope_; + const std::string name_; + platform::Place place_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc new file mode 100644 index 00000000..7ab21609 --- /dev/null +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -0,0 +1,91 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" +#include +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace framework { +namespace details { +ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, + Scope *scope, + platform::Place place, + platform::DeviceContext *dev_ctx, + proto::VarType::Type dtype) + : OpHandleBase(node), + coeff_(static_cast(1.0 / num_dev)), + scope_(scope), + place_(place), + out_dtype_(dtype) { + this->SetDeviceContext(place_, dev_ctx); +} + +ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {} + +struct ScaleLossGradFunctor { + float coeff_; + Tensor *out_; + platform::Place place_; + OpHandleBase *op_handle_; + proto::VarType::Type out_dtype_; + platform::DeviceContext *ctx_; + + ScaleLossGradFunctor(float coeff, Tensor *out, platform::Place place, + OpHandleBase *op_handle, proto::VarType::Type dtype, + platform::DeviceContext *ctx) + : coeff_(coeff), out_(out), place_(place), out_dtype_(dtype), ctx_(ctx) {} + + template + void apply() const { + auto *out_data = out_->mutable_data(place_); + if (platform::is_cpu_place(place_)) { + *out_data = static_cast(coeff_); + } else { +#ifdef PADDLE_WITH_CUDA + OutT cast_coeff = static_cast(coeff_); + auto stream = static_cast(ctx_)->stream(); + memory::Copy(boost::get(place_), out_data, + platform::CPUPlace(), &cast_coeff, SizeOfType(out_dtype_), + stream); + VLOG(10) << place_ << "RUN Scale loss grad op"; + +#endif + } + } +}; + +void ScaleLossGradOpHandle::RunImpl() { + platform::RecordEvent record_event(Name()); + // Doesn't wait any event + std::string var_name = static_cast(this->outputs_[0])->name(); + + auto *tensor = + local_exec_scopes_[0]->FindVar(var_name)->GetMutable(); + tensor->Resize(make_ddim({1})); + +#ifdef PADDLE_WITH_CUDA + ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_, + this->dev_ctxes_.at(place_)); + this->RunAndRecordEvent([&] { framework::VisitDataType(out_dtype_, func); }); +#else + ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_, nullptr); + framework::VisitDataType(out_dtype_, func); +#endif +} + +std::string ScaleLossGradOpHandle::Name() const { return "Scale LossGrad"; } +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h new file mode 100644 index 00000000..d4f28dbe --- /dev/null +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h @@ -0,0 +1,51 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { +namespace details { + +struct ScaleLossGradOpHandle : public OpHandleBase { + ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, Scope *scope, + platform::Place place, platform::DeviceContext *context, + proto::VarType::Type dtype); + + ~ScaleLossGradOpHandle() final; + + std::string Name() const override; + + protected: + void RunImpl() override; + + std::vector GetLocalScopes() override { return {scope_}; } + + private: + float coeff_; + Scope *scope_; + platform::Place place_; + proto::VarType::Type out_dtype_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc new file mode 100644 index 00000000..8459f3a4 --- /dev/null +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -0,0 +1,152 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" +#include +#include +#include +#include +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace framework { +namespace details { +ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( + ExecutionStrategy strategy, std::vector local_scopes, + std::vector local_exec_scopes, std::vector var_infos, + std::vector places, + std::unique_ptr &&underlying_executor) + : strategy_(std::move(strategy)), + underlying_executor_(std::move(underlying_executor)), + local_scopes_(std::move(local_scopes)), + local_exec_scopes_(std::move(local_exec_scopes)), + var_infos_(std::move(var_infos)), + places_(std::move(places)) { + PADDLE_ENFORCE_EQ(local_scopes_.size(), local_exec_scopes_.size()); + PrepareLocalExeScopes(); +} + +FeedFetchList ScopeBufferedSSAGraphExecutor::Run( + const std::vector &fetch_tensors) { + if (drop_scope_counter_ == 0) { + platform::RecordEvent e("InitLocalVars"); + InitVariables(); + } + + std::vector fetch_data; + std::exception_ptr eptr = nullptr; + try { + fetch_data = underlying_executor_->Run(fetch_tensors); + } catch (...) { + eptr = std::current_exception(); + } + + ++drop_scope_counter_; + if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { + DropLocalExeScopes(); + } + if (eptr) { + std::rethrow_exception(eptr); + } else { + return fetch_data; + } +} + +void ScopeBufferedSSAGraphExecutor::InitVariables() { + for (auto &info : tmp_var_infos_) { + for (auto &pair : info) { + InitializeVariable(pair.first, pair.second); + } + } + + const ir::Graph &graph = Graph(); + if (graph.Has(details::kProgramDescs)) { + auto &program_descs = + graph.Get(details::kProgramDescs); + // Init vars + auto &fused_grad_vars = graph.Get(details::kFusedVars); + for (size_t i = 0; i < local_exec_scopes_.size(); ++i) { + for (auto &var_name : fused_grad_vars) { + auto var = local_exec_scopes_[i]->Var(var_name); + var->GetMutable(); + } + } + + for (auto &program_desc : program_descs) { + for (auto &op_desc : program_desc.Block(0).AllOps()) { + for (size_t i = 0; i < local_exec_scopes_.size(); ++i) { + auto op = OpRegistry::CreateOp(*op_desc); + op->Run(*local_exec_scopes_[i], places_[i]); + } + } + } + } +} + +void ScopeBufferedSSAGraphExecutor::DropLocalExeScopes() { + platform::RecordEvent drop_scope_event("DropLocalExeScopes"); + drop_scope_counter_ = 0; + for (auto &p : places_) { + platform::DeviceContextPool::Instance().Get(p)->Wait(); + } + + for (size_t i = 0; i < local_exec_scopes_.size(); ++i) { + local_exec_scopes_[i]->EraseVarsExcept(preserve_vars_[i]); + local_exec_scopes_[i]->DropKids(); + for (auto &preserve_var : preserve_vars_[i]) { + preserve_var->Clear(); + } + VLOG(3) << "Drop local execution scope: " << local_scopes_[i]; + } +} + +void ScopeBufferedSSAGraphExecutor::PrepareLocalExeScopes() { + // Create local scopes. + preserve_vars_.resize(local_scopes_.size()); + tmp_var_infos_.resize(local_scopes_.size()); + + for (auto it = local_scopes_.rbegin(); it != local_scopes_.rend(); ++it) { + size_t idx = local_scopes_.size() - 1 - (it - local_scopes_.rbegin()); + auto *scope = local_scopes_[idx]; + auto *local_scope = local_exec_scopes_[idx]; + + for (auto &info : var_infos_) { + if (info.persistable_) { // Persistable + auto var = scope->FindVar(info.name_); + if (var != nullptr) { + VLOG(2) + << info.name_ + << " has been initialized beforehand in global scope, skipped"; + continue; + } + InitializeVariable(scope->Var(info.name_), info.type_); + } else { + Variable *tmp_var = local_scope->Var(info.name_); + preserve_vars_[idx].emplace(tmp_var); + tmp_var_infos_[idx].emplace_back(tmp_var, info.type_); + } + } + } +} + +bool ScopeBufferedSSAGraphExecutor::NeedCreateLocalExeScope() { + return drop_scope_counter_ == 0; +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h new file mode 100644 index 00000000..988882e6 --- /dev/null +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -0,0 +1,78 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/details/var_handle.h" + +#include "paddle/fluid/framework/details/execution_strategy.h" +#include "paddle/fluid/framework/details/ssa_graph_executor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/place.h" +namespace paddle { +namespace framework { +namespace details { + +struct VariableInfo { + std::string name_; + proto::VarType::Type type_; + bool persistable_; +}; + +class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { + public: + ScopeBufferedSSAGraphExecutor( + ExecutionStrategy strategy, std::vector local_scopes, + std::vector local_exec_scopes, + std::vector var_infos, std::vector places, + std::unique_ptr&& underlying_executor); + + const ir::Graph& Graph() const override { + return underlying_executor_->Graph(); + } + + FeedFetchList Run(const std::vector& fetch_tensors) override; + + void DropLocalExeScopes(); + + bool NeedCreateLocalExeScope(); + + void PrepareLocalExeScopes(); + + private: + void InitVariables(); + + size_t drop_scope_counter_{0}; + ExecutionStrategy strategy_; + std::unique_ptr underlying_executor_; + std::vector local_scopes_; + + std::vector local_exec_scopes_; + std::vector> preserve_vars_; + std::vector>> + tmp_var_infos_; + + std::vector var_infos_; + std::vector places_; +}; +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc new file mode 100644 index 00000000..8539eb9d --- /dev/null +++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc @@ -0,0 +1,133 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h" +#include +#include +#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace details { + +// TODO(zjl): support SelectedRows +static inline const Tensor &GetTensorFromVar(const Variable *var) { + if (var->IsType()) { + return var->Get(); + } else { + PADDLE_THROW("Variable must be type of LoDTensor"); + } +} + +static inline Tensor *GetMutableTensorFromVar(Variable *var) { + if (var->IsType()) { + return var->GetMutable(); + } else { + PADDLE_THROW("Variable must be type of LoDTensor"); + } +} + +ShareTensorBufferOpHandle::ShareTensorBufferOpHandle( + ir::Node *node, Scope *scope, size_t scope_idx, const std::string &op_type, + const std::vector &in_var_infos, + const std::vector &out_var_names) + : OpHandleBase(node), + scope_(scope), + scope_idx_(scope_idx), + op_type_(op_type), + in_var_infos_(in_var_infos), + out_var_names_(out_var_names) { + PADDLE_ENFORCE_EQ(in_var_infos_.size(), out_var_names_.size()); + for (size_t i = 0; i < in_var_infos_.size(); ++i) { + Add(in_var_infos_[i], out_var_names_[i]); + } +} + +std::unordered_set ShareTensorBufferOpHandle::ReusedVarSet() + const { + std::unordered_set result; + for (auto &in_var_info : in_var_infos_) { + result.insert(in_var_info->Name()); + } + return result; +} + +void ShareTensorBufferOpHandle::Add(ir::MemOptVarInfo *in_var_info, + const std::string &out_var_name) { + PADDLE_ENFORCE_NOT_NULL(in_var_info, "in_var_info cannot be nullptr"); + PADDLE_ENFORCE_NE(in_var_info->Name(), out_var_name, + "in/out cannot have same name: %s", out_var_name); + in_var_infos_.emplace_back(in_var_info); + out_var_names_.emplace_back(out_var_name); +} + +void ShareTensorBufferOpHandle::InitCUDA() { +#ifdef PADDLE_WITH_CUDA + int dev_id = + boost::get(dev_ctxes_.begin()->first).device; + events_[dev_id] = nullptr; +#endif +} + +void ShareTensorBufferOpHandle::CallOnce() { + PADDLE_ENFORCE(in_out_vars_.empty(), "in_out_vars_ must be initialized here"); + Scope *exec_scope = local_exec_scopes_[0]; + for (size_t i = 0; i < in_var_infos_.size(); ++i) { + auto *in_var = exec_scope->FindVar(in_var_infos_[i]->Name()); + auto *out_var = exec_scope->FindVar(out_var_names_[i]); + PADDLE_ENFORCE_NOT_NULL(in_var); + PADDLE_ENFORCE_NOT_NULL(out_var); + PADDLE_ENFORCE_NE(in_var, out_var); + in_out_vars_.emplace_back(in_var, out_var); + } +} + +void ShareTensorBufferOpHandle::RunImpl() { + if (in_var_infos_.size() != in_out_vars_.size()) { + CallOnce(); + } + + for (size_t i = 0; i < in_var_infos_.size(); ++i) { + const auto &in_tensor = GetTensorFromVar(in_out_vars_[i].first); + auto *out_tensor = GetMutableTensorFromVar(in_out_vars_[i].second); + auto *in_var_info = in_var_infos_[i]; + + if (UNLIKELY(in_var_info->IsSkipped())) { + // If in_var is inplaced in the previous batch and we want to fetch + // in_var in the current batch, we have to reset memory of out_var + // to avoid wrong calculation result. + if (in_tensor.Holder() == out_tensor->Holder()) { + VLOG(1) << "Clear " << out_var_names_[i] + << " because you may want to fetch an inplaced variable " + << in_var_info->Name() + << " in previous batch: " << in_var_info->Name() << " -> " + << out_var_names_[i]; + out_tensor->clear(); + } + } else { + out_tensor->ShareBufferWith(in_tensor); + + VLOG(2) << "Share tensor buffer when running " << op_type_ << " : " + << in_var_info->Name() << " -> " << out_var_names_[i]; + } + } +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h new file mode 100644 index 00000000..87e971ba --- /dev/null +++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h @@ -0,0 +1,74 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include +#include +#include "paddle/fluid/framework/details/op_handle_base.h" + +namespace paddle { +namespace framework { + +class Variable; +class Scope; +class Tensor; + +namespace ir { +class MemOptVarInfo; +} // namespace ir + +namespace details { + +class ShareTensorBufferOpHandle : public OpHandleBase { + public: + ShareTensorBufferOpHandle( + ir::Node *node, Scope *scope, size_t scope_idx, + const std::string &op_type, + const std::vector &in_vars_infos, + const std::vector &out_var_names); + + std::unordered_set ReusedVarSet() const; + + Priority GetPriority() const override { return Priority::kHighest; } + + size_t GetScopeIdx() const { return scope_idx_; } + + void Add(ir::MemOptVarInfo *in_var_info, const std::string &ou_var_name); + + protected: + std::string Name() const override { return "buffer_share"; } + + void RunImpl() final; + + void InitCUDA() override; + + std::vector GetLocalScopes() override { return {scope_}; } + + private: + void CallOnce(); + + Scope *scope_; + size_t scope_idx_; + std::string op_type_; + std::vector in_var_infos_; + std::vector out_var_names_; + + std::vector> in_out_vars_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc new file mode 100644 index 00000000..a2461a36 --- /dev/null +++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc @@ -0,0 +1,186 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/fluid/framework/details/sparse_all_reduce_op_handle.h" +#include +#include "dgc/dgc.h" +#include "paddle/fluid/framework/details/container_cast.h" +#include "paddle/fluid/framework/details/reduce_and_gather.h" +#include "paddle/fluid/framework/details/variable_visitor.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/profiler.h" + +DECLARE_bool(sync_nccl_allreduce); + +namespace paddle { +namespace framework { +namespace details { + +SparseAllReduceOpHandle::SparseAllReduceOpHandle( + ir::Node *node, const std::vector &local_scopes, + const std::vector &places, + const platform::NCCLCommunicator *ctxs, bool is_encoded, int nranks) + : AllReduceOpHandle(node, local_scopes, places, ctxs), + is_encoded_(is_encoded), + nranks_(nranks) { + // TODO(gongwb) :polish them! + if (is_encoded) { + VLOG(1) << "Use dgc allreduce mode"; + } +} + +void SparseAllReduceOpHandle::RunImplEncoded() { + platform::RecordEvent record_event(Name()); + + WaitInputVarGenerated(); + + auto in_var_handles = DynamicCast(this->Inputs()); + auto out_var_handles = DynamicCast(this->Outputs()); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), places_.size(), + "The NoDummyInputSize should be equal to the number of places."); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), out_var_handles.size(), + "The NoDummyInputSize and NoDummyOutputSize should be equal."); + + std::vector ins; + std::vector outs; + int k = -1; + for (size_t i = 0; i < local_scopes_.size(); ++i) { + auto *local_scope = local_exec_scopes_[i]; + auto original_name = + paddle::framework::GradOriginalVarName(in_var_handles[i]->name()); + auto encode_var_name = original_name + g_dgc_encoded; + auto *in_var = local_scope->FindVar(encode_var_name); + PADDLE_ENFORCE_NOT_NULL(in_var, "%s should not be null", encode_var_name); + auto &in = in_var->Get(); + ins.emplace_back(&in); + + auto *out = local_scope->FindVar(out_var_handles[i]->name()) + ->GetMutable(); + outs.emplace_back(out); + + if (k < 0) { + k = GetKValue(in_var_handles[i]->name()); + } + } + + PADDLE_ENFORCE(platform::is_gpu_place(ins[0]->place())); + PADDLE_ENFORCE(platform::is_gpu_place(outs[0]->place())); + PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr."); + + int dtype = -1; + size_t in_numel = 0; + size_t out_numel = 0; + PADDLE_ENFORCE(nranks_ > 1); + std::vector> all_reduce_calls; + + for (size_t i = 0; i < local_scopes_.size(); ++i) { + auto &place = places_[i]; + auto &in = *ins[i]; + void *in_tensor_buf = const_cast(in.data()); + + auto &out = *outs[i]; + float *out_tensor_buf = out.data(); + + dtype = (dtype == -1) ? platform::ToNCCLDataType(in.type()) : dtype; + in_numel = (in_numel == 0) ? static_cast(in.numel()) : in_numel; + PADDLE_ENFORCE(in_numel % 2 == 0); + PADDLE_ENFORCE(in_numel / 2 == static_cast(k)); + out_numel = (out_numel == 0) ? static_cast(out.numel()) : out_numel; + + int dev_id = boost::get(place).device; + auto *nccl_ctxs = nccl_ctxs_->GetRunEnvNCCLCtx(run_order_, false); + auto &nccl_ctx = nccl_ctxs->at(dev_id); + auto stream = nccl_ctx.stream(); + auto comm = nccl_ctx.comm_; + + auto &allocator = + platform::DeviceTemporaryAllocator::Instance().Get(place, stream); + int encode_size = 2 * k * sizeof(int); + // dgc use ncclAllGather to get all the encoded data + // so the buffer need nranks. + int buf_size = nranks_ * encode_size; + auto tmp_ious_data = allocator.Allocate(buf_size); + void *gather_buff = reinterpret_cast(tmp_ious_data->ptr()); + + VLOG(10) << "in_numel:" << in_numel << ", out_numel:" << out_numel + << ", nranks:" << nranks_ << ", gather_buf size:" << buf_size + << ", k:" << k << ", place:" << place << ", dtype:" << dtype; + + all_reduce_calls.emplace_back([=] { + PADDLE_ENFORCE(paddle::communication::dgc::sparseAllGReduce( + in_tensor_buf, gather_buff, k, out_tensor_buf, out_numel, comm, + stream)); + }); + } + + RunAllReduceFuncs(all_reduce_calls); +} + +int SparseAllReduceOpHandle::GetKValue(const std::string &grad_name) { + auto original_name = paddle::framework::GradOriginalVarName(grad_name); + auto var_name = original_name + g_dgc_k; + PADDLE_ENFORCE(local_scopes_.size() > 0); + + auto *scope = local_exec_scopes_[0]; + auto var = scope->FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL(var); + auto tensor = var->Get().data(); + return *tensor; +} + +bool SparseAllReduceOpHandle::IsEncoded() { + if (!is_encoded_) { + return false; + } + auto counter_name = g_dgc_counter_name; + auto step_name = g_dgc_rampup_begin_step; + PADDLE_ENFORCE(local_scopes_.size() > 0); + + auto *local_scope = local_exec_scopes_[0]; + auto count_var = local_scope->FindVar(counter_name); + auto step_var = local_scope->FindVar(step_name); + if (count_var == nullptr || step_var == nullptr) { + PADDLE_THROW("not find count_var:%s or step_var:%s", counter_name, + step_var); + } + + float count = *count_var->Get().data(); + float step = *step_var->Get().data(); + if (static_cast(count) < static_cast(step)) { + VLOG(10) << "in all_reduce currentstep:" << count + << " < rampup_begin_step:" << step + << " so not use sparse all reduce"; + return false; + } + + return true; +} + +void SparseAllReduceOpHandle::RunImpl() { + if (!IsEncoded()) { + AllReduceOpHandle::RunImpl(); + return; + } + + RunImplEncoded(); +} + +std::string SparseAllReduceOpHandle::Name() const { + return "sparse_all_reduce"; +} +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h new file mode 100644 index 00000000..9802f8db --- /dev/null +++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h @@ -0,0 +1,52 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/details/all_reduce_op_handle.h" +#include "paddle/fluid/framework/details/dgc_const_values.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/nccl_helper.h" + +namespace paddle { +namespace framework { +namespace details { + +class SparseAllReduceOpHandle : public AllReduceOpHandle { + public: + SparseAllReduceOpHandle(ir::Node *node, + const std::vector &local_scopes, + const std::vector &places, + const platform::NCCLCommunicator *ctxs, + bool is_encoded = false, int nranks = -1); + std::string Name() const override; + + protected: + void RunImpl() override; + int GetKValue(const std::string &grad_name); + bool IsEncoded(); + void RunImplEncoded(); + + private: + bool is_encoded_{false}; + int nranks_{-1}; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_executor.cc b/paddle/fluid/framework/details/ssa_graph_executor.cc new file mode 100644 index 00000000..4f1e44ca --- /dev/null +++ b/paddle/fluid/framework/details/ssa_graph_executor.cc @@ -0,0 +1,42 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/ssa_graph_executor.h" + +namespace paddle { +namespace framework { +namespace details { +SSAGraphExecutor::~SSAGraphExecutor() {} + +void ClearFetchOp(ir::Graph* graph, std::vector* fetch_ops) { + if (fetch_ops->empty()) return; + + for (auto& op : *fetch_ops) { + PADDLE_ENFORCE_NOT_NULL( + dynamic_cast(op), + "The input ops of ClearFetchOp function should be FetchOpHandle."); + for (auto& out_var : op->Node()->outputs) { + graph->RemoveNode(out_var); + } + for (auto& in_var : op->Inputs()) { + in_var->RemoveOutput(op, op->Node()); + } + graph->RemoveNode(op->Node()); + } + fetch_ops->clear(); +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_executor.h b/paddle/fluid/framework/details/ssa_graph_executor.h new file mode 100644 index 00000000..2454ec2b --- /dev/null +++ b/paddle/fluid/framework/details/ssa_graph_executor.h @@ -0,0 +1,44 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/details/fetch_op_handle.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace framework { +namespace details { +class SSAGraphExecutor { + DISABLE_COPY_AND_ASSIGN(SSAGraphExecutor); + + public: + SSAGraphExecutor() {} + + virtual ~SSAGraphExecutor(); + + virtual const ir::Graph& Graph() const = 0; + + virtual FeedFetchList Run(const std::vector& fetch_tensors) = 0; +}; + +void ClearFetchOp(ir::Graph* graph, std::vector* fetch_ops); +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc new file mode 100644 index 00000000..db28e1fe --- /dev/null +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -0,0 +1,349 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace framework { +namespace details { +ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( + const ExecutionStrategy &strategy, const std::vector &local_scopes, + const std::vector &local_exec_scopes, + const std::vector &places, ir::Graph *graph) + : graph_(graph), + local_scopes_(local_scopes), + local_exec_scopes_(local_exec_scopes), + places_(places), + fetch_ctxs_(places), + strategy_(strategy), + prepare_pool_(1), + pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_) + : nullptr) { + if (strategy_.num_iteration_per_run_ > 1) { + int read_op_num = 0; + for (auto *node : graph_->Nodes()) { + if (node->IsOp() && node->Name() == "read") { + read_op_num++; + } + } + if (read_op_num == 0) { + LOG(WARNING) << "when num_iteration_per_run_ is larger then 1, the model " + "should use pyreader to feed data!"; + } + } + PrepareOpDeps(); + CopyOpDeps(); +} + +inline FeedFetchList ThreadedSSAGraphExecutor::RunImpl( + const std::vector &fetch_tensors) { + std::unique_ptr event( + new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare")); + std::unique_ptr op_deps = op_deps_futures_.get(); + CopyOpDeps(); + + VLOG(10) << "ThreadedSSAGraphExecutor::Run"; + std::shared_ptr> ready_vars( + new BlockingQueue); + auto &pending_ops = op_deps->pending_ops_; + auto &pending_vars = op_deps->pending_vars_; + auto &ready_ops = op_deps->ready_ops_; + size_t num_ops = op_deps->num_ops_; + + // Step 2. Insert FetchOps + std::vector fetch_ops; + std::unordered_set fetch_dependencies; + FeedFetchList fetch_data(fetch_tensors.size()); + + InsertFetchOps(fetch_tensors, &fetch_ops, &fetch_dependencies, &ready_ops, + &pending_ops, &pending_vars, &fetch_data); + + exception_holder_.Clear(); + event.reset(nullptr); + + // Step 3. Execution + if (strategy_.num_threads_ == 1 && traced_ops_.size() == num_ops) { + // If the num_threads is 1, we can record the order of operator's + // execution in the first iteration, and in subsequent iterations, + // run the recorded operators directly. This strategy could make the + // execution faster. + VLOG(3) << "Run the traced ops."; + RunTracedOps(traced_ops_); + RunTracedOps(fetch_ops); + if (exception_holder_.IsCaught()) { + ExecutionFinal(&fetch_ops); + } + } else { + traced_ops_.clear(); + auto run_all_ops = [&](std::unordered_set &set) { + for (auto *op : set) { + RunOp(ready_vars, op); + } + set.clear(); + }; + // Clean run context + run_op_futures_.clear(); + + while (!pending_vars.empty()) { + // 1. Run All Ready ops + // Keep loop until all vars are ready. + run_all_ops(ready_ops); + + // 2. Find ready variable + bool timeout; + auto cur_ready_vars = ready_vars->PopAll(1, &timeout); + if (timeout) { + for (auto &run_op_future : run_op_futures_) { + run_op_future.wait(); + } + if (exception_holder_.IsCaught()) { + ExecutionFinal(&fetch_ops); + } else { + continue; + } + } + + // 3. Remove the dependency of ready_var. + // Find the ready_ops after the ready_var. + for (auto ready_var : cur_ready_vars) { + pending_vars.erase(ready_var); + for (auto *op : ready_var->PendingOps()) { + auto &deps = pending_ops[op]; + --deps; + if (deps == 0) { + ready_ops.insert(op); + } + } + } + } + PADDLE_ENFORCE(ready_ops.empty()); + } + + // Wait FetchOps. + ClearFetchOp(graph_, &fetch_ops); + + return fetch_data; +} + +FeedFetchList ThreadedSSAGraphExecutor::Run( + const std::vector &fetch_tensors) { + for (size_t j = 0; j < strategy_.num_iteration_per_run_ - 1; ++j) { + RunImpl({}); + } + return RunImpl(fetch_tensors); +} + +void ThreadedSSAGraphExecutor::InsertFetchOps( + const std::vector &fetch_tensors, + std::vector *fetch_ops, + std::unordered_set *fetch_dependencies, + std::unordered_set *ready_ops, + std::unordered_map *pending_ops, + std::unordered_set *pending_vars, + FeedFetchList *fetch_data) { + std::unordered_map> fetched_vars; + std::unordered_set local_ready_vars; + std::unordered_set fetch_tensor_set(fetch_tensors.begin(), + fetch_tensors.end()); + for (auto &fetch_var_name : fetch_tensor_set) { + for (auto &var_map : graph_->Get(details::kGraphVars)) { + auto it = var_map.find(fetch_var_name); + if (it != var_map.end()) { + fetched_vars[fetch_var_name].emplace_back(*it->second.rbegin()); + } + } + } + + for (size_t i = 0; i < fetch_tensors.size(); ++i) { + auto &var_name = fetch_tensors[i]; + auto fetched_var_it = fetched_vars.find(var_name); + PADDLE_ENFORCE(fetched_var_it != fetched_vars.end(), + "Cannot find fetched variable(%s).(Perhaps the main_program " + "is not set to ParallelExecutor)", + var_name); + + auto &vars = fetched_var_it->second; + + ir::Node *fetch_node = + graph_->CreateEmptyNode("fetch", ir::Node::Type::kOperation); + auto *op = new FetchOpHandle(fetch_node, fetch_data, i, &local_scopes_, + &local_exec_scopes_); + fetch_ops->emplace_back(op); + + for (auto &p : places_) { + op->SetDeviceContext(p, fetch_ctxs_.Get(p)); + } + + for (auto *var : vars) { + op->AddInput(var); + } + + ir::Node *fetch_var = + graph_->CreateEmptyNode("fetch", ir::Node::Type::kVariable); + auto *fetch_dummy = new DummyVarHandle(fetch_var); + op->AddOutput(fetch_dummy); + fetch_dependencies->emplace(fetch_dummy); + + this->InsertPendingVar(pending_vars, &local_ready_vars, fetch_dummy); + + size_t wait_input_num = 0; + std::unordered_set input_set(vars.begin(), vars.end()); + for (auto *var : input_set) { + if (pending_vars->count(var)) { + ++wait_input_num; + } + } + if (wait_input_num) { + pending_ops->insert({op, wait_input_num}); + } else { + ready_ops->insert(static_cast(op)); + } + } + PADDLE_ENFORCE_EQ(local_ready_vars.size(), 0); +} + +void ThreadedSSAGraphExecutor::InsertPendingOp( + std::unordered_map *pending_ops, + OpHandleBase *op_instance) const { + pending_ops->insert({op_instance, op_instance->NoDupInputSize()}); +} + +void ThreadedSSAGraphExecutor::InsertPendingVar( + std::unordered_set *pending_vars, + std::unordered_set *ready_vars, VarHandleBase *var) const { + pending_vars->insert(var); + if (var->GeneratedOp() == nullptr) { + ready_vars->insert(var); + } +} + +void ThreadedSSAGraphExecutor::PrepareOpDeps() { + op_deps_.reset(new OpDependentData()); + std::unordered_map &pending_ops = + op_deps_->pending_ops_; + std::unordered_set &pending_vars = op_deps_->pending_vars_; + std::unordered_set &ready_ops = op_deps_->ready_ops_; + std::unordered_set ready_vars; + + // Transform SSAGraph to pending_ops & pending_vars + for (auto &var_map : graph_->Get(details::kGraphVars)) { + for (auto &name_pair : var_map) { + for (auto &version_pair : name_pair.second) { + InsertPendingVar(&pending_vars, &ready_vars, version_pair); + } + } + } + for (auto &var : graph_->Get(details::kGraphDepVars)) { + InsertPendingVar(&pending_vars, &ready_vars, var); + } + + for (auto &op : ir::FilterByNodeWrapper(*graph_)) { + if (op->Inputs().empty()) { // Special case, Op has no input. + ready_ops.insert(op); + } else { + InsertPendingOp(&pending_ops, op); + } + } + op_deps_->num_ops_ = ready_ops.size() + pending_ops.size(); + PADDLE_ENFORCE_GT(op_deps_->num_ops_, 0, "The graph doesn't have operators."); + + for (auto ready_var : ready_vars) { + pending_vars.erase(ready_var); + for (auto *op : ready_var->PendingOps()) { + auto &deps = pending_ops[op]; + --deps; + if (deps == 0) { + ready_ops.insert(op); + } + } + } +} + +void ThreadedSSAGraphExecutor::CopyOpDeps() { + op_deps_futures_ = prepare_pool_.enqueue([&] { + auto *op_deps = new OpDependentData(); + op_deps->pending_ops_.insert(op_deps_->pending_ops_.begin(), + op_deps_->pending_ops_.end()); + op_deps->pending_vars_.insert(op_deps_->pending_vars_.begin(), + op_deps_->pending_vars_.end()); + op_deps->ready_ops_.insert(op_deps_->ready_ops_.begin(), + op_deps_->ready_ops_.end()); + op_deps->num_ops_ = op_deps_->num_ops_; + return std::unique_ptr(op_deps); + }); +} + +void ThreadedSSAGraphExecutor::RunOp( + const std::shared_ptr> &ready_var_q, + details::OpHandleBase *op) { + auto op_run = [ready_var_q, op, this] { + RunOpSync(op); + try { + ready_var_q->Extend(op->Outputs()); + VLOG(10) << op << " " << op->Name() << " Signal posted"; + } catch (...) { + exception_holder_.Catch(std::current_exception()); + } + }; + + if (pool_) { + run_op_futures_.emplace_back(pool_->enqueue(op_run)); + } else { + op_run(); + } + + RecordOps(op); +} + +void ThreadedSSAGraphExecutor::RunTracedOps( + const std::vector &traced_ops) { + for (auto &op : traced_ops) { + if (exception_holder_.IsCaught()) { + return; + } + RunOpSync(op); + } +} + +void ThreadedSSAGraphExecutor::RunOpSync(OpHandleBase *op) { + try { + if (VLOG_IS_ON(10)) { + VLOG(10) << op << " " << op->Name() << " : " << op->DebugString(); + } + if (LIKELY(!strategy_.dry_run_)) { + op->Run(strategy_.use_cuda_); + } + VLOG(10) << op << " " << op->Name() << " Done "; + } catch (...) { + exception_holder_.Catch(std::current_exception()); + } +} + +void ThreadedSSAGraphExecutor::ExecutionFinal( + std::vector *fetch_ops) { + VLOG(3) << "caught exception " << exception_holder_.Type() << ", rethrow it"; + ClearFetchOp(graph_, fetch_ops); + exception_holder_.ReThrow(); +} + +void ThreadedSSAGraphExecutor::RecordOps(OpHandleBase *op) { + if (strategy_.num_threads_ == 1 && !dynamic_cast(op)) { + traced_ops_.emplace_back(op); + } +} +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h new file mode 100644 index 00000000..fe6ef95a --- /dev/null +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -0,0 +1,119 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include // ThreadPool in thrird party + +#include "paddle/fluid/framework/blocking_queue.h" +#include "paddle/fluid/framework/details/exception_holder.h" +#include "paddle/fluid/framework/details/execution_strategy.h" +#include "paddle/fluid/framework/details/fetch_op_handle.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/ssa_graph_executor.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace details { + +struct OpDependentData { + std::unordered_map pending_ops_; + std::unordered_set pending_vars_; + std::unordered_set ready_ops_; + size_t num_ops_{0}; +}; + +class ThreadedSSAGraphExecutor : public SSAGraphExecutor { + public: + ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy, + const std::vector &local_scopes, + const std::vector &local_exec_scopes, + const std::vector &places, + ir::Graph *graph); + + const ir::Graph &Graph() const override { return *graph_; } + // Run a SSAGraph by a thread pool + // Use topological sort algorithm + FeedFetchList Run(const std::vector &fetch_tensors) override; + + ~ThreadedSSAGraphExecutor() final = default; + + private: + inline FeedFetchList RunImpl(const std::vector &fetch_tensors); + void RunOp(const std::shared_ptr> &ready_var_q, + details::OpHandleBase *op); + + private: + // Note(zcd): the ThreadPool should be placed last so that ThreadPool should + // be destroyed first. + ir::Graph *graph_; + std::vector local_scopes_; + std::vector local_exec_scopes_; + + std::vector places_; + platform::DeviceContextPool fetch_ctxs_; + ExceptionHolder exception_holder_; + std::unique_ptr op_deps_; + std::future> op_deps_futures_; + ExecutionStrategy strategy_; + // use std::list because clear(), push_back, and for_each are O(1) + std::list> run_op_futures_; + ::ThreadPool prepare_pool_; + std::unique_ptr<::ThreadPool> pool_; + std::vector traced_ops_; + + void InsertPendingOp(std::unordered_map *pending_ops, + OpHandleBase *op_instance) const; + + void InsertPendingVar(std::unordered_set *pending_vars, + std::unordered_set *ready_vars, + VarHandleBase *var) const; + + void InsertFetchOps(const std::vector &fetch_tensors, + std::vector *fetch_ops, + std::unordered_set *fetch_dependencies, + std::unordered_set *ready_ops, + std::unordered_map *pending_ops, + std::unordered_set *pending_vars, + FeedFetchList *fetch_data); + + void PrepareOpDeps(); + + void CopyOpDeps(); + + inline void RecordOps(OpHandleBase *op); + + inline void ExecutionFinal(std::vector *fetch_ops); + + inline void RunOpSync(OpHandleBase *op); + + void RunTracedOps(const std::vector &traced_ops); +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/var_handle.cc b/paddle/fluid/framework/details/var_handle.cc new file mode 100644 index 00000000..95d62e66 --- /dev/null +++ b/paddle/fluid/framework/details/var_handle.cc @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/var_handle.h" + +namespace paddle { +namespace framework { +namespace details { + +VarHandleBase::~VarHandleBase() {} + +VarHandle::~VarHandle() { VLOG(4) << "deleting var handle " << DebugString(); } + +std::string VarHandle::DebugString() const { + std::stringstream ss; + ss << "name:" << name_ << ", place:" << place_ << ", version:" << version_ + << ", scope_idx:" << scope_idx_; + return ss.str(); +} + +std::string DummyVarHandle::DebugString() const { return node_->Name(); } + +DummyVarHandle::~DummyVarHandle() { + VLOG(4) << "deleting dummy var handle " << DebugString(); +} +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h new file mode 100644 index 00000000..93060ef2 --- /dev/null +++ b/paddle/fluid/framework/details/var_handle.h @@ -0,0 +1,166 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace framework { +namespace details { +class OpHandleBase; + +// Wraps ir::Node and provide helper utilities. +// It's responsible for populating necessary fields of ir::Node. +// +// VarHandleBase is the var node in the dependency graph. +// A variable can only be generated by a single operator. i.e. +// This is a single assignment graph. +struct VarHandleBase { + // Owned by `node`. No need to be deleted explicitly. + explicit VarHandleBase(ir::Node* node) : node_(node) { + node_->WrappedBy(this); + } + + virtual ~VarHandleBase(); + + virtual std::string DebugString() const = 0; + virtual const std::string& Name() const = 0; + + void AddInput(OpHandleBase* in, ir::Node* node) { + node_->inputs.clear(); + node_->inputs.push_back(node); + generated_op_ = in; + } + + void AddOutput(OpHandleBase* out, ir::Node* node) { + if (pending_ops_.find(out) == pending_ops_.end()) { + PADDLE_ENFORCE(out != nullptr, "The output of %s should not be nullptr", + this->Node()->Name()); + pending_ops_.insert(out); + node_->outputs.push_back(node); + } + } + + void RemoveOutput(OpHandleBase* out, ir::Node* node) { + pending_ops_.erase(out); + node_->outputs.erase( + std::remove(node_->outputs.begin(), node_->outputs.end(), node), + node_->outputs.end()); + } + + void ClearGeneratedOp() { + generated_op_ = nullptr; + node_->inputs.clear(); + } + + OpHandleBase* GeneratedOp() { return generated_op_; } + + const std::unordered_set& PendingOps() const { + return pending_ops_; + } + + ir::Node* Node() { return node_; } + + protected: + // The operator who generate this variable. nullptr if the variable + // is a root node. + OpHandleBase* generated_op_{nullptr}; + + // Operators which depend on this variable ready. + std::unordered_set pending_ops_; + ir::Node* node_; +}; + +// VarHandle is actually a single version of Runtime Variable. +// Variable in Runtime mapped to many VarHandles in Graph. +// Each assignment will generate a new var handle with newer version. +// +// NOTE: runtime variables have place. +struct VarHandle : public VarHandleBase { + virtual ~VarHandle(); + + std::string DebugString() const override; + + VarHandle(ir::Node* node, size_t version, size_t scope_index, + std::string name, platform::Place place) + : VarHandleBase(node), + version_(version), + scope_idx_(scope_index), + name_(std::move(name)), + place_(std::move(place)) {} + +#ifdef PADDLE_WITH_CUDA + bool HasEvent() { return has_event_; } + + const cudaEvent_t& GetEvent() { + PADDLE_ENFORCE(HasEvent(), "The event is not set."); + return event_; + } + + void SetGenerateEvent(const cudaEvent_t& event) { + has_event_ = true; + event_ = event; + } +#endif + + // version field currently is not used, however, just store the version to + // debug easily. + private: + size_t version_; + size_t scope_idx_; + std::string name_; + platform::Place place_; +#ifdef PADDLE_WITH_CUDA + // Only when this event is triggered, var is generated. + cudaEvent_t event_; + bool has_event_{false}; +#endif + + public: + bool IsTheSameVar(const VarHandle& o) const { + return o.generated_op_ == generated_op_ && o.name_ == name_ && + o.scope_idx_ == scope_idx_; + } + + size_t version() const { return version_; } + size_t scope_idx() const { return scope_idx_; } + const std::string& Name() const override { return name_; } + const std::string& name() const { return name_; } + const platform::Place& place() const { return place_; } +}; + +// Dummy Variable. It is used to represent dependencies between operators +struct DummyVarHandle : public VarHandleBase { + explicit DummyVarHandle(ir::Node* node) : VarHandleBase(node) {} + + virtual ~DummyVarHandle(); + + std::string DebugString() const override; + + public: + const std::string& Name() const override { return name_; } + std::string name_{"DummyVar"}; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/variable_visitor.cc b/paddle/fluid/framework/details/variable_visitor.cc new file mode 100644 index 00000000..134f7590 --- /dev/null +++ b/paddle/fluid/framework/details/variable_visitor.cc @@ -0,0 +1,139 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/variable_visitor.h" +#include "paddle/fluid/framework/selected_rows.h" +namespace paddle { +namespace framework { +namespace details { +template +static void VisitVariable(Variable* var, Func* func) { + if (var->IsType()) { + (*func)(var->GetMutable()); + } else if (var->IsType()) { + (*func)(var->GetMutable()); + } else { + PADDLE_THROW("Not supported type %s", ToTypeName(var->Type())); + } +} + +template +static void VisitVariable(const Variable& var, Func* func) { + if (var.IsType()) { + (*func)(var.Get()); + } else if (var.IsType()) { + (*func)(var.Get()); + } else { + PADDLE_THROW("Not supported type %s", ToTypeName(var.Type())); + } +} + +struct TensorVisitor { + Tensor* result_{nullptr}; + + void operator()(LoDTensor* tensor) { result_ = tensor; } + + void operator()(SelectedRows* selected_rows) { + result_ = selected_rows->mutable_value(); + } + + template + void operator()() { + PADDLE_THROW("Not Support to get LoDTensor from %s", typeid(T).name()); + } +}; + +Tensor& VariableVisitor::GetMutableTensor(Variable* var) { + TensorVisitor vistor; + VisitVariable(var, &vistor); + return *vistor.result_; +} + +struct ShareDimsAndLoDVisitor { + Variable* trg_; + void operator()(const LoDTensor& val) { + auto* tensor = trg_->GetMutable(); + tensor->set_layout(val.layout()); + tensor->set_lod(val.lod()); + tensor->Resize(val.dims()); + } + + void operator()(const SelectedRows& val) { + auto* selected_rows = trg_->GetMutable(); + selected_rows->set_rows(val.rows()); + selected_rows->set_height(val.height()); + selected_rows->mutable_value()->Resize(val.value().dims()); + } + + template + void operator()(const T&) { + PADDLE_ENFORCE("ShareDimsAndLoD is not supported by type %s", + typeid(T).name()); + } +}; + +void VariableVisitor::ShareDimsAndLoD(const Variable& src, Variable* trg) { + ShareDimsAndLoDVisitor visitor{trg}; + VisitVariable(src, &visitor); +} + +struct EnforceShapeAndDTypeEQVisitor { + const Variable* trg_; + + void operator()(const LoDTensor& src) { + auto& tensor = trg_->Get(); + PADDLE_ENFORCE_EQ( + src.place().which(), tensor.place().which(), + "The Places of the two Variable must be all on CPU or all on GPU."); + PADDLE_ENFORCE_EQ(src.type(), tensor.type(), + "The dtype of the two Variable is not equal."); + PADDLE_ENFORCE_EQ(src.dims(), tensor.dims(), + "The dims of the two Variable is not equal."); + PADDLE_ENFORCE_EQ(src.lod(), tensor.lod(), + "The lod of the two Variable is not equal."); + PADDLE_ENFORCE_EQ(src.layout(), tensor.layout(), + "The layout of the two Variable's tensor is not equal."); + } + + void operator()(const SelectedRows& src) { + auto& selected_rows = trg_->Get(); + PADDLE_ENFORCE_EQ( + src.place().which(), selected_rows.place().which(), + "The Places of the two Variable must be all on CPU or all on GPU."); + PADDLE_ENFORCE_EQ(src.value().type(), selected_rows.value().type(), + "The dtype of the two Variable is not equal."); + PADDLE_ENFORCE_EQ(src.value().layout(), selected_rows.value().layout(), + "The layout of the two Variable's tensor is not equal."); + PADDLE_ENFORCE_EQ(src.height(), selected_rows.height(), + "The height of the two Variable is not equal."); + PADDLE_ENFORCE_EQ(src.GetCompleteDims(), selected_rows.GetCompleteDims(), + "The dims of the two Variable is not equal."); + } + + template + void operator()(const T&) { + PADDLE_ENFORCE("EnforceShapeAndDTypeEQ is not supported by type %s", + typeid(T).name()); + } +}; + +void VariableVisitor::EnforceShapeAndDTypeEQ(const Variable& var1, + const Variable& var2) { + EnforceShapeAndDTypeEQVisitor visitor{&var1}; + VisitVariable(var2, &visitor); +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/variable_visitor.h b/paddle/fluid/framework/details/variable_visitor.h new file mode 100644 index 00000000..ca9a19bd --- /dev/null +++ b/paddle/fluid/framework/details/variable_visitor.h @@ -0,0 +1,36 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/variable.h" + +namespace paddle { +namespace framework { +namespace details { + +class VariableVisitor { + public: + static Tensor &GetMutableTensor(Variable *var); + + static void ShareDimsAndLoD(const Variable &src, Variable *trg); + + static void EnforceShapeAndDTypeEQ(const Variable &var1, + const Variable &var2); +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/device_worker.cc b/paddle/fluid/framework/device_worker.cc new file mode 100644 index 00000000..7fe60b44 --- /dev/null +++ b/paddle/fluid/framework/device_worker.cc @@ -0,0 +1,27 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/device_worker.h" + +namespace paddle { +namespace framework { + +void DeviceWorker::SetRootScope(Scope* root_scope) { root_scope_ = root_scope; } + +void DeviceWorker::SetDataFeed(DataFeed* data_feed) { + device_reader_ = data_feed; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h new file mode 100644 index 00000000..8c91a5b9 --- /dev/null +++ b/paddle/fluid/framework/device_worker.h @@ -0,0 +1,305 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include // NOLINT +#include +#include // NOLINT +#include + +#include "paddle/fluid/framework/data_feed.h" +#include "paddle/fluid/framework/fleet/fleet_wrapper.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/trainer_desc.pb.h" +#include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/operators/reader/blocking_queue.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/port.h" +#include "paddle/fluid/platform/timer.h" + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace framework { + +#define SEC_LOG \ + VLOG(3) << "[s" << section_id_ << "p" << pipeline_id_ << "t" << thread_id_ \ + << "]: " + +class PullDenseWorker { + public: + virtual ~PullDenseWorker() {} + virtual void Initialize(const TrainerDesc& param); + int Start(); + void Stop(); + void SetRootScope(Scope* scope) { root_scope_ = scope; } + void IncreaseThreadVersion(int thread_id, uint64_t table_id); + void ResetThreadVersion(uint64_t table_id); + void Wait(std::vector<::std::future>* status_vec); + void PullDense(bool force_update = false); + static std::shared_ptr GetInstance() { + if (NULL == s_instance_) { + s_instance_.reset(new paddle::framework::PullDenseWorker()); + } + return s_instance_; + } + + private: + PullDenseWorker() : root_scope_(NULL) {} + void Run(); + bool CheckUpdateParam(uint64_t table_id); + + private: + static std::shared_ptr s_instance_; + std::shared_ptr fleet_ptr_; + PullDenseWorkerParameter param_; + DownpourWorkerParameter dwp_param_; + Scope* root_scope_; + bool running_; + + static std::map last_versions_; + static std::map current_version_; + static std::mutex mutex_for_version_; + static std::map> training_versions_; + static std::map> dense_value_names_; + + std::thread t_; + int thread_num_; + int sleep_time_ms_; + int threshold_; + + std::vector<::std::future> pull_dense_status_; + uint32_t pull_dense_fail_times_ = 0; + std::vector base_norm_param_; + std::vector mean_; + std::vector scale_; + float squared_sum_epsilon_ = 1e-4; + std::mutex mutex_for_mean_scale_; + float total_batch_num_ = 0; +}; + +// should incorporate different type of device +class DeviceWorker { + public: + DeviceWorker() { use_cvm_ = false; } + virtual ~DeviceWorker() {} + virtual void Initialize(const TrainerDesc& desc) = 0; + virtual void SetDeviceIndex(int tid) = 0; + virtual void TrainFiles() = 0; + virtual void PrintFetchVars() = 0; + virtual void TrainFilesWithProfiler() = 0; + virtual void CreateDeviceResource(const ProgramDesc& main_prog) = 0; + // will make this zero copy in the future + virtual void BindingDataFeedMemory() = 0; + virtual void SetRootScope(Scope* root_scope); + virtual void SetDataFeed(DataFeed* data_feed); + virtual void SetPlace(const paddle::platform::Place& place) { + place_ = place; + } + + protected: + Scope* root_scope_ = nullptr; + paddle::platform::Place place_; + DataFeed* device_reader_ = nullptr; + int64_t batch_num_; + FetchConfig fetch_config_; + bool use_cvm_; +}; + +class CPUWorkerBase : public DeviceWorker { + public: + CPUWorkerBase() {} + virtual ~CPUWorkerBase() {} + virtual void SetDeviceIndex(int tid) { thread_id_ = tid; } + virtual void TrainFiles() = 0; + virtual void TrainFilesWithProfiler() {} + virtual void PrintFetchVars() {} + virtual void CreateDeviceResource(const ProgramDesc& main_prog) {} + + protected: + int thread_id_; +}; + +class HogwildWorker : public CPUWorkerBase { + public: + HogwildWorker() {} + virtual ~HogwildWorker() {} + virtual void Initialize(const TrainerDesc& desc); + virtual void TrainFiles(); + virtual void TrainFilesWithProfiler(); + virtual void PrintFetchVars(); + virtual void CreateDeviceResource(const ProgramDesc& main_prog); + virtual void BindingDataFeedMemory(); + + protected: + void CreateThreadOperators(const ProgramDesc& program); + void CreateThreadScope(const ProgramDesc& program); + std::vector op_names_; + std::vector ops_; + Scope* thread_scope_; + HogwildWorkerParameter param_; + std::vector skip_ops_; +}; + +class DownpourWorker : public HogwildWorker { + public: + DownpourWorker() {} + virtual ~DownpourWorker() {} + virtual void Initialize(const TrainerDesc& desc); + virtual void TrainFiles(); + virtual void TrainFilesWithProfiler(); + + protected: + std::shared_ptr fleet_ptr_; + std::shared_ptr pull_dense_worker_; + void FillSparseValue(size_t table_id); + void PushGradients(); + void CollectLabelInfo(size_t table_id); + + private: + bool need_to_push_dense_; + bool need_to_push_sparse_; + DownpourWorkerParameter param_; + // just save the value in param_ for easy access + std::map label_var_name_; + std::map> sparse_key_names_; + std::map> sparse_value_names_; + std::map> sparse_grad_names_; + std::map> dense_value_names_; + std::map> dense_grad_names_; + + // feasign + std::map> features_; + // feasign stats + std::map> feature_labels_; + // feasign embedding + std::map>> feature_values_; + // feasign embedding gradient + std::map>> feature_grads_; + // skipped ops + std::vector skip_ops_; + + std::shared_ptr _pull_dense_worker; + std::vector<::std::future> push_sparse_status_; + std::vector<::std::future> push_dense_status_; +}; + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +using ScopeQueue = operators::reader::BlockingQueue; + +class SyncFunctor { + public: + SyncFunctor(int rank_id, int rank_num, int sync_steps); + virtual ~SyncFunctor() {} + + void SetSyncParam(const std::vector& sync_param) { + sync_param_ = &sync_param; + } + void SetNcclCtxMap(platform::NCCLContextMap* nccl_ctx_map) { + nccl_ctx_map_ = nccl_ctx_map; + } + + int operator()(Scope* scope); + static std::vector pipeline_scopes_; + static uint64_t sync_flag_; + + protected: + const int rank_id_; + const int rank_num_; + const std::vector* sync_param_ = nullptr; + platform::NCCLContextMap* nccl_ctx_map_ = nullptr; + + uint64_t sync_signal_; + const int sync_steps_; + int counter_; + + void Synchronize(); +}; + +class SectionWorker : public DeviceWorker { + public: + SectionWorker() {} + ~SectionWorker() override {} + + void Initialize(const TrainerDesc& desc) override; + + void BindingDataFeedMemory() override {} + void CreateDeviceResource(const ProgramDesc& main_prog) override{}; + + void TrainFiles() override; + void TrainFilesWithProfiler() override; + + void PrintFetchVars() override {} + + const platform::Place& place() const { return place_; } + + void SetSectionIndex(int section_id) { section_id_ = section_id; } + void SetDeviceIndex(int tid) override { pipeline_id_ = tid; } + void SetThreadIndex(int thread_id) { thread_id_ = thread_id; } + void SetVarNames(const std::vector& in_var_names, + const std::vector& out_var_names) { + in_var_names_ = &in_var_names; + out_var_names_ = &out_var_names; + } + void SetScopeQueue(ScopeQueue* in_scope_queue, ScopeQueue* out_scope_queue) { + in_scope_queue_ = in_scope_queue; + out_scope_queue_ = out_scope_queue; + } + void SetCountMutex(std::mutex* mutex) { worker_count_mutex_ = mutex; } + void SetWorkerCount(int* worker_count) { worker_count_ = worker_count; } + void SetSectionNum(int section_num) { section_num_ = section_num; } + void SetPipelineNum(int pipeline_num) { pipeline_num_ = pipeline_num; } + void SetNextSectionPlace(const paddle::platform::Place& place) { + next_section_place_ = place; + } + SyncFunctor* sync_func_ = nullptr; + void SetSyncFunctor(SyncFunctor* sync_func) { sync_func_ = sync_func; } + + static std::atomic cpu_id_; + + protected: + void AutoSetCPUAffinity(bool reuse); + int section_id_; + int pipeline_id_; + int section_num_; + int pipeline_num_; + int thread_id_; + + // This worker will consume scope from in_scope_queue_ + // and produce scope to out_scope_queue_ + ScopeQueue* in_scope_queue_ = nullptr; + ScopeQueue* out_scope_queue_ = nullptr; + const std::vector* in_var_names_ = nullptr; + const std::vector* out_var_names_ = nullptr; + std::mutex* worker_count_mutex_ = nullptr; + int* worker_count_ = nullptr; + paddle::platform::Place next_section_place_; + + std::vector> ops_; + + platform::DeviceContext* dev_ctx_ = nullptr; +}; +#endif +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/device_worker_factory.cc b/paddle/fluid/framework/device_worker_factory.cc new file mode 100644 index 00000000..dc85941f --- /dev/null +++ b/paddle/fluid/framework/device_worker_factory.cc @@ -0,0 +1,68 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/device_worker_factory.h" +#include +#include +#include + +namespace paddle { +namespace framework { + +typedef std::shared_ptr (*Createdevice_workerFunction)(); +typedef std::unordered_map + device_workerMap; +device_workerMap g_device_worker_map; +#define REGISTER_DEVICE_WORKER_CLASS(device_worker_class) \ + namespace { \ + std::shared_ptr Creator_##device_worker_class() { \ + return std::shared_ptr(new device_worker_class); \ + } \ + class __Registerer_##device_worker_class { \ + public: \ + __Registerer_##device_worker_class() { \ + g_device_worker_map[#device_worker_class] = \ + &Creator_##device_worker_class; \ + } \ + }; \ + __Registerer_##device_worker_class g_registerer_##device_worker_class; \ + } // namespace + +std::string DeviceWorkerFactory::DeviceWorkerTypeList() { + std::string device_worker_types; + for (auto iter = g_device_worker_map.begin(); + iter != g_device_worker_map.end(); ++iter) { + if (iter != g_device_worker_map.begin()) { + device_worker_types += ", "; + } + device_worker_types += iter->first; + } + return device_worker_types; +} + +std::shared_ptr DeviceWorkerFactory::CreateDeviceWorker( + std::string device_worker_class) { + if (g_device_worker_map.count(device_worker_class) < 1) { + exit(-1); + } + return g_device_worker_map[device_worker_class](); +} + +REGISTER_DEVICE_WORKER_CLASS(HogwildWorker); +REGISTER_DEVICE_WORKER_CLASS(DownpourWorker); +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +REGISTER_DEVICE_WORKER_CLASS(SectionWorker); +#endif +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/device_worker_factory.h b/paddle/fluid/framework/device_worker_factory.h new file mode 100644 index 00000000..9d061338 --- /dev/null +++ b/paddle/fluid/framework/device_worker_factory.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/device_worker.h" + +namespace paddle { +namespace framework { + +class DeviceWorkerFactory { + public: + static std::string DeviceWorkerTypeList(); + static std::shared_ptr CreateDeviceWorker( + std::string device_worker_class); +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/device_worker_test.cc b/paddle/fluid/framework/device_worker_test.cc new file mode 100644 index 00000000..faa648ab --- /dev/null +++ b/paddle/fluid/framework/device_worker_test.cc @@ -0,0 +1,24 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "paddle/fluid/framework/trainer.h" + +namespace paddle { +namespace framework { +TEST() { + // create hogwild device worker +} +} +} diff --git a/paddle/fluid/framework/dim.h b/paddle/fluid/framework/dim.h new file mode 100644 index 00000000..531ee844 --- /dev/null +++ b/paddle/fluid/framework/dim.h @@ -0,0 +1,101 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/array.h" +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace framework { + +// Statically sized, statically indexed dimension +template +class Dim : public Array { + public: + static_assert(D >= 0, "D must be not less than 0"); + + static constexpr int kRank = D; + using BaseClass = Array; + + inline Dim(int64_t head, const Dim& tail) { + (*this)[0] = head; + new (this->GetMutable() + 1) Dim(tail); + } + + template + HOSTDEVICE explicit Dim(int64_t head, Args... args) + : BaseClass(head, args...) {} + + /** Construct a Dim with each dimension set to the given index */ + HOSTDEVICE explicit Dim(int64_t idx) { this->Fill(idx); } + + HOSTDEVICE Dim() = default; + + HOST std::string to_string() const; +}; + +// Product of a Dim +template +HOSTDEVICE inline int64_t product(const Dim& a) { + return UnrollProduct::Run(a.Get()); +} + +/** + * Helper function to create a Dim + * + * \param idxes The type of Dim constructed depends on the number of params + * + */ + +template +HOSTDEVICE inline Dim make_dim(Args... idxes) { + return Dim(idxes...); +} + +// Allows us to output a Dim +template +inline std::ostream& operator<<(std::ostream& os, const Dim& d) { + os << d[0]; + for (int i = 1; i < D; ++i) { + os << ", " << d[i]; + } + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const Dim<0>& d) { + return os; +} + +template +HOST std::string Dim::to_string() const { + std::stringstream stream; + stream << *this; + return stream.str(); +} + +template +inline void static_dim_assign(const T1* in, T2* out) { + UnrollAssign::Run(in, out); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/dim_test.cu b/paddle/fluid/framework/dim_test.cu new file mode 100644 index 00000000..7add6d14 --- /dev/null +++ b/paddle/fluid/framework/dim_test.cu @@ -0,0 +1,86 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/dim.h" + +__global__ void test(paddle::framework::Dim<2>* o) { + o[0] = paddle::framework::make_dim(5, 6); +} + +__global__ void dyn_idx_gpu(int64_t* o) { + auto d = paddle::framework::make_dim(5, 6); + o[0] = d[1]; +} + +TEST(Dim, Equality) { + // construct a Dim on the CPU + auto a = paddle::framework::make_dim(3, 4); + EXPECT_EQ(a[0], 3); + EXPECT_EQ(a[1], 4); + + // construct a Dim on the GPU + thrust::device_vector> t(2); + test<<<1, 1>>>(thrust::raw_pointer_cast(t.data())); + a = t[0]; + EXPECT_EQ(a[0], 5); + EXPECT_EQ(a[1], 6); + + // product + EXPECT_EQ(paddle::framework::product(a), 30); + + // mutate a Dim + auto b = paddle::framework::make_dim(7, 8); + b[1] = 10; + EXPECT_EQ(b[0], 7); + EXPECT_EQ(b[1], 10); + + b[0] = 8; + b[1] = 11; + EXPECT_EQ(b[0], 8); + EXPECT_EQ(b[1], 11); + + // dynamic access on GPU + thrust::device_vector r(1); + dyn_idx_gpu<<<1, 1>>>(thrust::raw_pointer_cast(r.data())); + int64_t res = r[0]; + EXPECT_EQ(res, 6); +} + +TEST(Dim, Bool) { + auto a = paddle::framework::make_dim(3, 4); + auto b = paddle::framework::make_dim(5, 6); + auto c = paddle::framework::make_dim(3, 4); + + // comparison + EXPECT_TRUE(a == a); + EXPECT_FALSE(a == b); + EXPECT_TRUE(a == c); +} + +TEST(Dim, Print) { + { + std::stringstream ss; + auto a = paddle::framework::make_dim(2, 3); + ss << a; + EXPECT_EQ(ss.str(), "2, 3"); + } + { + std::stringstream ss; + ss << paddle::framework::make_dim(8); + EXPECT_EQ(ss.str(), "8"); + } +} diff --git a/paddle/fluid/framework/dist_multi_trainer.cc b/paddle/fluid/framework/dist_multi_trainer.cc new file mode 100644 index 00000000..8cd0789c --- /dev/null +++ b/paddle/fluid/framework/dist_multi_trainer.cc @@ -0,0 +1,78 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/framework/data_feed_factory.h" +#include "paddle/fluid/framework/data_set.h" +#include "paddle/fluid/framework/device_worker_factory.h" +#include "paddle/fluid/framework/trainer.h" + +namespace paddle { +namespace framework { + +void DistMultiTrainer::Initialize(const TrainerDesc& trainer_desc, + Dataset* dataset) { + thread_num_ = trainer_desc.thread_num(); + SetDataset(dataset); + + const std::vector readers = + dataset->GetReaders(); + + thread_num_ = readers.size(); + workers_.resize(thread_num_); + + for (int i = 0; i < thread_num_; ++i) { + workers_[i] = DeviceWorkerFactory::CreateDeviceWorker( + trainer_desc.device_worker_name()); + workers_[i]->SetDeviceIndex(i); + workers_[i]->SetDataFeed(readers[i]); + workers_[i]->Initialize(trainer_desc); + } + + VLOG(3) << "going to initialize pull dense worker"; + pull_dense_worker_ = PullDenseWorker::GetInstance(); + pull_dense_worker_->Initialize(trainer_desc); + VLOG(3) << "initialize pull dense worker"; + SetDebug(trainer_desc.debug()); +} + +void DistMultiTrainer::InitOtherEnv(const ProgramDesc& main_program) { + pull_dense_worker_->SetRootScope(root_scope_); + pull_dense_worker_->Start(); + VLOG(3) << "init other env done."; +} + +void DistMultiTrainer::Run() { + for (int thidx = 0; thidx < thread_num_; ++thidx) { + if (!debug_) { + threads_.push_back( + std::thread(&DeviceWorker::TrainFiles, workers_[thidx].get())); + } else { + threads_.push_back(std::thread(&DeviceWorker::TrainFilesWithProfiler, + workers_[thidx].get())); + } + } +} + +void DistMultiTrainer::Finalize() { + for (auto& th : threads_) { + th.join(); + } + pull_dense_worker_->Stop(); + root_scope_->DropKids(); +} + +} // end namespace framework +} // end namespace paddle diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc new file mode 100644 index 00000000..39652706 --- /dev/null +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -0,0 +1,124 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/dlpack_tensor.h" +#include "paddle/fluid/framework/data_type.h" +namespace paddle { +namespace framework { + +namespace internal { +template +static ::DLDataType GetDLDataTypeCode() { + ::DLDataType dtype; + if (std::is_same::value || + std::is_floating_point::value) { + dtype.code = kDLFloat; + } else if (std::is_unsigned::value) { + dtype.code = kDLUInt; + } else if (std::is_integral::value) { + dtype.code = kDLInt; + } else { + PADDLE_THROW("Unsupported data type %s", typeid(T).name()); + } + dtype.bits = 8 * sizeof(T); + dtype.lanes = 1; + return dtype; +} + +static std::unordered_map CreateDLDataTypeMap() { + static std::unordered_map result; + +#define REG_DL_DATA_TYPE(cpp_type, proto_type) \ + result[static_cast(proto_type)] = GetDLDataTypeCode() + + _ForEachDataType_(REG_DL_DATA_TYPE); +#undef REG_DL_DATA_TYPE + return result; +} + +static DLDataType GetDLDataTypeFromTypeIndex(proto::VarType::Type type) { + static auto type_to_dtype_map = CreateDLDataTypeMap(); + static auto type_to_dtype_map_end_it = type_to_dtype_map.end(); + auto it = type_to_dtype_map.find(static_cast(type)); + PADDLE_ENFORCE(it != type_to_dtype_map_end_it, "Unsupported data type %d", + type); + return it->second; +#undef REG_DL_DATA_TYPE +} + +struct DLContextVisitor : public boost::static_visitor<::DLContext> { + inline ::DLContext operator()(const platform::CPUPlace &place) const { + ::DLContext ctx; + ctx.device_type = kDLCPU; + ctx.device_id = 0; + return ctx; + } + + inline ::DLContext operator()(const platform::CUDAPlace &place) const { +#ifdef PADDLE_WITH_CUDA + ::DLContext ctx; + ctx.device_type = kDLGPU; + ctx.device_id = place.device; + return ctx; +#else + PADDLE_THROW("platform::CUDAPlace is not supported in CPU only version"); +#endif + } + + inline ::DLContext operator()(const platform::CUDAPinnedPlace &place) const { +#ifdef PADDLE_WITH_CUDA + ::DLContext ctx; + ctx.device_type = kDLCPUPinned; + ctx.device_id = 0; + return ctx; +#else + PADDLE_THROW( + "platform::CUDAPinnedPlace is not supported in CPU only version"); +#endif + } +}; +} // namespace internal + +DLPackTensor::DLPackTensor(const Tensor &tensor, LaneType lanes) { + // init data, data buffer + t_.data = const_cast(tensor.data()); + + // init ctx, DLContext type with device_type and device_id + auto place = tensor.place(); + t_.ctx = boost::apply_visitor(internal::DLContextVisitor(), place); + + // init dtype + t_.dtype = internal::GetDLDataTypeFromTypeIndex(tensor.type()); + t_.dtype.lanes = lanes; + + // init ndim, tensor rank + auto &dims = tensor.dims(); + using DimType = decltype(t_.ndim); // int + t_.ndim = static_cast(dims.size()); + + // init shape, tensor dims + t_.shape = shape_; + for (DimType i = 0; i < t_.ndim; ++i) { + t_.shape[i] = dims[i]; + } + + // init strides, nullptr means the tensor is compact + t_.strides = nullptr; + + // init byte_offset + t_.byte_offset = 0; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/dlpack_tensor.h b/paddle/fluid/framework/dlpack_tensor.h new file mode 100644 index 00000000..e48b0d5c --- /dev/null +++ b/paddle/fluid/framework/dlpack_tensor.h @@ -0,0 +1,45 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace framework { + +class DLPackTensor { + public: + using LaneType = decltype(::DLTensor::dtype.lanes); // uint16_t + using ShapeType = + std::remove_reference::type; // int64_t + + // lanes is only used in CPU to enable vectorization + explicit DLPackTensor(const Tensor& tensor, LaneType lanes = 1); + + inline operator const ::DLTensor&() const { return t_; } + + inline operator ::DLTensor&() { return t_; } + + private: + ::DLTensor t_; + + // The shape in DLTensor is defined as int64_t* + // Add this member to make TVMTensor init without heap allocation + ShapeType shape_[DDim::kMaxRank]; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc new file mode 100644 index 00000000..c0a8e1bc --- /dev/null +++ b/paddle/fluid/framework/dlpack_tensor_test.cc @@ -0,0 +1,101 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/dlpack_tensor.h" +#include +#include +#include + +namespace paddle { +namespace framework { + +namespace { // NOLINT +template +constexpr uint8_t GetDLDataTypeCode() { + return std::is_same::value || + std::is_floating_point::value + ? static_cast(kDLFloat) + : (std::is_unsigned::value + ? static_cast(kDLUInt) + : (std::is_integral::value ? static_cast(kDLInt) + : static_cast(-1))); +} +} // NOLINT + +template +void TestMain(const platform::Place &place, uint16_t lanes) { + DDim dims{4, 5, 6, 7}; + Tensor tensor; + tensor.Resize(dims); + void *p = tensor.mutable_data(place); + + DLPackTensor dlpack_tensor(tensor, lanes); + ::DLTensor &dl_tensor = dlpack_tensor; + + CHECK_EQ(p, dl_tensor.data); + if (platform::is_cpu_place(place)) { + CHECK_EQ(kDLCPU, dl_tensor.ctx.device_type); + CHECK_EQ(0, dl_tensor.ctx.device_id); + } else if (platform::is_gpu_place(place)) { + CHECK_EQ(kDLGPU, dl_tensor.ctx.device_type); + CHECK_EQ(boost::get(place).device, + dl_tensor.ctx.device_id); + } else if (platform::is_cuda_pinned_place(place)) { + CHECK_EQ(kDLCPUPinned, dl_tensor.ctx.device_type); + CHECK_EQ(0, dl_tensor.ctx.device_id); + } else { + CHECK_EQ(false, true); + } + + CHECK_EQ(dims.size(), dl_tensor.ndim); + for (auto i = 0; i < dims.size(); ++i) { + CHECK_EQ(dims[i], dl_tensor.shape[i]); + } + + CHECK_EQ(dl_tensor.strides == nullptr, true); + CHECK_EQ(static_cast(0), dl_tensor.byte_offset); + + CHECK_EQ(lanes, dl_tensor.dtype.lanes); + CHECK_EQ(sizeof(T) * 8, dl_tensor.dtype.bits); + + CHECK_EQ(GetDLDataTypeCode(), dl_tensor.dtype.code); +} + +template +void TestMainLoop() { +#ifdef PADDLE_WITH_CUDA + std::vector places{platform::CPUPlace(), + platform::CUDAPlace(0), + platform::CUDAPinnedPlace()}; + if (platform::GetCUDADeviceCount() > 1) { + places.emplace_back(platform::CUDAPlace(1)); + } +#else + std::vector places{platform::CPUPlace()}; +#endif + std::vector lanes{1, 2}; + for (auto &p : places) { + for (auto &l : lanes) { + TestMain(p, l); + } + } +} +TEST(dlpack, test_all) { +#define TestCallback(cpp_type, proto_type) TestMainLoop() + + _ForEachDataType_(TestCallback); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc new file mode 100644 index 00000000..510a9943 --- /dev/null +++ b/paddle/fluid/framework/downpour_worker.cc @@ -0,0 +1,523 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/device_worker.h" +#include "paddle/fluid/framework/device_worker_factory.h" +#include "paddle/fluid/platform/cpu_helper.h" + +namespace paddle { +namespace framework { + +void DownpourWorker::Initialize(const TrainerDesc& desc) { + param_ = desc.downpour_param(); + for (int i = 0; i < param_.sparse_table_size(); ++i) { + uint64_t table_id = + static_cast(param_.sparse_table(i).table_id()); + TableParameter table = param_.sparse_table(i); + sparse_key_names_[table_id].resize(table.sparse_key_name_size()); + for (int j = 0; j < table.sparse_key_name_size(); ++j) { + sparse_key_names_[table_id][j] = table.sparse_key_name(j); + } + sparse_value_names_[table_id].resize(table.sparse_value_name_size()); + for (int j = 0; j < table.sparse_value_name_size(); ++j) { + sparse_value_names_[table_id][j] = table.sparse_value_name(j); + } + sparse_grad_names_[table_id].resize(table.sparse_grad_name_size()); + for (int j = 0; j < table.sparse_grad_name_size(); ++j) { + sparse_grad_names_[table_id][j] = table.sparse_grad_name(j); + } + label_var_name_[table_id] = table.label_var_name(); + } + + for (int i = 0; i < param_.dense_table_size(); ++i) { + uint64_t table_id = static_cast(param_.dense_table(i).table_id()); + auto table = param_.dense_table(i); + dense_value_names_[table_id].resize(table.dense_value_name_size()); + for (int j = 0; j < table.dense_value_name_size(); ++j) { + dense_value_names_[table_id][j] = table.dense_value_name(j); + } + dense_grad_names_[table_id].resize(table.dense_grad_name_size()); + for (int j = 0; j < table.dense_grad_name_size(); ++j) { + dense_grad_names_[table_id][j] = table.dense_grad_name(j); + } + } + + skip_ops_.resize(param_.skip_ops_size()); + for (int i = 0; i < param_.skip_ops_size(); ++i) { + skip_ops_[i] = param_.skip_ops(i); + } + + need_to_push_sparse_ = param_.push_sparse(); + need_to_push_dense_ = param_.push_dense(); + + fleet_ptr_ = FleetWrapper::GetInstance(); + fetch_config_ = desc.fetch_config(); + use_cvm_ = desc.use_cvm(); +} + +void DownpourWorker::CollectLabelInfo(size_t table_idx) { + uint64_t table_id = static_cast( + param_.program_config(0).pull_sparse_table_id(table_idx)); + + TableParameter table; + for (auto i : param_.sparse_table()) { + if (i.table_id() == table_id) { + table = i; + break; + } + } + auto& feature = features_[table_id]; + auto& feature_label = feature_labels_[table_id]; + feature_label.resize(feature.size()); + Variable* var = thread_scope_->FindVar(label_var_name_[table_id]); + LoDTensor* tensor = var->GetMutable(); + int64_t* label_ptr = tensor->data(); + + size_t global_index = 0; + for (size_t i = 0; i < sparse_key_names_[table_id].size(); ++i) { + VLOG(3) << "sparse_key_names_[" << i + << "]: " << sparse_key_names_[table_id][i]; + Variable* fea_var = thread_scope_->FindVar(sparse_key_names_[table_id][i]); + if (fea_var == nullptr) { + continue; + } + LoDTensor* tensor = fea_var->GetMutable(); + CHECK(tensor != nullptr) << "tensor of var " + << sparse_key_names_[table_id][i] << " is null"; + int64_t* ids = tensor->data(); + size_t fea_idx = 0; + // tensor->lod()[0].size() == batch_size + 1 + for (auto lod_idx = 1u; lod_idx < tensor->lod()[0].size(); ++lod_idx) { + for (; fea_idx < tensor->lod()[0][lod_idx]; ++fea_idx) { + // should be skipped feasign defined in protobuf + if (ids[fea_idx] == 0u) { + continue; + } + feature_label[global_index++] = + static_cast(label_ptr[lod_idx - 1]); + } + } + } + CHECK(global_index == feature.size()) + << "expect fea info size:" << feature.size() << " real:" << global_index; +} + +void DownpourWorker::FillSparseValue(size_t table_idx) { + uint64_t table_id = static_cast( + param_.program_config(0).pull_sparse_table_id(table_idx)); + + TableParameter table; + for (auto i : param_.sparse_table()) { + if (i.table_id() == table_id) { + table = i; + break; + } + } + + auto& fea_value = feature_values_[table_id]; + auto fea_idx = 0u; + + std::vector init_value(table.fea_dim()); + for (size_t i = 0; i < sparse_key_names_[table_id].size(); ++i) { + std::string slot_name = sparse_key_names_[table_id][i]; + std::string emb_slot_name = sparse_value_names_[table_id][i]; + Variable* var = thread_scope_->FindVar(slot_name); + if (var == nullptr) { + continue; + } + LoDTensor* tensor = var->GetMutable(); + CHECK(tensor != nullptr) << "tensor of var " << slot_name << " is null"; + int64_t* ids = tensor->data(); + int len = tensor->numel(); + Variable* var_emb = thread_scope_->FindVar(emb_slot_name); + LoDTensor* tensor_emb = var_emb->GetMutable(); + float* ptr = tensor_emb->mutable_data({len, table.emb_dim()}, + platform::CPUPlace()); + memset(ptr, 0, sizeof(float) * len * table.emb_dim()); + auto& tensor_lod = tensor->lod()[0]; + LoD data_lod{tensor_lod}; + tensor_emb->set_lod(data_lod); + for (int index = 0; index < len; ++index) { + if (use_cvm_) { + if (ids[index] == 0u) { + memcpy(ptr + table.emb_dim() * index, init_value.data(), + sizeof(float) * table.emb_dim()); + continue; + } + memcpy(ptr + table.emb_dim() * index, fea_value[fea_idx].data(), + sizeof(float) * table.emb_dim()); + fea_idx++; + } else { + if (ids[index] == 0u) { + memcpy(ptr + table.emb_dim() * index, init_value.data() + 2, + sizeof(float) * table.emb_dim()); + continue; + } + memcpy(ptr + table.emb_dim() * index, fea_value[fea_idx].data() + 2, + sizeof(float) * table.emb_dim()); + fea_idx++; + } + } + } +} + +void DownpourWorker::TrainFilesWithProfiler() { + VLOG(3) << "Begin to train files with profiler"; + platform::SetNumThreads(1); + device_reader_->Start(); + std::vector op_total_time; + std::vector op_name; + for (auto& op : ops_) { + bool need_skip = false; + for (auto t = 0u; t < skip_ops_.size(); ++t) { + if (op->Type().find(skip_ops_[t]) != std::string::npos) { + need_skip = true; + break; + } + } + if (!need_skip) { + op_name.push_back(op->Type()); + } + } + + VLOG(3) << "op name size: " << op_name.size(); + op_total_time.resize(op_name.size()); + for (size_t i = 0; i < op_total_time.size(); ++i) { + op_total_time[i] = 0.0; + } + platform::Timer timeline; + double total_time = 0.0; + double read_time = 0.0; + double pull_sparse_time = 0.0; + double collect_label_time = 0.0; + double fill_sparse_time = 0.0; + double push_sparse_time = 0.0; + double push_dense_time = 0.0; + int cur_batch; + int batch_cnt = 0; + uint64_t total_inst = 0; + double op_sum_time = 0; + std::unordered_map op_to_time; + timeline.Start(); + while ((cur_batch = device_reader_->Next()) > 0) { + timeline.Pause(); + read_time += timeline.ElapsedSec(); + total_time += timeline.ElapsedSec(); + VLOG(3) << "program config size: " << param_.program_config_size(); + for (int i = 0; i < param_.program_config(0).pull_sparse_table_id_size(); + ++i) { + uint64_t tid = static_cast( + param_.program_config(0).pull_sparse_table_id(i)); + TableParameter table; + for (auto j : param_.sparse_table()) { + if (j.table_id() == tid) { + table = j; + break; + } + } + timeline.Start(); + fleet_ptr_->PullSparseVarsSync(*thread_scope_, tid, + sparse_key_names_[tid], &features_[tid], + &feature_values_[tid], table.fea_dim()); + timeline.Pause(); + pull_sparse_time += timeline.ElapsedSec(); + total_time += timeline.ElapsedSec(); + timeline.Start(); + CollectLabelInfo(i); + timeline.Pause(); + collect_label_time += timeline.ElapsedSec(); + total_time += timeline.ElapsedSec(); + timeline.Start(); + FillSparseValue(i); + timeline.Pause(); + fill_sparse_time += timeline.ElapsedSec(); + total_time += timeline.ElapsedSec(); + } + VLOG(3) << "Fill sparse value for all sparse table done."; + + int run_op_idx = 0; + for (auto& op : ops_) { + bool need_skip = false; + for (auto t = 0u; t < skip_ops_.size(); ++t) { + if (op->Type().find(skip_ops_[t]) != std::string::npos) { + need_skip = true; + break; + } + } + if (!need_skip) { + timeline.Start(); + VLOG(3) << "Going to run op " << op_name[run_op_idx]; + op->Run(*thread_scope_, place_); + VLOG(3) << "Op " << op_name[run_op_idx] << " Finished"; + timeline.Pause(); + op_total_time[run_op_idx++] += timeline.ElapsedSec(); + total_time += timeline.ElapsedSec(); + } + } + + if (need_to_push_sparse_) { + for (int i = 0; i < param_.program_config(0).push_sparse_table_id_size(); + ++i) { + uint64_t tid = static_cast( + param_.program_config(0).push_sparse_table_id(i)); + TableParameter table; + for (auto i : param_.sparse_table()) { + if (i.table_id() == tid) { + table = i; + break; + } + } + timeline.Start(); + fleet_ptr_->PushSparseVarsWithLabelAsync( + *thread_scope_, tid, features_[tid], feature_labels_[tid], + sparse_key_names_[tid], sparse_grad_names_[tid], table.emb_dim(), + &feature_grads_[tid], &push_sparse_status_, cur_batch, use_cvm_); + timeline.Pause(); + push_sparse_time += timeline.ElapsedSec(); + total_time += timeline.ElapsedSec(); + } + } + + if (need_to_push_dense_) { + timeline.Start(); + for (int i = 0; i < param_.program_config(0).push_dense_table_id_size(); + ++i) { + uint64_t tid = static_cast( + param_.program_config(0).push_dense_table_id(i)); + fleet_ptr_->PushDenseVarsAsync( + *thread_scope_, tid, dense_grad_names_[tid], &push_sparse_status_); + } + timeline.Pause(); + push_dense_time += timeline.ElapsedSec(); + total_time += timeline.ElapsedSec(); + VLOG(3) << "push sparse and dense gradient done."; + int32_t tmp_push_dense_wait_times = -1; + static uint32_t push_dense_wait_times = + static_cast(tmp_push_dense_wait_times); + if (push_dense_status_.size() >= push_dense_wait_times) { + for (auto& t : push_dense_status_) { + t.wait(); + } + push_dense_status_.resize(0); + } + + if (tmp_push_dense_wait_times == -1) { + push_dense_status_.resize(0); + } + } + + if (need_to_push_sparse_) { + int32_t tmp_push_sparse_wait_times = -1; + static uint32_t push_sparse_wait_times = + static_cast(tmp_push_sparse_wait_times); + if (push_sparse_status_.size() >= push_sparse_wait_times) { + for (auto& t : push_sparse_status_) { + t.wait(); + } + push_sparse_status_.resize(0); + } + + if (tmp_push_sparse_wait_times == -1) { + push_sparse_status_.resize(0); + } + + VLOG(3) << "going to increase thread version"; + VLOG(3) << "push dense table id size: " + << param_.program_config(0).push_dense_table_id_size(); + } + + if (need_to_push_dense_) { + for (int i = 0; i < param_.program_config(0).push_dense_table_id_size(); + ++i) { + uint64_t tid = static_cast( + param_.program_config(0).push_dense_table_id(i)); + pull_dense_worker_->IncreaseThreadVersion(thread_id_, tid); + } + } + + PrintFetchVars(); + thread_scope_->DropKids(); + total_inst += cur_batch; + ++batch_cnt; + + if (thread_id_ == 0) { + // should be configured here + if (batch_cnt > 0 && batch_cnt % 100 == 0) { + for (size_t i = 0; i < op_total_time.size(); ++i) { + fprintf(stderr, "op_name:[%zu][%s], op_mean_time:[%fs]\n", i, + op_name[i].c_str(), op_total_time[i] / batch_cnt); + if (op_to_time.find(op_name[i]) == op_to_time.end()) { + op_to_time[op_name[i]] = 0.0; + } + op_to_time[op_name[i]] += op_total_time[i]; + op_sum_time += op_total_time[i]; + } + for (auto& i : op_to_time) { + fprintf(stderr, "op [%s] run total time: [%f]ms\n", i.first.c_str(), + i.second / batch_cnt); + } + fprintf(stderr, "op run total time: %fs\n", op_sum_time / batch_cnt); + fprintf(stderr, "train total time: %fs\n", total_time / batch_cnt); + fprintf(stderr, "pull sparse time: %fs\n", + pull_sparse_time / batch_cnt); + fprintf(stderr, "fill sparse time: %fs\n", + fill_sparse_time / batch_cnt); + fprintf(stderr, "push sparse time: %fs\n", + push_sparse_time / batch_cnt); + fprintf(stderr, "push dense time: %fs\n", push_dense_time / batch_cnt); + fprintf(stderr, "collect label time: %fs\n", + collect_label_time / batch_cnt); + fprintf(stderr, "mean read time: %fs\n", read_time / batch_cnt); + fprintf(stderr, "IO percent: %f\n", read_time / total_time * 100); + fprintf(stderr, "pull sparse time percent: %f\n", + pull_sparse_time / total_time * 100); + fprintf(stderr, "collect label time percent: %f\n", + collect_label_time / total_time * 100); + fprintf(stderr, "fill sparse time percent: %f\n", + fill_sparse_time / total_time * 100); + fprintf(stderr, "push sparse time percent: %f\n", + push_sparse_time / total_time * 100); + fprintf(stderr, "push dense time percent: %f\n", + push_dense_time / total_time * 100); + fprintf(stderr, "%6.2f instances/s\n", total_inst / total_time); + } + } + timeline.Start(); + } +} + +void DownpourWorker::TrainFiles() { + VLOG(3) << "Begin to train files"; + platform::SetNumThreads(1); + device_reader_->Start(); + int batch_cnt = 0; + int cur_batch; + while ((cur_batch = device_reader_->Next()) > 0) { + // pull sparse here + for (int i = 0; i < param_.program_config(0).pull_sparse_table_id_size(); + ++i) { + uint64_t tid = static_cast( + param_.program_config(0).pull_sparse_table_id(i)); + TableParameter table; + for (auto j : param_.sparse_table()) { + if (j.table_id() == tid) { + table = j; + break; + } + } + fleet_ptr_->PullSparseVarsSync(*thread_scope_, tid, + sparse_key_names_[tid], &features_[tid], + &feature_values_[tid], table.fea_dim()); + CollectLabelInfo(i); + FillSparseValue(i); + } + VLOG(3) << "fill sparse value for all sparse table done."; + + // do computation here + for (auto& op : ops_) { + bool need_skip = false; + for (auto t = 0u; t < skip_ops_.size(); ++t) { + if (op->Type().find(skip_ops_[t]) != std::string::npos) { + need_skip = true; + break; + } + } + if (!need_skip) { + op->Run(*thread_scope_, place_); + } + } + + if (need_to_push_sparse_) { + // push gradients here + for (int i = 0; i < param_.program_config(0).push_sparse_table_id_size(); + ++i) { + uint64_t tid = static_cast( + param_.program_config(0).push_sparse_table_id(i)); + TableParameter table; + for (auto i : param_.sparse_table()) { + if (i.table_id() == tid) { + table = i; + break; + } + } + fleet_ptr_->PushSparseVarsWithLabelAsync( + *thread_scope_, tid, features_[tid], feature_labels_[tid], + sparse_key_names_[tid], sparse_grad_names_[tid], table.emb_dim(), + &feature_grads_[tid], &push_sparse_status_, cur_batch, use_cvm_); + } + } + + if (need_to_push_dense_) { + for (int i = 0; i < param_.program_config(0).push_dense_table_id_size(); + ++i) { + uint64_t tid = static_cast( + param_.program_config(0).push_dense_table_id(i)); + fleet_ptr_->PushDenseVarsAsync( + *thread_scope_, tid, dense_grad_names_[tid], &push_sparse_status_); + } + + VLOG(3) << "push dense gradient done."; + + // the following code should be more precise and clean + // TODO(guru4elephant) + int32_t tmp_push_dense_wait_times = -1; + static uint32_t push_dense_wait_times = + static_cast(tmp_push_dense_wait_times); + + if (push_dense_status_.size() >= push_dense_wait_times) { + for (auto& t : push_dense_status_) { + t.wait(); + } + push_dense_status_.resize(0); + } + + if (tmp_push_dense_wait_times == -1) { + push_dense_status_.resize(0); + } + } + + if (need_to_push_sparse_) { + VLOG(3) << "push sparse gradient done."; + int32_t tmp_push_sparse_wait_times = -1; + static uint32_t push_sparse_wait_times = + static_cast(tmp_push_sparse_wait_times); + if (push_sparse_status_.size() >= push_sparse_wait_times) { + for (auto& t : push_sparse_status_) { + t.wait(); + } + push_sparse_status_.resize(0); + } + + if (tmp_push_sparse_wait_times == -1) { + push_sparse_status_.resize(0); + } + } + + if (need_to_push_dense_) { + for (int i = 0; i < param_.program_config(0).push_dense_table_id_size(); + ++i) { + uint64_t tid = static_cast( + param_.program_config(0).push_dense_table_id(i)); + pull_dense_worker_->IncreaseThreadVersion(thread_id_, tid); + } + } + + PrintFetchVars(); + thread_scope_->DropKids(); + ++batch_cnt; + } +} + +} // end namespace framework +} // end namespace paddle diff --git a/paddle/fluid/framework/eigen.h b/paddle/fluid/framework/eigen.h new file mode 100644 index 00000000..5bafa434 --- /dev/null +++ b/paddle/fluid/framework/eigen.h @@ -0,0 +1,119 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/tensor.h" +#include "unsupported/Eigen/CXX11/Tensor" + +namespace paddle { +namespace framework { + +// EigenDim converts paddle::platform::DDim into Eigen::DSizes. +template +struct EigenDim { + using Type = Eigen::DSizes; + + static Type From(const DDim& dims) { + PADDLE_ENFORCE(arity(dims) == D, "D must match arity(DDim)"); + Type ret; + for (int64_t d = 0; d < arity(dims); d++) { + ret[d] = dims[d]; + } + return ret; + } +}; + +// Interpret paddle::platform::Tensor as EigenTensor and EigenConstTensor. +template +struct EigenTensor { + // TODO(qijun) Now, default type in unaligned, and we will make a benchmark on + // the speed of aligned and unaligned version in future. + using Type = Eigen::TensorMap>; + + using ConstType = + Eigen::TensorMap>; + + static Type From(Tensor& tensor, DDim dims) { // NOLINT + return Type(tensor.data(), EigenDim::From(dims)); + } + + static Type From(Tensor& tensor) { // NOLINT + return From(tensor, tensor.dims_); + } // NOLINT + + static ConstType From(const Tensor& tensor, DDim dims) { + return ConstType(tensor.data(), EigenDim::From(dims)); + } + + static ConstType From(const Tensor& tensor) { + return From(tensor, tensor.dims_); + } +}; + +template +struct EigenMatrix : public EigenTensor { + static typename EigenMatrix::Type Reshape(Tensor& tensor, // NOLINT + int num_col_dims) { + int rank = tensor.dims_.size(); + PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank, + "`num_col_dims` must be between (0, rank_of_tensor)."); + return EigenMatrix::From(tensor, + flatten_to_2d(tensor.dims(), num_col_dims)); + } + + static typename EigenMatrix::ConstType Reshape(const Tensor& tensor, + int num_col_dims) { + int rank = tensor.dims_.size(); + PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank, + "`num_col_dims` must be between (0, rank_of_tensor)."); + return EigenMatrix::From(tensor, + flatten_to_2d(tensor.dims(), num_col_dims)); + } +}; + +template +struct EigenVector : public EigenTensor { + // Flatten reshapes a Tensor into an EigenVector. + static typename EigenVector::Type Flatten(Tensor& tensor) { // NOLINT + return EigenVector::From(tensor, {product(tensor.dims_)}); + } + + static typename EigenVector::ConstType Flatten( + const Tensor& tensor) { // NOLINT + return EigenVector::From(tensor, {product(tensor.dims_)}); + } +}; + +template +struct EigenScalar { + // Scalar tensor (implemented as a rank-0 tensor) of scalar type T. + using Type = Eigen::TensorMap< + Eigen::TensorFixedSize, MajorType, IndexType>>; + using ConstType = Eigen::TensorMap< + Eigen::TensorFixedSize, MajorType, IndexType>>; + + static Type From(Tensor& tensor) { return Type(tensor.data()); } // NOLINT + + static ConstType From(const Tensor& tensor) { + return ConstType(tensor.data()); + } +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/eigen_test.cc b/paddle/fluid/framework/eigen_test.cc new file mode 100644 index 00000000..bdc526d8 --- /dev/null +++ b/paddle/fluid/framework/eigen_test.cc @@ -0,0 +1,132 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/eigen.h" +#include + +namespace paddle { +namespace framework { + +TEST(EigenDim, From) { + EigenDim<3>::Type ed = EigenDim<3>::From(make_ddim({1, 2, 3})); + ASSERT_EQ(1, ed[0]); + ASSERT_EQ(2, ed[1]); + ASSERT_EQ(3, ed[2]); +} + +TEST(Eigen, Tensor) { + Tensor t; + float* p = t.mutable_data(make_ddim({1, 2, 3}), platform::CPUPlace()); + for (int i = 0; i < 1 * 2 * 3; i++) { + p[i] = static_cast(i); + } + + EigenTensor::Type et = EigenTensor::From(t); + + ASSERT_EQ(1, et.dimension(0)); + ASSERT_EQ(2, et.dimension(1)); + ASSERT_EQ(3, et.dimension(2)); + + for (int i = 0; i < 1; i++) { + for (int j = 0; j < 2; j++) { + for (int k = 0; k < 3; k++) { + ASSERT_NEAR((i * 2 + j) * 3 + k, et(i, j, k), 1e-6f); + } + } + } +} + +TEST(Eigen, ScalarFrom) { + Tensor t; + int* p = t.mutable_data(make_ddim({1}), platform::CPUPlace()); + *p = static_cast(100); + + EigenScalar::Type es = EigenScalar::From(t); + + ASSERT_EQ(0, es.dimension(0)); + ASSERT_EQ(100, es(0)); +} + +TEST(Eigen, VectorFrom) { + Tensor t; + float* p = t.mutable_data(make_ddim({6}), platform::CPUPlace()); + for (int i = 0; i < 6; i++) { + p[i] = static_cast(i); + } + + EigenVector::Type ev = EigenVector::From(t); + + ASSERT_EQ(6, ev.dimension(0)); + + for (int i = 0; i < 6; i++) { + ASSERT_NEAR(i, ev(i), 1e-6f); + } +} + +TEST(Eigen, VectorFlatten) { + Tensor t; + float* p = t.mutable_data(make_ddim({1, 2, 3}), platform::CPUPlace()); + for (int i = 0; i < 1 * 2 * 3; i++) { + p[i] = static_cast(i); + } + + EigenVector::Type ev = EigenVector::Flatten(t); + + ASSERT_EQ(1 * 2 * 3, ev.dimension(0)); + + for (int i = 0; i < 1 * 2 * 3; i++) { + ASSERT_NEAR(i, ev(i), 1e-6f); + } +} + +TEST(Eigen, Matrix) { + Tensor t; + float* p = t.mutable_data(make_ddim({2, 3}), platform::CPUPlace()); + for (int i = 0; i < 2 * 3; i++) { + p[i] = static_cast(i); + } + + EigenMatrix::Type em = EigenMatrix::From(t); + + ASSERT_EQ(2, em.dimension(0)); + ASSERT_EQ(3, em.dimension(1)); + + for (int i = 0; i < 2; i++) { + for (int j = 0; j < 3; j++) { + ASSERT_NEAR(i * 3 + j, em(i, j), 1e-6f); + } + } +} + +TEST(Eigen, MatrixReshape) { + Tensor t; + float* p = t.mutable_data({2, 3, 6, 4}, platform::CPUPlace()); + for (int i = 0; i < 2 * 3 * 6 * 4; ++i) { + p[i] = static_cast(i); + } + + EigenMatrix::Type em = EigenMatrix::Reshape(t, 2); + + ASSERT_EQ(2 * 3, em.dimension(0)); + ASSERT_EQ(6 * 4, em.dimension(1)); + + for (int i = 0; i < 2 * 3; i++) { + for (int j = 0; j < 6 * 4; j++) { + ASSERT_NEAR(i * 6 * 4 + j, em(i, j), 1e-6f); + } + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc new file mode 100644 index 00000000..cfab2f5f --- /dev/null +++ b/paddle/fluid/framework/executor.cc @@ -0,0 +1,498 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/executor.h" +#include +#include +#include +#include +#include +#include "google/protobuf/io/zero_copy_stream_impl.h" +#include "google/protobuf/message.h" +#include "google/protobuf/text_format.h" +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/trainer_desc.pb.h" +#include "paddle/fluid/framework/trainer_factory.h" +#include "paddle/fluid/framework/transfer_scope_cache.h" +#include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/operators/controlflow/recurrent_op_helper.h" +#include "paddle/fluid/operators/controlflow/while_op_helper.h" +#include "paddle/fluid/operators/distributed/distributed.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/profiler.h" + +#ifdef PADDLE_WITH_NGRAPH +#include "paddle/fluid/operators/ngraph/ngraph_engine.h" +DEFINE_bool(use_ngraph, false, "Use NGRAPH to run"); +#endif + +DECLARE_bool(benchmark); +DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run"); + +namespace paddle { +namespace framework { +namespace { +// block id starts from 0. This id is used to represent the codeblock +// wrapping the first block 0. +int kProgramId = -1; +} // namespace + +ExecutorPrepareContext::ExecutorPrepareContext( + const framework::ProgramDesc& prog, size_t block_id) + : prog_(prog), block_id_(block_id) {} + +void ExecutorPrepareContext::PrepareUnusedVars( + const std::vector& keep_vars, bool force_disable_gc) { + force_disable_gc_ = force_disable_gc; + if (GetEagerDeletionThreshold() < 0 || force_disable_gc_) { + return; + } + unused_vars_ = GetUnusedVars(prog_.Block(block_id_), ops_, keep_vars); +} + +ExecutorPrepareContext::~ExecutorPrepareContext() { + VLOG(5) << "destroy ExecutorPrepareContext"; +} + +Executor::Executor(const platform::Place& place) : place_(place) {} + +void Executor::Close() { +#ifdef PADDLE_WITH_DISTRIBUTE + // TODO(typhoonzero): complete message will need to use real trainer_id, + // except 0. + auto client = + paddle::operators::distributed::RPCClient::GetInstance(0); + client->SendComplete(); +#endif +} + +void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope, + int block_id) { + auto& global_block = pdesc.Block(block_id); + + const Scope* ancestor_scope = scope; + while (ancestor_scope->parent()) { + ancestor_scope = ancestor_scope->parent(); + } + + if (ancestor_scope != scope) { + for (auto& var : global_block.AllVars()) { + if (var->Name() == framework::kEmptyVarName) { + continue; + } + + if (var->Persistable()) { + auto* ptr = const_cast(ancestor_scope)->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + VLOG(3) << "Create Variable " << var->Name() + << " global, which pointer is " << ptr; + } else { + auto* ptr = scope->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + VLOG(3) << "Create Variable " << var->Name() + << " locally, which pointer is " << ptr; + } + } + } else { + for (auto& var : global_block.AllVars()) { + auto* ptr = scope->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + VLOG(3) << "Create variable " << var->Name() << ", which pointer is " + << ptr; + } + } +} + +void Executor::RunFromDataset(const ProgramDesc& main_program, Scope* scope, + Dataset* dataset, + const std::string& trainer_desc_str) { + VLOG(3) << "Start to RunFromDataset in executor"; + TrainerDesc trainer_desc; + bool success = trainer_desc.ParseFromString(trainer_desc_str); + PADDLE_ENFORCE(success, "Fail to parse TrainerDesc from string:\n%s", + trainer_desc_str.c_str()); + VLOG(3) << "Going to create trainer, trainer class is " + << trainer_desc.class_name(); + std::shared_ptr trainer; + trainer = TrainerFactory::CreateTrainer(trainer_desc.class_name()); + // initialize trainer + VLOG(3) << "Going to initialize trainer"; + trainer->Initialize(trainer_desc, dataset); + VLOG(3) << "Set root scope here"; + trainer->SetScope(scope); + // prepare training environment and helper environment + VLOG(3) << "Try to init train environment"; + trainer->InitTrainerEnv(main_program, place_); + VLOG(3) << "Try to init other environment"; + trainer->InitOtherEnv(main_program); + // training and finalize training + VLOG(3) << "Trainer starts to run"; + trainer->Run(); + VLOG(3) << "Trainer going to finalize"; + trainer->Finalize(); + return; +} + +void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, + bool create_local_scope, bool create_vars, + const std::vector& skip_ref_cnt_vars, + bool force_disable_gc) { + platform::RecordBlock b(block_id); + if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc); + auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc); + RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars); +} + +// Check whether the block already has feed operators and feed_holder. +// Return false if the block does not have any feed operators. +// If some feed operators have been prepended to the block, check that +// the info contained in these feed operators matches the feed_targets +// and feed_holder_name. Raise exception when any mismatch is found. +// Return true if the block has feed operators and holder of matching info. +static bool has_feed_operators( + const BlockDesc& block, + const std::map& feed_targets, + const std::string& feed_holder_name) { + size_t feed_count = 0; + for (auto* op : block.AllOps()) { + if (op->Type() == kFeedOpType) { + feed_count++; + // The input variable's name of feed_op should be feed_holder_name. + PADDLE_ENFORCE_EQ(op->Input("X")[0], feed_holder_name, + "Input to feed op should be '%s'", feed_holder_name); + std::string feed_target_name = op->Output("Out")[0]; + PADDLE_ENFORCE( + feed_targets.find(feed_target_name) != feed_targets.end(), + "Feed operator output name '%s' cannot be found in 'feed_targets'", + feed_target_name); + } + } + + if (feed_count > 0) { + PADDLE_ENFORCE_EQ( + feed_count, feed_targets.size(), + "The number of feed operators should match 'feed_targets'"); + + if (!feed_holder_name.empty()) { + // When feed operator are present, so should be feed_holder. + auto var = block.FindVar(feed_holder_name); + PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable", + feed_holder_name); + PADDLE_ENFORCE_EQ(var->GetType(), proto::VarType::FEED_MINIBATCH, + "'%s' variable should be 'FEED_MINIBATCH' type", + feed_holder_name); + } + } + + return feed_count > 0; +} + +// Check whether the block already has fetch operators and fetch_holder. +// Return false if the block does not have any fetch operators. +// If some fetch operators have been appended to the block, check that +// the info contained in these fetch operators matches the fetch_targets +// and fetch_holder_name. Raise exception when any mismatch is found. +// Return true if the block has fetch operators and holder of matching info. +static bool has_fetch_operators( + const BlockDesc& block, + const std::map& fetch_targets, + const std::string& fetch_holder_name) { + size_t fetch_count = 0; + for (auto* op : block.AllOps()) { + if (op->Type() == kFetchOpType) { + fetch_count++; + // The output variable's name of fetch_op should be fetch_holder_name. + PADDLE_ENFORCE_EQ(op->Output("Out")[0], fetch_holder_name, + "Output of fetch op should be '%s'", fetch_holder_name); + std::string fetch_target_name = op->Input("X")[0]; + PADDLE_ENFORCE( + fetch_targets.find(fetch_target_name) != fetch_targets.end(), + "Fetch operator input name '%s' cannot be found in 'fetch_targets'", + fetch_target_name); + } + } + + if (fetch_count > 0) { + PADDLE_ENFORCE_EQ( + fetch_count, fetch_targets.size(), + "The number of fetch operators should match 'fetch_targets'"); + + if (!fetch_holder_name.empty()) { + // When fetch operator are present, so should be fetch_holder. + auto var = block.FindVar(fetch_holder_name); + PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable", + fetch_holder_name); + PADDLE_ENFORCE_EQ(var->GetType(), proto::VarType::FETCH_LIST, + "'%s' variable should be 'FETCH_LIST' type", + fetch_holder_name); + } + } + + return fetch_count > 0; +} + +std::unique_ptr Executor::PrepareCtxCache( + const ProgramDesc& program, int block_id, + const std::vector& skip_ref_cnt_vars, bool force_disable_gc) { + return Prepare(program, block_id, skip_ref_cnt_vars, force_disable_gc); +} + +void Executor::Run(const ProgramDesc& program, Scope* scope, + std::map* feed_targets, + std::map* fetch_targets, + bool create_local_scope, bool create_vars, + const std::string& feed_holder_name, + const std::string& fetch_holder_name) { + platform::RecordBlock b(kProgramId); + if (FLAGS_use_mkldnn) EnableMKLDNN(program); + bool has_feed_ops = + has_feed_operators(program.Block(0), *feed_targets, feed_holder_name); + bool has_fetch_ops = + has_fetch_operators(program.Block(0), *fetch_targets, fetch_holder_name); + + ProgramDesc* copy_program = const_cast(&program); + std::unique_ptr unique_ptr_of_copy_program; + if (!has_feed_ops || !has_fetch_ops) { + unique_ptr_of_copy_program.reset(new ProgramDesc(program)); + copy_program = unique_ptr_of_copy_program.get(); + } + auto* global_block = copy_program->MutableBlock(0); + + if (!has_feed_ops) { + // create feed_holder variable + auto* feed_holder = global_block->Var(feed_holder_name); + feed_holder->SetType(proto::VarType::FEED_MINIBATCH); + feed_holder->SetPersistable(true); + + int i = 0; + for (auto& feed_target : (*feed_targets)) { + std::string var_name = feed_target.first; + VLOG(3) << "feed target's name: " << var_name; + + // prepend feed op + auto* op = global_block->PrependOp(); + op->SetType(kFeedOpType); + op->SetInput("X", {feed_holder_name}); + op->SetOutput("Out", {var_name}); + op->SetAttr("col", {static_cast(i)}); + op->CheckAttrs(); + + i++; + } + } + + if (!has_fetch_ops) { + // create fetch_holder variable + auto* fetch_holder = global_block->Var(fetch_holder_name); + fetch_holder->SetType(proto::VarType::FETCH_LIST); + fetch_holder->SetPersistable(true); + + int i = 0; + for (auto& fetch_target : (*fetch_targets)) { + std::string var_name = fetch_target.first; + VLOG(3) << "fetch target's name: " << var_name; + + // append fetch op + auto* op = global_block->AppendOp(); + op->SetType(kFetchOpType); + op->SetInput("X", {var_name}); + op->SetOutput("Out", {fetch_holder_name}); + op->SetAttr("col", {static_cast(i)}); + op->CheckAttrs(); + + i++; + } + } + + auto ctx = Prepare(*copy_program, 0); + RunPreparedContext(ctx.get(), scope, feed_targets, fetch_targets, + create_local_scope, create_vars, feed_holder_name, + fetch_holder_name); +} + +std::unique_ptr Executor::Prepare( + const ProgramDesc& program, int block_id, + const std::vector& skip_ref_cnt_vars, bool force_disable_gc) { + std::unique_ptr ctx( + new ExecutorPrepareContext(program, block_id)); + PADDLE_ENFORCE_LT(static_cast(block_id), program.Size()); + auto& block = program.Block(block_id); + for (auto& op_desc : block.AllOps()) { + ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc)); + } +#ifdef PADDLE_WITH_NGRAPH + if (FLAGS_use_ngraph && ctx->block_id_ == 0) { + paddle::operators::NgraphEngine::FuseNgraphOps( + ctx->prog_.Block(ctx->block_id_), &ctx->ops_); + } +#endif + ctx->PrepareUnusedVars(skip_ref_cnt_vars, force_disable_gc); + return ctx; +} + +std::vector> Executor::Prepare( + const ProgramDesc& program, const std::vector& block_ids, + const std::vector>& skip_ref_cnt_vars, + bool force_disable_gc) { + PADDLE_ENFORCE( + skip_ref_cnt_vars.empty() || skip_ref_cnt_vars.size() == block_ids.size(), + "skip_ref_cnt_vars should be either empty or equals to block number %d", + block_ids.size()); + std::vector> result; + size_t idx = 0; + for (auto& bid : block_ids) { + PADDLE_ENFORCE_LT(static_cast(bid), program.Size()); + auto* ctx = new ExecutorPrepareContext(program, bid); + auto& block = program.Block(bid); + for (auto& op_desc : block.AllOps()) { + ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc)); + } + if (skip_ref_cnt_vars.empty()) { + ctx->PrepareUnusedVars(std::vector(), force_disable_gc); + } else { + ctx->PrepareUnusedVars(skip_ref_cnt_vars[idx], force_disable_gc); + } + result.push_back(std::shared_ptr(ctx)); + ++idx; + } + return result; +} + +void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, + bool create_local_scope, bool create_vars, + bool keep_kids) { + platform::RecordBlock b(kProgramId); + PADDLE_ENFORCE_NOT_NULL(scope); + Scope* local_scope = scope; + if (create_vars) { + if (create_local_scope) { + local_scope = &scope->NewScope(); + } + CreateVariables(ctx->prog_, local_scope, ctx->block_id_); + } + + int64_t max_memory_size = GetEagerDeletionThreshold(); + std::unique_ptr gc; + // FIXME(zjl): recurrent_op is rather complex, we would + // disable gc forcely in recurrent_op + if (!ctx->force_disable_gc_ && max_memory_size >= 0) { +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(place_)) { + if (IsFastEagerDeletionModeEnabled()) { + gc.reset(new UnsafeFastGPUGarbageCollector( + boost::get(place_), max_memory_size)); + } else { + gc.reset(new DefaultStreamGarbageCollector( + boost::get(place_), max_memory_size)); + } + } else if (platform::is_cpu_place(place_)) { +#endif + gc.reset(new CPUGarbageCollector(boost::get(place_), + max_memory_size)); +#ifdef PADDLE_WITH_CUDA + } +#endif + // If gc is enabled and block size > 1 + if (gc && ctx->prog_.Size() > 1) { + operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(ctx->block_id_, + ctx->ops_); + operators::PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp( + ctx->block_id_, ctx->ops_); + } + } + + for (auto& op : ctx->ops_) { + op->Run(*local_scope, place_); + if (gc) { + DeleteUnusedTensors(*local_scope, op.get(), ctx->unused_vars_, gc.get()); + } + } + + platform::DeviceContextPool::Instance().Get(place_)->Wait(); + + if (local_scope != scope) { + scope->DeleteScope(local_scope); + } else { + if (!keep_kids) { + // By default, we should delete all kid scopes after run executor because + // some operators may create local scope when running, such as while_op. + // But when while_op also create a local executor to run it's sub block, + // the sub scopes it created should not be dropped immediately, because + // while_grad_op will use some variables created during while_op run, so + // we need to keep the kids and wait for the outer executor to drop them. + scope->DropKids(); + } + } +} + +void Executor::RunPreparedContext( + ExecutorPrepareContext* ctx, Scope* scope, + std::map* feed_targets, + std::map* fetch_targets, bool create_local_scope, + bool create_vars, const std::string& feed_holder_name, + const std::string& fetch_holder_name) { + auto& global_block = ctx->prog_.Block(ctx->block_id_); + + PADDLE_ENFORCE( + has_feed_operators(global_block, *feed_targets, feed_holder_name), + "Program in ExecutorPrepareContext should has feed_ops."); + PADDLE_ENFORCE( + has_fetch_operators(global_block, *fetch_targets, fetch_holder_name), + "Program in the prepared context should has fetch_ops."); + + // map the data of feed_targets to feed_holder + for (auto* op : global_block.AllOps()) { + if (op->Type() == kFeedOpType) { + std::string feed_target_name = op->Output("Out")[0]; + int idx = boost::get(op->GetAttr("col")); + SetFeedVariable(scope, *(*feed_targets)[feed_target_name], + feed_holder_name, idx); + } + } + + RunPreparedContext(ctx, scope, create_local_scope, create_vars); + + // obtain the data of fetch_targets from fetch_holder + for (auto* op : global_block.AllOps()) { + if (op->Type() == kFetchOpType) { + std::string fetch_target_name = op->Input("X")[0]; + int idx = boost::get(op->GetAttr("col")); + *(*fetch_targets)[fetch_target_name] = + GetFetchVariable(*scope, fetch_holder_name, idx); + } + } +} + +void Executor::EnableMKLDNN(const ProgramDesc& program) { +#ifdef PADDLE_WITH_MKLDNN + VLOG(3) << "use_mkldnn=True"; + for (size_t bid = 0; bid < program.Size(); ++bid) { + auto* block = const_cast(program).MutableBlock(bid); + for (auto* op : block->AllOps()) { + if (op->HasAttr("use_mkldnn")) { + op->SetAttr("use_mkldnn", true); + } + } + } +#else + LOG(WARNING) + << "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option"; +#endif +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h new file mode 100644 index 00000000..d0d12b30 --- /dev/null +++ b/paddle/fluid/framework/executor.h @@ -0,0 +1,129 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/data_set.h" +#include "paddle/fluid/framework/executor_gc_helper.h" +#include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { + +struct ExecutorPrepareContext { + ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id); + + ~ExecutorPrepareContext(); + + void PrepareUnusedVars(const std::vector& keep_vars, + bool force_disable_gc = false); + + const framework::ProgramDesc& prog_; + const size_t block_id_; + + std::vector> ops_; + + std::unordered_map> unused_vars_; + bool force_disable_gc_{false}; +}; + +class Executor { + public: + // TODO(dzhwinter) : Do not rely on this function, it will be removed + explicit Executor(const platform::DeviceContext& device) + : Executor(device.GetPlace()) {} + + explicit Executor(const platform::Place& place); + + /* + * Close this Executor. + * Calling this method will send complete messages to all pserver instances. + */ + void Close(); + + /* @Brief + * Runtime evaluation of the given ProgramDesc under certain Scope + * + * @param + * ProgramDesc + * Scope + */ + void Run(const ProgramDesc& prog, Scope* scope, int block_id, + bool create_local_scope = true, bool create_vars = true, + const std::vector& skip_ref_cnt_vars = + std::vector(), + bool force_disable_gc = false); + + // This API is very slow. + void Run(const ProgramDesc& program, Scope* scope, + std::map* feed_targets, + std::map* fetch_targets, + bool create_local_scope = true, bool create_vars = true, + const std::string& feed_holder_name = "feed", + const std::string& fetch_holder_name = "fetch"); + + // This API is very slow. + void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, + std::map* feed_targets, + std::map* fetch_targets, + bool create_local_scope = true, + bool create_vars = true, + const std::string& feed_holder_name = "feed", + const std::string& fetch_holder_name = "fetch"); + + std::unique_ptr PrepareCtxCache( + const ProgramDesc& program, int block_id, + const std::vector& skip_ref_cnt_vars = + std::vector(), + bool force_disable_gc = false); + + static std::unique_ptr Prepare( + const ProgramDesc& program, int block_id, + const std::vector& skip_ref_cnt_vars = + std::vector(), + bool force_disable_gc = false); + + static std::vector> Prepare( + const ProgramDesc& program, const std::vector& block_ids, + const std::vector>& skip_ref_cnt_vars = + std::vector>(), + bool force_disable_gc = false); + + void CreateVariables(const ProgramDesc& pdesc, Scope* scope, int block_id); + + void RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, + bool create_local_scope = true, + bool create_vars = true, bool keep_kids = false); + + void EnableMKLDNN(const ProgramDesc& program); + + void RunFromDataset(const ProgramDesc& main_program, Scope* scope, + Dataset* dataset, const std::string& trainer_desc_str); + + private: + const platform::Place place_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/executor_gc_helper.cc b/paddle/fluid/framework/executor_gc_helper.cc new file mode 100644 index 00000000..77b0977b --- /dev/null +++ b/paddle/fluid/framework/executor_gc_helper.cc @@ -0,0 +1,189 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/executor_gc_helper.h" +#include +#include +#include +#include +#include +#include +#include "glog/logging.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +struct OpInOutInfo { + public: + void Build(const OperatorBase *op) { + is_built_ = true; + auto &inferer = op->Info().NoNeedBufferVarsInferer(); + if (inferer) { + no_need_buffer_ins_ = inferer(op->Inputs(), op->Outputs(), op->Attrs()); + + if (no_need_buffer_ins_.empty()) return; + + for (auto &in_name_pair : op->Inputs()) { + if (no_need_buffer_ins_.count(in_name_pair.first) != 0) { + continue; + } + + for (auto &in_arg_name : in_name_pair.second) { + other_args_set_.insert(in_arg_name); + } + } + + for (auto &out_name_pair : op->Outputs()) { + for (auto &out_arg_name : out_name_pair.second) { + other_args_set_.insert(out_arg_name); + } + } + } + } + + bool IsBuilt() const { return is_built_; } + + bool IsInArgBufferNeeded(const std::string &in_arg_name) const { + return no_need_buffer_ins_.empty() || + other_args_set_.count(in_arg_name) != 0; + } + + private: + // A set to record unused buffer input vars of op + std::unordered_set no_need_buffer_ins_; + // A set to record other args of op (including in, out) + std::unordered_set other_args_set_; + bool is_built_{false}; +}; + +static bool VarCanBeDeleted(const std::string &name, const BlockDesc &block, + const std::unordered_set &skip_vars) { + if (skip_vars.count(name) != 0) { + return false; + } + + auto *var_desc = block.FindVar(name); + if (var_desc == nullptr || var_desc->Persistable()) { + return false; + } + + auto type = var_desc->Proto()->type().type(); + + return type == proto::VarType::LOD_TENSOR || + type == proto::VarType::SELECTED_ROWS || + type == proto::VarType::LOD_TENSOR_ARRAY; +} + +std::unordered_map> GetUnusedVars( + const BlockDesc &block, + const std::vector> &ops, + const std::vector &skip_var_list) { + std::unordered_set skip_vars(skip_var_list.begin(), + skip_var_list.end()); + + std::unordered_map var_op_idx_map; + + for (size_t i = 0; i < ops.size(); ++i) { + auto *op = ops[i].get(); + + OpInOutInfo info; + for (auto &name_pair : op->Inputs()) { + for (auto &name : name_pair.second) { + if (!VarCanBeDeleted(name, block, skip_vars)) { + continue; + } + + // var can be gc-ed + if (!info.IsBuilt()) { + info.Build(op); + } + + if (info.IsInArgBufferNeeded(name)) { + // Update the last living op of variable to current op + var_op_idx_map[name] = i; + } else { + VLOG(10) << "Skip reference count computing of variable " + << name_pair.first << "(" << name << ") in Operator " + << op->Type(); + } + } + } + + for (auto &name_pair : op->Outputs()) { + for (auto &name : name_pair.second) { + if (VarCanBeDeleted(name, block, skip_vars)) { + // Update the last living op of variable to current op + var_op_idx_map[name] = i; + } + } + } + } + + std::unordered_map> result; + for (auto &name_op_idx_pair : var_op_idx_map) { + auto &name = name_op_idx_pair.first; + size_t op_idx = name_op_idx_pair.second; + result[ops[op_idx].get()].emplace_back(name); + } + return result; +} + +void DeleteUnusedTensors( + const Scope &scope, OperatorBase *op, + const std::unordered_map> + &delete_vars_map, + GarbageCollector *gc) { + auto iter = delete_vars_map.find(op); + if (iter == delete_vars_map.end()) { + return; + } + + auto &delete_vars = iter->second; + + std::deque> garbages; + + for (auto &var_name : delete_vars) { + auto *var = scope.FindVar(var_name); + if (var == nullptr) { + continue; + } + + VLOG(2) << "Erase variable " << var_name; + if (var->IsType()) { + garbages.emplace_back(var->GetMutable()->MoveMemoryHolder()); + } else if (var->IsType()) { + garbages.emplace_back( + var->GetMutable()->mutable_value()->MoveMemoryHolder()); + } else if (var->IsType()) { + auto *lod_tensor_arr = var->GetMutable(); + for (auto &t : *lod_tensor_arr) { + garbages.emplace_back(t.MoveMemoryHolder()); + } + } else { + PADDLE_THROW("Type %s of %s is not supported eager deletion", + framework::ToTypeName(var->Type()), var_name); + } + } + + if (!garbages.empty()) { + gc->Add(std::move(garbages)); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/executor_gc_helper.h b/paddle/fluid/framework/executor_gc_helper.h new file mode 100644 index 00000000..8553273f --- /dev/null +++ b/paddle/fluid/framework/executor_gc_helper.h @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { + +// Result map: op -> variable names that can be deleted after op runs +std::unordered_map> GetUnusedVars( + const BlockDesc &block, + const std::vector> &ops, + const std::vector &skip_vars); + +// Collect unused tensors after op runs +void DeleteUnusedTensors( + const Scope &scope, OperatorBase *op, + const std::unordered_map> + &delete_vars_map, + GarbageCollector *gc); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc new file mode 100644 index 00000000..005d98c6 --- /dev/null +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -0,0 +1,698 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/executor_thread_worker.h" +#include +#include +#include "google/protobuf/io/zero_copy_stream_impl.h" +#include "google/protobuf/message.h" +#include "google/protobuf/text_format.h" + +#include "gflags/gflags.h" +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/timer.h" +#include "paddle/fluid/pybind/pybind.h" +namespace paddle { +namespace framework { + +#ifdef PADDLE_WITH_PSLIB +int DensePullThread::start() { + _running = true; + _t = std::thread(&DensePullThread::run, this); + return 0; +} + +void DensePullThread::run() { + while (_running) { + _pull_dense_status.resize(0); + for (auto& t : _dense_variable_name) { + if (check_update_param(t.first)) { + auto status = pull_dense(t.first); + _pull_dense_status.emplace_back(std::move(status)); + reset_thread_version(t.first); + } + } + if (_pull_dense_status.size() != 0) { + wait_all(); + } + + usleep(_sleep_time_ms * 1000); + } +} +bool DensePullThread::check_update_param(uint64_t table_id) { + { + std::lock_guard lock(_mutex_for_version); + auto& version = _training_versions[table_id]; + _current_version[table_id] = + *(std::min_element(version.begin(), version.end())); + } + if (_current_version[table_id] - _last_versions[table_id] < _threshold) { + return false; + } + return true; +} + +void DensePullThread::reset_thread_version(uint64_t table_id) { + std::lock_guard lock(_mutex_for_version); + _last_versions[table_id] = _current_version[table_id]; +} +std::future DensePullThread::pull_dense(uint64_t table_id) { + auto& regions = _regions[table_id]; + regions.clear(); + auto& variables = _dense_variable_name[table_id]; + regions.resize(variables.size()); + + for (auto i = 0u; i < variables.size(); ++i) { + auto& t = variables[i]; + Variable* var = _root_scope->FindVar(t); + LoDTensor* tensor = var->GetMutable(); + + float* w = tensor->data(); + paddle::ps::Region reg(w, tensor->numel()); + regions[i] = std::move(reg); + } + return _ps_client->pull_dense(regions.data(), regions.size(), table_id); +} + +void DensePullThread::wait_all() { + for (auto& t : _pull_dense_status) { + t.wait(); + auto status = t.get(); + if (status != 0) { + LOG(WARNING) << "pull dense failed times:" << ++_pull_dense_fail_times; + } + } + + if (_pull_dense_fail_times > 20) { + LOG(FATAL) << "pull dense failed times more than 20 times"; + exit(-1); + } + + _pull_dense_status.resize(0); +} + +void DensePullThread::increase_thread_version(int thread_id, + uint64_t table_id) { + std::lock_guard lock(_mutex_for_version); + _training_versions[table_id][thread_id]++; +} +#endif + +void ExecutorThreadWorker::CreateThreadOperators(const ProgramDesc& program) { + auto& block = program.Block(0); + op_names_.clear(); + for (auto& op_desc : block.AllOps()) { + std::unique_ptr local_op = OpRegistry::CreateOp(*op_desc); + op_names_.push_back(op_desc->Type()); + OperatorBase* local_op_ptr = local_op.release(); + ops_.push_back(local_op_ptr); + continue; + } +} + +void ExecutorThreadWorker::CreateThreadResource( + const framework::ProgramDesc& program, + const paddle::platform::Place& place) { + CreateThreadScope(program); + CreateThreadOperators(program); + SetMainProgram(program); + SetPlace(place); +} + +void ExecutorThreadWorker::CreateThreadScope(const ProgramDesc& program) { + auto& block = program.Block(0); + + PADDLE_ENFORCE_NOT_NULL( + root_scope_, "root_scope should be set before creating thread scope"); + + thread_scope_ = &root_scope_->NewScope(); + for (auto& var : block.AllVars()) { + if (var->Persistable()) { + auto* ptr = root_scope_->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + } else { + auto* ptr = thread_scope_->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + } + } +} + +void ExecutorThreadWorker::SetDataFeed( + const std::shared_ptr& datafeed) { + thread_reader_ = datafeed; +} + +void ExecutorThreadWorker::BindingDataFeedMemory() { + const std::vector& input_feed = + thread_reader_->GetUseSlotAlias(); + for (auto name : input_feed) { + thread_reader_->AddFeedVar(thread_scope_->Var(name), name); + } +} + +void ExecutorThreadWorker::SetFetchVarNames( + const std::vector& fetch_var_names) { + fetch_var_names_.clear(); + fetch_var_names_.insert(fetch_var_names_.end(), fetch_var_names.begin(), + fetch_var_names.end()); +} + +void ExecutorThreadWorker::SetDevice() { +#if defined _WIN32 || defined __APPLE__ + return; +#else + static unsigned concurrency_cap = std::thread::hardware_concurrency(); + LOG(WARNING) << "concurrency capacity " << concurrency_cap; + int thread_id = this->thread_id_; + + if (static_cast(thread_id) < concurrency_cap) { + unsigned proc = thread_id; + + cpu_set_t mask; + CPU_ZERO(&mask); + CPU_SET(proc, &mask); + + if (-1 == sched_setaffinity(0, sizeof(mask), &mask)) { + VLOG(1) << "WARNING: Failed to set thread affinity for thread " + << thread_id; + } else { + CPU_ZERO(&mask); + if ((0 != sched_getaffinity(0, sizeof(mask), &mask)) || + (CPU_ISSET(proc, &mask) == 0)) { + VLOG(3) << "WARNING: Failed to set thread affinity for thread " + << thread_id; + } + } + } else { + VLOG(1) << "WARNING: Failed to set thread affinity for thread " + << thread_id; + } +#endif +} + +template +void print_lod_tensor(std::string var_name, const LoDTensor& lod_tensor) { + auto inspect = lod_tensor.data(); + auto element_num = lod_tensor.numel(); + + std::ostringstream sstream; + sstream << var_name << " (element num " << element_num << "): ["; + sstream << inspect[0]; + for (int j = 1; j < element_num; ++j) { + sstream << " " << inspect[j]; + } + sstream << "]"; + + std::cout << sstream.str() << std::endl; +} + +static void print_fetch_var(Scope* scope, const std::string& var_name) { + auto& tensor = scope->FindVar(var_name)->Get(); + +#define PrintLoDTensorCallback(cpp_type, proto_type) \ + do { \ + if (tensor.type() == proto_type) { \ + print_lod_tensor(var_name, tensor); \ + return; \ + } \ + } while (0) + + _ForEachDataType_(PrintLoDTensorCallback); + VLOG(1) << "print_fetch_var: unrecognized data type:" << tensor.type(); +} + +void ExecutorThreadWorker::TrainFilesWithTimer() { + platform::SetNumThreads(1); + SetDevice(); + thread_reader_->Start(); + + std::vector op_total_time; + std::vector op_name; + for (auto& op : ops_) { + op_name.push_back(op->Type()); + } + op_total_time.resize(ops_.size()); + for (size_t i = 0; i < op_total_time.size(); ++i) { + op_total_time[i] = 0.0; + } + platform::Timer timeline; + double total_time = 0.0; + double read_time = 0.0; + int cur_batch; + int batch_cnt = 0; + timeline.Start(); + while ((cur_batch = thread_reader_->Next()) > 0) { + timeline.Pause(); + read_time += timeline.ElapsedSec(); + total_time += timeline.ElapsedSec(); + for (size_t i = 0; i < ops_.size(); ++i) { + timeline.Start(); + ops_[i]->Run(*thread_scope_, place_); + timeline.Pause(); + op_total_time[i] += timeline.ElapsedSec(); + total_time += timeline.ElapsedSec(); + } + ++batch_cnt; + thread_scope_->DropKids(); + if (thread_id_ == 0) { + if (batch_cnt > 0 && batch_cnt % 100 == 0) { + for (size_t i = 0; i < ops_.size(); ++i) { + fprintf(stderr, "op_name:[%zu][%s], op_mean_time:[%fs]\n", i, + op_name[i].c_str(), op_total_time[i] / batch_cnt); + } + fprintf(stderr, "mean read time: %fs\n", read_time / batch_cnt); + int fetch_var_num = fetch_var_names_.size(); + for (int i = 0; i < fetch_var_num; ++i) { + print_fetch_var(thread_scope_, fetch_var_names_[i]); + } + fprintf(stderr, "IO percent: %f\n", read_time / total_time); + } + } + timeline.Start(); + } +} + +void ExecutorThreadWorker::TrainFiles() { + platform::SetNumThreads(1); + + // todo: configurable + // SetDevice(); + + int fetch_var_num = fetch_var_names_.size(); + fetch_values_.clear(); + fetch_values_.resize(fetch_var_num); + + thread_reader_->Start(); + + int cur_batch; + int batch_cnt = 0; + while ((cur_batch = thread_reader_->Next()) > 0) { + // executor run here + for (auto& op : ops_) { + op->Run(*thread_scope_, place_); + } + + ++batch_cnt; + thread_scope_->DropKids(); + + if (debug_ == false || thread_id_ != 0) { + continue; + } + + for (int i = 0; i < fetch_var_num; ++i) { + print_fetch_var(thread_scope_, fetch_var_names_[i]); + } // end for (int i = 0...) + } // end while () +} + +void ExecutorThreadWorker::SetThreadId(int tid) { thread_id_ = tid; } + +void ExecutorThreadWorker::SetPlace(const platform::Place& place) { + place_ = place; +} + +void ExecutorThreadWorker::SetMainProgram( + const ProgramDesc& main_program_desc) { + main_program_.reset(new ProgramDesc(main_program_desc)); +} + +void ExecutorThreadWorker::SetRootScope(Scope* g_scope) { + root_scope_ = g_scope; +} + +#ifdef PADDLE_WITH_PSLIB +// AsyncExecutor +void AsyncExecutorThreadWorker::TrainFiles() { + SetDevice(); + + int fetch_var_num = fetch_var_names_.size(); + fetch_values_.clear(); + fetch_values_.resize(fetch_var_num); + + thread_reader_->Start(); + + int cur_batch; + int batch_cnt = 0; + while ((cur_batch = thread_reader_->Next()) > 0) { + // executor run here + TrainOneNetwork(); + + ++batch_cnt; + thread_scope_->DropKids(); + + if (debug_ == false || thread_id_ != 0) { + continue; + } + + for (int i = 0; i < fetch_var_num; ++i) { + print_fetch_var(thread_scope_, fetch_var_names_[i]); + } // end for (int i = 0...) + } // end while () +} + +void AsyncExecutorThreadWorker::SetPSlibPtr( + std::shared_ptr pslib_ptr) { + _pslib_ptr = pslib_ptr; +} + +void AsyncExecutorThreadWorker::SetPullDenseThread( + std::shared_ptr dpt) { + _pull_dense_thread = dpt; +} + +void AsyncExecutorThreadWorker::TrainOneNetwork() { + PrepareParams(); + + for (auto& op : ops_) { + if (op->Type().find("sgd") != std::string::npos) { + continue; + } + bool need_skip = false; + for (auto t = 0u; t < _param_config->skip_op.size(); ++t) { + if (op->Type().find(_param_config->skip_op[t]) != std::string::npos) { + need_skip = true; + break; + } + } + if (!need_skip) { + op->Run(*thread_scope_, place_); + } + } + UpdateParams(); +} + +void AsyncExecutorThreadWorker::SetParamConfig( + AsyncWorkerParamConfig* param_config) { + _param_config = param_config; +} + +void AsyncExecutorThreadWorker::PrepareParams() { + for (auto table_id : _param_config->sparse_table_id) { + PullSparse(table_id); + for (auto& t : _pull_sparse_status) { + t.wait(); + auto status = t.get(); + if (status != 0) { + LOG(ERROR) << "pull sparse failed, status[" << status << "]"; + exit(-1); + } + } + } + _pull_sparse_status.resize(0); + + for (auto table_id : _param_config->sparse_table_id) { + FillSparse(table_id); + } +} + +void AsyncExecutorThreadWorker::UpdateParams() { + for (auto i : _param_config->sparse_table_id) { + PushSparse(i); + } + for (auto i : _param_config->dense_table_id) { + PushDense(i); + } + int32_t tmp_push_dense_wait_times = -1; + int32_t tmp_push_sparse_wait_times = -1; + static uint32_t push_dense_wait_times = + static_cast(tmp_push_dense_wait_times); + static uint32_t push_sparse_wait_times = + static_cast(tmp_push_sparse_wait_times); + + if (_push_dense_status.size() >= push_dense_wait_times) { + for (auto& t : _push_dense_status) { + t.wait(); + } + _push_dense_status.resize(0); + } + if (tmp_push_dense_wait_times == -1) { + _push_dense_status.resize(0); + } + if (_push_sparse_status.size() >= push_sparse_wait_times) { + for (auto& t : _push_sparse_status) { + t.wait(); + } + _push_sparse_status.resize(0); + } + if (tmp_push_sparse_wait_times == -1) { + _push_sparse_status.resize(0); + } + for (auto dense_table_id : _param_config->dense_table_id) { + _pull_dense_thread->increase_thread_version(thread_id_, dense_table_id); + } +} + +void AsyncExecutorThreadWorker::PushDense(int table_id) { + std::vector regions; + for (auto& t : _param_config->dense_gradient_variable_name[table_id]) { + Variable* var = thread_scope_->FindVar(t); + CHECK(var != nullptr) << "var[" << t << "] not found"; + LoDTensor* tensor = var->GetMutable(); + int count = tensor->numel(); + float* g = tensor->data(); + paddle::ps::Region reg(g, count); + regions.emplace_back(std::move(reg)); + } + + auto status = _pslib_ptr->_worker_ptr->push_dense(regions.data(), + regions.size(), table_id); + _push_dense_status.push_back(std::move(status)); +} + +void AsyncExecutorThreadWorker::PullSparse(int table_id) { + auto& features = _features[table_id]; + auto& feature_value = _feature_value[table_id]; + auto fea_dim = _param_config->fea_dim; + // slot id starts from 1 + features.clear(); + features.resize(0); + features.reserve(MAX_FEASIGN_NUM); + const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); + // slot_idx = 0 is label TODO + for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { + Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); + LoDTensor* tensor = var->GetMutable(); + int64_t* ids = tensor->data(); + int len = tensor->numel(); + for (auto i = 0u; i < len; ++i) { + // todo(colourful-tree): current trick - filter feasign=use_slot_mod( + // bug: datafeed fill use_slot_mod for empty slot) + if (ids[i] == 0u) { + continue; + } + features.push_back(static_cast(ids[i])); + } + } + check_pull_push_memory(features, &feature_value, fea_dim); + + std::vector pull_feature_value; + for (auto i = 0u; i < features.size(); ++i) { + pull_feature_value.push_back(feature_value[i].data()); + } + + auto status = _pslib_ptr->_worker_ptr->pull_sparse( + pull_feature_value.data(), table_id, features.data(), features.size()); + _pull_sparse_status.push_back(std::move(status)); + + auto& push_g = _feature_push_value[table_id]; + check_pull_push_memory(features, &push_g, fea_dim); + collect_feasign_info(table_id); +} + +void AsyncExecutorThreadWorker::FillSparse(int table_id) { + auto slot_dim = _param_config->slot_dim; + auto fea_dim = _param_config->fea_dim; + auto& features = _features[table_id]; + auto& fea_value = _feature_value[table_id]; + + CHECK(features.size() > 0) << "feature size check failed"; + + auto fea_idx = 0u; + + std::vector init_value(fea_dim); + + const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); + // slot_idx = 0 is label TODO + for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { + Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); + LoDTensor* tensor = var->GetMutable(); + int64_t* ids = tensor->data(); + int len = tensor->numel(); + Variable* var_emb = thread_scope_->FindVar( + _param_config->slot_input_vec[table_id][slot_idx - 1]); + LoDTensor* tensor_emb = var_emb->GetMutable(); + float* ptr = + tensor_emb->mutable_data({len, slot_dim}, platform::CPUPlace()); + memset(ptr, 0, sizeof(float) * len * slot_dim); + auto& tensor_lod = tensor->lod()[0]; + + LoD data_lod{tensor_lod}; + tensor_emb->set_lod(data_lod); + + for (auto index = 0u; index < len; ++index) { + if (ids[index] == 0u) { + memcpy(ptr + slot_dim * index, init_value.data() + 2, + sizeof(float) * slot_dim); + continue; + } + memcpy(ptr + slot_dim * index, fea_value[fea_idx].data() + 2, + sizeof(float) * slot_dim); + fea_idx++; + } + } +} + +void AsyncExecutorThreadWorker::PushSparse(int table_id) { + auto slot_dim = _param_config->slot_dim; + auto fea_dim = _param_config->fea_dim; + auto& features = _features[table_id]; + auto& push_g = _feature_push_value[table_id]; + check_pull_push_memory(features, &push_g, fea_dim); + CHECK(push_g.size() == features.size() + 1) + << "push_g size:" << push_g.size() + << " features size:" << features.size(); + uint64_t fea_idx = 0u; + auto& fea_info = _fea_info[table_id]; + int offset = 2; + const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); + // slot_idx = 0 is label + for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { + if (_param_config->slot_alias_to_table.find(feed_vec[slot_idx]) == + _param_config->slot_alias_to_table.end()) { + LOG(ERROR) << "ERROR slot_idx:" << slot_idx + << " name:" << feed_vec[slot_idx]; + } else if (_param_config->slot_alias_to_table[feed_vec[slot_idx]] != + table_id) { + continue; + } + Variable* g_var = thread_scope_->FindVar( + _param_config->gradient_var[table_id][slot_idx - 1]); + CHECK(g_var != nullptr) + << "var[" << _param_config->gradient_var[table_id][slot_idx - 1] + << "] not found"; + LoDTensor* g_tensor = g_var->GetMutable(); + if (g_tensor == NULL) { + LOG(ERROR) << "var[" + << _param_config->gradient_var[table_id][slot_idx - 1] + << "] not found"; + exit(-1); + } + float* g = g_tensor->data(); + + Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); + CHECK(var != nullptr) << "var[" << feed_vec[slot_idx] << "] not found"; + LoDTensor* tensor = var->GetMutable(); + if (tensor == NULL) { + LOG(ERROR) << "var[" << feed_vec[slot_idx] << "] not found"; + exit(-1); + } + int len = tensor->numel(); + CHECK(slot_dim * len == g_tensor->numel()) + << "len:" << len << " g_numel:" << g_tensor->numel(); + CHECK(len == tensor->numel()) << "len:" << len + << "t_numel:" << tensor->numel(); + int64_t* ids = tensor->data(); + for (auto id_idx = 0u; id_idx < len; ++id_idx) { + if (ids[id_idx] == 0) { + g += slot_dim; + continue; + } + memcpy(push_g[fea_idx].data() + offset, g, sizeof(float) * slot_dim); + push_g[fea_idx][0] = 1.0f; + CHECK(fea_idx < fea_info.size()) << "fea_idx:" << fea_idx + << " size:" << fea_info.size(); + push_g[fea_idx][1] = static_cast(fea_info[fea_idx].label); + g += slot_dim; + fea_idx++; + } + } + CHECK(fea_idx == features.size()) << "fea_idx:" << fea_idx + << " features size:" << features.size(); + CHECK_GT(features.size(), 0); + + std::vector push_g_vec; + for (auto i = 0u; i < features.size(); ++i) { + push_g_vec.push_back(push_g[i].data()); + } + auto status = _pslib_ptr->_worker_ptr->push_sparse( + table_id, features.data(), (const float**)push_g_vec.data(), + features.size()); + _push_sparse_status.push_back(std::move(status)); +} + +void AsyncExecutorThreadWorker::collect_feasign_info(int table_id) { + auto& fea_info = _fea_info[table_id]; + auto& feature = _features[table_id]; + fea_info.resize(feature.size()); + const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); + Variable* var = thread_scope_->FindVar(feed_vec[0]); + LoDTensor* tensor = var->GetMutable(); + int64_t* label = tensor->data(); + + int global_index = 0; + for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { + Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); + LoDTensor* tensor = var->GetMutable(); + int64_t* ids = tensor->data(); + + int fea_idx = 0; + for (auto ins_idx = 1u; ins_idx < tensor->lod()[0].size(); ++ins_idx) { + for (; fea_idx < tensor->lod()[0][ins_idx]; ++fea_idx) { + if (ids[fea_idx] == 0u) { + continue; + } + FeasignInfo info{slot_idx, ins_idx, label[ins_idx - 1]}; + + fea_info[global_index++] = std::move(info); + } + } + } + CHECK(global_index == feature.size()) + << "expect fea info size:" << feature.size() << " real:" << global_index; +} + +void AsyncExecutorThreadWorker::check_pull_push_memory( + const std::vector& features, + std::vector>* push_g, int dim) { + push_g->resize(features.size() + 1); + for (auto& t : *push_g) { + t.resize(dim); + } +} + +void AsyncExecutorThreadWorker::check_pull_push_memory( + const std::vector& features, std::vector* push_g, + int dim) { + if (features.size() > push_g->size()) { + push_g->reserve(features.size() + 1); + auto size = features.size() - push_g->size() + 1; + for (auto i = 0u; i < size; ++i) { + float* ptr = new float[dim]; + push_g->push_back(ptr); + } + } +} +#endif + +} // einit_modelnd namespace framework +} // end namespace paddle diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h new file mode 100644 index 00000000..524922b0 --- /dev/null +++ b/paddle/fluid/framework/executor_thread_worker.h @@ -0,0 +1,245 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include // NOLINT +#include +#include +#include // NOLINT +#include +#include "paddle/fluid/framework/data_feed.h" +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#ifdef PADDLE_WITH_PSLIB +#include +#endif + +namespace paddle { +namespace framework { + +void CreateTensor(Variable* var, proto::VarType::Type var_type); +#ifdef PADDLE_WITH_PSLIB +static const uint32_t MAX_FEASIGN_NUM = 1000 * 100 * 100; + +struct AsyncWorkerParamConfig { + int slot_dim; + int fea_dim; + int32_t tmp_push_dense_wait_times; + int32_t tmp_push_sparse_wait_times; + + std::vector skip_op; + + std::map> dense_variable_name; + std::map> dense_gradient_variable_name; + std::vector dense_table_id; + // fea_dim for each dense table + std::vector dense_table_size; + std::vector sparse_table_id; + std::map> slot_input_vec; + std::map> gradient_var; + std::map slot_alias_to_table; +}; + +struct DensePullThreadParam { + std::shared_ptr ps_client; + int threshold; + int training_thread_num; + Scope* root_scope; + std::map>* dense_params; + int sleep_time_ms = 2; +}; + +class DensePullThread { + public: + explicit DensePullThread(const DensePullThreadParam& param) + : _running(false) { + _ps_client = param.ps_client; + _threshold = param.threshold; + _thread_num = param.training_thread_num; + _root_scope = param.root_scope; + _sleep_time_ms = param.sleep_time_ms; + + for (auto& t : *param.dense_params) { + _dense_variable_name[t.first].insert(_dense_variable_name[t.first].end(), + t.second.begin(), t.second.end()); + _training_versions[t.first].resize(_thread_num, 0); + _last_versions[t.first] = 0; + _current_version[t.first] = 0; + } + } + + int start(); + + void stop() { + if (_running) { + _running = false; + _t.join(); + } + } + + void increase_thread_version(int thread_id, uint64_t table_id); + void reset_thread_version(uint64_t table_id); + std::future pull_dense(uint64_t table_id); + void pull_dense2(uint64_t table_id); + void wait_all(); + + private: + void run(); + bool check_update_param(uint64_t table_id); + + private: + std::shared_ptr _ps_client; + int _thread_num; + int _threshold; + int _sleep_time_ms; + Scope* _root_scope; + bool _running; + + std::map _last_versions; + std::map _current_version; + std::mutex _mutex_for_version; + std::map> _training_versions; + std::map> _dense_variable_name; + + std::thread _t; + + std::vector<::std::future> _pull_dense_status; + + std::map> _regions; + uint32_t _pull_dense_fail_times = 0; + + std::vector _base_norm_param; + std::vector _mean; + std::vector _scale; + float _squared_sum_epsilon = 1e-4; + std::mutex _mutex_for_mean_scale; + + float _total_batch_num = 0; +}; +#endif + +class ExecutorThreadWorker { + public: + ExecutorThreadWorker() + : thread_id_(-1), root_scope_(NULL), thread_scope_(NULL), debug_(false) {} + virtual ~ExecutorThreadWorker() {} + + void CreateThreadResource(const framework::ProgramDesc& program, + const paddle::platform::Place& place); + void SetThreadId(int tid); + void SetDebug(const bool debug) { debug_ = debug; } + void SetRootScope(Scope* g_scope); + // set cpu device in this function + // cpu binding is used by default + void SetDevice(); + // since we read data into memory that can not be accessed by program + // we need to bind memory of data with corresponding variables in program + // this function should be called after data feed is set + void BindingDataFeedMemory(); + // set data feed declared in executor + void SetDataFeed(const std::shared_ptr& datafeed); + // A multi-thread training function + virtual void TrainFiles(); + // with timer log + virtual void TrainFilesWithTimer(); + // set fetch variable names from python interface assigned by users + void SetFetchVarNames(const std::vector& fetch_var_names); +#ifdef PADDLE_WITH_PSLIB + virtual void SetPSlibPtr( + std::shared_ptr pslib_ptr) {} + virtual void SetPullDenseThread(std::shared_ptr dpt) {} + virtual void SetParamConfig(AsyncWorkerParamConfig* param_config) {} +#endif + + private: + void CreateThreadScope(const framework::ProgramDesc& program); + void CreateThreadOperators(const framework::ProgramDesc& program); + void SetMainProgram(const ProgramDesc& main_program_desc); + void SetPlace(const paddle::platform::Place& place); + + protected: + // thread index + std::shared_ptr thread_reader_; // shared queue, thread buffer + int thread_id_; + // operator name + std::vector op_names_; + // thread level, local operators for forward and backward + std::vector ops_; + // main program for training + std::unique_ptr main_program_; + // execution place + platform::Place place_; + // root scope for model parameters + Scope* root_scope_; + // a thread scope, father scope is global score which is shared + Scope* thread_scope_; + std::vector fetch_var_names_; + std::vector> fetch_values_; + bool debug_; +}; + +#ifdef PADDLE_WITH_PSLIB +class AsyncExecutorThreadWorker : public ExecutorThreadWorker { + public: + AsyncExecutorThreadWorker() {} + virtual ~AsyncExecutorThreadWorker() {} + void SetPSlibPtr(std::shared_ptr pslib_ptr); + void SetPullDenseThread(std::shared_ptr dpt); + void SetParamConfig(AsyncWorkerParamConfig* param_config); + void TrainFiles(); + void TrainOneNetwork(); + void PrepareParams(); + void UpdateParams(); + void PullSparse(int table_id); + void FillSparse(int table_id); + void PushSparse(int table_id); + void PushDense(int table_id); + + void check_pull_push_memory(const std::vector& features, + std::vector* push_g, int dim); + void check_pull_push_memory(const std::vector& features, + std::vector>* push_g, int dim); + void collect_feasign_info(int table_id); + + private: + struct FeasignInfo { + uint32_t slot; + uint32_t ins; + int64_t label; + }; + + std::map> _features; + std::map> _fea_info; + std::map>> _feature_value; + std::map>> _feature_push_value; + + std::shared_ptr _pslib_ptr; + + std::shared_ptr _pull_dense_thread; + + std::vector<::std::future> _pull_sparse_status; + std::vector<::std::future> _pull_dense_status; + std::vector<::std::future> _push_sparse_status; + std::vector<::std::future> _push_dense_status; + + AsyncWorkerParamConfig* _param_config; +}; +#endif + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/expect.h b/paddle/fluid/framework/expect.h new file mode 100644 index 00000000..146f4de9 --- /dev/null +++ b/paddle/fluid/framework/expect.h @@ -0,0 +1,32 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined _WIN32 || defined __APPLE__ +#else +#define _LINUX +#endif + +#ifdef _LINUX +#ifndef likely +#define likely(x) __builtin_expect((x), 1) +#endif +#endif + +#ifdef _LINUX +#ifndef unlikely +#define unlikely(x) __builtin_expect((x), 0) +#endif +#endif diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc new file mode 100644 index 00000000..96530b2a --- /dev/null +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -0,0 +1,67 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/feed_fetch_method.h" +#include +#include +#include "glog/logging.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace framework { + +void SetFeedVariable(Scope* scope, const LoDTensor& input, + const std::string& var_name, size_t index) { + // If var_name Variable is not found in GlobalScope, a new variable will + // be created. + VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index; + Variable* g_feed_value = scope->Var(var_name); + auto& feed_inputs = *(g_feed_value->GetMutable()); + if (index >= feed_inputs.size()) { + feed_inputs.resize(index + 1); + } + // shared data with input tensor + feed_inputs[index].ShareDataWith(input); + // set lod + feed_inputs[index].set_lod(input.lod()); +} + +LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name, + size_t index) { + // Since we want to fetch LodTensor from a variable, the variable must + // be created alreadly. + Variable* g_fetch_value = scope.FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL(g_fetch_value, "%s is not found.", var_name); + PADDLE_ENFORCE(g_fetch_value->IsType(), + "Only %s can be invoked by GetFetchVariable", + typeid(FeedFetchList).name()); + auto& fetch_outputs = *g_fetch_value->GetMutable(); + auto& tensor = fetch_outputs[index]; + VLOG(3) << "Fetch " << var_name << " with index " << index + << " shape= " << tensor.dims(); + PADDLE_ENFORCE_LT(index, fetch_outputs.size()); + return tensor; +} + +LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name) { + Variable* var = scope.FindVar(var_name); + PADDLE_ENFORCE(var, "%s no in scope", var_name); + PADDLE_ENFORCE(var->IsType(), "Only support lod tensor now."); + return *var->GetMutable(); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/feed_fetch_method.h b/paddle/fluid/framework/feed_fetch_method.h new file mode 100644 index 00000000..031f8e01 --- /dev/null +++ b/paddle/fluid/framework/feed_fetch_method.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { + +void SetFeedVariable(Scope* scope, const LoDTensor& input, + const std::string& var_name, size_t index); + +LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name, + size_t index); + +LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/feed_fetch_type.h b/paddle/fluid/framework/feed_fetch_type.h new file mode 100644 index 00000000..fae792ad --- /dev/null +++ b/paddle/fluid/framework/feed_fetch_type.h @@ -0,0 +1,28 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace framework { +using FeedFetchType = LoDTensor; +using FeedFetchList = std::vector; + +static const char kFeedOpType[] = "feed"; +static const char kFetchOpType[] = "fetch"; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt new file mode 100644 index 00000000..12fc454f --- /dev/null +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -0,0 +1,7 @@ +if(WITH_PSLIB) + cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope pslib_brpc pslib) +else() + cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope) +endif(WITH_PSLIB) + +cc_library(nccl_wrapper SRCS nccl_wrapper.cc DEPS framework_proto variable_helper scope) diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc new file mode 100644 index 00000000..62221f4a --- /dev/null +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -0,0 +1,625 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/fleet/fleet_wrapper.h" +#include +#include +#include "paddle/fluid/framework/data_feed.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { + +const uint32_t MAX_FEASIGN_NUM = 1024 * 100 * 100; +std::shared_ptr FleetWrapper::s_instance_ = NULL; +bool FleetWrapper::is_initialized_ = false; + +#ifdef PADDLE_WITH_PSLIB +template +paddle::ps::Archive& operator<<(paddle::ps::Archive& ar, + const MultiSlotType& ins) { + ar << ins.GetType(); + ar << ins.GetOffset(); + ar << ins.GetFloatData(); + ar << ins.GetUint64Data(); + return ar; +} + +template +paddle::ps::Archive& operator>>(paddle::ps::Archive& ar, + MultiSlotType& ins) { + ar >> ins.MutableType(); + ar >> ins.MutableOffset(); + ar >> ins.MutableFloatData(); + ar >> ins.MutableUint64Data(); + return ar; +} +#endif + +#ifdef PADDLE_WITH_PSLIB +std::shared_ptr FleetWrapper::pslib_ptr_ = NULL; +#endif + +void FleetWrapper::InitServer(const std::string& dist_desc, int index) { +#ifdef PADDLE_WITH_PSLIB + if (!is_initialized_) { + VLOG(3) << "Going to init server"; + pslib_ptr_ = std::shared_ptr( + new paddle::distributed::PSlib()); + pslib_ptr_->init_server(dist_desc, index); + is_initialized_ = true; + } else { + VLOG(3) << "Server can be initialized only once"; + } +#endif +} + +void FleetWrapper::InitWorker(const std::string& dist_desc, + const std::vector& host_sign_list, + int node_num, int index) { +#ifdef PADDLE_WITH_PSLIB + if (!is_initialized_) { + VLOG(3) << "Going to init worker"; + pslib_ptr_ = std::shared_ptr( + new paddle::distributed::PSlib()); + pslib_ptr_->init_worker(dist_desc, + const_cast(host_sign_list.data()), + node_num, index); + is_initialized_ = true; + } else { + VLOG(3) << "Worker can be initialized only once"; + } +#endif +} + +void FleetWrapper::StopServer() { +#ifdef PADDLE_WITH_PSLIB + VLOG(3) << "Going to stop server"; + pslib_ptr_->stop_server(); +#endif +} + +uint64_t FleetWrapper::RunServer() { +#ifdef PADDLE_WITH_PSLIB + VLOG(3) << "Going to run server"; + return pslib_ptr_->run_server(); +#else + return 0; +#endif +} + +void FleetWrapper::GatherServers(const std::vector& host_sign_list, + int node_num) { +#ifdef PADDLE_WITH_PSLIB + VLOG(3) << "Going to gather server ips"; + pslib_ptr_->gather_servers(const_cast(host_sign_list.data()), + node_num); +#endif +} + +void FleetWrapper::GatherClients(const std::vector& host_sign_list) { +#ifdef PADDLE_WITH_PSLIB + VLOG(3) << "Going to gather client ips"; + size_t len = host_sign_list.size(); + pslib_ptr_->gather_clients(const_cast(host_sign_list.data()), len); +#endif +} + +std::vector FleetWrapper::GetClientsInfo() { +#ifdef PADDLE_WITH_PSLIB + VLOG(3) << "Going to get client info"; + return pslib_ptr_->get_client_info(); +#endif + return std::vector(); +} + +void FleetWrapper::CreateClient2ClientConnection() { +#ifdef PADDLE_WITH_PSLIB + VLOG(3) << "Going to create client2client connection"; + pslib_ptr_->create_client2client_connection(); +#endif +} + +void FleetWrapper::PullSparseVarsSync( + const Scope& scope, const uint64_t table_id, + const std::vector& var_names, std::vector* fea_keys, + std::vector>* fea_values, int fea_value_dim) { +#ifdef PADDLE_WITH_PSLIB + std::vector<::std::future> pull_sparse_status; + pull_sparse_status.resize(0); + fea_keys->clear(); + fea_keys->resize(0); + fea_keys->reserve(MAX_FEASIGN_NUM); + for (auto name : var_names) { + Variable* var = scope.FindVar(name); + if (var == nullptr) { + continue; + } + LoDTensor* tensor = var->GetMutable(); + CHECK(tensor != nullptr) << "tensor of var " << name << " is null"; + int64_t* ids = tensor->data(); + int len = tensor->numel(); + for (auto i = 0u; i < len; ++i) { + if (ids[i] == 0u) { + continue; + } + fea_keys->push_back(static_cast(ids[i])); + } + } + fea_values->resize(fea_keys->size() + 1); + for (auto& t : *fea_values) { + t.resize(fea_value_dim); + } + std::vector pull_result_ptr; + for (auto& t : *fea_values) { + pull_result_ptr.push_back(t.data()); + } + auto status = pslib_ptr_->_worker_ptr->pull_sparse( + pull_result_ptr.data(), table_id, fea_keys->data(), fea_keys->size()); + pull_sparse_status.push_back(std::move(status)); + for (auto& t : pull_sparse_status) { + t.wait(); + auto status = t.get(); + if (status != 0) { + LOG(ERROR) << "fleet pull sparse failed, status[" << status << "]"; + exit(-1); + } + } +#endif +} + +void FleetWrapper::PullDenseVarsAsync( + const Scope& scope, const uint64_t tid, + const std::vector& var_names, + std::vector<::std::future>* pull_dense_status) { +#ifdef PADDLE_WITH_PSLIB + auto& regions = _regions[tid]; + regions.clear(); + regions.resize(var_names.size()); + for (auto i = 0u; i < var_names.size(); ++i) { + Variable* var = scope.FindVar(var_names[i]); + LoDTensor* tensor = var->GetMutable(); + float* w = tensor->data(); + paddle::ps::Region reg(w, tensor->numel()); + regions[i] = std::move(reg); + } + auto status = + pslib_ptr_->_worker_ptr->pull_dense(regions.data(), regions.size(), tid); + pull_dense_status->push_back(std::move(status)); +#endif +} + +void FleetWrapper::PullDenseVarsSync( + const Scope& scope, const uint64_t tid, + const std::vector& var_names) { +#ifdef PADDLE_WITH_PSLIB + auto& regions = _regions[tid]; + regions.clear(); + regions.reserve(var_names.size()); + for (auto& t : var_names) { + Variable* var = scope.FindVar(t); + LoDTensor* tensor = var->GetMutable(); + float* w = tensor->data(); + paddle::ps::Region reg(w, tensor->numel()); + regions.emplace_back(std::move(reg)); + } + auto status = + pslib_ptr_->_worker_ptr->pull_dense(regions.data(), regions.size(), tid); + status.wait(); +#endif +} + +void FleetWrapper::PushDenseParamSync( + const Scope& scope, const uint64_t table_id, + const std::vector& var_names) { +#ifdef PADDLE_WITH_PSLIB + auto place = platform::CPUPlace(); + std::vector regions; + for (auto& t : var_names) { + Variable* var = scope.FindVar(t); + CHECK(var != nullptr) << "var[" << t << "] not found"; + LoDTensor* tensor = var->GetMutable(); + float* g = tensor->mutable_data(place); + paddle::ps::Region reg(g, tensor->numel()); + regions.emplace_back(std::move(reg)); + } + auto push_status = pslib_ptr_->_worker_ptr->push_dense_param( + regions.data(), regions.size(), table_id); + push_status.wait(); + auto status = push_status.get(); + CHECK(status == 0) << "push dense param failed, status[" << status << "]"; +#endif +} + +void FleetWrapper::PushDenseVarsSync( + Scope* scope, const uint64_t table_id, + const std::vector& var_names) {} + +void FleetWrapper::PushDenseVarsAsync( + const Scope& scope, const uint64_t table_id, + const std::vector& var_names, + std::vector<::std::future>* push_sparse_status) { +#ifdef PADDLE_WITH_PSLIB + std::vector regions; + for (auto& t : var_names) { + Variable* var = scope.FindVar(t); + LoDTensor* tensor = var->GetMutable(); + int count = tensor->numel(); + float* g = tensor->data(); + paddle::ps::Region reg(g, count); + regions.emplace_back(std::move(reg)); + } + auto status = pslib_ptr_->_worker_ptr->push_dense(regions.data(), + regions.size(), table_id); + push_sparse_status->push_back(std::move(status)); +#endif +} + +void FleetWrapper::PushSparseVarsWithLabelAsync( + const Scope& scope, const uint64_t table_id, + const std::vector& fea_keys, const std::vector& fea_labels, + const std::vector& sparse_key_names, + const std::vector& sparse_grad_names, const int emb_dim, + std::vector>* push_values, + std::vector<::std::future>* push_sparse_status, + const int batch_size, const bool use_cvm) { +#ifdef PADDLE_WITH_PSLIB + int offset = 2; + int grad_dim = emb_dim; + if (use_cvm) { + offset = 0; + grad_dim = emb_dim - 2; + } + CHECK_GE(grad_dim, 0); + + push_values->resize(fea_keys.size() + 1); + for (auto& t : *push_values) { + t.resize(emb_dim + offset); + } + uint64_t fea_idx = 0u; + for (size_t i = 0; i < sparse_key_names.size(); ++i) { + Variable* var = scope.FindVar(sparse_key_names[i]); + if (var == nullptr) { + continue; + } + LoDTensor* tensor = var->GetMutable(); + if (tensor == nullptr) { + LOG(ERROR) << "tensor of var[" << sparse_key_names[i] << "] is null"; + exit(-1); + } + int len = tensor->numel(); + int64_t* ids = tensor->data(); + + Variable* g_var = scope.FindVar(sparse_grad_names[i]); + CHECK(g_var != nullptr) << "var[" << sparse_grad_names[i] << "] not found"; + LoDTensor* g_tensor = g_var->GetMutable(); + if (g_tensor == nullptr) { + LOG(ERROR) << "tensor of var[" << sparse_key_names[i] << "] is null"; + exit(-1); + } + float* g = g_tensor->data(); + + if (scale_sparse_gradient_with_batch_size_ && grad_dim > 0) { + int dim = emb_dim + offset; + Eigen::Map< + Eigen::Matrix> + g_mat(g, g_tensor->numel() / dim, dim); + g_mat.rightCols(grad_dim) *= batch_size; + } + for (auto id_idx = 0u; id_idx < len; ++id_idx) { + if (ids[id_idx] == 0) { + g += emb_dim; + continue; + } + CHECK(fea_idx < (*push_values).size()); + CHECK(fea_idx < fea_labels.size()); + if (use_cvm) { + memcpy((*push_values)[fea_idx].data() + offset, g, + sizeof(float) * emb_dim); + } else { + memcpy((*push_values)[fea_idx].data() + offset, g, + sizeof(float) * emb_dim); + (*push_values)[fea_idx][0] = 1.0f; + (*push_values)[fea_idx][1] = static_cast(fea_labels[fea_idx]); + } + g += emb_dim; + fea_idx++; + } + } + CHECK(fea_idx == fea_keys.size()) << "fea_idx: " << fea_idx + << "features size: " << fea_keys.size(); + std::vector push_g_vec; + for (auto i = 0u; i < fea_keys.size(); ++i) { + push_g_vec.push_back((*push_values)[i].data()); + } + auto status = pslib_ptr_->_worker_ptr->push_sparse( + table_id, fea_keys.data(), (const float**)push_g_vec.data(), + fea_keys.size()); + push_sparse_status->push_back(std::move(status)); + +#endif +} + +void FleetWrapper::LoadFromPaddleModel(Scope& scope, const uint64_t table_id, + std::vector var_list, + std::string model_path, + std::string model_proto_file, + bool load_combine) { + // load ProgramDesc from model file + auto read_proto_func = [](const std::string& filename) -> ProgramDesc { + std::string contents; + std::ifstream fin(filename, std::ios::in | std::ios::binary); + fin.seekg(0, std::ios::end); + contents.resize(fin.tellg()); + fin.seekg(0, std::ios::beg); + fin.read(&contents[0], contents.size()); + fin.close(); + ProgramDesc program_desc(contents); + return program_desc; + }; + const ProgramDesc old_program = read_proto_func(model_proto_file); + Scope* old_scope = new Scope(); + auto& old_block = old_program.Block(0); + auto place = platform::CPUPlace(); + std::vector old_param_list; + + for (auto& t : var_list) { + VarDesc* old_var_desc = old_block.FindVar(t); + if (old_var_desc == nullptr) { + continue; + } + // init variable in scope + Variable* old_var = old_scope->Var(old_var_desc->Name()); + InitializeVariable(old_var, old_var_desc->GetType()); + old_param_list.push_back(t); + if (load_combine) { + continue; + } + // load variable from model + paddle::framework::AttributeMap attrs; + attrs.insert({"file_path", model_path + "/" + old_var_desc->Name()}); + auto load_op = paddle::framework::OpRegistry::CreateOp( + "load", {}, {{"Out", {old_var_desc->Name()}}}, attrs); + load_op->Run(*old_scope, place); + } + + if (load_combine) { + std::sort(old_param_list.begin(), old_param_list.end()); + paddle::framework::AttributeMap attrs; + attrs.insert({"file_path", model_path}); + auto load_op = paddle::framework::OpRegistry::CreateOp( + "load_combine", {}, {{"Out", old_param_list}}, attrs); + load_op->Run(*old_scope, place); + } + + for (auto& t : old_param_list) { + Variable* old_var = old_scope->Var(t); + // old model data, here we assume data type is float + LoDTensor* old_tensor = old_var->GetMutable(); + float* old_data = old_tensor->data(); + // new model data, here we assume data type is float + Variable* var = scope.FindVar(t); + CHECK(var != nullptr) << "var[" << t << "] not found"; + LoDTensor* tensor = var->GetMutable(); + float* data = tensor->data(); + // copy from old data to new data + if (old_tensor->numel() > tensor->numel()) { + memcpy(data, old_data, tensor->numel() * sizeof(float)); + } else { + memcpy(data, old_data, old_tensor->numel() * sizeof(float)); + } + } + delete old_scope; + PushDenseParamSync(scope, table_id, old_param_list); +} + +void FleetWrapper::LoadModel(const std::string& path, const int mode) { +#ifdef PADDLE_WITH_PSLIB + auto ret = pslib_ptr_->_worker_ptr->load(path, std::to_string(mode)); + ret.wait(); + if (ret.get() != 0) { + LOG(ERROR) << "load model from path:" << path << " failed"; + exit(-1); + } +#else + VLOG(0) << "FleetWrapper::LoadModel does nothing when no pslib"; +#endif +} + +void FleetWrapper::LoadModelOneTable(const uint64_t table_id, + const std::string& path, const int mode) { +#ifdef PADDLE_WITH_PSLIB + auto ret = + pslib_ptr_->_worker_ptr->load(table_id, path, std::to_string(mode)); + ret.wait(); + if (ret.get() != 0) { + LOG(ERROR) << "load model of table id: " << table_id + << ", from path: " << path << " failed"; + } +#else + VLOG(0) << "FleetWrapper::LoadModel does nothing when no pslib"; +#endif +} + +void FleetWrapper::SaveModel(const std::string& path, const int mode) { +#ifdef PADDLE_WITH_PSLIB + auto ret = pslib_ptr_->_worker_ptr->save(path, std::to_string(mode)); + ret.wait(); + int32_t feasign_cnt = ret.get(); + if (feasign_cnt == -1) { + LOG(ERROR) << "save model failed"; + exit(-1); + } +#else + VLOG(0) << "FleetWrapper::SaveModel does nothing when no pslib"; +#endif +} + +void FleetWrapper::ShrinkSparseTable(int table_id) { +#ifdef PADDLE_WITH_PSLIB + auto ret = pslib_ptr_->_worker_ptr->shrink(table_id); + ret.wait(); +#else + VLOG(0) << "FleetWrapper::ShrinkSparseTable does nothing when no pslib"; +#endif +} + +void FleetWrapper::ShrinkDenseTable(int table_id, Scope* scope, + std::vector var_list, + float decay) { +#ifdef PADDLE_WITH_PSLIB + std::vector regions; + for (std::string& name : var_list) { + if (name.find("batch_sum") != std::string::npos) { + Variable* var = scope->FindVar(name); + CHECK(var != nullptr) << "var[" << name << "] not found"; + VLOG(3) << "prepare shrink dense batch_sum"; + LoDTensor* tensor = var->GetMutable(); + float* g = tensor->data(); + Eigen::Map mat(g, 1, tensor->numel()); + mat *= decay; + paddle::ps::Region reg(g, tensor->numel()); + regions.emplace_back(std::move(reg)); + } else { + Variable* var = scope->FindVar(name); + CHECK(var != nullptr) << "var[" << name << "] not found"; + LoDTensor* tensor = var->GetMutable(); + float* g = tensor->data(); + paddle::ps::Region reg(g, tensor->numel()); + regions.emplace_back(std::move(reg)); + } + } + auto push_status = pslib_ptr_->_worker_ptr->push_dense_param( + regions.data(), regions.size(), table_id); + push_status.wait(); + auto status = push_status.get(); + if (status != 0) { + LOG(FATAL) << "push shrink dense param failed, status[" << status << "]"; + exit(-1); + } +#else + VLOG(0) << "FleetWrapper::ShrinkSparseTable does nothing when no pslib"; +#endif +} + +void FleetWrapper::ClientFlush() { +#ifdef PADDLE_WITH_PSLIB + auto ret = pslib_ptr_->_worker_ptr->flush(); + ret.wait(); +#else + VLOG(0) << "FleetWrapper::ServerFlush does nothing when no pslib"; +#endif +} + +int FleetWrapper::RegisterClientToClientMsgHandler(int msg_type, + MsgHandlerFunc handler) { +#ifdef PADDLE_WITH_PSLIB + VLOG(3) << "calling FleetWrapper::RegisterClientToClientMsgHandler"; + VLOG(3) << "pslib_ptr_=" << pslib_ptr_; + VLOG(3) << "_worker_ptr=" << pslib_ptr_->_worker_ptr; + return pslib_ptr_->_worker_ptr->registe_client2client_msg_handler(msg_type, + handler); +#else + VLOG(0) << "FleetWrapper::RegisterClientToClientMsgHandler" + << " does nothing when no pslib"; +#endif + return 0; +} + +std::future FleetWrapper::SendClientToClientMsg( + int msg_type, int to_client_id, const std::string& msg) { +#ifdef PADDLE_WITH_PSLIB + return pslib_ptr_->_worker_ptr->send_client2client_msg(msg_type, to_client_id, + msg); +#else + VLOG(0) << "FleetWrapper::SendClientToClientMsg" + << " does nothing when no pslib"; +#endif + return std::future(); +} + +template +void FleetWrapper::Serialize(const std::vector& t, std::string* str) { +#ifdef PADDLE_WITH_PSLIB + paddle::ps::BinaryArchive ar; + for (size_t i = 0; i < t.size(); ++i) { + ar << *(t[i]); + } + *str = std::string(ar.buffer(), ar.length()); +#else + VLOG(0) << "FleetWrapper::Serialize does nothing when no pslib"; +#endif +} + +template +void FleetWrapper::Deserialize(std::vector* t, const std::string& str) { +#ifdef PADDLE_WITH_PSLIB + if (str.length() == 0) { + return; + } + paddle::ps::BinaryArchive ar; + ar.set_read_buffer(const_cast(str.c_str()), str.length(), nullptr); + if (ar.cursor() == ar.finish()) { + return; + } + while (ar.cursor() < ar.finish()) { + t->push_back(ar.get()); + } + CHECK(ar.cursor() == ar.finish()); + VLOG(3) << "Deserialize size " << t->size(); +#else + VLOG(0) << "FleetWrapper::Deserialize does nothing when no pslib"; +#endif +} + +std::default_random_engine& FleetWrapper::LocalRandomEngine() { + struct engine_wrapper_t { + std::default_random_engine engine; +#ifdef PADDLE_WITH_PSLIB + engine_wrapper_t() { + struct timespec tp; + clock_gettime(CLOCK_REALTIME, &tp); + double cur_time = tp.tv_sec + tp.tv_nsec * 1e-9; + static std::atomic x(0); + std::seed_seq sseq = {x++, x++, x++, (uint64_t)(cur_time * 1000)}; + engine.seed(sseq); + } +#endif + }; + thread_local engine_wrapper_t r; + return r.engine; +} + +template void FleetWrapper::Serialize>( + const std::vector*>&, std::string*); +template void FleetWrapper::Deserialize>( + std::vector>*, const std::string&); + +} // end namespace framework +} // end namespace paddle diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h new file mode 100644 index 00000000..8b375727 --- /dev/null +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -0,0 +1,192 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#ifdef PADDLE_WITH_PSLIB +#include +#include +#endif +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN + +namespace paddle { +namespace framework { + +// A wrapper class for pslib.h, this class follows Singleton pattern +// i.e. only initialized once in the current process +// Example: +// std::shared_ptr fleet_ptr = +// FleetWrapper::GetInstance(); +// string dist_desc; +// fleet_ptr->InitServer(dist_desc, 0); +// interface design principles: +// Pull +// Sync: PullSparseVarsSync +// Async: PullSparseVarsAsync(not implemented currently) +// Push +// Sync: PushSparseVarsSync +// Async: PushSparseVarsAsync(not implemented currently) +// Async: PushSparseVarsWithLabelAsync(with special usage) +// Push dense variables to server in Async mode +// Param: scope, table_id, var_names +// Param: push_sparse_status + +class FleetWrapper { + public: + virtual ~FleetWrapper() {} + FleetWrapper() { scale_sparse_gradient_with_batch_size_ = true; } + // Pull sparse variables from server in Sync mode + // Param: scope, table_id, var_names, fea_keys + // Param: fea_values + void PullSparseVarsSync(const Scope& scope, const uint64_t table_id, + const std::vector& var_names, + std::vector* fea_keys, + std::vector>* fea_values, + int fea_dim); + + void PullDenseVarsSync(const Scope& scope, const uint64_t table_id, + const std::vector& var_names); + + void PullDenseVarsAsync( + const Scope& scope, const uint64_t table_id, + const std::vector& var_names, + std::vector<::std::future>* pull_dense_status); + + void PushDenseParamSync(const Scope& scope, const uint64_t table_id, + const std::vector& var_names); + + // Push dense variables to server in async mode + // Param: scope, table_id, var_names, + // Param: push_sparse_status + void PushDenseVarsAsync( + const Scope& scope, const uint64_t table_id, + const std::vector& var_names, + std::vector<::std::future>* push_sparse_status); + + void PushDenseVarsSync(Scope* scope, const uint64_t table_id, + const std::vector& var_names); + + // Push sparse variables with labels to server in Async mode + // This is specially designed for click/show stats in server + // Param: scope, table_id, var_grad_names, + // fea_keys, fea_labels, sparse_grad_names + // Param: push_values, push_sparse_status + void PushSparseVarsWithLabelAsync( + const Scope& scope, const uint64_t table_id, + const std::vector& fea_keys, + const std::vector& fea_labels, + const std::vector& sparse_key_names, + const std::vector& sparse_grad_names, const int emb_dim, + std::vector>* push_values, + std::vector<::std::future>* push_sparse_status, + const int batch_size, const bool use_cvm); + + // Push sparse variables to server in Async mode + // Param: scope, table_id, fea_keys, sparse_grad_names + // Param: push_values, push_sparse_status + /* + void PushSparseVarsAsync( + const Scope& scope, + const uint64_t table_id, + const std::vector& fea_keys, + const std::vector& sparse_grad_names, + std::vector>* push_values, + std::vector<::std::future>* push_sparse_status); + */ + + void InitServer(const std::string& dist_desc, int index); + void InitWorker(const std::string& dist_desc, + const std::vector& host_sign_list, int node_num, + int index); + void StopServer(); + uint64_t RunServer(); + void GatherServers(const std::vector& host_sign_list, int node_num); + // gather client ip + void GatherClients(const std::vector& host_sign_list); + // get client info + std::vector GetClientsInfo(); + // create client to client connection + void CreateClient2ClientConnection(); + + // flush all push requests + void ClientFlush(); + // load from paddle model + void LoadFromPaddleModel(Scope& scope, const uint64_t table_id, // NOLINT + std::vector var_list, + std::string model_path, std::string model_proto_file, + bool load_combine); + // mode = 0, load all feature + // mode = 1, laod delta feature, which means load diff + void LoadModel(const std::string& path, const int mode); + // mode = 0, load all feature + // mode = 1, laod delta feature, which means load diff + void LoadModelOneTable(const uint64_t table_id, const std::string& path, + const int mode); + // mode = 0, save all feature + // mode = 1, save delta feature, which means save diff + void SaveModel(const std::string& path, const int mode); + + void ShrinkSparseTable(int table_id); + void ShrinkDenseTable(int table_id, Scope* scope, + std::vector var_list, float decay); + + // register client to client communication + typedef std::function MsgHandlerFunc; + int RegisterClientToClientMsgHandler(int msg_type, MsgHandlerFunc handler); + // send client to client message + std::future SendClientToClientMsg(int msg_type, int to_client_id, + const std::string& msg); + + template + void Serialize(const std::vector& t, std::string* str); + template + void Deserialize(std::vector* t, const std::string& str); + static std::shared_ptr GetInstance() { + if (NULL == s_instance_) { + s_instance_.reset(new paddle::framework::FleetWrapper()); + } + return s_instance_; + } + + // this performs better than rand_r, especially large data + std::default_random_engine& LocalRandomEngine(); + +#ifdef PADDLE_WITH_PSLIB + static std::shared_ptr pslib_ptr_; +#endif + + private: + static std::shared_ptr s_instance_; +#ifdef PADDLE_WITH_PSLIB + std::map> _regions; +#endif + + protected: + static bool is_initialized_; + bool scale_sparse_gradient_with_batch_size_; + DISABLE_COPY_AND_ASSIGN(FleetWrapper); +}; + +} // end namespace framework +} // end namespace paddle diff --git a/paddle/fluid/framework/fleet/nccl_wrapper.cc b/paddle/fluid/framework/fleet/nccl_wrapper.cc new file mode 100644 index 00000000..38c75b1d --- /dev/null +++ b/paddle/fluid/framework/fleet/nccl_wrapper.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/fleet/nccl_wrapper.h" +#include +#include "paddle/fluid/framework/data_feed.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { + +std::shared_ptr NCCLWrapper::s_instance_ = NULL; +bool NCCLWrapper::is_initialized_ = false; + +void NCCLWrapper::InitNCCL() { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + PADDLE_ENFORCE(platform::dynload::ncclCommInitRank( + &(nccl_info_.comm_), nccl_info_.global_ranks_, nccl_info_.nccl_id_, + nccl_info_.my_global_rank_)); +#endif + return; +} + +void NCCLWrapper::SetNCCLId(const NCCLInfo& nccl_info) { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + nccl_info_.nccl_id_ = nccl_info.nccl_id_; +#endif + return; +} + +NCCLInfo NCCLWrapper::GetNCCLId() { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(&(nccl_info_.nccl_id_))); +#endif + return nccl_info_; +} + +void NCCLWrapper::SetRankInfo(const int local_rank, const int global_rank, + const int ranks) { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + nccl_info_.local_rank_ = local_rank; + nccl_info_.my_global_rank_ = global_rank; + nccl_info_.global_ranks_ = ranks; + PADDLE_ENFORCE(cudaSetDevice(local_rank)); + PADDLE_ENFORCE(cudaStreamCreate(&(nccl_info_.stream_))); +#endif + return; +} + +void NCCLWrapper::SyncVar(const int root_rank, const Scope& scope, + const std::vector& var_names) { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + for (auto& name : var_names) { + auto var = scope.FindVar(name); + LoDTensor* tensor = var->GetMutable(); + int32_t total_size = tensor->numel(); + PADDLE_ENFORCE(platform::dynload::ncclBcast( + reinterpret_cast(tensor->data()), total_size, ncclFloat, + root_rank, nccl_info_.comm_, nccl_info_.stream_)); + cudaStreamSynchronize(nccl_info_.stream_); + } +#endif + return; +} + +} // end namespace framework +} // end namespace paddle diff --git a/paddle/fluid/framework/fleet/nccl_wrapper.h b/paddle/fluid/framework/fleet/nccl_wrapper.h new file mode 100644 index 00000000..84354308 --- /dev/null +++ b/paddle/fluid/framework/fleet/nccl_wrapper.h @@ -0,0 +1,83 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/variable_helper.h" +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#include "paddle/fluid/platform/dynload/nccl.h" +#endif +#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN + +namespace paddle { +namespace framework { + +class NCCLInfo { + public: + NCCLInfo() {} + virtual ~NCCLInfo() {} + + public: + int local_rank_; + int global_ranks_; + int my_global_rank_; +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + ncclUniqueId nccl_id_; + ncclComm_t comm_; + cudaStream_t stream_; +#endif +}; + +class NCCLWrapper { + public: + virtual ~NCCLWrapper() {} + NCCLWrapper() {} + + void InitNCCL(); + void SetNCCLId(const NCCLInfo& nccl_info); + NCCLInfo GetNCCLId(); + void SetRankInfo(const int local_rank, const int global_rank, + const int ranks); + void SyncVar(const int root_rank, const Scope& scope, + const std::vector& var_names); + + static std::shared_ptr GetInstance() { + if (NULL == s_instance_) { + s_instance_.reset(new paddle::framework::NCCLWrapper()); + } + return s_instance_; + } + + public: + NCCLInfo nccl_info_; + + private: + static std::shared_ptr s_instance_; + + protected: + static bool is_initialized_; + DISABLE_COPY_AND_ASSIGN(NCCLWrapper); +}; + +} // end namespace framework +} // end namespace paddle diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto new file mode 100644 index 00000000..efdabffb --- /dev/null +++ b/paddle/fluid/framework/framework.proto @@ -0,0 +1,188 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +syntax = "proto2"; +option optimize_for = LITE_RUNTIME; +package paddle.framework.proto; + +// Any incompatible changes to ProgramDesc and its dependencies should +// raise the version defined version.h. +// +// Serailization and Deserialization codes should be modified in a way +// that supports old versions following the version and compatibility policy. +message Version { optional int64 version = 1 [ default = 0 ]; } + +enum AttrType { + INT = 0; + FLOAT = 1; + STRING = 2; + INTS = 3; + FLOATS = 4; + STRINGS = 5; + BOOLEAN = 6; + BOOLEANS = 7; + BLOCK = 8; + LONG = 9; + BLOCKS = 10; + LONGS = 11; +} + +// OpDesc describes an instance of a C++ framework::OperatorBase +// derived class type. +message OpDesc { + + message Attr { + required string name = 1; + required AttrType type = 2; + optional int32 i = 3; + optional float f = 4; + optional string s = 5; + repeated int32 ints = 6; + repeated float floats = 7; + repeated string strings = 8; + optional bool b = 10; + repeated bool bools = 11; + optional int32 block_idx = 12; + optional int64 l = 13; + repeated int32 blocks_idx = 14; + repeated int64 longs = 15; + }; + + message Var { + required string parameter = 1; + repeated string arguments = 2; + }; + + required string type = 3; + repeated Var inputs = 1; + repeated Var outputs = 2; + repeated Attr attrs = 4; + optional bool is_target = 5 [ default = false ]; +}; + +// OpProto describes a C++ framework::OperatorBase derived class. +message OpProto { + + // VarProto describes the C++ type framework::Variable. + message Var { + required string name = 1; + required string comment = 2; + + optional bool duplicable = 3 [ default = false ]; + optional bool intermediate = 4 [ default = false ]; + optional bool dispensable = 5 [ default = false ]; + } + + // AttrProto describes the C++ type Attribute. + message Attr { + required string name = 1; + required AttrType type = 2; + required string comment = 3; + // If that attribute is generated, it means the Paddle third + // language binding has responsibility to fill that + // attribute. End-User should not set that attribute. + optional bool generated = 4 [ default = false ]; + } + + required string type = 1; + repeated Var inputs = 2; + repeated Var outputs = 3; + repeated Attr attrs = 4; + required string comment = 5; +} + +message VarType { + enum Type { + // Pod Types + BOOL = 0; + INT16 = 1; + INT32 = 2; + INT64 = 3; + FP16 = 4; + FP32 = 5; + FP64 = 6; + // Tensor is used in C++. + SIZE_T = 19; + UINT8 = 20; + INT8 = 21; + + // Other types that may need additional descriptions + LOD_TENSOR = 7; + SELECTED_ROWS = 8; + FEED_MINIBATCH = 9; + FETCH_LIST = 10; + STEP_SCOPES = 11; + LOD_RANK_TABLE = 12; + LOD_TENSOR_ARRAY = 13; + PLACE_LIST = 14; + READER = 15; + // Any runtime decided variable type is raw + // raw variables should manage their own allocations + // in operators like nccl_op + RAW = 17; + TUPLE = 18; + } + + required Type type = 1; + + message TensorDesc { + // Should only be PODType. Is enforced in C++ + required Type data_type = 1; + repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480] + } + optional TensorDesc selected_rows = 2; + + message LoDTensorDesc { + required TensorDesc tensor = 1; + optional int32 lod_level = 2 [ default = 0 ]; + } + optional LoDTensorDesc lod_tensor = 3; + + message LoDTensorArrayDesc { + required TensorDesc tensor = 1; + optional int32 lod_level = 2 [ default = 0 ]; + } + optional LoDTensorArrayDesc tensor_array = 4; + + message ReaderDesc { repeated LoDTensorDesc lod_tensor = 1; } + optional ReaderDesc reader = 5; + + message Tuple { repeated Type element_type = 1; } + optional Tuple tuple = 7; +} + +message VarDesc { + required string name = 1; + required VarType type = 2; + optional bool persistable = 3 [ default = false ]; +} + +message BlockDesc { + required int32 idx = 1; + required int32 parent_idx = 2; + repeated VarDesc vars = 3; + repeated OpDesc ops = 4; + optional int32 forward_block_idx = 5 [ default = -1 ]; +} + +// Please refer to +// https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md +// for more details. +// TODO(panyx0718): A model can have multiple programs. Need a +// way to distinguish them. Maybe ID or name? +message ProgramDesc { + repeated BlockDesc blocks = 1; + + optional Version version = 2; +} diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc new file mode 100644 index 00000000..789b2ef8 --- /dev/null +++ b/paddle/fluid/framework/garbage_collector.cc @@ -0,0 +1,131 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include // NOLINT +#include +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cuda_device_guard.h" +#endif +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "paddle/fluid/framework/garbage_collector.h" + +namespace paddle { +namespace framework { + +DEFINE_double( + eager_delete_tensor_gb, -1.0, + "Memory size threshold (GB) when the garbage collector clear tensors." + "Disabled when this value is less than 0"); + +DEFINE_bool(fast_eager_deletion_mode, true, + "Fast eager deletion mode. If enabled, memory would release " + "immediately without waiting GPU kernel ends."); + +DEFINE_double(memory_fraction_of_eager_deletion, 1.0, + "Fraction of eager deletion. If less than 1.0, all variables in " + "the program would be sorted according to its memory size, and " + "only the FLAGS_memory_fraction_of_eager_deletion of the largest " + "variables would be deleted."); + +GarbageCollector::GarbageCollector(const platform::Place &place, + size_t max_memory_size) + : max_memory_size_((std::max)(max_memory_size, static_cast(1))) { + garbages_.reset(new GarbageQueue()); + dev_ctx_ = platform::DeviceContextPool::Instance().Get(place); +} + +CPUGarbageCollector::CPUGarbageCollector(const platform::CPUPlace &place, + size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + +void CPUGarbageCollector::ClearCallback(const std::function &callback) { + callback(); +} + +#ifdef PADDLE_WITH_CUDA +UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector( + const platform::CUDAPlace &place, size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + +void UnsafeFastGPUGarbageCollector::ClearCallback( + const std::function &callback) { + callback(); +} + +DefaultStreamGarbageCollector::DefaultStreamGarbageCollector( + const platform::CUDAPlace &place, size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + +void DefaultStreamGarbageCollector::Wait() const { + static_cast(this->dev_ctx_) + ->WaitStreamCallback(); +} + +void DefaultStreamGarbageCollector::ClearCallback( + const std::function &callback) { + static_cast(this->dev_ctx_) + ->AddStreamCallback(callback); +} + +StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place, + size_t max_memory_size) + : GarbageCollector(place, max_memory_size) { + platform::CUDADeviceGuard guard(place.device); + PADDLE_ENFORCE(cudaStreamCreate(&stream_)); + callback_manager_.reset(new platform::StreamCallbackManager(stream_)); +} + +StreamGarbageCollector::~StreamGarbageCollector() { + auto place = boost::get(this->dev_ctx_->GetPlace()); + platform::CUDADeviceGuard guard(place.device); + PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); + PADDLE_ENFORCE(cudaStreamDestroy(stream_)); +} + +cudaStream_t StreamGarbageCollector::stream() const { return stream_; } + +void StreamGarbageCollector::Wait() const { callback_manager_->Wait(); } + +void StreamGarbageCollector::ClearCallback( + const std::function &callback) { + callback_manager_->AddCallback(callback); +} +#endif + +int64_t GetEagerDeletionThreshold() { + return FLAGS_eager_delete_tensor_gb < 0 + ? -1 + : static_cast(FLAGS_eager_delete_tensor_gb * + (static_cast(1) << 30)); +} + +bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; } + +void SetEagerDeletionMode(double threshold, double fraction, bool fast_mode) { + FLAGS_eager_delete_tensor_gb = threshold; + FLAGS_memory_fraction_of_eager_deletion = fraction; + FLAGS_fast_eager_deletion_mode = fast_mode; +} + +double GetEagerDeletionMemoryFraction() { + return FLAGS_memory_fraction_of_eager_deletion; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h new file mode 100644 index 00000000..6ce797bd --- /dev/null +++ b/paddle/fluid/framework/garbage_collector.h @@ -0,0 +1,148 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include // NOLINT +#include +#include "gflags/gflags.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { + +class GarbageCollector { + public: + using GarbageQueue = std::deque>; + + GarbageCollector(const platform::Place &place, size_t max_memory_size); + + virtual ~GarbageCollector() = default; + + virtual void Wait() const {} + + template + void Add(Container &&objs); + + template + void Add(Container &&objs, Callback &&callback); + + protected: + virtual void ClearCallback(const std::function &callback) = 0; + + platform::DeviceContext *dev_ctx_; + std::unique_ptr garbages_; + mutable std::mutex mutex_; + const size_t max_memory_size_; + size_t cur_memory_size_{0}; +}; + +class CPUGarbageCollector : public GarbageCollector { + public: + CPUGarbageCollector(const platform::CPUPlace &place, size_t max_memory_size); + + protected: + void ClearCallback(const std::function &callback) override; +}; + +#ifdef PADDLE_WITH_CUDA +class UnsafeFastGPUGarbageCollector : public GarbageCollector { + public: + UnsafeFastGPUGarbageCollector(const platform::CUDAPlace &place, + size_t max_memory_size); + + protected: + void ClearCallback(const std::function &callback) override; +}; + +class DefaultStreamGarbageCollector : public GarbageCollector { + public: + DefaultStreamGarbageCollector(const platform::CUDAPlace &place, + size_t max_memory_size); + + void Wait() const override; + + protected: + void ClearCallback(const std::function &callback) override; +}; + +class StreamGarbageCollector : public GarbageCollector { + public: + StreamGarbageCollector(const platform::CUDAPlace &place, + size_t max_memory_size); + + ~StreamGarbageCollector(); + + void Wait() const override; + + cudaStream_t stream() const; + + protected: + void ClearCallback(const std::function &callback) override; + + private: + cudaStream_t stream_; + std::unique_ptr callback_manager_; +}; +#endif + +template +void GarbageCollector::Add(Container &&objs) { + Add(std::forward(objs), []() {}); +} + +template +void GarbageCollector::Add(Container &&objs, Callback &&callback) { + // Special case when FLAGS_eager_delete_tensor_gb=0.0 + // It speeds up GC about 2~3%. + if (max_memory_size_ <= 1) { + callback(); + auto *container = new Container(std::move(objs)); + ClearCallback([container] { delete container; }); + return; + } + + GarbageQueue *garbage_queue = nullptr; + { + std::lock_guard guard(mutex_); + for (auto &obj : objs) { + if (!obj) continue; + cur_memory_size_ += obj->size(); + garbages_->push_back(std::move(obj)); + } + if (cur_memory_size_ >= max_memory_size_) { + cur_memory_size_ = 0; + garbage_queue = garbages_.release(); + garbages_.reset(new GarbageQueue()); + } + } + + if (garbage_queue) { + callback(); + ClearCallback([garbage_queue]() { delete garbage_queue; }); + } +} + +int64_t GetEagerDeletionThreshold(); +bool IsFastEagerDeletionModeEnabled(); + +void SetEagerDeletionMode(double threshold, double fraction, bool fast_mode); + +double GetEagerDeletionMemoryFraction(); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/grad_op_desc_maker.h b/paddle/fluid/framework/grad_op_desc_maker.h new file mode 100644 index 00000000..25a64b69 --- /dev/null +++ b/paddle/fluid/framework/grad_op_desc_maker.h @@ -0,0 +1,194 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { + +/* + This functor class is responsible for creating the gradient ops for the given + operator fwd_op. After it is called (through operator()), the pairs of + (gradient variable, corresponding input variable of fwd_op) will be added to + grad_to_var. If an input variable of fwd_op is contained in no_grad_set, its + gradient varialbe will be ignored or kEmptyVarName depending on the template + argument DropEmptyIG in the derived classes. + */ +class GradOpDescMakerBase { + public: + explicit GradOpDescMakerBase( + const OpDesc& fwd_op, const std::unordered_set& no_grad_set, + std::unordered_map* grad_to_var, + const std::vector& grad_block = std::vector()) + : fwd_op_(fwd_op), + no_grad_set_(no_grad_set), + grad_to_var_(grad_to_var), + grad_block_(grad_block) {} + + virtual ~GradOpDescMakerBase() = default; + virtual std::vector> operator()() const = 0; + + protected: + std::vector InputGrad(const std::string& name, + bool drop_empty_grad = true) const { + std::vector ret_val; + auto var_names = this->Input(name); + ret_val.reserve(var_names.size()); + std::transform(var_names.begin(), var_names.end(), + std::back_inserter(ret_val), + [this](const std::string& fwd_var_name) -> std::string { + auto g_name = GradVarName(fwd_var_name); + if (no_grad_set_.empty() || !no_grad_set_.count(g_name)) { + (*this->grad_to_var_)[g_name] = fwd_var_name; + return g_name; + } else { + return kEmptyVarName; + } + }); + if (!drop_empty_grad) { + return ret_val; + } + PADDLE_ENFORCE_LE(var_names.size(), 1UL, + "BUG from operator developer:" + " for input argument with a list of variables, " + " drop_empty_grad is not allowed because it makes" + " the correspondence bewteen a variable and its gradient" + " ambiguous." + " Op type %s", + fwd_op_.Type()); + + std::vector dropped_ret_val; + dropped_ret_val.reserve(ret_val.size()); + std::copy_if(ret_val.begin(), ret_val.end(), + std::back_inserter(dropped_ret_val), + [](const std::string& str) { return str != kEmptyVarName; }); + return dropped_ret_val; + } + + std::vector OutputGrad(const std::string& name) const { + std::vector ret_val; + auto onames = this->Output(name); + ret_val.reserve(onames.size()); + std::transform(onames.begin(), onames.end(), std::back_inserter(ret_val), + [this](const std::string& fwd_var_name) -> std::string { + auto g_name = GradVarName(fwd_var_name); + (*this->grad_to_var_)[g_name] = fwd_var_name; + return g_name; + }); + return ret_val; + } + + std::vector InputNames() const { + return this->fwd_op_.InputNames(); + } + + std::vector OutputNames() const { + return this->fwd_op_.OutputNames(); + } + + std::vector Input(const std::string& name) const { + return fwd_op_.Input(name); + } + + std::vector Output(const std::string& name) const { + return fwd_op_.Output(name); + } + + const std::unordered_map& Attrs() const { + return fwd_op_.GetAttrMap(); + } + + const Attribute& GetAttr(const std::string& name) const { + auto& map = fwd_op_.GetAttrMap(); + auto it = map.find(name); + PADDLE_ENFORCE(it != map.end(), "Cannot find attribute %s", name); + return it->second; + } + + template + inline const T& Attr(const std::string& name) const { + return boost::get(GetAttr(name)); + } + + std::string ForwardOpType() const { return this->fwd_op_.Type(); } + + protected: + const OpDesc& ForwardOp() const { return fwd_op_; } + + private: + const OpDesc& fwd_op_; + const std::unordered_set& no_grad_set_; + std::unordered_map* grad_to_var_; + + protected: + std::vector grad_block_; +}; + +class SingleGradOpDescMaker : public GradOpDescMakerBase { + public: + using GradOpDescMakerBase::GradOpDescMakerBase; + + std::vector> operator()() const final { + std::vector> retv; + retv.emplace_back(this->Apply()); + return retv; + } + + protected: + virtual std::unique_ptr Apply() const = 0; +}; + +template +class DefaultGradOpDescMaker final : public SingleGradOpDescMaker { + public: + using SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const final { + auto* grad = new OpDesc(); + grad->SetType(this->ForwardOpType() + "_grad"); + + for (auto& input_param : this->InputNames()) { + grad->SetInput(input_param, this->Input(input_param)); + grad->SetOutput(GradVarName(input_param), + this->InputGrad(input_param, DropEmptyIG)); + } + + for (auto& output_param : this->OutputNames()) { + grad->SetInput(output_param, this->Output(output_param)); + grad->SetInput(GradVarName(output_param), this->OutputGrad(output_param)); + } + + grad->SetAttrMap(this->Attrs()); + + return std::unique_ptr(grad); + } +}; + +class EmptyGradOpMaker final : public GradOpDescMakerBase { + public: + using GradOpDescMakerBase::GradOpDescMakerBase; + std::vector> operator()() const final { return {}; } +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc new file mode 100644 index 00000000..a006a0fa --- /dev/null +++ b/paddle/fluid/framework/hogwild_worker.cc @@ -0,0 +1,178 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/device_worker.h" +#include "paddle/fluid/framework/device_worker_factory.h" +#include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/platform/lodtensor_printer.h" + +namespace paddle { +namespace framework { + +void HogwildWorker::Initialize(const TrainerDesc& desc) { + fetch_config_ = desc.fetch_config(); + param_ = desc.hogwild_param(); + skip_ops_.resize(param_.skip_ops_size()); + for (int i = 0; i < param_.skip_ops_size(); ++i) { + skip_ops_[i] = param_.skip_ops(i); + } + use_cvm_ = desc.use_cvm(); +} + +void HogwildWorker::CreateThreadOperators(const ProgramDesc& program) { + auto& block = program.Block(0); + op_names_.clear(); + for (auto& op_desc : block.AllOps()) { + std::unique_ptr local_op = OpRegistry::CreateOp(*op_desc); + op_names_.push_back(op_desc->Type()); + OperatorBase* local_op_ptr = local_op.release(); + ops_.push_back(local_op_ptr); + continue; + } +} + +void HogwildWorker::CreateThreadScope(const ProgramDesc& program) { + auto& block = program.Block(0); + + PADDLE_ENFORCE_NOT_NULL( + root_scope_, "root_scope should be set before creating thread scope"); + + thread_scope_ = &root_scope_->NewScope(); + for (auto& var : block.AllVars()) { + if (var->Persistable()) { + auto* ptr = root_scope_->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + } else { + auto* ptr = thread_scope_->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + } + } +} + +void HogwildWorker::BindingDataFeedMemory() { + const std::vector& input_feed = + device_reader_->GetUseSlotAlias(); + for (auto name : input_feed) { + device_reader_->AddFeedVar(thread_scope_->FindVar(name), name); + } +} + +void HogwildWorker::CreateDeviceResource(const ProgramDesc& main_prog) { + CreateThreadScope(main_prog); + CreateThreadOperators(main_prog); +} + +void HogwildWorker::TrainFilesWithProfiler() { + platform::SetNumThreads(1); + device_reader_->Start(); + std::vector op_total_time; + std::vector op_name; + for (auto& op : ops_) { + op_name.push_back(op->Type()); + } + op_total_time.resize(ops_.size()); + for (size_t i = 0; i < op_total_time.size(); ++i) { + op_total_time[i] = 0.0; + } + platform::Timer timeline; + double total_time = 0.0; + double read_time = 0.0; + int cur_batch; + int batch_cnt = 0; + timeline.Start(); + uint64_t total_inst = 0; + while ((cur_batch = device_reader_->Next()) > 0) { + VLOG(3) << "read a batch in thread " << thread_id_; + timeline.Pause(); + read_time += timeline.ElapsedSec(); + total_time += timeline.ElapsedSec(); + for (size_t i = 0; i < ops_.size(); ++i) { + bool need_skip = false; + for (auto t = 0u; t < skip_ops_.size(); ++t) { + if (ops_[i]->Type().find(skip_ops_[t]) != std::string::npos) { + need_skip = true; + break; + } + } + timeline.Start(); + VLOG(3) << "Going to run op " << op_name[i]; + if (!need_skip) { + ops_[i]->Run(*thread_scope_, place_); + } + VLOG(3) << "Op " << op_name[i] << " Finished"; + timeline.Pause(); + op_total_time[i] += timeline.ElapsedSec(); + total_time += timeline.ElapsedSec(); + } + total_inst += cur_batch; + ++batch_cnt; + PrintFetchVars(); + if (thread_id_ == 0) { + if (batch_cnt > 0 && batch_cnt % 100 == 0) { + for (size_t i = 0; i < ops_.size(); ++i) { + fprintf(stderr, "op_name:[%zu][%s], op_mean_time:[%fs]\n", i, + op_name[i].c_str(), op_total_time[i] / batch_cnt); + } + fprintf(stderr, "mean read time: %fs\n", read_time / batch_cnt); + fprintf(stderr, "IO percent: %f\n", read_time / total_time * 100); + fprintf(stderr, "%6.2f instances/s\n", total_inst / total_time); + } + } + thread_scope_->DropKids(); + timeline.Start(); + } +} + +void HogwildWorker::TrainFiles() { + platform::SetNumThreads(1); + + // how to accumulate fetched values here + device_reader_->Start(); + int cur_batch; + while ((cur_batch = device_reader_->Next()) > 0) { + for (auto& op : ops_) { + bool need_skip = false; + for (auto t = 0u; t < skip_ops_.size(); ++t) { + if (op->Type().find(skip_ops_[t]) != std::string::npos) { + need_skip = true; + break; + } + } + if (!need_skip) { + op->Run(*thread_scope_, place_); + } + } + + PrintFetchVars(); + thread_scope_->DropKids(); + } +} + +void HogwildWorker::PrintFetchVars() { + // call count + batch_num_++; + int batch_per_print = fetch_config_.print_period(); + if (thread_id_ == 0) { + if (batch_num_ % batch_per_print == 0) { + int fetch_var_num = fetch_config_.fetch_var_names_size(); + for (int i = 0; i < fetch_var_num; ++i) { + platform::PrintVar(thread_scope_, fetch_config_.fetch_var_names(i), + fetch_config_.fetch_var_str_format(i)); + } + } + } +} + +} // end namespace framework +} // end namespace paddle diff --git a/paddle/fluid/framework/inlined_vector.h b/paddle/fluid/framework/inlined_vector.h new file mode 100644 index 00000000..2a7f26b9 --- /dev/null +++ b/paddle/fluid/framework/inlined_vector.h @@ -0,0 +1,69 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +template +class InlinedVector { + static_assert(N > 0, "N must be larger than 0"); + + public: + inline InlinedVector() { len_ = 0; } + + inline size_t size() const { return len_; } + + inline T& operator[](size_t i) { return i < N ? head_[i] : tail_[i - N]; } + + inline const T& operator[](size_t i) const { + return i < N ? head_[i] : tail_[i - N]; + } + + inline void emplace_back(const T& item) { + if (LIKELY(len_ < N)) { + head_[len_++] = item; + } else { + tail_.emplace_back(item); + ++len_; + } + } + + inline void pop_back() { + if (UNLIKELY(len_ > N)) { + tail_.pop_back(); + } + --len_; + } + + inline T& back() { + if (LIKELY(len_ <= N)) { + return head_[len_ - 1]; + } else { + return tail_.back(); + } + } + + private: + T head_[N]; + size_t len_; + std::vector tail_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/inlined_vector_test.cc b/paddle/fluid/framework/inlined_vector_test.cc new file mode 100644 index 00000000..003c0d7b --- /dev/null +++ b/paddle/fluid/framework/inlined_vector_test.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/inlined_vector.h" +#include +#include +#include +#include +#include "gtest/gtest.h" + +namespace paddle { +namespace framework { + +template +static std::vector ToStdVector(const framework::InlinedVector &vec) { + std::vector std_vec; + std_vec.reserve(vec.size()); + for (size_t i = 0; i < vec.size(); ++i) { + std_vec.emplace_back(vec[i]); + } + return std_vec; +} + +template +void InlinedVectorCheck(size_t n) { + std::srand(std::time(nullptr)); + + std::vector std_vec; + framework::InlinedVector vec; + + for (size_t i = 0; i < n; ++i) { + int value = rand(); // NOLINT + + std_vec.emplace_back(value); + vec.emplace_back(value); + + CHECK_EQ(std_vec.size(), vec.size()); + CHECK_EQ(std_vec.back(), vec.back()); + + CHECK_EQ(vec.back(), value); + } + + bool is_equal = (std_vec == ToStdVector(vec)); + + CHECK_EQ(is_equal, true); + + for (size_t i = 0; i < n; ++i) { + CHECK_EQ(std_vec.size(), vec.size()); + CHECK_EQ(std_vec.back(), vec.back()); + std_vec.pop_back(); + vec.pop_back(); + CHECK_EQ(std_vec.size(), vec.size()); + } + + CHECK_EQ(std_vec.size(), static_cast(0)); + CHECK_EQ(vec.size(), static_cast(0)); +} + +TEST(inlined_vector, inlined_vector) { + for (size_t i = 0; i < 20; ++i) { + InlinedVectorCheck<1>(i); + InlinedVectorCheck<10>(i); + InlinedVectorCheck<15>(i); + InlinedVectorCheck<20>(i); + InlinedVectorCheck<21>(i); + InlinedVectorCheck<25>(i); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/inplace_op_inference.h b/paddle/fluid/framework/inplace_op_inference.h new file mode 100644 index 00000000..fdc0c202 --- /dev/null +++ b/paddle/fluid/framework/inplace_op_inference.h @@ -0,0 +1,62 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include "glog/logging.h" +#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/type_defs.h" + +namespace paddle { +namespace framework { + +/* + Inplace Inference for create In->Out pairs for inplaced operator. + If we specify a pair of corresponding names. For example, X->Out. + then Out will inplaced use X's memory. The base class will do + legality validation for both variables. +*/ + +class InplaceOpInference { + public: + virtual ~InplaceOpInference() {} + virtual std::unordered_map operator()( + const OpDesc& op_desc, bool use_cuda) const = 0; +}; + +/* + Inplace In and Out for operator only have an Input and an Output. + For example, activation op. + */ +class SingleOpInplaceInToOut : public InplaceOpInference { + public: + std::unordered_map operator()( + const OpDesc& op_desc, bool use_cuda) const override { + PADDLE_ENFORCE_EQ(op_desc.InputNames().size(), 1, + "Op inputs must be unique"); + PADDLE_ENFORCE_EQ(op_desc.OutputNames().size(), 1, + "Op outputs must be unique"); + auto x_name = op_desc.InputNames().at(0); + auto out_name = op_desc.OutputNames().at(0); + return std::unordered_map{{x_name, out_name}}; + } +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/inplace_op_inference_test.cc b/paddle/fluid/framework/inplace_op_inference_test.cc new file mode 100644 index 00000000..727e579d --- /dev/null +++ b/paddle/fluid/framework/inplace_op_inference_test.cc @@ -0,0 +1,328 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include +#include +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h" +#include "paddle/fluid/framework/ir/pass_builder.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/var_type_inference.h" + +USE_PASS(inplace_pass); + +namespace paddle { +namespace framework { + +std::unique_ptr CreateInplacePass() { + auto pass = ir::PassRegistry::Instance().Get("inplace_pass"); + pass->Set(ir::kUseCuda, new bool(true)); + return pass; +} + +class NOP : public OperatorBase { + public: + NOP(const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, const AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const Scope& scope, + const platform::Place& place) const override {} +}; + +class SingleOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "").AsDuplicable(); + AddOutput("Out", ""); + AddComment(""); + } +}; + +class SingleGradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType("single_op_grad"); + op->SetInput("Out", OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + return std::unique_ptr(op); + } +}; + +class SingleOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override { + ctx->HasInput("X"); + ctx->HasOutput("Out"); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + } +}; + +class SingleGradOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override { + ctx->HasInput(framework::GradVarName("Out")); + ctx->HasOutput(framework::GradVarName("X")); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out")); + } +}; + +class MultiOutOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "").AsDuplicable(); + AddInput("Y", "").AsDuplicable(); + AddInput("Z", "").AsDuplicable(); + AddOutput("Out", ""); + AddOutput("YOut", ""); + AddOutput("ZOut", ""); + AddOutput("NotReuseOut", ""); + AddComment(""); + } +}; + +class MultiOutShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override { + ctx->ShareDim("X", "Out"); + ctx->ShareDim("Y", "YOut"); + ctx->ShareDim("Z", "ZOut"); + } +}; + +class MultiGradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType("multi_out_grad"); + op->SetInput("X", Input("X")); + op->SetOutput(framework::GradVarName("Y"), OutputGrad("YOut")); + op->SetOutput(framework::GradVarName("X"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("Z"), OutputGrad("ZOut")); + return std::unique_ptr(op); + } +}; + +class MultiOutGradShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override { + ctx->SetOutputDim(framework::GradVarName("Y"), + ctx->GetInputDim(framework::GradVarName("YOut"))); + ctx->SetOutputDim(framework::GradVarName("X"), + ctx->GetInputDim(framework::GradVarName("Out"))); + ctx->SetOutputDim(framework::GradVarName("Z"), + ctx->GetInputDim(framework::GradVarName("ZOut"))); + } +}; + +class MultiOutInplaceInToOut : public framework::InplaceOpInference { + public: + std::unordered_map operator()( + const OpDesc& op_desc, bool use_cuda) const override { + return std::unordered_map{ + {"X", "Out"}, {"Y", "YOut"}, {"Z", "ZOut"}, + }; + } +}; + +class MultiOutGradInplaceInToOut : public framework::InplaceOpInference { + public: + std::unordered_map operator()( + const OpDesc& op_desc, bool use_cuda) const override { + return std::unordered_map{ + {framework::GradVarName("YOut"), framework::GradVarName("Y")}, + {framework::GradVarName("Out"), framework::GradVarName("X")}, + {framework::GradVarName("ZOut"), framework::GradVarName("Z")}, + }; + } +}; + +} // namespace framework +} // namespace paddle + +namespace f = paddle::framework; +REGISTER_OPERATOR(single_op, f::NOP, f::SingleOpMaker, f::SingleGradOpMaker, + f::SingleOpInplaceInToOut, f::SingleOpShapeInference); +REGISTER_OPERATOR(single_op_grad, f::NOP, f::SingleOpInplaceInToOut, + f::SingleGradOpShapeInference); +REGISTER_OPERATOR(multi_out_op, f::NOP, f::MultiOutOpMaker, f::MultiGradOpMaker, + f::MultiOutInplaceInToOut, f::MultiOutShapeInference); +REGISTER_OPERATOR(multi_out_grad, f::NOP, f::MultiOutGradInplaceInToOut, + f::MultiOutGradShapeInference); + +namespace paddle { +namespace framework { + +void FakeSuccData(ProgramDesc* prog) { // NOLINT + prog->MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR); + prog->MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128}); + prog->MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR); + prog->MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR); + prog->MutableBlock(0)->Var("test2_out"); + prog->MutableBlock(0)->Var("test2_out")->SetShape({64, 32, 128, 128}); +} + +void FakeNoInplaceData(ProgramDesc* prog) { // NOLINT + prog->MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR); + prog->MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128}); + prog->MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR); + prog->MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR); + prog->MutableBlock(0)->Var("test2_out"); + prog->MutableBlock(0)->Var("test2_out")->SetShape({64, 31, 128, 128}); +} + +ir::Node* GetNodeFromGraph(ir::Graph* g, std::string name) { + ir::Node* op_node = nullptr; + for (auto& item : g->Nodes()) { + if (item->Name() == name) { + op_node = item; + break; + } + } + return op_node; +} + +std::unique_ptr test_SingleOpInplaceInToOut( + std::unique_ptr g) { + auto pass = CreateInplacePass(); + ir::Node* op_node = GetNodeFromGraph(g.get(), "single_op"); + EXPECT_NE(op_node, nullptr); + pass->Apply(g.get()); + return g; +} + +TEST(InferInplace, SingleOpInplaceInToOut) { + ProgramDesc prog; + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("single_op"); + op->SetInput("X", {"test2_a", "test2_b", "test2_c"}); + op->SetOutput("Out", {"test2_out"}); + + FakeSuccData(&prog); + std::unique_ptr g(new ir::Graph(prog)); + g->Set(ir::kMemOptSkipVars, new std::unordered_set()); + g = test_SingleOpInplaceInToOut(std::move(g)); + auto op_node = GetNodeFromGraph(g.get(), "single_op"); + + EXPECT_EQ(op_node->outputs[0]->Name(), "test2_a"); +} + +TEST(InferInplace, SingleOpInplaceInToOutNoInplace) { + ProgramDesc prog; + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("single_op"); + op->SetInput("X", {"test2_a", "test2_b", "test2_c"}); + op->SetOutput("Out", {"test2_out"}); + + FakeNoInplaceData(&prog); + std::unique_ptr g(new ir::Graph(prog)); + g->Set(ir::kMemOptSkipVars, new std::unordered_set()); + g = test_SingleOpInplaceInToOut(std::move(g)); + auto op_node = GetNodeFromGraph(g.get(), "single_op"); + + EXPECT_EQ(op_node->outputs[0]->Name(), "test2_out"); +} + +TEST(InferInplace, MultiOutInplaceInToOut) { + ProgramDesc prog; + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("multi_out_op"); + op->SetInput("X", {"a0", "a1"}); + op->SetInput("Y", {"b0"}); + op->SetInput("Z", {"c0", "c1"}); + op->SetOutput("Out", {"o0"}); + op->SetOutput("YOut", {"y0"}); + op->SetOutput("ZOut", {"z0"}); + + prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("o0"); + prog.MutableBlock(0)->Var("y0"); + prog.MutableBlock(0)->Var("z0"); + prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024}); + + std::unique_ptr g(new ir::Graph(prog)); + g->Set(ir::kMemOptSkipVars, new std::unordered_set()); + auto pass = CreateInplacePass(); + pass->Apply(g.get()); + auto op_node = GetNodeFromGraph(g.get(), "multi_out_op"); + ASSERT_TRUE(op_node != nullptr); + EXPECT_EQ(op_node->outputs[0]->Name(), "a0"); + EXPECT_EQ(op_node->outputs[1]->Name(), "b0"); + EXPECT_EQ(op_node->outputs[2]->Name(), "c0"); +} + +TEST(InferInplace, MultiGradInplaceInToOut) { + ProgramDesc prog; + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("multi_out_grad"); + op->SetInput(GradVarName("Out"), {"o0"}); + op->SetInput(GradVarName("YOut"), {"y0"}); + op->SetInput(GradVarName("ZOut"), {"z0"}); + op->SetOutput(GradVarName("X"), {"a0", "a1"}); + op->SetOutput(GradVarName("Y"), {"b0"}); + op->SetOutput(GradVarName("Z"), {"c0", "c1"}); + + prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("o0"); + prog.MutableBlock(0)->Var("y0"); + prog.MutableBlock(0)->Var("z0"); + prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("z0")->SetShape({32, 15, 1024, 1024}); + + std::unique_ptr g(new ir::Graph(prog)); + g->Set(ir::kMemOptSkipVars, new std::unordered_set()); + auto pass = CreateInplacePass(); + pass->Apply(g.get()); + auto op_node = GetNodeFromGraph(g.get(), "multi_out_grad"); + ASSERT_TRUE(op_node != nullptr); + EXPECT_EQ(op_node->outputs[0]->Name(), "o0"); + EXPECT_EQ(op_node->outputs[2]->Name(), "y0"); + EXPECT_EQ(op_node->outputs[3]->Name(), "c0"); + + std::unordered_map expects = { + {"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"}, + }; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/io/CMakeLists.txt b/paddle/fluid/framework/io/CMakeLists.txt new file mode 100644 index 00000000..2baef77b --- /dev/null +++ b/paddle/fluid/framework/io/CMakeLists.txt @@ -0,0 +1,2 @@ +cc_library(fs SRCS fs.cc DEPS string_helper glog boost) +cc_library(shell SRCS shell.cc DEPS string_helper glog) diff --git a/paddle/fluid/framework/io/fs.cc b/paddle/fluid/framework/io/fs.cc new file mode 100644 index 00000000..d5bc5df2 --- /dev/null +++ b/paddle/fluid/framework/io/fs.cc @@ -0,0 +1,456 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/io/fs.h" +#include + +namespace paddle { +namespace framework { + +static void fs_add_read_converter_internal(std::string& path, // NOLINT + bool& is_pipe, // NOLINT + const std::string& converter) { + if (converter == "") { + return; + } + + if (!is_pipe) { + path = string::format_string("( %s ) < \"%s\"", converter.c_str(), + path.c_str()); + is_pipe = true; + } else { + path = string::format_string("%s | %s", path.c_str(), converter.c_str()); + } +} + +static void fs_add_write_converter_internal(std::string& path, // NOLINT + bool& is_pipe, // NOLINT + const std::string& converter) { + if (converter == "") { + return; + } + + if (!is_pipe) { + path = string::format_string("( %s ) > \"%s\"", converter.c_str(), + path.c_str()); + is_pipe = true; + } else { + path = string::format_string("%s | %s", converter.c_str(), path.c_str()); + } +} + +static std::shared_ptr fs_open_internal(const std::string& path, + bool is_pipe, + const std::string& mode, + size_t buffer_size, + int* err_no = 0) { + std::shared_ptr fp = nullptr; + + if (!is_pipe) { + fp = shell_fopen(path, mode); + } else { + fp = shell_popen(path, mode, err_no); + } + + if (buffer_size > 0) { + char* buffer = new char[buffer_size]; + CHECK_EQ(0, setvbuf(&*fp, buffer, _IOFBF, buffer_size)); + fp = {&*fp, [fp, buffer](FILE*) mutable { // NOLINT + CHECK(fp.unique()); // NOLINT + fp = nullptr; + delete[] buffer; + }}; + } + + return fp; +} + +static bool fs_begin_with_internal(const std::string& path, + const std::string& str) { + return strncmp(path.c_str(), str.c_str(), str.length()) == 0; +} + +static bool fs_end_with_internal(const std::string& path, + const std::string& str) { + return path.length() >= str.length() && + strncmp(&path[path.length() - str.length()], str.c_str(), + str.length()) == 0; +} + +static size_t& localfs_buffer_size_internal() { + static size_t x = 0; + return x; +} + +size_t localfs_buffer_size() { return localfs_buffer_size_internal(); } + +void localfs_set_buffer_size(size_t x) { localfs_buffer_size_internal() = x; } + +std::shared_ptr localfs_open_read(std::string path, + const std::string& converter) { + bool is_pipe = false; + + if (fs_end_with_internal(path, ".gz")) { + fs_add_read_converter_internal(path, is_pipe, "zcat"); + } + + fs_add_read_converter_internal(path, is_pipe, converter); + return fs_open_internal(path, is_pipe, "r", localfs_buffer_size()); +} + +std::shared_ptr localfs_open_write(std::string path, + const std::string& converter) { + shell_execute( + string::format_string("mkdir -p $(dirname \"%s\")", path.c_str())); + + bool is_pipe = false; + + if (fs_end_with_internal(path, ".gz")) { + fs_add_write_converter_internal(path, is_pipe, "gzip"); + } + + fs_add_write_converter_internal(path, is_pipe, converter); + return fs_open_internal(path, is_pipe, "w", localfs_buffer_size()); +} + +int64_t localfs_file_size(const std::string& path) { + struct stat buf; + if (0 != stat(path.c_str(), &buf)) { + LOG(FATAL) << "file stat not zero"; + return -1; + } + return (int64_t)buf.st_size; +} + +void localfs_remove(const std::string& path) { + if (path == "") { + return; + } + + shell_execute(string::format_string("rm -rf %s", path.c_str())); +} + +std::vector localfs_list(const std::string& path) { + if (path == "") { + return {}; + } + + std::shared_ptr pipe; + int err_no = 0; + pipe = shell_popen( + string::format_string("find %s -type f -maxdepth 1", path.c_str()), "r", + &err_no); + string::LineFileReader reader; + std::vector list; + + while (reader.getline(&*pipe)) { + list.push_back(reader.get()); + } + + return list; +} + +std::string localfs_tail(const std::string& path) { + if (path == "") { + return ""; + } + + return shell_get_command_output( + string::format_string("tail -1 %s ", path.c_str())); +} + +bool localfs_exists(const std::string& path) { + std::string test_f = shell_get_command_output( + string::format_string("[ -f %s ] ; echo $?", path.c_str())); + + if (string::trim_spaces(test_f) == "0") { + return true; + } + + std::string test_d = shell_get_command_output( + string::format_string("[ -d %s ] ; echo $?", path.c_str())); + + if (string::trim_spaces(test_d) == "0") { + return true; + } + + return false; +} + +void localfs_mkdir(const std::string& path) { + if (path == "") { + return; + } + + shell_execute(string::format_string("mkdir -p %s", path.c_str())); +} + +static size_t& hdfs_buffer_size_internal() { + static size_t x = 0; + return x; +} + +size_t hdfs_buffer_size() { return hdfs_buffer_size_internal(); } + +void hdfs_set_buffer_size(size_t x) { hdfs_buffer_size_internal() = x; } + +static std::string& hdfs_command_internal() { + static std::string x = "hadoop fs"; + return x; +} + +const std::string& hdfs_command() { return hdfs_command_internal(); } + +void hdfs_set_command(const std::string& x) { hdfs_command_internal() = x; } + +std::shared_ptr hdfs_open_read(std::string path, int* err_no, + const std::string& converter) { + if (fs_end_with_internal(path, ".gz")) { + path = string::format_string("%s -text \"%s\"", hdfs_command().c_str(), + path.c_str()); + } else { + path = string::format_string("%s -cat \"%s\"", hdfs_command().c_str(), + path.c_str()); + } + + bool is_pipe = true; + fs_add_read_converter_internal(path, is_pipe, converter); + return fs_open_internal(path, is_pipe, "r", hdfs_buffer_size(), err_no); +} + +std::shared_ptr hdfs_open_write(std::string path, int* err_no, + const std::string& converter) { + path = string::format_string("%s -put - \"%s\"", hdfs_command().c_str(), + path.c_str()); + bool is_pipe = true; + + if (fs_end_with_internal(path, ".gz\"")) { + fs_add_write_converter_internal(path, is_pipe, "gzip"); + } + + fs_add_write_converter_internal(path, is_pipe, converter); + return fs_open_internal(path, is_pipe, "w", hdfs_buffer_size(), err_no); +} + +void hdfs_remove(const std::string& path) { + if (path == "") { + return; + } + + shell_execute(string::format_string("%s -rmr %s &>/dev/null; true", + hdfs_command().c_str(), path.c_str())); +} + +std::vector hdfs_list(const std::string& path) { + if (path == "") { + return {}; + } + + std::string prefix = "hdfs:"; + + if (fs_begin_with_internal(path, "afs:")) { + prefix = "afs:"; + } + int err_no = 0; + std::vector list; + do { + err_no = 0; + std::shared_ptr pipe; + pipe = shell_popen( + string::format_string("%s -ls %s | ( grep ^- ; [ $? != 2 ] )", + hdfs_command().c_str(), path.c_str()), + "r", &err_no); + string::LineFileReader reader; + list.clear(); + + while (reader.getline(&*pipe)) { + std::vector line = string::split_string(reader.get()); + if (line.size() != 8) { + continue; + } + list.push_back(prefix + line[7]); + } + } while (err_no == -1); + return list; +} + +std::string hdfs_tail(const std::string& path) { + if (path == "") { + return ""; + } + + return shell_get_command_output(string::format_string( + "%s -text %s | tail -1 ", hdfs_command().c_str(), path.c_str())); +} + +bool hdfs_exists(const std::string& path) { + std::string test = shell_get_command_output(string::format_string( + "%s -test -e %s ; echo $?", hdfs_command().c_str(), path.c_str())); + + if (string::trim_spaces(test) == "0") { + return true; + } + + return false; +} + +void hdfs_mkdir(const std::string& path) { + if (path == "") { + return; + } + + shell_execute(string::format_string("%s -mkdir %s; true", + hdfs_command().c_str(), path.c_str())); +} + +int fs_select_internal(const std::string& path) { + if (fs_begin_with_internal(path, "hdfs:")) { + return 1; + } else if (fs_begin_with_internal(path, "afs:")) { + return 1; + } + + return 0; +} + +std::shared_ptr fs_open_read(const std::string& path, int* err_no, + const std::string& converter) { + switch (fs_select_internal(path)) { + case 0: + return localfs_open_read(path, converter); + + case 1: + return hdfs_open_read(path, err_no, converter); + + default: + LOG(FATAL) << "Not supported"; + } + + return {}; +} + +std::shared_ptr fs_open_write(const std::string& path, int* err_no, + const std::string& converter) { + switch (fs_select_internal(path)) { + case 0: + return localfs_open_write(path, converter); + + case 1: + return hdfs_open_write(path, err_no, converter); + + default: + LOG(FATAL) << "Not supported"; + } + + return {}; +} + +std::shared_ptr fs_open(const std::string& path, const std::string& mode, + int* err_no, const std::string& converter) { + if (mode == "r" || mode == "rb") { + return fs_open_read(path, err_no, converter); + } + + if (mode == "w" || mode == "wb") { + return fs_open_write(path, err_no, converter); + } + + LOG(FATAL) << "Unknown mode: " << mode; + return {}; +} + +int64_t fs_file_size(const std::string& path) { + switch (fs_select_internal(path)) { + case 0: + return localfs_file_size(path); + + default: + LOG(FATAL) << "Not supported"; + } + + return 0; +} + +void fs_remove(const std::string& path) { + switch (fs_select_internal(path)) { + case 0: + return localfs_remove(path); + + case 1: + return hdfs_remove(path); + + default: + LOG(FATAL) << "Not supported"; + } +} + +std::vector fs_list(const std::string& path) { + switch (fs_select_internal(path)) { + case 0: + return localfs_list(path); + + case 1: + return hdfs_list(path); + + default: + LOG(FATAL) << "Not supported"; + } + + return {}; +} + +std::string fs_tail(const std::string& path) { + switch (fs_select_internal(path)) { + case 0: + return localfs_tail(path); + + case 1: + return hdfs_tail(path); + + default: + LOG(FATAL) << "Not supported"; + } + + return ""; +} + +bool fs_exists(const std::string& path) { + switch (fs_select_internal(path)) { + case 0: + return localfs_exists(path); + + case 1: + return hdfs_exists(path); + + default: + LOG(FATAL) << "Not supported"; + } + + return false; +} + +void fs_mkdir(const std::string& path) { + switch (fs_select_internal(path)) { + case 0: + return localfs_mkdir(path); + + case 1: + return hdfs_mkdir(path); + + default: + LOG(FATAL) << "Not supported"; + } +} +} // end namespace framework +} // end namespace paddle diff --git a/paddle/fluid/framework/io/fs.h b/paddle/fluid/framework/io/fs.h new file mode 100644 index 00000000..3f017470 --- /dev/null +++ b/paddle/fluid/framework/io/fs.h @@ -0,0 +1,101 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "glog/logging.h" +#include "paddle/fluid/framework/io/shell.h" +#include "paddle/fluid/string/string_helper.h" + +namespace paddle { +namespace framework { + +int fs_select_internal(const std::string& path); + +// localfs +extern size_t localfs_buffer_size(); + +extern void localfs_set_buffer_size(size_t x); + +extern std::shared_ptr localfs_open_read(std::string path, + const std::string& converter); + +extern std::shared_ptr localfs_open_write(std::string path, + const std::string& converter); + +extern int64_t localfs_file_size(const std::string& path); + +extern void localfs_remove(const std::string& path); + +extern std::vector localfs_list(const std::string& path); + +extern std::string localfs_tail(const std::string& path); + +extern bool localfs_exists(const std::string& path); + +extern void localfs_mkdir(const std::string& path); + +// hdfs +extern size_t hdfs_buffer_size(); + +extern void hdfs_set_buffer_size(size_t x); + +extern const std::string& hdfs_command(); + +extern void hdfs_set_command(const std::string& x); + +extern std::shared_ptr hdfs_open_read(std::string path, int* err_no, + const std::string& converter); + +extern std::shared_ptr hdfs_open_write(std::string path, int* err_no, + const std::string& converter); + +extern void hdfs_remove(const std::string& path); + +extern std::vector hdfs_list(const std::string& path); + +extern std::string hdfs_tail(const std::string& path); + +extern bool hdfs_exists(const std::string& path); + +extern void hdfs_mkdir(const std::string& path); + +// aut-detect fs +extern std::shared_ptr fs_open_read(const std::string& path, int* err_no, + const std::string& converter); + +extern std::shared_ptr fs_open_write(const std::string& path, int* err_no, + const std::string& converter); + +extern std::shared_ptr fs_open(const std::string& path, + const std::string& mode, int* err_no, + const std::string& converter = ""); + +extern int64_t fs_file_size(const std::string& path); + +extern void fs_remove(const std::string& path); + +extern std::vector fs_list(const std::string& path); + +extern std::string fs_tail(const std::string& path); + +extern bool fs_exists(const std::string& path); + +extern void fs_mkdir(const std::string& path); +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/io/shell.cc b/paddle/fluid/framework/io/shell.cc new file mode 100644 index 00000000..ab671cb5 --- /dev/null +++ b/paddle/fluid/framework/io/shell.cc @@ -0,0 +1,323 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/io/shell.h" + +namespace paddle { +namespace framework { + +std::shared_ptr shell_fopen(const std::string& path, + const std::string& mode) { +#if defined _WIN32 || defined __APPLE__ + return nullptr; +#else + if (shell_verbose()) { + LOG(INFO) << "Opening file[" << path << "] with mode[" << mode << "]"; + } + FILE* fp; + if (!(fp = fopen(path.c_str(), mode.c_str()))) { + LOG(FATAL) << "fopen fail, path[" << path << "], mode[" << mode << "]"; + } + return {fp, [path](FILE* fp) { + if (shell_verbose()) { + LOG(INFO) << "Closing file[" << path << "]"; + } + if (0 != fclose(fp)) { + LOG(FATAL) << "fclose fail, path[" << path << "]"; + } + }}; +#endif +} + +// Close all open file descriptors +// The implementation is async signal safe +// Mostly copy from CPython code +static int close_open_fds_internal() { +#if defined _WIN32 || defined __APPLE__ + return 0; +#else + struct linux_dirent { + long d_ino = 0; // NOLINT + off_t d_off; + unsigned short d_reclen = 0; // NOLINT + char d_name[256]; + }; + + int dir_fd = -1; + if ((dir_fd = open("/proc/self/fd", O_RDONLY)) < 0) { + LOG(FATAL) << "proc/self/fd open fail"; + return -1; + } + char buffer[sizeof(linux_dirent)]; + + for (;;) { + int bytes = 0; + if ((bytes = syscall(SYS_getdents, dir_fd, + reinterpret_cast(buffer), + sizeof(buffer))) < 0) { + LOG(FATAL) << "syscall fail"; + return -1; + } + + if (bytes == 0) { + break; + } + + linux_dirent* entry = NULL; + + for (int offset = 0; offset < bytes; offset += entry->d_reclen) { + entry = reinterpret_cast(buffer + offset); + int fd = 0; + const char* s = entry->d_name; + + while (*s >= '0' && *s <= '9') { + fd = fd * 10 + (*s - '0'); + s++; + } + + if (s != entry->d_name && fd != dir_fd && fd >= 3) { + close(fd); + } + } + } + + close(dir_fd); + return 0; +#endif +} + +static int shell_popen_fork_internal(const char* real_cmd, bool do_read, + int parent_end, int child_end) { +#if defined _WIN32 || defined __APPLE__ + return 0; +#else + int child_pid = -1; + // Too frequent calls to fork() makes openmpi very slow. Use vfork() instead. + // But vfork() is very dangerous. Be careful. + if ((child_pid = vfork()) < 0) { + return -1; + } + + // The following code is async signal safe (No memory allocation, no access to + // global data, etc.) + if (child_pid != 0) { + return child_pid; + } + + int child_std_end = do_read ? 1 : 0; + close(parent_end); + + if (child_end != child_std_end) { + if (dup2(child_end, child_std_end) != child_std_end) { + return -1; + } + close(child_end); + } + + close_open_fds_internal(); + if (execl("/bin/bash", "bash", "-c", real_cmd, NULL) < 0) { + return -1; + } + exit(127); +#endif +} + +std::shared_ptr shell_popen(const std::string& cmd, + const std::string& mode, int* err_no) { +#if defined _WIN32 || defined __APPLE__ + return nullptr; +#else + bool do_read = mode == "r"; + bool do_write = mode == "w"; + if (!(do_read || do_write)) { + *err_no = -1; + return NULL; + } + + if (shell_verbose()) { + LOG(INFO) << "Opening pipe[" << cmd << "] with mode[" << mode << "]"; + } + + std::string real_cmd = "set -o pipefail; " + cmd; + + int pipe_fds[2]; + if (pipe(pipe_fds) != 0) { + *err_no = -1; + return NULL; + } + int parent_end = 0; + int child_end = 0; + + if (do_read) { + parent_end = pipe_fds[0]; + child_end = pipe_fds[1]; + } else if (do_write) { + parent_end = pipe_fds[1]; + child_end = pipe_fds[0]; + } + + int child_pid = shell_popen_fork_internal(real_cmd.c_str(), do_read, + parent_end, child_end); + close(child_end); + fcntl(parent_end, F_SETFD, FD_CLOEXEC); + FILE* fp; + if ((fp = fdopen(parent_end, mode.c_str())) == NULL) { + *err_no = -1; + return NULL; + } + return {fp, [child_pid, cmd, err_no](FILE* fp) { + if (shell_verbose()) { + LOG(INFO) << "Closing pipe[" << cmd << "]"; + } + + if (fclose(fp) != 0) { + *err_no = -1; + } + int wstatus = -1; + waitpid(child_pid, &wstatus, 0); + if (wstatus == 0 || wstatus == (128 + SIGPIPE) * 256 || + (wstatus == -1 && errno == ECHILD)) { + } else { + *err_no = -1; + LOG(WARNING) << "status[" << wstatus << "], cmd[" << cmd << "]" + << ", err_no[" << *err_no << "]"; + } + if (wstatus == -1 && errno == ECHILD) { + LOG(WARNING) << "errno is ECHILD"; + } + }}; +#endif +} + +static int shell_p2open_fork_internal(const char* real_cmd, int pipein_fds[2], + int pipeout_fds[2]) { +#if defined _WIN32 || defined __APPLE__ + return 0; +#else + int child_pid = -1; + if ((child_pid = fork()) < 0) { + return -1; + } + + if (child_pid != 0) { + return child_pid; + } + + close(pipein_fds[0]); + close(pipeout_fds[1]); + + if (pipein_fds[1] != 1) { + if (dup2(pipein_fds[1], 1) != 1) { + return -1; + } + close(pipein_fds[1]); + } + + if (pipeout_fds[0] != 0) { + if (dup2(pipeout_fds[0], 0) != 0) { + return -1; + } + close(pipeout_fds[0]); + } + + close_open_fds_internal(); + if (execl("/bin/sh", "sh", "-c", real_cmd, NULL) < 0) { + return -1; + } + exit(127); +#endif +} + +std::pair, std::shared_ptr> shell_p2open( + const std::string& cmd) { +#if defined _WIN32 || defined __APPLE__ + return {}; +#else + if (shell_verbose()) { + LOG(INFO) << "Opening bidirectional pipe[" << cmd << "]"; + } + + std::string real_cmd = "set -o pipefail; " + cmd; + + int pipein_fds[2]; + int pipeout_fds[2]; + if (pipe(pipein_fds) != 0) { + return {NULL, NULL}; + } + if (pipe(pipeout_fds) != 0) { + return {NULL, NULL}; + } + + int child_pid = + shell_p2open_fork_internal(real_cmd.c_str(), pipein_fds, pipeout_fds); + + close(pipein_fds[1]); + close(pipeout_fds[0]); + fcntl(pipein_fds[0], F_SETFD, FD_CLOEXEC); + fcntl(pipeout_fds[1], F_SETFD, FD_CLOEXEC); + + std::shared_ptr child_life = { + NULL, [child_pid, cmd](void*) { + if (shell_verbose()) { + LOG(INFO) << "Closing bidirectional pipe[" << cmd << "]"; + } + + int wstatus, ret; + + do { + PCHECK((ret = waitpid(child_pid, &wstatus, 0)) >= 0 || + (ret == -1 && errno == EINTR)); + } while (ret == -1 && errno == EINTR); + + PCHECK(wstatus == 0 || wstatus == (128 + SIGPIPE) * 256 || + (wstatus == -1 && errno == ECHILD)) + << "status[" << wstatus << "], cmd[" << cmd << "]"; + + if (wstatus == -1 && errno == ECHILD) { + LOG(WARNING) << "errno is ECHILD"; + } + }}; + + FILE* in_fp; + PCHECK((in_fp = fdopen(pipein_fds[0], "r")) != NULL); + FILE* out_fp; + PCHECK((out_fp = fdopen(pipeout_fds[1], "w")) != NULL); + return {{in_fp, [child_life](FILE* fp) { PCHECK(fclose(fp) == 0); }}, + {out_fp, [child_life](FILE* fp) { PCHECK(fclose(fp) == 0); }}}; +#endif +} + +std::string shell_get_command_output(const std::string& cmd) { +#if defined _WIN32 || defined __APPLE__ + return ""; +#else + int err_no = 0; + do { + err_no = 0; + std::shared_ptr pipe = shell_popen(cmd, "r", &err_no); + string::LineFileReader reader; + + if (reader.getdelim(&*pipe, 0)) { + pipe = nullptr; + if (err_no == 0) { + return reader.get(); + } + } + } while (err_no == -1); + return ""; +#endif +} + +} // end namespace framework +} // end namespace paddle diff --git a/paddle/fluid/framework/io/shell.h b/paddle/fluid/framework/io/shell.h new file mode 100644 index 00000000..46fcc92b --- /dev/null +++ b/paddle/fluid/framework/io/shell.h @@ -0,0 +1,66 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#ifdef _WIN32 +#include +#else +#include +#endif +#include +#ifndef _WIN32 +#include +#endif +#include +#include +#include +#include "paddle/fluid/platform/port.h" +#include "paddle/fluid/string/string_helper.h" + +namespace paddle { +namespace framework { + +inline bool& shell_verbose_internal() { + static bool x = false; + return x; +} + +inline bool shell_verbose() { return shell_verbose_internal(); } + +inline void shell_set_verbose(bool x) { shell_verbose_internal() = x; } + +extern std::shared_ptr shell_fopen(const std::string& path, + const std::string& mode); + +extern std::shared_ptr shell_popen(const std::string& cmd, + const std::string& mode, int* err_no); + +extern std::pair, std::shared_ptr> shell_p2open( + const std::string& cmd); + +inline void shell_execute(const std::string& cmd) { + int err_no = 0; + do { + err_no = 0; + shell_popen(cmd, "w", &err_no); + } while (err_no == -1); +} + +extern std::string shell_get_command_output(const std::string& cmd); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt new file mode 100644 index 00000000..0e12e356 --- /dev/null +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -0,0 +1,136 @@ +set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h) +file(WRITE ${pass_file} "// Generated by the paddle/fluid/framework/ir/CMakeLists.txt. DO NOT EDIT!\n\n") +file(APPEND ${pass_file} "\#pragma once\n") +file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n") + +add_subdirectory(fuse_optimizer_ops_pass) +add_subdirectory(memory_optimize_pass) +add_subdirectory(multi_devices_graph_pass) + +# Usage: pass_library(target inference) will append to paddle_inference_pass.h +unset(INFER_IR_PASSES CACHE) # clear the global variable +function(pass_library TARGET DEST) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + set(targetPrefix "") + + # Get optional argument + set(extraMacroArgs ${ARGN}) + list(LENGTH extraMacroArgs numExtraMacroArgs) + if(numExtraMacroArgs GREATER 0) + list(GET extraMacroArgs 0 targetPrefix) + endif() + + cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + if(targetPrefix) + cc_library(${TARGET} SRCS ${targetPrefix}/${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base ${op_library_DEPS}) + else() + cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base ${op_library_DEPS}) + endif() + + # add more DEST here, such as train, dist and collect USE_PASS into a file automatically. + if (${DEST} STREQUAL "base" OR ${DEST} STREQUAL "inference") + message(STATUS "add pass ${TARGET} ${DEST}") + file(APPEND ${pass_file} "USE_PASS(${TARGET});\n") + set(INFER_IR_PASSES ${INFER_IR_PASSES} ${TARGET} CACHE INTERNAL "") + endif() +endfunction() + +cc_library(node SRCS node.cc DEPS proto_desc) +cc_library(graph SRCS graph.cc DEPS node pretty_log) +cc_library(graph_helper SRCS graph_helper.cc DEPS graph) +cc_library(pass SRCS pass.cc DEPS graph node graph_helper) +cc_library(graph_traits SRCS graph_traits.cc DEPS graph) +cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits) +cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass) + +cc_library(coalesce_grad_tensor_pass SRCS coalesce_grad_tensor_pass.cc DEPS graph graph_helper) + +pass_library(graph_to_program_pass base) +pass_library(graph_viz_pass base) +pass_library(lock_free_optimize_pass base) +pass_library(fc_fuse_pass inference) +pass_library(attention_lstm_fuse_pass inference) +pass_library(infer_clean_graph_pass inference) +pass_library(fc_lstm_fuse_pass inference) +pass_library(embedding_fc_lstm_fuse_pass inference) +pass_library(fc_gru_fuse_pass inference) +pass_library(seq_concat_fc_fuse_pass inference) +pass_library(multi_batch_merge_pass base) +pass_library(conv_bn_fuse_pass inference) +pass_library(seqconv_eltadd_relu_fuse_pass inference) +pass_library(seqpool_concat_fuse_pass inference) +pass_library(repeated_fc_relu_fuse_pass inference) +pass_library(squared_mat_sub_fuse_pass inference) +pass_library(is_test_pass base) +pass_library(conv_elementwise_add_act_fuse_pass inference) +pass_library(conv_elementwise_add2_act_fuse_pass inference) +pass_library(conv_elementwise_add_fuse_pass inference) +pass_library(conv_affine_channel_fuse_pass inference) +pass_library(transpose_flatten_concat_fuse_pass inference) +pass_library(identity_scale_op_clean_pass base) +pass_library(sync_batch_norm_pass base) +pass_library(runtime_context_cache_pass base) +pass_library(quant_conv2d_dequant_fuse_pass inference) +pass_library(fillconstant_elementwisemul_fuse inference) +pass_library(shuffle_channel_detect_pass inference) +pass_library(delete_quant_dequant_op_pass inference) + +if(ANAKIN_SUBGRAPH) +pass_library(simplify_anakin_priorbox_detection_out_pass inference) +endif() + +if(WITH_MKLDNN) + pass_library(mkldnn_placement_pass base mkldnn) + pass_library(depthwise_conv_mkldnn_pass base mkldnn) + pass_library(conv_bias_mkldnn_fuse_pass inference mkldnn) + pass_library(conv_relu_mkldnn_fuse_pass inference mkldnn) + pass_library(conv_brelu_mkldnn_fuse_pass inference mkldnn) + pass_library(conv_concat_relu_mkldnn_fuse_pass inference mkldnn) + pass_library(conv_elementwise_add_mkldnn_fuse_pass inference mkldnn) + pass_library(fc_mkldnn_pass inference mkldnn) + pass_library(cpu_quantize_placement_pass base mkldnn) + pass_library(cpu_quantize_pass inference mkldnn) + pass_library(cpu_quantize_squash_pass inference mkldnn) +endif() + +if(WITH_NGRAPH) + cc_library(ngraph_subgraph_pass SRCS ngraph_subgraph_pass.cc DEPS ngraph_bridge + analysis_helper subgraph_detector graph_pattern_detector pass fuse_pass_base ${op_library_DEPS}) + set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h) + file(APPEND ${pass_file} "USE_PASS(ngraph_subgraph_pass);\n") + set(INFER_IR_PASSES ${INFER_IR_PASSES} ngraph_subgraph_pass CACHE INTERNAL "") +endif() + +cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector ) +cc_library(fuse_relu_depthwise_conv_pass SRCS fuse_relu_depthwise_conv_pass.cc DEPS pass graph_pattern_detector ) + +set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library") + +cc_library(pass_builder SRCS pass_builder.cc DEPS pass) + +cc_test(node_test SRCS node_test.cc DEPS node) +cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper) +cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry) +cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry) +cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) +cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) +cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) +cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto) +cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass) +if(NOT WIN32) + cc_test(test_sync_batch_norm_pass SRCS sync_batch_norm_pass_tester.cc DEPS sync_batch_norm_pass) +endif() +if (WITH_MKLDNN) + cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass) + cc_test(test_conv_bias_mkldnn_fuse_pass SRCS mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc DEPS conv_bias_mkldnn_fuse_pass naive_executor) + cc_test(test_conv_relu_mkldnn_fuse_pass SRCS mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass) + cc_test(test_conv_brelu_mkldnn_fuse_pass SRCS mkldnn/conv_brelu_mkldnn_fuse_pass_tester.cc DEPS conv_brelu_mkldnn_fuse_pass) + cc_test(test_conv_concat_relu_mkldnn_fuse_pass SRCS mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc DEPS conv_concat_relu_mkldnn_fuse_pass) + cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass) + cc_test(test_mkldnn_placement_pass SRCS mkldnn/mkldnn_placement_pass_tester.cc DEPS mkldnn_placement_pass) + cc_test(test_cpu_quantize_placement_pass SRCS mkldnn/cpu_quantize_placement_pass_tester.cc DEPS cpu_quantize_placement_pass) + cc_test(test_cpu_quantize_pass SRCS mkldnn/cpu_quantize_pass_tester.cc DEPS cpu_quantize_pass naive_executor) + cc_test(test_cpu_quantize_squash_pass SRCS mkldnn/cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor) +endif () diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc new file mode 100644 index 00000000..c4ffb2a9 --- /dev/null +++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc @@ -0,0 +1,284 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/attention_lstm_fuse_pass.h" +#include +#include +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/graph_viz_pass.h" +#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace framework { +namespace ir { + +struct Param { + std::string X = "concat_0.tmp_0"; + std::string C0 = "cell_init"; + std::string H0 = "hidden_init"; + std::string AttentionWeight = "attention_fc.w_0"; + std::string AttentionBias = "attention_fc.b_0"; + std::string AttentionScalar = "attention_output.w_0"; + std::string AttentionScalarBias = "attention_output.b_0"; + std::string LSTMWeight = "attention_w.new"; + std::string LSTMBias = "attention_b.new"; + std::string Hidden = "array_to_lod_tensor_0.tmp_0"; + std::string Cell = "at.cell.new"; + std::string AttentionedX = "at.x.new"; + std::string AttentionFCOut = "at.fc.new"; + std::string LSTMX = "at.lstmx.new"; + std::string LSTMOUT = "at.lstmout.new"; +}; + +void PrepareParameters(Graph* graph, const Param& param); + +void FindWhileOp(Graph* graph) { + GraphPatternDetector gpd; + std::unordered_set fused_external_ops( + {35, 36, 37, 38, 43, 44, 49, 45, 46, 47, 41, 42, 53, 54, 48, + 57, 55, 56, 52, 74, 80, 77, 78, 79, 50, 77, 39, 40, 51}); + + gpd.mutable_pattern()->NewNode( + [&](Node* n) { return fused_external_ops.count(n->id()); }, "while"); + + if (!graph->Has(kGraphvizMarkedNodeAttr)) { + graph->Set(kGraphvizMarkedNodeAttr, new GraphVizPass::marked_nodes_t); + } + auto& marked_nodes = + graph->Get(kGraphvizMarkedNodeAttr); + + auto handle = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + auto* while_pat_node = gpd.pattern().RetrieveNode("while"); + auto* while_node = subgraph.at(while_pat_node); + marked_nodes.insert(while_node); + }; + gpd(graph, handle); + + Param param; + // Add AttentionLSTM node + OpDesc op_desc; + op_desc.SetType("attention_lstm"); + +#define OP_SET_IN(x) op_desc.SetInput(#x, {param.x}); +#define OP_SET_OUT(x) op_desc.SetOutput(#x, {param.x}); + OP_SET_IN(X); + OP_SET_IN(C0); + OP_SET_IN(H0); + OP_SET_IN(AttentionWeight); + OP_SET_IN(AttentionBias); + OP_SET_IN(AttentionScalar); + OP_SET_IN(AttentionScalarBias); + OP_SET_IN(LSTMWeight); + OP_SET_IN(LSTMBias); + + OP_SET_OUT(Hidden); + OP_SET_OUT(Cell); + OP_SET_OUT(AttentionedX); + OP_SET_OUT(AttentionFCOut); + OP_SET_OUT(LSTMX); + OP_SET_OUT(LSTMOUT); +#undef OP_SET_IN +#undef OP_SET_OUT + + auto* X = graph->RetrieveNode(34); + auto* LSTMOUT = graph->RetrieveNode(81); + auto* cell_init = graph->RetrieveNode(6); + auto* hidden_init = graph->RetrieveNode(8); + + auto* lstm_op = graph->CreateOpNode(&op_desc); + PrepareParameters(graph, param); + + IR_NODE_LINK_TO(X, lstm_op); + IR_NODE_LINK_TO(cell_init, lstm_op); + IR_NODE_LINK_TO(hidden_init, lstm_op); + IR_NODE_LINK_TO(lstm_op, LSTMOUT); + + GraphSafeRemoveNodes(graph, marked_nodes); +} + +#define CHECK_P1(x) PADDLE_ENFORCE_NOT_NULL(x); +#define CHECK_P2(x0, x1) \ + CHECK_P1(x0); \ + CHECK_P1(x1); +#define CHECK_P3(x0, x1, x2) \ + CHECK_P2(x0, x1); \ + CHECK_P1(x2); +#define CHECK_P4(x0, x1, x2, x3) \ + CHECK_P3(x0, x1, x2); \ + CHECK_P1(x3); +#define CHECK_P5(x0, x1, x2, x3, x4) \ + CHECK_P4(x0, x1, x2, x3); \ + CHECK_P1(x4); + +void PrepareLSTMWeight(const LoDTensor& W_forget_w0, + const LoDTensor& W_forget_w1, + const LoDTensor& W_input_w0, const LoDTensor& W_input_w1, + const LoDTensor& W_output_w0, + const LoDTensor& W_output_w1, const LoDTensor& W_cell_w0, + const LoDTensor& W_cell_w1, LoDTensor* out); + +void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input, + const LoDTensor& B_output, const LoDTensor& B_cell, + LoDTensor* out); + +void PrepareParameters(Graph* graph, const Param& param) { + // Check parameters + PADDLE_ENFORCE(graph->Has(kParamScopeAttr)); + auto& scope = graph->Get(kParamScopeAttr); + + // Create new parameters. + scope.Var(param.LSTMWeight)->GetMutable(); + scope.Var(param.LSTMBias)->GetMutable(); + scope.Var(param.Hidden)->GetMutable(); + scope.Var(param.Cell)->GetMutable(); + scope.Var(param.AttentionedX)->GetMutable(); + scope.Var(param.AttentionFCOut)->GetMutable(); + scope.Var(param.LSTMX)->GetMutable(); + scope.Var(param.LSTMOUT)->GetMutable(); + +#define GATE_W(name__) \ + auto* W_##name__##_w0 = scope.FindVar(#name__ ".w_0"); \ + auto* W_##name__##_w1 = scope.FindVar(#name__ ".w_1"); \ + auto* W_##name__##_b0 = scope.FindVar(#name__ ".b_0"); \ + CHECK_P3(W_##name__##_w0, W_##name__##_w1, W_##name__##_b0); \ + VLOG(4) << #name__ "_w0" \ + << " shape: " << W_##name__##_w0->Get().dims(); \ + VLOG(4) << #name__ "_w1" \ + << " shape: " << W_##name__##_w1->Get().dims(); \ + VLOG(4) << #name__ "_b0" \ + << " shape: " << W_##name__##_b0->Get().dims(); \ + auto& W_##name__##_w0_t = W_##name__##_w0->Get(); \ + auto& W_##name__##_w1_t = W_##name__##_w1->Get(); \ + auto& W_##name__##_b0_t = W_##name__##_b0->Get(); + + GATE_W(forget); + GATE_W(input); + GATE_W(output); + GATE_W(c); +#undef GATE_W + + auto* attention_fc_w = scope.FindVar("attention_fc.w_0"); + auto* attention_fc_b = scope.FindVar("attention_fc.b_0"); + auto* attention_output_w = scope.FindVar("attention_output.w_0"); + auto* attention_output_b = scope.FindVar("attention_output.b_0"); + CHECK_P4(attention_fc_w, attention_fc_b, attention_output_w, + attention_output_b); + + auto* lstm_weight = scope.Var(param.LSTMWeight); + auto* lstm_weight_t = lstm_weight->GetMutable(); + auto* lstm_bias = scope.Var(param.LSTMBias); + auto* lstm_bias_t = lstm_bias->GetMutable(); + + // reshape attention_bias + auto* attention_bias_t = + scope.FindVar(param.AttentionBias)->GetMutable(); + PADDLE_ENFORCE_EQ(attention_bias_t->dims().size(), 1); + attention_bias_t->Resize(make_ddim({1, attention_bias_t->dims()[0]})); + + auto* attention_scalar_bias_t = + scope.FindVar(param.AttentionScalarBias)->GetMutable(); + attention_scalar_bias_t->Resize( + make_ddim({1, attention_scalar_bias_t->dims()[0]})); + + PrepareLSTMWeight(W_forget_w0_t, W_forget_w1_t, W_input_w0_t, W_input_w1_t, + W_output_w0_t, W_output_w1_t, W_c_w0_t, W_c_w1_t, + lstm_weight_t); + PrepareLSTMBias(W_forget_b0_t, W_input_b0_t, W_output_b0_t, W_c_b0_t, + lstm_bias_t); +} + +// Prepare parameters +void PrepareLSTMWeight(const LoDTensor& W_forget_w0, + const LoDTensor& W_forget_w1, + const LoDTensor& W_input_w0, const LoDTensor& W_input_w1, + const LoDTensor& W_output_w0, + const LoDTensor& W_output_w1, const LoDTensor& W_cell_w0, + const LoDTensor& W_cell_w1, LoDTensor* out) { + int D = W_forget_w0.dims()[0]; + int M = W_forget_w1.dims()[0]; + out->Resize(make_ddim({D + M, 4 * D})); + VLOG(3) << "LSTMWeight resized to " << out->dims(); + + float* out_data = out->mutable_data(platform::CPUPlace()); + std::array tensors{ + W_forget_w0.data(), W_input_w0.data(), + W_output_w0.data(), W_cell_w0.data()}; + std::array tensors1{ + W_forget_w1.data(), W_input_w1.data(), + W_output_w1.data(), W_cell_w1.data()}; + + for (int row = 0; row < D; row++) { + for (int col = 0; col < 4; col++) { + float* dst = out_data + 4 * D * row + D * col; + const float* src = tensors[col] + D * row; + memcpy(dst, src, D * sizeof(float)); + } + } + + for (int row = 0; row < M; row++) { + for (int col = 0; col < 4; col++) { + float* dst = out_data + 4 * D * (D + row) + D * col; + const float* src = tensors1[col] + D * row; + memcpy(dst, src, D * sizeof(float)); + } + } +} + +void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input, + const LoDTensor& B_output, const LoDTensor& B_cell, + LoDTensor* out) { + std::array tensors{ + B_forget.data(), B_input.data(), B_output.data(), + B_cell.data()}; + + PADDLE_ENFORCE_EQ(B_forget.dims().size(), 1); + int D = B_forget.dims()[0]; + out->Resize(make_ddim({1, 4 * D})); + auto* out_data = out->mutable_data(platform::CPUPlace()); + for (size_t i = 0; i < tensors.size(); i++) { + memcpy(out_data + D * i, tensors[i], D * sizeof(float)); + } +} + +// Parameters + +void AttentionLSTMFusePass::ApplyImpl(ir::Graph* graph) const { + PDPattern external_pattern, subblock_pattern; + + // Use the following variables to tell whether this model is RNN1. + // This fuse can only works on the RNN1 model. + std::unordered_set specified_vars({"data_lod_attention", + "cell_init", "hidden_init", + "data", "week", "minute"}); + size_t count = 0; + for (auto* node : graph->Nodes()) { + if (node->IsVar() && specified_vars.count(node->Name())) { + ++count; + } + } + if (count < specified_vars.size()) { + return; + } + + // Continue to fuse. + FindWhileOp(graph); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(attention_lstm_fuse_pass, + paddle::framework::ir::AttentionLSTMFusePass); diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h new file mode 100644 index 00000000..47ed9f03 --- /dev/null +++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h @@ -0,0 +1,30 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class AttentionLSTMFusePass : public FusePassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc b/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc new file mode 100644 index 00000000..3eb4ef9f --- /dev/null +++ b/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc @@ -0,0 +1,488 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h" +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/details/build_strategy.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_registry.h" + +DEFINE_double(fuse_parameter_memory_size, -1.0, // MBytes + "fuse_parameter_memory_size is up limited memory size(MB)" + "of one group parameters' gradient which is the input " + "of communication calling(e.g NCCLAllReduce). " + "The default value is 0, it means that " + "not set group according to memory_size."); +DEFINE_int32( + fuse_parameter_groups_size, 1, + "fuse_parameter_groups_size is the up limited size of one group " + "parameters' gradient. " + "The default value is a experimental result. If the " + "fuse_parameter_groups_size is 1, it means that the groups size is " + "the number of parameters' gradient. If the fuse_parameter_groups_size is " + "-1, it means that there are only one group. The default value is 3, it is " + "an experimental value."); + +namespace paddle { +namespace framework { +namespace ir { +// unit of the FLAGS_fuse_parameter_memory_size. +static constexpr double kMB = 1048576.0; + +// SetFuseParameterGroupsSize and SetFuseParameterMemorySize are used in unit +// test, because it is invalid that seting 'FLAGS_fuse_parameter_memory_size' +// and 'FLAGS_fuse_parameter_groups_size' in unit test. +void SetFuseParameterGroupsSize(int group_size) { + FLAGS_fuse_parameter_groups_size = group_size; +} + +int GetFuseParameterGroupsSize() { return FLAGS_fuse_parameter_groups_size; } + +void SetFuseParameterMemorySize(double memory_size) { + FLAGS_fuse_parameter_memory_size = memory_size; +} + +double GetFuseParameterMemorySize() { return FLAGS_fuse_parameter_memory_size; } + +class CoalesceGradTensorPass : public ir::Pass { + protected: + void ApplyImpl(ir::Graph *graph) const { + ir::Graph &result = *graph; + + details::ParamsAndGrads params_grads; + RecordParamsAndGrads(result, ¶ms_grads); + + VLOG(10) << "The number of params and grads is:" << params_grads.size(); + if (params_grads.size() == 0) { + return; + } + + auto vars_info = GetVarInfo(result); + ResetAttribute(details::kParamsAndDenseGrads, + &result); + ResetAttribute(details::kParamsAndSparseGrads, + &result); + ResetAttribute( + details::kGroupParamsAndDenseGrads, &result); + auto &p_g_dense_grad = + result.Get(details::kParamsAndDenseGrads); + auto &p_g_sparse_grad = + result.Get(details::kParamsAndSparseGrads); + + for (auto ¶m_grad : params_grads) { + if (IsLoDTensorType(GetTypeOfVar(vars_info, param_grad.second))) { + p_g_dense_grad.emplace_back(param_grad); + } else { + p_g_sparse_grad.emplace_back(param_grad); + } + } + + VLOG(10) << "Dense grads: " << p_g_dense_grad.size() + << ", Sparse grads: " << p_g_sparse_grad.size(); + if (p_g_dense_grad.size() == 0) { + return; + } + + auto num_of_p_g_dense_grad = p_g_dense_grad.size(); + auto &group_params_grads = result.Get( + details::kGroupParamsAndDenseGrads); + // Note: the order of p_g_dense_grad may be changed by + // SetGroupParamsAndGrads. + SetGroupParamsAndGrads(vars_info, p_g_dense_grad, &group_params_grads); + + p_g_dense_grad.clear(); + p_g_dense_grad.reserve(num_of_p_g_dense_grad); + for (auto &group_p_g : group_params_grads) { + p_g_dense_grad.insert(p_g_dense_grad.end(), group_p_g.begin(), + group_p_g.end()); + } + PADDLE_ENFORCE_EQ( + p_g_dense_grad.size(), num_of_p_g_dense_grad, + "The number of p_g_dense_grad is not consistent with before."); + + if (IsUnifiedDtype(p_g_dense_grad, vars_info)) { + SetGradientPersistable(p_g_dense_grad, vars_info); + CoalesceTensors(vars_info, p_g_dense_grad, &result); + } else { + for (auto &sub_param_grad : group_params_grads) { + SetGradientPersistable(p_g_dense_grad, vars_info); + PADDLE_ENFORCE(IsUnifiedDtype(sub_param_grad, vars_info), + "The data type of the same group is not consistent."); + CoalesceTensors(vars_info, sub_param_grad, &result); + } + } + } + + void SetGradientPersistable( + const std::vector> &sub_param_grad, + const std::unordered_map> &vars_info) + const { + for (auto &p_g : sub_param_grad) { + auto iter = vars_info.find(p_g.second); + PADDLE_ENFORCE(iter != vars_info.end(), "%s is not found.", p_g.second); + PADDLE_ENFORCE(!iter->second.empty()); + // Set persistable + for (auto it : iter->second) { + PADDLE_ENFORCE_NOT_NULL(it->Var()); + it->Var()->SetPersistable(true); + } + PADDLE_ENFORCE(IsLoDTensorType(GetTypeOfVar(vars_info, p_g.second))); + } + } + + bool IsUnifiedDtype( + const details::ParamsAndGrads ¶ms_grads, + const std::unordered_map> &vars_info) + const { + if (params_grads.empty()) return true; + auto dtype = GetDtypeOfVar(vars_info, params_grads.front().second); + for (auto p_g : params_grads) { + auto next_dtype = GetDtypeOfVar(vars_info, p_g.second); + if (next_dtype != dtype) { + return false; + } + } + return true; + } + + void CoalesceTensors( + const std::unordered_map> &vars_info, + const details::ParamsAndGrads ¶ms_grads, Graph *result) const { + // Create a FusedVarsSet to avoid duplicating names for fused_var in other + // pass. + if (!result->Has(details::kFusedVars)) { + result->Set(details::kFusedVars, new details::FusedVars); + } + // the kFusedGrads is used be fuse_optimizer_op_pass. + if (!result->Has(details::kFusedGrads)) { + result->Set(details::kFusedGrads, new details::FusedGrads); + } + if (!result->Has(details::kProgramDescs)) { + result->Set(details::kProgramDescs, new details::ProgramDescs); + } + // the fused_var_name should be unique, so it appends + // params_grads.begin()->second. + auto fused_grad_var_name = std::string(details::kFusedVarNamePrefix) + + "@GRAD@" + params_grads.begin()->second; + auto &fused_var_set = result->Get(details::kFusedVars); + PADDLE_ENFORCE_EQ(fused_var_set.count(fused_grad_var_name), 0, + "%s is duplicate in FusedVars.", fused_grad_var_name); + fused_var_set.insert(fused_grad_var_name); + result->Get(details::kFusedGrads) + .emplace_back(fused_grad_var_name); + + InitFusedVarsAndAllocSpaceForVars(vars_info, fused_grad_var_name, + params_grads, result); + } + + template + void ResetAttribute(const std::string &attr_name, ir::Graph *graph) const { + if (graph->Has(attr_name)) { + VLOG(10) << attr_name << " is reset."; + graph->Erase(attr_name); + } + graph->Set(attr_name, new AttrType); + } + + void SetGroupParamsAndGrads( + const std::unordered_map> &vars_info, + const details::ParamsAndGrads ¶ms_grads, + details::GroupParamsAndGrads *group_params_grads) const { + SetGroupAccordingToLayers(vars_info, params_grads, group_params_grads); + SetGroupAccordingToMemorySize(vars_info, group_params_grads); + if (!IsUnifiedDtype(params_grads, vars_info)) { + ReGroupByDtype(vars_info, group_params_grads); + } + } + + void SetGroupAccordingToLayers( + const std::unordered_map> &vars_info, + const details::ParamsAndGrads ¶ms_grads, + details::GroupParamsAndGrads *group_params_grads) const { + std::map var_idx; + + for (size_t i = 0; i < params_grads.size(); ++i) { + auto pos = params_grads[i].first.find_first_of("."); + + std::string var_key; + if (pos == std::string::npos) { + var_key = params_grads[i].first; + } else { + var_key = params_grads[i].first.substr(0, pos); + } + + size_t idx = 0; + auto var_idx_iter = var_idx.find(var_key); + if (var_idx_iter != var_idx.end()) { + idx = var_idx_iter->second; + } else { + group_params_grads->emplace_back(); + idx = group_params_grads->size() - 1; + var_idx[var_key] = idx; + } + auto &local_group_params_grads = group_params_grads->at(idx); + local_group_params_grads.emplace_back( + std::make_pair(params_grads[i].first, params_grads[i].second)); + } + + if (VLOG_IS_ON(10)) { + VLOG(10) << "SetGroupAccordingToLayers: "; + PrintGroupInfo(vars_info, group_params_grads); + } + } + + void PrintGroupInfo( + const std::unordered_map> &vars_info, + details::GroupParamsAndGrads *group_params_grads) const { + for (size_t i = 0; i < group_params_grads->size(); ++i) { + VLOG(10) << "group " << i; + std::stringstream out; + size_t gps_size = 0; + for (auto &p_g : group_params_grads->at(i)) { + auto var_desc = GetVarDescFromVarsInfo(vars_info, p_g.first); + auto shape = var_desc->GetShape(); + size_t size = framework::SizeOfType(var_desc->GetDataType()); + std::for_each(shape.begin(), shape.end(), + [&size](const int64_t &n) { size *= n; }); + gps_size += size; + out << string::Sprintf("(%s(%d), %s)", p_g.first, size, p_g.second); + } + + auto dtype = + GetDtypeOfVar(vars_info, group_params_grads->at(i).front().first); + VLOG(10) << out.str() + << ", group size:" << group_params_grads->at(i).size() + << ", group memory size:" << static_cast(gps_size) / kMB + << "(MB), dtype:" << dtype; + } + } + + void SetGroupAccordingToMemorySize( + const std::unordered_map> &vars_info, + details::GroupParamsAndGrads *group_params_grads) const { + const double group_memory_size = GetFuseParameterMemorySize(); + if (group_memory_size <= 0.0) { + return; + } + details::GroupParamsAndGrads local_group_params_grads; + + size_t j = 0; + while (j < group_params_grads->size()) { + local_group_params_grads.emplace_back(); + auto &group_p_g = local_group_params_grads.back(); + + size_t local_group_memory_size = 0; + while (j < group_params_grads->size()) { + for (auto &p_g_iter : group_params_grads->at(j)) { + auto var_desc = GetVarDescFromVarsInfo(vars_info, p_g_iter.second); + size_t size = framework::SizeOfType(var_desc->GetDataType()); + auto shape = var_desc->GetShape(); + std::for_each(shape.begin(), shape.end(), + [&size](const int64_t &n) { size *= n; }); + local_group_memory_size += size; + } + + group_p_g.insert(group_p_g.end(), group_params_grads->at(j).begin(), + group_params_grads->at(j).end()); + + ++j; + if (j >= group_params_grads->size()) { + break; + } + + if (GetFuseParameterGroupsSize() > 1 && + group_p_g.size() > + static_cast(GetFuseParameterGroupsSize())) { + break; + } + + if (static_cast(local_group_memory_size) / kMB >= + group_memory_size) { + break; + } + } + } + + std::swap(*group_params_grads, local_group_params_grads); + + if (VLOG_IS_ON(10)) { + VLOG(10) << string::Sprintf( + "SetGroupAccordingToMemorySize(memory_size: %f MB):", + GetFuseParameterMemorySize()); + PrintGroupInfo(vars_info, group_params_grads); + } + } + + void ReGroupByDtype( + const std::unordered_map> &vars_info, + details::GroupParamsAndGrads *group_params_grads) const { + details::GroupParamsAndGrads new_group_params_grads; + + for (auto &group_p_g : *group_params_grads) { + std::map type_idx; + details::GroupParamsAndGrads local_group_params_grads; + + for (auto &p_g : group_p_g) { + auto dtype = GetDtypeOfVar(vars_info, p_g.second); + + size_t idx = 0; + auto var_idx_iter = type_idx.find(dtype); + if (var_idx_iter != type_idx.end()) { + idx = var_idx_iter->second; + } else { + local_group_params_grads.emplace_back(); + idx = local_group_params_grads.size() - 1; + type_idx[dtype] = idx; + } + + auto &local = local_group_params_grads.at(idx); + local.emplace_back(p_g); + } + + VLOG(10) << "local_group_params_grads size:" + << local_group_params_grads.size(); + new_group_params_grads.insert(new_group_params_grads.end(), + local_group_params_grads.begin(), + local_group_params_grads.end()); + } + + std::swap(*group_params_grads, new_group_params_grads); + + if (VLOG_IS_ON(10)) { + VLOG(10) << string::Sprintf("ReGroupByDtype(memory_size: %f MB, %u):", + GetFuseParameterMemorySize(), + GetFuseParameterGroupsSize()); + PrintGroupInfo(vars_info, group_params_grads); + } + } + + proto::VarType::Type GetDtypeOfVar( + const std::unordered_map> &vars_info, + const std::string &name) const { + auto var_desc = GetVarDescFromVarsInfo(vars_info, name); + return var_desc->GetDataType(); + } + + proto::VarType::Type GetTypeOfVar( + const std::unordered_map> &vars_info, + const std::string &name) const { + auto var_desc = GetVarDescFromVarsInfo(vars_info, name); + return var_desc->GetType(); + } + + private: + bool IsLoDTensorType(const proto::VarType::Type &type) const { + // Current only support LOD_TENSOR. + return type == proto::VarType::LOD_TENSOR; + } + + std::unordered_map> GetVarInfo( + const Graph &result) const { + std::unordered_map> vars; + for (Node *node : result.Nodes()) { + if (node->IsVar() && node->Var()) { + // Note: The graph may have the same name node. For example, parameter + // is the input of operator and it also is the output of optimizer; + vars[node->Var()->Name()].emplace_back(node); + } + } + return vars; + } + + const VarDesc *GetVarDescFromVarsInfo( + const std::unordered_map> &vars_info, + const std::string &var_name) const { + auto grad_iter = vars_info.find(var_name); + PADDLE_ENFORCE(grad_iter != vars_info.end(), "%s is not found.", var_name); + PADDLE_ENFORCE(!grad_iter->second.empty()); + PADDLE_ENFORCE_NOT_NULL(grad_iter->second.front()->Var()); + return grad_iter->second.front()->Var(); + } + + void RecordParamsAndGrads(const ir::Graph &graph, + details::ParamsAndGrads *params_grads) const { + std::vector topo_nodes = ir::TopologySortOperations(graph); + for (auto &node : topo_nodes) { + try { + bool is_bk_op = + static_cast(boost::get(node->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) & + static_cast(OpRole::kBackward)); + if (!is_bk_op) continue; + // Currently, we assume that once gradient is generated, it can be + // broadcast, and each gradient is only broadcast once. + auto backward_vars = + boost::get>(node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + PADDLE_ENFORCE_EQ(backward_vars.size() % 2, static_cast(0)); + for (size_t i = 0; i < backward_vars.size(); i += 2) { + VLOG(10) << "Trainable parameter: " << backward_vars[i] + << ", gradient: " << backward_vars[i + 1]; + + params_grads->emplace_back(std::make_pair( + backward_vars[i] /*param*/, backward_vars[i + 1] /*grad*/)); + } + } catch (boost::bad_get e) { + } + } + } + + void InitFusedVarsAndAllocSpaceForVars( + const std::unordered_map> &vars_info, + const std::string &fused_var_name, + const details::ParamsAndGrads ¶ms_grads, ir::Graph *result) const { + // Alloc continuous space for vars. + std::vector grads_name; + std::vector params_name; + grads_name.reserve(params_grads.size()); + params_name.reserve(params_grads.size()); + for (auto &p_g : params_grads) { + params_name.emplace_back(p_g.first); + grads_name.emplace_back(p_g.second); + } + + result->Get(details::kProgramDescs).emplace_back(); + ProgramDesc &program_desc = + result->Get(details::kProgramDescs).back(); + auto *global_block = program_desc.MutableBlock(0); + AppendAllocSpaceForVarsOp(params_name, grads_name, fused_var_name, + global_block); + } + + void AppendAllocSpaceForVarsOp(const std::vector ¶ms_name, + const std::vector &grads_name, + const std::string &fused_var_name, + BlockDesc *global_block) const { + auto op_desc = global_block->AppendOp(); + op_desc->SetType("coalesce_tensor"); + op_desc->SetInput("Input", params_name); + op_desc->SetOutput("Output", grads_name); + op_desc->SetOutput("FusedOutput", {fused_var_name}); + } +}; +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(coalesce_grad_tensor_pass, + paddle::framework::ir::CoalesceGradTensorPass) + .RequirePassAttr(paddle::framework::details::kPlaces) + .RequirePassAttr(paddle::framework::details::kLocalScopes); diff --git a/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h b/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h new file mode 100644 index 00000000..38dc4c99 --- /dev/null +++ b/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h @@ -0,0 +1,29 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once +#include + +namespace paddle { +namespace framework { +namespace ir { + +void SetFuseParameterGroupsSize(int group_size); +int GetFuseParameterGroupsSize(); + +void SetFuseParameterMemorySize(double memory_size); +double GetFuseParameterMemorySize(); + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc new file mode 100644 index 00000000..fecc159a --- /dev/null +++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc @@ -0,0 +1,218 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h" +#include +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/operators/math/cpu_vec.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_CONV_BN_NODES(pattern_name) \ + /* OPERATORS */ \ + GET_IR_NODE_FROM_SUBGRAPH(conv, conv, pattern_name); \ + GET_IR_NODE_FROM_SUBGRAPH(affine_channel, affine_channel, pattern_name); \ + /* CONV inputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, pattern_name); \ + /* CONV outputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, pattern_name); \ + /* Affine Channel inputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(ac_scale, ac_scale, pattern_name); \ + GET_IR_NODE_FROM_SUBGRAPH(ac_bias, ac_bias, pattern_name); \ + /* Affine channel outputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(ac_out, ac_out, pattern_name); /* Out */ + +void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight, + const ir::Node& ac_scale, + const LoDTensor& ac_bias_tensor, + LoDTensor* eltwise_y_in_tensor) { + using EigenVectorArrayMap = + Eigen::Map>; + using ConstEigenVectorArrayMap = + Eigen::Map>; + using EigenMatrixArrayMap = Eigen::Map< + Eigen::Array>; + + // Re-compute bias of conv2d from AffineChannel + PADDLE_ENFORCE_EQ(eltwise_y_in_tensor->dims(), ac_bias_tensor.dims()); + + auto* scale_tensor = scope->FindVar(ac_scale.Name())->GetMutable(); + + ConstEigenVectorArrayMap scale_array(scale_tensor->data(), + scale_tensor->numel(), 1); + ConstEigenVectorArrayMap ac_bias_array(ac_bias_tensor.data(), + ac_bias_tensor.numel(), 1); + + EigenVectorArrayMap eltwise_y_in_array( + eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), + eltwise_y_in_tensor->numel(), 1); + + eltwise_y_in_array = (eltwise_y_in_array * scale_array) + ac_bias_array; + + // Re-compute weight of conv2d from AffineChannel + auto* weights = scope->FindVar(conv_weight->Name())->GetMutable(); + auto weights_shape = weights->dims(); + auto weights_shape_2d = flatten_to_2d(weights_shape, 1); + + EigenMatrixArrayMap weights_array_2d( + weights->mutable_data(platform::CPUPlace()), weights_shape_2d[0], + weights_shape_2d[1]); + + weights_array_2d.colwise() *= scale_array; +} + +void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init(name_scope_, graph); + + auto* scope = param_scope(); + PADDLE_ENFORCE(scope); + + GraphPatternDetector gpd; + auto* conv_input = + gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) + ->AsInput() + ->assert_is_op_input("conv2d", "Input"); + patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(), + name_scope_); + conv_ac_pattern(conv_input, false /*with_eltwise_add*/); + + int found_conv_ac_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle ConvAffineChannel fuse"; + + GET_CONV_BN_NODES(conv_ac_pattern); + + // check if fuse can be done and if MKL-DNN should be used + FuseOptions fuse_option = FindFuseOption(*conv, *affine_channel); + if (fuse_option == DO_NOT_FUSE) { + VLOG(3) << "do not perform conv+affinechannel fuse"; + return; + } + + // Create eltwise_y (conv bias) variable + VarDesc eltwise_y_in_desc( + patterns::PDNodeName(name_scope_, "eltwise_y_in")); + eltwise_y_in_desc.SetPersistable(true); + auto* eltwise_y_in_node = g->CreateVarNode(&eltwise_y_in_desc); + auto* eltwise_y_in_tensor = + scope->Var(eltwise_y_in_node->Name())->GetMutable(); + + // Get affine_channel bias + auto* ac_bias_tensor = + scope->FindVar(ac_bias->Name())->GetMutable(); + + // Initialize eltwise_y + eltwise_y_in_tensor->Resize(ac_bias_tensor->dims()); + std::fill_n(eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), + eltwise_y_in_tensor->numel(), 0.0f); + + // update weights and biases + recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor, + eltwise_y_in_tensor); + + // create an elementwise add node. + OpDesc desc; + desc.SetInput("X", std::vector({conv_out->Name()})); + desc.SetInput("Y", std::vector({eltwise_y_in_node->Name()})); + desc.SetOutput("Out", std::vector({ac_out->Name()})); + desc.SetType("elementwise_add"); + desc.SetAttr("axis", 1); + auto eltwise_op = g->CreateOpNode(&desc); // OpDesc will be copied. + + GraphSafeRemoveNodes(graph, {ac_scale, ac_bias, affine_channel}); + + IR_NODE_LINK_TO(conv_out, eltwise_op); + IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op); + IR_NODE_LINK_TO(eltwise_op, ac_out); + found_conv_ac_count++; + }; + + gpd(graph, handler); + + AddStatis(found_conv_ac_count); +} + +void ConvEltwiseAddAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init(name_scope_, graph); + + auto* scope = param_scope(); + PADDLE_ENFORCE(scope); + + GraphPatternDetector gpd; + auto* conv_input = + gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) + ->AsInput() + ->assert_is_op_input("conv2d", "Input"); + patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(), + name_scope_); + conv_ac_pattern(conv_input, true /*with_eltwise_add*/); + + int found_conv_ac_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle ConvBN fuse"; + + GET_CONV_BN_NODES(conv_ac_pattern); + // OPERATORS + GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_ac_pattern); + // BIAS inputs + GET_IR_NODE_FROM_SUBGRAPH(eltwise_y_in, eltwise_y_in, conv_ac_pattern); + // BIAS outputs + GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_ac_pattern); + + // Get eltwise_y (conv bias) variable + auto* eltwise_y_in_tensor = + scope->FindVar(eltwise_y_in->Name())->GetMutable(); + + // Get batch norm bias + auto* ac_bias_tensor = + scope->FindVar(ac_bias->Name())->GetMutable(); + + recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor, + eltwise_y_in_tensor); + + // Update the elementwise_add node + eltwise->Op()->SetAttr("axis", 1); + eltwise->Op()->SetOutput("Out", std::vector({ac_out->Name()})); + + GraphSafeRemoveNodes(graph, + {ac_scale, ac_bias, affine_channel, eltwise_out}); + + IR_NODE_LINK_TO(eltwise, ac_out); + + found_conv_ac_count++; + }; + + gpd(graph, handler); + AddStatis(found_conv_ac_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_affine_channel_fuse_pass, + paddle::framework::ir::ConvAffineChannelFusePass); +REGISTER_PASS(conv_eltwiseadd_affine_channel_fuse_pass, + paddle::framework::ir::ConvEltwiseAddAffineChannelFusePass); diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h new file mode 100644 index 00000000..d607020a --- /dev/null +++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h @@ -0,0 +1,49 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Fuse the Conv and ConvAffineChannel. + */ +class ConvAffineChannelFusePass : public FusePassBase { + public: + virtual ~ConvAffineChannelFusePass() {} + + protected: + void ApplyImpl(ir::Graph*) const override; + const std::string name_scope_{"conv_affine_channel_fuse"}; +}; + +class ConvEltwiseAddAffineChannelFusePass : public FusePassBase { + public: + virtual ~ConvEltwiseAddAffineChannelFusePass() {} + + protected: + void ApplyImpl(ir::Graph*) const override; + const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc new file mode 100644 index 00000000..4fe3fb4f --- /dev/null +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc @@ -0,0 +1,296 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/conv_bn_fuse_pass.h" +#include +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/operators/math/cpu_vec.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_CONV_BN_NODES(pattern_name) \ + /* OPERATORS */ \ + GET_IR_NODE_FROM_SUBGRAPH(conv, conv, pattern_name); \ + GET_IR_NODE_FROM_SUBGRAPH(batch_norm, batch_norm, pattern_name); \ + /* CONV inputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, pattern_name); \ + /* CONV outputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, pattern_name); \ + /* BN inputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(bn_scale, bn_scale, pattern_name); \ + GET_IR_NODE_FROM_SUBGRAPH(bn_bias, bn_bias, pattern_name); \ + GET_IR_NODE_FROM_SUBGRAPH(bn_mean, bn_mean, pattern_name); \ + GET_IR_NODE_FROM_SUBGRAPH(bn_variance, bn_variance, pattern_name); \ + /* BN outputs */ \ + GET_IR_NODE_FROM_SUBGRAPH(bn_out, bn_out, pattern_name); /* Out */ \ + GET_IR_NODE_FROM_SUBGRAPH(bn_mean_out, bn_mean_out, pattern_name); \ + GET_IR_NODE_FROM_SUBGRAPH(bn_variance_out, bn_variance_out, pattern_name); \ + GET_IR_NODE_FROM_SUBGRAPH(bn_saved_mean, bn_saved_mean, pattern_name); \ + GET_IR_NODE_FROM_SUBGRAPH(bn_saved_variance, bn_saved_variance, pattern_name) + +void recompute_bias_and_weights(const Scope* scope, + ir::Node* conv_weight, // + const ir::Node& bn_scale, // + const LoDTensor& bn_bias_tensor, // + const ir::Node& bn_mean, // + const ir::Node& bn_variance, // + LoDTensor* eltwise_y_in_tensor, // + float epsilon) { + using EigenVectorArrayMap = + Eigen::Map>; + using ConstEigenVectorArrayMap = + Eigen::Map>; + using EigenMatrixArrayMap = Eigen::Map< + Eigen::Array>; + + // Re-compute bias of conv2d from BN + PADDLE_ENFORCE_EQ(eltwise_y_in_tensor->dims(), bn_bias_tensor.dims()); + + auto* scale_tensor = scope->FindVar(bn_scale.Name())->GetMutable(); + auto* variance_tensor = + scope->FindVar(bn_variance.Name())->GetMutable(); + auto* mean_tensor = scope->FindVar(bn_mean.Name())->GetMutable(); + + ConstEigenVectorArrayMap scale_array(scale_tensor->data(), + scale_tensor->numel(), 1); + EigenVectorArrayMap variance_array( + variance_tensor->mutable_data(platform::CPUPlace()), + variance_tensor->numel(), 1); + ConstEigenVectorArrayMap mean_array(mean_tensor->data(), + mean_tensor->numel(), 1); + ConstEigenVectorArrayMap bn_bias_array(bn_bias_tensor.data(), + bn_bias_tensor.numel(), 1); + + // variance will not be used anymore, so make it std_array and then tmp_array + variance_array += epsilon; + variance_array = variance_array.sqrt(); + variance_array = scale_array / variance_array; + + EigenVectorArrayMap eltwise_y_in_array( + eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), + eltwise_y_in_tensor->numel(), 1); + + eltwise_y_in_array = + ((eltwise_y_in_array - mean_array) * variance_array) + bn_bias_array; + + // Re-compute weight of conv2d from BN + auto* weights = scope->FindVar(conv_weight->Name())->GetMutable(); + auto weights_shape = weights->dims(); + auto weights_shape_2d = flatten_to_2d(weights_shape, 1); + + EigenMatrixArrayMap weights_array_2d( + weights->mutable_data(platform::CPUPlace()), weights_shape_2d[0], + weights_shape_2d[1]); + + weights_array_2d.colwise() *= variance_array; +} + +void ConvBNFusePass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init(name_scope_, graph); + + auto* scope = param_scope(); + PADDLE_ENFORCE(scope); + + GraphPatternDetector gpd; + auto* conv_input = + gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) + ->AsInput() + ->assert_is_op_input("conv2d", "Input"); + patterns::ConvBN conv_bn_pattern(gpd.mutable_pattern(), name_scope_); + conv_bn_pattern(conv_input, false /*with_eltwise_add*/); + + int found_conv_bn_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle ConvBN fuse"; + + // conv, batch_norm, + // conv_weight, conv_out, + // bn_scale, bn_bias, bn_mean, bn_variance, + // bn_out, bn_mean_out, bn_variance_out, bn_saved_mean, + // bn_saved_variance + GET_CONV_BN_NODES(conv_bn_pattern); + + // check if fuse can be done and if MKL-DNN should be used + FuseOptions fuse_option = FindFuseOption(*conv, *batch_norm); + if (fuse_option == DO_NOT_FUSE) { + VLOG(3) << "do not perform conv+bn fuse"; + return; + } + + // Get batch norm bias + auto* bn_bias_tensor = + scope->FindVar(bn_bias->Name())->GetMutable(); + + // Create eltwise_y (conv bias) variable + VarDesc eltwise_y_in_desc( + patterns::PDNodeName(name_scope_, "eltwise_y_in")); + eltwise_y_in_desc.SetShape(framework::vectorize(bn_bias_tensor->dims())); + eltwise_y_in_desc.SetDataType(bn_bias_tensor->type()); + eltwise_y_in_desc.SetLoDLevel(bn_bias->Var()->GetLoDLevel()); + eltwise_y_in_desc.SetPersistable(true); + auto* eltwise_y_in_node = g->CreateVarNode(&eltwise_y_in_desc); + auto* eltwise_y_in_tensor = + scope->Var(eltwise_y_in_node->Name())->GetMutable(); + + // Initialize eltwise_y + eltwise_y_in_tensor->Resize(bn_bias_tensor->dims()); + std::fill_n(eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), + eltwise_y_in_tensor->numel(), 0.0f); + + // update weights and biases + float epsilon = boost::get(batch_norm->Op()->GetAttr("epsilon")); + recompute_bias_and_weights(scope, conv_weight, *bn_scale, *bn_bias_tensor, + *bn_mean, *bn_variance, eltwise_y_in_tensor, + epsilon); + + // with MKL-DNN fuse conv+bn into conv with bias + // without MKL-DNN fuse conv+bn into conv+elementwise_add + if (fuse_option == FUSE_MKLDNN) { + auto input_names = conv->Op()->InputNames(); + bool has_bias = std::find(input_names.begin(), input_names.end(), + "Bias") != input_names.end(); + if (has_bias && conv->Op()->Input("Bias").size() > 0) { + // reuse existing conv bias node + auto conv_bias_names = conv->Op()->Input("Bias"); + PADDLE_ENFORCE_EQ(conv_bias_names.size(), 1UL); + auto* conv_bias_var = scope->FindVar(conv_bias_names[0]); + auto* conv_bias_tensor = conv_bias_var->GetMutable(); + PADDLE_ENFORCE_EQ(conv_bias_tensor->dims(), + eltwise_y_in_tensor->dims()); + + auto eigen_conv_bias = EigenVector::From(*conv_bias_tensor); + eigen_conv_bias += EigenVector::From(*eltwise_y_in_tensor); + } else { + // add new conv_bias node + conv->Op()->SetInput( + "Bias", std::vector({eltwise_y_in_node->Name()})); + IR_NODE_LINK_TO(eltwise_y_in_node, conv); + } + conv->Op()->SetOutput("Output", + std::vector({bn_out->Name()})); + + GraphSafeRemoveNodes( + graph, + {conv_out, bn_scale, bn_bias, bn_mean, bn_variance, batch_norm, + bn_mean_out, bn_variance_out, bn_saved_mean, bn_saved_variance}); + + IR_NODE_LINK_TO(conv, bn_out); + found_conv_bn_count++; + } else { // fuse_option == FUSE_NATIVE + // create an elementwise add node. + OpDesc desc; + desc.SetInput("X", std::vector({conv_out->Name()})); + desc.SetInput("Y", std::vector({eltwise_y_in_node->Name()})); + desc.SetOutput("Out", std::vector({bn_out->Name()})); + desc.SetType("elementwise_add"); + desc.SetAttr("axis", 1); + auto eltwise_op = g->CreateOpNode(&desc); // OpDesc will be copied. + + GraphSafeRemoveNodes(graph, {bn_scale, bn_bias, bn_mean, bn_variance, + batch_norm, bn_mean_out, bn_variance_out, + bn_saved_mean, bn_saved_variance}); + + IR_NODE_LINK_TO(conv_out, eltwise_op); + IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op); + IR_NODE_LINK_TO(eltwise_op, bn_out); + found_conv_bn_count++; + } + }; + + gpd(graph, handler); + + AddStatis(found_conv_bn_count); +} + +void ConvEltwiseAddBNFusePass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init(name_scope_, graph); + + auto* scope = param_scope(); + PADDLE_ENFORCE(scope); + + GraphPatternDetector gpd; + auto* conv_input = + gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) + ->AsInput() + ->assert_is_op_input("conv2d", "Input"); + patterns::ConvBN conv_bn_pattern(gpd.mutable_pattern(), name_scope_); + conv_bn_pattern(conv_input, true /*with_eltwise_add*/); + + int found_conv_bn_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle ConvBN fuse"; + + // conv, batch_norm, + // conv_weight, conv_out, + // bn_scale, bn_bias, bn_mean, bn_variance, + // bn_out, bn_mean_out, bn_variance_out, bn_saved_mean,bn_saved_variance + GET_CONV_BN_NODES(conv_bn_pattern); + // OPERATORS + GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_bn_pattern); + // BIAS inputs + GET_IR_NODE_FROM_SUBGRAPH(eltwise_y_in, eltwise_y_in, conv_bn_pattern); + // BIAS outputs + GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_bn_pattern); + + // Get eltwise_y (conv bias) variable + auto* eltwise_y_in_tensor = + scope->FindVar(eltwise_y_in->Name())->GetMutable(); + + // Get batch norm bias + auto* bn_bias_tensor = + scope->FindVar(bn_bias->Name())->GetMutable(); + + // update weights and biases + float epsilon = boost::get(batch_norm->Op()->GetAttr("epsilon")); + recompute_bias_and_weights(scope, conv_weight, *bn_scale, *bn_bias_tensor, + *bn_mean, *bn_variance, eltwise_y_in_tensor, + epsilon); + + // Update the elementwise_add node + eltwise->Op()->SetAttr("axis", 1); + eltwise->Op()->SetOutput("Out", std::vector({bn_out->Name()})); + + GraphSafeRemoveNodes( + graph, + {bn_scale, bn_bias, bn_mean, bn_variance, batch_norm, bn_mean_out, + bn_variance_out, bn_saved_mean, bn_saved_variance, eltwise_out}); + + IR_NODE_LINK_TO(eltwise, bn_out); + + found_conv_bn_count++; + }; + + gpd(graph, handler); + + AddStatis(found_conv_bn_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_bn_fuse_pass, paddle::framework::ir::ConvBNFusePass); +REGISTER_PASS(conv_eltwiseadd_bn_fuse_pass, + paddle::framework::ir::ConvEltwiseAddBNFusePass); diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.h b/paddle/fluid/framework/ir/conv_bn_fuse_pass.h new file mode 100644 index 00000000..837a48ed --- /dev/null +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.h @@ -0,0 +1,49 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Fuse the Conv and BatchNorm to a ConvBNMKLDNNOp. + */ +class ConvBNFusePass : public FusePassBase { + public: + virtual ~ConvBNFusePass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; + const std::string name_scope_{"conv_bn_fuse"}; +}; + +class ConvEltwiseAddBNFusePass : public FusePassBase { + public: + virtual ~ConvEltwiseAddBNFusePass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; + const std::string name_scope_{"conv_eltwiseadd_bn_fuse"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc new file mode 100644 index 00000000..99bc5fe8 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc @@ -0,0 +1,104 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); +#define GET_NODES \ + GET_IR_NODE(conv_op); \ + GET_IR_NODE(conv_out); \ + GET_IR_NODE(conv_filter); \ + GET_IR_NODE(elementwise_add_op); \ + GET_IR_NODE(elementwise_add_in_y); \ + GET_IR_NODE(elementwise_add_out); \ + GET_IR_NODE(elementwise_add_op_1); \ + GET_IR_NODE(elementwise_add_in_y_1); \ + GET_IR_NODE(elementwise_add_out_1); \ + GET_IR_NODE(act_op); \ + GET_IR_NODE(act_out); + +// Inherient the basic infomation from `base_desc`, and modify some fields. +framework::proto::OpDesc PrepareOpDesc( + const framework::proto::OpDesc& base_desc, const std::string& bias, + const std::string& bias1, const std::string& activation, + const std::string& output) { + auto proto = base_desc; + framework::OpDesc desc(proto, nullptr); + desc.SetInput("Bias", {bias}); + desc.SetInput("ResidualData", {bias1}); + desc.SetAttr("activation", activation); + desc.SetOutput("Output", {output}); + desc.SetAttr("is_test", true); + desc.SetAttr("use_cudnn", false); + + return *desc.Proto(); +} + +void ConvElementwiseAddActFusePass::ApplyImpl(ir::Graph* graph) const { + const std::string pattern_name = "conv_elementwise_add_act_fuse"; + FusePassBase::Init(pattern_name, graph); + + GraphPatternDetector gpd; + auto* x = gpd.mutable_pattern()->NewNode("x")->AsInput()->assert_is_op_input( + "conv2d", "Input"); + + patterns::ConvElementwiseaddAct pattern(gpd.mutable_pattern(), pattern_name); + pattern(x); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_NODES; + + auto base_op_desc = *conv_op->Op()->Proto(); + std::string bias_name = elementwise_add_in_y->Name(); + std::string bias1_name = elementwise_add_in_y_1->Name(); + std::string act_op_type = act_op->Op()->Type(); + std::string act_op_out = act_out->Name(); + + auto new_op_proto = PrepareOpDesc(base_op_desc, bias_name, bias1_name, + act_op_type, act_op_out); + framework::OpDesc new_op_desc(new_op_proto, nullptr); + + // Create a new node for the fused op. + auto new_conv_op = graph->CreateOpNode(&new_op_desc); + + // Link inputs and outputs. + PADDLE_ENFORCE(subgraph.count(x)); + auto* conv_in_node = subgraph.at(x); + + IR_NODE_LINK_TO(conv_in_node, new_conv_op); // Input + IR_NODE_LINK_TO(conv_filter, new_conv_op); // Filter + IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op); // Bias + IR_NODE_LINK_TO(elementwise_add_in_y_1, new_conv_op); // ResidualData + IR_NODE_LINK_TO(new_conv_op, act_out); // Output + + // Delete the unneeded nodes. + GraphSafeRemoveNodes(graph.get(), + {conv_op, elementwise_add_op, elementwise_add_op_1, + elementwise_add_out}); + }; + gpd(graph.get(), handler); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_elementwise_add2_act_fuse_pass, + paddle::framework::ir::ConvElementwiseAdd2ActFusePass); diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc new file mode 100644 index 00000000..b4d6f683 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc @@ -0,0 +1,105 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h" +#include + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); +#define GET_NODES \ + GET_IR_NODE(conv_op); \ + GET_IR_NODE(conv_out); \ + GET_IR_NODE(conv_filter); \ + GET_IR_NODE(elementwise_add_op); \ + GET_IR_NODE(elementwise_add_in_y); \ + GET_IR_NODE(elementwise_add_out); \ + GET_IR_NODE(elementwise_add_op_1); \ + GET_IR_NODE(elementwise_add_in_y_1); \ + GET_IR_NODE(elementwise_add_out_1); \ + GET_IR_NODE(act_op); \ + GET_IR_NODE(act_out); + +// Inherient the basic infomation from `base_desc`, and modify some fields. +framework::proto::OpDesc PrepareOpDesc( + const framework::proto::OpDesc& base_desc, const std::string& bias, + const std::string& bias1, const std::string& activation, + const std::string& output) { + auto proto = base_desc; + framework::OpDesc desc(proto, nullptr); + desc.SetType("conv2d_fusion"); + desc.SetInput("Bias", {bias}); + desc.SetInput("ResidualData", {bias1}); + desc.SetAttr("activation", activation); + desc.SetOutput("Output", {output}); + desc.SetAttr("is_test", true); + desc.SetAttr("use_cudnn", false); + desc.Flush(); + return *desc.Proto(); +} + +void ConvElementwiseAdd2ActFusePass::ApplyImpl(ir::Graph* graph) const { + const std::string pattern_name = "conv_elementwise_add2_act_fuse"; + FusePassBase::Init(pattern_name, graph); + + GraphPatternDetector gpd; + auto* x = gpd.mutable_pattern()->NewNode("x")->AsInput()->assert_is_op_input( + "conv2d", "Input"); + + patterns::ConvElementwiseadd2Act pattern(gpd.mutable_pattern(), pattern_name); + pattern(x); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_NODES; + + auto base_op_desc = *conv_op->Op()->Proto(); + std::string bias_name = elementwise_add_in_y->Name(); + std::string bias1_name = elementwise_add_in_y_1->Name(); + std::string act_op_type = act_op->Op()->Type(); + std::string act_op_out = act_out->Name(); + + auto new_op_proto = PrepareOpDesc(base_op_desc, bias_name, bias1_name, + act_op_type, act_op_out); + framework::OpDesc new_op_desc(new_op_proto, nullptr); + + // Create a new node for the fused op. + auto* new_conv_op = graph->CreateOpNode(&new_op_desc); + + // Link inputs and outputs. + PADDLE_ENFORCE(subgraph.count(x)); + auto* conv_in_node = subgraph.at(x); + + IR_NODE_LINK_TO(conv_in_node, new_conv_op); // Input + IR_NODE_LINK_TO(conv_filter, new_conv_op); // Filter + IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op); // Bias + IR_NODE_LINK_TO(elementwise_add_in_y_1, new_conv_op); // Bias + IR_NODE_LINK_TO(new_conv_op, act_out); // Output + + // Delete the unneeded nodes. + GraphSafeRemoveNodes( + graph, {conv_op, conv_out, elementwise_add_op, elementwise_add_op_1, + elementwise_add_out, elementwise_add_out_1, act_op}); + }; + gpd(graph, handler); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_elementwise_add2_act_fuse_pass, + paddle::framework::ir::ConvElementwiseAdd2ActFusePass); diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h new file mode 100644 index 00000000..ea9e465d --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h @@ -0,0 +1,33 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +class ConvElementwiseAdd2ActFusePass : public FusePassBase { + public: + virtual ~ConvElementwiseAdd2ActFusePass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc new file mode 100644 index 00000000..ba0a2fb9 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h" +#include +#include "paddle/fluid/framework/ir/graph_viz_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); +#define GET_NODES \ + GET_IR_NODE(conv_op); \ + GET_IR_NODE(conv_out); \ + GET_IR_NODE(conv_filter); \ + GET_IR_NODE(elementwise_add_op); \ + GET_IR_NODE(elementwise_add_in_y); \ + GET_IR_NODE(elementwise_add_out); \ + GET_IR_NODE(act_op); \ + GET_IR_NODE(act_out); + +// Inherient the basic infomation from `base_desc`, and modify some fields. +framework::proto::OpDesc PrepareOpDesc( + const framework::proto::OpDesc& base_desc, const std::string& bias, + const std::string& activation, const std::string& output) { + auto proto = base_desc; + framework::OpDesc desc(proto, nullptr); + desc.SetType("conv2d_fusion"); + desc.SetInput("Bias", {bias}); + desc.SetInput("ResidualData", {}); + desc.SetAttr("activation", activation); + desc.SetOutput("Output", {output}); + desc.SetAttr("is_test", true); + desc.SetAttr("use_cudnn", false); + desc.Flush(); + return *desc.Proto(); +} + +void ConvElementwiseAddActFusePass::ApplyImpl(ir::Graph* graph) const { + const std::string pattern_name = "conv_elementwise_add_act_fuse"; + FusePassBase::Init(pattern_name, graph); + + GraphPatternDetector gpd; + auto* x = gpd.mutable_pattern() + ->NewNode("x") + ->assert_is_op_input("conv2d", "Input") + ->AsInput(); + + patterns::ConvElementwiseaddAct pattern(gpd.mutable_pattern(), pattern_name); + pattern(x); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_NODES; + + auto base_op_desc = *conv_op->Op()->Proto(); + std::string bias_name = elementwise_add_in_y->Name(); + std::string act_op_type = act_op->Op()->Type(); + std::string act_op_out = act_out->Name(); + + auto new_op_proto = + PrepareOpDesc(base_op_desc, bias_name, act_op_type, act_op_out); + framework::OpDesc new_op_desc(new_op_proto, nullptr); + + // Create a new node for the fused op. + auto* new_conv_op = graph->CreateOpNode(&new_op_desc); + + // Link inputs and outputs. + PADDLE_ENFORCE(subgraph.count(x)); + auto* conv_in_node = subgraph.at(x); + + IR_NODE_LINK_TO(conv_in_node, new_conv_op); // Input + IR_NODE_LINK_TO(conv_filter, new_conv_op); // Filter + IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op); // Bias + IR_NODE_LINK_TO(new_conv_op, act_out); // Output + + // Delete the unneeded nodes. + GraphSafeRemoveNodes(graph, {conv_op, conv_out, elementwise_add_op, + elementwise_add_out, act_op}); + }; + + gpd(graph, handler); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_elementwise_add_act_fuse_pass, + paddle::framework::ir::ConvElementwiseAddActFusePass); diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h new file mode 100644 index 00000000..8b34c355 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h @@ -0,0 +1,33 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +class ConvElementwiseAddActFusePass : public FusePassBase { + public: + virtual ~ConvElementwiseAddActFusePass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc new file mode 100644 index 00000000..8c491d4f --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc @@ -0,0 +1,89 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h" +#include "paddle/fluid/framework/ir/graph_viz_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); +#define GET_NODES \ + GET_IR_NODE(conv_op); \ + GET_IR_NODE(conv_out); \ + GET_IR_NODE(conv_filter); \ + GET_IR_NODE(elementwise_add_op); \ + GET_IR_NODE(elementwise_add_in_y); \ + GET_IR_NODE(elementwise_add_out); + +void ConvElementwiseAddFusePass::ApplyImpl(ir::Graph* graph) const { + const std::string pattern_name = "conv_elementwise_add_fuse"; + FusePassBase::Init(pattern_name, graph); + + GraphPatternDetector gpd; + auto* x = gpd.mutable_pattern() + ->NewNode("x") + ->assert_is_op_input("conv2d", "Input") + ->AsInput(); + + patterns::ConvElementwiseadd pattern(gpd.mutable_pattern(), pattern_name); + pattern(x); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_NODES; + + auto base_op_desc = *conv_op->Op()->Proto(); + std::string bias_name = elementwise_add_in_y->Name(); + std::string output_name = elementwise_add_out->Name(); + + std::string act_type = "identity"; + framework::OpDesc new_op_desc(base_op_desc, nullptr); + new_op_desc.SetType("conv2d_fusion"); + new_op_desc.SetInput("Bias", {bias_name}); + new_op_desc.SetInput("ResidualData", {}); + new_op_desc.SetAttr("activation", act_type); + new_op_desc.SetOutput("Output", {output_name}); + new_op_desc.SetAttr("is_test", true); + new_op_desc.SetAttr("use_cudnn", false); + new_op_desc.Flush(); + + // Create a new node for the fused op. + auto* new_conv_op = graph->CreateOpNode(&new_op_desc); + + // Link inputs and outputs. + PADDLE_ENFORCE(subgraph.count(x)); + auto* conv_in_node = subgraph.at(x); + + IR_NODE_LINK_TO(conv_in_node, new_conv_op); // Input + IR_NODE_LINK_TO(conv_filter, new_conv_op); // Filter + IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op); // Bias + IR_NODE_LINK_TO(new_conv_op, elementwise_add_out); // Output + + // Delete the unneeded nodes. + GraphSafeRemoveNodes(graph, {conv_op, conv_out, elementwise_add_op}); + }; + + gpd(graph, handler); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_elementwise_add_fuse_pass, + paddle::framework::ir::ConvElementwiseAddFusePass); diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h new file mode 100644 index 00000000..66a562cd --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h @@ -0,0 +1,33 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +class ConvElementwiseAddFusePass : public FusePassBase { + public: + virtual ~ConvElementwiseAddFusePass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc new file mode 100644 index 00000000..3d4df87a --- /dev/null +++ b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/fluid/framework/ir/delete_quant_dequant_op_pass.h" +#include "paddle/fluid/framework/ir/graph_viz_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); +#define GET_NODES \ + GET_IR_NODE(any_op_out); \ + GET_IR_NODE(quant_dequant_op_inscale); \ + GET_IR_NODE(quant_dequant_op); \ + GET_IR_NODE(quant_dequant_op_outscale); \ + GET_IR_NODE(quant_dequant_op_out); \ + GET_IR_NODE(any_op2); + +void DeleteQuantDequantOpPass::ApplyImpl(ir::Graph* graph) const { + const std::string pattern_name = "delete_quantdequant_op_pattern"; + FusePassBase::Init(pattern_name, graph); + + GraphPatternDetector gpd; + + patterns::DeleteQuantDequantOpPattern pattern(gpd.mutable_pattern(), + pattern_name); + pattern(); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_NODES; + IR_NODE_LINK_TO(any_op_out, any_op2); + std::string any_op_out_name = any_op_out->Var()->Name(); + std::string quant_dequant_op_out_name = quant_dequant_op_out->Var()->Name(); + + auto* any_op2_desc = any_op2->Op(); + // auto input_args_names = any_op2_desc->InputArgumentNames(); + auto var_map = any_op2_desc->Inputs(); + + for (auto& name_m : var_map) { + if (std::find(name_m.second.begin(), name_m.second.end(), + quant_dequant_op_out_name) != name_m.second.end()) { + std::vector new_inputs; + for (auto& i_n : name_m.second) { + if (i_n != quant_dequant_op_out_name) { + new_inputs.push_back(i_n); + } + } + new_inputs.push_back(any_op_out_name); + any_op2_desc->SetInput(name_m.first, new_inputs); + any_op2_desc->Flush(); + } + } + // Delete the unneeded nodes. + GraphSafeRemoveNodes(graph, + {quant_dequant_op, quant_dequant_op_out, + quant_dequant_op_inscale, quant_dequant_op_outscale}); + }; + + gpd(graph, handler); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(delete_quant_dequant_op_pass, + paddle::framework::ir::DeleteQuantDequantOpPass); diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.h b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.h new file mode 100644 index 00000000..938ada64 --- /dev/null +++ b/paddle/fluid/framework/ir/delete_quant_dequant_op_pass.h @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +class DeleteQuantDequantOpPass : public FusePassBase { + public: + virtual ~DeleteQuantDequantOpPass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc new file mode 100644 index 00000000..6462e7bf --- /dev/null +++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc @@ -0,0 +1,243 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h" +#include +#include +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" + +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/cpu_vec.h" +#include "paddle/fluid/operators/math/fc_compute.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace framework { +namespace ir { + +static int BuildFusion(Graph* graph, const std::string& name_scope, + Scope* scope, bool with_fc_bias) { + GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + + // Build pattern + PDNode* x = pattern->NewNode(patterns::PDNodeName(name_scope, "x")) + ->assert_is_op_input("lookup_table") + ->assert_var_not_persistable(); + patterns::Embedding embedding_pattern(pattern, name_scope); + // TODO(jczaja): Intermediate can only be for val that are not used anywhere + // but lookup table output may go into other LSTM (for reverse + // direction) + auto* embedding_out = embedding_pattern(x); + patterns::FC fc_pattern(pattern, name_scope); + + // fc_out is a tmp var, will be removed after fuse, so marked as intermediate. + auto* fc_out = fc_pattern(embedding_out, with_fc_bias)->AsIntermediate(); + patterns::LSTM lstm_pattern(pattern, name_scope); + lstm_pattern(fc_out); + + // Create New OpDesc + auto embedding_lstm_creator = [&](Node* embedding, Node* W, Node* lstm, + Node* input, Node* weight_x, Node* weight_h, + Node* bias, Node* hidden, Node* cell, + Node* xx, Node* fc_bias) { + OpDesc op_desc; + op_desc.SetType("fused_embedding_fc_lstm"); +#define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__->Name()}); + SET_IN(Ids, input); + SET_IN(WeightH, weight_h); + // Neet to have this passed as We need Wc data for peephole connections + SET_IN(Bias, bias); +#undef SET_IN + + // Multiply embeddings with Weights + PADDLE_ENFORCE(scope); + const std::string& embeddings = patterns::UniqueKey("Embeddings"); + auto* embeddings_var = scope->Var(embeddings); + PADDLE_ENFORCE(embeddings_var); + auto* embeddings_tensor = + embeddings_var->GetMutable(); + // Get WeightX size: [single_embedding, fc_size] + // and embedding size: [dict_size, single_embedding] + // and create new size of embeddings eg. [dict_size , hidden_size] + auto* embedding_var = scope->FindVar(W->Name()); + PADDLE_ENFORCE(embedding_var); + const auto& embedding_tensor = embedding_var->Get(); + + const auto& weightx_tensor = + scope->FindVar(weight_x->Name())->Get(); + embeddings_tensor->Resize( + {embedding_tensor.dims()[0], weightx_tensor.dims()[1]}); + + // Multiplie embeddings via WeightsX and add bias + auto embedding_data = embedding_tensor.data(); + auto weightx_data = weightx_tensor.data(); + auto embeddings_data = + embeddings_tensor->mutable_data(platform::CPUPlace()); + + // Adding biases to GEMM result to be + auto* lstm_bias_var = scope->FindVar(bias->Name()); + PADDLE_ENFORCE(lstm_bias_var); + const auto& lstm_bias_tensor = lstm_bias_var->Get(); + + auto alpha = 1.0f; + auto beta = 1.0f; + int m = embedding_tensor.dims()[0]; + int n = weightx_tensor.dims()[1]; + int k = embedding_tensor.dims()[1]; + + // Copy only gate biases values (only actual bias data, not peephole + // weights) + std::vector combined_biases; + combined_biases.reserve(n); + std::copy_n(lstm_bias_tensor.data(), n, + std::back_inserter(combined_biases)); + + if (with_fc_bias) { + // Add FC-bias with LSTM-bias (into GEMM result to be) + auto* fc_bias_var = scope->FindVar(fc_bias->Name()); + const auto& fc_bias_tensor = fc_bias_var->Get(); + for (int i = 0; i < fc_bias_tensor.numel(); i++) { + combined_biases[i] += fc_bias_tensor.data()[i]; + } + } + + // broadcast biases + std::vector ones(m, 1.0f); + paddle::operators::math::CBlas::GEMM( + CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, 1, alpha, &ones[0], 1, + &combined_biases[0], n, 0.0f, embeddings_data, n); + + // Wx*embeddings + biases + paddle::operators::math::CBlas::GEMM( + CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, alpha, + embedding_data, k, weightx_data, n, beta, embeddings_data, n); + op_desc.SetInput("Embeddings", {embeddings}); + + // Create temp variables. + const std::string BatchedInput = patterns::UniqueKey("BatchedInput"); + const std::string BatchedCellPreAct = + patterns::UniqueKey("BatchedCellPreAct"); + const std::string BatchedGate = patterns::UniqueKey("BatchedGate"); + + scope->Var(BatchedInput)->GetMutable(); + scope->Var(BatchedCellPreAct)->GetMutable(); + scope->Var(BatchedGate)->GetMutable(); + + op_desc.SetInput("H0", {}); + op_desc.SetInput("C0", {}); + op_desc.SetOutput("Hidden", {hidden->Name()}); + op_desc.SetOutput("Cell", {cell->Name()}); + op_desc.SetOutput("XX", {xx->Name()}); + op_desc.SetOutput("BatchedGate", {BatchedGate}); + op_desc.SetOutput("BatchCellPreAct", {BatchedCellPreAct}); + op_desc.SetOutput("BatchedInput", {BatchedInput}); + op_desc.SetAttr("is_reverse", lstm->Op()->GetAttr("is_reverse")); + op_desc.SetAttr("use_peepholes", lstm->Op()->GetAttr("use_peepholes")); + // TODO(TJ): get from attr + op_desc.SetAttr("use_seq", true); + + PADDLE_ENFORCE(graph->Has(kParamScopeAttr)); + auto& scope = graph->Get(kParamScopeAttr); +#define OP_SET_OUT(x) \ + const std::string x = patterns::UniqueKey(#x); \ + op_desc.SetOutput(#x, {x}); \ + scope.Var(x)->GetMutable() + OP_SET_OUT(BatchedCell); + OP_SET_OUT(BatchedHidden); + OP_SET_OUT(ReorderedH0); + OP_SET_OUT(ReorderedC0); +#undef OP_SET_OUT + + auto* op = graph->CreateOpNode(&op_desc); + IR_NODE_LINK_TO(input, op); + IR_NODE_LINK_TO(weight_x, op); + IR_NODE_LINK_TO(weight_h, op); + IR_NODE_LINK_TO(bias, op); + IR_NODE_LINK_TO(op, hidden); + return op; + }; + + int fusion_count{0}; + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(lstm, lstm, lstm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, lstm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, lstm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(Cell, Cell, lstm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(Hidden, Hidden, lstm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(lookup_table, lookup_table, embedding_pattern); + GET_IR_NODE_FROM_SUBGRAPH(W, W, embedding_pattern); + GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern); + + // TODO(jczaja): Add support for is_sparse / is_distributed + auto is_sparse = boost::get(lookup_table->Op()->GetAttr("is_sparse")); + auto is_distributed = + boost::get(lookup_table->Op()->GetAttr("is_distributed")); + + if (is_sparse == true || is_distributed == true) { + return; + } + + if (with_fc_bias) { + GET_IR_NODE_FROM_SUBGRAPH(fc_out, Out, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern); + embedding_lstm_creator(lookup_table, W, lstm, subgraph.at(x), w, Weight, + Bias, Hidden, Cell, fc_out, fc_bias); + // Remove unneeded nodes. + // TODO(jczaja): Proper removing of lookup table + std::unordered_set marked_nodes( + // {lookup_table, mul, lstm, elementwise_add, fc_bias, W}); + {mul, lstm, elementwise_add, fc_bias}); + GraphSafeRemoveNodes(graph, marked_nodes); + } else { + GET_IR_NODE_FROM_SUBGRAPH(fc_out, mul_out, fc_pattern); + embedding_lstm_creator(lookup_table, W, lstm, subgraph.at(x), w, Weight, + Bias, Hidden, Cell, fc_out, nullptr); + // Remove unneeded nodes. + // TODO(jczaja): Proper removing of lookup table + // std::unordered_set marked_nodes({lookup_table, W, mul, + // lstm}); + std::unordered_set marked_nodes({mul, lstm}); + GraphSafeRemoveNodes(graph, marked_nodes); + } + + ++fusion_count; + }; + + gpd(graph, handler); + + return fusion_count; +} + +void EmbeddingFCLSTMFusePass::ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init(name_scope_, graph); + + int fusion_count = + BuildFusion(graph, name_scope_, param_scope(), true /*with_fc_bias*/); + + AddStatis(fusion_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(embedding_fc_lstm_fuse_pass, + paddle::framework::ir::EmbeddingFCLSTMFusePass); diff --git a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h new file mode 100644 index 00000000..65cb4439 --- /dev/null +++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h @@ -0,0 +1,42 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +// Fusing of Embedding , FC and LSTM op + +// Just FC without bias +class EmbeddingFCLSTMFusePass : public FusePassBase { + public: + virtual ~EmbeddingFCLSTMFusePass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; + + const std::string name_scope_{"embedding_fc_lstm_fuse"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc new file mode 100644 index 00000000..102fd388 --- /dev/null +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -0,0 +1,111 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/fc_fuse_pass.h" +#include +#include +#include +#include +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +void FCFusePass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init("fc_fuse", graph); + + std::unordered_set nodes2delete; + + GraphPatternDetector gpd; + auto* x = gpd.mutable_pattern() + ->NewNode("fc_fuse/x") + ->AsInput() + ->assert_is_op_input("mul", "X"); + patterns::FC fc_pattern(gpd.mutable_pattern(), "fc_fuse"); + fc_pattern(x, true /*with bias*/); + + int found_fc_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle FC fuse"; + GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fc_out, Out, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul_out, mul_out, fc_pattern); + + auto base_op_desc = mul->Op(); + // Create an FC Node. + // OpDesc desc(base_op_desc, nullptr); + OpDesc desc; + std::string fc_x_in = subgraph.at(x)->Name(); + std::string fc_Y_in = w->Name(); + std::string fc_bias_in = fc_bias->Name(); + std::string fc_out_out = fc_out->Name(); + + desc.SetInput("Input", std::vector({fc_x_in})); + desc.SetInput("W", std::vector({fc_Y_in})); + desc.SetInput("Bias", std::vector({fc_bias_in})); + desc.SetOutput("Out", std::vector({fc_out_out})); + desc.SetAttr("in_num_col_dims", mul->Op()->GetAttr("x_num_col_dims")); + + // For anakin subgraph int8 + // When in anakin subgraph int8 mode, the pattern like "fake_quant + mul + + // fake_dequant" + // can be detected by the quant_dequant_fuse_pass. This pass will add + // "input_scale", + // "weight_scale" which are extracted from fake_quant op and fake_dequant op + // to mul op, + // and then delete the fake_quant op and fake_dequant op in the graph. If + // the mul op + // has the scale info, we should add those to the fused fc. + if (base_op_desc->HasAttr("enable_int8")) { + desc.SetAttr("enable_int8", base_op_desc->GetAttr("enable_int8")); + desc.SetAttr("input_scale", base_op_desc->GetAttr("input_scale")); + desc.SetAttr("weight_scale", base_op_desc->GetAttr("weight_scale")); + if (base_op_desc->HasAttr("out_scale")) + desc.SetAttr("out_scale", base_op_desc->GetAttr("out_scale")); + auto elementwise_desc = elementwise_add->Op(); + if (elementwise_desc->HasAttr("out_scale")) + desc.SetAttr("out_scale", elementwise_desc->GetAttr("out_scale")); + } + + desc.SetType("fc"); + + auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied. + GraphSafeRemoveNodes(graph, {mul, elementwise_add, mul_out}); + + PADDLE_ENFORCE(subgraph.count(x)); + IR_NODE_LINK_TO(subgraph.at(x), fc_node); + IR_NODE_LINK_TO(w, fc_node); + IR_NODE_LINK_TO(fc_bias, fc_node); + IR_NODE_LINK_TO(fc_node, fc_out); + + found_fc_count++; + }; + + gpd(graph, handler); + + AddStatis(found_fc_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fc_fuse_pass, paddle::framework::ir::FCFusePass); diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.h b/paddle/fluid/framework/ir/fc_fuse_pass.h new file mode 100644 index 00000000..0a0fcd2d --- /dev/null +++ b/paddle/fluid/framework/ir/fc_fuse_pass.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Fuse the MUL and ELEMENTWISE_ADD to a FCOp. + */ +class FCFusePass : public FusePassBase { + public: + virtual ~FCFusePass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc new file mode 100644 index 00000000..affe5069 --- /dev/null +++ b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/fc_fuse_pass.h" + +#include +#include "paddle/fluid/framework/op_proto_maker.h" + +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, + const std::vector& inputs, + const std::vector& outputs) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + if (type == "mul") { + op->SetInput("X", {inputs[0]}); + op->SetInput("Y", {inputs[1]}); + op->SetAttr("x_num_col_dims", {1}); + } else if (type == "elementwise_add") { + op->SetInput("X", inputs); + } + op->SetOutput("Out", outputs); + op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), + static_cast(OpRole::kForward)); +} + +// a->OP0->b +// a->OP1->c +// (b, c)->mul->d +// (d, e)->elementwise_add->f +ProgramDesc BuildProgramDesc() { + ProgramDesc prog; + for (auto& v : std::vector({"a", "b", "c", "d", "e", "f"})) { + auto* var = prog.MutableBlock(0)->Var(v); + var->SetType(proto::VarType::SELECTED_ROWS); + if (v == "c") { + var->SetPersistable(true); + } + } + + SetOp(&prog, "OP0", std::vector({"a"}), + std::vector({"b"})); + SetOp(&prog, "OP1", std::vector({"a"}), + std::vector({"c"})); + SetOp(&prog, "mul", std::vector({"b", "c"}), + std::vector({"d"})); + SetOp(&prog, "elementwise_add", std::vector({"d", "e"}), + std::vector({"f"})); + + return prog; +} + +TEST(FCFusePass, basic) { + auto prog = BuildProgramDesc(); + + std::unique_ptr graph(new ir::Graph(prog)); + + auto pass = PassRegistry::Instance().Get("fc_fuse_pass"); + + int pre_nodes = graph->Nodes().size(); + + graph.reset(pass->Apply(graph.release())); + + int after_nodes = graph->Nodes().size(); + + // Remove 3 Nodes: MUL,ELEMENTWISE_ADD, mul_out + // Add 1 Node: FC + EXPECT_EQ(pre_nodes - 2, after_nodes); + + // Assert fc op in newly generated graph + int fc_count = 0; + + for (auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op()->Type() == "fc") { + ++fc_count; + } + } + EXPECT_EQ(fc_count, 1); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(fc_fuse_pass); diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc new file mode 100644 index 00000000..10cbe319 --- /dev/null +++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc @@ -0,0 +1,180 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/fc_gru_fuse_pass.h" +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace framework { +namespace ir { + +static int BuildFusion(Graph* graph, const std::string& name_scope, + Scope* scope, bool with_fc_bias) { + GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + + // Create pattern. + patterns::FC fc_pattern(pattern, name_scope); + patterns::GRU gru_pattern(pattern, name_scope); + + PDNode* x = + pattern->NewNode(patterns::UniqueKey("x"))->assert_var_not_persistable(); + + auto* fc_out = fc_pattern(x, with_fc_bias); + fc_out->AsIntermediate(); // fc_out is a tmp var, will be removed after fuse. + gru_pattern(fc_out); + + // Create New OpDesc + auto gru_creater = [&](Node* gru, Node* x, Node* weight_x, Node* weight_h, + Node* bias, Node* hidden, Node* fc_bias) { + OpDesc op_desc; + op_desc.SetType("fusion_gru"); + +#define NEW_NAME(x) name_scope + "/at." #x ".new" +#define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__->Name()}); + SET_IN(X, x); + SET_IN(WeightX, weight_x); + SET_IN(WeightH, weight_h); + if (with_fc_bias) { + op_desc.SetInput("Bias", {NEW_NAME(bias) + bias->Name()}); + } else { + SET_IN(Bias, bias); + } +#undef SET_IN + op_desc.SetInput("H0", {}); + op_desc.SetOutput("Hidden", {hidden->Name()}); + op_desc.SetAttr("is_reverse", gru->Op()->GetAttr("is_reverse")); + // TODO(TJ): This should be a option for infer + op_desc.SetAttr("use_seq", true); + +#define SET_IMTERMEDIATE_OUT(key) op_desc.SetOutput(#key, {NEW_NAME(key)}) + SET_IMTERMEDIATE_OUT(ReorderedH0); + SET_IMTERMEDIATE_OUT(XX); + SET_IMTERMEDIATE_OUT(BatchedInput); + SET_IMTERMEDIATE_OUT(BatchedOut); +#undef SET_IMTERMEDIATE_OUT + + auto* op = graph->CreateOpNode(&op_desc); + PADDLE_ENFORCE(graph->Has(kParamScopeAttr)); + auto& scope = graph->Get(kParamScopeAttr); + if (with_fc_bias) { + // Fusion GRU bias = fcbias + grubias + auto* fusion_bias_var = scope.Var(NEW_NAME(bias) + bias->Name()); + auto* out_bias_tensor = + fusion_bias_var->GetMutable(); + PADDLE_ENFORCE(fusion_bias_var); + auto* gru_bias_var = scope.FindVar(bias->Name()); + auto* fc_bias_var = scope.FindVar(fc_bias->Name()); + PADDLE_ENFORCE(gru_bias_var); + PADDLE_ENFORCE(fc_bias_var); + const auto& gru_bias_tenosr = gru_bias_var->Get(); + const auto& fc_bias_tensor = fc_bias_var->Get(); + // new bias = fc bias + gru bias + out_bias_tensor->Resize(gru_bias_tenosr.dims()); + auto* data = out_bias_tensor->mutable_data(platform::CPUPlace()); + for (int i = 0; i < out_bias_tensor->numel(); i++) { + data[i] = + fc_bias_tensor.data()[i] + gru_bias_tenosr.data()[i]; + } + } +#undef GET_NODE + +#define NEW_IMTERMEDIATE_OUT(key) \ + scope.Var(NEW_NAME(key))->GetMutable() + NEW_IMTERMEDIATE_OUT(ReorderedH0); + NEW_IMTERMEDIATE_OUT(XX); + NEW_IMTERMEDIATE_OUT(BatchedInput); + NEW_IMTERMEDIATE_OUT(BatchedOut); +#undef NEW_NAME +#undef NEW_IMTERMEDIATE_OUT + + IR_NODE_LINK_TO(x, op); + IR_NODE_LINK_TO(weight_x, op); + IR_NODE_LINK_TO(weight_h, op); + IR_NODE_LINK_TO(bias, op); // actually should link to new bias if have + IR_NODE_LINK_TO(op, hidden); + // h0? + return op; + }; + + int fusion_count{0}; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + auto* x_n = subgraph.at(x); + GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fc_out, Out, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, gru_pattern); + GET_IR_NODE_FROM_SUBGRAPH(gru, gru, gru_pattern); + GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, gru_pattern); + GET_IR_NODE_FROM_SUBGRAPH(Hidden, Hidden, gru_pattern); + // nodes need be removed + GET_IR_NODE_FROM_SUBGRAPH(BatchGate, BatchGate, gru_pattern); + GET_IR_NODE_FROM_SUBGRAPH(BatchResetHiddenPrev, BatchGate, gru_pattern); + GET_IR_NODE_FROM_SUBGRAPH(BatchHidden, BatchGate, gru_pattern); + + if (with_fc_bias) { + GET_IR_NODE_FROM_SUBGRAPH(mul_out, mul_out, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern); + + gru_creater(gru, x_n, w, Weight, Bias, Hidden, fc_bias); + // Remove unneeded nodes. + std::unordered_set marked_nodes( + {mul, gru, elementwise_add, fc_bias, fc_out, mul_out, BatchGate, + BatchResetHiddenPrev, BatchHidden}); + GraphSafeRemoveNodes(graph, marked_nodes); + } else { + gru_creater(gru, x_n, w, Weight, Bias, Hidden, nullptr); + // Remove unneeded nodes. + std::unordered_set marked_nodes( + {mul, gru, BatchGate, BatchResetHiddenPrev, BatchHidden}); + GraphSafeRemoveNodes(graph, marked_nodes); + } +#undef GET_NODE + + ++fusion_count; + }; + + gpd(graph, handler); + + return fusion_count; +} + +void MulGRUFusePass::ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init(name_scope_, graph); + + int fusion_count = + BuildFusion(graph, name_scope_, param_scope(), false /*with_fc_bias*/); + + AddStatis(fusion_count); +} + +void FCGRUFusePass::ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init(name_scope_, graph); + + int fusion_count = + BuildFusion(graph, name_scope_, param_scope(), true /*with_fc_bias*/); + + AddStatis(fusion_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(mul_gru_fuse_pass, paddle::framework::ir::MulGRUFusePass); +REGISTER_PASS(fc_gru_fuse_pass, paddle::framework::ir::FCGRUFusePass); diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.h b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h new file mode 100644 index 00000000..e11cdac7 --- /dev/null +++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h @@ -0,0 +1,50 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +// The MulGRUFusePass and MulGRUFusePass will fuse to the same FusionGRU op. + +class FCGRUFusePass : public FusePassBase { + public: + virtual ~FCGRUFusePass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; + + const std::string name_scope_{"fc_gru_fuse"}; +}; + +// Just FC without bias +class MulGRUFusePass : public FusePassBase { + public: + virtual ~MulGRUFusePass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; + const std::string name_scope_{"fc_nobias_gru_fuse"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc new file mode 100644 index 00000000..6858a98b --- /dev/null +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc @@ -0,0 +1,184 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h" +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace framework { +namespace ir { + +int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, + bool with_fc_bias) { + GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + + // Build pattern + PDNode* x = pattern->NewNode(patterns::PDNodeName(name_scope, "x")) + ->assert_is_op_input("mul") + ->assert_var_not_persistable(); + patterns::FC fc_pattern(pattern, name_scope); + + // fc_out is a tmp var, will be removed after fuse, so marked as intermediate. + auto* fc_out = fc_pattern(x, with_fc_bias)->AsIntermediate(); + patterns::LSTM lstm_pattern(pattern, name_scope); + lstm_pattern(fc_out); + + // Create New OpDesc + auto lstm_creator = [&](Node* lstm, Node* input, Node* weight_x, + Node* weight_h, Node* bias, Node* hidden, Node* cell, + Node* xx, Node* fc_bias) { + OpDesc op_desc; + op_desc.SetType("fusion_lstm"); +#define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__->Name()}); + SET_IN(X, input); + SET_IN(WeightX, weight_x); + SET_IN(WeightH, weight_h); + SET_IN(Bias, bias); +#undef SET_IN + if (with_fc_bias) { + // Add FC-bias with LSTM-bias and create a new weight + PADDLE_ENFORCE(scope); + const std::string& new_bias_var = patterns::UniqueKey("NewBias"); + auto* bias_var = scope->Var(new_bias_var); + PADDLE_ENFORCE(bias_var); + auto* bias_tensor = bias_var->GetMutable(); + auto* lstm_bias_var = scope->FindVar(bias->Name()); + PADDLE_ENFORCE(lstm_bias_var); + const auto& lstm_bias_tensor = lstm_bias_var->Get(); + bias_tensor->Resize(lstm_bias_tensor.dims()); + + auto* fc_bias_var = scope->FindVar(fc_bias->Name()); + const auto& fc_bias_tensor = fc_bias_var->Get(); + + auto* data = bias_tensor->mutable_data(platform::CPUPlace()); + + for (int i = 0; i < bias_tensor->numel(); i++) { + data[i] = + fc_bias_tensor.data()[i] + lstm_bias_tensor.data()[i]; + } + op_desc.SetInput("Bias", {new_bias_var}); + } + + // Create temp variables. + const std::string BatchedInput = patterns::UniqueKey("BatchedInput"); + const std::string BatchedCellPreAct = + patterns::UniqueKey("BatchedCellPreAct"); + const std::string BatchedGate = patterns::UniqueKey("BatchedGate"); + const std::string CheckedCell = patterns::UniqueKey("CheckedCell"); + + scope->Var(BatchedInput)->GetMutable(); + scope->Var(BatchedCellPreAct)->GetMutable(); + scope->Var(BatchedGate)->GetMutable(); + scope->Var(CheckedCell)->GetMutable(); + + op_desc.SetInput("H0", {}); + op_desc.SetInput("C0", {}); + op_desc.SetOutput("Hidden", {hidden->Name()}); + op_desc.SetOutput("Cell", {cell->Name()}); + op_desc.SetOutput("XX", {xx->Name()}); + op_desc.SetOutput("BatchedGate", {BatchedGate}); + op_desc.SetOutput("BatchCellPreAct", {BatchedCellPreAct}); + op_desc.SetOutput("BatchedInput", {BatchedInput}); + op_desc.SetOutput("CheckedCell", {CheckedCell}); + op_desc.SetAttr("is_reverse", lstm->Op()->GetAttr("is_reverse")); + op_desc.SetAttr("use_peepholes", lstm->Op()->GetAttr("use_peepholes")); + // TODO(TJ): get from attr + op_desc.SetAttr("use_seq", true); + + PADDLE_ENFORCE(graph->Has(kParamScopeAttr)); + auto& scope = graph->Get(kParamScopeAttr); +#define OP_SET_OUT(x) \ + const std::string x = patterns::UniqueKey(#x); \ + op_desc.SetOutput(#x, {x}); \ + scope.Var(x)->GetMutable() + OP_SET_OUT(BatchedCell); + OP_SET_OUT(BatchedHidden); + OP_SET_OUT(ReorderedH0); + OP_SET_OUT(ReorderedC0); +#undef OP_SET_OUT + + auto* op = graph->CreateOpNode(&op_desc); + IR_NODE_LINK_TO(input, op); + IR_NODE_LINK_TO(weight_x, op); + IR_NODE_LINK_TO(weight_h, op); + IR_NODE_LINK_TO(bias, op); + IR_NODE_LINK_TO(op, hidden); + return op; + }; + + int fusion_count{0}; + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(lstm, lstm, lstm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, lstm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, lstm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(Cell, Cell, lstm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(Hidden, Hidden, lstm_pattern); + GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern); + if (with_fc_bias) { + GET_IR_NODE_FROM_SUBGRAPH(fc_out, Out, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern); + lstm_creator(lstm, subgraph.at(x), w, Weight, Bias, Hidden, Cell, fc_out, + fc_bias); + // Remove unneeded nodes. + std::unordered_set marked_nodes( + {mul, lstm, elementwise_add, fc_bias}); + GraphSafeRemoveNodes(graph, marked_nodes); + } else { + GET_IR_NODE_FROM_SUBGRAPH(fc_out, mul_out, fc_pattern); + lstm_creator(lstm, subgraph.at(x), w, Weight, Bias, Hidden, Cell, fc_out, + nullptr); + // Remove unneeded nodes. + std::unordered_set marked_nodes({mul, lstm}); + GraphSafeRemoveNodes(graph, marked_nodes); + } + + ++fusion_count; + }; + + gpd(graph, handler); + + return fusion_count; +} + +void MulLstmFusePass::ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init(name_scope_, graph); + + int fusion_count = + BuildFusion(graph, name_scope_, param_scope(), false /*with_fc_bias*/); + + AddStatis(fusion_count); +} + +void FCLstmFusePass::ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init(name_scope_, graph); + + int fusion_count = + BuildFusion(graph, name_scope_, param_scope(), true /*with_fc_bias*/); + + AddStatis(fusion_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(mul_lstm_fuse_pass, paddle::framework::ir::MulLstmFusePass); +REGISTER_PASS(fc_lstm_fuse_pass, paddle::framework::ir::FCLstmFusePass); diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h new file mode 100644 index 00000000..5dea7c91 --- /dev/null +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h @@ -0,0 +1,51 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +// The MulLstmFusePass and MulLstmFusePass will fuse to the same FusionLstm op. + +// Just FC without bias +class FCLstmFusePass : public FusePassBase { + public: + virtual ~FCLstmFusePass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; + + const std::string name_scope_{"fc_lstm_fuse"}; +}; + +class MulLstmFusePass : public FusePassBase { + public: + virtual ~MulLstmFusePass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; + const std::string name_scope_{"fc_nobias_lstm_fuse"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.cc b/paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.cc new file mode 100644 index 00000000..915a2f62 --- /dev/null +++ b/paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.cc @@ -0,0 +1,83 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.h" +#include "paddle/fluid/framework/ir/graph_viz_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); +#define GET_NODES \ + GET_IR_NODE(fill_constant); \ + GET_IR_NODE(fill_constant_out); \ + GET_IR_NODE(elementwise_mul); \ + GET_IR_NODE(elementwise_mul_out); + +void FillconstantElementwisemulFuse::ApplyImpl(ir::Graph* graph) const { + const std::string pattern_name = "fillconstant_elementwisemul_fuse"; + FusePassBase::Init(pattern_name, graph); + + GraphPatternDetector gpd; + auto* x = gpd.mutable_pattern() + ->NewNode("x") + ->assert_is_op_input("elementwise_mul", "X") + ->AsInput(); + + patterns::FillConstantElementWiseMulFuse pattern(gpd.mutable_pattern(), + pattern_name); + pattern(x); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_NODES; + + PADDLE_ENFORCE(subgraph.count(x)); + auto* elementwise_in = subgraph.at(x); + float constant_value = + boost::get(fill_constant->Op()->GetAttr("value")); + + framework::OpDesc new_op_desc; + new_op_desc.SetType("scale"); + new_op_desc.SetInput("X", {elementwise_in->Name()}); + new_op_desc.SetAttr("scale", constant_value); + new_op_desc.SetAttr("bias", static_cast(0.0)); + new_op_desc.SetAttr("bias_after_scale", true); + new_op_desc.SetOutput("Out", {elementwise_mul_out->Name()}); + new_op_desc.Flush(); + + // Create a new node for the fused op. + auto* scale_op = graph->CreateOpNode(&new_op_desc); + + IR_NODE_LINK_TO(elementwise_in, scale_op); // Input + IR_NODE_LINK_TO(scale_op, elementwise_mul_out); // Output + + // Delete the unneeded nodes. + GraphSafeRemoveNodes(graph, + {fill_constant, fill_constant_out, elementwise_mul}); + }; + + gpd(graph, handler); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fillconstant_elementwisemul_fuse, + paddle::framework::ir::FillconstantElementwisemulFuse); diff --git a/paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.h b/paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.h new file mode 100644 index 00000000..ab66fb4a --- /dev/null +++ b/paddle/fluid/framework/ir/fillconstant_elementwisemul_fuse.h @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +class FillconstantElementwisemulFuse : public FusePassBase { + public: + virtual ~FillconstantElementwisemulFuse() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc new file mode 100644 index 00000000..7f9eccf2 --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc @@ -0,0 +1,370 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h" +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +void FuseElewiseAddActPass::ApplyImpl(ir::Graph *graph) const { + std::unordered_set act_types = {"relu", "scale", "tanh"}; + graph = FuseActElewiseAdd(graph, act_types); + graph = FuseElewiseAddAct(graph, act_types); + // backward + { + std::unordered_set in_place_act_types = {"relu_grad"}; + graph = FuseElewiseAddActInplaceGrad(graph, in_place_act_types); + } + + // Remove the removable intermediate_out. + RemoveIntermediateOut(graph); +} + +// ele_add(x, act(y)) +ir::Graph *FuseElewiseAddActPass::FuseElewiseAddAct( + ir::Graph *graph, const std::unordered_set &act_types) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init("elewise_add_act", graph); + + GraphPatternDetector gpd; + auto *x = gpd.mutable_pattern() + ->NewNode("elewise_add_act/x") + ->AsInput() + ->assert_is_op_input("elementwise_add", "X"); + patterns::ElewiseAddAct elewise_add_act_pattern(gpd.mutable_pattern(), + "elementwise_add"); + + elewise_add_act_pattern(x, act_types); + + int found_elewise_add_act_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle FuseElewiseAddAct fuse"; + GET_IR_NODE_FROM_SUBGRAPH(ele_y, ele_y, elewise_add_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out, + elewise_add_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, elewise_add_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act, act, elewise_add_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_add, ele_add, elewise_add_act_pattern); + + std::string ele_x_n = subgraph.at(x)->Name(); + std::string ele_y_n = ele_y->Name(); + std::string ele_out_n = ele_out->Name(); + std::string act_out_n = act_out->Name(); + + Node *elewise_add_act_node = CreateFuseElewiseAddActNode( + g, act, ele_add, ele_x_n, ele_y_n, ele_out_n, act_out_n); + + VLOG(4) << "\n\t " << ele_x_n << " and " << ele_y_n << " -> " + << ele_add->Name() << " -> " << ele_out_n << "\n" + << "\t " << ele_out_n << " -> " << act->Name() << " -> " + << act_out_n; + + ReLinkNodes(g, ele_out, ele_add, act, elewise_add_act_node); + found_elewise_add_act_count++; + }; + + gpd(graph, handler); + + AddStatis(found_elewise_add_act_count); + return graph; +} + +// act(ele_add(x,y)) +ir::Graph *FuseElewiseAddActPass::FuseActElewiseAdd( + ir::Graph *graph, const std::unordered_set &act_types) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init("act_elewise_add", graph); + + GraphPatternDetector gpd; + auto *x = gpd.mutable_pattern() + ->NewNode("act_elewise_add/x") + ->AsInput() + ->assert_is_ops_input(act_types, "X"); + patterns::ActElewiseAdd act_elewise_add_pattern(gpd.mutable_pattern(), + "act_elewise_add"); + + act_elewise_add_pattern(x, act_types); + + int found_elewise_add_act_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle FuseElewiseAddAct fuse"; + GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, act_elewise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_x, ele_x, act_elewise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_out, elewise_add_out, + act_elewise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act, act, act_elewise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_add, ele_add, act_elewise_add_pattern); + + std::string act_i_n = subgraph.at(x)->Name(); + std::string act_o_n = act_out->Name(); + std::string elewise_add_x_n = ele_x->Name(); + std::string elewise_add_out_n = ele_out->Name(); + + Node *elewise_add_act_node = CreateFuseElewiseAddActNode( + g, ele_add, act, elewise_add_x_n, act_i_n, act_o_n, elewise_add_out_n); + + VLOG(4) << "\n\t " << act_i_n << " -> " << act->Name() << " -> " << act_o_n + << "\n\t " << act_o_n << " and " << elewise_add_x_n << " -> " + << ele_add->Name() << " -> " << elewise_add_out_n; + + ReLinkNodes(g, act_out, act, ele_add, elewise_add_act_node); + found_elewise_add_act_count++; + }; + + gpd(graph, handler); + + AddStatis(found_elewise_add_act_count); + return graph; +} + +// the backward of act(ele_add(x,y)) +// act_grad: in["Out", "Out@GRAD"], out["X@GRAD"] +// ele_add_grad: in["Y", "Out@GRAD"], out["X@GRAD", "Y@GRAD"] +ir::Graph *FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad( + ir::Graph *graph, const std::unordered_set &act_types) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init("elewise_add_act_grad", graph); + + GraphPatternDetector gpd; + auto *d_act_out = gpd.mutable_pattern() + ->NewNode("elewise_add_act_grad_inplace/x") + ->AsInput() + ->assert_is_ops_input(act_types, GradVarName("Out")); + patterns::ElewiseAddActInplaceGrad elewise_add_act_grad_pattern( + gpd.mutable_pattern(), "elewise_add_act_grad_inplace"); + elewise_add_act_grad_pattern(d_act_out, act_types); + + int found_elewise_add_act_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle FuseElewiseAddActGrad1 fuse"; + GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, elewise_add_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act_grad, act_grad, elewise_add_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(d_itermediate_out, d_itermediate_out, + elewise_add_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_y, ele_y, elewise_add_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ele_add_grad, ele_add_grad, + elewise_add_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(d_ele_x, d_ele_x, elewise_add_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(d_ele_y, d_ele_y, elewise_add_act_grad_pattern); + + std::string d_act_out_n = subgraph.at(d_act_out)->Name(); + std::string act_out_n = act_out->Name(); + std::string d_itermediate_out_n = d_itermediate_out->Name(); + std::string ele_y_n = ele_y->Name(); + std::string d_ele_x_n = d_ele_x->Name(); + std::string d_ele_y_n = d_ele_y->Name(); + + OpDesc desc; + desc.SetType("fused_elemwise_activation_grad"); + desc.SetInput("IntermediateOut", {}); + desc.SetInput("X", {}); + desc.SetInput("Y", std::vector({ele_y_n})); + desc.SetInput("Out", std::vector({act_out_n})); + desc.SetInput(GradVarName("Out"), std::vector({d_act_out_n})); + desc.SetOutput(GradVarName("X"), std::vector({d_ele_x_n})); + desc.SetOutput(GradVarName("Y"), std::vector({d_ele_y_n})); + desc.SetOutput(GradVarName("IntermediateOut"), + std::vector({d_itermediate_out_n})); + + desc.SetAttr("save_intermediate_out", false); + desc.SetAttr("functor_list", + std::vector( + {act_grad->Op()->Type(), ele_add_grad->Op()->Type()})); + + for (auto &n : {act_grad->Op(), ele_add_grad->Op()}) { + for (auto &m_ele : n->GetAttrMap()) { + desc.SetAttr(m_ele.first, m_ele.second); + } + } + + auto fused_node = g->CreateOpNode(&desc); + + VLOG(4) << "\n\t " << d_act_out_n << " and " << act_out_n << " -> " + << act_grad->Name() << " -> " << d_itermediate_out_n << "\n\t " + << d_itermediate_out_n << " and " << act_out_n << " -> " + << ele_add_grad->Name() << " -> " << d_itermediate_out_n; + + ReLinkNodes(g, d_itermediate_out, act_grad, ele_add_grad, fused_node); + found_elewise_add_act_count++; + }; + + gpd(graph, handler); + + AddStatis(found_elewise_add_act_count); + return graph; +} + +Node *FuseElewiseAddActPass::CreateFuseElewiseAddActNode( + Graph *g, const Node *op_1, const Node *op_2, const std::string &ele_x_n, + const std::string &ele_y_n, const std::string &ele_out_n, + const std::string &act_out_n) const { + OpDesc desc; + desc.SetInput("X", std::vector({ele_x_n})); + desc.SetInput("Y", std::vector({ele_y_n})); + desc.SetOutput("Out", std::vector({act_out_n})); + desc.SetOutput("IntermediateOut", std::vector({ele_out_n})); + desc.SetType("fused_elemwise_activation"); + desc.SetAttr("save_intermediate_out", true); + desc.SetAttr("functor_list", std::vector( + {op_1->Op()->Type(), op_2->Op()->Type()})); + + // Set attrs + for (auto &n : {op_1->Op(), op_2->Op()}) { + for (auto &m_ele : n->GetAttrMap()) { + desc.SetAttr(m_ele.first, m_ele.second); + } + } + + auto elewise_add_act_node = g->CreateOpNode(&desc); + return elewise_add_act_node; +} + +void FuseElewiseAddActPass::RemoveIntermediateOut(Graph *graph) const { + std::unordered_set need_removed_nodes; + for (auto &cur_node : graph->Nodes()) { + if (cur_node->IsVar()) continue; + if (cur_node->Name() == "fused_elemwise_activation") { + bool save_intermediate_out = + boost::get(cur_node->Op()->GetAttr("save_intermediate_out")); + auto intermediate_out_args = cur_node->Op()->Output("IntermediateOut"); + PADDLE_ENFORCE( + save_intermediate_out && !intermediate_out_args.empty(), + "The %s should save the intermediate_out in the fusing stage.", + cur_node->Name()); + + // If the intermediate_out's output is empty, it should be removed. + auto cur_node_outputs = cur_node->outputs; + for (auto &out : cur_node_outputs) { + if (out->Name() == intermediate_out_args[0]) { + if (out->outputs.size() == 0) { + cur_node->outputs = this->RemoveNode(out, cur_node->outputs); + need_removed_nodes.insert(std::move(out)); + cur_node->Op()->SetAttr("save_intermediate_out", false); + } + } + } + } else if (cur_node->Name() == "fused_elemwise_activation_grad") { + auto intermediate_out_grad_args = + cur_node->Op()->Output(GradVarName("IntermediateOut")); + PADDLE_ENFORCE( + !intermediate_out_grad_args.empty(), + "The %s should save the intermediate_out in the fusing stage.", + cur_node->Name()); + auto cur_node_outputs = cur_node->outputs; + // If the intermediate_out_g's output is empty, it should be removed. + for (auto &out : cur_node_outputs) { + if (out->Name() == intermediate_out_grad_args[0] && + out->outputs.empty()) { + cur_node->Op()->SetOutput(GradVarName("IntermediateOut"), {}); + cur_node->outputs = this->RemoveNode(out, cur_node->outputs); + need_removed_nodes.insert(std::move(out)); + } + } + } + } + GraphSafeRemoveNodes(graph, need_removed_nodes); +} + +void FuseElewiseAddActPass::ReLinkNodes(Graph *graph, + const Node *intermediate_out, + Node *op_1, Node *op_2, + Node *fused_op) const { // delete act + for (auto &in : op_1->inputs) { + fused_op->inputs.emplace_back(in); + in->outputs = this->ReplaceNode(op_1, fused_op, in->outputs); + } + + std::unordered_set nodes2delete; + for (auto &out : op_1->outputs) { + if (out->IsCtrlVar()) { + auto result_iter = std::find_if( + op_2->inputs.begin(), op_2->inputs.end(), + [&out](const Node *node) -> bool { return node == out; }); + + if (result_iter == op_2->inputs.end()) { + IR_OP_VAR_LINK(fused_op, out); + } else { + nodes2delete.emplace(out); + } + } else { + PADDLE_ENFORCE(out == intermediate_out); + IR_OP_VAR_LINK(fused_op, out); + } + } + + for (auto &in : op_2->inputs) { + if (in == intermediate_out || nodes2delete.count(in)) { + continue; + } + fused_op->inputs.emplace_back(in); + in->outputs = this->ReplaceNode(op_2, fused_op, in->outputs); + } + + for (auto &out : op_2->outputs) { + IR_OP_VAR_LINK(fused_op, out); + } + + nodes2delete.insert(std::move(op_1)); + nodes2delete.insert(std::move(op_2)); + + GraphSafeRemoveNodes(graph, nodes2delete); +} + +std::vector FuseElewiseAddActPass::ReplaceNode( + Node *cur_node, Node *new_node, const std::vector &nodes) const { + std::vector new_list(nodes.size()); + bool has_replaced = false; + std::transform(nodes.begin(), nodes.end(), new_list.begin(), + [&](Node *node) -> Node * { + if (node == cur_node) { + has_replaced = true; + return new_node; + } + return node; + }); + PADDLE_ENFORCE(has_replaced, "Not find %s in the node list.", + cur_node->Name()); + return new_list; +} + +std::vector FuseElewiseAddActPass::RemoveNode( + Node *trg_node, const std::vector &nodes) const { + std::vector new_list(nodes.size()); + auto end_iter = + std::copy_if(nodes.begin(), nodes.end(), new_list.begin(), + [&](Node *node) -> bool { return node != trg_node; }); + new_list.resize( + static_cast(std::distance(new_list.begin(), end_iter))); + return new_list; +} +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fuse_elewise_add_act_pass, + paddle::framework::ir::FuseElewiseAddActPass); diff --git a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h new file mode 100644 index 00000000..dc73f1fd --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h @@ -0,0 +1,74 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Fuse the ElewiseAdd and activation + */ +class FuseElewiseAddActPass : public FusePassBase { + public: + virtual ~FuseElewiseAddActPass() {} + + protected: + void ApplyImpl(ir::Graph *graph) const override; + + ir::Graph *FuseElewiseAddAct( + ir::Graph *graph, const std::unordered_set &act_types) const; + + ir::Graph *FuseActElewiseAdd( + ir::Graph *graph, const std::unordered_set &act_types) const; + + ir::Graph *FuseElewiseAddActInplaceGrad( + ir::Graph *graph, const std::unordered_set &act_types) const; + + /** + * Remove the removable intermediate_out. + * - If the intermediate_out is only used by the backward op, but the + * backward op doesn't use intermediate_out. + * - If the intermediate_out_grad is not used by any op. + */ + void RemoveIntermediateOut(Graph *graph) const; + + std::vector ReplaceNode(Node *cur_node, Node *new_node, + const std::vector &nodes) const; + + std::vector RemoveNode(Node *trg_node, + const std::vector &nodes) const; + + void ReLinkNodes(Graph *graph, const Node *intermediate_out, Node *op_1, + Node *op_2, Node *fused_op) const; + Node *CreateFuseElewiseAddActNode(Graph *g, const Node *op_1, + const Node *op_2, + const std::string &ele_x_n, + const std::string &ele_y_n, + const std::string &ele_out_n, + const std::string &act_out_n) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/CMakeLists.txt b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/CMakeLists.txt new file mode 100644 index 00000000..22876e96 --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/CMakeLists.txt @@ -0,0 +1,4 @@ +cc_library(fuse_optimizer_op_pass SRCS fuse_optimizer_op_pass.cc DEPS graph graph_helper) +cc_library(fuse_adam_op_pass SRCS fuse_adam_op_pass.cc DEPS fuse_optimizer_op_pass) +cc_library(fuse_sgd_op_pass SRCS fuse_sgd_op_pass.cc DEPS fuse_optimizer_op_pass) +cc_library(fuse_momentum_op_pass SRCS fuse_momentum_op_pass.cc DEPS fuse_optimizer_op_pass) diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc new file mode 100644 index 00000000..504ff04c --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc @@ -0,0 +1,209 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace framework { +namespace ir { + +class FuseAdamOpPass : public FuseOptimizerOpPass { + private: + const std::string GetOpType() const { return "adam"; } + + const std::vector GetAuxiliaryVarNames() const { + return {"Moment1", "Moment2", "Beta1Pow", "Beta2Pow"}; + } + + void FuseOptimizerOps( + const std::unordered_map> + &aux_var_set, + const std::unordered_map &fused_vars_name, + const std::vector &adam_ops, ir::Graph *graph) const { + FuseAdamOps(aux_var_set, fused_vars_name, adam_ops, graph); + FuseScaleOps(aux_var_set.at("Beta1Pow"), fused_vars_name.at("Beta1Pow"), + adam_ops, graph); + FuseScaleOps(aux_var_set.at("Beta2Pow"), fused_vars_name.at("Beta2Pow"), + adam_ops, graph); + } + + void FuseAdamOps( + const std::unordered_map> &vars_set, + const std::unordered_map &fused_vars_name, + const std::vector &adam_ops, ir::Graph *graph) const { + PADDLE_ENFORCE_GT(adam_ops.size(), static_cast(0)); + + // Check attributions + // NOTE: If new attribution is added, the following code maybe need change. + int op_role = boost::get( + adam_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); + float beta1 = boost::get(adam_ops[0]->Op()->GetAttr("beta1")); + float beta2 = boost::get(adam_ops[0]->Op()->GetAttr("beta2")); + float epsilon = boost::get(adam_ops[0]->Op()->GetAttr("epsilon")); + bool lazy_mode = boost::get(adam_ops[0]->Op()->GetAttr("lazy_mode")); + int64_t min_row_size_to_use_multithread = boost::get( + adam_ops[0]->Op()->GetAttr("min_row_size_to_use_multithread")); + for (auto &adam_op : adam_ops) { + PADDLE_ENFORCE_EQ(beta1, + boost::get(adam_op->Op()->GetAttr("beta1"))); + PADDLE_ENFORCE_EQ(beta2, + boost::get(adam_op->Op()->GetAttr("beta2"))); + PADDLE_ENFORCE_EQ(epsilon, + boost::get(adam_op->Op()->GetAttr("epsilon"))); + PADDLE_ENFORCE_EQ(lazy_mode, + boost::get(adam_op->Op()->GetAttr("lazy_mode"))); + PADDLE_ENFORCE_EQ(min_row_size_to_use_multithread, + boost::get(adam_op->Op()->GetAttr( + "min_row_size_to_use_multithread"))); + PADDLE_ENFORCE_EQ(op_role, + boost::get(adam_op->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName()))); + } + + // NOTE: fused_var is only exist in scope, so the graph doesn't have + // fused_var node. + + VLOG(7) << "Insert adam to graph "; + OpDesc adam_desc(adam_ops[0]->Op()->Block()); + adam_desc.SetType("adam"); + adam_desc.SetInput(kParam, {fused_vars_name.at(kParam)}); + adam_desc.SetInput(kGrad, {fused_vars_name.at(kGrad)}); + adam_desc.SetInput("Moment1", {fused_vars_name.at("Moment1")}); + adam_desc.SetInput("Moment2", {fused_vars_name.at("Moment2")}); + // TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal. + adam_desc.SetInput(kLearningRate, adam_ops[0]->Op()->Input(kLearningRate)); + adam_desc.SetInput("Beta1Pow", adam_ops[0]->Op()->Input("Beta1Pow")); + adam_desc.SetInput("Beta2Pow", adam_ops[0]->Op()->Input("Beta2Pow")); + + adam_desc.SetOutput("ParamOut", {fused_vars_name.at(kParam)}); + adam_desc.SetOutput("Moment1Out", {fused_vars_name.at("Moment1")}); + adam_desc.SetOutput("Moment2Out", {fused_vars_name.at("Moment2")}); + adam_desc.SetAttr("beta1", beta1); + adam_desc.SetAttr("beta2", beta2); + adam_desc.SetAttr("epsilon", epsilon); + adam_desc.SetAttr("lazy_mode", lazy_mode); + adam_desc.SetAttr("min_row_size_to_use_multithread", + min_row_size_to_use_multithread); + adam_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role); + + auto adam_node = graph->CreateOpNode(&adam_desc); + + InserInputAndOutputForOptOps(adam_ops, adam_node); + } + + void FuseScaleOps(const std::vector &beta_name, + const std::string &fused_var_name, + const std::vector &adam_ops, + ir::Graph *graph) const { + PADDLE_ENFORCE_EQ(beta_name.size(), adam_ops.size()); + const std::string scale_op_name = "scale"; + + // Get the scale_ops of dealing the adam's beta var. + std::vector scale_ops; + scale_ops.reserve(beta_name.size()); + for (size_t i = 0; i < adam_ops.size(); ++i) { + auto &beta_1_pow_name = beta_name[i]; + auto beta_pow_iter = std::find_if( + adam_ops[i]->inputs.begin(), adam_ops[i]->inputs.end(), + [&beta_name, &beta_1_pow_name](ir::Node *var_node) -> bool { + return var_node->Var() && + var_node->Var()->Name() == beta_1_pow_name; + }); + PADDLE_ENFORCE(beta_pow_iter != adam_ops[i]->inputs.end()); + + auto beta_pow_node = *beta_pow_iter; + auto scale_op_iter = std::find_if( + beta_pow_node->outputs.begin(), beta_pow_node->outputs.end(), + [&scale_op_name](ir::Node *op_node) -> bool { + return op_node->Op() && op_node->Op()->Type() == scale_op_name; + }); + PADDLE_ENFORCE(scale_op_iter != beta_pow_node->outputs.end()); + + scale_ops.emplace_back(*scale_op_iter); + } + PADDLE_ENFORCE_EQ(scale_ops.size(), beta_name.size()); + + // Check attributions + // NOTE: If new attribution is added, the following code maybe need change. + int op_role = boost::get( + scale_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); + float scale = boost::get(scale_ops[0]->Op()->GetAttr("scale")); + float bias = boost::get(scale_ops[0]->Op()->GetAttr("bias")); + bool bias_after_scale = + boost::get(scale_ops[0]->Op()->GetAttr("bias_after_scale")); + for (auto &scale_op : scale_ops) { + PADDLE_ENFORCE_EQ(scale, + boost::get(scale_op->Op()->GetAttr("scale"))); + PADDLE_ENFORCE_EQ(bias, + boost::get(scale_op->Op()->GetAttr("bias"))); + PADDLE_ENFORCE_EQ( + bias_after_scale, + boost::get(scale_op->Op()->GetAttr("bias_after_scale"))); + PADDLE_ENFORCE_EQ(op_role, + boost::get(scale_op->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName()))); + } + + // NOTE: fused_var is only exist in scope, so the graph doesn't have + // fused_var node. + + VLOG(7) << "Insert fused scale to graph."; + OpDesc scale_desc(scale_ops[0]->Op()->Block()); + scale_desc.SetType("scale"); + scale_desc.SetInput("X", {fused_var_name}); + scale_desc.SetOutput("Out", {fused_var_name}); + scale_desc.SetAttr("scale", scale); + scale_desc.SetAttr("bias", bias); + scale_desc.SetAttr("bias_after_scale", bias_after_scale); + scale_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role); + auto scale_node = graph->CreateOpNode(&scale_desc); + + for (auto scale_op : scale_ops) { + // set inputs + scale_node->inputs.insert(scale_node->inputs.begin(), + scale_op->inputs.begin(), + scale_op->inputs.end()); + for (auto &input : scale_op->inputs) { + std::replace(input->outputs.begin(), input->outputs.end(), scale_op, + scale_node); + } + // set outputs + scale_node->outputs.insert(scale_node->outputs.begin(), + scale_op->outputs.begin(), + scale_op->outputs.end()); + for (auto &output : scale_op->outputs) { + std::replace(output->inputs.begin(), output->inputs.end(), scale_op, + scale_node); + } + } + + // Delete scale_ops + for (auto &scale_op : scale_ops) { + graph->RemoveNode(scale_op); + } + } +}; +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fuse_adam_op_pass, paddle::framework::ir::FuseAdamOpPass) + .RequirePassAttr(paddle::framework::details::kPlaces) + .RequirePassAttr(paddle::framework::details::kLocalScopes); diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc new file mode 100644 index 00000000..3ac92d17 --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace framework { +namespace ir { + +class FuseMomentumOpPass : public FuseOptimizerOpPass { + private: + virtual const std::string GetOpType() const { return "momentum"; } + + virtual const std::vector GetAuxiliaryVarNames() const { + return {"Velocity"}; + } + + // Fuse Momentum Ops + virtual void FuseOptimizerOps( + const std::unordered_map> &vars_set, + const std::unordered_map &fused_vars_name, + const std::vector &momentum_ops, ir::Graph *graph) const { + PADDLE_ENFORCE_GT(momentum_ops.size(), static_cast(0)); + + // Check attributions + // NOTE: If new attribution is added, the following code maybe need change. + int op_role = boost::get(momentum_ops[0]->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())); + float mu = boost::get(momentum_ops[0]->Op()->GetAttr("mu")); + bool use_nesterov = + boost::get(momentum_ops[0]->Op()->GetAttr("use_nesterov")); + + for (auto &momentum_op : momentum_ops) { + PADDLE_ENFORCE_EQ(mu, + boost::get(momentum_op->Op()->GetAttr("mu"))); + PADDLE_ENFORCE_EQ( + use_nesterov, + boost::get(momentum_op->Op()->GetAttr("use_nesterov"))); + PADDLE_ENFORCE_EQ(op_role, + boost::get(momentum_op->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName()))); + } + + // NOTE: fused_var is only exist in scope, so the graph doesn't have + // fused_var node. + + VLOG(7) << "Insert momentum to graph "; + OpDesc momentum_desc(momentum_ops[0]->Op()->Block()); + momentum_desc.SetType("momentum"); + momentum_desc.SetInput(kParam, {fused_vars_name.at(kParam)}); + momentum_desc.SetInput(kGrad, {fused_vars_name.at(kGrad)}); + momentum_desc.SetInput("Velocity", {fused_vars_name.at("Velocity")}); + // TODO(zcd): The LearningRate should be equal. + momentum_desc.SetInput(kLearningRate, + momentum_ops[0]->Op()->Input(kLearningRate)); + + momentum_desc.SetOutput("ParamOut", {fused_vars_name.at(kParam)}); + momentum_desc.SetOutput("VelocityOut", {fused_vars_name.at("Velocity")}); + momentum_desc.SetAttr("mu", mu); + momentum_desc.SetAttr("use_nesterov", use_nesterov); + momentum_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role); + + auto momentum_node = graph->CreateOpNode(&momentum_desc); + + InserInputAndOutputForOptOps(momentum_ops, momentum_node); + } +}; + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fuse_momentum_op_pass, paddle::framework::ir::FuseMomentumOpPass) + .RequirePassAttr(paddle::framework::details::kPlaces) + .RequirePassAttr(paddle::framework::details::kLocalScopes); diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc new file mode 100644 index 00000000..ee601145 --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc @@ -0,0 +1,364 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h" +#include +#include +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace framework { +namespace ir { + +void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { + ir::Graph &result = *graph; + + const std::string fuse_op_type = GetOpType(); + std::vector aux_var_names = GetAuxiliaryVarNames(); + aux_var_names.emplace_back(kParam); + aux_var_names.emplace_back(kGrad); + + // Step 1: Get the specified op and auxiliary variables. + std::vector topo_nodes = ir::TopologySortOperations(result); + auto vars_info = GetVarInfo(result); + std::vector opt_nodes; + size_t opt_ops_num = 0; + // Note: Only take care about the dense gradients. + for (auto &node : topo_nodes) { + if (node->Op()->Type() == fuse_op_type) { + auto grad_name = node->Op()->Input(kGrad); + PADDLE_ENFORCE_EQ(grad_name.size(), static_cast(1)); + if (IsLoDTensorType(GetTypeOfVar(vars_info, grad_name[0]))) { + opt_nodes.emplace_back(node); + } + ++opt_ops_num; + } + } + + VLOG(6) << "Find " << fuse_op_type << " operators : " << opt_ops_num + << ", and " << opt_nodes.size() << " for dense gradients "; + if (opt_nodes.size() == 0 || result.Has(details::kFusedOptType)) { + if (result.Has(details::kFusedOptType)) { + auto &opt_type = + result.Get(details::kFusedOptType); + VLOG(6) << "Currently only support fusing one type optimizer op. " + "Has fused " + << opt_type; + } + return; + } + result.Set(details::kFusedOptType, new details::FusedOptType); + result.Get(details::kFusedOptType) = fuse_op_type; + if (!result.Has(details::kProgramDescs)) { + result.Set(details::kProgramDescs, new details::ProgramDescs); + } + + // Step 2: Insert fused_var_name to FusedVars, and the FusedVars need be + // initialized in scopes before execution. + if (!result.Has(details::kFusedVars)) { + result.Set(details::kFusedVars, new details::FusedVars); + } + std::unordered_map> aux_var_set; + GetSpecifiedOpsAndVars(aux_var_names, opt_nodes, &aux_var_set); + std::unordered_map fused_vars_name; + fused_vars_name.reserve(aux_var_names.size()); + auto &fused_var_set = result.Get(details::kFusedVars); + const std::string prefix(details::kFusedVarNamePrefix); + for (auto &var_name : aux_var_names) { + // NOTE: the fused_var_name should be unique. + auto fused_var_name = prefix + "_" + fuse_op_type + "_" + var_name + "_" + + aux_var_set[var_name][0]; + VLOG(6) << var_name << ": " << fused_var_name; + PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0); + fused_var_set.insert(fused_var_name); + fused_vars_name.emplace(var_name, fused_var_name); + } + + // Step 3: Get the fused Gradient's name + bool grad_fused = false; + if (result.Has(details::kParamsAndDenseGrads)) { + // NOTE: kParamsAndDenseGrads is generated by + // alloc_continue_space_for_grad_pass + auto ¶ms_and_dense_grads = + result.Get(details::kParamsAndDenseGrads); + PADDLE_ENFORCE_LE( + params_and_dense_grads.size(), aux_var_set.at(kGrad).size(), + "The number of dense gradients should be little than optimizer ops."); + std::unordered_set opt_grad_set(aux_var_set.at(kGrad).size()); + for (auto &p_g : params_and_dense_grads) { + opt_grad_set.insert(p_g.second); + } + std::vector new_grad_idx; + for (size_t idx = 0; idx < aux_var_set.at(kGrad).size(); ++idx) { + auto &grad = aux_var_set.at(kGrad).at(idx); + if (!opt_grad_set.count(grad)) { + new_grad_idx.emplace_back(idx); + } + } + + // NOTE(zcd): the gradient of kParamsAndDenseGrads may be different + // with the kGrad. The gradients of kParamsAndDenseGrads is + // collected during backward stage, but in optimization state, the + // some gradient's name maybe changed. + if (new_grad_idx.size() == 0) { + if (!result.Has(details::kFusedGrads)) { + PADDLE_THROW( + "The coalesce_grad_tensor_pass should " + "be called before this pass."); + } + auto &fused_grad = result.Get(details::kFusedGrads); + PADDLE_ENFORCE_NE(fused_grad.size(), 0, + "The fused gradient should not be empty."); + PADDLE_ENFORCE_EQ(fused_grad.size(), 1, + "Because the dtype of those gradients " + "is not unified, so the number of fused gradients is " + "more than one, but it is not supported currently."); + auto &fused_vars = result.Get(details::kFusedVars); + auto iter = + std::find(fused_vars.begin(), fused_vars.end(), fused_grad.front()); + PADDLE_ENFORCE(iter != fused_vars.end(), "Not find the fused_grad."); + fused_vars_name[kGrad] = fused_grad.front(); + + // Sort the parameters and auxiliary variables according + // to parameters' name to make variables' name correspond correctly. + SortParametersAndAuxVars(params_and_dense_grads, &aux_var_set, + &opt_nodes); + grad_fused = true; + } else { + VLOG(10) << "The number of new gradients is " << new_grad_idx.size(); + if (new_grad_idx.size() == 1) return; + // NOTE(zcd): If the gradients of backward stage and optimization stage + // have diff, Only take care of the the gradient of optimization stage. + GradientsFilter(new_grad_idx, &opt_nodes, &aux_var_set); + } + } + + // Step 4: Alloc continuous space for Parameters and AuxiliaryVar(e.g. + // Moment1, Moment2, Beta1Pow, Beta2Pow) of all the optimizer ops + // separately. + if (!grad_fused) { + InitFusedGradsAndAllocSpaceForGrads(aux_var_set.at(kParam), + aux_var_set.at(kGrad), + fused_vars_name.at(kGrad), &result); + } + aux_var_names.pop_back(); + InitFusedVarsAndAllocSpaceForVars(aux_var_names, aux_var_set, fused_vars_name, + &result); + + // Step 5: Fuse optimizer Ops and Scale Ops + FuseOptimizerOps(aux_var_set, fused_vars_name, opt_nodes, &result); + + // Step 6: Remove optimizer Ops + for (auto &opt_op : opt_nodes) { + graph->RemoveNode(opt_op); + } +} + +void FuseOptimizerOpPass::GradientsFilter( + const std::vector &new_grad_idx, std::vector *opt_nodes, + std::unordered_map> *aux_var_set) + const { + for (auto &aux_vars : *aux_var_set) { + std::vector sorted_vars; + sorted_vars.reserve(aux_vars.second.size()); + for (size_t i : new_grad_idx) { + sorted_vars.emplace_back(aux_vars.second.at(i)); + } + std::swap(aux_vars.second, sorted_vars); + if (VLOG_IS_ON(6)) { + std::stringstream out; + for (auto &var_name : aux_vars.second) { + out << var_name << " "; + } + VLOG(6) << aux_vars.first << ": " << out.str(); + } + } + std::vector sorted_ops; + for (size_t i : new_grad_idx) { + sorted_ops.emplace_back(opt_nodes->at(i)); + } + std::swap(*opt_nodes, sorted_ops); +} + +void FuseOptimizerOpPass::InitFusedGradsAndAllocSpaceForGrads( + const std::vector ¶ms, + const std::vector &grads, const std::string &fused_grad_name, + ir::Graph *result) const { + auto vars_info = GetVarInfo(*result); + // Set Gradients as Persistable to prevent this var becoming reusable. + for (auto &grad_var_name : grads) { + auto iter = vars_info.find(grad_var_name); + PADDLE_ENFORCE(iter != vars_info.end()); + PADDLE_ENFORCE(!iter->second.empty()); + PADDLE_ENFORCE_NOT_NULL(iter->second.front()->Var()); + PADDLE_ENFORCE(IsLoDTensorType(iter->second.front()->Var()->GetType()), + "Currently the gradient type only should be LoDTensor when " + "fusing optimizer ops."); + for (auto var : iter->second) { + var->Var()->SetPersistable(true); + } + } + + // Define Ops + result->Get(details::kProgramDescs).emplace_back(); + ProgramDesc &program_desc = + result->Get(details::kProgramDescs).back(); + auto *global_block = program_desc.MutableBlock(0); + AppendAllocContinuousSpace(params, grads, fused_grad_name, global_block, + false, false); +} + +std::unordered_map> +FuseOptimizerOpPass::GetVarInfo(const Graph &result) const { + std::unordered_map> vars; + for (Node *node : result.Nodes()) { + if (node->IsVar() && node->Var()) { + // Note: The graph may have the same name node. For example, parameter + // is the input of operator and it also is the output of optimizer; + vars[node->Var()->Name()].emplace_back(node); + } + } + return vars; +} + +bool FuseOptimizerOpPass::IsLoDTensorType( + const proto::VarType::Type &type) const { + // Current only support LOD_TENSOR. + return type == proto::VarType::LOD_TENSOR; +} + +proto::VarType::Type FuseOptimizerOpPass::GetTypeOfVar( + const std::unordered_map> &var_nodes, + const std::string &name) const { + auto grad_iter = var_nodes.find(name); + PADDLE_ENFORCE(grad_iter != var_nodes.end()); + PADDLE_ENFORCE(grad_iter->second.size() > 0); + PADDLE_ENFORCE_NOT_NULL(grad_iter->second.front()->Var()); + return grad_iter->second.front()->Var()->GetType(); +} + +void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars( + const std::vector &aux_var_names, + const std::unordered_map> + &aux_var_set, + const std::unordered_map &fused_vars_name, + ir::Graph *result) const { + // Define Ops + result->Get(details::kProgramDescs).emplace_back(); + ProgramDesc &program_desc = + result->Get(details::kProgramDescs).back(); + auto *global_block = program_desc.MutableBlock(0); + for (auto &var_name : aux_var_names) { + AppendAllocContinuousSpace( + aux_var_set.at(var_name), aux_var_set.at(var_name), + fused_vars_name.at(var_name), global_block, true); + } +} + +void FuseOptimizerOpPass::SortParametersAndAuxVars( + const std::vector> ¶ms_grads, + std::unordered_map> *aux_vars_set, + std::vector *ops) const { + PADDLE_ENFORCE_NE(aux_vars_set->count(kParam), static_cast(0)); + auto ¶m_vec = aux_vars_set->at(kParam); + + std::vector param_sort_idx; + param_sort_idx.reserve(param_vec.size()); + + for (auto &p_g : params_grads) { + auto iter = std::find(param_vec.begin(), param_vec.end(), p_g.first); + PADDLE_ENFORCE(iter != param_vec.end()); + auto idx = std::distance(param_vec.begin(), iter); + param_sort_idx.emplace_back(idx); + } + + for (auto &aux_vars : *aux_vars_set) { + std::vector sorted_vars; + sorted_vars.reserve(aux_vars.second.size()); + for (size_t i = 0; i < aux_vars.second.size(); ++i) { + sorted_vars.emplace_back(aux_vars.second.at(param_sort_idx[i])); + } + std::swap(aux_vars.second, sorted_vars); + + if (VLOG_IS_ON(6)) { + std::stringstream out; + for (auto &var_name : aux_vars.second) { + out << var_name << " "; + } + VLOG(6) << aux_vars.first << ": " << out.str(); + } + } + + std::vector sorted_ops; + sorted_ops.reserve(ops->size()); + for (size_t i = 0; i < ops->size(); ++i) { + sorted_ops.emplace_back(ops->at(param_sort_idx[i])); + } + std::swap(*ops, sorted_ops); +} + +void FuseOptimizerOpPass::GetSpecifiedOpsAndVars( + const std::vector &aux_vars_name, + const std::vector &opt_nodes, + std::unordered_map> *aux_args_name) + const { + for (auto &node : opt_nodes) { + std::stringstream out; + for (auto &var_n : aux_vars_name) { + auto arg_names = node->Op()->Input(var_n); + PADDLE_ENFORCE_EQ(arg_names.size(), static_cast(1)); + (*aux_args_name)[var_n].emplace_back(arg_names[0]); + out << var_n << ", " << arg_names[0] << "; "; + } + } +} + +void FuseOptimizerOpPass::AppendAllocContinuousSpace( + const std::vector &in_args, + const std::vector &out_args, const std::string &fused_out_arg, + BlockDesc *global_block, bool copy_data, bool check_name) const { + auto op_desc = global_block->AppendOp(); + op_desc->SetType("coalesce_tensor"); + op_desc->SetInput("Input", in_args); + op_desc->SetOutput("Output", out_args); + op_desc->SetOutput("FusedOutput", {fused_out_arg}); + op_desc->SetAttr("copy_data", copy_data); + op_desc->SetAttr("check_name", check_name); +} + +void FuseOptimizerOpPass::InserInputAndOutputForOptOps( + const std::vector &opt_nodes, ir::Node *opt_node) const { + std::unordered_set inputs; + std::unordered_set outputs; + for (auto opt_op : opt_nodes) { + // set inputs + inputs.insert(opt_op->inputs.begin(), opt_op->inputs.end()); + for (auto &input : opt_op->inputs) { + replace(input->outputs.begin(), input->outputs.end(), opt_op, opt_node); + } + // set outputs + outputs.insert(opt_op->outputs.begin(), opt_op->outputs.end()); + for (auto &output : opt_op->outputs) { + replace(output->inputs.begin(), output->inputs.end(), opt_op, opt_node); + } + } + opt_node->inputs.insert(opt_node->inputs.begin(), inputs.begin(), + inputs.end()); + opt_node->outputs.insert(opt_node->outputs.begin(), outputs.begin(), + outputs.end()); +} +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h new file mode 100644 index 00000000..0432d8c4 --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h @@ -0,0 +1,98 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/details/build_strategy.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace framework { +namespace ir { + +constexpr char kGrad[] = "Grad"; +constexpr char kParam[] = "Param"; +constexpr char kLearningRate[] = "LearningRate"; + +class FuseOptimizerOpPass : public ir::Pass { + protected: + void ApplyImpl(ir::Graph *graph) const override; + + protected: + virtual void SortParametersAndAuxVars( + const std::vector> ¶ms_grads, + std::unordered_map> *aux_var_set, + std::vector *ops) const; + + void InserInputAndOutputForOptOps(const std::vector &opt_ops, + ir::Node *opt_node) const; + + private: + virtual const std::string GetOpType() const = 0; + + virtual const std::vector GetAuxiliaryVarNames() const = 0; + + virtual void FuseOptimizerOps( + const std::unordered_map> &vars_set, + const std::unordered_map &fused_vars_name, + const std::vector &adam_ops, ir::Graph *graph) const = 0; + + void GetSpecifiedOpsAndVars( + const std::vector &aux_vars_name, + const std::vector &opt_nodes, + std::unordered_map> *aux_args_name) + const; + + void AppendAllocContinuousSpace(const std::vector &in_args, + const std::vector &out_args, + const std::string &fused_out_arg, + BlockDesc *global_block, bool copy_data, + bool check_name = true) const; + + void InitFusedGradsAndAllocSpaceForGrads( + const std::vector ¶ms, + const std::vector &grads, const std::string &fused_grad_name, + ir::Graph *result) const; + + void InitFusedVarsAndAllocSpaceForVars( + const std::vector &aux_var_names, + const std::unordered_map> + &aux_var_set, + const std::unordered_map &fused_vars_name, + ir::Graph *result) const; + + std::unordered_map> GetVarInfo( + const Graph &result) const; + + proto::VarType::Type GetTypeOfVar( + const std::unordered_map> &var_nodes, + const std::string &name) const; + + void GradientsFilter(const std::vector &new_grad_idx, + std::vector *opt_nodes, + std::unordered_map> + *aux_var_set) const; + + bool IsLoDTensorType(const proto::VarType::Type &type) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc new file mode 100644 index 00000000..077e393c --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h" +#include "paddle/fluid/framework/op_registry.h" +namespace paddle { +namespace framework { +namespace ir { + +class FuseSgdOpPass : public FuseOptimizerOpPass { + private: + virtual const std::string GetOpType() const { return "sgd"; } + + virtual const std::vector GetAuxiliaryVarNames() const { + return {}; + } + + // Fuse Sgd Ops + virtual void FuseOptimizerOps( + const std::unordered_map> &vars_set, + const std::unordered_map &fused_vars_name, + const std::vector &sgd_ops, ir::Graph *graph) const { + PADDLE_ENFORCE_GT(sgd_ops.size(), static_cast(0)); + + // NOTE: fused_var is only exist in scope, so the graph doesn't have + // fused_var node. + + int op_role = boost::get( + sgd_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); + VLOG(7) << "Insert sgd to graph "; + // Add fused scale + OpDesc Sgd_desc(sgd_ops[0]->Op()->Block()); + Sgd_desc.SetType("sgd"); + Sgd_desc.SetInput(kParam, {fused_vars_name.at(kParam)}); + Sgd_desc.SetInput(kGrad, {fused_vars_name.at(kGrad)}); + Sgd_desc.SetOutput("ParamOut", {fused_vars_name.at(kParam)}); + + // TODO(zcd): The LearningRate should be equal. + Sgd_desc.SetInput(kLearningRate, sgd_ops[0]->Op()->Input(kLearningRate)); + + // NOTE: multi_devices_pass requires that every op should have a role. + Sgd_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role); + + auto sgd_node = graph->CreateOpNode(&Sgd_desc); + + InserInputAndOutputForOptOps(sgd_ops, sgd_node); + } +}; +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fuse_sgd_op_pass, paddle::framework::ir::FuseSgdOpPass) + .RequirePassAttr(paddle::framework::details::kPlaces) + .RequirePassAttr(paddle::framework::details::kLocalScopes); diff --git a/paddle/fluid/framework/ir/fuse_pass_base.cc b/paddle/fluid/framework/ir/fuse_pass_base.cc new file mode 100644 index 00000000..5e252360 --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_pass_base.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include + +namespace paddle { +namespace framework { +namespace ir { + +void FusePassBase::Init(const std::string& repr, Graph* graph) const { + repr_ = repr; + graph_ = graph; +} + +Scope* FusePassBase::param_scope() const { + PADDLE_ENFORCE(graph_->Has(kParamScopeAttr)); + auto& scope = graph_->Get(kParamScopeAttr); + return &scope; +} + +void FusePassBase::AddStatis(int count_of_fused) const { + PADDLE_ENFORCE(graph_); + PADDLE_ENFORCE(!repr_.empty()); + if (!graph_->Has(kFuseStatisAttr)) { + graph_->Set(kFuseStatisAttr, new std::unordered_map); + } + auto& info = + graph_->Get>(kFuseStatisAttr); + info[repr_] = count_of_fused; +} + +FuseOptions FusePassBase::FindFuseOption(const Node& node1, + const Node& node2) const { +#ifdef PADDLE_WITH_MKLDNN + bool node1_mkldnn = node1.Op()->HasAttr("use_mkldnn") && + boost::get(node1.Op()->GetAttr("use_mkldnn")); + bool node2_mkldnn = node2.Op()->HasAttr("use_mkldnn") && + boost::get(node2.Op()->GetAttr("use_mkldnn")); + if (node1_mkldnn && node2_mkldnn) + return FUSE_MKLDNN; + else if (!node1_mkldnn && !node2_mkldnn) + return FUSE_NATIVE; + else + return DO_NOT_FUSE; +#else + return FUSE_NATIVE; +#endif +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/fuse_pass_base.h b/paddle/fluid/framework/ir/fuse_pass_base.h new file mode 100644 index 00000000..3a1022bb --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_pass_base.h @@ -0,0 +1,57 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { +namespace ir { + +static const char kParamScopeAttr[] = "__param_scope__"; +static const char kFuseStatisAttr[] = "__fuse_statis__"; +// When we use trt or other third_party lib, the parameters are managed by +// the lib, but not the fluid. So we need to record them to avoid duplicate +// allocation. +static const char kRepetitiveParamAttr[] = "__repetitive_param__"; + +enum FuseOptions { + DO_NOT_FUSE, // fusing will not be done + FUSE_NATIVE, // fusing will be done without MKL-DNN + FUSE_MKLDNN // fusing will be done with MKL-DNN +}; + +class FusePassBase : public Pass { + public: + void Init(const std::string& repr, Graph* graph) const; + Scope* param_scope() const; + void AddStatis(int count_of_fused) const; + + virtual ~FusePassBase() {} + + protected: + virtual FuseOptions FindFuseOption(const Node& node1, + const Node& node2) const; + + mutable Graph* graph_; + mutable std::string repr_; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc new file mode 100644 index 00000000..c4e6b6e6 --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc @@ -0,0 +1,157 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h" +#include +#include +#include +#include +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +void FuseReluDepthwiseConvPass::ApplyImpl(ir::Graph *graph) const { + graph = FuseReluDepthwiseConv(graph, true); + graph = FuseReluDepthwiseConv(graph, false); +} + +ir::Graph *FuseReluDepthwiseConvPass::FuseReluDepthwiseConv( + ir::Graph *graph, bool only_forward) const { + PADDLE_ENFORCE(graph); + if (only_forward) + FusePassBase::Init("relu_depthwise_conv_only_forward", graph); + else + FusePassBase::Init("relu_depthwise_conv", graph); + /* + x ---act--> y ---layer-> z + +----------+ + ↓ ↓ + x' <--act'--- y' <-layer'--- z' + + fuse to: + + x ---act-layer-> z + | + ↓ + x' <--act-layer'--- z' + + */ + + GraphPatternDetector gpd; + auto *pattern = gpd.mutable_pattern(); + std::string act_type = "relu"; + std::string layer_type = "depthwise_conv2d"; + auto *x = pattern->NewNode("x")->AsInput(); + auto *y = pattern->NewNode("y")->AsIntermediate(); + auto *z = pattern->NewNode("z")->AsOutput(); + PDNode *xg = nullptr; + PDNode *yg = nullptr; + PDNode *zg = nullptr; + if (!only_forward) { + xg = pattern->NewNode("xg")->AsOutput(); + yg = pattern->NewNode("yg")->AsIntermediate(); + zg = pattern->NewNode("zg")->AsInput(); + } + + PDNode *act_g = nullptr; + PDNode *layer_g = nullptr; + auto *act = pattern->NewNode("act")->assert_is_op(act_type); + auto *layer = pattern->NewNode("layer")->assert_is_op(layer_type); + if (!only_forward) { + act_g = pattern->NewNode("act_g")->assert_is_op(act_type + "_grad"); + layer_g = pattern->NewNode("layer_g")->assert_is_op(layer_type + "_grad"); + } + + act->LinksFrom({x}).LinksTo({y}); + layer->LinksFrom({y}).LinksTo({z}); + if (!only_forward) { + layer_g->LinksFrom({y, zg}).LinksTo({yg}); + act_g->LinksFrom({y, yg}).LinksTo({xg}); + } + + int count = 0; + std::unordered_set need_removed_nodes; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle FuseReluDepthwiseConv fuse"; + // 1. turn on fuse option + auto *layer_op = subgraph.at(layer)->Op(); + layer_op->SetAttr("use_cudnn", false); + layer_op->SetAttr("fuse_relu_before_depthwise_conv", true); + + OpDesc *layer_g_op = nullptr; + if (!only_forward) { + layer_g_op = subgraph.at(layer_g)->Op(); + layer_g_op->SetAttr("use_cudnn", false); + layer_g_op->SetAttr("fuse_relu_before_depthwise_conv", true); + } + // 2. connect x to layer and layer_g, layer_g to xg + auto *y_var = subgraph.at(y)->Var(); + auto *x_var = subgraph.at(x)->Var(); + VarDesc *yg_var = nullptr; + VarDesc *xg_var = nullptr; + if (!only_forward) { + yg_var = subgraph.at(yg)->Var(); + xg_var = subgraph.at(xg)->Var(); + } + + PADDLE_ENFORCE_EQ(layer_op->Input("Input").size(), 1UL); + PADDLE_ENFORCE_EQ(layer_op->Input("Input")[0], y_var->Name()); + layer_op->SetInput("Input", {x_var->Name()}); + subgraph.at(layer)->inputs.push_back(subgraph.at(x)); + subgraph.at(x)->outputs.push_back(subgraph.at(layer)); + VLOG(4) << "replace " << y_var->Name() << " -> " << x_var->Name(); + + if (!only_forward) { + PADDLE_ENFORCE_EQ(layer_g_op->Input("Input").size(), 1UL); + PADDLE_ENFORCE_EQ(layer_g_op->Input("Input")[0], y_var->Name()); + layer_g_op->SetInput("Input", {x_var->Name()}); + subgraph.at(layer_g)->inputs.push_back(subgraph.at(x)); + subgraph.at(x)->outputs.push_back(subgraph.at(layer_g)); + + PADDLE_ENFORCE_EQ(layer_g_op->Output(GradVarName("Input")).size(), 1UL); + PADDLE_ENFORCE_EQ(layer_g_op->Output(GradVarName("Input"))[0], + yg_var->Name()); + layer_g_op->SetOutput(GradVarName("Input"), {xg_var->Name()}); + subgraph.at(layer_g)->outputs.push_back(subgraph.at(xg)); + subgraph.at(xg)->inputs.push_back(subgraph.at(layer_g)); + VLOG(4) << "replace " << yg_var->Name() << " -> " << xg_var->Name(); + } + + // 3. delete y, yg, act, act_g + + if (only_forward) { + need_removed_nodes.insert({subgraph.at(y), subgraph.at(act)}); + } else { + need_removed_nodes.insert({subgraph.at(y), subgraph.at(yg), + subgraph.at(act), subgraph.at(act_g)}); + } + count++; + }; + gpd(graph, handler); + GraphSafeRemoveNodes(graph, need_removed_nodes); + AddStatis(count); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fuse_relu_depthwise_conv_pass, + paddle::framework::ir::FuseReluDepthwiseConvPass); diff --git a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h new file mode 100644 index 00000000..d37c153d --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h @@ -0,0 +1,41 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Fuse the relu and depthwise conv + */ +class FuseReluDepthwiseConvPass : public FusePassBase { + public: + virtual ~FuseReluDepthwiseConvPass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; + ir::Graph* FuseReluDepthwiseConv(ir::Graph* graph, bool only_forward) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc new file mode 100644 index 00000000..8ba0e8b8 --- /dev/null +++ b/paddle/fluid/framework/ir/graph.cc @@ -0,0 +1,209 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/var_desc.h" + +namespace paddle { +namespace framework { +namespace ir { + +Graph::Graph(const ProgramDesc &program) : program_(program) { + auto var_nodes = InitFromProgram(program_); + ResolveHazard(var_nodes); +} + +std::map> Graph::InitFromProgram( + const ProgramDesc &program) { + VLOG(3) << "block in program:" << program_.Size(); + std::unordered_map all_vars; + // var nodes for each var name, will have multiple versions in SSA + std::map> var_nodes; + for (auto *var : program.Block(0).AllVars()) { + all_vars.emplace(var->Name(), var); + } + + for (auto *op : program.Block(0).AllOps()) { + ir::Node *node = CreateOpNode(op); + // For input args, reuse the same var name if it was created before. + // Otherwise, create a new one. + for (auto &each_var_name : op->InputArgumentNames()) { + ir::Node *var = nullptr; + if (var_nodes.find(each_var_name) != var_nodes.end()) { + var = var_nodes.at(each_var_name).back(); + } else if (all_vars.count(each_var_name) != 0) { + var = CreateVarNode(all_vars.at(each_var_name)); + var_nodes[each_var_name].push_back(var); + } else { + // Operation input var can be optional (dispensable). Which means + // the operation doesn't really need the var at runtime. In this + // case, the no-existed var is ready at the beginning. + var = CreateEmptyNode(each_var_name, ir::Node::Type::kVariable); + var_nodes[each_var_name].push_back(var); + } + node->inputs.push_back(var); + var->outputs.push_back(node); + } + // For output args, always create a new var. + std::unordered_set out_arg_set; + for (auto &each_var_name : op->OutputArgumentNames()) { + if (each_var_name != kEmptyVarName) { + PADDLE_ENFORCE(out_arg_set.count(each_var_name) == 0, + "Program is wrong. %s occurs in output of %s several " + "times.", + each_var_name, op->Type()); + out_arg_set.insert(each_var_name); + } + + ir::Node *var = nullptr; + if (all_vars.count(each_var_name) != 0) { + var = CreateVarNode(all_vars.at(each_var_name)); + } else { + // Operation output vars can be @EMPTY@. For example, while_grad + // can have multi @EMPTY@ outputs with no VarDesc. + // TODO(panyx0718): Add a test. + var = CreateEmptyNode(each_var_name, ir::Node::Type::kVariable); + } + var_nodes[each_var_name].push_back(var); + node->outputs.push_back(var); + var->inputs.push_back(node); + } + } + Set>( + details::kStaleProgramOpDescs, + new std::vector(program.Block(0).AllOps())); + return var_nodes; +} + +void Graph::ResolveHazard( + const std::map> &var_nodes) { + /** + * We should handle write after read(WAR) and write after write(WAW) here. + * Because some of the operators of the program can be executed parallelly. + * So, to make the program running in the right order, we should add the + * dependence of WAR and WAW. + * + * + * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR) + */ + + for (auto &var : var_nodes) { + auto &versions = var.second; + if (versions.size() <= 1) continue; + + auto it_new = versions.rbegin(); + auto it_old = versions.rbegin(); + ++it_old; + for (; it_old != versions.rend(); it_new = it_old, ++it_old) { + VLOG(3) << "deal with var: " << (*it_new)->Name(); + ir::Node *write_op = + (*it_new)->inputs.empty() ? nullptr : (*it_new)->inputs[0]; + const auto &read_ops = (*it_old)->outputs; + + PADDLE_ENFORCE( + write_op, + string::Sprintf("The write_op of var %s should not be empty.", + (*it_new)->Name())); + + // Add write after write dependence + ir::Node *upstream_op = + (*it_old)->inputs.empty() ? nullptr : (*it_old)->inputs[0]; + // TODO(zcd): Add a test. + if (upstream_op && upstream_op != write_op) { + ir::Node *dep_var = CreateControlDepVar(); + write_op->inputs.push_back(dep_var); + upstream_op->outputs.push_back(dep_var); + VLOG(10) << "add dep_var:" << dep_var->Name(); + dep_var->outputs.push_back(write_op); + dep_var->inputs.push_back(upstream_op); + } + + for (auto *read_op : read_ops) { + // Manually add a dependency var from read_op to write_op; + if (read_op == write_op) { + // Read Write is the same op. + continue; + } + // 2 ops might have been connected via other vars. + bool has_dep = false; + for (ir::Node *r_out : read_op->outputs) { + for (ir::Node *w_in : write_op->inputs) { + if (r_out == w_in) { + has_dep = true; + break; + } + } + } + if (has_dep) continue; + + ir::Node *dep_var = CreateControlDepVar(); + VLOG(10) << "add dep_var:" << dep_var->Name(); + read_op->outputs.push_back(dep_var); + dep_var->inputs.push_back(read_op); + write_op->inputs.push_back(dep_var); + dep_var->outputs.push_back(write_op); + } + } + } +} + +std::shared_ptr Graph::Clone() { + auto cloned_graph = std::make_shared(this->program_); + cloned_graph->ReleaseNodes(); + cloned_graph->num_node_created_ = 0; + std::unordered_map origin_to_cloned; + for (auto *n : this->node_set_) { + ir::Node *cloned_node = nullptr; + if (n->IsCtrlVar()) { + cloned_node = cloned_graph->CreateControlDepVar(); + } else if (!n->var_desc_ && !n->op_desc_) { // empty node + cloned_node = cloned_graph->CreateEmptyNode(n->Name(), n->NodeType()); + } else if (n->IsVar()) { + cloned_node = cloned_graph->CreateVarNode(n->Var()); + } else if (n->IsOp()) { + cloned_node = cloned_graph->CreateOpNode(n->Op()); + } + if (cloned_node) { + origin_to_cloned[n] = cloned_node; + } else { + PADDLE_THROW("The cloned node's type is not supported!"); + } + } + for (auto *n : this->node_set_) { + for (auto it = n->inputs.begin(); it != n->inputs.end(); it++) { + origin_to_cloned[n]->inputs.push_back(origin_to_cloned[*it]); + } + for (auto it = n->outputs.begin(); it != n->outputs.end(); it++) { + origin_to_cloned[n]->outputs.push_back(origin_to_cloned[*it]); + } + } + return cloned_graph; +} + +bool IsControlDepVar(const ir::Node &var) { + return var.Name().find(ir::Node::kControlDepVarName) != std::string::npos; +} +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h new file mode 100644 index 00000000..fff015d4 --- /dev/null +++ b/paddle/fluid/framework/ir/graph.h @@ -0,0 +1,241 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/variant.h" + +namespace paddle { +namespace framework { + +namespace details { + +// This attr is not recommended, because the graph should not dependence +// the program once it is built. +constexpr char kStaleProgramOpDescs[] = "stale_program_op_descs"; +} // namespace details + +namespace ir { + +/* + * The graph is a Directed Acyclic Single Static Assignment Graph. + * + * In more detail, the following properties must hold: + * + * The graph shouldn't contain cycle. Each node is a black-box to the graph + * so the node itself could be a loop operator. + * + * Each Variable-type node has only one input (thus single static assignment). + * + * The output/input of operator is variable and the output/input of variable + * is operator. + * + * The following data harzards in Program are addressed in the Graph: + * + * Write-After-Read + * a = op1(x) + * x = op2(b) + * A control-dependency connection is created bettwen op1 and op2 such that + * op1->op2, so as to ensure correct order. + * + * Write-After-Write + * x = op1(a) + * x = op2(b) + * A control-dependency connection is created between op1 and op2 such that + * op1->op2, so as to ensure correct order. + * + * Other properties currently hold, but is not enforced yet: + * + * Variable-type node (not control dep) with the same variable name share + * the same underlying VarDesc. + */ +class Graph { + public: + explicit Graph(const ProgramDesc &program); + + virtual ~Graph() { + for (auto &attr : attrs_) { + attr_dels_[attr.first](); + } + attrs_.clear(); + attr_dels_.clear(); + } + + bool Has(const std::string &attr_name) const { + return attrs_.count(attr_name) > 0; + } + + template + AttrType &Get(const std::string &attr_name) const { + PADDLE_ENFORCE(Has(attr_name), "%s attr not registered for graph.", + attr_name); + try { + return *boost::any_cast(attrs_.at(attr_name)); + } catch (boost::bad_any_cast &) { + PADDLE_THROW( + "Invalid attribute type of %s error, expected: %s, actual: %s", + attr_name, typeid(AttrType *).name(), + attrs_.at(attr_name).type().name()); + } + } + + template + void Set(const std::string &attr_name, AttrType *attr) { + PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the graph", + attr_name); + attrs_[attr_name] = attr; + attr_dels_[attr_name] = [attr, attr_name]() { + VLOG(3) << "deleting " << attr_name; + delete attr; + }; + } + + template + void SetNotOwned(const std::string &attr_name, AttrType *attr) { + PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the graph", + attr_name); + attrs_[attr_name] = attr; + attr_dels_[attr_name] = []() {}; + } + + void Erase(const std::string &attr_name) { + PADDLE_ENFORCE(attrs_.count(attr_name) != 0, "%s not set in the graph", + attr_name); + attr_dels_[attr_name](); + attrs_.erase(attr_name); + attr_dels_.erase(attr_name); + } + + const std::unordered_set &Nodes() const { return node_set_; } + + // Create a normal variable with non-null VarDesc. + ir::Node *CreateVarNode(VarDesc *var_desc) { + PADDLE_ENFORCE(var_desc); + auto *x = AddNode(new ir::Node(var_desc)); + x->SetId(num_node_created_++); + return x; + } + + // Create a normal runnable operator with OpDesc. + ir::Node *CreateOpNode(OpDesc *op_desc) { + PADDLE_ENFORCE(op_desc); + auto *x = AddNode(new ir::Node(op_desc)); + x->SetId(num_node_created_++); + return x; + } + + // Create a control dependency var that connects 2 operations. The + // var doesn't hold any data. Other than that, it's no different from + // other var, considering dependency analysis. + ir::Node *CreateControlDepVar() { + // TODO(panyx0718): control var name should be really unique. + const std::string name = string::Sprintf( + "%s@%llu", static_cast(ir::Node::kControlDepVarName), + num_node_created_); + auto *x = AddNode(new ir::Node(name, ir::Node::Type::kVariable)); + x->SetId(num_node_created_++); + return x; + } + + // A more free style way of creating a graph node. Mostly use for test + // or "copy" from another node. Avoid using it if possible. + ir::Node *CreateEmptyNode(const std::string &name, ir::Node::Type type) { + auto *x = AddNode(new ir::Node(name, type)); + x->SetId(num_node_created_++); + return x; + } + + // Clear all node information of the graph and return the ownership of the + // nodes. + std::vector> ReleaseNodes() { + std::vector> ret; + for (auto &n : nodes_) { + ret.emplace_back(n.second.release()); + } + nodes_.clear(); + node_set_.clear(); + return ret; + } + + std::unique_ptr RemoveNode(ir::Node *node) { + PADDLE_ENFORCE(node_set_.find(node) != node_set_.end()); + std::unique_ptr ret; + ret.reset(nodes_.at(node).release()); + nodes_.erase(node); + node_set_.erase(node); + return ret; + } + + // NOTE low performance, but simple and secure. + Node *RetrieveNode(int id) { + for (auto &node : nodes_) { + if (node.second->id() == id) { + return node.second.get(); + } + } + return nullptr; + } + + // Returns reference to the original program. + // WARN: After a series of passes, the current graph can be quite + // different from OriginProgram. Caller shouldn't assume much from + // the returned OriginProgram. + const ProgramDesc &OriginProgram() const { + LOG(WARNING) << "WARN: After a series of passes, the current graph can be " + "quite different from OriginProgram. So, please avoid " + "using the `OriginProgram()` method!"; + return program_; + } + + // This method takes ownership of `node`. + ir::Node *AddNode(ir::Node *node) { + PADDLE_ENFORCE(node_set_.find(node) == node_set_.end()); + nodes_[node].reset(node); + node_set_.insert(node); + return node; + } + + void ResolveHazard( + const std::map> &var_nodes); + + // Create a new and duplicated graph. + // WARN: The method only clones the graph structure, not its attributes. + std::shared_ptr Clone(); + + private: + std::map> InitFromProgram( + const ProgramDesc &program); + + // NOTE: program_ shouldn't be exposed to user. + const ProgramDesc program_; + std::map attrs_; + std::map> attr_dels_; + std::map> nodes_; + std::unordered_set node_set_; + size_t num_node_created_{0}; // help to generate a unique node id. +}; + +bool IsControlDepVar(const ir::Node &var); +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc new file mode 100644 index 00000000..b397216f --- /dev/null +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -0,0 +1,395 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/graph_helper.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/ir/graph_traits.h" + +DEFINE_string(print_sub_graph_dir, "", + "FLAGS_print_sub_graph_dir is used " + "to print the nodes of sub_graphs."); + +namespace paddle { +namespace framework { +namespace ir { +namespace { +void SortHelper(const std::map, + ir::NodeComp> &adj_list, + ir::Node *node, std::unordered_set *visited, + std::vector *ret) { + visited->insert(node); + + for (auto adj : adj_list.at(node)) { + if (visited->find(adj) == visited->end()) { + SortHelper(adj_list, adj, visited, ret); + } + } + + VLOG(5) << "topology sort insert: " << node->Name() << " " + << reinterpret_cast(node) << " input " << node->inputs.size(); + ret->push_back(node); +} + +bool HasCircleHelper( + ir::Node *node, + const std::map, ir::NodeComp> + &adj_list, + std::unordered_set *visited, + std::unordered_set *in_trace, + std::vector> *circles) { + if (visited->find(node) == visited->end()) { + visited->insert(node); + in_trace->insert(node); + + for (ir::Node *in : adj_list.at(node)) { + if (visited->find(in) == visited->end() && + HasCircleHelper(in, adj_list, visited, in_trace, circles)) { + return true; + } else if (in_trace->find(in) != in_trace->end()) { + if (circles != nullptr) { + std::vector circle; + circle.emplace_back(in); + ir::Node *p = in; + for (auto &adj : adj_list.at(p)) { + if (in_trace->count(adj)) { + circle.emplace_back(adj); + p = adj; + } + } + circles->emplace_back(circle); + } + return true; + } + } + } + in_trace->erase(node); + return false; +} + +bool HasCircleInternal( + const std::map, ir::NodeComp> + &adj_list, + std::vector> *circles) { + std::unordered_set visited; + std::unordered_set in_trace; + for (auto &adj : adj_list) { + if (HasCircleHelper(adj.first, adj_list, &visited, &in_trace, circles)) { + return true; + } + } + return false; +} +} // namespace + +bool HasCircle(const Graph &graph) { + return HasCircleInternal(BuildOperationAdjList(graph), nullptr); +} + +bool VarDescIsConsistency(const Graph &graph) { + std::unordered_map> + var_name2node_set; + for (ir::Node *node : graph.Nodes()) { + if (node->IsVar() && node->Var()) { + var_name2node_set[node->Var()->Name()].emplace(node); + } + } + for (auto &iter : var_name2node_set) { + auto &first_node = *iter.second.begin(); + bool is_persistable = std::any_of(iter.second.begin(), iter.second.end(), + [&first_node](const ir::Node *node) { + return node->Var()->Persistable(); + }); + if (is_persistable) { + bool is_consistency = + std::all_of(iter.second.begin(), iter.second.end(), + [&first_node](const ir::Node *node) { + return *node->Var() == *first_node->Var(); + }); + if (!is_consistency) return false; + } + } + return true; +} +bool FindCircleSubGraph(const Graph &graph, + std::vector> *circles) { + return HasCircleInternal(BuildOperationAdjList(graph), circles); +} + +std::vector TopologySortOperations(const Graph &graph) { + std::map, ir::NodeComp> + adj_list = BuildOperationAdjList(graph); + PADDLE_ENFORCE(!HasCircleInternal(adj_list, nullptr)); + std::unordered_set visited; + std::vector ret; + for (auto adj : adj_list) { + if (visited.find(adj.first) == visited.end()) { + SortHelper(adj_list, adj.first, &visited, &ret); + } + } + + return ret; +} + +// Build operator inlink edge table. +std::map, ir::NodeComp> +BuildOperationAdjList(const Graph &graph) { + std::map, ir::NodeComp> + adj_list; + + for (auto &n : graph.Nodes()) { + if (!n->IsOp()) continue; + if (adj_list.find(n) == adj_list.end()) { + adj_list[n] = std::set(); + } + for (auto &var : n->inputs) { + for (auto &adj_n : var->inputs) { + PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation); + VLOG(4) << "adj " << adj_n->Name() << reinterpret_cast(adj_n) + << " -> " << n->Name() << reinterpret_cast(n) + << " via " << var->Name() << reinterpret_cast(var); + adj_list[n].insert(adj_n); + } + } + } + return adj_list; +} + +// Build operator outlink edge table. +std::map> BuildOperationOutAdjList( + const Graph &graph) { + std::map> adj_list; + + for (auto &n : graph.Nodes()) { + if (!n->IsOp()) continue; + if (adj_list.find(n) == adj_list.end()) { + adj_list[n] = std::unordered_set(); + } + for (auto &var : n->outputs) { + for (auto &adj_n : var->outputs) { + PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation); + VLOG(40) << "adj " << adj_n->Name() << reinterpret_cast(adj_n) + << " -> " << n->Name() << reinterpret_cast(n) + << " via " << var->Name() << reinterpret_cast(var); + adj_list[n].insert(adj_n); + } + } + } + return adj_list; +} + +std::vector OpDFSSort(const Graph &graph) { + auto edge_table = BuildOperationOutAdjList(graph); + std::stack stack; + for (auto &ele : edge_table) { + if (ele.first->inputs.empty()) { + // find the input ops (those without input vars) + stack.push(ele.first); + } else { + // find the ops with only persistable vars as inputs. + bool all_persistable = true; + for (auto *input : ele.first->inputs) { + if (!(input->IsVar() && input->Var() && input->Var()->Persistable())) { + all_persistable = false; + } + } + if (all_persistable) { + stack.push(ele.first); + } + } + } + + std::vector res; + // start from the feed op and DFS + std::unordered_set unique_set; + while (!stack.empty()) { + // will start from the last feed by default. + auto cur = stack.top(); + stack.pop(); + unique_set.insert(cur); + res.push_back(cur); + + for (auto *op : edge_table[cur]) { + if (!unique_set.count(op)) { + stack.push(op); + } + } + } + return res; +} + +std::vector TopologyDfsSortOperations(const Graph &graph) { + std::vector nodes; + std::unordered_map in_degree; + + auto set_out_ops_ready = [&](Node *var) { + for (auto *op : var->outputs) { + --in_degree[op]; + } + }; + // build in_degree + for (auto *node : graph.Nodes()) { + if (node->IsOp()) { + in_degree[node] += node->inputs.size(); + } else if (node->IsVar() && node->inputs.empty()) { + // put all the inputs of the whole graph ready. + set_out_ops_ready(node); + } + } + + std::deque op_queue; + // first visit + for (auto &node : OpDFSSort(graph)) { + if (node->IsOp()) { + op_queue.push_back(node); + } + } + + // traverse the graph + int num_ops = op_queue.size(); + while (num_ops) { + for (auto it = op_queue.begin(); it != op_queue.end(); it++) { + auto *&cur_op = *it; + if (!cur_op || in_degree[cur_op] > 0) continue; + // visit this node + // put all the output var of this op valid. + for (auto *out_var : cur_op->outputs) { + if (!out_var) continue; + set_out_ops_ready(out_var); + } + VLOG(8) << "visit " << cur_op->Name(); + nodes.push_back(cur_op); + + cur_op = nullptr; + num_ops--; + } + } + + return nodes; +} + +size_t GraphNum(const Graph &graph) { + std::unordered_set nodes(graph.Nodes()); + std::unordered_set visited_nodes; + visited_nodes.reserve(nodes.size()); + std::deque q_nodes; + std::vector> graph_nodes; + std::unordered_set g_nodes; + // q_set used to record records in the queue. + std::unordered_set q_set; + size_t graph_count = 0; + + auto traverse_nodes = [&visited_nodes, &q_nodes, + &q_set](const std::vector &nodes) { + for (auto n : nodes) { + if (visited_nodes.count(n) == 0 && q_set.count(n) == 0) { + q_nodes.push_back(n); + q_set.insert(n); + } + } + }; + + while (visited_nodes.size() != nodes.size()) { + if (!q_nodes.empty()) { + auto cur_node = q_nodes.front(); + q_nodes.pop_front(); + q_set.erase(cur_node); + visited_nodes.insert(cur_node); + g_nodes.insert(cur_node); + traverse_nodes(cur_node->inputs); + traverse_nodes(cur_node->outputs); + } else { + ++graph_count; + if (g_nodes.size()) { + graph_nodes.emplace_back(g_nodes); + } + g_nodes.clear(); + for (auto &n : nodes) { + if (visited_nodes.count(n) == 0) { + q_nodes.push_back(n); + q_set.insert(n); + break; + } + } + } + } + + if (g_nodes.size()) { + graph_nodes.emplace_back(g_nodes); + } + + if (FLAGS_print_sub_graph_dir.size()) { + if (graph_nodes.size() > 1) { + std::stringstream out; + for (auto &g_n : graph_nodes) { + out << "graph_nodes: " << g_n.size() << "\n"; + } + out << "\n\n"; + for (auto &g_n : graph_nodes) { + out << "graph_nodes: " << g_n.size(); + for (auto &node : g_n) { + out << "\nNode: " << node->Name() << " in ["; + for (auto &n : node->inputs) { + out << n->Name() << ", "; + } + out << "], out["; + for (auto &n : node->outputs) { + out << n->Name() << ", "; + } + out << "]"; + } + out << "\n\n\n"; + } + std::unique_ptr fout( + new std::ofstream(FLAGS_print_sub_graph_dir)); + PADDLE_ENFORCE(fout->good()); + *fout << out.str(); + } + } + + return graph_count; +} + +void CleanIndividualNodes(Graph *graph) { + std::unordered_set nodes2rm; + for (auto *node : graph->Nodes()) { + if (node->inputs.empty() && node->outputs.empty()) { + nodes2rm.insert(node); + } + } + + for (auto *node : nodes2rm) { + graph->RemoveNode(node); + } +} + +std::vector TopologyVarientSort(const Graph &graph, + SortKind sort_kind) { + switch (sort_kind) { + case SortKind::TS: + return framework::ir::TopologySortOperations(graph); + default: + return framework::ir::TopologyDfsSortOperations(graph); + } +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h new file mode 100644 index 00000000..074ad320 --- /dev/null +++ b/paddle/fluid/framework/ir/graph_helper.h @@ -0,0 +1,90 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/node.h" + +namespace paddle { +namespace framework { +namespace ir { + +// Compare nodes via node id. +struct NodeComp { + bool operator()(ir::Node *const &node1, ir::Node *const &node2) const { + return node1->id() < node2->id(); + } +}; + +// Test if the graph contains circle. +bool HasCircle(const Graph &graph); + +// Check if the var desc of node is consistency. +// The graph may have the same name node, for example, parameter +// is the input of operator and it also is the output of optimizer. +// For the persistable variable, the var_desc of the nodes with +// the same node name should be equal. +bool VarDescIsConsistency(const Graph &graph); + +// Find All Circles for debugging, +// store all subgraph in circles. +bool FindCircleSubGraph(const Graph &graph, + std::vector> *circles); + +size_t GraphNum(const Graph &graph); + +// Topology Sort the operations in the graph from inputs to outputs. +// `graph` cannot contain circle. +std::vector TopologySortOperations(const Graph &graph); + +// Topological sort, but try to DFS. +std::vector TopologyDfsSortOperations(const Graph &graph); + +// Different kinds to sort the operators in a graph to a sequence. +enum class SortKind { + // Topological Search + TS = 0, + // Topological and Depth First Search + TDFS +}; + +// Several kinds of topological sort. +std::vector TopologyVarientSort(const Graph &graph, SortKind sort_kind); + +// Clean the nodes that doesn't connect to others. +void CleanIndividualNodes(Graph *graph); + +// Build an adjacency list of operations for the `graph`. +std::map, ir::NodeComp> +BuildOperationAdjList(const Graph &graph); + +template +std::vector FilterByNodeWrapper(const Graph &graph) { + std::vector ret; + for (ir::Node *n : graph.Nodes()) { + if (n->IsWrappedBy()) ret.push_back(&n->Wrapper()); + } + return ret; +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_helper_test.cc b/paddle/fluid/framework/ir/graph_helper_test.cc new file mode 100644 index 00000000..d8973d5a --- /dev/null +++ b/paddle/fluid/framework/ir/graph_helper_test.cc @@ -0,0 +1,227 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/graph.h" +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { +namespace ir { + +void BuildCircleGraph(Graph* g) { + ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation); + ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable); + + o1->outputs.push_back(v1); + o1->inputs.push_back(v1); + v1->inputs.push_back(o1); + v1->outputs.push_back(o1); +} + +void BuildCircleGraph2(Graph* g) { + ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation); + ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation); + ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable); + ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable); + + o1->outputs.push_back(v1); + o2->inputs.push_back(v1); + v1->inputs.push_back(o1); + v1->outputs.push_back(o2); + + o2->outputs.push_back(v2); + o1->inputs.push_back(v2); + v2->inputs.push_back(o2); + v2->outputs.push_back(o1); +} + +void BuildNoCircleGraph(Graph* g) { + ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation); + ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation); + ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation); + ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation); + ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation); + ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable); + ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable); + ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable); + ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable); + + // o1->v1->o2 + o1->outputs.push_back(v1); + o2->inputs.push_back(v1); + v1->inputs.push_back(o1); + v1->outputs.push_back(o2); + // o2->v2->o3 + // o2->v2->o4 + o2->outputs.push_back(v2); + o3->inputs.push_back(v2); + o4->inputs.push_back(v2); + v2->inputs.push_back(o2); + v2->outputs.push_back(o3); + v2->outputs.push_back(o4); + // o2->v3->o5 + o2->outputs.push_back(v3); + o5->inputs.push_back(v3); + v3->inputs.push_back(o2); + v3->outputs.push_back(o5); + // o3-v4->o5 + o3->outputs.push_back(v4); + o5->inputs.push_back(v4); + v4->inputs.push_back(o3); + v4->outputs.push_back(o5); +} + +TEST(GraphHelperTest, Basic) { + ProgramDesc prog; + + Graph g(prog); + BuildCircleGraph(&g); + ASSERT_TRUE(HasCircle(g)); + + Graph g2(prog); + BuildCircleGraph2(&g2); + ASSERT_TRUE(HasCircle(g2)); + + auto adj_list = BuildOperationAdjList(g2); + for (auto& adj : adj_list) { + auto& adj_set = adj.second; + if (adj.first->Name() == "op1") { + ASSERT_EQ((*adj_set.begin())->Name(), "op2"); + } else if (adj.first->Name() == "op2") { + ASSERT_EQ((*adj_set.begin())->Name(), "op1"); + } else { + ASSERT_TRUE(false); + } + } + + Graph g3(prog); + BuildNoCircleGraph(&g3); + ASSERT_FALSE(HasCircle(g3)); + auto sorted = TopologySortOperations(g3); + std::map node_map; + for (size_t i = 0; i < sorted.size(); ++i) { + node_map[sorted[i]->Name()] = i; + } + ASSERT_EQ(node_map.at("op1"), 0UL); + ASSERT_EQ(node_map.at("op2"), 1UL); + ASSERT_TRUE(node_map.at("op3") < node_map.at("op5")); +} + +void BuildZeroGraph(Graph* g) {} + +void BuildOneGraph(Graph* g) { + ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation); + ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation); + ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation); + ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation); + ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation); + ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable); + ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable); + ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable); + ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable); + + // o1->v1->o2 + o1->outputs.push_back(v1); + o2->inputs.push_back(v1); + v1->inputs.push_back(o1); + v1->outputs.push_back(o2); + // o2->v2->o3 + // o2->v2->o4 + o2->outputs.push_back(v2); + o3->inputs.push_back(v2); + o4->inputs.push_back(v2); + v2->inputs.push_back(o2); + v2->outputs.push_back(o3); + v2->outputs.push_back(o4); + // o2->v3->o5 + o2->outputs.push_back(v3); + o5->inputs.push_back(v3); + v3->inputs.push_back(o2); + v3->outputs.push_back(o5); + // o3-v4->o5 + o3->outputs.push_back(v4); + o5->inputs.push_back(v4); + v4->inputs.push_back(o3); + v4->outputs.push_back(o5); +} + +void BuildTwoGraphs(Graph* g) { + ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation); + ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation); + ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation); + ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation); + ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation); + ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable); + ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable); + ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable); + ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable); + + // o1->v1->o2 + o1->outputs.push_back(v1); + o2->inputs.push_back(v1); + v1->inputs.push_back(o1); + v1->outputs.push_back(o2); + // o2->v2->o3 + // o2->v2->o4 + o2->outputs.push_back(v2); + o3->inputs.push_back(v2); + o4->inputs.push_back(v2); + v2->inputs.push_back(o2); + v2->outputs.push_back(o3); + v2->outputs.push_back(o4); + // o2->v3->o5 + // o2->outputs.push_back(v3); + o5->inputs.push_back(v3); + // v3->inputs.push_back(o2); + v3->outputs.push_back(o5); + // o3-v4->o5 + o3->outputs.push_back(v4); + // o5->inputs.push_back(v4); + v4->inputs.push_back(o3); + // v4->outputs.push_back(o5); +} + +TEST(GraphHelperTest, Circles) { + ProgramDesc prog; + + Graph g(prog); + BuildCircleGraph(&g); + + std::vector> circles; + ASSERT_TRUE(FindCircleSubGraph(g, &circles)); + ASSERT_EQ(circles.size(), 1UL); +} + +TEST(GraphHelperTest, GraphNum) { + ProgramDesc prog; + + Graph g(prog); + BuildZeroGraph(&g); + ASSERT_EQ(GraphNum(g), 0UL); + + Graph g2(prog); + BuildOneGraph(&g2); + ASSERT_EQ(GraphNum(g2), 1UL); + + Graph g3(prog); + BuildTwoGraphs(&g3); + ASSERT_EQ(GraphNum(g3), 2UL); +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc new file mode 100644 index 00000000..969166a3 --- /dev/null +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -0,0 +1,1948 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/graph_traits.h" +#include "paddle/fluid/framework/ir/graph_viz_pass.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/string/pretty_log.h" +#include "paddle/fluid/string/printf.h" + +namespace paddle { +namespace framework { +namespace ir { + +using string::PrettyLogEndl; +using string::PrettyLog; +using string::Style; + +size_t PDPattern::id_ = 0UL; + +PDNode *PDPattern::NewNode(const std::string &name) { + if (!name.empty()) { + PADDLE_ENFORCE_EQ(node_map_.count(name), 0UL, + "PDNode's name should be unique, get duplicate [%s]", + name); + } + + nodes_.emplace_back(new PDNode(this, name)); + auto *cur = nodes_.back().get(); + node_map_[name] = cur; + return cur; +} + +PDNode *PDPattern::NewNode(PDNode::teller_t &&teller, const std::string &name) { + if (!name.empty()) { + PADDLE_ENFORCE_EQ(node_map_.count(name), 0UL, + "PDNode's name should be unique, get duplicate [%s]", + name); + } + + nodes_.emplace_back(new PDNode(std::move(teller), this, name)); + auto *cur = nodes_.back().get(); + node_map_[name] = cur; + return cur; +} + +PDNode *PDPattern::RetrieveNode(const std::string &id) const { + auto it = node_map_.find(id); + if (it == node_map_.end()) { + return nullptr; + } + + return it->second; +} + +void PDPattern::AddEdge(PDNode *a, PDNode *b) { + PADDLE_ENFORCE(a); + PADDLE_ENFORCE(b); + PADDLE_ENFORCE(a != b, "can't connect to the same nodes."); + edges_.emplace_back(a, b); +} + +void GraphPatternDetector::operator()(Graph *graph, + GraphPatternDetector::handle_t handler) { + if (!MarkPDNodesInGraph(*graph)) { + return; + } + + auto subgraphs = DetectPatterns(); + UniquePatterns(&subgraphs); + RemoveOverlappedMatch(&subgraphs); + ValidateByNodeRole(&subgraphs); + + if (subgraphs.empty()) return; + PrettyLogEndl(Style::detail(), "--- detected %d subgraphs", + subgraphs.size()); + int id = 0; + for (auto &g : subgraphs) { + VLOG(3) << "optimizing #" << id++ << " subgraph"; + handler(g, graph); + } +} + +bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) { + VLOG(3) << "mark pdnodes in graph"; + if (graph.Nodes().empty()) return false; + + for (auto &node : GraphTraits::DFS(graph)) { + for (const auto &pdnode : pattern_.nodes()) { + if (pdnode->Tell(&node)) { + VLOG(4) << "Node " << node.Name() << " marked as " << pdnode->name(); + pdnodes2nodes_[pdnode.get()].insert(&node); + } + } + } + // Check to early stop if some PDNode can't find matched Node. + for (auto &pdnode : pattern_.nodes()) { + if (!pdnodes2nodes_.count(pdnode.get())) { + VLOG(4) << pdnode->name() << " can't find matched Node, early stop"; + // return false; + } + } + VLOG(3) << pdnodes2nodes_.size() << " nodes marked"; + + return !pdnodes2nodes_.empty(); +} + +// The intermediate Nodes can only link to the nodes inside the pattern, or this +// subgraph will be droped. +void GraphPatternDetector::ValidateByNodeRole( + std::vector *subgraphs) { + std::vector result; + + subgraphs->erase( + std::remove_if( + subgraphs->begin(), subgraphs->end(), + [](const GraphPatternDetector::subgraph_t &subgraph) -> bool { + // Collect the inputs and outputs. + std::unordered_set ios; + for (auto &item : subgraph) { + if (!item.first->IsIntermediate()) { + ios.insert(item.second); + } + } + for (auto &item : subgraph) { + if (item.first->IsIntermediate()) { + for (auto *x : item.second->inputs) { + if (!ios.count(x)) { + return true; + } + } + for (auto *x : item.second->outputs) { + if (!ios.count(x)) { + return true; + } + } + } + } + return false; + }), + subgraphs->end()); +} + +struct HitGroup { + std::unordered_map roles; + + bool Match(Node *node, PDNode *pat) { + if (nodes_.count(node)) { + if (roles.count(pat) && roles[pat] == node) return true; + return false; + } else { + if (roles.count(pat) && roles[pat] != node) return false; + return true; + } + } + + void Register(Node *node, PDNode *pat) { + roles[pat] = node; + nodes_.insert(node); + } + + private: + std::unordered_set nodes_; +}; + +// Tell whether Node a links to b. +bool IsNodesLink(Node *a, Node *b) { + for (auto *node : a->outputs) { + if (b == node) { + return true; + } + } + return false; +} + +std::vector +GraphPatternDetector::DetectPatterns() { + // Init empty subgraphs. + std::vector result; + std::vector init_groups; + std::array, 2> bi_records; + auto *first_pnode = pattern_.edges().empty() ? pattern().nodes().front().get() + : pattern_.edges().front().first; + if (!pdnodes2nodes_.count(first_pnode)) return result; + for (auto *node : pdnodes2nodes_[first_pnode]) { + HitGroup group; + group.roles[first_pnode] = node; + init_groups.emplace_back(group); + } + + int step = 0; + bi_records[0] = std::move(init_groups); + + // Extend a PDNode to subgraphs by deducing the connection relations defined + // in edges of PDNodes. + for (const auto &edge : pattern_.edges()) { + VLOG(4) << "check " << edge.first->name() << " -> " << edge.second->name(); + // TODO(Superjomn) Fix bug here, the groups might be duplicate here. + // Each role has two PDNodes, which indicates two roles. + // Detect two Nodes that can match these two roles and they are connected. + auto &pre_groups = bi_records[step % 2]; + auto &cur_groups = bi_records[1 - (step++ % 2)]; + cur_groups.clear(); + if (pre_groups.empty()) break; + // source -> target + for (Node *source : pdnodes2nodes_[edge.first]) { + for (Node *target : pdnodes2nodes_[edge.second]) { + VLOG(8) << "check " << source->id() << " -- " << target->id(); + // TODO(Superjomn) add some prune strategies. + for (const auto &group : pre_groups) { + if (IsNodesLink(source, target)) { + HitGroup new_group = group; + bool flag = new_group.Match(source, edge.first) && + new_group.Match(target, edge.second); + if (flag) { + new_group.Register(source, edge.first); + new_group.Register(target, edge.second); + cur_groups.push_back(new_group); + // TODO(Superjomn) need to unique + } + } + } + } + } + VLOG(3) << "step " << step << " get records: " << cur_groups.size(); + for (auto &group : cur_groups) { + for (auto &item : group.roles) { + VLOG(4) << "node " << item.second->id() << " as " << item.first->name(); + } + VLOG(4) << "========================================================="; + } + } + + for (auto &group : bi_records[step % 2]) { + GraphPatternDetector::subgraph_t subgraph; + for (auto &role : group.roles) { + subgraph.emplace(role.first, role.second); + } + result.emplace_back(subgraph); + } + return result; +} + +struct GraphItemLessThan { + bool operator()(const std::pair &a, + const std::pair &b) { + if (a.first != b.first) { + return a.first < b.first; + } else { + return a.second < b.second; + } + } +}; + +// TODO(Superjomn) enhance the function as it marks unique unique as duplicates +// see https://github.com/PaddlePaddle/Paddle/issues/13550 +void GraphPatternDetector::UniquePatterns( + std::vector *subgraphs) { + if (subgraphs->empty()) return; + std::vector result; + + std::unordered_set set; + std::hash hasher; + for (auto &g : *subgraphs) { + // Sort the items in the sub-graph, and transform to a string key. + std::vector> sorted_keys(g.begin(), g.end()); + std::sort(sorted_keys.begin(), sorted_keys.end(), GraphItemLessThan()); + std::stringstream ss; + for (auto &item : sorted_keys) { + ss << item.first << ":" << item.second; + } + auto key = hasher(ss.str()); + if (!set.count(key)) { + result.emplace_back(g); + set.insert(key); + } + } + *subgraphs = result; +} + +void GraphPatternDetector::RemoveOverlappedMatch( + std::vector *subgraphs) { + std::vector result; + std::unordered_set node_set; + + for (const auto &subgraph : *subgraphs) { + bool valid = true; + for (auto &item : subgraph) { + if (item.first->IsIntermediate() && node_set.count(item.second)) { + valid = false; + break; + } + } + if (valid) { + for (auto &item : subgraph) { + node_set.insert(item.second); + } + result.push_back(subgraph); + } + } + *subgraphs = result; +} + +std::string PDPattern::DotString() const { + using inference::analysis::Dot; + Dot dot; + int id = 0; + // Create Nodes + std::unordered_map node2dot; + for (const auto &node : nodes()) { + std::string node_id = "Node" + std::to_string(id++); + dot.AddNode(node_id, {}, node->name()); + node2dot[node.get()] = node_id; + } + // Create Edges + for (const auto &edge : edges()) { + if (!node2dot.count(edge.first) || !node2dot.count(edge.second)) { + LOG(ERROR) << "no node " << edge.first << " " << edge.second; + continue; + } + auto &src = node2dot.at(edge.first); + auto &trg = node2dot.at(edge.second); + dot.AddEdge(src, trg, {}); + } + return dot.Build(); +} + +PDNode &PDNode::LinksTo(const std::vector &others) { + // extend outlinks. + for (PDNode *x : others) { + pattern_->AddEdge(this, x); + } + return *this; +} + +PDNode &PDNode::LinksFrom(const std::vector &others) { + // extend outlinks. + for (PDNode *x : others) { + pattern_->AddEdge(x, this); + } + return *this; +} + +PDNode *PDNode::assert_is_op() { + asserts_.emplace_back([](Node *x) { return x && x->IsOp(); }); + return this; +} + +PDNode *PDNode::assert_is_op(const std::string &op_type) { + asserts_.emplace_back([op_type](Node *x) { + return x && x->IsOp() && x->Op()->Type() == op_type; + }); + return this; +} + +PDNode *PDNode::assert_is_var() { + asserts_.emplace_back([](Node *x) { return x && x->IsVar(); }); + return this; +} + +PDNode *PDNode::assert_is_not_ctrl_var() { + asserts_.emplace_back([](Node *x) { return x && !x->IsCtrlVar(); }); + return this; +} + +PDNode *PDNode::assert_var_not_persistable() { + assert_is_var(); + asserts_.emplace_back([](Node *x) { return !x->Var()->Persistable(); }); + return this; +} + +PDNode *PDNode::assert_is_persistable_var() { + assert_is_var(); + asserts_.emplace_back([=](Node *x) { return x->Var()->Persistable(); }); + return this; +} + +PDNode *PDNode::assert_is_op_nth_input(const std::string &op_type, + const std::string &argument, int nth) { + assert_is_var(); + assert_is_op_input(op_type); + asserts_.emplace_back([=](Node *x) { + for (auto *op : x->outputs) { + if (op->IsOp() && op->Op()->Type() == op_type && + IsNthInput(x, op, argument, nth)) + return true; + } + return false; + }); + return this; +} + +PDNode *PDNode::assert_is_op_nth_output(const std::string &op_type, + const std::string &argument, int nth) { + assert_is_var(); + asserts_.emplace_back([=](Node *x) { + for (auto *op : x->inputs) { + if (op->IsOp() && op->Op()->Type() == op_type && + IsNthOutput(x, op, argument, nth)) + return true; + } + return false; + }); + return this; +} + +PDNode *PDNode::assert_is_only_input_of_op(const std::string &op_type) { + assert_is_var(); + asserts_.emplace_back([=](Node *x) { + for (auto *op : x->outputs) { + if (op && op->IsOp() && op->Op() && op->Op()->Type() == op_type && + op->inputs.size() == 1) { + return true; + } + } + return false; + }); + return this; +} + +PDNode *PDNode::assert_is_only_output_of_op(const std::string &op_type) { + assert_is_var(); + asserts_.emplace_back([=](Node *x) { + for (auto *op : x->inputs) { + if (op && op->IsOp() && op->Op() && op->Op()->Type() == op_type && + op->outputs.size() == 1) { + return true; + } + } + return false; + }); + return this; +} + +PDNode *PDNode::assert_is_op_output(const std::string &op_type) { + assert_is_var(); + asserts_.emplace_back([=](Node *x) { + for (auto *op : x->inputs) { + if (op && op->IsOp() && op->Op() && op->Op()->Type() == op_type) { + return true; + } + } + return false; + }); + return this; +} + +PDNode *PDNode::assert_is_op_output(const std::string &op_type, + const std::string &argument) { + assert_is_var(); + assert_is_op_nth_output(op_type, argument, 0); + return this; +} +PDNode *PDNode::assert_is_op_input(const std::string &op_type) { + assert_is_var(); + asserts_.emplace_back([=](Node *x) { + for (auto *op : x->outputs) { + if (op && op->IsOp() && op->Op() && op->Op()->Type() == op_type) { + return true; + } + } + return false; + }); + return this; +} + +PDNode *PDNode::assert_is_op_input(const std::string &op_type, + const std::string &argument) { + assert_is_var(); + assert_is_op_nth_input(op_type, argument, 0); + return this; +} + +PDNode *PDNode::assert_op_has_n_inputs(const std::string &op_type, size_t n) { + assert_is_op(op_type); + asserts_.emplace_back([=](Node *x) { return x->inputs.size() == n; }); + return this; +} + +PDNode *PDNode::assert_op_has_n_outputs(const std::string &op_type, size_t n) { + assert_is_op(op_type); + asserts_.emplace_back([=](Node *x) { return x->outputs.size() == n; }); + return this; +} + +PDNode *PDNode::assert_has_n_inputs(size_t n) { + asserts_.emplace_back([=](Node *x) { return x->inputs.size() == n; }); + return this; +} + +PDNode *PDNode::assert_has_n_outputs(size_t n) { + asserts_.emplace_back([=](Node *x) { return x->outputs.size() == n; }); + return this; +} + +PDNode *PDNode::assert_more(PDNode::teller_t &&teller) { + asserts_.emplace_back(std::move(teller)); + return this; +} + +PDNode *PDNode::assert_is_ops(const std::unordered_set &op_types) { + asserts_.emplace_back([op_types](Node *x) { + return x && x->IsOp() && op_types.count(x->Op()->Type()); + }); + return this; +} + +PDNode *PDNode::assert_is_ops_nth_input( + const std::unordered_set &op_types, + const std::string &argument, int nth) { + assert_is_var(); + assert_is_ops_input(op_types); + asserts_.emplace_back([=](Node *x) { + for (auto *op : x->outputs) { + if (op->IsOp() && op_types.count(op->Op()->Type()) && + IsNthInput(x, op, argument, nth)) + return true; + } + return false; + }); + return this; +} + +PDNode *PDNode::assert_is_ops_nth_output( + const std::unordered_set &op_types, + const std::string &argument, int nth) { + assert_is_var(); + asserts_.emplace_back([=](Node *x) { + for (auto *op : x->inputs) { + if (op->IsOp() && op_types.count(op->Op()->Type()) && + IsNthOutput(x, op, argument, nth)) + return true; + } + return false; + }); + return this; +} +PDNode *PDNode::assert_is_ops_output( + const std::unordered_set &op_types) { + assert_is_var(); + asserts_.emplace_back([=](Node *x) { + for (auto *op : x->inputs) { + if (op && op->IsOp() && op->Op() && op_types.count(op->Op()->Type())) { + return true; + } + } + return false; + }); + return this; +} + +PDNode *PDNode::assert_is_ops_output( + const std::unordered_set &op_types, + const std::string &argument) { + assert_is_var(); + assert_is_ops_nth_output(op_types, argument, 0); + return this; +} + +PDNode *PDNode::assert_is_ops_input( + const std::unordered_set &op_types) { + assert_is_var(); + asserts_.emplace_back([=](Node *x) { + for (auto *op : x->outputs) { + if (op && op->IsOp() && op->Op() && op_types.count(op->Op()->Type())) { + return true; + } + } + return false; + }); + return this; +} + +PDNode *PDNode::assert_is_ops_input( + const std::unordered_set &op_types, + const std::string &argument) { + assert_is_var(); + assert_is_ops_nth_input(op_types, argument, 0); + return this; +} + +bool VarLinksToOp(Node *node, const std::string &op_type) { + for (auto *out : node->outputs) { + if (out->IsOp() && out->Op()->Type() == op_type) { + return true; + } + } + return false; +} + +bool IsNthInput(Node *var, Node *op, const std::string &argument, size_t nth) { + PADDLE_ENFORCE(var->IsVar()); + PADDLE_ENFORCE(op->IsOp()); + if (!HasInput(op, argument) || op->Op()->Input(argument).size() <= nth) + return false; + return var->Name() == op->Op()->Input(argument)[nth]; +} + +bool HasInput(Node *op, const std::string &argument) { + PADDLE_ENFORCE(op->IsOp()); + auto const &names = op->Op()->InputNames(); + if (std::find(names.begin(), names.end(), argument) == names.end()) + return false; + return true; +} + +bool IsNthOutput(Node *var, Node *op, const std::string &argument, size_t nth) { + PADDLE_ENFORCE(var->IsVar()); + PADDLE_ENFORCE(op->IsOp()); + if (op->Op()->Output(argument).size() <= nth) return false; + return var->Name() == op->Op()->Output(argument)[nth]; +} + +void GraphSafeRemoveNodes(Graph *graph, + const std::unordered_set &nodes) { + for (auto *node : nodes) { + graph->RemoveNode(const_cast(node)); + } + + for (auto *node : graph->Nodes()) { + for (auto it = node->inputs.begin(); it != node->inputs.end();) { + if (nodes.count(*it)) { + it = const_cast(node)->inputs.erase(it); + } else { + it++; + } + } + for (auto it = node->outputs.begin(); it != node->outputs.end();) { + if (nodes.count(*it)) { + it = const_cast(node)->outputs.erase(it); + } else { + it++; + } + } + } +} + +bool VarLinksFromOp(Node *node, const std::string &op_type) { + for (auto *out : node->inputs) { + if (out->IsOp() && out->Op()->Type() == op_type) { + return true; + } + } + return false; +} + +PDNode *patterns::ConvBN::operator()(paddle::framework::ir::PDNode *conv_input, + bool with_eltwise_add) { + // Create Operators + conv_input->assert_is_op_input("conv2d", "Input"); + auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d"); + + PDNode *eltwise_op = nullptr; + if (with_eltwise_add) { + eltwise_op = + pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add"); + } + auto *batch_norm_op = + pattern->NewNode(batch_norm_repr())->assert_is_op("batch_norm"); + // Create variables + // Conv Filter + auto *conv_weight_var = pattern->NewNode(conv_weight_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("conv2d", "Filter"); + + auto *conv_out_var = pattern->NewNode(conv_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op("conv2d"); + + PDNode *eltwise_y_in_var = nullptr; + PDNode *eltwise_out_var = nullptr; + if (with_eltwise_add) { + // Conv output as Bias input + conv_out_var->assert_is_op_input("elementwise_add", "X"); + // Bias + eltwise_y_in_var = pattern->NewNode(eltwise_y_in_repr()) + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + eltwise_out_var = pattern->NewNode(eltwise_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op("elementwise_add"); + } else { + // Conv output as BN input + conv_out_var->assert_is_op_input("batch_norm", "X"); + } + + // BN Scale + auto *bn_scale_var = pattern->NewNode(bn_scale_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("batch_norm", "Scale"); + // BN Bias + auto *bn_bias_var = pattern->NewNode(bn_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("batch_norm", "Bias"); + // BN Mean + auto *bn_mean_var = pattern->NewNode(bn_mean_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("batch_norm", "Mean"); + // BN Variance + auto *bn_variance_var = pattern->NewNode(bn_variance_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("batch_norm", "Variance"); + + // BN output + auto *bn_out_var = pattern->NewNode(bn_out_repr()) + ->AsOutput() + ->assert_is_op_output("batch_norm"); + + auto *bn_mean_out_var = pattern->NewNode(bn_mean_out_repr()) + ->AsOutput() + ->assert_is_op_output("batch_norm", "MeanOut"); + + auto *bn_variance_out_var = + pattern->NewNode(bn_variance_out_repr()) + ->AsOutput() + ->assert_is_op_output("batch_norm", "VarianceOut"); + + auto *bn_saved_mean_var = + pattern->NewNode(bn_saved_mean_repr()) + ->AsOutput() + ->assert_is_op_output("batch_norm", "SavedMean"); + + auto *bn_saved_variance_var = + pattern->NewNode(bn_saved_variance_repr()) + ->AsOutput() + ->assert_is_op_output("batch_norm", "SavedVariance"); + + conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var}); + + if (with_eltwise_add) { + eltwise_op->LinksFrom({conv_out_var, eltwise_y_in_var}) + .LinksTo({eltwise_out_var}); + batch_norm_op + ->LinksFrom({eltwise_out_var, bn_scale_var, bn_bias_var, bn_mean_var, + bn_variance_var}) + .LinksTo({bn_out_var, bn_mean_out_var, bn_variance_out_var, + bn_saved_mean_var, bn_saved_variance_var}); + } else { + batch_norm_op + ->LinksFrom({conv_out_var, bn_scale_var, bn_bias_var, bn_mean_var, + bn_variance_var}) + .LinksTo({bn_out_var, bn_mean_out_var, bn_variance_out_var, + bn_saved_mean_var, bn_saved_variance_var}); + } + return bn_out_var; +} + +PDNode *patterns::ConvReLU::operator()( + paddle::framework::ir::PDNode *conv_input) { + // Create Operators + conv_input->assert_is_op_input("conv2d", "Input"); + auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d"); + auto *relu_op = pattern->NewNode(relu_repr())->assert_is_op("relu"); + // Create variables + // Filter + auto *conv_weight_var = pattern->NewNode(conv_weight_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("conv2d", "Filter"); + // intermediate variable, will be removed in the IR after fuse. + auto *conv_out_var = pattern->NewNode(conv_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op("conv2d") + ->assert_is_op_input("relu"); + // output + auto *relu_out_var = pattern->NewNode(relu_out_repr()) + ->AsOutput() + ->assert_is_op_output("relu"); + + conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var}); + relu_op->LinksFrom({conv_out_var}).LinksTo({relu_out_var}); + return relu_out_var; +} + +PDNode *patterns::ConvBReLU::operator()( + paddle::framework::ir::PDNode *conv_input) { + // Create Operators + conv_input->assert_is_op_input("conv2d", "Input"); + auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d"); + auto *brelu_op = pattern->NewNode(brelu_repr())->assert_is_op("relu6"); + // Create variables + // Filter + auto *conv_weight_var = pattern->NewNode(conv_weight_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("conv2d", "Filter"); + // intermediate variable, will be removed in the IR after fuse. + auto *conv_out_var = pattern->NewNode(conv_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op("conv2d") + ->assert_is_op_input("relu6"); + // output + auto *brelu_out_var = pattern->NewNode(brelu_out_repr()) + ->AsOutput() + ->assert_is_op_output("relu6"); + + conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var}); + brelu_op->LinksFrom({conv_out_var}).LinksTo({brelu_out_var}); + return brelu_out_var; +} + +PDNode *patterns::SeqConvEltAddRelu::operator()( + paddle::framework::ir::PDNode *seqconv_input) { + // Create Operators + seqconv_input->assert_is_op_input("sequence_conv", "X"); + auto *seqconv_op = pattern->NewNode(seqconv_repr()) + ->assert_is_op("sequence_conv") + ->assert_op_attr("paddingTrainable", false) + ->assert_op_attr("contextStride", 1); + + auto *eltadd_op = + pattern->NewNode(eltadd_repr())->assert_is_op("elementwise_add"); + auto *relu_op = pattern->NewNode(relu_repr())->assert_is_op("relu"); + // Create variables + // Filter + auto *seqconv_weight_var = + pattern->NewNode(seqconv_weight_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("sequence_conv", "Filter"); + // Bias + auto *eltadd_bias_var = pattern->NewNode(eltadd_bias_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add"); + // intermediate variable, will be removed in the IR after fuse. + auto *seqconv_out_var = pattern->NewNode(seqconv_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op("sequence_conv") + ->assert_is_op_input("elementwise_add"); + auto *eltadd_out_var = pattern->NewNode(eltadd_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op("elementwise_add") + ->assert_is_only_input_of_op("relu"); + // output + auto *relu_out_var = pattern->NewNode(relu_out_repr()) + ->AsOutput() + ->assert_is_op_output("relu"); + + seqconv_op->LinksFrom({seqconv_input, seqconv_weight_var}) + .LinksTo({seqconv_out_var}); + eltadd_op->LinksFrom({seqconv_out_var, eltadd_bias_var}) + .LinksTo({eltadd_out_var}); + relu_op->LinksFrom({eltadd_out_var}).LinksTo({relu_out_var}); + return relu_out_var; +} + +PDNode *patterns::FC::operator()(paddle::framework::ir::PDNode *x, + bool with_bias) { + // Create shared nodes. + x->assert_is_op_input("mul", "X"); + auto *mul = pattern->NewNode(mul_repr())->assert_is_op("mul"); + + auto *mul_w_var = pattern->NewNode(w_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("mul", "Y"); + + auto *mul_out_var = + pattern->NewNode(mul_out_repr())->assert_is_op_output("mul"); + + if (!with_bias) { // not with bias + // Add links. + mul->LinksFrom({x, mul_w_var}).LinksTo({mul_out_var}); + return mul_out_var; + + } else { // with bias + mul_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + // Create operators. + auto *elementwise_add = pattern->NewNode(elementwise_add_repr()) + ->assert_is_op("elementwise_add"); + // Create variables. + auto *bias = pattern->NewNode(bias_repr()) + ->assert_is_op_input("elementwise_add") + ->AsInput(); + + auto *fc_out = pattern->NewNode(Out_repr()) + ->AsOutput() + ->assert_is_op_output("elementwise_add"); + + mul->LinksFrom({mul_w_var, x}).LinksTo({mul_out_var}); + elementwise_add->LinksFrom({mul_out_var, bias}).LinksTo({fc_out}); + return fc_out; + } +} + +PDNode *patterns::FCMKLDNN::operator()(paddle::framework::ir::PDNode *x, + bool with_bias) { + // Create shared nodes. + x->assert_is_op_input("fc", "Input"); + + auto *fc_op = pattern->NewNode(fc_repr())->assert_is_op("fc"); + // Create variables + // Filter + auto *fc_weight_var = pattern->NewNode(weights_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("fc", "W"); + // Bias + auto *fc_bias_var = pattern->NewNode(bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("fc", "Bias"); + // Output + auto *fc_out_var = pattern->NewNode(output_repr()) + ->AsOutput() + ->assert_is_op_output("fc", "Out") + ->assert_is_only_output_of_op("fc"); + + fc_op->LinksFrom({x, fc_weight_var, fc_bias_var}).LinksTo({fc_out_var}); + return fc_out_var; +} + +PDNode *patterns::Embedding::operator()(PDNode *x) { + x->assert_is_op_input("lookup_table", "Ids"); + auto *lookup_table_op = + pattern->NewNode(lookup_table_repr())->assert_is_op("lookup_table"); +#define NEW_NODE(arg__, io__) \ + auto *arg__ = pattern->NewNode(arg__##_repr()) \ + ->assert_is_op_##io__("lookup_table", #arg__); + + NEW_NODE(W, input); + + NEW_NODE(Out, output); +#undef NEW_NODE + + lookup_table_op->LinksFrom({x, W}); + lookup_table_op->LinksTo({Out}); + return Out; +} + +PDNode *patterns::LSTM::operator()(PDNode *x) { + x->assert_is_op_input("lstm", "Input"); + auto *lstm_op = pattern->NewNode(lstm_repr())->assert_is_op("lstm"); +#define NEW_NODE(arg__, io__) \ + auto *arg__ = \ + pattern->NewNode(arg__##_repr())->assert_is_op_##io__("lstm", #arg__); + + // Currently, the H0 and C0 are optional + // TODO(Superjomn) upgrade the fuse framework to support optional. + // NEW_NODE(H0, input); + // NEW_NODE(C0, input); + NEW_NODE(Weight, input); + NEW_NODE(Bias, input); + + NEW_NODE(Hidden, output); + NEW_NODE(Cell, output); + NEW_NODE(BatchGate, output); + NEW_NODE(BatchCellPreAct, output); +#undef NEW_NODE + + lstm_op->LinksFrom({x, Weight, Bias}); + lstm_op->LinksTo({Hidden, Cell, BatchGate, BatchCellPreAct}); + return Hidden; +} + +PDNode *patterns::GRU::operator()(PDNode *x) { + x->assert_is_op_input("gru", "Input"); + auto *gru_op = pattern->NewNode(gru_repr())->assert_is_op("gru"); +#define NEW_NODE(arg__, io__) \ + auto *arg__ = \ + pattern->NewNode(arg__##_repr())->assert_is_op_##io__("gru", #arg__); + + NEW_NODE(Weight, input); + // TODO(Superjomn): upgrade the fuse framework to support optional. + // H0 and bias are optional + NEW_NODE(Bias, input); // also optional + // NEW_NODE(H0, input); + + NEW_NODE(Hidden, output); + // below are intermediate + NEW_NODE(BatchGate, output); + NEW_NODE(BatchResetHiddenPrev, output); + NEW_NODE(BatchHidden, output); +#undef NEW_NODE + + BatchGate->AsIntermediate(); + BatchResetHiddenPrev->AsIntermediate(); + BatchHidden->AsIntermediate(); + + gru_op->LinksFrom({x, Weight, Bias}); + gru_op->LinksTo({Hidden, BatchGate, BatchResetHiddenPrev, BatchHidden}); + return Hidden; +} + +PDNode *patterns::ActElewiseAdd::operator()( + paddle::framework::ir::PDNode *in_var, + std::unordered_set act_types) { + in_var->assert_is_ops_input(act_types, "X"); + + auto *act = pattern->NewNode(act_repr())->assert_is_ops(act_types); + auto *act_out_var = pattern->NewNode(act_out_repr()) + ->assert_is_not_ctrl_var() + ->assert_is_ops_output(act_types); + act_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + + auto *ele_x_var = pattern->NewNode(ele_x_repr()) + ->assert_is_not_ctrl_var() + ->assert_is_op_input("elementwise_add") + ->AsInput(); + auto *elementwise_add = + pattern->NewNode(ele_add_repr())->assert_is_op("elementwise_add"); + + auto *elewise_add_out = pattern->NewNode(elewise_add_out_repr()) + ->AsOutput() + ->assert_is_op_output("elementwise_add", "Out"); + + act->LinksFrom({in_var}).LinksTo({act_out_var}); + elementwise_add->LinksFrom({act_out_var, ele_x_var}) + .LinksTo({elewise_add_out}); + + return elewise_add_out; +} + +PDNode *patterns::ElewiseAddAct::operator()( + paddle::framework::ir::PDNode *ele_x_var, + std::unordered_set act_types) { + auto *ele_y_var = pattern->NewNode(ele_y_repr()) + ->assert_is_op_input("elementwise_add", "Y"); + + auto *ele_add = + pattern->NewNode(ele_add_repr())->assert_is_op("elementwise_add"); + + auto *ele_out_var = pattern->NewNode(elewise_add_out_repr()) + ->assert_is_op_output("elementwise_add", "Out"); + + ele_out_var->AsIntermediate()->assert_is_ops_input(act_types); + + auto *act = pattern->NewNode(act_repr())->assert_is_ops(act_types); + + auto *act_out_var = + pattern->NewNode(act_out_repr())->assert_is_ops_output(act_types, "Out"); + + ele_add->LinksFrom({ele_x_var, ele_y_var}).LinksTo({ele_out_var}); + act->LinksFrom({ele_out_var}).LinksTo({act_out_var}); + + return act_out_var; +} + +PDNode *patterns::ElewiseAddActInplaceGrad::operator()( + paddle::framework::ir::PDNode *d_act_out_var, + std::unordered_set act_types) { + // act_grad: in["Out", "Out@GRAD"], out["X@GRAD"] + // ele_add_grad: in["Y", "Out@GRAD"], out["X@GRAD", "Y@GRAD"] + auto *act_grad = pattern->NewNode(act_grad_repr())->assert_is_ops(act_types); + + auto *act_out_var = + pattern->NewNode(act_out_repr())->assert_is_ops_input(act_types, "Out"); + + auto *d_intermediate_var = + pattern->NewNode(d_itermediate_out_repr()) + ->assert_is_ops_output(act_types, GradVarName("X")); + + act_grad->LinksFrom({d_act_out_var, act_out_var}) + .LinksTo({d_intermediate_var}); + + auto *ele_y_var = pattern->NewNode(ele_y_repr()) + ->assert_is_not_ctrl_var() + ->assert_is_op_input("elementwise_add_grad", "Y"); + + auto *ele_add_grad = pattern->NewNode(ele_add_grad_repr()) + ->assert_is_op("elementwise_add_grad"); + + auto *d_ele_x_var = + pattern->NewNode(d_ele_x_repr()) + ->assert_is_not_ctrl_var() + ->assert_is_op_output("elementwise_add_grad", GradVarName("X")); + + auto *d_ele_y_var = + pattern->NewNode(d_ele_y_repr()) + ->assert_is_not_ctrl_var() + ->assert_is_op_output("elementwise_add_grad", GradVarName("Y")); + + ele_add_grad->LinksFrom({d_intermediate_var, ele_y_var}) + .LinksTo({d_ele_x_var, d_ele_y_var}); + + return ele_add_grad; +} + +// conv_type: conv2d, conv3d, conv2d_transpose +PDNode *patterns::ConvBias::operator()( + paddle::framework::ir::PDNode *conv_input, std::string conv_type) { + // Create Operators + conv_input->assert_is_op_input(conv_type, "Input"); + auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op(conv_type); + auto *eltiwse_op = + pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add"); + // Create variables + // Filter + auto *conv_weight_var = pattern->NewNode(conv_weight_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input(conv_type, "Filter"); + // intermediate variable, will be removed in the IR after fuse. + auto *conv_out_var = pattern->NewNode(conv_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op(conv_type) + ->assert_is_op_input("elementwise_add"); + // Bias stored in elementwise_add + auto *eltwise_bias_var = pattern->NewNode(eltwise_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("elementwise_add", "Y"); + // output + auto *eltwise_out_var = pattern->NewNode(eltwise_out_repr()) + ->AsOutput() + ->assert_is_op_output("elementwise_add"); + conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var}); + eltiwse_op->LinksFrom({conv_out_var, eltwise_bias_var}) + .LinksTo({eltwise_out_var}); + return eltwise_out_var; +} + +PDNode *patterns::Conv::operator()() { + auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d"); + + auto input_var = pattern->NewNode(conv_input_repr()) + ->AsInput() + ->assert_is_op_input("conv2d", "Input"); + + auto filter_var = pattern->NewNode(conv_filter_repr()) + ->AsInput() + ->assert_is_op_input("conv2d", "Filter"); + + auto output_var = pattern->NewNode(conv_output_repr()) + ->AsOutput() + ->assert_is_op_output("conv2d", "Output"); + + conv_op->LinksFrom({input_var, filter_var}).LinksTo({output_var}); + return output_var; +} + +PDNode *patterns::ConvResidual::operator()(bool with_residual_data) { + auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d"); + + if (!with_residual_data) { + conv_op->assert_more([&](Node *x) { + auto node_names = x->Op()->InputNames(); + if (!HasInput(x, "ResidualData") || + x->Op()->Input("ResidualData").size() == 0) + return true; + return false; + }); + } + + auto input_var = pattern->NewNode(conv_input_repr()) + ->AsInput() + ->assert_is_op_input("conv2d", "Input"); + + auto filter_var = pattern->NewNode(conv_filter_repr()) + ->AsInput() + ->assert_is_op_input("conv2d", "Filter"); + + auto output_var = pattern->NewNode(conv_output_repr()) + ->AsOutput() + ->assert_is_op_output("conv2d", "Output"); + + std::vector links_from{input_var, filter_var}; + + if (with_residual_data) { + auto res_conn_var = pattern->NewNode(conv_residual_data_repr()) + ->AsInput() + ->assert_is_op_input("conv2d", "ResidualData"); + links_from.push_back(res_conn_var); + } + + conv_op->LinksFrom(links_from).LinksTo({output_var}); + return output_var; +} + +PDNode *patterns::Pool::operator()() { + auto pool_op = pattern->NewNode(pool_op_repr())->assert_is_op("pool2d"); + + auto input_var = pattern->NewNode(pool_input_repr()) + ->AsInput() + ->assert_is_op_input("pool2d", "X"); + + auto output_var = pattern->NewNode(pool_output_repr()) + ->AsOutput() + ->assert_is_op_output("pool2d", "Out"); + + pool_op->LinksFrom({input_var}).LinksTo({output_var}); + return output_var; +} + +PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) { + auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr()) + ->assert_is_op("elementwise_add"); + + x_var->AsInput()->assert_is_op_input("elementwise_add", "X"); + y_var->AsInput()->assert_is_op_input("elementwise_add", "Y"); + auto out_var = pattern->NewNode(elementwise_add_out_repr()) + ->AsOutput() + ->assert_is_op_output("elementwise_add", "Out"); + + elementwise_add_op->LinksFrom({x_var, y_var}); + elementwise_add_op->LinksTo({out_var}); + + return out_var; +} + +PDNode *patterns::Concat::operator()() { + auto concat_op = pattern->NewNode(concat_op_repr())->assert_is_op("concat"); + + auto output_var = pattern->NewNode(concat_out_repr()) + ->AsOutput() + ->assert_is_op_output("concat", "Out"); + + concat_op->LinksTo({output_var}); + return output_var; +} + +PDNode *patterns::ConcatReLU::operator()() { + auto concat_op = pattern->NewNode(concat_op_repr())->assert_is_op("concat"); + auto relu_op = pattern->NewNode(relu_op_repr())->assert_is_op("relu"); + + auto concat_out = + pattern->NewNode(concat_out_repr())->assert_is_op_output("concat", "Out"); + + auto relu_out = pattern->NewNode(relu_out_repr()) + ->AsOutput() + ->assert_is_op_output("relu", "Out"); + + concat_op->LinksTo({concat_out}); + relu_op->LinksFrom({concat_out}).LinksTo({relu_out}); + + return relu_out; +} + +PDNode *patterns::ConvConcatReLU::operator()() { + auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d"); + auto concat_op = pattern->NewNode(concat_op_repr())->assert_is_op("concat"); + auto relu_op = pattern->NewNode(relu_op_repr())->assert_is_op("relu"); + + auto conv_out = pattern->NewNode(conv_out_repr()) + ->assert_is_op_output("conv2d", "Output"); + + auto concat_out = pattern->NewNode(concat_out_repr()) + ->assert_is_op_output("concat", "Out") + ->assert_is_op_input("relu", "X"); + + auto relu_out = pattern->NewNode(relu_out_repr()) + ->AsOutput() + ->assert_is_op_output("relu", "Out"); + + conv_op->LinksTo({conv_out}); + concat_op->LinksFrom({conv_out}).LinksTo({concat_out}); + relu_op->LinksFrom({concat_out}).LinksTo({relu_out}); + + return relu_out; +} + +PDNode *patterns::PriorBox::operator()() { + auto prior_box_op = + pattern->NewNode(prior_box_op_repr())->assert_is_op("prior_box"); + + auto input_var = pattern->NewNode(prior_box_input_repr()) + ->AsInput() + ->assert_is_op_input("prior_box", "Input"); + + auto image_var = pattern->NewNode(prior_box_image_repr()) + ->AsInput() + ->assert_is_op_input("prior_box", "Image"); + + auto boxes_var = pattern->NewNode(prior_box_boxes_repr()) + ->AsOutput() + ->assert_is_op_output("prior_box", "Boxes"); + + auto variances_var = pattern->NewNode(prior_box_variances_repr()) + ->AsOutput() + ->assert_is_op_output("prior_box", "Variances"); + + prior_box_op->LinksFrom({input_var, image_var}) + .LinksTo({boxes_var, variances_var}); + return boxes_var; +} + +std::unordered_set conv_act_set({"identity", "relu"}); + +PDNode *patterns::ConvElementwiseaddAct::operator()(PDNode *conv_in) { + conv_in->AsInput(); + auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d"); + auto conv_out = pattern->NewNode(conv_out_repr()) + ->assert_is_op_output("conv2d") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto conv_filter = pattern->NewNode(conv_filter_repr()) + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr()) + ->assert_is_op("elementwise_add"); + auto elementwise_add_in_y = pattern->NewNode(elementwise_add_in_y_repr()) + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate(); + + auto act_op = pattern->NewNode(act_op_repr()) + ->assert_is_op() + ->assert_more([&](Node *node) { + auto op_type = node->Name(); + return conv_act_set.count(op_type); + }); + + auto act_out = pattern->NewNode(act_out_repr()) + ->assert_is_var() + // is activation op's output. + ->assert_more([&](Node *node) { + for (auto *in_op : node->inputs) { + if (conv_act_set.count(in_op->Name())) { + return true; + } + } + return false; + }) + ->AsOutput(); + + conv_op->LinksFrom({conv_in, conv_filter}); + conv_out->LinksFrom({conv_op}); + elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y}) + .LinksTo({elementwise_add_out}); + act_op->LinksFrom({elementwise_add_out}).LinksTo({act_out}); + + return act_out; +} + +PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) { + auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d"); + auto conv_filter = pattern->NewNode(conv_filter_repr()) + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto conv_out = pattern->NewNode(conv_out_repr()) + ->assert_is_op_output("conv2d") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr()) + ->assert_is_op("elementwise_add"); + auto elementwise_add_in_y = pattern->NewNode(elementwise_add_in_y_repr()) + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr()) + ->assert_is_op_output("elementwise_add") + ->assert_is_op_input("elementwise_add", "Y") + ->AsIntermediate(); + + auto elementwise_add_op_1 = pattern->NewNode(elementwise_add_op_1_repr()) + ->assert_is_op("elementwise_add"); + auto elementwise_add_in_y_1 = pattern->NewNode(elementwise_add_in_y_1_repr()) + ->assert_is_op_input("elementwise_add", "X") + ->AsInput(); + auto elementwise_add_out_1 = pattern->NewNode(elementwise_add_out_1_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate(); + + auto act_op = pattern->NewNode(act_op_repr()) + ->assert_is_op() + ->assert_more([&](Node *node) { + auto op_type = node->Name(); + return conv_act_set.count(op_type); + }); + auto act_out = pattern->NewNode(act_out_repr()) + ->assert_is_var() + // is activation op's output. + ->assert_more([&](Node *node) { + for (auto *in_op : node->inputs) { + if (conv_act_set.count(in_op->Name())) { + return true; + } + } + return false; + }) + ->AsOutput(); + + conv_op->LinksFrom({conv_in, conv_filter}).LinksTo({conv_out}); + elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y}) + .LinksTo({elementwise_add_out}); + elementwise_add_op_1->LinksFrom({elementwise_add_out, elementwise_add_in_y_1}) + .LinksTo({elementwise_add_out_1}); + act_op->LinksFrom({elementwise_add_out_1}).LinksTo({act_out}); + return act_out; +} + +PDNode *patterns::ConvElementwiseadd::operator()(PDNode *conv_in) { + conv_in->AsInput(); + auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d"); + auto conv_out = pattern->NewNode(conv_out_repr()) + ->assert_is_op_output("conv2d") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto conv_filter = pattern->NewNode(conv_filter_repr()) + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr()) + ->assert_is_op("elementwise_add"); + auto elementwise_add_in_y = pattern->NewNode(elementwise_add_in_y_repr()) + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsOutput(); + + conv_op->LinksFrom({conv_in, conv_filter}); + conv_out->LinksFrom({conv_op}); + elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y}) + .LinksTo({elementwise_add_out}); + + return elementwise_add_out; +} + +PDNode *patterns::ConvAffineChannel::operator()( + paddle::framework::ir::PDNode *conv_input, bool with_eltwise_add) { + // Create Operators + conv_input->assert_is_op_input("conv2d", "Input"); + auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d"); + + PDNode *eltwise_op = nullptr; + if (with_eltwise_add) { + eltwise_op = + pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add"); + } + + auto *affine_channel_op = + pattern->NewNode(affine_channel_repr())->assert_is_op("affine_channel"); + // Create variables + // Conv Filter + auto *conv_weight_var = pattern->NewNode(conv_weight_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("conv2d", "Filter"); + + auto *conv_out_var = pattern->NewNode(conv_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op("conv2d"); + + PDNode *eltwise_y_in_var = nullptr; + PDNode *eltwise_out_var = nullptr; + if (with_eltwise_add) { + // Conv output as Bias input + conv_out_var->assert_is_op_input("elementwise_add", "X"); + // Bias + eltwise_y_in_var = pattern->NewNode(eltwise_y_in_repr()) + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + eltwise_out_var = pattern->NewNode(eltwise_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op("elementwise_add"); + } else { + // Conv output as AffineChannel input + conv_out_var->assert_is_op_input("affine_channel", "X"); + } + + // AC Scale + auto *ac_scale_var = pattern->NewNode(ac_scale_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_has_n_outputs(1) + ->assert_is_op_input("affine_channel", "Scale"); + // AC Bias + auto *ac_bias_var = pattern->NewNode(ac_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_has_n_outputs(1) + ->assert_is_op_input("affine_channel", "Bias"); + + // AC output + auto *ac_out_var = pattern->NewNode(ac_out_repr()) + ->AsOutput() + ->assert_is_op_output("affine_channel"); + + conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var}); + + if (with_eltwise_add) { + eltwise_op->LinksFrom({conv_out_var, eltwise_y_in_var}) + .LinksTo({eltwise_out_var}); + affine_channel_op->LinksFrom({eltwise_out_var, ac_scale_var, ac_bias_var}) + .LinksTo({ac_out_var}); + } else { + affine_channel_op->LinksFrom({conv_out_var, ac_scale_var, ac_bias_var}) + .LinksTo({ac_out_var}); + } + return ac_out_var; +} + +PDNode *patterns::DequantQuantAny::operator()() { + auto *dequant_in = pattern->NewNode(dequant_in_repr()) + ->AsInput() + ->assert_is_op_input("dequantize", "Input"); + + auto *dequant_op = + pattern->NewNode(dequant_op_repr())->assert_is_op("dequantize"); + + auto *dequant_out = pattern->NewNode(dequant_out_repr()) + ->AsOutput() + ->assert_is_op_output("dequantize", "Output"); + + auto *quant_op = pattern->NewNode(quant_op_repr()) + ->assert_is_op("quantize") + ->AsIntermediate(); + + auto *quant_out = pattern->NewNode(quant_out_repr()) + ->AsOutput() + ->assert_is_op_output("quantize"); + + auto *next_op = pattern->NewNode(next_op_repr())->assert_is_op(); + + dequant_op->LinksFrom({dequant_in}).LinksTo({dequant_out}); + quant_op->LinksFrom({dequant_out}).LinksTo({quant_out}); + next_op->LinksFrom({quant_out}); + + return quant_out; +} + +PDNode *patterns::DequantAny::operator()() { + auto *dequant_op = + pattern->NewNode(dequant_op_repr())->assert_is_op("dequantize"); + + auto *dequant_out = pattern->NewNode(dequant_out_repr()) + ->AsOutput() + ->assert_is_op_output("dequantize", "Output"); + + auto *next_op = pattern->NewNode(next_op_repr())->assert_is_op(); + + dequant_op->LinksTo({dequant_out}); + next_op->LinksFrom({dequant_out}); + + return dequant_out; +} + +// a -> transpose_op(1) -> transpose_out_a -> flatten_op(1) -> flatten_out_a +// b -> transpose_op(2) -> transpose_out_b -> flatten_op(2) -> flatten_out_b +// ... +// z -> transpose_op(n) -> transpose_out_z -> flatten_op(n) -> flatten_out_z +// flatten_out_a -> concat_op flatten_out_b -> concat_op ... flatten_out_z -> +// concat_op +PDNode *patterns::TransposeFlattenConcat::operator()( + std::vector conv_in, int times) { + // The times represents the repeat times of the + // {trans, trans_out, flatten, flatten_out} + const int kNumFields = 4; + const int kTransOutOffset = 1; + const int kFlattenOffset = 2; + const int kFlattenOutOffset = 3; + + std::vector nodes; + + for (int i = 0; i < times; i++) { + nodes.push_back( + pattern->NewNode(GetNodeName("transpose" + std::to_string(i))) + ->assert_is_op("transpose2")); + nodes.push_back( + pattern->NewNode(GetNodeName("transpose_out" + std::to_string(i))) + ->assert_is_op_output("transpose2") + ->assert_is_op_input("flatten2", "X") + ->AsIntermediate()); + nodes.push_back(pattern->NewNode(GetNodeName("flatten" + std::to_string(i))) + ->assert_is_op("flatten2")); + + nodes.push_back( + pattern->NewNode(GetNodeName("flatten_out" + std::to_string(i))) + ->assert_is_op_output("flatten2") + ->assert_is_op_nth_input("concat", "X", i) + ->AsIntermediate()); + } + + auto concat_op = pattern->NewNode(GetNodeName("concat")) + ->assert_is_op("concat") + ->assert_op_has_n_inputs("concat", times); + auto concat_out = pattern->NewNode(GetNodeName("concat_out")) + ->assert_is_op_output("concat") + ->AsOutput(); + + std::vector flatten_outs; + for (int i = 0; i < times; i++) { + conv_in[i]->AsInput(); + // trans + nodes[i * kNumFields]->LinksFrom({conv_in[i]}); + // trans_out + nodes[i * kNumFields + kTransOutOffset]->LinksFrom({nodes[i * kNumFields]}); + // flatten + nodes[i * kNumFields + kFlattenOffset]->LinksFrom( + {nodes[i * kNumFields + kTransOutOffset]}); + // flatten_out + nodes[i * kNumFields + kFlattenOutOffset]->LinksFrom( + {nodes[i * kNumFields + kFlattenOffset]}); + flatten_outs.push_back(nodes[i * kNumFields + kFlattenOutOffset]); + } + + concat_op->LinksFrom(flatten_outs).LinksTo({concat_out}); + return concat_out; +} + +PDNode *patterns::AnakinDetectionPattern::operator()( + std::vector conv_in, int times, std::string priorbox_type, + bool is_reshape) { + // The times represents the repeat times of the + // {prior_box, prior_box_loc_out, flatten, prior_box_var_out, reshape} + const int kNumFields = 7; + const int kPriorBoxLocOffset = 1; + const int kReshape1Offset = 2; + const int kReshape1OutOffset = 3; + const int kPriorBoxVarOffset = 4; + const int kReshape2Offset = 5; + const int kReshape2OutOffset = 6; + + const int kBoxCoderThirdInputOffset = times; + const int kMultiClassSecondInputNmsOffset = times + 1; + + std::vector nodes; + std::string op_after_priorbox = is_reshape ? "reshape2" : "flatten2"; + + for (int i = 0; i < times; i++) { + nodes.push_back( + pattern->NewNode(GetNodeName("prior_box" + std::to_string(i))) + ->assert_is_op(priorbox_type)); + nodes.push_back(pattern->NewNode(GetNodeName("box_out" + std::to_string(i))) + ->assert_is_op_output(priorbox_type, "Boxes") + ->assert_is_op_input(op_after_priorbox, "X") + ->AsIntermediate()); + nodes.push_back( + pattern->NewNode(GetNodeName("reshape1" + std::to_string(i))) + ->assert_is_op(op_after_priorbox)); + + nodes.push_back( + pattern->NewNode(GetNodeName("reshape1_out" + std::to_string(i))) + ->assert_is_op_output(op_after_priorbox) + ->assert_is_op_nth_input("concat", "X", i) + ->AsIntermediate()); + + nodes.push_back( + pattern->NewNode(GetNodeName("box_var_out" + std::to_string(i))) + ->assert_is_op_output(priorbox_type, "Variances") + ->assert_is_op_input(op_after_priorbox, "X") + ->AsIntermediate()); + nodes.push_back( + pattern->NewNode(GetNodeName("reshape2" + std::to_string(i))) + ->assert_is_op(op_after_priorbox)); + + nodes.push_back( + pattern->NewNode(GetNodeName("reshape2_out" + std::to_string(i))) + ->assert_is_op_output(op_after_priorbox) + ->assert_is_op_nth_input("concat", "X", i) + ->AsIntermediate()); + } + + auto concat_op1 = pattern->NewNode(GetNodeName("concat1")) + ->assert_is_op("concat") + ->assert_op_has_n_inputs("concat", times); + auto concat_out1 = pattern->NewNode(GetNodeName("concat1_out")) + ->assert_is_op_output("concat") + ->AsIntermediate(); + + auto concat_op2 = pattern->NewNode(GetNodeName("concat2")) + ->assert_is_op("concat") + ->assert_op_has_n_inputs("concat", times); + auto concat_out2 = pattern->NewNode(GetNodeName("concat2_out")) + ->assert_is_op_output("concat") + ->AsIntermediate(); + + auto box_coder_op = pattern->NewNode(GetNodeName("box_coder")) + ->assert_is_op("box_coder") + ->assert_op_has_n_inputs("box_coder", 3); + + auto box_coder_out = pattern->NewNode(GetNodeName("box_coder_out")) + ->assert_is_op_output("box_coder") + ->AsIntermediate(); + + auto transpose_before_nms = + pattern->NewNode(GetNodeName("transpose_before_nms")) + ->assert_is_op("transpose2"); + + auto transpose_before_nms_out = + pattern->NewNode(GetNodeName("transpose_before_nms_out")) + ->assert_is_op_output("transpose2") + ->assert_is_op_input("multiclass_nms", "Scores") + ->AsIntermediate(); + + auto multiclass_nms_op = pattern->NewNode(GetNodeName("multiclass_nms")) + ->assert_is_op("multiclass_nms") + ->assert_op_has_n_inputs("multiclass_nms", 2); + + auto multiclass_nms_out = pattern->NewNode(GetNodeName("multiclass_nms_out")) + ->assert_is_op_output("multiclass_nms") + ->AsOutput(); + + std::vector reshape1_outs; + std::vector reshape2_outs; + + for (int i = 0; i < times; i++) { + conv_in[i]->AsInput(); + // prior_box + nodes[i * kNumFields]->LinksFrom({conv_in[i]}); + // prior_box box out + nodes[i * kNumFields + kPriorBoxLocOffset]->LinksFrom( + {nodes[i * kNumFields]}); + // reshape + nodes[i * kNumFields + kReshape1Offset]->LinksFrom( + {nodes[i * kNumFields + kPriorBoxLocOffset]}); + // reshape_out + nodes[i * kNumFields + kReshape1OutOffset]->LinksFrom( + {nodes[i * kNumFields + kReshape1Offset]}); + + nodes[i * kNumFields + kPriorBoxVarOffset]->LinksFrom( + {nodes[i * kNumFields]}); + // reshape + nodes[i * kNumFields + kReshape2Offset]->LinksFrom( + {nodes[i * kNumFields + kPriorBoxVarOffset]}); + // reshape_out + nodes[i * kNumFields + kReshape2OutOffset]->LinksFrom( + {nodes[i * kNumFields + kReshape2Offset]}); + + reshape1_outs.push_back(nodes[i * kNumFields + kReshape1OutOffset]); + reshape2_outs.push_back(nodes[i * kNumFields + kReshape2OutOffset]); + } + + concat_op1->LinksFrom(reshape1_outs); + concat_op2->LinksFrom(reshape2_outs); + concat_out1->LinksFrom({concat_op1}); + concat_out2->LinksFrom({concat_op2}); + + conv_in[kBoxCoderThirdInputOffset]->AsInput(); + conv_in[kMultiClassSecondInputNmsOffset]->AsInput(); + + box_coder_op->LinksFrom( + {concat_out1, concat_out2, conv_in[kBoxCoderThirdInputOffset]}); + box_coder_out->LinksFrom({box_coder_op}); + + transpose_before_nms->LinksFrom({conv_in[kMultiClassSecondInputNmsOffset]}); + transpose_before_nms_out->LinksFrom({transpose_before_nms}); + + multiclass_nms_op->LinksFrom({box_coder_out, transpose_before_nms_out}) + .LinksTo({multiclass_nms_out}); + + return multiclass_nms_out; +} + +PDNode *patterns::FillConstantElementWiseMulFuse::operator()( + PDNode *elementwise_op_input) { + auto fill_constant = + pattern->NewNode(fill_constant_repr())->assert_is_op("fill_constant"); + + auto fill_constant_out = pattern->NewNode(fill_constant_out_repr()) + ->assert_is_op_output("fill_constant") + ->assert_is_op_input("elementwise_mul", "Y") + ->AsIntermediate(); + + auto elementwise_mul_op = + pattern->NewNode(elementwise_mul_repr())->assert_is_op("elementwise_mul"); + + auto elementwise_mul_out = pattern->NewNode(elementwise_mul_out_repr()) + ->assert_is_op_output("elementwise_mul") + ->AsOutput(); + + fill_constant_out->LinksFrom({fill_constant}); + elementwise_mul_op->LinksFrom({elementwise_op_input, fill_constant_out}); + elementwise_mul_out->LinksFrom({elementwise_mul_op}); + return elementwise_mul_out; +} + +void patterns::QuantDequantOpFuse::operator()(PDNode *quant_op_input, + const std::string &op_type, + const std::string &weight_name, + int times, + const std::string &quant_type, + const std::string &dequant_type) { + int kNumFields = 5; + const int kQuantizedWeightOffset = 0; + const int kQuantizedOpOffset = 1; + const int kQuantizedOpOutOffset = 2; + const int kDequantOpOffset = 3; + const int kDequantOpOutOffset = 4; + const int kDequantOpWeightScaleOffset = 5; + + // the quant op always be one. + auto quant_op_in_scale = pattern->NewNode(GetNodeName("quant_op_in_scale")) + ->assert_is_op_input(quant_type, "InScale") + ->AsInput(); + auto quant_op = + pattern->NewNode(GetNodeName("quant_op"))->assert_is_op(quant_type); + + PDNode *quant_op_out_scale = nullptr; + if (dequant_type == "fake_channel_wise_dequantize_max_abs") { + kNumFields += 1; + quant_op_out_scale = pattern->NewNode(GetNodeName("quant_op_out_scale")) + ->assert_is_op_output(quant_type, "OutScale") + ->assert_is_op_nth_input(dequant_type, "Scales", 1) + ->AsIntermediate(); + } else { + quant_op_out_scale = pattern->NewNode(GetNodeName("quant_op_out_scale")) + ->assert_is_op_output(quant_type, "OutScale") + ->assert_is_op_input(dequant_type, "Scale") + ->AsIntermediate(); + } + + auto quant_op_out = pattern->NewNode(GetNodeName("quant_op_out")) + ->assert_is_op_output(quant_type, "Out") + ->assert_is_op_input(op_type) + ->AsIntermediate(); + + // there are 'times' quantized and dequant op + std::vector nodes; + for (int i = 0; i < times; i++) { + nodes.push_back( + pattern->NewNode(GetNodeName("quantized_op_weight") + std::to_string(i)) + ->assert_is_op_input(op_type, weight_name) + ->AsInput()); + nodes.push_back( + pattern->NewNode(GetNodeName("quantized_op") + std::to_string(i)) + ->assert_is_op(op_type)); + + nodes.push_back( + pattern->NewNode(GetNodeName("quantized_op_out") + std::to_string(i)) + ->assert_is_op_output(op_type) + ->assert_is_op_input(dequant_type, "X") + ->AsIntermediate()); + + nodes.push_back( + pattern->NewNode(GetNodeName("dequant_op") + std::to_string(i)) + ->assert_is_op(dequant_type)); + + nodes.push_back( + pattern->NewNode(GetNodeName("dequant_op_out") + std::to_string(i)) + ->assert_is_op_output(dequant_type, "Out") + ->AsOutput()); + + if (dequant_type == "fake_channel_wise_dequantize_max_abs") { + nodes.push_back(pattern + ->NewNode(GetNodeName("dequant_channel_scale") + + std::to_string(i)) + ->assert_is_op_nth_input(dequant_type, "Scales", 0) + ->AsInput()); + } + } + + quant_op->LinksFrom({quant_op_input, quant_op_in_scale}); + quant_op_out->LinksFrom({quant_op}); + for (int i = 0; i < times; i++) { + nodes[i * kNumFields + kQuantizedOpOffset]->LinksFrom( + {quant_op_out, nodes[i * kNumFields + kQuantizedWeightOffset]}); + nodes[i * kNumFields + kQuantizedOpOutOffset]->LinksFrom( + {nodes[i * kNumFields + kQuantizedOpOffset]}); + if (dequant_type == "fake_channel_wise_dequantize_max_abs") { + nodes[i * kNumFields + kDequantOpOffset]->LinksFrom( + {nodes[i * kNumFields + kQuantizedOpOutOffset], quant_op_out_scale, + nodes[i * kNumFields + kDequantOpWeightScaleOffset]}); + } else { + nodes[i * kNumFields + kDequantOpOffset]->LinksFrom( + {nodes[i * kNumFields + kQuantizedOpOutOffset], quant_op_out_scale}); + } + nodes[i * kNumFields + kDequantOpOutOffset]->LinksFrom( + {nodes[i * kNumFields + kDequantOpOffset]}); + } +} + +void patterns::ShuffleChannelPattern::operator()(PDNode *reshape1_in) { + auto reshape1_op = + pattern->NewNode(reshape1_op_repr())->assert_is_op("reshape2"); + + auto reshape1_out = pattern->NewNode(reshape1_out_repr()) + ->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("transpose2") + ->AsIntermediate(); + + auto transpose_op = + pattern->NewNode(transpose_op_repr())->assert_is_op("transpose2"); + + auto transpose_out = pattern->NewNode(transpose_out_repr()) + ->assert_is_op_output("transpose2", "Out") + ->assert_is_op_input("reshape2") + ->AsIntermediate(); + + auto reshape2_op = + pattern->NewNode(reshape2_op_repr())->assert_is_op("reshape2"); + auto reshape2_out = pattern->NewNode(reshape2_out_repr()) + ->assert_is_op_output("reshape2", "Out") + ->AsOutput(); + + reshape1_op->LinksFrom({reshape1_in}); + reshape1_out->LinksFrom({reshape1_op}); + transpose_op->LinksFrom({reshape1_out}); + transpose_out->LinksFrom({transpose_op}); + reshape2_op->LinksFrom({transpose_out}); + reshape2_out->LinksFrom({reshape2_op}); +} + +void patterns::DeleteQuantDequantOpPattern::operator()() { + auto any_op_out = + pattern->NewNode(any_op_out_repr()) + ->assert_is_op_input( + "fake_quantize_dequantize_moving_average_abs_max", "X") + ->AsInput(); + + auto quant_dequant_op_inscale = + pattern->NewNode(quant_dequant_op_inscale_repr()) + ->assert_is_op_input( + "fake_quantize_dequantize_moving_average_abs_max", "InScale") + ->AsInput(); + auto quant_dequant_op = + pattern->NewNode(quant_dequant_op_repr()) + ->assert_is_op("fake_quantize_dequantize_moving_average_abs_max"); + + auto quant_dequant_out = + pattern->NewNode(quant_dequant_op_out_repr()) + ->assert_is_op_output( + "fake_quantize_dequantize_moving_average_abs_max", "Out") + ->AsIntermediate(); + + auto quant_dequant_op_outscale = + pattern->NewNode(quant_dequant_op_outscale_repr()) + ->assert_is_op_output( + "fake_quantize_dequantize_moving_average_abs_max", "OutScale") + ->AsOutput(); + auto any_op2 = pattern->NewNode(any_op2_repr())->assert_is_op()->AsOutput(); + + quant_dequant_op->LinksFrom({any_op_out, quant_dequant_op_inscale}); + quant_dequant_op_outscale->LinksFrom({quant_dequant_op}); + quant_dequant_out->LinksFrom({quant_dequant_op}); + any_op2->LinksFrom({quant_dequant_out}); +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h new file mode 100644 index 00000000..c53e4e5e --- /dev/null +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -0,0 +1,1046 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifdef PADDLE_WITH_TESTING +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/inference/analysis/dot.h" + +namespace paddle { +namespace framework { +namespace ir { +class PDPattern; + +// Some basic terminologies: +// - PDPattern: a pattern defined as a data flow graph. +// - PDNode: the node in the pattern, each PDNode represents an `ir::Node` +// that meets some conditions defined in `PDNode.teller`. +// - A pattern is defined with PDNodes with edges. + +// Pattern detector node. This node helps to build a pattern. +struct PDNode { + // tell whether an ir::Node* is a candidation for a PDNode. + using teller_t = std::function; + enum class Type { kOp, kVar }; + enum class Role { + kUnknown, // No role, + kInput, // an input and will be retained, + kOutput, // an output and will be retained, + kIntermediate // will be removed after handler. + }; + + // this link to others + PDNode& LinksTo(const std::vector& others); + PDNode& LinksFrom(const std::vector& others); + + bool Tell(Node* node) const { + if (teller_) return teller_(node); + + for (auto& asrt : asserts_) { + if (!asrt(node)) return false; + } + return true; + } + + bool IsOp() const { return type_ == Type::kOp; } + bool IsVar() const { return type_ == Type::kVar; } + + const std::string& name() const { return name_; } + + PDNode& operator=(const PDNode&) = delete; + PDNode(const PDNode&) = delete; + + // Mark this node is an Input of a subgraph and will be retained. + PDNode* AsInput() { + role_ = Role::kInput; + return this; + } + // Mark this node is an Output of a subgraph and will be retained. + PDNode* AsOutput() { + role_ = Role::kOutput; + return this; + } + // Mark this node will be removed, so all the links should be inside a matched + // sub-graph. + PDNode* AsIntermediate() { + role_ = Role::kIntermediate; + return this; + } + + bool IsIntermediate() const { return role_ == Role::kIntermediate; } + bool IsInput() const { return role_ == Role::kInput; } + bool IsOutput() const { return role_ == Role::kOutput; } + + // Assertions, helper functions to simplify the pattern definition. + PDNode* assert_is_op(); + PDNode* assert_is_op(const std::string& op_type); + PDNode* assert_is_var(); + PDNode* assert_is_not_ctrl_var(); + PDNode* assert_var_not_persistable(); + PDNode* assert_is_persistable_var(); + PDNode* assert_is_op_output(const std::string& op_type); + PDNode* assert_is_op_output(const std::string& op_type, + const std::string& argument); + PDNode* assert_is_op_input(const std::string& op_type); + PDNode* assert_is_op_input(const std::string& op_type, + const std::string& argument); + PDNode* assert_is_op_nth_input(const std::string& op_type, + const std::string& argument, int nth); + PDNode* assert_is_op_nth_output(const std::string& op_type, + const std::string& argument, int nth); + PDNode* assert_is_only_input_of_op(const std::string& op_type); + PDNode* assert_is_only_output_of_op(const std::string& op_type); + PDNode* assert_op_has_n_inputs(const std::string& op_type, size_t n); + PDNode* assert_op_has_n_outputs(const std::string& op_type, size_t n); + PDNode* assert_more(teller_t&& teller); + + PDNode* assert_is_ops_output(const std::unordered_set& op_types); + PDNode* assert_is_ops(const std::unordered_set& op_types); + PDNode* assert_is_ops_output(const std::unordered_set& op_types, + const std::string& argument); + PDNode* assert_is_ops_nth_input( + const std::unordered_set& op_types, + const std::string& argument, int nth); + PDNode* assert_is_ops_input(const std::unordered_set& op_types); + PDNode* assert_is_ops_input(const std::unordered_set& op_types, + const std::string& argument); + PDNode* assert_is_ops_nth_output( + const std::unordered_set& op_types, + const std::string& argument, int nth); + + PDNode* assert_has_n_inputs(size_t n); + PDNode* assert_has_n_outputs(size_t n); + + template + PDNode* assert_op_attr(const std::string& attr_name, const T& attr) { + asserts_.emplace_back([=](Node* x) { + return x && x->IsOp() && x->Op()->HasAttr(attr_name) && + boost::get(x->Op()->GetAttr(attr_name)) == attr; + }); + return this; + } + + private: + PDNode(PDPattern* pattern, const std::string& name = "", + Type type = Type::kVar) + : pattern_(pattern), name_(name), type_(type) {} + PDNode(teller_t&& teller, PDPattern* pattern, const std::string& name = "", + Type type = Type::kVar) + : teller_(std::move(teller)), + pattern_(pattern), + name_(name), + type_(type) { + PADDLE_ENFORCE(teller_ != nullptr, "invalid teller functer is set."); + } + + PDNode(PDNode&& other) = default; + + friend class PDPattern; + + // Will removed latter. + teller_t teller_; + std::vector asserts_; + PDPattern* pattern_; + std::string name_; + Type type_; + Role role_{Role::kUnknown}; +}; + +/* + * A pattern in a graph, which defined with PDNode and edges. Most graph + * patterns can be divided into PDNodes and link relations between them. + * + * For example, the FC fusion need to filter the MUL and ELEMENTWISE_ADD + * operators from the computation graph, the MUL's output should have only one + * consumer which is the ELEMENTWISE_ADD. + * This pattern can be defined as with the following pseudo codes + * + * // Create two operator PDNodes. + * MUL = PDPattern.NewNode().assert_is_op("mul"); + * ELE = PDPattern.NewNode().assert_is_op("elementwise_add"); + * // Create the variable PDNodes. + * MUL_out = PDPattern.NewNode().assert_is_op_output("mul") \ + * .assert_is_op_input("elementwise_add") \ + * .AsIntermediate(); + * // Add relations. + * MUL->LinksTo({MUL_out}); + * MUL_out->LinksTo({ELE}); + * + * One can add more specific asserts for PDNodes or edges, both the Operator + * and Variable Nodes can be ruled in PDNode.assert_more(...). + * + * PDPattern can record the general patterns, such as the pattern represents + * - Op in CPU -> Op in GPU -> Op in CPU, to findout the IO abnormal place. + * - Ops whose inputs and outputs share the same variables + */ +class PDPattern { + public: + using edge_t = std::pair; + + void AddEdge(PDNode* a, PDNode* b); + + PDNode* NewNode(PDNode::teller_t&& teller, const std::string& name = NewID()); + PDNode* NewNode(const std::string& name = NewID()); + PDNode* NewNode(const std::string& prefix, const std::string& name) { + return NewNode(prefix + "/" + name); + } + PDNode* RetrieveNode(const std::string& id) const; + + const std::vector>& nodes() const { return nodes_; } + const std::vector& edges() const { return edges_; } + + std::string DotString() const; + + private: +#ifdef PADDLE_WITH_TESTING + FRIEND_TEST(PDPattern, AddEdge); + FRIEND_TEST(PDPattern, NewNode); +#endif + + static std::string NewID() { return "pdnode-" + std::to_string(id_++); } + + std::vector> nodes_; + std::vector edges_; + std::unordered_map node_map_; + static size_t id_; +}; + +/* + * GraphPatternDetector helps to detect the specific patterns in the graph. + * Input a pattern, output a list of the matched subgraphs/nodes. + * This helper can be used to support fuse(conv+batchnorm => batchnorm e.g.). + * + * The algorithm has three phases: + * 1. Mark the nodes that match the defined PDNodes in a PDPattern, + * 2. Extend a PDNode to subgraphs by deducing the connection relation defined + * in PAPattern(the edges), + * 3. Get the filtered subgraphs and treat them with a pre-defined handler. + * + * Usage: + * // Create a detector + * GraphPatternDetector detector; + * // Define the detector's pattern, by adding PDNode and define the edges. + * auto* node0 = detector.mutable_pattern().AddNode(...) + * auto* node1 = detector.mutable_pattern().AddNode(...) + * node0->teller = some lambda. + * node1->teller = some lambda. + * detector.mutable_pattern().AddEdge(node0, node1); + * // Create an handler, to define the behavior of treating the filtered + * // subgraphs that comply with the patterns. + * GraphPatternDetector::handle_t handler = some labmda + * // Execute the detector. + * detector(&graph, handler); + */ +class GraphPatternDetector { + public: + using subgraph_t = std::unordered_map; + + // Operate on the detected pattern. + using handle_t = + std::function; + + void operator()(Graph* graph, handle_t handler); + + const PDPattern& pattern() const { return pattern_; } + PDPattern* mutable_pattern() { return &pattern_; } + + private: + // Mark the nodes that fits the pattern. + bool MarkPDNodesInGraph(const ir::Graph& graph); + + // Detect all the pattern and output the hit records. + std::vector DetectPatterns(); + + // Remove duplicate patterns. + void UniquePatterns(std::vector* subgraphs); + + // Remove overlapped match subgraphs, when overlapped, keep the previous one. + // The intermediate PDNodes will be removed, so can't shared by multiple + // patterns. + void RemoveOverlappedMatch(std::vector* subgraphs); + + // Validate whether the intermediate nodes are linked by external nodes. + void ValidateByNodeRole(std::vector* subgraphs); + +#ifdef PADDLE_WITH_TESTING + FRIEND_TEST(GraphPatternDetecter, MarkPDNodesInGraph); + FRIEND_TEST(GraphPatternDetecter, DetectPatterns); +#endif + + private: + using hit_rcd_t = + std::pair; + PDPattern pattern_; + std::unordered_map> pdnodes2nodes_; +}; + +// some helper methods. + +// Tell if a var links to an Op +bool VarLinksToOp(Node* node, const std::string& op_type); + +// Tell if an op links to a var +bool VarLinksFromOp(Node* node, const std::string& op_type); + +// Check whether a var node is a op node's nth input. +bool IsNthInput(Node* var, Node* op, const std::string& argument, size_t nth); + +// Check whether the op node has input of given name. +bool HasInput(Node* op, const std::string& argument); + +// Tell whether a var node is a op node's nth output. +bool IsNthOutput(Node* var, Node* op, const std::string& argument, size_t nth); + +// Graph safely remove some nodes, will automatically clean up the edges. +void GraphSafeRemoveNodes(Graph* graph, + const std::unordered_set& nodes); + +// Some pre-defined patterns those can be reused in multiple passes. +// The related Fluid Layer or Op should be one pattern here for better re-usage +// across different fusion. +namespace patterns { + +struct KeyCounter { + static KeyCounter& Instance() { + static KeyCounter x; + return x; + } + + int IncCounter(const std::string& key) { return dic_[key]++; } + + private: + std::unordered_map dic_; +}; + +// Generate a unique PDNode's name with name_scope and id. +// The format is {name_scope}/{repr}/{id}/{name} +static std::string PDNodeName(const std::string& name_scope, + const std::string& repr, size_t id, + const std::string& name) { + return string::Sprintf("%s/%s/%d/%s", name_scope, repr, id, name); +} +// Generate a unique PDNode's name. +// The format is {name_scope}/{repr}/{id} +static std::string PDNodeName(const std::string& name_scope, + const std::string& repr) { + return string::Sprintf("%s/%s/%d", name_scope, repr, + KeyCounter::Instance().IncCounter(repr)); +} +// Generate a unique key. It can be used for a universally unique temporary +// name. +// The format is {repr}/{id} +static std::string UniqueKey(const std::string& repr) { + return string::Sprintf("%s/%d", repr, + KeyCounter::Instance().IncCounter(repr)); +} + +// Declare a PDNode in a pattern, will create two methods: +// std::string xxx_repr(); return this PDNode's string id. +// PDNode* xxx_n(); return the corresponding PDNode. +#define PATTERN_DECL_NODE(name__) \ + std::string name__##_repr() const { \ + return PDNodeName(name_scope_, repr_, id_, #name__); \ + } \ + PDNode* name__##_n() const { return pattern->RetrieveNode(name__##_repr()); } + +// Get an ir::Node* from the matched subgraph. +// var: variable. +// arg: the argument declared by PATTERN_DECL_NODE in a pattern definition. +// pat: the pattern object. +#define GET_IR_NODE_FROM_SUBGRAPH(var, arg, pat) \ + PADDLE_ENFORCE(subgraph.count(pat.arg##_n()), \ + "Node not found for PDNode %s", pat.arg##_repr()); \ + Node* var = subgraph.at(pat.arg##_n()); \ + PADDLE_ENFORCE(var, "node %s not exists in the sub-graph", #arg) + +// The base class of all the patterns. +struct PatternBase { + PatternBase(PDPattern* pattern, const std::string& name_scope, + const std::string& repr) + : pattern(pattern), + name_scope_(name_scope), + repr_(repr), + id_(KeyCounter::Instance().IncCounter(repr)) {} + + PDPattern* pattern; + + protected: + std::string name_scope_; + std::string repr_; + size_t id_; +}; + +// Conv with batch norm +// op: conv + (elementwise_add +) batch_norm +// named nodes: +// conv_weight, conv_out, conv, +// bn_x, bn_scale, bn_bias, bn_mean, bn_variance, +// bn_batch_norm, bn_y, bn_mean_out, bn_variance_out, +// bn_saved_mean, bn_saved_variance +struct ConvBN : public PatternBase { + ConvBN(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "conv_bn") {} + + PDNode* operator()(PDNode* conv_input, bool with_eltwise_add); + + // declare operator node's name + PATTERN_DECL_NODE(conv); + PATTERN_DECL_NODE(batch_norm); + PATTERN_DECL_NODE(eltwise); // ELEMENTWISE_ADD + // CONV inputs + PATTERN_DECL_NODE(conv_weight); // Filter + // CONV outputs + PATTERN_DECL_NODE(conv_out); // tmp + // ELTWISE inputs + PATTERN_DECL_NODE(eltwise_y_in); + // ELTWISE outputs + PATTERN_DECL_NODE(eltwise_out); // tmp + // BN inputs + PATTERN_DECL_NODE(bn_scale); + PATTERN_DECL_NODE(bn_bias); + PATTERN_DECL_NODE(bn_mean); + PATTERN_DECL_NODE(bn_variance); + // BN outputs + PATTERN_DECL_NODE(bn_out); // Out + PATTERN_DECL_NODE(bn_mean_out); + PATTERN_DECL_NODE(bn_variance_out); + PATTERN_DECL_NODE(bn_saved_mean); + PATTERN_DECL_NODE(bn_saved_variance); +}; + +// CONV with ReLU +// op: conv + relu +// named nodes: +// conv_input, conv_weight, +// conv_out, conv, +// relu_out, relu +struct ConvReLU : public PatternBase { + ConvReLU(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "conv_relu") {} + + PDNode* operator()(PDNode* conv_input); + + // declare operator node's name + PATTERN_DECL_NODE(conv); + PATTERN_DECL_NODE(relu); + // declare variable node's name + PATTERN_DECL_NODE(conv_weight); + PATTERN_DECL_NODE(conv_out); + PATTERN_DECL_NODE(relu_out); +}; + +// CONV with ReLU6 +// op: conv + relu6 +// named nodes: +// conv_input, conv_weight, +// conv_out, conv, +// relu6_out, relu6 +struct ConvBReLU : public PatternBase { + ConvBReLU(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "conv_bounded_relu") {} + + PDNode* operator()(PDNode* conv_input); + + // declare operator node's name + PATTERN_DECL_NODE(conv); + PATTERN_DECL_NODE(brelu); + // declare variable node's name + PATTERN_DECL_NODE(conv_weight); + PATTERN_DECL_NODE(conv_out); + PATTERN_DECL_NODE(brelu_out); +}; + +// SEQCONV with Elementwise_Add ReLU +// op: seqconv + elementwise_add + relu +// named nodes: +// seqconv_input, seqconv_weight, +// seqconv_out, seqconv, +// elementwise_add_bias, elementwise_add_out, elementwise_add +// relu_out, relu +struct SeqConvEltAddRelu : public PatternBase { + SeqConvEltAddRelu(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "seqconv_eltadd_relu") {} + + PDNode* operator()(PDNode* seqconv_input); + + // declare operator node's name + PATTERN_DECL_NODE(seqconv); + PATTERN_DECL_NODE(eltadd); + PATTERN_DECL_NODE(relu); + // declare variable node's name + PATTERN_DECL_NODE(seqconv_weight); + PATTERN_DECL_NODE(seqconv_out); + PATTERN_DECL_NODE(eltadd_bias); + PATTERN_DECL_NODE(eltadd_out); + PATTERN_DECL_NODE(relu_out); +}; + +// FC with bias +// op: mul + elementwise_add +// named nodes: +// mul, elementwise_add +// w, mul_out, bias, fc_out +struct FC : public PatternBase { + FC(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "fc") {} + + PDNode* operator()(PDNode* x, bool with_bias); + + // declare operator node's name + PATTERN_DECL_NODE(fc); + PATTERN_DECL_NODE(mul); + PATTERN_DECL_NODE(elementwise_add); + // declare variable node's name + PATTERN_DECL_NODE(w); + PATTERN_DECL_NODE(mul_out); // (x,w) -> mul_out + PATTERN_DECL_NODE(bias); + PATTERN_DECL_NODE(Out); +}; + +// MKL-DNN's FC with bias +// op: fc +// named node: +// fc +// w, bias, output +struct FCMKLDNN : public PatternBase { + FCMKLDNN(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "fc_mkldnn") {} + + PDNode* operator()(PDNode* x, bool with_bias); + + // declare operator node's name + PATTERN_DECL_NODE(fc); + // declare variable node's name + PATTERN_DECL_NODE(weights); + PATTERN_DECL_NODE(bias); + PATTERN_DECL_NODE(output); +}; + +// Embedding +struct Embedding : public PatternBase { + Embedding(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "embedding") {} + + PDNode* operator()(PDNode* x); + + // declare operator node's name + PATTERN_DECL_NODE(lookup_table); + // Inputs + // + PATTERN_DECL_NODE(Ids); + PATTERN_DECL_NODE(W); // embeddings + // Outputs + PATTERN_DECL_NODE(Out); +}; + +struct LSTM : public PatternBase { + LSTM(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "lstm") {} + + PDNode* operator()(PDNode* x); + + // Operators + PATTERN_DECL_NODE(lstm); + + // Inputs + PATTERN_DECL_NODE(Input); + PATTERN_DECL_NODE(H0); + PATTERN_DECL_NODE(C0); + PATTERN_DECL_NODE(Weight); + PATTERN_DECL_NODE(Bias); + + // Outputs + PATTERN_DECL_NODE(Hidden); + PATTERN_DECL_NODE(Cell); + PATTERN_DECL_NODE(BatchGate); + PATTERN_DECL_NODE(BatchCellPreAct); +}; + +struct GRU : public PatternBase { + GRU(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "gru") {} + + PDNode* operator()(PDNode* x); + + // Operators + PATTERN_DECL_NODE(gru); + + // Inputs + PATTERN_DECL_NODE(Bias); + PATTERN_DECL_NODE(Weight); + + // Outputs + PATTERN_DECL_NODE(BatchGate); + PATTERN_DECL_NODE(BatchResetHiddenPrev); + PATTERN_DECL_NODE(BatchHidden); + PATTERN_DECL_NODE(Hidden); +}; + +// The following patterns are used to fuse elewise_add and act +// formula: act(ele_add(x, y)) +// op: elementwise_add + act +// named nodes: elementwise_add, act +// ele_x, ele_y, elewise_add_out, act_out +struct ElewiseAddAct : public PatternBase { + ElewiseAddAct(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "elewise_add_act") {} + + PDNode* operator()(PDNode* x, std::unordered_set acts); + + // declare operator node's name + PATTERN_DECL_NODE(ele_add); + PATTERN_DECL_NODE(act); + // declare variable node's name + PATTERN_DECL_NODE(elewise_add_out); + PATTERN_DECL_NODE(ele_y); + PATTERN_DECL_NODE(act_out); +}; + +// formula: ele_add(x, act(y)) +// op: elementwise_add + act +// named nodes: elementwise_add, act +// act_in, act_out, ele_x, elewise_add_out +struct ActElewiseAdd : public PatternBase { + ActElewiseAdd(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "act_elewise_add") {} + + PDNode* operator()(PDNode* x, std::unordered_set acts); + + // declare operator node's name + PATTERN_DECL_NODE(act); + PATTERN_DECL_NODE(ele_add); + // declare variable node's name + PATTERN_DECL_NODE(act_out); + PATTERN_DECL_NODE(ele_x); + PATTERN_DECL_NODE(elewise_add_out); +}; + +// the backward of act(ele_add(x, y)) +// the act is inplace. +// op: elementwise_add_grad + act_grad +// named nodes: elementwise_add_grad, act_grad +// act_out, act_out_g, ele_y, d_itermediate_out, d_ele_x, d_ele_y +struct ElewiseAddActInplaceGrad : public PatternBase { + ElewiseAddActInplaceGrad(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "elewise_add_act_grad1") {} + + // act_grad: in["Out", "Out@GRAD"], out["X@GRAD"] + // ele_add_grad: in["Y", "Out@GRAD"], out["X@GRAD", "Y@GRAD"] + PDNode* operator()(PDNode* x, std::unordered_set acts); + + // declare operator node's name + PATTERN_DECL_NODE(act_grad); + PATTERN_DECL_NODE(ele_add_grad); + // declare variable node's name + PATTERN_DECL_NODE(act_out); + PATTERN_DECL_NODE(d_itermediate_out); + PATTERN_DECL_NODE(d_ele_x); + PATTERN_DECL_NODE(d_ele_y); + PATTERN_DECL_NODE(ele_y); +}; + +// Conv with Elementwise_add as bias +// op: conv + elementwise_add +// named nodes: +// conv_input, conv_weight, +// conv_out, conv, +// eltwise_bias, eltwise_out, +// elementwise_add +struct ConvBias : public PatternBase { + ConvBias(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "conv_bias") {} + PDNode* operator()(PDNode* conv_input, std::string conv_type = "conv2d"); + // declare operator node's name + PATTERN_DECL_NODE(conv); + PATTERN_DECL_NODE(eltwise); + // declare variable node's name + PATTERN_DECL_NODE(conv_weight); + PATTERN_DECL_NODE(conv_out); + PATTERN_DECL_NODE(eltwise_bias); + PATTERN_DECL_NODE(eltwise_out); +}; + +// Convolution op +// Forward pass for convolution. +// conv_input, conv_bias and conv_filter are inputs. +// conv_output is a result of the operator. +// residual_data is data used by skip connection. +// If residual connection fusion is on, the formula is: +// conv_output = conv_op(conv_filter, conv_input, conv_bias) +// + conv_residual_data +// If the fusion is off, conv_residual_data is not added. +struct Conv : public PatternBase { + Conv(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "convolution") {} + + PDNode* operator()(); + + PATTERN_DECL_NODE(conv_op); + PATTERN_DECL_NODE(conv_input); + PATTERN_DECL_NODE(conv_filter); + PATTERN_DECL_NODE(conv_residual_data); + PATTERN_DECL_NODE(conv_output); +}; + +// Convolution op with residual data +struct ConvResidual : public PatternBase { + ConvResidual(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "conv_residual") {} + + PDNode* operator()(bool with_residual_data); + + PATTERN_DECL_NODE(conv_op); + PATTERN_DECL_NODE(conv_input); + PATTERN_DECL_NODE(conv_filter); + PATTERN_DECL_NODE(conv_residual_data); + PATTERN_DECL_NODE(conv_output); +}; + +// Pool op +// Forward pass for pooling. +// pool_input is the input. +// pool_output is a result of the operator. +struct Pool : public PatternBase { + Pool(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "pooling") {} + + PDNode* operator()(); + + PATTERN_DECL_NODE(pool_op); + PATTERN_DECL_NODE(pool_input); + PATTERN_DECL_NODE(pool_output); +}; + +// ElementwiseAdd used in residual connections. +// y_var is used and convolution output. +// The operator is removed, when residual +// connection fusion is on. +struct ElementwiseAdd : public PatternBase { + ElementwiseAdd(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "elementwise_add") {} + + PDNode* operator()(PDNode* x_var, PDNode* y_var); + + PATTERN_DECL_NODE(elementwise_add_op); + PATTERN_DECL_NODE(elementwise_add_x); + PATTERN_DECL_NODE(elementwise_add_y); + PATTERN_DECL_NODE(elementwise_add_out); +}; + +// Concat op +// Forward pass for concat. +// concat_out is a result of the operator. +struct Concat : public PatternBase { + Concat(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "concat") {} + + PDNode* operator()(); + + PATTERN_DECL_NODE(concat_op); + PATTERN_DECL_NODE(concat_out); +}; + +// Concat + ReLU +// named nodes: +// concat_op, concat_out, relu_op, relu_out +struct ConcatReLU : public PatternBase { + ConcatReLU(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "concat_relu") {} + + PDNode* operator()(); + + PATTERN_DECL_NODE(concat_op); + PATTERN_DECL_NODE(concat_out); + PATTERN_DECL_NODE(relu_op); + PATTERN_DECL_NODE(relu_out); +}; + +// Conv + Concat + ReLU +// named nodes: +// conv_op, conv_out +// concat_op, concat_out, relu_op, relu_out +struct ConvConcatReLU : public PatternBase { + ConvConcatReLU(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "conv_concat_relu") {} + + PDNode* operator()(); + + PATTERN_DECL_NODE(conv_op); + PATTERN_DECL_NODE(conv_out); + PATTERN_DECL_NODE(concat_op); + PATTERN_DECL_NODE(concat_out); + PATTERN_DECL_NODE(relu_op); + PATTERN_DECL_NODE(relu_out); +}; + +// PriorBox operator +// operator: prior_box_op +// inputs: prior_box_input, prior_box_image +// outputs: prior_box_boxes, prior_box_variances +struct PriorBox : public PatternBase { + PriorBox(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "PriorBox") {} + + PDNode* operator()(); + + PATTERN_DECL_NODE(prior_box_op); + PATTERN_DECL_NODE(prior_box_input); + PATTERN_DECL_NODE(prior_box_image); + PATTERN_DECL_NODE(prior_box_boxes); + PATTERN_DECL_NODE(prior_box_variances); +}; + +// Conv + ElementwiseAdd + an activation +// This pattern can futher fuse the conv related ops after the conv+bn fusion. +struct ConvElementwiseaddAct : public PatternBase { + ConvElementwiseaddAct(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "conv_elementwiseadd_act") {} + + PDNode* operator()(PDNode* conv_in); + + PATTERN_DECL_NODE(conv_op); + PATTERN_DECL_NODE(conv_out); + PATTERN_DECL_NODE(conv_filter); + + PATTERN_DECL_NODE(elementwise_add_op); + PATTERN_DECL_NODE(elementwise_add_in_y); // input + PATTERN_DECL_NODE(elementwise_add_out); + + PATTERN_DECL_NODE(act_op); + PATTERN_DECL_NODE(act_out); +}; + +// Conv + ElementwiseAdd + ElementwiseAdd + Activation +struct ConvElementwiseadd2Act : public PatternBase { + ConvElementwiseadd2Act(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, + "conv_elementwiseadd2_elementwiseadd_act") {} + + PDNode* operator()(PDNode* conv_in); + + PATTERN_DECL_NODE(conv_op); + PATTERN_DECL_NODE(conv_filter); + PATTERN_DECL_NODE(conv_out); + + PATTERN_DECL_NODE(elementwise_add_op); + PATTERN_DECL_NODE(elementwise_add_in_y); // input + PATTERN_DECL_NODE(elementwise_add_out); + + PATTERN_DECL_NODE(elementwise_add_op_1); + PATTERN_DECL_NODE(elementwise_add_in_y_1); // input + PATTERN_DECL_NODE(elementwise_add_out_1); + + PATTERN_DECL_NODE(act_op); + PATTERN_DECL_NODE(act_out); +}; + +// Conv + ElementwiseAdd +// This pattern should be used after ConvElementwiseadd2Act or +// ConvElementwiseadd pass +struct ConvElementwiseadd : public PatternBase { + ConvElementwiseadd(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "conv_elementwiseadd") {} + + PDNode* operator()(PDNode* conv_in); + + PATTERN_DECL_NODE(conv_op); + PATTERN_DECL_NODE(conv_out); + PATTERN_DECL_NODE(conv_filter); + + PATTERN_DECL_NODE(elementwise_add_op); + PATTERN_DECL_NODE(elementwise_add_in_y); + PATTERN_DECL_NODE(elementwise_add_out); +}; + +// Conv with affine_channel +// op: conv + (elementwise_add +) affine_channel +// named nodes: +// conv_weight, conv_out, conv, +// ac_x, ac_scale, ac_bias +// affine_channel, ac_out +struct ConvAffineChannel : public PatternBase { + ConvAffineChannel(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "conv_affine_channel") {} + + PDNode* operator()(PDNode* conv_input, bool with_eltwise_add); + + // declare operator node's name + PATTERN_DECL_NODE(conv); + PATTERN_DECL_NODE(affine_channel); + PATTERN_DECL_NODE(eltwise); // ELEMENTWISE_ADD + // CONV inputs + PATTERN_DECL_NODE(conv_weight); // Filter + // CONV outputs + PATTERN_DECL_NODE(conv_out); // tmp + // ELTWISE inputs + PATTERN_DECL_NODE(eltwise_y_in); + // ELTWISE outputs + PATTERN_DECL_NODE(eltwise_out); // tmp + + // AC(Affine_Channel) inputs + PATTERN_DECL_NODE(ac_scale); + PATTERN_DECL_NODE(ac_bias); + // AC outputs + PATTERN_DECL_NODE(ac_out); // Out +}; + +// Dequantize + Quantize + anyOP +// This pattern is used for squashing the dequantize-quantize pairs. +struct DequantQuantAny : public PatternBase { + DequantQuantAny(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "dequant_quant_any") {} + PDNode* operator()(); + + PATTERN_DECL_NODE(dequant_in); + PATTERN_DECL_NODE(dequant_op); + PATTERN_DECL_NODE(dequant_out); + PATTERN_DECL_NODE(quant_op); + PATTERN_DECL_NODE(quant_out); + PATTERN_DECL_NODE(next_op); +}; + +// Dequantize + anyOP +// This quantize is used for getting number of ops the Dequantize's +// output is an input to. +struct DequantAny : public PatternBase { + DequantAny(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "dequant_any") {} + PDNode* operator()(); + + PATTERN_DECL_NODE(dequant_op); + PATTERN_DECL_NODE(dequant_out); + PATTERN_DECL_NODE(next_op); +}; + +struct TransposeFlattenConcat : public PatternBase { + TransposeFlattenConcat(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "transpose_flatten_concat") {} + + PDNode* operator()(std::vector conv_inputs, int times); + + std::string GetNodeName(const std::string& op_type) { + return PDNodeName(name_scope_, repr_, id_, op_type); + } + + PDNode* GetPDNode(const std::string& op_type) { + return pattern->RetrieveNode(GetNodeName(op_type)); + } +}; + +struct AnakinDetectionPattern : public PatternBase { + AnakinDetectionPattern(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "anakin_detect_pattern") {} + + PDNode* operator()(std::vector conv_inputs, int times, + std::string priorbox_type, bool is_reshape); + + std::string GetNodeName(const std::string& op_type) { + return PDNodeName(name_scope_, repr_, id_, op_type); + } + + PDNode* GetPDNode(const std::string& op_type) { + return pattern->RetrieveNode(GetNodeName(op_type)); + } +}; + +struct FillConstantElementWiseMulFuse : public PatternBase { + FillConstantElementWiseMulFuse(PDPattern* pattern, + const std::string& name_scope) + : PatternBase(pattern, name_scope, + "anakin_fillconstant_elementwisemul_fuse") {} + + PDNode* operator()(PDNode* elementwise_op_input); + + // declare operator node's name + PATTERN_DECL_NODE(fill_constant); + PATTERN_DECL_NODE(fill_constant_out); + PATTERN_DECL_NODE(elementwise_mul); + PATTERN_DECL_NODE(elementwise_mul_out); +}; + +struct QuantDequantOpFuse : public PatternBase { + QuantDequantOpFuse(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "quant_dequant_fuse") {} + + void operator()(PDNode* quant_op_input, const std::string& op_name, + const std::string& weight_name, int times, + const std::string& quant_type, + const std::string& dequant_type); + + std::string GetNodeName(const std::string& op_type) { + return PDNodeName(name_scope_, repr_, id_, op_type); + } + + PDNode* GetPDNode(const std::string& op_type) { + return pattern->RetrieveNode(GetNodeName(op_type)); + } +}; + +struct ShuffleChannelPattern : public PatternBase { + ShuffleChannelPattern(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "shufflechannel_pattern") {} + + void operator()(PDNode* reshape1_in); + + PATTERN_DECL_NODE(reshape1_op); + PATTERN_DECL_NODE(reshape1_out); + + PATTERN_DECL_NODE(transpose_op); + PATTERN_DECL_NODE(transpose_out); + PATTERN_DECL_NODE(reshape2_op); + PATTERN_DECL_NODE(reshape2_out); +}; + +struct DeleteQuantDequantOpPattern : public PatternBase { + DeleteQuantDequantOpPattern(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "delete_quantdequant_op_pattern") {} + + void operator()(); + + PATTERN_DECL_NODE(any_op_out); + PATTERN_DECL_NODE(quant_dequant_op_inscale); + PATTERN_DECL_NODE(quant_dequant_op); + PATTERN_DECL_NODE(quant_dequant_op_outscale); + PATTERN_DECL_NODE(quant_dequant_op_out); + PATTERN_DECL_NODE(any_op2); +}; + +} // namespace patterns + +// Link two ir::Nodes from each other. +#define IR_NODE_LINK_TO(a, b) \ + a->outputs.push_back(b); \ + b->inputs.push_back(a); + +// Set the out_var as the output of the op +#define IR_OP_VAR_LINK(op, out_var) \ + op->outputs.push_back(out_var); \ + out_var->inputs.clear(); \ + out_var->inputs.push_back(op); + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc b/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc new file mode 100644 index 00000000..6c466fb2 --- /dev/null +++ b/paddle/fluid/framework/ir/graph_pattern_detector_tester.cc @@ -0,0 +1,206 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +#include + +namespace paddle { +namespace framework { +namespace ir { + +void BuildGraph(Graph* g) { + ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation); + ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation); + ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation); + ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation); + ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation); + ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable); + ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable); + ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable); + ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable); + + // o1->v1->o2 + o1->outputs.push_back(v1); + o2->inputs.push_back(v1); + v1->inputs.push_back(o1); + v1->outputs.push_back(o2); + // o2->v2->o3 + // o2->v2->o4 + o2->outputs.push_back(v2); + o3->inputs.push_back(v2); + o4->inputs.push_back(v2); + v2->inputs.push_back(o2); + v2->outputs.push_back(o3); + v2->outputs.push_back(o4); + // o2->v3->o5 + o2->outputs.push_back(v3); + o5->inputs.push_back(v3); + v3->inputs.push_back(o2); + v3->outputs.push_back(o5); + // o3-v4->o5 + o3->outputs.push_back(v4); + o5->inputs.push_back(v4); + v4->inputs.push_back(o3); + v4->outputs.push_back(o5); +} + +TEST(PDPattern, NewNode) { + PDPattern x; + auto* n = x.NewNode([](Node* x) { return true; }); + ASSERT_TRUE(n); + ASSERT_EQ(x.nodes_.size(), 1UL); +} + +TEST(PDPattern, AddEdge) { + PDPattern x; + auto* a = x.NewNode([](Node* x) { return true; }); + auto* b = x.NewNode([](Node* x) { return true; }); + ASSERT_TRUE(a); + ASSERT_TRUE(b); + x.AddEdge(a, b); + ASSERT_EQ(x.nodes_.size(), 2UL); + ASSERT_EQ(x.edges_.size(), 1UL); + ASSERT_EQ(x.edges_.front().first, a); + ASSERT_EQ(x.edges_.front().second, b); + + ASSERT_EQ(x.nodes().size(), 2UL); + ASSERT_EQ(x.edges().size(), 1UL); + ASSERT_EQ(x.edges().front().first, a); + ASSERT_EQ(x.edges().front().second, b); +} + +TEST(GraphPatternDetecter, MarkPDNodesInGraph) { + GraphPatternDetector x; + // mark o2, o3, v2 + + // The pattern is a graph: + // o2(a node named o2) -> v2(a node named v2) + // v2 -> o3(a node named o3) + auto* o2 = x.pattern_.NewNode([](Node* node) { + // The teller can be any condition, such as op type, or variable's shape. + return node && node->Name() == "op2" && node->IsOp(); + }); + auto* o3 = x.pattern_.NewNode([](Node* node) { + // The teller can be any condition, such as op type, or variable's shape. + return node && node->Name() == "op3" && node->IsOp(); + }); + auto* v2 = x.pattern_.NewNode([](Node* node) { + // The teller can be any condition, such as op type, or variable's shape. + return node && node->Name() == "var2" && node->IsVar(); + }); + + ASSERT_FALSE(o2->Tell(nullptr)); + ASSERT_FALSE(o3->Tell(nullptr)); + ASSERT_FALSE(v2->Tell(nullptr)); + + x.pattern_.AddEdge(o2, v2); + x.pattern_.AddEdge(v2, o3); + + ASSERT_EQ(x.pattern_.edges().size(), 2UL); + ASSERT_EQ(x.pattern_.edges()[0].first, o2); + ASSERT_EQ(x.pattern_.edges()[0].second, v2); + ASSERT_EQ(x.pattern_.edges()[1].first, v2); + ASSERT_EQ(x.pattern_.edges()[1].second, o3); + + ProgramDesc program; + Graph graph(program); + BuildGraph(&graph); + + x.MarkPDNodesInGraph(graph); + + ASSERT_EQ(x.pdnodes2nodes_.size(), 3UL); + + auto subgraphs = x.DetectPatterns(); + ASSERT_EQ(subgraphs.size(), 1UL); +} + +TEST(GraphPatternDetecter, MultiSubgraph) { + ProgramDesc program; + Graph graph(program); + BuildGraph(&graph); + + GraphPatternDetector x; + + // The pattern is a graph: + // op -> var + auto* any_op = x.mutable_pattern()->NewNode( + [](Node* node) { + return node->IsOp() && (node->Name() == "op2" || node->Name() == "op3"); + }, + "OP0"); + auto* any_var = x.mutable_pattern() + ->NewNode([](Node* node) { return node->IsVar(); }, "VAR") + ->AsIntermediate(); + auto* any_op1 = x.mutable_pattern()->NewNode( + [](Node* node) { return node->IsOp(); }, "OP1"); + + x.mutable_pattern()->AddEdge(any_op, any_var); + x.mutable_pattern()->AddEdge(any_var, any_op1); + + int count = 0; + GraphPatternDetector::handle_t handle = [&]( + const GraphPatternDetector::subgraph_t& s, Graph* g) { + LOG(INFO) << "Detect " << s.at(any_op)->Name() << " -> " + << s.at(any_var)->Name() << " -> " << s.at(any_op1)->Name(); + count++; + }; + + x(&graph, handle); + + // 1. Detect op3 -> var4 -> op5 + // 2. Detect op2 -> var2 -> op3 + // 3. Detect op2 -> var2 -> op4 + // 4. Detect op2 -> var3 -> op5 + // But 2 and 3 and 4 overlapped, so keep 2, so the final choices are 1 and 2 + ASSERT_GE(count, 1); + ASSERT_LE(count, 2); +} + +TEST(GraphPatternDetector, IntermediateCheck) { + ProgramDesc program; + Graph graph(program); + BuildGraph(&graph); + + // o2->v2->o3 + // o2->v2->o4 + // check o2+o3 fuse, should fail because v2 also link to o4. + GraphPatternDetector detector; + auto* op2 = detector.mutable_pattern()->NewNode( + [](Node* x) { return x && x->IsOp() && x->Name() == "op2"; }, "op2"); + auto* op3 = detector.mutable_pattern()->NewNode( + [](Node* x) { return x && x->IsOp() && x->Name() == "op3"; }, "op3"); + auto* v2 = + detector.mutable_pattern() + ->NewNode( + [](Node* x) { return x && x->IsVar() && x->Name() == "var2"; }, + "var2") + ->AsIntermediate(); + v2->LinksFrom({op2}).LinksTo({op3}); + + int count = 0; + detector(&graph, [&](const GraphPatternDetector::subgraph_t& g, + Graph* graph) { ++count; }); + EXPECT_EQ(count, 0); + + count = 0; + v2->AsInput(); + detector(&graph, [&](const GraphPatternDetector::subgraph_t& g, + Graph* graph) { ++count; }); + ASSERT_EQ(count, 1); +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc new file mode 100644 index 00000000..a95588a5 --- /dev/null +++ b/paddle/fluid/framework/ir/graph_test.cc @@ -0,0 +1,210 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/graph.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { + +class NOP : public OperatorBase { + public: + NOP(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const Scope &scope, + const platform::Place &place) const override {} +}; + +class SumOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "").AsDuplicable(); + AddOutput("Out", "").AsDuplicable(); + AddComment(""); + } +}; + +class SumOpVarTypeInference : public VarTypeInference { + public: + void operator()(InferVarTypeContext *ctx) const override { + auto &inputs = ctx->Input("X"); + auto default_var_type = proto::VarType::SELECTED_ROWS; + + bool any_input_is_lod_tensor = std::any_of( + inputs.begin(), inputs.end(), [&ctx](const std::string &name) { + return ctx->GetType(name) == proto::VarType::LOD_TENSOR; + }); + if (any_input_is_lod_tensor) { + default_var_type = proto::VarType::LOD_TENSOR; + } + + auto out_var_name = ctx->Output("Out").front(); + ctx->SetType(out_var_name, default_var_type); + } +}; + +class DummyOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "").AsDuplicable(); + AddOutput("Out", "").AsDuplicable(); + AddComment(""); + } +}; + +class DummyOpVarTypeInference : public VarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override {} +}; +} // namespace framework +} // namespace paddle + +REGISTER_OPERATOR(sum, paddle::framework::NOP, paddle::framework::SumOpMaker, + paddle::framework::SumOpVarTypeInference); +REGISTER_OPERATOR(dummy, paddle::framework::NOP, paddle::framework::SumOpMaker, + paddle::framework::SumOpVarTypeInference); +REGISTER_OPERATOR(sum_without_infer_var_type, paddle::framework::NOP, + paddle::framework::SumOpMaker); + +namespace paddle { +namespace framework { + +TEST(GraphTest, Basic) { + ProgramDesc prog; + auto *op = prog.MutableBlock(0)->AppendOp(); + op->SetType("sum"); + op->SetInput("X", {"test_a", "test_b", "test_c"}); + op->SetOutput("Out", {"test_out"}); + op->SetAttr("op_role", 1); + + prog.MutableBlock(0)->Var("test_a")->SetType(proto::VarType::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarType::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test_c")->SetType(proto::VarType::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test_out"); + + op->InferVarType(prog.MutableBlock(0)); + + ASSERT_EQ(proto::VarType::SELECTED_ROWS, + prog.MutableBlock(0)->Var("test_out")->GetType()); + + prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarType::LOD_TENSOR); + op->InferVarType(prog.MutableBlock(0)); + ASSERT_EQ(proto::VarType::LOD_TENSOR, + prog.MutableBlock(0)->Var("test_out")->GetType()); + + std::unique_ptr g(new ir::Graph(prog)); + std::vector nodes(g->Nodes().begin(), g->Nodes().end()); + for (ir::Node *n : nodes) { + if (n->Name() == "sum") { + ASSERT_EQ(n->inputs.size(), 3UL); + ASSERT_EQ(n->outputs.size(), 1UL); + } else if (n->Name() == "test_a" || n->Name() == "test_b" || + n->Name() == "test_c") { + ASSERT_EQ(n->inputs.size(), 0UL); + ASSERT_EQ(n->outputs.size(), 1UL); + } else if (n->Name() == "test_out") { + ASSERT_EQ(n->inputs.size(), 1UL); + ASSERT_EQ(n->outputs.size(), 0UL); + } + } + ASSERT_EQ(nodes.size(), 5UL); +} + +TEST(GraphTest, WriteAfterRead) { + // void Test() { + ProgramDesc prog; + auto *op = prog.MutableBlock(0)->AppendOp(); + op->SetType("sum"); + op->SetInput("X", {"a"}); + op->SetOutput("Out", {"b"}); + op->SetAttr("op_role", 1); + + op = prog.MutableBlock(0)->AppendOp(); + op->SetType("dummy"); + op->SetInput("X", {"c"}); + op->SetOutput("Out", {"a"}); + op->SetAttr("op_role", 1); + + prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR); + + std::unique_ptr g(new ir::Graph(prog)); + ir::Node *control_dep1 = nullptr; + ir::Node *control_dep2 = nullptr; + for (ir::Node *n : g->Nodes()) { + if (n->Name() == "sum") { + ASSERT_EQ(n->outputs[0]->Name(), "b"); + ASSERT_TRUE(ir::IsControlDepVar(*n->outputs[1])); + control_dep1 = n->outputs[1]; + ASSERT_EQ(n->outputs.size(), 2); + } + if (n->Name() == "dummy") { + ASSERT_EQ(n->inputs[0]->Name(), "c"); + ASSERT_TRUE(ir::IsControlDepVar(*n->inputs[1])); + control_dep2 = n->inputs[1]; + ASSERT_EQ(n->inputs.size(), 2); + } + } + ASSERT_EQ(control_dep1, control_dep2); +} + +TEST(GraphTest, WriteAfterWrite) { + // void Test() { + ProgramDesc prog; + auto *op = prog.MutableBlock(0)->AppendOp(); + op->SetType("sum"); + op->SetInput("X", {"a"}); + op->SetOutput("Out", {"b"}); + op->SetAttr("op_role", 1); + + op = prog.MutableBlock(0)->AppendOp(); + op->SetType("dummy"); + op->SetInput("X", {"c"}); + op->SetOutput("Out", {"b"}); + op->SetAttr("op_role", 1); + + prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR); + + std::unique_ptr g(new ir::Graph(prog)); + ir::Node *control_dep1 = nullptr; + ir::Node *control_dep2 = nullptr; + for (ir::Node *n : g->Nodes()) { + if (n->Name() == "sum") { + ASSERT_EQ(n->outputs[0]->Name(), "b"); + ASSERT_TRUE(ir::IsControlDepVar(*n->outputs[1])); + ASSERT_EQ(n->outputs.size(), 2); + control_dep1 = n->outputs[1]; + } + if (n->Name() == "dummy") { + ASSERT_EQ(n->inputs[0]->Name(), "c"); + ASSERT_TRUE(ir::IsControlDepVar(*n->inputs[1])); + control_dep2 = n->inputs[1]; + ASSERT_EQ(n->inputs.size(), 2); + } + } + ASSERT_NE(control_dep1, nullptr); + ASSERT_NE(control_dep2, nullptr); + ASSERT_EQ(control_dep1, control_dep2); +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_to_program_pass.cc b/paddle/fluid/framework/ir/graph_to_program_pass.cc new file mode 100644 index 00000000..b0d056f2 --- /dev/null +++ b/paddle/fluid/framework/ir/graph_to_program_pass.cc @@ -0,0 +1,83 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/graph_to_program_pass.h" + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { +namespace ir { + +void GraphToProgramPass::ApplyImpl(ir::Graph* graph) const { + // Remove the unneeded variables after memory optimization. + std::unordered_set vars2remove; + if (graph->Has(kGraphToProgramVarsToRemove)) { + vars2remove = graph->Get>( + kGraphToProgramVarsToRemove); + VLOG(2) << "graph to program remove " << vars2remove.size() << " nodes"; + } + + ProgramDesc& program = Get("program"); + + std::unique_ptr program_pb( + new proto::ProgramDesc(*program.Proto())); + + auto block = program_pb->mutable_blocks(kRootBlockIndex); + block->set_idx(kRootBlockIndex); + block->clear_vars(); + std::unordered_set visited_vars; + for (ir::Node* n : graph->Nodes()) { + if (n->IsVar()) { + if (n->Var() && visited_vars.count(n->Var()->Name()) == 0 && + !vars2remove.count(n->Var()->Name())) { + visited_vars.insert(n->Var()->Name()); + block->add_vars()->MergeFrom(*n->Var()->Proto()); + } + } + } + block->clear_ops(); + + std::vector nodes; + if (Has(kGraphToProgramSortKind)) { + // Inference Memory Optimize relays on this branch. + int sort_kind = Get(kGraphToProgramSortKind); + nodes = TopologyVarientSort( + *graph, static_cast(sort_kind)); + } else { + nodes = TopologySortOperations(*graph); + } + + for (ir::Node* n : nodes) { + if (!n->Op()) continue; + + block->add_ops()->MergeFrom(*n->Op()->Proto()); + } + + program.CopyFrom(*program_pb); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(graph_to_program_pass, paddle::framework::ir::GraphToProgramPass); diff --git a/paddle/fluid/framework/ir/graph_to_program_pass.h b/paddle/fluid/framework/ir/graph_to_program_pass.h new file mode 100644 index 00000000..52c8f4e0 --- /dev/null +++ b/paddle/fluid/framework/ir/graph_to_program_pass.h @@ -0,0 +1,34 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +const char kGraphToProgramVarsToRemove[] = + "__graph_to_program_vars_to_remove__"; +const char kGraphToProgramSortKind[] = "__graph_to_program_sort_kind__"; + +class GraphToProgramPass : public Pass { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_to_program_pass_test.cc b/paddle/fluid/framework/ir/graph_to_program_pass_test.cc new file mode 100644 index 00000000..5ee6b8a5 --- /dev/null +++ b/paddle/fluid/framework/ir/graph_to_program_pass_test.cc @@ -0,0 +1,112 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/graph_to_program_pass.h" + +#include +#include +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { +namespace ir { + +void BuildNoCircleGraph(Graph* g) { + OpDesc op1; + op1.SetType("op1"); + OpDesc op2; + op2.SetType("op2"); + OpDesc op3; + op3.SetType("op3"); + OpDesc op4; + op4.SetType("op4"); + OpDesc op5; + op5.SetType("op5"); + VarDesc var1("var1"); + VarDesc var2("var2"); + VarDesc var3("var3"); + VarDesc var4("var4"); + + ir::Node* o1 = g->CreateOpNode(&op1); + ir::Node* o2 = g->CreateOpNode(&op2); + ir::Node* o3 = g->CreateOpNode(&op3); + ir::Node* o4 = g->CreateOpNode(&op4); + ir::Node* o5 = g->CreateOpNode(&op5); + ir::Node* v1 = g->CreateVarNode(&var1); + ir::Node* v2 = g->CreateVarNode(&var2); + ir::Node* v3 = g->CreateVarNode(&var3); + ir::Node* v4 = g->CreateVarNode(&var4); + + // o1->v1->o2 + o1->outputs.push_back(v1); + o2->inputs.push_back(v1); + v1->inputs.push_back(o1); + v1->outputs.push_back(o2); + // o2->v2->o3 + // o2->v2->o4 + o2->outputs.push_back(v2); + o3->inputs.push_back(v2); + o4->inputs.push_back(v2); + v2->outputs.push_back(o3); + v2->outputs.push_back(o4); + v2->inputs.push_back(o2); + // o4->v3->o5 + o4->outputs.push_back(v3); + o5->inputs.push_back(v3); + v3->inputs.push_back(o4); + v3->outputs.push_back(o5); + // o3-v4->o5 + o3->outputs.push_back(v4); + o5->inputs.push_back(v4); + v4->inputs.push_back(o3); + v4->outputs.push_back(o5); +} + +TEST(GraphToProgramPass, Basic) { + ProgramDesc prog; + std::unique_ptr g(new Graph(prog)); + BuildNoCircleGraph(g.get()); + + auto pass = paddle::framework::ir::PassRegistry::Instance().Get( + "graph_to_program_pass"); + + ProgramDesc compiled_prog; + pass->SetNotOwned("program", &compiled_prog); + pass->Apply(g.get()); + std::vector ops = compiled_prog.Block(0).AllOps(); + EXPECT_EQ(ops[0]->Type(), "op1"); + EXPECT_EQ(ops[1]->Type(), "op2"); + if (ops[2]->Type() == "op3") { + EXPECT_EQ(ops[3]->Type(), "op4"); + } else if (ops[2]->Type() == "op4") { + EXPECT_EQ(ops[3]->Type(), "op3"); + } + EXPECT_EQ(ops[4]->Type(), "op5"); + + std::unordered_set vars; + for (VarDesc* v : compiled_prog.Block(0).AllVars()) { + vars.insert(v->Name()); + } + EXPECT_TRUE(vars.find("var1") != vars.end()); + EXPECT_TRUE(vars.find("var2") != vars.end()); + EXPECT_TRUE(vars.find("var3") != vars.end()); +} +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(graph_to_program_pass); diff --git a/paddle/fluid/framework/ir/graph_traits.cc b/paddle/fluid/framework/ir/graph_traits.cc new file mode 100644 index 00000000..929d9edc --- /dev/null +++ b/paddle/fluid/framework/ir/graph_traits.cc @@ -0,0 +1,142 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/graph_traits.h" + +#include +#include + +namespace paddle { +namespace framework { +namespace ir { + +// +// NodesDFSIterator +// +NodesDFSIterator::NodesDFSIterator(const std::vector &source) { + for (auto *x : source) stack_.push(x); +} + +NodesDFSIterator::NodesDFSIterator(NodesDFSIterator &&other) noexcept + : stack_(std::move(other.stack_)), + visited_(std::move(other.visited_)) {} + +NodesDFSIterator::NodesDFSIterator(const NodesDFSIterator &other) + : stack_(other.stack_), visited_(other.visited_) {} + +Node &NodesDFSIterator::operator*() { + PADDLE_ENFORCE(!stack_.empty()); + return *stack_.top(); +} + +NodesDFSIterator &NodesDFSIterator::operator++() { + PADDLE_ENFORCE(!stack_.empty(), "the iterator exceeds range"); + visited_.insert(stack_.top()); + auto *cur = stack_.top(); + stack_.pop(); + for (auto *x : cur->outputs) { + if (!visited_.count(x)) { + stack_.push(x); + } + } + return *this; +} +bool NodesDFSIterator::operator==(const NodesDFSIterator &other) { + if (stack_.empty()) return other.stack_.empty(); + if ((!stack_.empty()) && (!other.stack_.empty())) { + return stack_.top() == other.stack_.top(); + } + return false; +} + +NodesDFSIterator &NodesDFSIterator::operator=(const NodesDFSIterator &other) { + stack_ = other.stack_; + visited_ = other.visited_; + return *this; +} +Node *NodesDFSIterator::operator->() { return stack_.top(); } + +inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) { + return node.inputs.size() == n; +} + +NodesTSIterator::NodesTSIterator(const std::vector &source) { + PADDLE_ENFORCE(!source.empty(), + "Start points of topological sorting should not be empty!"); + // CHECK all the inputs' in-degree is 0 + for (auto *node : source) { + PADDLE_ENFORCE(CheckNodeIndegreeEquals(*node, 0)); + } + + std::unordered_set visited; + std::set to_visit{source.begin(), source.end()}; + + std::vector inlink_visited; + while (!to_visit.empty()) { + std::vector queue(to_visit.begin(), to_visit.end()); + for (auto *p : queue) { + inlink_visited.clear(); + + std::copy_if(p->inputs.begin(), p->inputs.end(), + std::back_inserter(inlink_visited), + [&](Node *x) -> bool { return visited.count(x) != 0; }); + + if (inlink_visited.size() == p->inputs.size()) { + sorted_.push_back(p); + for (auto *_ : p->outputs) { + if (!visited.count(_)) { + to_visit.insert(_); + } + } + + to_visit.erase(p); + visited.insert(p); + } + } + } +} + +NodesTSIterator::NodesTSIterator(const NodesTSIterator &other) + : sorted_(other.sorted_), cursor_(other.cursor_) {} + +Node &NodesTSIterator::operator*() { + PADDLE_ENFORCE_LT(cursor_, sorted_.size()); + return *sorted_[cursor_]; +} + +NodesTSIterator &NodesTSIterator::operator++() { + if (++cursor_ >= sorted_.size()) { + sorted_.clear(); + cursor_ = 0; + } + return *this; +} +NodesTSIterator &NodesTSIterator::operator=(const NodesTSIterator &other) { + cursor_ = other.cursor_; + sorted_ = other.sorted_; + return *this; +} + +bool NodesTSIterator::operator==(const NodesTSIterator &other) { + return sorted_ == other.sorted_ && cursor_ == other.cursor_; +} + +Node *NodesTSIterator::operator->() { + PADDLE_ENFORCE_LT(cursor_, sorted_.size()); + return sorted_[cursor_]; +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_traits.h b/paddle/fluid/framework/ir/graph_traits.h new file mode 100644 index 00000000..f6772f9a --- /dev/null +++ b/paddle/fluid/framework/ir/graph_traits.h @@ -0,0 +1,128 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/node.h" + +namespace paddle { +namespace framework { +namespace ir { + +template +class iterator_range { + IteratorT begin_, end_; + + public: + template + explicit iterator_range(Container &&c) : begin_(c.begin()), end_(c.end()) {} + + iterator_range(const IteratorT &begin, const IteratorT &end) + : begin_(begin), end_(end) {} + + const IteratorT &begin() const { return begin_; } + const IteratorT &end() const { return end_; } +}; + +// DFS iterator on nodes. +struct NodesDFSIterator + : public std::iterator { + NodesDFSIterator() = default; + explicit NodesDFSIterator(const std::vector &source); + NodesDFSIterator(NodesDFSIterator &&other) noexcept; + NodesDFSIterator(const NodesDFSIterator &other); + + Node &operator*(); + NodesDFSIterator &operator++(); + // TODO(Superjomn) current implementation just compare the first + // element, need to compare the graph and all the elements in the queue and + // set. + NodesDFSIterator &operator=(const NodesDFSIterator &other); + bool operator==(const NodesDFSIterator &other); + bool operator!=(const NodesDFSIterator &other) { return !(*this == other); } + Node *operator->(); + + private: + std::stack stack_; + std::unordered_set visited_; +}; + +// Topological sorting iterator on nodes. +struct NodesTSIterator + : public std::iterator { + NodesTSIterator() = default; + NodesTSIterator(const std::vector &source); + NodesTSIterator(NodesTSIterator &&other) + : sorted_(std::move(other.sorted_)), cursor_(other.cursor_) { + other.cursor_ = 0; + } + NodesTSIterator(const NodesTSIterator &other); + + Node &operator*(); + NodesTSIterator &operator++(); + // TODO(Superjomn) current implementation just compare the first + // element, need to compare the graph and all the elements in the queue and + // set. + NodesTSIterator &operator=(const NodesTSIterator &other); + bool operator==(const NodesTSIterator &other); + bool operator!=(const NodesTSIterator &other) { return !(*this == other); } + Node *operator->(); + + private: + std::vector sorted_; + size_t cursor_{0}; +}; + +/* + * GraphTraits contains some graph traversal algorithms. + * + * Usage: + * + */ +struct GraphTraits { + static iterator_range DFS(const Graph &g) { + auto start_points = ExtractStartPoints(g); + NodesDFSIterator x(start_points); + return iterator_range(NodesDFSIterator(start_points), + NodesDFSIterator()); + } + + static iterator_range TS(const Graph &g) { + auto start_points = ExtractStartPoints(g); + PADDLE_ENFORCE(!start_points.empty()); + NodesTSIterator x(start_points); + return iterator_range(NodesTSIterator(start_points), + NodesTSIterator()); + } + + private: + // The nodes those have no input will be treated as start points. + static std::vector ExtractStartPoints(const Graph &g) { + std::vector result; + for (auto *node : g.Nodes()) { + if (node->inputs.empty()) { + result.push_back(node); + } + } + return result; + } +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc new file mode 100644 index 00000000..f4df4cfe --- /dev/null +++ b/paddle/fluid/framework/ir/graph_viz_pass.cc @@ -0,0 +1,135 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/graph_viz_pass.h" +#include +#include +#include +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/inference/analysis/dot.h" +#include "paddle/fluid/string/printf.h" + +namespace paddle { +namespace framework { +namespace ir { +using inference::analysis::Dot; +namespace { +const char kGraphVizPath[] = "graph_viz_path"; + +std::string FormatName(const Node* node) { + if (!node->IsOp() || !node->Op() || + !node->Op()->HasAttr(OpProtoAndCheckerMaker::OpNamescopeAttrName())) { + return node->Name(); + } + const std::string full_scope = boost::get( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpNamescopeAttrName())); + return string::Sprintf("%s%s", full_scope.c_str(), node->Name().c_str()); +} +} // namespace + +void GraphVizPass::ApplyImpl(ir::Graph* graph) const { + const std::string graph_viz_path = Get(kGraphVizPath); + VLOG(3) << "draw IR graph viz to " << graph_viz_path; + std::unique_ptr fout(new std::ofstream(graph_viz_path)); + PADDLE_ENFORCE(fout->good()); + std::ostream& sout = *fout; + + std::unordered_map node2dot; + + Dot dot; + + const std::vector op_attrs({ + Dot::Attr("style", "rounded,filled,bold"), // + Dot::Attr("shape", "box"), // + Dot::Attr("color", "#303A3A"), // + Dot::Attr("fontcolor", "#ffffff"), // + Dot::Attr("width", "1.3"), // + Dot::Attr("height", "0.84"), // + Dot::Attr("fontname", "Arial"), // + }); + const std::vector arg_attrs({ + Dot::Attr("shape", "box"), // + Dot::Attr("style", "rounded,filled,bold"), // + Dot::Attr("fontname", "Arial"), // + Dot::Attr("fillcolor", "#999999"), // + Dot::Attr("color", "#dddddd"), // + }); + + const std::vector param_attrs({ + Dot::Attr("shape", "box"), // + Dot::Attr("style", "rounded,filled,bold"), // + Dot::Attr("fontname", "Arial"), // + Dot::Attr("color", "#148b97"), // + Dot::Attr("fontcolor", "#ffffff"), // + }); + + const std::vector marked_op_attrs( + {Dot::Attr("style", "rounded,filled,bold"), Dot::Attr("shape", "box"), + Dot::Attr("fillcolor", "yellow")}); + const std::vector marked_var_attrs( + {Dot::Attr("style", "filled,rounded"), Dot::Attr("shape", "box"), + Dot::Attr("fillcolor", "yellow")}); + + auto marked_nodes = ConsumeMarkedNodes(graph); + // Create nodes + for (const Node* n : graph->Nodes()) { + std::string node_id = FormatName(n) + "(" + std::to_string(n->id()) + ")"; + if (n->IsOp()) { + decltype(op_attrs) attr = + marked_nodes.count(n) ? marked_op_attrs : op_attrs; + dot.AddNode(node_id, attr, node_id); + } else if (n->IsVar()) { + decltype(op_attrs)* attr; + if (marked_nodes.count(n)) { + attr = &marked_var_attrs; + } else if (const_cast(n)->Var() && + const_cast(n)->Var()->Persistable()) { + attr = ¶m_attrs; + } else { + attr = &arg_attrs; + } + + dot.AddNode(node_id, *attr, node_id); + } + node2dot[n] = node_id; + } + // Create edges + for (const Node* n : graph->Nodes()) { + const auto& src_id = node2dot.at(n); + for (auto* out : n->outputs) { + const auto& trg_id = node2dot.at(out); + dot.AddEdge(src_id, trg_id, {}); + } + } + + sout << dot.Build(); +} + +GraphVizPass::marked_nodes_t GraphVizPass::ConsumeMarkedNodes( + Graph* graph) const { + marked_nodes_t res; + if (graph->Has(kGraphvizMarkedNodeAttr)) { + auto& attr = graph->Get(kGraphvizMarkedNodeAttr); + res = attr; + attr.clear(); + } + return res; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(graph_viz_pass, paddle::framework::ir::GraphVizPass) + .RequirePassAttr(paddle::framework::ir::kGraphVizPath); diff --git a/paddle/fluid/framework/ir/graph_viz_pass.h b/paddle/fluid/framework/ir/graph_viz_pass.h new file mode 100644 index 00000000..7091aa6a --- /dev/null +++ b/paddle/fluid/framework/ir/graph_viz_pass.h @@ -0,0 +1,54 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +const char kGraphvizMarkedNodeAttr[] = "__graphviz__marked_node__"; + +class GraphVizPass : public Pass { + public: + using marked_nodes_t = std::unordered_set; + + protected: + void ApplyImpl(ir::Graph* graph) const override; + + // Tell whether there are any marked nodes in the graph. Consume the + // corresponding attribute. + marked_nodes_t ConsumeMarkedNodes(Graph* graph) const; +}; + +static GraphVizPass::marked_nodes_t& GetMarkedNodes(Graph* graph) { + if (!graph->Has(kGraphvizMarkedNodeAttr)) { + graph->Set(kGraphvizMarkedNodeAttr, new GraphVizPass::marked_nodes_t); + } + return graph->Get(kGraphvizMarkedNodeAttr); +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc new file mode 100644 index 00000000..a39901e6 --- /dev/null +++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc @@ -0,0 +1,82 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/identity_scale_op_clean_pass.h" +#include +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +void IdentityScaleOpCleanPass::ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init("identity_scale_op_clean", graph); + + // pre_op -> scale_in -> scale_op -> scale_out + // -> + // pre_op -> scale_out + GraphPatternDetector detector; + auto pre_op = detector.mutable_pattern()->NewNode("pre_op")->assert_is_op(); + auto scale_in = detector.mutable_pattern() + ->NewNode("scale_in") + ->assert_is_op_input("scale") + ->AsIntermediate(); + auto scale_op = detector.mutable_pattern() + ->NewNode("scale_fuse") + ->assert_is_op("scale") + ->assert_op_attr("scale", 1.) + ->assert_op_attr("bias", 0.); + auto scale_out = + detector.mutable_pattern() + ->NewNode("scale_out") + ->assert_is_op_output("scale") + // scale's output var should has only one consumer, or it can't be + // removed. + ->assert_more([](Node* x) { return x->outputs.size() == 1UL; }); + + pre_op->LinksTo({scale_in}); + scale_op->LinksFrom({scale_in}).LinksTo({scale_out}); + + GraphPatternDetector::handle_t handler = [&]( + const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { + Node* scale_op_var = subgraph.at(scale_op); + Node* scale_in_var = subgraph.at(scale_in); + Node* scale_out_var = subgraph.at(scale_out); + Node* pre_op_var = subgraph.at(pre_op); + // Link pre_op directly to scale_out + const std::string scale_in_name = scale_in_var->Name(); + const std::string scale_out_name = scale_out_var->Name(); + // Remove links in graph + GraphSafeRemoveNodes(graph, {scale_in_var, scale_op_var}); + // Modify proto message + auto* pre_op_desc = pre_op_var->Op(); + for (auto& parameter : *pre_op_desc->Proto()->mutable_outputs()) { + auto* arguments = parameter.mutable_arguments(); + auto it = std::find(arguments->begin(), arguments->end(), scale_in_name); + PADDLE_ENFORCE(it != arguments->end()); + *it = scale_out_name; + } + + IR_NODE_LINK_TO(pre_op_var, scale_out_var); + }; + + detector(graph, handler); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(identity_scale_op_clean_pass, + paddle::framework::ir::IdentityScaleOpCleanPass); diff --git a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h new file mode 100644 index 00000000..d66b4112 --- /dev/null +++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h @@ -0,0 +1,33 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class IdentityScaleOpCleanPass : public FusePassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; + + private: + virtual ~IdentityScaleOpCleanPass() = default; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc new file mode 100644 index 00000000..d7692411 --- /dev/null +++ b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc @@ -0,0 +1,67 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +class InferCleanGraphPass : public FusePassBase { + public: + virtual ~InferCleanGraphPass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init("original_graph", graph); + PADDLE_ENFORCE(graph); + + auto is_valid_node = [](Node* x) { + return x && IsControlDepVar(*x) && x->IsVar() && !x->Var(); + }; + + std::unordered_set invalid_nodes; + int valid_op = 0; + for (auto* node : graph->Nodes()) { + PADDLE_ENFORCE_NOT_NULL(node); + if (is_valid_node(node)) { + invalid_nodes.insert(node); + } else if (node->IsOp()) { + // Collect all the operators to help tracking number of operators. + ++valid_op; + } + } + + GraphSafeRemoveNodes(graph, invalid_nodes); + + AddStatis(valid_op); + } + + void CleanEdges(std::vector* nodes, + const std::unordered_set& to_remove) const { + auto it = std::remove_if(nodes->begin(), nodes->end(), + [&](Node* x) { return to_remove.count(x); }); + nodes->erase(it, nodes->end()); + } +}; + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(infer_clean_graph_pass, + paddle::framework::ir::InferCleanGraphPass); diff --git a/paddle/fluid/framework/ir/is_test_pass.cc b/paddle/fluid/framework/ir/is_test_pass.cc new file mode 100644 index 00000000..bf6fe999 --- /dev/null +++ b/paddle/fluid/framework/ir/is_test_pass.cc @@ -0,0 +1,55 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/is_test_pass.h" +#include +#include + +namespace paddle { +namespace framework { +namespace ir { + +void IsTestPass::ApplyImpl(ir::Graph* graph) const { + VLOG(3) << "Sets is_test attrbiute to true and if it is missing, inserts it " + "for activations and pooling."; + auto op_list = {"pool2d", "sigmoid", "logsigmoid", + "softshrink", "exp", "brelu", + "pow", "leaky_relu", "stanh", + "relu", "tanh", "tanh_shrink", + "sqrt", "abs", "ceil", + "elu", "floor", "cos", + "sin", "round", "reciprocal", + "hard_shrink", "hard_sigmoid", "relu6", + "soft_relu", "swish", "thresholded_relu", + "log", "square", "softplus", + "softsign"}; + for (const Node* n : graph->Nodes()) { + if (n->IsOp()) { + auto* op = n->Op(); + if (op->HasAttr("is_test") || op->HasProtoAttr("is_test")) { + op->SetAttr("is_test", true); + } else if (std::find(begin(op_list), end(op_list), op->Type()) != + end(op_list)) { + op->MutableAttrMap()->insert( + std::pair("is_test", true)); + } + } + } +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(is_test_pass, paddle::framework::ir::IsTestPass); diff --git a/paddle/fluid/framework/ir/is_test_pass.h b/paddle/fluid/framework/ir/is_test_pass.h new file mode 100644 index 00000000..80cedbf9 --- /dev/null +++ b/paddle/fluid/framework/ir/is_test_pass.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class IsTestPass : public Pass { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/is_test_pass_tester.cc b/paddle/fluid/framework/ir/is_test_pass_tester.cc new file mode 100644 index 00000000..3fa543c6 --- /dev/null +++ b/paddle/fluid/framework/ir/is_test_pass_tester.cc @@ -0,0 +1,120 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/is_test_pass.h" + +#include +#ifdef _WIN32 +#undef FALSE +#undef TRUE +#endif +namespace paddle { +namespace framework { +namespace ir { + +enum class ISTEST_STATE { FALSE, TRUE, UNSET }; + +void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, + const std::vector& inputs, + const std::vector& outputs, bool use_mkldnn = false, + ISTEST_STATE is_test = ISTEST_STATE::UNSET) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + op->SetAttr("name", name); + op->SetInput("X", inputs); + op->SetOutput("Out", outputs); + op->SetAttr("use_mkldnn", use_mkldnn); + if (is_test == ISTEST_STATE::UNSET) + op->MutableAttrMap()->erase("is_test"); + else if (is_test == ISTEST_STATE::FALSE) + op->SetAttr("is_test", false); + else + op->SetAttr("is_test", true); +} + +// a->pool2d->b +// b->relu->c +// c,weights1)->conv2d->d +// +// d->pool2d->e +// e->hard_sigmoid->f +// (f,weights2)->conv2d->g +// +// g->pool2d->h +// h->tanh->i +// (i,weights3)->conv2d->j +ProgramDesc BuildProgramDesc() { + ProgramDesc prog; + for (auto& v : + std::vector({"a", "b", "c", "d", "e", "f", "g", "h", "i", + "j", "weights1", "weights2", "weights3"})) { + auto* var = prog.MutableBlock(0)->Var(v); + var->SetType(proto::VarType::SELECTED_ROWS); + if (v == "weights1" || v == "weights2" || v == "weights3") { + var->SetPersistable(true); + } + } + + SetOp(&prog, "pool2d", "pooling1", std::vector({"a"}), + std::vector({"b"}), true, ISTEST_STATE::TRUE); + SetOp(&prog, "relu", "activation1", std::vector({"b"}), + std::vector({"c"}), true, ISTEST_STATE::TRUE); + SetOp(&prog, "conv2d", "conv1", std::vector({"c", "weights1"}), + std::vector({"d"}), true, ISTEST_STATE::TRUE); + + SetOp(&prog, "pool2d", "pooling2", std::vector({"d"}), + std::vector({"e"}), false, ISTEST_STATE::FALSE); + SetOp(&prog, "hard_sigmoid", "activation2", std::vector({"e"}), + std::vector({"f"}), false, ISTEST_STATE::FALSE); + SetOp(&prog, "conv2d", "conv2", std::vector({"f", "weights2"}), + std::vector({"g"}), false, ISTEST_STATE::FALSE); + + SetOp(&prog, "pool2d", "pooling3", std::vector({"g"}), + std::vector({"h"}), false, ISTEST_STATE::UNSET); + SetOp(&prog, "tanh", "activation3", std::vector({"h"}), + std::vector({"i"}), true, ISTEST_STATE::UNSET); + SetOp(&prog, "conv2d", "conv3", std::vector({"i", "weights3"}), + std::vector({"j"}), false, ISTEST_STATE::UNSET); + + return prog; +} + +TEST(IsTestPass, basic) { + auto prog = BuildProgramDesc(); + + std::unique_ptr graph(new ir::Graph(prog)); + + auto pass = PassRegistry::Instance().Get("is_test_pass"); + + graph.reset(pass->Apply(graph.release())); + + for (auto* node : graph->Nodes()) { + if (node->IsOp()) { + auto* op = node->Op(); + auto op_name = boost::get(op->GetAttr("name")); + if (op_name == "conv3") { + ASSERT_FALSE(op->HasAttr("is_test")); + } else { + ASSERT_TRUE(op->HasAttr("is_test")); + EXPECT_TRUE(boost::get(op->GetAttr("is_test"))); + } + } + } +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(is_test_pass); diff --git a/paddle/fluid/framework/ir/lock_free_optimize_pass.cc b/paddle/fluid/framework/ir/lock_free_optimize_pass.cc new file mode 100644 index 00000000..05d23961 --- /dev/null +++ b/paddle/fluid/framework/ir/lock_free_optimize_pass.cc @@ -0,0 +1,355 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/lock_free_optimize_pass.h" + +#include +#include +#include + +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +const char kSumGradOpName[] = "sum"; +// TODO(minqiyang): only support sgd at current time, please add +// other optimizers later. +const char kOptimizerType[] = "sgd"; + +void LockFreeOptimizePass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE(graph); + + // We could collect all weights' name from SGD, where + // W1 <- SGD(W0, Grad0) + std::unordered_set weight_var_set; + for (auto* node : graph->Nodes()) { + if (IsOpNamed(node, kOptimizerType)) { + auto& param_out_vars = node->Op()->Output("ParamOut"); + PADDLE_ENFORCE(param_out_vars.size() == 1u); + weight_var_set.insert(param_out_vars[0]); + } + } + + // find all grad's merge op via weight name, where + // Grad0 <- SUM(Grad1, Grad2, Grad3 ...) + std::unordered_set grad_sum_op_set; + for (ir::Node* node : graph->Nodes()) { + if (IsOpNamed(node, kSumGradOpName)) { + for (ir::Node* output : node->outputs) { + // strip the last grad suffix @GRAD + std::string var_name = output->Name(); + const std::string suffix(kGradVarSuffix); + if (var_name != suffix && var_name.size() > suffix.size() && + var_name.substr(var_name.size() - suffix.size()) == suffix) { + // if so then strip them off + var_name = var_name.substr(0, var_name.size() - suffix.size()); + if (weight_var_set.find(var_name) != weight_var_set.end()) { + grad_sum_op_set.insert(node); + break; + } + } + } + } + } + + // get the forward op and backward op pairs, where + // out <- forward(X, W) + // Grad1 <- backward(out, X') + // Grad0 <- SUM(Grad1, Grad2, Grad3 ...) + // W0 <- SGD(W1, Grad0) + for (ir::Node* node : grad_sum_op_set) { + for (ir::Node* merged_grad_var : node->outputs) { + // find the optimizers connected with sum op + if (IsVarNameEndsWith(merged_grad_var, kGradVarSuffix) && + merged_grad_var->outputs.size() == 1u) { + ir::Node* opt_node = merged_grad_var->outputs[0]; + VLOG(3) << "Found opt node " << opt_node->Name(); + + // find the backward op connected with sum op + for (ir::Node* unmerged_grad_var : node->inputs) { + if (IsVarNameContains(unmerged_grad_var, kGradVarSuffix) && + unmerged_grad_var->inputs.size() == 1u) { + ir::Node* backward_op = unmerged_grad_var->inputs[0]; + + VLOG(3) << "Found backward_op " << backward_op->Name(); + + // find the forward op related to the backward op + ir::Node* forward_op = + FindForwardOpViaBackwardOp(graph, backward_op); + + VLOG(3) << "Found forward_op " << forward_op->Name(); + + PADDLE_ENFORCE(forward_op); + + Node* new_optimizer_node = CreateNewSGDNode( + graph, forward_op, backward_op, node, opt_node); + + PADDLE_ENFORCE(new_optimizer_node); + } + } + } + } + } + + // Remove the sum_op and its' outputs and connected Optimizers + for (Node* sum_op : grad_sum_op_set) { + for (Node* sum_op_output : sum_op->outputs) { + for (Node* optimize_op : sum_op_output->outputs) { + if (optimize_op->NodeType() == Node::Type::kOperation && + optimize_op->Name() == kOptimizerType) { + VLOG(3) << "remove optimize_op: " << optimize_op->Name() << "_" + << optimize_op->id(); + graph->RemoveNode(optimize_op); + } + } + VLOG(3) << "remove sum_op_output: " << sum_op_output->Name() << "_" + << sum_op_output->id(); + graph->RemoveNode(sum_op_output); + } + VLOG(3) << "remove sum_op: " << sum_op->Name() << "_" << sum_op->id(); + graph->RemoveNode(sum_op); + } + + for (auto* node : graph->Nodes()) { + for (Node* output_node : node->outputs) { + if (output_node->Name() == "sgd") { + VLOG(3) << "Node link to SGD: " << node->Name() << "_" << node->id() + << " --> " << output_node->Name() << "_" << output_node->id(); + for (Node* input_node : node->inputs) { + VLOG(3) << "SGD Input link: " << input_node->Name() << "_" + << input_node->id() << " --> " << node->Name() << "_" + << node->id(); + } + } + } + } +} + +ir::Node* LockFreeOptimizePass::CreateNewSGDNode( + ir::Graph* graph, ir::Node* forward_node, ir::Node* backward_node, + ir::Node* grad_sum_node, ir::Node* optimize_node) const { + PADDLE_ENFORCE(graph); + PADDLE_ENFORCE(forward_node); + PADDLE_ENFORCE(backward_node); + PADDLE_ENFORCE(grad_sum_node); + PADDLE_ENFORCE(optimize_node); + + // find the grad var node between the grad sum node and backward_node + std::vector grad_vars = + FindConnectedNode(backward_node, grad_sum_node); + ir::Node* grad_node = nullptr; + for (ir::Node* node : grad_vars) { + if (!ir::IsControlDepVar(*node)) { + grad_node = node; + } + } + PADDLE_ENFORCE(grad_node); + + // create a new SGD node + OpDesc* old_desc = optimize_node->Op(); + // keep with the same block between new optimizer and the old one + OpDesc new_desc(*old_desc, old_desc->Block()); + new_desc.SetInput("Param", old_desc->Input("Param")); + new_desc.SetInput("LearningRate", old_desc->Input("LearningRate")); + new_desc.SetInput("Grad", std::vector({grad_node->Name()})); + new_desc.SetOutput("ParamOut", old_desc->Output("ParamOut")); + + std::vector op_role_vars = boost::get>( + new_desc.GetAttr(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName())); + // replace the second op role var, because the grad name was + // changed in new optimizer + op_role_vars.pop_back(); + op_role_vars.push_back(grad_node->Name()); + new_desc.SetAttr(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(), + op_role_vars); + new_desc.SetType(kOptimizerType); + + // set backward op's op role var, this will be used to + // set device_id in multi_device_pass + backward_node->Op()->SetAttr( + framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(), op_role_vars); + // backward_node->Op()->SetAttr( + // framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(), {}); + + // keep with the same output nodes between new optimizer and the + // old one + Node* sgd_node = graph->CreateOpNode(&new_desc); + + // change all outputs of the optimize_node to the new one + ReplaceAllDownstreamNode(optimize_node, sgd_node); + + // find connected node between forward node and optimize node + // and replace the optimize node to new sgd node + std::vector forward_opt_connected_nodes = + FindConnectedNode(forward_node, optimize_node); + for (ir::Node* node : forward_opt_connected_nodes) { + ReplaceUpstreamNode(node, optimize_node, sgd_node); + } + + // find connected node between backward node and optimize node + // and replace the optimize node to new sgd node + std::vector backward_opt_connected_nodes = + FindConnectedNode(backward_node, optimize_node); + for (ir::Node* node : backward_opt_connected_nodes) { + ReplaceUpstreamNode(node, optimize_node, sgd_node); + } + + // SGD must have only one param and LR in + PADDLE_ENFORCE(old_desc->Input("LearningRate").size() == 1u); + PADDLE_ENFORCE(old_desc->Input("Param").size() == 1u); + + // LR and weight nodes should be copied + for (Node* upstream_node : optimize_node->inputs) { + if (upstream_node->Name() == old_desc->Input("LearningRate")[0] || + upstream_node->Name() == old_desc->Input("Param")[0]) { + ReplaceUpstreamNode(upstream_node, optimize_node, sgd_node); + } + } + + VLOG(3) << "Create new opt node" << sgd_node->Name() << "_" << sgd_node->id(); + + return sgd_node; +} + +std::vector LockFreeOptimizePass::FindConnectedNode( + ir::Node* upstream_node, ir::Node* downstream_node) const { + std::vector result; + for (ir::Node* out_node : upstream_node->outputs) { + for (ir::Node* in_node : downstream_node->inputs) { + if (in_node == out_node) { + result.push_back(in_node); + } + } + } + + return result; +} + +void LockFreeOptimizePass::ReplaceUpstreamNode( + ir::Node* upstream_node, ir::Node* old_optimizer_node, + ir::Node* new_optimizer_node) const { + PADDLE_ENFORCE(upstream_node); + PADDLE_ENFORCE(old_optimizer_node); + PADDLE_ENFORCE(new_optimizer_node); + + // Remove the old_optimizer_node from upstream_node's outputs vector + auto& output_node_vec = upstream_node->outputs; + for (auto output_node_iter = output_node_vec.begin(); + output_node_iter != output_node_vec.end();) { + if (*output_node_iter == old_optimizer_node) { + output_node_vec.erase(output_node_iter); + break; + } else { + ++output_node_iter; + } + } + + // Add the new_optimizer_node to upstream_node's outputs vector + output_node_vec.emplace_back(new_optimizer_node); + new_optimizer_node->inputs.emplace_back(upstream_node); +} + +void LockFreeOptimizePass::ReplaceAllDownstreamNode( + ir::Node* old_optimizer_node, ir::Node* new_optimizer_node) const { + PADDLE_ENFORCE(old_optimizer_node); + PADDLE_ENFORCE(new_optimizer_node); + + for (ir::Node* downstream_node : old_optimizer_node->outputs) { + // Remove the old_optimizer_node from downstream_node's inputs vector + auto& input_node_vec = downstream_node->inputs; + for (auto input_node_iter = input_node_vec.begin(); + input_node_iter != input_node_vec.end();) { + if (*input_node_iter == old_optimizer_node) { + input_node_vec.erase(input_node_iter); + break; + } else { + ++input_node_iter; + } + } + + // Add the new_optimizer_node to downstream_node's inputs vector + input_node_vec.emplace_back(new_optimizer_node); + new_optimizer_node->outputs.emplace_back(downstream_node); + } +} + +ir::Node* LockFreeOptimizePass::FindForwardOpViaBackwardOp( + ir::Graph* graph, ir::Node* backward_node) const { + PADDLE_ENFORCE(graph); + PADDLE_ENFORCE(backward_node); + + // strip the suffix _grad of backward_node's name + std::string forward_op_name = backward_node->Name(); + const std::string suffix("_grad"); + if (forward_op_name != suffix && forward_op_name.size() > suffix.size() && + forward_op_name.substr(forward_op_name.size() - suffix.size()) == + suffix) { + // if so then strip them off + forward_op_name = + forward_op_name.substr(0, forward_op_name.size() - suffix.size()); + } else { + LOG(WARNING) << "Illegal backward node's name " << backward_node->Name() + << " id " << backward_node->id(); + + return nullptr; + } + + for (ir::Node* node : graph->Nodes()) { + if (node->Name() == forward_op_name) { + if (node->outputs.size() == 0u) { + // if forward_node has no output, then it has NO grad op + continue; + } + + // check whether all inputs of the backward_op that ends_with @GRAD + // comes from the output of forward_op is the input of the backward_op + bool is_related_forward_node = true; + for (ir::Node* backward_input : backward_node->inputs) { + if (IsVarNameEndsWith(backward_input, kGradVarSuffix)) { + bool meets_correct_output = false; + for (ir::Node* forward_output : node->outputs) { + if (forward_output->Name() + kGradVarSuffix == + backward_input->Name()) { + meets_correct_output = true; + break; + } + } + + if (!meets_correct_output) { + is_related_forward_node = false; + break; + } + } + } + + if (is_related_forward_node) { + return node; + } + } + } + + return nullptr; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(lock_free_optimize_pass, + paddle::framework::ir::LockFreeOptimizePass); diff --git a/paddle/fluid/framework/ir/lock_free_optimize_pass.h b/paddle/fluid/framework/ir/lock_free_optimize_pass.h new file mode 100644 index 00000000..9c923480 --- /dev/null +++ b/paddle/fluid/framework/ir/lock_free_optimize_pass.h @@ -0,0 +1,127 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class Node; + +/* +* Remove the sum op of all gradients of the backward op. +* And remove the dependecies of the optimizer related to the +* same backward op. +* +* Before this pass: +* +* forward_op1 forward_op2 +* | | +* grad_op1 grad_op2 +* \ / +* \ / +* sum_op +* | +* sgd_op +* +* After this pass: +* forward_op1 forward_op2 +* | | +* grad_op1 grad_op2 +* | | +* sgd_op1 sgd_op2 +* +* sgd_op1 and sgd_op2 will update the same weight which holds the same +* memory, so we could benefits from the acceleration +*/ +class LockFreeOptimizePass : public Pass { + public: + virtual ~LockFreeOptimizePass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; + + private: + // Create a new sgd node via current optimizer node + ir::Node* CreateNewSGDNode(ir::Graph* graph, ir::Node* forward_node, + ir::Node* backward_node, ir::Node* grad_sum_node, + ir::Node* optimize_node) const; + + // Replace the input weight's optimizers + void ReplaceUpstreamNode(ir::Node* upstream_node, + ir::Node* old_optimizer_node, + ir::Node* new_optimizer_node) const; + + // Replace the output weight's optimizers + void ReplaceAllDownstreamNode(ir::Node* old_optimizer_node, + ir::Node* new_optimizer_node) const; + + // Find all weight variables in graph + bool FindAllWeightVars(ir::Graph* graph) const; + + // Find the forward_op node via the backward_op node + ir::Node* FindForwardOpViaBackwardOp(ir::Graph* graph, + ir::Node* backward_node) const; + + std::vector FindConnectedNode(ir::Node* upstream_node, + ir::Node* downstream_node) const; + + inline bool IsOpNamed(ir::Node* node, const std::string& name) const { + PADDLE_ENFORCE(node); + + return node->NodeType() == Node::Type::kOperation && node->Name() == name; + } + + inline bool IsVarNamed(ir::Node* node, const std::string& name) const { + PADDLE_ENFORCE(node); + + return node->NodeType() == Node::Type::kVariable && node->Name() == name; + } + + inline bool IsVarNameEndsWith(ir::Node* node, const std::string& name) const { + PADDLE_ENFORCE(node); + + return node->NodeType() == Node::Type::kVariable && + boost::algorithm::ends_with(node->Name(), name); + } + + inline bool IsVarNameContains(ir::Node* node, const std::string& name) const { + PADDLE_ENFORCE(node); + + return node->NodeType() == Node::Type::kVariable && + node->Name().find(name) != std::string::npos; + } + + inline bool IsControlDepFrom(ir::Node* ctrl_dep_node, ir::Node* node) const { + PADDLE_ENFORCE(ctrl_dep_node); + PADDLE_ENFORCE(node); + + return IsControlDepVar(*ctrl_dep_node) && + ctrl_dep_node->inputs.size() >= 1u && + ctrl_dep_node->inputs[0] == node; + } +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt new file mode 100644 index 00000000..070ea9aa --- /dev/null +++ b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt @@ -0,0 +1,24 @@ +cc_library(op_graph_view SRCS op_graph_view.cc DEPS op_handle_base) +cc_library(while_op_eager_deletion_pass SRCS while_op_eager_deletion_pass.cc DEPS while_op_helper graph_helper pass computation_op_handle) +cc_library(recurrent_op_eager_deletion_pass SRCS recurrent_op_eager_deletion_pass.cc DEPS recurrent_op_helper graph_helper pass computation_op_handle) +cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle var_handle) +cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view reference_count_pass_helper) + +if(WITH_GPU) + cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper gpu_info) +else() + cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper cpu_info) +endif() + +cc_library(memory_optimize_pass SRCS memory_optimize_pass.cc DEPS memory_optimize_helper pass) +cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info) + +cc_test(memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry) + +cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle + eager_deletion_op_handle graph graph_helper pass while_op_eager_deletion_pass recurrent_op_eager_deletion_pass reference_count_pass_helper) +cc_library(record_skip_memory_opt_vars_pass SRCS record_skip_memory_opt_vars_pass.cc DEPS graph graph_helper) + +cc_library(memory_reuse_pass SRCS memory_reuse_pass.cc DEPS computation_op_handle reference_count_pass_helper share_tensor_buffer_op_handle multi_devices_helper graph pass) + +cc_library(buffer_shared_inplace_op_pass SRCS buffer_shared_inplace_op_pass.cc DEPS memory_reuse_pass) diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc new file mode 100644 index 00000000..b5d17ef2 --- /dev/null +++ b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc @@ -0,0 +1,160 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h" +#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" +#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h" +#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class BufferSharedInplaceOpPass : public MemoryReusePass { + protected: + std::string ReuseType() const override { return "inplace"; } + + void Run(Graph *graph) const override; +}; + +void BufferSharedInplaceOpPass::Run(Graph *graph) const { + const auto &last_live_ops = + Get>(kLastLiveOpsOfVars); + + bool use_cuda = Get(kUseCuda); + + // Step 1: Build a reverse map of last_live_ops + // i.e.: op -> vars + std::unordered_map> + candidate_ops; + for (auto &each_scope_ops : last_live_ops) { + for (auto &pair : each_scope_ops) { + // If variable has more than 1 last lived ops, this variable cannot + // be inplaced. + if (pair.second.size() != 1) { + continue; + } + + auto *op = *(pair.second.begin()); + const std::string &op_type = op->GetOp()->Type(); + const framework::OpDesc *op_desc = op->Node()->Op(); + PADDLE_ENFORCE_NOT_NULL(op_desc); + + auto &infer_inplace = OpInfoMap::Instance().Get(op_type).infer_inplace_; + if (!infer_inplace) { + continue; + } + + const std::string &var_name = pair.first; + auto in_nodes = this->FindNodesByName(var_name, op->Node()->inputs); + if (in_nodes.size() == 1) { + candidate_ops[op][var_name] = *in_nodes.begin(); + } + } + } + + // Step 2: Check which vars can be inplaced indeed + for (auto &op_vars_pair : candidate_ops) { + auto *op = op_vars_pair.first; + auto &vars = op_vars_pair.second; + + const std::string &op_type = op->GetOp()->Type(); + auto *op_desc = op->Node()->Op(); + + auto in_to_outs = + OpInfoMap::Instance().Get(op_type).infer_inplace_(*op_desc, use_cuda); + for (auto &pair : in_to_outs) { + auto &in_param = pair.first; + auto &in_args = op_desc->Input(in_param); + if (in_args.empty()) { + VLOG(4) << "Cannot inplace because Input(" << in_param + << ") is empty in " << op_type; + continue; + } + + auto &in_arg = in_args[0]; + auto iter = vars.find(in_arg); + if (iter == vars.end()) { + VLOG(4) << "Cannot inplace maybe because Input(" << in_param + << ")=" << in_arg << " is not lastly used in op " << op_type + << ", or it occurs multiple times in input or occurs in output"; + continue; + } + + ir::Node *in_node = iter->second; + + auto &out_param = pair.second; + auto &out_args = op_desc->Output(out_param); + + if (out_args.empty()) { + VLOG(4) << "Cannot inplace because Output(" << out_param + << ") is empty in " << op_type; + continue; + } + + auto &out_arg = out_args[0]; + auto out_nodes = this->FindNodesByName(out_arg, op->Node()->outputs); + if (out_nodes.size() != 1) { + VLOG(4) << "Cannot inplace because Output(" << out_param + << ")=" << out_arg << " occurs " << out_nodes.size() + << " time(s) in output of op " << op_type; + continue; + } + + auto *out_node = *out_nodes.begin(); + + auto &in_var_handle = in_node->Wrapper(); + auto &out_var_handle = out_node->Wrapper(); + + auto *in_var_handle_ptr = + dynamic_cast(&in_var_handle); + auto *out_var_handle_ptr = + dynamic_cast(&out_var_handle); + + if (in_var_handle_ptr == nullptr || out_var_handle_ptr == nullptr) { + continue; + } + + bool success = this->TryReuseVar(in_var_handle_ptr, out_var_handle_ptr); + if (success) { + VLOG(4) << "Inplace performed in op " << op_type << ": " + << in_var_handle_ptr->Name() << " -> " + << out_var_handle_ptr->Name() + << ". Debug String is: " << op->GetOp()->DebugString(); + } else { + VLOG(4) << "Inplace failed in op " << op_type << ": " + << in_var_handle_ptr->Name() << " -> " + << out_var_handle_ptr->Name(); + } + } + } +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(buffer_shared_inplace_pass, + paddle::framework::ir::BufferSharedInplaceOpPass) + .RequirePassAttr(paddle::framework::ir::kMemOptVarInfoMapList) + .RequirePassAttr(paddle::framework::ir::kLastLiveOpsOfVars) + .RequirePassAttr(paddle::framework::ir::kUseCuda); diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc new file mode 100644 index 00000000..452255a6 --- /dev/null +++ b/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc @@ -0,0 +1,292 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/eager_deletion_op_handle.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" + +namespace paddle { +namespace framework { +namespace ir { + +// op -> variables which can be deleted after op runs +using OpToVarNameSetMap = std::unordered_map>; + +static std::map> VarsGroupByScopeIdx( + const OpToVarNameSetMap &map) { + std::map> result; + for (auto &pair : map) { + size_t scope_idx = pair.first->GetScopeIdx(); + auto &var_set = result[scope_idx]; + for (auto &var : pair.second) { + var_set.insert(var); + } + } + return result; +} + +// Check whether the variable is LoDTensor based on static VarDesc info +static bool IsLoDTensor(VarDesc *var) { + return var->Proto()->type().type() == proto::VarType::LOD_TENSOR; +} + +// Get memory size of LoDTensor +static int64_t GetMemorySize( + const std::unordered_map> + &vars, + const std::string &var_name) { + auto *var_desc = TryGetLatestVarDesc(vars.at(var_name)); + PADDLE_ENFORCE_NOT_NULL(var_desc); + PADDLE_ENFORCE(IsLoDTensor(var_desc)); + auto dims = var_desc->GetShape(); + return SizeOfType(var_desc->GetDataType()) * + std::accumulate(dims.begin(), dims.end(), static_cast(1), + std::multiplies()); +} + +// Split all variables in the graph into LoDTensor and Non-LoDTensor (e.g. +// SelectedRows, LoDTensorArray) +// Since partial GC is based on static analysis of memory size of each variable +// So we should skip SelectedRows and LoDTensorArray here +static void SplitIntoLoDTensorAndNonLoDTensorVars( + const OpToVarNameSetMap &m, const details::GraphVars &vars, + OpToVarNameSetMap *lod_tensors, OpToVarNameSetMap *other_vars) { + lod_tensors->clear(); + other_vars->clear(); + + for (auto &op_vars_pair : m) { + for (auto var_name : op_vars_pair.second) { + auto *var_desc = TryGetLatestVarDesc( + vars[op_vars_pair.first->GetScopeIdx()].at(var_name)); + if (IsLoDTensor(var_desc)) { + (*lod_tensors)[op_vars_pair.first].insert(var_name); + } else { + (*other_vars)[op_vars_pair.first].insert(var_name); + } + } + } +} + +struct GCVarInfo { + GCVarInfo(const std::string &name, int64_t memory_size, + details::ComputationOpHandle *op, size_t scope_idx) + : name_(name), + memory_size_(memory_size), + op_(op), + scope_idx_(scope_idx) {} + + std::string name_; // variable name + int64_t memory_size_; // memory size + details::ComputationOpHandle + *op_; // op after which the variable could be deleted + size_t scope_idx_; // scope index where the variable locates + + int64_t AbsMemorySize() const { return std::abs(memory_size_); } +}; + +// Delete delete_lod_tensor_only is not used currently +static OpToVarNameSetMap ShrinkGCVars( + const OpToVarNameSetMap &m, const details::GraphVars &vars, + const std::vector &places, double fraction_of_memory_size, + bool delete_lod_tensor_only = false) { + // Do not perform gc when fraction_of_memory_size = 0 + if (fraction_of_memory_size <= 0.0) return {}; + + /** + * Step 1: Split all variables into LoDTensor and Non-LoDTensor. + * We can only calculate memory size of LoDTensors + */ + OpToVarNameSetMap lod_tensors, other_vars; + SplitIntoLoDTensorAndNonLoDTensorVars(m, vars, &lod_tensors, &other_vars); + + // Perform complete gc when fraction_of_memory_size >= 1 + if (fraction_of_memory_size >= 1.0) { + return delete_lod_tensor_only ? lod_tensors : m; + } + + /** + * Step 2: build GCVarInfos, and calculate total memory sizes of each device + */ + + // place -> variable info (name, memory size, place, scope_idx) + std::map> place_to_vars; + + // place -> total memory sizes + std::map place_to_size; + for (auto &op_vars_pair : lod_tensors) { + auto *op = op_vars_pair.first; + auto &var_names = op_vars_pair.second; + auto scope_idx = op->GetScopeIdx(); + auto &place = places[scope_idx]; + + for (auto &var_name : var_names) { + auto var_size = GetMemorySize(vars[scope_idx], var_name); + GCVarInfo var_info(var_name, var_size, op, scope_idx); + place_to_size[place] += var_info.AbsMemorySize(); + place_to_vars[place].emplace_back(std::move(var_info)); + } + } + + /** + * Step 3: sort GCVarInfos, and only delete the largest variables. + */ + OpToVarNameSetMap partial_vars; + for (auto &place_to_var_pair : place_to_vars) { + auto &place = place_to_var_pair.first; + auto &gc_vars = place_to_var_pair.second; + std::sort(gc_vars.begin(), gc_vars.end(), + [](const GCVarInfo &var1, const GCVarInfo &var2) { + return var1.AbsMemorySize() > var2.AbsMemorySize(); + }); + + int64_t accumulated_size = 0; + int64_t size_threshold = + static_cast(fraction_of_memory_size * place_to_size[place]); + for (size_t i = 0; i < gc_vars.size() && accumulated_size < size_threshold; + ++i) { + partial_vars[gc_vars[i].op_].insert(gc_vars[i].name_); + accumulated_size += gc_vars[i].AbsMemorySize(); + } + } + + /** + * Step 4: Combine other vars (SelectedRows, LoDTensorArray) + */ + if (!delete_lod_tensor_only) { + for (auto &op_vars_pair : other_vars) { + partial_vars[op_vars_pair.first].insert(op_vars_pair.second.begin(), + op_vars_pair.second.end()); + } + } + + return partial_vars; +} + +class EagerDeletionPass : public ir::Pass { + protected: + void ApplyImpl(ir::Graph *graph) const override; +}; + +void EagerDeletionPass::ApplyImpl(ir::Graph *graph) const { + auto &var_infos = Get(kMemOptVarInfoMapList); + + const auto &vars = graph->Get(details::kGraphVars); + + const auto &last_live_ops = + Get>(kLastLiveOpsOfVars); + const auto &gcs = Get(kGarbageCollector); + const auto &places = Get>(kAllPlaces); + + // a reverse map of last_live_ops + // i.e., last op --> variable names which can be deleted. + OpToVarNameSetMap op_vars_map; + for (auto &var_ops_map : last_live_ops) { + for (auto &var_ops_pair : var_ops_map) { + const std::string &var_name = var_ops_pair.first; + for (auto *op : var_ops_pair.second) { + op_vars_map[op].insert(var_name); + } + } + } + + double memory_fraction = framework::GetEagerDeletionMemoryFraction(); + + op_vars_map = ShrinkGCVars(op_vars_map, vars, places, memory_fraction); + + for (auto &pair : op_vars_map) { + auto *op = pair.first; + auto &var_names = pair.second; + + auto *eager_deletion_node = + graph->CreateEmptyNode("eager_deletion", ir::Node::Type::kOperation); + + std::unordered_set var_info; + for (auto &var_name : var_names) { + var_info.insert(var_infos[op->GetScopeIdx()].at(var_name).get()); + } + + auto *eager_deletion_op = new details::EagerDeletionOpHandle( + eager_deletion_node, op->GetScope(), op->GetPlace(), + std::move(var_info), gcs.at(places[op->GetScopeIdx()]).get()); + + auto it = std::find_if( + op->Outputs().begin(), op->Outputs().end(), + [](details::VarHandleBase *var) { + return dynamic_cast(var) != nullptr; + }); + + if (it != op->Outputs().end()) { + eager_deletion_op->AddInput(*it); + } else { + auto *dep_var = new details::DummyVarHandle(graph->CreateControlDepVar()); + graph->Get(details::kGraphDepVars) + .emplace(dep_var); + op->AddOutput(dep_var); + eager_deletion_op->AddInput(dep_var); + } + + auto *dummy_leaf = + new details::DummyVarHandle(graph->CreateControlDepVar()); + graph->Get(details::kGraphDepVars) + .emplace(dummy_leaf); + eager_deletion_op->AddOutput(dummy_leaf); + + eager_deletion_op->SetDeviceContext( + places[op->GetScopeIdx()], + platform::DeviceContextPool::Instance().Get(places[op->GetScopeIdx()])); + } + + VLOG(10) << "FLAGS_memory_fraction_of_eager_deletion = " << memory_fraction; + VLOG(10) << "Create " << op_vars_map.size() << " EagerDeletionOpHandle(s)"; + + if (VLOG_IS_ON(10)) { + auto vars_group_by_scope_idx = VarsGroupByScopeIdx(op_vars_map); + for (auto &pair : vars_group_by_scope_idx) { + VLOG(10) << "Scope " << pair.first << " has " << pair.second.size() + << " vars"; + } + } + + auto while_op_eager_deletion_pass = + ir::PassRegistry::Instance().Get("while_op_eager_deletion_pass"); + while_op_eager_deletion_pass->Apply(graph); + + auto recurrent_op_eager_deletion_pass = + ir::PassRegistry::Instance().Get("recurrent_op_eager_deletion_pass"); + recurrent_op_eager_deletion_pass->Apply(graph); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(eager_deletion_pass, paddle::framework::ir::EagerDeletionPass) + .RequirePassAttr(paddle::framework::ir::kMemOptVarInfoMapList) + .RequirePassAttr(paddle::framework::ir::kLastLiveOpsOfVars) + .RequirePassAttr(paddle::framework::ir::kAllPlaces) + .RequirePassAttr(paddle::framework::ir::kGarbageCollector); + +USE_PASS(while_op_eager_deletion_pass); +USE_PASS(recurrent_op_eager_deletion_pass); diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/inplace_op_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/inplace_op_pass.cc new file mode 100644 index 00000000..1935f5e3 --- /dev/null +++ b/paddle/fluid/framework/ir/memory_optimize_pass/inplace_op_pass.cc @@ -0,0 +1,487 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/op_info.h" + +// NOTE(dzhwinter): inplace means one op output variable reuse the input space. +// By our design, one operator only can read its input(const Variable), +// write its output(non-const Variable). If one operator is inplaced, means +// user have chance to write the space before reading happens. +// Especially when some optimize code writing style is applied. +// +// +// /* wrong case in operator */ +// /*In this case, a larger allocation is allocated, input content is lost*/ +// const Tensor* in = ctx.Input("In") +// Tensor* out = ctx.Output("Out"); +// auto* out_ptr = out->mutable_data(ctx.GetPlace()); +// out_ptr[0] = 0; // input contect is overwrited. + +// NOTE(dzhwinter): +// Only for backward compacity and stable. if enable_inplace_whitelist is turn +// on. +// only the ops in whitelist will be use inplace strategy. +// if not, all the op will be inplaced if it registered with InplaceClass +DEFINE_bool( + enable_inplace_whitelist, false, + "If this option turns on, only these op in whitelist can be inplaced." + "If it turns off, all of the running op can be candidate of inplaced op." + "Such as scale, elementwise_add" + "By default, it's turned off"); + +namespace paddle { +namespace framework { +namespace ir { + +// clang-format off +const std::string kInplacedOpWhiteList[] = { // NOLINT + "sigmoid", + "exp", + "relu", + "tanh", + "sqrt", + "ceil", + "floor", + "reciprocal", + "relu6", + "soft_relu", + "hard_sigmoid", + "batch_norm", + "batch_norm_grad", + "sum", + "sum_grad", + "scale", + "reshape", + "elementwise_add", + "elementwise_add_grad", +}; + +// FIXME(zjl): Shapes of in-out of some ops are exactly the same, +// but the static size during compiling time would be wrong. +// Use a flag to indicate such ops. Please fix me when found a better way. +static const std::unordered_set kSameShapeOpWhiteSet{ // NOLINT + "reshape2", "reshape2_grad" +}; +// clang-format on + +class InplacePass : public ir::Pass { + public: + InplacePass(); + + protected: + void ApplyImpl(ir::Graph *graph) const override; + + private: + // Collect vars that cannot be reused + // e.g.: subblock ops in/out, distributed ops in/out, op_role_var + void CollectSkipVars(ir::Graph *graph, + const std::vector &ops) const; + + // Check whether var_name should be skipped + bool IsSkipVar(const std::string &var_name) const; + + // Rename out with name of in, and guarantee that the graph is + // still a SSA graph + void RenameInOut(ir::Node *op, ir::Node *in, ir::Node *out) const; + + // Check whether var is the last version one in SSA graph + bool IsLastVersionVar(ir::Node *var) const; + + // Check whether var is the first version one in SSA graph + bool IsFirstVersionVar(ir::Node *var) const; + + // Check whether all `ops` is the preceding ops of `op` + bool CheckOpDeps(ir::Node *op, const std::vector &ops) const; + + // Find nodes whose names are equal to the given name + static std::unordered_set FindNodesByName( + const std::string &name, const std::vector &nodes); + + // Collect inputs and outputs of op_desc + static void CollectInputArgsOfOpDesc( + const OpDesc *op_desc, std::unordered_multiset *in_args); + + // Get all versions vars named var_name + std::vector *AllVersionVars(const std::string &var_name) const; + + private: + // SSA graph. var_name -> each version of vars + mutable std::map> ssa_map_; + + // Skip vars, including subblock ops in/out, distributed ops in/out, + // op_role_var + mutable std::unordered_set skip_vars_; + + // Op whitelist which should not peform inplace + // Only enabled when FLAGS_enable_inplace_whitelist is true. + mutable std::unordered_set whitelist_ops_; +}; + +InplacePass::InplacePass() { + if (FLAGS_enable_inplace_whitelist) { + for (auto &s : kInplacedOpWhiteList) { + whitelist_ops_.emplace(s); + } + } +} + +std::vector *InplacePass::AllVersionVars( + const std::string &var_name) const { + auto iter = ssa_map_.find(var_name); + PADDLE_ENFORCE(iter != ssa_map_.end(), "cannot find var %s in ssa graph", + var_name); + PADDLE_ENFORCE(!iter->second.empty(), "var %s is empty in ssa graph", + var_name); + return &(iter->second); +} + +bool InplacePass::IsSkipVar(const std::string &var_name) const { + return skip_vars_.count(var_name) > 0; +} + +bool InplacePass::IsFirstVersionVar(ir::Node *var) const { + return AllVersionVars(var->Name())->front() == var; +} + +bool InplacePass::IsLastVersionVar(ir::Node *var) const { + return AllVersionVars(var->Name())->back() == var; +} + +bool InplacePass::CheckOpDeps(ir::Node *op, + const std::vector &ops) const { + std::unordered_set other_ops(ops.begin(), ops.end()); + other_ops.erase(op); + if (other_ops.empty()) return true; + + // Traverse all preceding ops of op + std::queue queue; + std::unordered_set visited_ops; + queue.push(op); + visited_ops.insert(op); + + // Visit all preceding ops of `op`, and erase it from other_ops if it is + // inside other_ops. Return true only if other_ops is empty(), which means + // that all `ops` are preceding ops of `op`. + while (!queue.empty()) { + auto *cur_op = queue.front(); + queue.pop(); + + for (auto *in_var : cur_op->inputs) { + for (auto *in_op : in_var->inputs) { + if (visited_ops.count(in_op) != 0) { + continue; + } + + visited_ops.insert(in_op); + queue.push(in_op); + other_ops.erase(in_op); + if (other_ops.empty()) return true; + } + } + } + return false; +} + +void InplacePass::CollectSkipVars(ir::Graph *graph, + const std::vector &ops) const { + // 1. Collect op role vars + PADDLE_ENFORCE(graph->Has(kMemOptSkipVars), "Graph should have attr %s", + kMemOptSkipVars); + auto &mem_opt_whitelist = graph->Get(kMemOptSkipVars); + for (const auto &var : mem_opt_whitelist) { + skip_vars_.emplace(var); + } +} + +void InplacePass::RenameInOut(ir::Node *op, ir::Node *in_var, + ir::Node *out_var) const { + auto out_var_name = out_var->Name(); + auto in_var_name = in_var->Name(); + + auto &all_out_nodes = *AllVersionVars(out_var_name); + auto &all_in_nodes = *AllVersionVars(in_var_name); + + auto iter = std::find(all_out_nodes.begin(), all_out_nodes.end(), out_var); + PADDLE_ENFORCE(iter != all_out_nodes.end(), "Cannot find out var %s", + out_var_name); + + // The following codes are designed to guarantee that ssa_map_ is still + // an ssa graph after inplace is performed. + // Step 1: Rename the following versions of out_var as the name of in_var + // Step 2: Remove the following versions of out_var and append them to in_var + // Be careful that the inputs of input op of out_var should not be renamed, + // but outputs should be renamed. + auto original_iter = iter; + while (iter != all_out_nodes.end()) { + auto *node = *iter; + /* Step 1 */ + node->RenameVar(in_var_name); + if (iter != original_iter) { + for (auto *in : node->inputs) { + if (in->IsOp() && in->Op()) { + in->Op()->RenameOutput(out_var_name, in_var_name); + in->Op()->RenameInput(out_var_name, in_var_name); + in->Op()->Flush(); + } + } + } + + for (auto *out : node->outputs) { + if (out->IsOp() && out->Op()) { + out->Op()->RenameOutput(out_var_name, in_var_name); + out->Op()->RenameInput(out_var_name, in_var_name); + out->Op()->Flush(); + } + } + + /* Step 2 */ + all_in_nodes.emplace_back(node); + ++iter; + } + + /* Step 2 */ + all_out_nodes.erase(original_iter, all_out_nodes.end()); + + if (all_out_nodes.empty()) { + ssa_map_.erase(out_var_name); + } + op->Op()->RenameOutput(out_var_name, in_var_name); + op->Op()->Flush(); +} + +std::unordered_set InplacePass::FindNodesByName( + const std::string &name, const std::vector &nodes) { + std::unordered_set ret; + for (auto *node : nodes) { + if (node->Name() == name) { + ret.insert(node); + } + } + return ret; +} + +void InplacePass::CollectInputArgsOfOpDesc( + const OpDesc *op_desc, std::unordered_multiset *in_args) { + in_args->clear(); + for (auto &in_name : op_desc->InputArgumentNames()) { + in_args->insert(in_name); + } +} + +void InplacePass::ApplyImpl(ir::Graph *graph) const { + // Step 1: topo sort ops, collect skip vars + auto ops = ir::TopologySortOperations(*graph); + CollectSkipVars(graph, ops); + + // Step 2: build ssa var map + for (auto *op_node : ops) { + for (auto *in : op_node->inputs) { + PADDLE_ENFORCE(in->IsVar()); + // Only create a new var node when var first occurs in input of op. + if (ssa_map_.count(in->Name()) == 0) { + ssa_map_[in->Name()].emplace_back(in); + } + } + + // Always create a new var node for each output of op. + for (auto *out : op_node->outputs) { + PADDLE_ENFORCE(out->IsVar()); + ssa_map_[out->Name()].emplace_back(out); + } + } + + // Step 3: traverse ops and try inplace if possible + bool use_cuda = Get(kUseCuda); + VLOG(4) << "Inplace pass is applied when use_cuda = " + << (use_cuda ? "true" : "false"); + + for (auto *op_node : ops) { + PADDLE_ENFORCE_NOT_NULL(op_node->Op(), "op_desc is nullptr"); + + auto *op_desc = op_node->Op(); + auto op_type = op_desc->Type(); + + // Skip op inside whitelist + if (whitelist_ops_.count(op_type) > 0) { + continue; + } + + auto &infer_inplace = OpInfoMap::Instance().Get(op_type).infer_inplace_; + + if (!infer_inplace) { + continue; + } + + auto in_to_outs = infer_inplace(*op_desc, use_cuda); + if (in_to_outs.empty()) continue; + + std::unordered_multiset all_in_args; + CollectInputArgsOfOpDesc(op_desc, &all_in_args); + + for (auto &pair : in_to_outs) { + auto &in_param = pair.first; + auto &out_param = pair.second; + + auto &in_args = op_desc->Input(in_param); + auto &out_args = op_desc->Output(out_param); + + if (in_args.empty()) { + VLOG(4) << "Cannot inplace because Input(" << in_param + << ") is empty in " << op_type; + continue; + } + + if (out_args.empty()) { + VLOG(4) << "Cannot inplace because Output(" << out_param + << ") is empty in " << op_type; + continue; + } + + auto &in_arg = in_args[0]; + auto &out_arg = out_args[0]; + + if (IsSkipVar(in_arg)) { + VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg + << " is skipped in " << op_type; + continue; + } + + if (IsSkipVar(out_arg)) { + VLOG(4) << "Cannot inplace because Output(" << out_param + << ")=" << out_arg << " is skipped in " << op_type; + continue; + } + + if (in_arg == out_arg) { + VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg + << " is the same with Output(" << out_param << ")=" << out_arg + << " in " << op_type; + continue; + } + + size_t in_arg_occur_times = all_in_args.count(in_arg); + if (in_arg_occur_times > 1) { + VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg + << " occurs " << in_arg_occur_times << " times in input of op " + << op_type; + continue; + } + + auto in_nodes = FindNodesByName(in_arg, op_node->inputs); + PADDLE_ENFORCE(!in_nodes.empty(), "Input(%s)=%s cannot be found in op %s", + in_param, in_arg, op_type); + + if (in_nodes.size() > 1) { + VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg + << " occurs in other inputs of " << op_type; + continue; + } + + auto *in_node = *in_nodes.begin(); + + if (!NodeCanReused(in_node)) { + VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg + << " is not reusable in " << op_type; + continue; + } + + if (!IsLastVersionVar(in_node)) { + VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg + << " is not the last version in " << op_type; + continue; + } + + // If in_node is used as inputs of many ops, check whether all of that ops + // depends on op_node. If not, in_node cannot be inplaced. + if (in_node->outputs.size() > 1 && + !CheckOpDeps(op_node, in_node->outputs)) { + VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg + << " is not lastly used in " << op_type; + continue; + } + + auto out_nodes = FindNodesByName(out_arg, op_node->outputs); + PADDLE_ENFORCE(!out_nodes.empty(), + "Output(%s)=%s cannot be found in op %s", out_param, + out_arg, op_type); + + PADDLE_ENFORCE_EQ( + out_nodes.size(), 1, + "Wrong graph: Output(%s)=%s occurs in other outputs of op %s", + out_param, out_arg, op_type); + + if (!FindNodesByName(in_arg, op_node->outputs).empty()) { + VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg + << " occurs in output of op " << op_type; + continue; + } + + if (!FindNodesByName(out_arg, op_node->inputs).empty()) { + VLOG(4) << "Cannot inplace because Output(" << out_param + << ")=" << out_arg << " occurs in input of op " << op_type; + continue; + } + + auto *out_node = *out_nodes.begin(); + + if (!IsFirstVersionVar(out_node)) { + VLOG(4) << "Cannot inplace because Output(" << out_param + << ")=" << out_arg << " does not occur first in op " << op_type; + continue; + } + + if (!NodeCanReused(out_node)) { + VLOG(4) << "Cannot inplace because Output(" << out_param + << ")=" << out_arg << " is not reusable in " << op_type; + continue; + } + + if (in_node->Var()->GetType() != out_node->Var()->GetType()) { + VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg + << " is not the same type with " + << "Output(" << out_param << ")=" << out_arg << " in " + << op_type; + continue; + } + + if (NodeSize(*in_node->Var()) != NodeSize(*out_node->Var()) && + kSameShapeOpWhiteSet.count(op_desc->Type()) == 0) { + VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg + << " is not the same size with " + << "Output(" << out_param << ")=" << out_arg << " in " + << op_type; + continue; + } + + VLOG(4) << "Rename " << out_node->Name() << " with " << in_node->Name() + << " in " << op_type; + RenameInOut(op_node, in_node, out_node); + } + } +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(inplace_pass, paddle::framework::ir::InplacePass) + .RequirePassAttr(paddle::framework::ir::kUseCuda); diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h new file mode 100644 index 00000000..0ceac791 --- /dev/null +++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h @@ -0,0 +1,105 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +class MemOptVarInfo { + public: + MemOptVarInfo(const std::string &name, size_t ref_cnt) : name_(name) { + SetRefCnt(ref_cnt); + } + + bool DecreaseRefCnt() { + return ref_cnt_ == 1 || (runtime_ref_cnt_.fetch_sub(1) == 1); + } + + void ResetRuntimeRefCnt() { runtime_ref_cnt_ = ref_cnt_; } + + void SetRefCnt(size_t ref_cnt) { + PADDLE_ENFORCE_GE(ref_cnt, 1, + "Reference count must be larger than or equal to 1"); + ref_cnt_ = ref_cnt; + runtime_ref_cnt_ = ref_cnt; + } + + bool IsSkipped() const { return skipped_; } + + void SetSkip(bool skipped) { skipped_ = skipped; } + + const std::string &Name() const { return name_; } + + private: + std::string name_; + size_t ref_cnt_; + std::atomic runtime_ref_cnt_; + bool skipped_{false}; +}; + +using MemOptVarInfoMapList = std::vector< + std::unordered_map>>; + +class SkipMemOptVarsGuard { + public: + SkipMemOptVarsGuard(MemOptVarInfoMapList *list, + const std::vector &vars, + bool need_reset_ref_cnt) + : list_(list), need_reset_ref_cnt_(need_reset_ref_cnt) { + if (!list_) return; + + skip_vars_.reserve(vars.size() * list->size()); + for (auto &var : vars) { + for (auto &map : *list_) { + auto iter = map.find(var); + if (iter != map.end() && !iter->second->IsSkipped()) { + iter->second->SetSkip(true); + skip_vars_.emplace_back(iter->second.get()); + } + } + } + } + + ~SkipMemOptVarsGuard() { + for (auto *var : skip_vars_) { + var->SetSkip(false); + } + + if (list_ && need_reset_ref_cnt_) { + for (auto &map : *list_) { + for (auto &pair : map) { + pair.second->ResetRuntimeRefCnt(); + } + } + } + } + + private: + MemOptVarInfoMapList *list_; + bool need_reset_ref_cnt_; + std::vector skip_vars_; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.cc b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.cc new file mode 100644 index 00000000..0437de68 --- /dev/null +++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.cc @@ -0,0 +1,569 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/platform/cpu_info.h" + +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/gpu_info.h" +#endif // PADDLE_WITH_CUDA + +namespace paddle { +namespace framework { +namespace ir { +using paddle::framework::VarDesc; + +std::vector SortOpLikeDescOrder(const ir::Graph& graph) { + PADDLE_ENFORCE(graph.Has(details::kStaleProgramOpDescs), + "Graph has no attribute of kStaleProgramOpDescs."); + // 1. get op desc order + auto& op_descs = + graph.Get>(details::kStaleProgramOpDescs); + + // 2. topology sort order + auto nodes = graph.Nodes(); + std::deque ops; + FilterVariables(nodes, [&](ir::Node* op) { + if (op->IsOp() && op->Op() != nullptr) { + ops.emplace_back(op); + } + }); + std::unordered_map op_deps; + std::list ready_ops; + std::unordered_map> pending_ops; + + for (auto* op : ops) { + std::unordered_set preceding_op; + for (auto* in : op->inputs) { + if (in->inputs.empty()) continue; + PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp()); + preceding_op.emplace(in->inputs[0]); + pending_ops[in->inputs[0]].emplace(op); + } + op_deps[op] = preceding_op.size(); + if (preceding_op.empty()) { + ready_ops.emplace_back(op); + } + } + + // 3. generated op list based desc order and the topology order + std::vector ret; + std::list op_descs_list(op_descs.begin(), op_descs.end()); + + auto update_by_found_node = [&](ir::Node* found_node) { + for (auto* pending_op : pending_ops[found_node]) { + if (--op_deps[pending_op] == 0) { + ready_ops.emplace_back(pending_op); + } + } + ready_ops.remove(found_node); + ret.emplace_back(found_node); + }; + + while (!ready_ops.empty()) { + bool all_of_ready_op_unmatched = true; + for (auto it = op_descs_list.begin(); it != op_descs_list.end();) { + auto op_desc = *it; + ir::Node* found_node = nullptr; + for (auto* op : ready_ops) { + if (IsSameDesc(op->Op(), op_desc)) { + found_node = op; + break; + } + } + + // 3.1 op desc deleted by other pass + if (found_node == nullptr) { + ++it; + continue; + } else { + all_of_ready_op_unmatched = false; + it = op_descs_list.erase(it); + } + update_by_found_node(found_node); + } + + // 3.2 op descs are added by other pass + // preceding op non empty means some new op descs are + // created, but not contained in return node list. + // these new op desc may depend on each other. + std::list prev_ready_ops(ready_ops); + if (all_of_ready_op_unmatched) { + for (auto op : prev_ready_ops) { + update_by_found_node(op); + } + } + } + + PADDLE_ENFORCE(std::all_of( + op_deps.begin(), op_deps.end(), + [&](const std::pair& p) { return p.second == 0; })); + + return ret; +} + +size_t NodeSize(const VarDesc& node) { + auto shape = node.GetShape(); + int size = + std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); + size_t type_size = SizeOfType(node.GetDataType()); + return type_size * std::abs(size); +} + +size_t NodeSize(ir::Node* n) { return NodeSize(*(n->Var())); } + +std::string DebugStringImpl(VarDesc* var) { + std::stringstream ss; + ss << var->Name(); + ss << "["; + try { + auto shape = var->GetShape(); + for (size_t i = 0; i < shape.size(); ++i) { + if (i != shape.size() - 1) { + ss << shape[i] << ","; + } else { + ss << shape[i]; + } + } + ss << "]"; + } catch (...) { + ss << "Var has no VarDesc !!! Name:" << var->Name(); + } + return ss.str(); +} + +std::string DebugString(ir::Node* var) { + return DebugStringImpl(GetVarDesc(var)); +} + +// NOTE(dzh): based ir node, if a large node has been reused +// by a small size node, then next time it appear in pool, it will +// have the small size. Find the original node shap from blockdesc. +VarDesc* GetVarDesc(ir::Node* n) { + PADDLE_ENFORCE(n->IsVar() && !n->IsCtrlVar() && n->inputs.size() == 1); + return n->Var(); +} + +struct NodeComparator { + bool operator()(ir::Node* lhs, ir::Node* rhs) const { + if (lhs->Var()->GetType() != rhs->Var()->GetType()) return false; + auto* lhs_desc = GetVarDesc(lhs); + auto* rhs_desc = GetVarDesc(rhs); + // match data type + if (lhs_desc->GetDataType() != rhs_desc->GetDataType()) { + return false; + } + // match shape + auto lhs_shape = lhs_desc->GetShape(); + auto rhs_shape = rhs_desc->GetShape(); + if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) || + (lhs_shape[0] != -1 && rhs_shape[0] != -1)) { + return NodeSize(lhs) == NodeSize(rhs); + } else { + return false; + } + } +}; + +void OrderedSet::Insert(ir::Node* var) { + PADDLE_ENFORCE(var->IsVar() && !var->IsCtrlVar()); + if (mark_table_.count(var->Name()) != 0) { + mark_table_[var->Name()]->emplace_back(var); + return; + } + + auto* var_desc = var->Var(); + auto var_shape = var_desc->GetShape(); + int batch_size = static_cast(var_shape[0]); + + NodeComparator functor; + Iter it = nodes_.begin(); + while (it != nodes_.end()) { + auto& prev = it->front(); + auto* cache_desc = GetVarDesc(prev); + int cache_batch_size = cache_desc->GetShape()[0]; + if ((cache_batch_size == -1 && batch_size == -1) || + (cache_batch_size != -1 && batch_size != -1)) { + if (functor(prev, var)) { + ++it; + } else { + break; + } + } else if (cache_batch_size == -1 && batch_size != -1) { + ++it; + } else if (cache_batch_size != -1 && batch_size == -1) { + break; + } + } + + it = nodes_.insert(it, {var}); + mark_table_[var->Name()] = it; +} + +int OrderedSet::GetNodeIndexInPool(ir::Node* var) { + return std::distance(nodes_.begin(), mark_table_[var->Name()]); +} + +ir::Node* OrderedSet::FindBestFitNode(ir::Node* var) const { + ir::Node* found_node = nullptr; + NodeComparator functor; + + for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { + auto& candidate = it->front(); + if (functor(var, candidate)) { + found_node = candidate; + break; + } + } + return found_node; +} + +ir::Node* OrderedSet::FindNextBestFitNode(ir::Node* var, ir::Node* prev) const { + ir::Node* found_node = nullptr; + NodeComparator functor; + auto it = + std::find_if(nodes_.begin(), nodes_.end(), [&](const NodeVector& v) { + if (v.front() == prev) + return true; + else + return false; + }); + PADDLE_ENFORCE(it != nodes_.end(), "Not found previous in node list!"); + for (it = std::next(it); it != nodes_.end(); ++it) { + auto& candidate = it->front(); + if (functor(var, candidate)) { + found_node = candidate; + break; + } + } + return found_node; +} + +bool OrderedSet::Has(ir::Node* var) const { + if (mark_table_.count(var->Name())) { + auto& node_in_samename = mark_table_.at(var->Name()); + auto iter = + std::find_if(node_in_samename->begin(), node_in_samename->end(), + [&](ir::Node* n) { return n->Name() == var->Name(); }); + return iter != node_in_samename->end(); + } + return false; +} + +void OrderedSet::Erase(const std::string& var) { + PADDLE_ENFORCE(mark_table_.count(var)); + nodes_.erase(mark_table_[var]); + mark_table_.erase(var); +} + +void OrderedSet::Erase(ir::Node* var) { + PADDLE_ENFORCE(var != nullptr); + Erase(var->Name()); +} + +std::string OrderedSet::ToString() const { + std::stringstream ss; + for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { + for (auto& node : *it) { + ss << DebugString(node) << " "; + } + } + return ss.str(); +} + +bool NodeCanReused(ir::Node* node) { + // valid the node is a var node + // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad + if (node == nullptr || !node->IsVar() || node->IsCtrlVar() || + node->Name() == kEmptyVarName) + return false; + + bool flag = true; + // op output force generated in cpu, can not be reused. + for (auto* op : node->inputs) { + if (op->Op()->HasAttr("force_cpu")) { + flag &= framework::AttrReader(op->Op()->GetAttrMap()) + .Get("force_cpu") == 0; + } + } + // var desc validation. + flag &= NodeCanReused(*node->Var()); + return flag; +} + +int MinChunkSize() { + int size{0}; +#ifdef PADDLE_WITH_CUDA + size = platform::GpuMinChunkSize(); +#else + size = platform::CpuMinChunkSize(); +#endif // PADDLE_WITH_CUDA + return size; +} + +bool NodeCanReused(const VarDesc& node) { + auto type = node.GetType(); + // only these types holds bulk of gpu memory + // FIXME(liuwei1031) did not find good ways to test SELECTED_ROWS and + // LOD_TENSOR_ARRAY re-use logic, + // disable them in version 1.4 + // if (!(type == proto::VarType::LOD_TENSOR || + // type == proto::VarType::SELECTED_ROWS || + // type == proto::VarType::LOD_TENSOR_ARRAY)) { + // return false; + // } + if (type != proto::VarType::LOD_TENSOR) return false; + + // persistable variable is parameter + if (node.Persistable()) { + return false; + } + // shape < min_chunk_size is meaningless. + // further more, fetched loss always has size = 1 + // which should not be reused. + auto shape = node.GetShape(); + int size = std::abs( + std::accumulate(shape.begin(), shape.end(), 1, std::multiplies())); + if (shape.empty() || size < MinChunkSize()) { + return false; + } + return true; +} + +bool OpHasSubBlock(OpDesc* desc) { + const AttributeMap& attrs = desc->GetAttrMap(); + for (auto& attr : attrs) { + if (attr.second.type() == typeid(BlockDesc*) || // NOLINT + attr.second.type() == typeid(std::vector)) // NOLINT + return true; + } + return false; +} + +ControlFlowGraph::ControlFlowGraph(const ir::Graph& graph) { + ops_ = SortOpLikeDescOrder(graph); + ConnectNodes(); +} + +void ControlFlowGraph::BuildCFGGraph() { + // FIXME(dzh): same effect with ConnectNodes, but use the control + // link to build dependency graph, it goes wrong in transformer. + for (ir::Node* op : ops_) { + for (auto& input_var : op->inputs) { + if (!input_var->inputs.empty()) { + PADDLE_ENFORCE( + input_var->inputs.size() == 1 && input_var->inputs[0]->IsOp(), + "Preceding Op Node of Var Node must be unique"); + auto* pred_op = input_var->inputs[0]; + if (pred_op->Op() != nullptr) { + predecessors_[op].insert(pred_op); + successors_[pred_op].insert(op); + } + } + if (input_var->IsVar() && !input_var->IsCtrlVar()) { + uses_[op].insert(input_var->Name()); + } + } + for (auto& output_var : op->outputs) { + // output var may be used by many op + for (auto* succ_op : output_var->outputs) { + if (succ_op->Op() != nullptr) { + successors_[op].insert(succ_op); + predecessors_[succ_op].insert(op); + } + } + if (output_var->IsVar() && !output_var->IsCtrlVar()) { + defs_[op].insert(output_var->Name()); + } + } + } +} + +void ControlFlowGraph::ConnectNodes() { + for (size_t i = 0; i < ops_.size(); ++i) { + auto& op = ops_[i]; + try { + auto& next_op = ops_.at(i + 1); + successors_[op].insert(next_op); + predecessors_[next_op].insert(op); + } catch (...) { + // do nothing + } + + FilterVariables(op->inputs, + [&](ir::Node* var) { uses_[op].emplace(var->Name()); }); + + FilterVariables(op->outputs, + [&](ir::Node* var) { defs_[op].emplace(var->Name()); }); + } +} + +void ControlFlowGraph::LiveVariableAnalysis() { + // NOTE(dzh): variable liveless analysis (a.k.a reversed_ops algorithm) + // compute the liveness of for each variable though reversed_ops algorithm. + // It iterates the operators from end to begin, compute the live in/live out + // variable set for each op, then the diff between in/out will be used for + // the variable reuse. For detail refer to + // http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf + std::list work_list(ops_.rbegin(), ops_.rend()); + while (!work_list.empty()) { + ir::Node* op = work_list.front(); + work_list.pop_front(); + // get the live_in calculated before. Empty if first. + auto prev_live_in = std::move(live_in_[op]); + for (auto& s : successors_[op]) { + for (auto& var : live_in_[s]) { + live_out_[op].insert(var); + } + } + for (auto& var : uses_[op]) { + live_in_[op].insert(var); + } + for (auto& var : live_out_[op]) { + live_in_[op].insert(var); + } + for (auto& var : defs_[op]) { + if (uses_[op].count(var)) continue; + live_in_[op].erase(var); + } + + // If the live_in is not changed, then the liveness analysis of + // predecessors is completed. + // + // Otherwise, recalculate the predecessors liveness + if (live_in_[op] != prev_live_in) { + for (auto& pre : predecessors_[op]) { + work_list.push_back(pre); + } + } + } + + for (auto* op : ops_) { + unlived_vars_[op] = std::set(); + for (auto& var : this->LiveIn(op)) { + if (!this->LiveOut(op).count(var)) { + unlived_vars_[op].insert(var); + } + } + } +} + +void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node, + const std::string& new_node, + int begin_idx) { + std::vector need_update(ops_.size(), false); + // update graph from begin idx to the end + for (size_t i = begin_idx; i != ops_.size(); ++i) { + auto* op = ops_[i]; + if (uses_[op].find(old_node) != uses_[op].end()) { + uses_[op].erase(old_node); + uses_[op].insert(new_node); + } + if (defs_[op].find(old_node) != defs_[op].end()) { + defs_[op].erase(old_node); + defs_[op].insert(new_node); + } + if (live_in_[op].find(old_node) != live_in_[op].end()) { + live_in_[op].erase(old_node); + live_in_[op].insert(new_node); + need_update[i] = true; + } + if (live_out_[op].find(old_node) != live_out_[op].end()) { + live_out_[op].erase(old_node); + live_out_[op].insert(new_node); + need_update[i] = true; + } + } + + for (size_t i = begin_idx; i < ops_.size(); ++i) { + if (!need_update[i]) continue; + auto* op = ops_[i]; + for (auto& var : this->LiveIn(op)) { + if (!this->LiveOut(op).count(var)) { + unlived_vars_[op].insert(var); + } + } + } +} + +const std::set& ControlFlowGraph::LiveIn(ir::Node* op) const { + auto it = live_in_.find(op); + PADDLE_ENFORCE( + it != live_in_.end(), + string::Sprintf("Expect %s in live_in, but Not Found.", op->Name())); + return it->second; +} + +const std::set& ControlFlowGraph::LiveOut(ir::Node* op) const { + auto it = live_out_.find(op); + PADDLE_ENFORCE( + it != live_out_.end(), + string::Sprintf("Expect %s in live_out, but Not Found.", op->Name())); + return it->second; +} + +const std::set& ControlFlowGraph::Use(ir::Node* op) const { + auto it = uses_.find(op); + PADDLE_ENFORCE( + it != uses_.end(), + string::Sprintf("Expect %s in use, but Not Found.", op->Name())); + return it->second; +} + +const std::set& ControlFlowGraph::Unlived(ir::Node* op) const { + auto it = unlived_vars_.find(op); + PADDLE_ENFORCE( + it != unlived_vars_.end(), + string::Sprintf("Expect %s in unlived_set, but Not Found.", op->Name())); + return it->second; + return it->second; +} + +const std::vector& ControlFlowGraph::Ops() const { return ops_; } + +std::vector& ControlFlowGraph::Ops() { return ops_; } + +ir::Node* ControlFlowGraph::GetNodeByName(const std::string& name, + ir::Node* op) const { + // in ssa-graph, different version nodes have same name, + // this function get the latest version var before target op + // It may return nullptr, such as data node. + ir::Node* found_node = nullptr; + for (auto* node : ops_) { + if (node == op) break; + for (auto& output : node->outputs) { + PADDLE_ENFORCE((output != nullptr && output->IsVar()), + "Output is empty!"); + if (output->Var() && output->Name() == name) { + found_node = output; + } + } + } + return found_node; +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h new file mode 100644 index 00000000..cf9f4ef4 --- /dev/null +++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h @@ -0,0 +1,189 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace framework { +namespace ir { + +/// this attribute is used to avoid some core variables removed/reused +/// in memory optimize related passes +constexpr char kMemOptSkipVars[] = "@MEM_OPT_SKIP_VARS@"; +typedef std::unordered_set MemOptSkipVars; + +constexpr char kUseCuda[] = "use_cuda"; + +std::vector SortOpLikeDescOrder(const ir::Graph& graph); + +// NOTE(dzh): A ordered set for node reuse in memory optimize. +// the orderedset sort node in ascend order(by node bytes size). +// in fluid, -1 means the batch_size, which is determined in runtime. +// So the reuse happens between nodes who's batch_size both are -1 +// simultaneously or not. +// +// sort rule: +// rule 0 : smaller node ranking in front. +// rule 1 : batch_size equal -1 ranking in the front than the node not. +// +// For example, +// node0[-1, 1] node1[-1, 1, 1], node2[1,1], node3[1,1024], .. + +class OrderedSet { + public: + // nodes with same name exists in pool. + using NodeVector = std::vector; + using Iter = typename std::list::iterator; + using ConstIter = typename std::list::const_iterator; + + void Insert(ir::Node* var); + void Erase(ir::Node* var); + void Erase(const std::string& var); + bool Has(ir::Node* var) const; + void Clear() { + mark_table_.clear(); + nodes_.clear(); + } + // find the bestfit shape node block with var. + ir::Node* FindBestFitNode(ir::Node* var) const; + ir::Node* FindNextBestFitNode(ir::Node* var, ir::Node* prev) const; + // map store non-const iterator, can not promise const + int GetNodeIndexInPool(ir::Node* var); + // pool all node to string + std::string ToString() const; + + Iter begin() { return nodes_.begin(); } + Iter end() { return nodes_.end(); } + ConstIter begin() const { return nodes_.begin(); } + ConstIter end() const { return nodes_.end(); } + + size_t size() const { return nodes_.size(); } + + private: + // for searching. + std::unordered_map mark_table_; + // node pool + std::list nodes_; +}; + +class ControlFlowGraph { + public: + ControlFlowGraph() = default; + // IR Graph + explicit ControlFlowGraph(const ir::Graph& graph); + + void LiveVariableAnalysis(); + + void RenameVarInCFGGraph(const std::string& old_node, + const std::string& new_node, int begin_idx); + + const std::set& LiveIn(ir::Node* op) const; + const std::set& LiveOut(ir::Node* op) const; + const std::set& Use(ir::Node* op) const; + const std::set& Unlived(ir::Node* op) const; + const std::vector& Ops() const; + std::vector& Ops(); + + // for ssa-graph nodes + ir::Node* GetNodeByName(const std::string& name, ir::Node* op) const; + + private: + void BuildCFGGraph(); + void ConnectNodes(); + + using NodeListMap = std::unordered_map>; + using VarSetMap = std::map>; + // successors ops use the output variables. + NodeListMap successors_; + // predecessors ops generated input variables. + NodeListMap predecessors_; + // variables lived before run current op. + VarSetMap live_in_; + // variables lived after run current op. + VarSetMap live_out_; + VarSetMap uses_; // op inputs + VarSetMap defs_; // op outputs + std::unordered_map> unlived_vars_; + + std::vector ops_; // op sequence by topology sort +}; + +// valid a tensor can be reuse or not +bool NodeCanReused(ir::Node* node); + +// valid a tensor can be reuse or not. +bool NodeCanReused(const VarDesc& node); + +// check op has subblock or not +bool OpHasSubBlock(OpDesc* desc); + +// node memory size in bytes +size_t NodeSize(ir::Node* n); + +// node memory size in bytes +size_t NodeSize(const VarDesc&); + +std::string DebugString(ir::Node* var); + +VarDesc* GetVarDesc(ir::Node* n); + +static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) { + return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() && + op1->Outputs() == op2->Outputs(); +} + +template +class FilterVariableImpl { + public: + void operator()(const Container& nodes, Callback callback) { + for (auto* node : nodes) { + callback(node); + } + } +}; + +// filter var node for op->inputs/outputs +template +class FilterVariableImpl, Callback> { + public: + void operator()(const std::vector& nodes, Callback callback) { + for (auto* var : nodes) { + if (var->IsVar() && !var->IsCtrlVar()) { + callback(var); + } + } + } +}; + +template +void FilterVariables(const Container& nodes, Callback callback) { + FilterVariableImpl()(nodes, callback); +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper_test.cc b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper_test.cc new file mode 100644 index 00000000..d38facd0 --- /dev/null +++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper_test.cc @@ -0,0 +1,525 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include "glog/logging.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/details/graph_test_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { +namespace ir { + +TEST(OrderedSet, Normal) { + OrderedSet pool; + std::vector> nodes; + + // clang-format off + std::vector> shapes = {{-1, 10}, + {-1, 20}, + {1, 2}, + {5, 2}, + {10, 20}, + {-1, 2, 5}, + {-1, 1, 5}, + {-1, 1}}; + // clang-format on + const int COUNT = shapes.size(); + ProgramDesc prog; + BlockDesc* block_desc = prog.MutableBlock(0); + auto* op_desc = block_desc->AppendOp(); + op_desc->SetType("dummy"); + std::unique_ptr op = ir::CreateNodeForTest(op_desc); + + for (int i = 0; i < COUNT; ++i) { + auto desc = block_desc->Var(std::to_string(i)); + desc->SetShape(shapes[i]); + std::unique_ptr node = ir::CreateNodeForTest(desc); + node->inputs.emplace_back(op.get()); + nodes.emplace_back(std::move(node)); + } + + // Insert + for (auto& node : nodes) { + pool.Insert(node.get()); + } + + // Has/size + ASSERT_EQ(pool.size(), shapes.size()); + for (auto& node : nodes) { + ASSERT_TRUE(pool.Has(node.get())); + } + + // assert its order and interface. + std::cout << pool.ToString() << std::endl; + pool.Erase(nodes.front().get()); + std::cout << pool.ToString() << std::endl; + + ASSERT_EQ(pool.size(), static_cast(COUNT - 1)); + ASSERT_EQ(pool.GetNodeIndexInPool(nodes.back().get()), 0); + + { + auto v1 = block_desc->Var("11"); + v1->SetShape({-1, 256, 56, 56}); + std::unique_ptr node1 = ir::CreateNodeForTest(v1); + node1->inputs.emplace_back(op.get()); + auto* cache = pool.FindBestFitNode(node1.get()); + ASSERT_EQ(cache, nullptr); + } + { + auto v2 = block_desc->Var("12"); + v2->SetShape({-1, 2, 5}); + std::unique_ptr node1 = ir::CreateNodeForTest(v2); + node1->inputs.emplace_back(op.get()); + auto* cache = pool.FindBestFitNode(node1.get()); + ASSERT_EQ(pool.GetNodeIndexInPool(cache), 2); // match 6:[-1,2,5] + } + { + auto v3 = block_desc->Var("13"); + v3->SetShape({2, 5}); + std::unique_ptr node1 = ir::CreateNodeForTest(v3); + node1->inputs.emplace_back(op.get()); + auto* cache = pool.FindBestFitNode(node1.get()); + ASSERT_EQ(pool.GetNodeIndexInPool(cache), 5); // match 4:[5,2] + } +} + +TEST(OrderedSet, FindBestFitNode) { + OrderedSet pool; + std::vector> nodes; + ProgramDesc prog; + BlockDesc* block_desc = prog.MutableBlock(0); + auto* op_desc = block_desc->AppendOp(); + op_desc->SetType("dummy"); + std::unique_ptr op = ir::CreateNodeForTest(op_desc); + + { + auto desc = block_desc->Var("a"); + desc->SetShape({128, 128}); + std::unique_ptr node = ir::CreateNodeForTest(desc); + node->inputs.emplace_back(op.get()); + nodes.emplace_back(std::move(node)); + } + { + auto desc = block_desc->Var("b"); + desc->SetShape({128, 129}); + std::unique_ptr node = ir::CreateNodeForTest(desc); + node->inputs.emplace_back(op.get()); + nodes.emplace_back(std::move(node)); + } + { + auto desc = block_desc->Var("c"); + desc->SetShape({128, 128}); + std::unique_ptr node = ir::CreateNodeForTest(desc); + node->inputs.emplace_back(op.get()); + nodes.emplace_back(std::move(node)); + } + + for (auto& node : nodes) { + pool.Insert(node.get()); + } + + auto* n = nodes[0].get(); + auto* cache = pool.FindBestFitNode(n); + ASSERT_TRUE(cache->Name() == "a" || cache->Name() == "c"); + auto* cache_b = pool.FindNextBestFitNode(n, cache); + ASSERT_TRUE(cache_b->Name() != cache->Name()); + ASSERT_TRUE(cache_b->Name() == "a" || cache_b->Name() == "c"); + cache = pool.FindNextBestFitNode(n, cache_b); + ASSERT_TRUE(cache == nullptr); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_OPERATOR(sum, paddle::framework::DummyOp, + paddle::framework::SumOpMaker, + paddle::framework::DummyVarTypeInference); +REGISTER_OPERATOR(assign, paddle::framework::DummyOp, + paddle::framework::AssignOpMaker, + paddle::framework::DummyVarTypeInference); +REGISTER_OPERATOR(dummy, paddle::framework::DummyOp, + paddle::framework::SumOpMaker, + paddle::framework::DummyVarTypeInference); +/* + https://en.wikipedia.org/wiki/Live_variable_analysis + Create a customed classical dependency graph, left row is the instruction + number. + 1. a = 1 + 2. b = a + 3. c = a + 4. d = b + c + 5. e = d + + a--------+ + | | + b c + | | + d--------+ + | + e + Then analysis these variable's liveness range + */ + +namespace paddle { +namespace framework { +namespace ir { + +inline static ProgramDesc FillProgramDesc() { + ProgramDesc prog; + prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("d")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("e")->SetType(proto::VarType::LOD_TENSOR); + { + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("assign"); + op->SetInput("X", {"a"}); + op->SetOutput("Out", {"b"}); + } + { + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("assign"); + op->SetInput("X", {"a"}); + op->SetOutput("Out", {"c"}); + } + { + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("sum"); + op->SetInput("X", {"b", "c"}); + op->SetOutput("Out", {"d"}); + } + { + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("assign"); + op->SetInput("X", {"d"}); + op->SetOutput("Out", {"e"}); + } + return prog; +} + +TEST(CFGGraph, IRGraph) { + // prepare ir graph + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + + ControlFlowGraph cfg(graph); + cfg.LiveVariableAnalysis(); + + // test assign op + ASSERT_TRUE((std::set{"a"} == cfg.LiveIn(cfg.Ops()[0]))); + ASSERT_TRUE((std::set{"a", "b"} == cfg.LiveOut(cfg.Ops()[0]))); + + // test assign op + ASSERT_TRUE((std::set{"a", "b"} == cfg.LiveIn(cfg.Ops()[1]))); + ASSERT_TRUE((std::set{"b", "c"} == cfg.LiveOut(cfg.Ops()[1]))); + + // test sum op + ASSERT_TRUE((std::set{"b", "c"} == cfg.LiveIn(cfg.Ops()[2]))); + ASSERT_TRUE((std::set{"d"} == cfg.LiveOut(cfg.Ops()[2]))); + + // test assign op + ASSERT_TRUE((std::set{"d"} == cfg.LiveIn(cfg.Ops()[3]))); + ASSERT_TRUE((std::set{} == cfg.LiveOut(cfg.Ops()[3]))); +} + +// 1. normal test +TEST(SortOpLikeDescOrder, NormalTest) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + + auto nodes = SortOpLikeDescOrder(graph); + auto op_descs = prog.Block(0).AllOps(); + for (size_t i = 0; i < nodes.size(); ++i) { + auto node = nodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +// 2. remove some op_desc +TEST(SortOpLikeDescOrder, RemoveOpDesc) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + auto nodes = graph.Nodes(); + auto op_descs = prog.Block(0).AllOps(); + ir::Node* found_node = nullptr; + for (auto node : nodes) { + if (node->IsOp() && node->outputs.back()->Name() == "e") { + found_node = node; + break; + } + } + PADDLE_ENFORCE(found_node != nullptr); + for (auto it = op_descs.begin(); it != op_descs.end();) { + if (IsSameDesc(*it, found_node->Op())) { + it = op_descs.erase(it); + } else { + ++it; + } + } + + auto find_node_in_graph = [&](std::string s) { + ir::Node* ret = nullptr; + for (auto n : graph.Nodes()) { + if (n->Name() == s) { + ret = n; + break; + } + } + PADDLE_ENFORCE(ret != nullptr); + return ret; + }; + + ir::Node* e = find_node_in_graph("e"); + ir::Node* d = find_node_in_graph("d"); + std::remove(d->outputs.begin(), d->outputs.end(), found_node); + graph.RemoveNode(found_node); + graph.RemoveNode(e); + + // other node keeps the same order + auto remain_nodes = SortOpLikeDescOrder(graph); + for (size_t i = 0; i < remain_nodes.size(); ++i) { + auto node = remain_nodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +// 3. add some op_desc +TEST(SortOpLikeDescOrder, AddOpDesc) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + + auto find_node_in_graph = [&](std::string s) { + ir::Node* ret = nullptr; + for (auto n : graph.Nodes()) { + if (n->Name() == s) { + ret = n; + break; + } + } + PADDLE_ENFORCE(ret != nullptr); + return ret; + }; + + // cached desc different with real one + // mimic the intermidiete pass modify the programdesc. + std::vector op_descs = graph.OriginProgram().Block(0).AllOps(); + + auto op = prog.MutableBlock(0)->AppendOp(); + prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); + op->SetType("sum"); + op->SetInput("X", {"b", "c"}); + op->SetOutput("Out", {"d1"}); + ir::Node* node = graph.CreateOpNode(op); + ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); + ir::Node* b = find_node_in_graph("b"); + ir::Node* c = find_node_in_graph("c"); + node->outputs.emplace_back(d1); + node->inputs.emplace_back(b); + node->inputs.emplace_back(c); + d1->inputs.emplace_back(node); + b->outputs.emplace_back(node); + c->outputs.emplace_back(node); + op_descs.insert(op_descs.begin() + 4, op); + + auto nodes = SortOpLikeDescOrder(graph); + + for (size_t i = 0; i < nodes.size(); ++i) { + auto node = nodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +// 4. add and delete some op_desc +TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + + auto find_node_in_graph = [&](std::string s) { + ir::Node* ret = nullptr; + for (auto n : graph.Nodes()) { + if (n->Name() == s) { + ret = n; + break; + } + } + PADDLE_ENFORCE(ret != nullptr); + return ret; + }; + + std::vector op_descs = graph.OriginProgram().Block(0).AllOps(); + + // remove sum node + ir::Node* found_node = nullptr; + auto nodes = graph.Nodes(); + for (auto node : nodes) { + if (node->Name() == "sum") { + found_node = node; + break; + } + } + PADDLE_ENFORCE(found_node != nullptr); + for (auto it = op_descs.begin(); it != op_descs.end();) { + if (IsSameDesc(*it, found_node->Op())) { + it = op_descs.erase(it); + } else { + ++it; + } + } + { + ir::Node* d = find_node_in_graph("d"); + ir::Node* c = find_node_in_graph("c"); + ir::Node* e = find_node_in_graph("e"); + std::remove(d->outputs.begin(), d->outputs.end(), found_node); + std::remove(c->outputs.begin(), c->outputs.end(), found_node); + ir::Node* pending_op = found_node->outputs[0]->outputs[0]; + graph.RemoveNode(e); + graph.RemoveNode(pending_op); + graph.RemoveNode(found_node); + } + + // add node + auto op = prog.MutableBlock(0)->AppendOp(); + prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); + op->SetType("sum"); + op->SetInput("X", {"b", "c"}); + op->SetOutput("Out", {"d1"}); + { + ir::Node* node = graph.CreateOpNode(op); + ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); + ir::Node* b = find_node_in_graph("b"); + ir::Node* c = find_node_in_graph("c"); + node->outputs.emplace_back(d1); + node->inputs.emplace_back(b); + node->inputs.emplace_back(c); + b->outputs.emplace_back(node); + c->outputs.emplace_back(node); + } + op_descs.insert(op_descs.begin() + 2, op); + + // check the order + auto mynodes = SortOpLikeDescOrder(graph); + for (size_t i = 0; i < mynodes.size(); ++i) { + auto node = mynodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +// 5. add and replace some op_desc inplace. +TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + std::vector op_descs = graph.OriginProgram().Block(0).AllOps(); + + auto find_node_in_graph = [&](std::string s) { + ir::Node* ret = nullptr; + for (auto n : graph.Nodes()) { + if (n->Name() == s) { + ret = n; + break; + } + } + PADDLE_ENFORCE(ret != nullptr); + return ret; + }; + + // add node + auto op = prog.MutableBlock(0)->AppendOp(); + prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); + op->SetType("sum"); + op->SetInput("X", {"b", "c"}); + op->SetOutput("Out", {"d1"}); + { + ir::Node* node = graph.CreateOpNode(op); + ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); + ir::Node* b = find_node_in_graph("b"); + ir::Node* c = find_node_in_graph("c"); + node->outputs.emplace_back(d1); + node->inputs.emplace_back(b); + node->inputs.emplace_back(c); + d1->inputs.emplace_back(node); + b->outputs.emplace_back(node); + c->outputs.emplace_back(node); + } + + op_descs.emplace_back(op); + + // replace op_desc inplace + auto nodes = graph.Nodes(); + ir::Node* found_node = nullptr; + for (auto node : nodes) { + if (node->IsOp() && node->Op() && node->Name() == "assign") { + if (node->outputs.size() == 1 && node->outputs[0]->Name() == "e") { + found_node = node; + break; + } + } + } + { + ir::Node* d = find_node_in_graph("d"); + ir::Node* e = find_node_in_graph("e"); + std::remove(d->outputs.begin(), d->outputs.end(), found_node); + std::remove(e->inputs.begin(), e->inputs.end(), found_node); + graph.RemoveNode(found_node); + } + op_descs.erase(op_descs.begin() + 3); + + auto replace_op = prog.MutableBlock(0)->AppendOp(); + replace_op->SetType("sum"); + replace_op->SetInput("X", {"d", "d1"}); + replace_op->SetOutput("Out", {"e"}); + { + ir::Node* sum2 = graph.CreateOpNode(replace_op); + ir::Node* e = find_node_in_graph("e"); + ir::Node* d = find_node_in_graph("d"); + ir::Node* d1 = find_node_in_graph("d1"); + sum2->inputs.emplace_back(d); + sum2->inputs.emplace_back(d1); + sum2->outputs.emplace_back(e); + e->inputs.emplace_back(sum2); + d->outputs.emplace_back(sum2); + d1->outputs.emplace_back(sum2); + } + + op_descs.emplace_back(replace_op); + // compare op order + auto graph_nodes = SortOpLikeDescOrder(graph); + for (size_t i = 0; i < graph_nodes.size(); ++i) { + auto node = graph_nodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.cc new file mode 100644 index 00000000..af3fbb28 --- /dev/null +++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.cc @@ -0,0 +1,224 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "gflags/gflags.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +void MemoryOptimizePass::ApplyImpl(ir::Graph* graph) const { + CollectSkipVarsSet(graph); + + cfg_.reset(new ControlFlowGraph(*graph)); + cfg_->LiveVariableAnalysis(); + InitSSAGraphNodes(); + + int reuse_id = 0; + for (size_t idx = 0; idx < cfg_->Ops().size(); ++idx) { + auto& op = cfg_->Ops()[idx]; + auto* op_desc = op->Op(); + // some op in graph has no op desc + if (op_desc == nullptr) continue; + + for (auto& var : op->outputs) { + if (var->IsVar() && !var->IsCtrlVar() && skip_set_.count(var->Name())) { + VLOG(3) << "Skip set contains variable of " << var->Name() + << "disable reuse on it. skipped"; + continue; + } + if (NodeCanReused(var) && cfg_->Use(op).count(var->Name()) == 0) { + ir::Node* cache = pool_.FindBestFitNode(var); + while (cache != nullptr && var->Name() == cache->Name()) { + VLOG(3) << "The same cache variable is cascade reused. " + << cache->Name() << " is re-filled to the pool after " + << "the reused op is finished. Current op can not " + << "replace it again. Skip this candidate."; + cache = pool_.FindNextBestFitNode(var, cache); + } + + if (cache != nullptr) { + int node_idx_in_pool = pool_.GetNodeIndexInPool(cache); + VLOG(3) << string::Sprintf( + "!!! %s, %s => %s, cache idx %d, pool size %d", + std::to_string(reuse_id++), DebugString(var), DebugString(cache), + node_idx_in_pool, static_cast(pool_.size())); + // NOTE(dzhwinter): update the ProgramDesc/IR Graph + // and the CFG Graph on the fly. + // + // IR Graph define the dependence relationship between nodes. + // + // ProgramDesc defines the input/output vars. Its used in + // CreateOp, CreateVar when running happens. + // + // CFG Graph store the liveness information, when reuse happens + // we also need to update the variable liveness. + const std::string var_name = var->Name(); + const std::string cache_name = cache->Name(); + + cfg_->RenameVarInCFGGraph(var_name, cache_name, idx); + RenameVarInGraphDesc(var_name, cache_name, idx); + RenameVarInGraphNode(var_name, cache_name, idx, graph); + pool_.Erase(cache_name); + } + } + } + // fill the pool + for (auto& var : cfg_->Unlived(op)) { + ir::Node* var_node = cfg_->GetNodeByName(var, op); + if (var_node == nullptr || var_node->IsCtrlVar()) continue; + if (NodeCanReused(var_node) && !pool_.Has(var_node)) { + pool_.Insert(var_node); + } + } + } + graph->ResolveHazard(var_nodes_); +} + +void MemoryOptimizePass::CollectSkipVarsSet(ir::Graph* graph) const { + // fill skip_set_ + PADDLE_ENFORCE(graph->Has(kMemOptSkipVars)); + auto& mem_opt_whitelist = graph->Get(kMemOptSkipVars); + for (const auto& var : mem_opt_whitelist) { + skip_set_.emplace(var); + } +} + +void MemoryOptimizePass::RenameVarInGraphDesc(const std::string& var, + const std::string& cache_var, + size_t idx) const { + for (size_t i = idx; i < cfg_->Ops().size(); ++i) { + auto* op = cfg_->Ops()[i]; + PADDLE_ENFORCE(op->IsOp() && op->Op()); + auto* op_desc = op->Op(); + op_desc->RenameInput(var, cache_var); + op_desc->RenameOutput(var, cache_var); + if (op_desc->Block() != nullptr) { + op_desc->Block()->RemoveVar(var); + } else { + LOG(WARNING) << "op " << op->Name() << " not know its block." + << "Is the op_desc created without block pointer? " + << "Can not find " << var << " in Block(0)"; + } + op_desc->Flush(); + } +} + +void MemoryOptimizePass::InitSSAGraphNodes() const { + std::unordered_map> all_vars; + if (var_nodes_.empty()) { + for (auto* op : cfg_->Ops()) { + for (auto* node : op->inputs) { + if (all_vars[node->Name()].count(node) == 0) { + all_vars[node->Name()].emplace(node); + var_nodes_[node->Name()].emplace_back(node); + } + } + for (auto* node : op->outputs) { + if (all_vars[node->Name()].count(node) == 0) { + all_vars[node->Name()].emplace(node); + var_nodes_[node->Name()].emplace_back(node); + } + } + } + } +} + +void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var, + const std::string& cache_var, + size_t idx, + ir::Graph* graph) const { + // if replace happens, we need to create a newer version cache_var + // but use the same dims/data_type with var. + PADDLE_ENFORCE(var_nodes_[var].size() >= 1 && + var_nodes_[var].at(0)->Var() != nullptr); + std::unique_ptr var_desc(new VarDesc(*var_nodes_[var].at(0)->Var())); + var_desc->SetName(cache_var); + + for (size_t i = idx; i < cfg_->Ops().size(); ++i) { + auto* op = cfg_->Ops()[i]; + + // redirect the input to the latest version of cache_var + for (auto* node : op->inputs) { + if (node->Name() == var) { + ir::Node* cache_node = var_nodes_[cache_var].back(); + + // swap node to cache_node + cache_node->outputs.insert(cache_node->outputs.end(), + node->outputs.begin(), node->outputs.end()); + PADDLE_ENFORCE(node->inputs.size() == 1 && node->inputs[0]->IsOp()); + auto* prev_op = node->inputs[0]; + std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node, + cache_node); + for (auto* next_op : node->outputs) { + std::replace(next_op->inputs.begin(), next_op->inputs.end(), node, + cache_node); + } + + // erase unused node + auto& nodes = var_nodes_.at(var); + nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end()); + graph->RemoveNode(node); + } + } + + // if we need to rename the output, + // always create a newer version of cache_var + for (auto* node : op->outputs) { + if (node->Name() == var) { + ir::Node* cache_node = graph->CreateVarNode(var_desc.get()); + var_nodes_[cache_var].emplace_back(cache_node); + + // swap node to cache node + cache_node->outputs.insert(cache_node->outputs.end(), + node->outputs.begin(), node->outputs.end()); + cache_node->inputs.emplace_back(op); + std::replace(op->outputs.begin(), op->outputs.end(), node, cache_node); + for (auto* next_op : node->outputs) { + std::replace(next_op->inputs.begin(), next_op->inputs.end(), node, + cache_node); + } + + // erase unused node + auto& nodes = var_nodes_.at(var); + nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end()); + graph->RemoveNode(node); + } + } + } +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(memory_optimize_pass, paddle::framework::ir::MemoryOptimizePass) + .RequireGraphAttr(paddle::framework::details::kStaleProgramOpDescs); diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.h b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.h new file mode 100644 index 00000000..eef289ef --- /dev/null +++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.h @@ -0,0 +1,72 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class MemoryOptimizePass : public ir::Pass { + protected: + void ApplyImpl(ir::Graph* graph) const override; + // fill the variable map(var_nodes) by version. + void InitSSAGraphNodes() const; + + private: + // update program descs + void RenameVarInGraphDesc(const std::string& var, + const std::string& cache_var, size_t idx) const; + // update ir nodes + void RenameVarInGraphNode(const std::string& var, + const std::string& cache_var, size_t idx, + ir::Graph* graph) const; + + void SubGraphOptimize(OpDesc* op_desc) const; + // 1. scan op with subblock and collect the output/input vars. + // while, while_grad, conditional_block + // 2. scan distributed ops and collect the output/input vars + // 3. op_role_vars + void CollectSkipVarsSet(ir::Graph* graph) const; + + private: + // Reuse Node Pool, Owned. + mutable OrderedSet pool_; + // controlflow Graph + mutable std::unique_ptr cfg_; + // skip set + mutable std::unordered_set skip_set_; + // var nodes + mutable std::map> var_nodes_; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc new file mode 100644 index 00000000..9a8e2530 --- /dev/null +++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc @@ -0,0 +1,286 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h" +#include +#include +#include +#include +#include + +namespace paddle { +namespace framework { +namespace ir { + +// Each ShareTensorBufferOpHandle should only have one pending +// ComputationOpHandle +static details::ComputationOpHandle *GetUniquePendingComputationOpHandle( + details::ShareTensorBufferOpHandle *share_tensor_op) { + details::ComputationOpHandle *result_op = nullptr; + for (Node *out_var : share_tensor_op->Node()->outputs) { + for (Node *pending_op : out_var->outputs) { + auto &op = pending_op->Wrapper(); + auto *compute_op = dynamic_cast(&op); + PADDLE_ENFORCE_NOT_NULL(compute_op); + + if (result_op == nullptr) { + result_op = compute_op; + } else { + PADDLE_ENFORCE_EQ(result_op, compute_op); + } + } + } + + PADDLE_ENFORCE_NOT_NULL(result_op); + return result_op; +} + +void MemoryReusePass::ApplyImpl(Graph *graph) const { + graph_ = graph; + all_vars_ = &(graph_->Get(details::kGraphVars)); + var_infos_ = &(Get(kMemOptVarInfoMapList)); + last_live_ops_of_vars_ = + &(Get>(kLastLiveOpsOfVars)); + + reused_var_names_.resize(all_vars_->size()); + var_descs_.resize(all_vars_->size()); + + // Collect the existing ShareTensorBufferOpHandles. + // This is because (1) we want to reuse the existing + // ShareTensorBufferOpHandles to avoid inserting too many ops; + // (2) what is more important, a variable cannot be reused + // by two different variables, which may cause wrong calculation + // results. We have to know which variables have been reused. + CollectShareTensorBufferOpHandles(); + CollectReusedVars(); + Run(graph); + + std::map op_num; + for (auto &pair : ops_) { + ++op_num[pair.first->GetScopeIdx()]; + } + + for (auto &pair : op_num) { + VLOG(2) << "Create " << pair.second + << " ShareTensorBufferOpHandles in Scope " << pair.first; + } +} + +bool MemoryReusePass::TryReuseVar(details::VarHandle *in_var, + details::VarHandle *out_var) const { + auto *op = + dynamic_cast(out_var->GeneratedOp()); + PADDLE_ENFORCE_NOT_NULL(op); + if (IsVarsReusable(in_var, out_var)) { + AddReuseVar(op, in_var, out_var); + return true; + } else { + return false; + } +} + +std::unordered_set MemoryReusePass::FindNodesByName( + const std::string &name, const std::vector &nodes) const { + std::unordered_set ret; + for (auto *node : nodes) { + if (node->Name() == name) { + ret.insert(node); + } + } + return ret; +} + +VarDesc *MemoryReusePass::GetVarDesc(details::VarHandle *var) const { + auto iter = var_descs_[var->scope_idx()].find(var->Name()); + if (iter == var_descs_[var->scope_idx()].end()) { + PADDLE_ENFORCE((*all_vars_)[var->scope_idx()].count(var->Name()), + "Variable %s not found", var->Name()); + auto *desc = + TryGetLatestVarDesc((*all_vars_)[var->scope_idx()].at(var->Name())); + PADDLE_ENFORCE_NOT_NULL(desc); + var_descs_[var->scope_idx()].emplace(var->Name(), desc); + return desc; + } else { + return iter->second; + } +} + +void MemoryReusePass::CollectShareTensorBufferOpHandles() const { + auto all_ops = FilterByNodeWrapper(*graph_); + for (auto *op : all_ops) { + auto *share_buffer_op = + dynamic_cast(op); + if (share_buffer_op != nullptr) { + auto *compute_op = GetUniquePendingComputationOpHandle(share_buffer_op); + PADDLE_ENFORCE(ops_.count(compute_op) == 0); + ops_.emplace(compute_op, share_buffer_op); + } + } +} + +void MemoryReusePass::CollectReusedVars() const { + for (auto &pair : ops_) { + auto reused_vars = pair.second->ReusedVarSet(); + reused_var_names_[pair.first->GetScopeIdx()].insert(reused_vars.begin(), + reused_vars.end()); + } +} + +bool MemoryReusePass::IsVarAlreadyReused(details::VarHandle *var) const { + return reused_var_names_[var->scope_idx()].count(var->Name()) > 0; +} + +details::ShareTensorBufferOpHandle * +MemoryReusePass::InsertShareTensorBufferOpHandleToGraph( + details::ComputationOpHandle *op) const { + auto *buffer_share_node = + graph_->CreateEmptyNode("buffer_share", ir::Node::Type::kOperation); + + auto *buffer_share_op = new details::ShareTensorBufferOpHandle( + buffer_share_node, op->GetScope(), op->GetScopeIdx(), op->GetOp()->Type(), + {}, {}); + + buffer_share_op->SetDeviceContext( + op->GetPlace(), + platform::DeviceContextPool::Instance().Get(op->GetPlace())); + + // Inputs of `buffer_share_op` should be all inputs of `op` + for (auto *in_var : op->Inputs()) { + buffer_share_op->AddInput(in_var); + } + + // Add a dep_var to resolve write-after-write data hazard between + // `buffer_share_op` and `op`. + auto *dep_var = new details::DummyVarHandle(graph_->CreateControlDepVar()); + graph_->Get(details::kGraphDepVars).emplace(dep_var); + op->AddInput(dep_var); + buffer_share_op->AddOutput(dep_var); + + ops_.emplace(op, buffer_share_op); + return buffer_share_op; +} + +bool MemoryReusePass::IsVarsReusable(details::VarHandle *in_var, + details::VarHandle *out_var) const { + const auto in_name = in_var->Name(); + const auto out_name = out_var->Name(); + + if (in_name == out_name) { + return false; + } + + if (in_name == kEmptyVarName || out_name == kEmptyVarName) { + return false; + } + + if (IsVarAlreadyReused(in_var)) { + return false; + } + + // out_var must be the first version!!! + auto out_var_iter = (*all_vars_)[out_var->scope_idx()].find(out_name); + PADDLE_ENFORCE(out_var_iter != (*all_vars_)[out_var->scope_idx()].end() && + !out_var_iter->second.empty(), + "Cannot find variable %s", out_name); + + if (out_var_iter->second[0] != out_var) { + return false; + } + + const VarDesc *in_var_desc = GetVarDesc(in_var); + const VarDesc *out_var_desc = GetVarDesc(out_var); + + if (in_var_desc->Persistable() || out_var_desc->Persistable()) { + return false; + } + + if (in_var_desc->GetType() != proto::VarType::LOD_TENSOR || + out_var_desc->GetType() != proto::VarType::LOD_TENSOR) { + return false; + } + + if (!FindNodesByName(in_name, out_var->GeneratedOp()->Node()->outputs) + .empty()) { + return false; + } + + if (!FindNodesByName(out_name, out_var->GeneratedOp()->Node()->inputs) + .empty()) { + return false; + } + + auto all_input_args = + out_var->GeneratedOp()->Node()->Op()->InputArgumentNames(); + if (std::count(all_input_args.begin(), all_input_args.end(), in_name) > 1) { + return false; + } + + return true; +} + +void MemoryReusePass::AddReuseVar(details::ComputationOpHandle *op, + details::VarHandle *in_var, + details::VarHandle *out_var) const { + PADDLE_ENFORCE((*var_infos_)[op->GetScopeIdx()].count(in_var->Name()) > 0, + "%s does not in mem-opt var infos", in_var->Name()); + + if (ops_.count(op) == 0) { + InsertShareTensorBufferOpHandleToGraph(op); + } + + auto *share_buffer_op = ops_[op]; + + auto &all_input_vars = share_buffer_op->Inputs(); + bool has_input = std::find(all_input_vars.begin(), all_input_vars.end(), + in_var) != all_input_vars.end(); + + if (!has_input) { + share_buffer_op->AddInput(in_var); + } + + share_buffer_op->Add( + (*var_infos_)[op->GetScopeIdx()].at(in_var->Name()).get(), + out_var->Name()); + reused_var_names_[op->GetScopeIdx()].insert(in_var->Name()); + + UpdateLastLiveOpOfVar(op, in_var, out_var); +} + +// 1. Set last living op of in_var to be any last living op of out_var +// 2. Set reference count of in_var to be 1 +void MemoryReusePass::UpdateLastLiveOpOfVar(details::ComputationOpHandle *op, + details::VarHandle *in_var, + details::VarHandle *out_var) const { + size_t scope_idx = op->GetScopeIdx(); + auto out_var_op_iter = + (*last_live_ops_of_vars_)[scope_idx].find(out_var->Name()); + PADDLE_ENFORCE(out_var_op_iter != (*last_live_ops_of_vars_)[scope_idx].end(), + "Cannot find variable %s", out_var->Name()); + PADDLE_ENFORCE(!out_var_op_iter->second.empty()); + + auto &last_live_ops_of_in_var = + (*last_live_ops_of_vars_)[scope_idx][in_var->Name()]; + last_live_ops_of_in_var.clear(); + last_live_ops_of_in_var.insert(*(out_var_op_iter->second.begin())); + + auto in_var_info_iter = (*var_infos_)[scope_idx].find(in_var->Name()); + PADDLE_ENFORCE(in_var_info_iter != (*var_infos_)[scope_idx].end(), + "Cannot find variable %s", in_var->Name()); + + in_var_info_iter->second->SetRefCnt(1); +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h new file mode 100644 index 00000000..f706b48e --- /dev/null +++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h @@ -0,0 +1,128 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" +#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * MemoryReusePass is the base class of InplacePass and MemoryOptimizePass. + * + * Unlike the legacy Python API fluid.memory_optimize() which changes + * variable names in the program/graph, MemoryReusePass inserts + * ShareTensorBufferOpHandle into the graph. It is because if we use the + * way of changing variable names: + * + * 1. There are so many corner cases we should skip. For example, (1) variables + * that relates to send/recv ops cannot be renamed (otherwise, pserver + * and trainer cannot find the matching variables), (2) ins/outs of ops + * containing sub-blocks cannot be optimized, (3) variables inside + * op_role_vars cannot be renamed. + * + * 2. It is very difficult to avoid reusing variables that users want to fetch. + * This is because the memory-optimize passes/transpiler runs before users + * fetch, i.e., exe.run(...). We cannot know what users want to fetch in the + * future. As a result, we have to set var.persistable = True before + * applying memory-optimize passes/transpiler, which is rather ugly and not + * friendly to users. + * + * 3. Dim and LoD of the reused variable would be changed, which may result + * in potential errors in InferShape stage of the following ops. What's + * more, it makes that we cannot use the information from + * NoNeedBufferVarsInference. + * + * Considering the drawbacks of the former renaming strategy, we design a + * novel memory-optimize pass to fix these issues. Whether in-place is + * performed can be decided during run-time. ShareTensorBufferOpHandle + * would only share tensor buffers between in/out, never rename variable, + * and not change dim and LoD of variable. If users want to fetch a certain + * variable, we can skip in-place during run-time. + * + * The only concern on speed performance may be: there are too many + * ShareTensorBufferOpHandles in the graph. This can be avoided by moving + * tensor buffer sharing in each ComputationOpHandle::Run() method. We need + * a pass to clean all ShareTensorBufferOpHandles and move sharing to + * ComputationOpHandle::Run() in the future. + */ +class MemoryReusePass : public Pass { + protected: + void ApplyImpl(Graph *graph) const final; + + virtual void Run(Graph *graph) const = 0; + + virtual std::string ReuseType() const = 0; + + bool TryReuseVar(details::VarHandle *in_var, + details::VarHandle *out_var) const; + + std::unordered_set FindNodesByName( + const std::string &name, const std::vector &nodes) const; + + size_t ScopeNum() const { return all_vars_->size(); } + + private: + VarDesc *GetVarDesc(details::VarHandle *var) const; + + bool IsVarsReusable(details::VarHandle *in_var, + details::VarHandle *out_var) const; + + bool IsVarAlreadyReused(details::VarHandle *var) const; + + details::ShareTensorBufferOpHandle *InsertShareTensorBufferOpHandleToGraph( + details::ComputationOpHandle *op) const; + + void CollectShareTensorBufferOpHandles() const; + + void CollectReusedVars() const; + + void AddReuseVar(details::ComputationOpHandle *op, details::VarHandle *in_var, + details::VarHandle *out_var) const; + + void UpdateLastLiveOpOfVar(details::ComputationOpHandle *op, + details::VarHandle *in_var, + details::VarHandle *out_var) const; + + private: + mutable Graph *graph_; + mutable details::GraphVars *all_vars_; + mutable MemOptVarInfoMapList *var_infos_; + mutable std::vector *last_live_ops_of_vars_; + + mutable std::unordered_map + ops_; + + mutable std::vector> reused_var_names_; + + mutable std::vector> var_descs_; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc b/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc new file mode 100644 index 00000000..6b7249b1 --- /dev/null +++ b/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc @@ -0,0 +1,71 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h" +#include +#include + +namespace paddle { +namespace framework { +namespace ir { + +OpGraphView::OpGraphView(const std::vector &ops) { + Build(ops); +} + +void OpGraphView::Build(const std::vector &ops) { + preceding_ops_.clear(); + pending_ops_.clear(); + for (auto &op : ops) { + preceding_ops_[op]; + pending_ops_[op]; + for (auto &var : op->Outputs()) { + for (auto &pending_op : var->PendingOps()) { + preceding_ops_[pending_op].insert(op); + pending_ops_[op].insert(pending_op); + } + } + } + PADDLE_ENFORCE( + preceding_ops_.size() == ops.size() && pending_ops_.size() == ops.size(), + "There are duplicate ops in graph."); +} + +std::unordered_set OpGraphView::AllOps() const { + std::unordered_set ret; + ret.reserve(preceding_ops_.size()); + for (auto &pair : preceding_ops_) { + ret.insert(pair.first); + } + return ret; +} + +bool OpGraphView::HasOp(details::OpHandleBase *op) const { + return preceding_ops_.count(op) != 0; +} + +void OpGraphView::EnforceHasOp(details::OpHandleBase *op) const { + PADDLE_ENFORCE(HasOp(op), "Cannot find op %s in OpGraphView", + op == nullptr ? "nullptr" : op->DebugString()); +} + +const std::unordered_set &OpGraphView::PendingOps( + details::OpHandleBase *op) const { + EnforceHasOp(op); + return pending_ops_.at(op); +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h b/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h new file mode 100644 index 00000000..afd29091 --- /dev/null +++ b/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h @@ -0,0 +1,80 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "paddle/fluid/framework/details/op_handle_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class OpGraphView { + public: + explicit OpGraphView(const std::vector &ops); + + std::unordered_set AllOps() const; + + const std::unordered_set &PendingOps( + details::OpHandleBase *op) const; + + bool HasOp(details::OpHandleBase *op) const; + + // Use a visitor to visit all pending ops of op + // Stop when callback returns false + template + bool VisitAllPendingOps(details::OpHandleBase *op, Callback &&callback) const; + + private: + void Build(const std::vector &ops); + void EnforceHasOp(details::OpHandleBase *op) const; + + std::unordered_map> + preceding_ops_; + std::unordered_map> + pending_ops_; +}; + +template +bool OpGraphView::VisitAllPendingOps(details::OpHandleBase *op, + Callback &&callback) const { + EnforceHasOp(op); + std::unordered_set visited; + std::queue q; + q.push(op); + while (!q.empty()) { + op = q.front(); + q.pop(); + for (auto &pending_op : pending_ops_.at(op)) { + if (visited.count(pending_op) == 0) { + visited.insert(pending_op); + if (!callback(pending_op)) { + return false; + } + q.push(pending_op); + } + } + } + return true; +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/record_skip_memory_opt_vars_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/record_skip_memory_opt_vars_pass.cc new file mode 100644 index 00000000..040b769f --- /dev/null +++ b/paddle/fluid/framework/ir/memory_optimize_pass/record_skip_memory_opt_vars_pass.cc @@ -0,0 +1,170 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +namespace ir { + +class RecordSkipMemoryOptVarsPass : public ir::Pass { + protected: + void ApplyImpl(ir::Graph* graph) const override { + PADDLE_ENFORCE(!graph->Has(kMemOptSkipVars)); + graph->Set(kMemOptSkipVars, new MemOptSkipVars); + auto& skip_vars = graph->Get(kMemOptSkipVars); + + std::vector op_nodes; + for (auto& node : graph->Nodes()) { + PADDLE_ENFORCE_NOT_NULL(node, "The node should not be nullptr."); + if (node->IsOp() && node->Op()) { + op_nodes.emplace_back(node); + } + } + + // Insert kEmptyVarName to avoid optimizing empty variable + skip_vars.insert(framework::kEmptyVarName); + + // NOTE(zcd): Insert OpRoleVars to SkipVarSet to prevent the vars are rename + // in memory optimize pass. + InsertOpRoleVarsToSkipVarSet(op_nodes, &skip_vars); + + InsertSkipMemOptOpInOutToSkipVarSet(op_nodes, &skip_vars); + } + + private: + static void InsertOpRoleVarsToSkipVarSet(const std::vector& ops, + MemOptSkipVars* skip_vars) { + for (auto& node : ops) { + try { + auto op_role_vars = + boost::get>(node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + PADDLE_ENFORCE_EQ(op_role_vars.size() % 2, 0); + for (size_t i = 0; i < op_role_vars.size(); i += 2) { + auto& g_name = op_role_vars[i + 1]; + skip_vars->insert(g_name); + } + } catch (boost::bad_get& e) { + } + } + } + + static void UpdateSkipVarSet( + MemOptSkipVars* skip_vars, + const std::vector>& var_names) { + for (auto& var_name : var_names) { + skip_vars->insert(var_name.begin(), var_name.end()); + } + } + + static std::vector ToGradVarName( + const std::vector& names) { + std::vector ret; + ret.reserve(names.size()); + for (auto& name : names) { + if (name != framework::kEmptyVarName) { + ret.emplace_back(framework::GradVarName(name)); + } + } + return ret; + } + + static void InsertSkipMemOptOpInOutToSkipVarSet( + const std::vector& ops, MemOptSkipVars* skip_vars) { + static std::unordered_set kSkipMemOptOps{ + "send", "recv", "prefetch", "send_barrier", "fetch_barrier"}; + + for (auto& node : ops) { + auto* op_desc = node->Op(); + // Some ops (while, conditional_block, recurrent, etc.) have sub-blocks. + // These ops often use variables from its parent or forward blocks. + // Optimizing in/out of such ops would make these variables cannot + // be found when running sub-block ops. + if (OpHasSubBlock(op_desc)) { + UpdateSkipVarSet(skip_vars, {op_desc->InputArgumentNames(), + op_desc->OutputArgumentNames()}); + } + + // Skip ops that are related to parameter server. + // In distributed mode, trainers and parameter server use same + // variable names to track same variables. We cannot change the + // names of these variables, otherwise trainers or parameter + // server would not find them. + if (kSkipMemOptOps.count(op_desc->Type()) > 0) { + UpdateSkipVarSet(skip_vars, {op_desc->InputArgumentNames(), + op_desc->OutputArgumentNames()}); + } + + // FIXME(zjl): some ops use variables that are not from their + // inputs or outputs. We do not have a nice method to solve this + // issue yet. Currently, we should skip these variables when + // memory optimization is enabled. + auto op_type = op_desc->Type(); + if (op_type == "while_grad") { + // In while_grad, framework::GradVarName(Input("X")) is visited + // without being any in/out of while_grad. While_grad uses + // these variable to accumulate gradient of X across time steps. + UpdateSkipVarSet(skip_vars, {ToGradVarName(op_desc->Input("X"))}); + } else if (op_type == "conditional_block_grad") { + // In conditional_block_grad, framework::GradVarName(Input("Input", + // "Cond")) is visited without being any in/out of + // conditional_block_grad. Conditional_block_grad uses these + // variables to accumulate gradient of Input/Cond across time steps. + UpdateSkipVarSet(skip_vars, {ToGradVarName(op_desc->Input("Input")), + ToGradVarName(op_desc->Input("Cond"))}); + } else if (op_type == "recurrent" || op_type == "recurrent_grad") { + // Recurrent and recurrent_grad ops are implemented by a very trickly + // way. Attr("states", "ex_states") is visited without being any + // in/out of op. It is because these variables are from sub blocks, + // not main block. Adding these variables to input would make recurrent + // fail since "states" and "ex_states" cannot be found in main block. + // When memory optimization is enabled, "states", "ex_states" and their + // gradient should be skipped. + auto ex_states = + boost::get>(op_desc->GetAttr("ex_states")); + auto states = + boost::get>(op_desc->GetAttr("states")); + if (op_type == "recurrent") { + UpdateSkipVarSet(skip_vars, {ex_states, states}); + } else { + // In recurrent_grad, framework::GradVarName(Input("parameters", + // "input")) is visited without being any in/out of recurrent_grad. + // Recurrent_grad uses these variables to accumulate gradient of + // parameters/input across time steps. + UpdateSkipVarSet( + skip_vars, + {ToGradVarName(op_desc->Input("parameters")), + ToGradVarName(op_desc->Input("inputs")), ex_states, states, + ToGradVarName(ex_states), ToGradVarName(states)}); + } + } + } + } +}; + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(record_skip_memory_opt_vars_pass, + paddle::framework::ir::RecordSkipMemoryOptVarsPass); diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc new file mode 100644 index 00000000..40e07ce8 --- /dev/null +++ b/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc @@ -0,0 +1,76 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.h" + +#include +#include + +#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/string/string_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +using paddle::operators::OpVariant; +using paddle::operators::OpVariantSet; +using paddle::operators::OpAndGradOpPair; + +void RecurrentOpEagerDeletionPass::ApplyImpl(Graph *graph) const { + // Find all recurrent_op and recurrent_grad_op in graph + // Note the graph only contains ops and block 0 + std::unordered_map target_ops = + DeviceIdToRecurrentAndRecurrentGradOp(*graph); + + for (auto &entry : target_ops) { + // Prepare safe eager deletion on different devices because the garbage + // collection may be different across devices + OpAndGradOpPair &op_pair = entry.second; + PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(&op_pair); + } +} + +// Returns a std::unordered_map mapping from the device id to recurrent op and +// grad op pair +std::unordered_map +RecurrentOpEagerDeletionPass::DeviceIdToRecurrentAndRecurrentGradOp( + const Graph &graph) const { + std::unordered_map ret; + std::vector all_ops = + FilterByNodeWrapper(graph); + + for (auto *op : all_ops) { + auto compute_op = dynamic_cast(op); + if (compute_op == nullptr) continue; + + if (compute_op->Name() == "recurrent") { + // GetScopeIdx() returns device/place id + ret[compute_op->GetScopeIdx()].first.emplace(compute_op->GetOp()); + } else if (compute_op->Name() == "recurrent_grad") { + // GetScopeIdx() returns device/place id + ret[compute_op->GetScopeIdx()].second.emplace(compute_op->GetOp()); + } + } + return ret; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(recurrent_op_eager_deletion_pass, + paddle::framework::ir::RecurrentOpEagerDeletionPass); diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.h b/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.h new file mode 100644 index 00000000..9c39a9fa --- /dev/null +++ b/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.h @@ -0,0 +1,43 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/operators/controlflow/op_variant.h" +#include "paddle/fluid/operators/controlflow/recurrent_op_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +// Pass class set skip eager deletion vars for recurrent ops +class RecurrentOpEagerDeletionPass : public Pass { + protected: + void ApplyImpl(Graph *graph) const override; + + private: + // Returns a std::unordered_map mapping from the device id to recurrent op and + // grad op pair + std::unordered_map + DeviceIdToRecurrentAndRecurrentGradOp(const Graph &graph) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass.cc new file mode 100644 index 00000000..e9114156 --- /dev/null +++ b/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass.cc @@ -0,0 +1,381 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/eager_deletion_op_handle.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" +#include "paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h" +#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class ReferenceCountPass : public ir::Pass { + protected: + void ApplyImpl(ir::Graph *graph) const override; +}; + +// A functor to shrink/remove operators who depend on other operators in a set +class ShrinkDepsOpFunctor { + private: + enum RelationShip { kSame = 0, kNoDeps = 1, kBefore = 2, kAfter = 3 }; + + public: + explicit ShrinkDepsOpFunctor( + const std::vector &all_ops) + : graph_(all_ops) {} + + template + OpSet operator()(const OpSet &op_set) const { + using KeyType = typename OpSet::key_type; + static_assert( + std::is_base_of::type>::value, + "Key type of OpSet must be details::OpHandleBase, or derived of " + "details::OpHandleBase"); + + if (op_set.size() <= 1) return op_set; + std::vector ops(op_set.begin(), op_set.end()); + OpSet ret; + auto rels = GetRelations(ops); + auto not_before = [](RelationShip r) { return r != kBefore; }; + for (size_t i = 0; i < rels.size(); ++i) { + if (std::all_of(rels[i].begin(), rels[i].end(), not_before)) { + ret.emplace(static_cast(ops[i])); + } + } + return ret; + } + + private: + std::vector> GetRelations( + const std::vector &ops) const { + std::unordered_map op_to_idx; + for (size_t i = 0; i < ops.size(); ++i) { + PADDLE_ENFORCE(graph_.HasOp(ops[i]), "Op does not exist in graph"); + op_to_idx[ops[i]] = i; + } + + PADDLE_ENFORCE(op_to_idx.size() == ops.size(), "Duplicate ops"); + + std::vector> ret(ops.size()); + for (auto &e : ret) { + e.assign(ops.size(), kSame); + } + + size_t found_num = ops.size(); + size_t total_num = ops.size() * ops.size(); + auto visitor = [&](details::OpHandleBase *op, size_t i) { + auto it = op_to_idx.find(op); + if (it != op_to_idx.end()) { + size_t j = it->second; + if (i != j && ret[i][j] == kSame) { + ret[i][j] = kBefore; + ret[j][i] = kAfter; + found_num += 2; + if (found_num == total_num) { + return false; + } + } + } + return true; + }; + + for (size_t i = 0; i < ops.size(); ++i) { + auto sub_visitor = [&, i](details::OpHandleBase *op) { + return visitor(op, i); + }; + if (!graph_.VisitAllPendingOps(ops[i], sub_visitor)) { + break; + } + } + + for (size_t i = 0; i < ops.size(); ++i) { + for (size_t j = i + 1; j < ops.size(); ++j) { + if (ret[i][j] != kSame) continue; + ret[i][j] = kNoDeps; + ret[j][i] = kNoDeps; + } + } + + return ret; + } + + const OpGraphView graph_; +}; + +/** + * Shrink op dependencies according to no need buffer vars. + * + * If some ops do not need Tensor buffer of any input, + * just remove the dependency of this op, i.e, decrease reference count. + * + * For example, input Y of elementwise_add_grad op is only used to infer shape + * and lod of Y@GRAD, we do not need the buffer of input Y. Data buffer of + * input Y can be collected before elementwise_add_grad op runs. + * + * This method returns whether the dependency count decreases to 0, and + * shrinks op dependency if possible. + */ +static bool ShrinkNoNeedBufferVarOpDependency( + const std::string &var_name, + std::unordered_set *op_handles) { + std::vector skip_ops; + for (auto *op_handle : *op_handles) { + auto *op_base = op_handle->GetOp(); + auto &inferer = op_base->Info().NoNeedBufferVarsInferer(); + if (!inferer) { + continue; + } + + std::unordered_set no_need_buffer_vars = + inferer(op_base->Inputs(), op_base->Outputs(), op_base->Attrs()); + + // Check whether var_name occurs in other inputs or outputs of the op + // If it occurs, we cannot decrease the dependency number. + bool occurred_in_other_vars = false; + for (auto &in_pair : op_base->Inputs()) { + if (no_need_buffer_vars.count(in_pair.first) > 0) { + continue; + } + + auto &args = in_pair.second; + auto iter = std::find(args.begin(), args.end(), var_name); + if (iter != args.end()) { + occurred_in_other_vars = true; + break; + } + } + + if (occurred_in_other_vars) { + continue; + } + + for (auto &out_pair : op_base->Outputs()) { + auto &args = out_pair.second; + auto iter = std::find(args.begin(), args.end(), var_name); + if (iter != args.end()) { + occurred_in_other_vars = true; + break; + } + } + + if (!occurred_in_other_vars) { + VLOG(2) << "Shrink var " << var_name << " in op " << op_handle->Name(); + skip_ops.emplace_back(op_handle); + } + } + + if (skip_ops.size() == op_handles->size()) { + op_handles->clear(); + return true; + } else { + for (auto *skip_op : skip_ops) { + op_handles->erase(skip_op); + } + return false; + } +} + +/** + * Find the nearest downstream computation op handle. If the op is a + * computation op, just return itself. + */ +static details::ComputationOpHandle *FindNextComputationOpHandleOrReturnItself( + details::OpHandleBase *op, size_t scope_idx) { + std::queue q; + std::unordered_set visited; + q.push(op); + while (!q.empty()) { + auto *op = q.front(); + q.pop(); + auto *compute_op = dynamic_cast(op); + if (compute_op != nullptr && compute_op->GetScopeIdx() == scope_idx) { + return compute_op; + } + for (auto *out_var : op->Outputs()) { + for (auto *pending_op : out_var->PendingOps()) { + if (visited.count(pending_op)) continue; + visited.insert(pending_op); + q.push(pending_op); + } + } + } + return nullptr; +} + +enum LastLiveOpSearchStatus { kSuccess, kFailure, kShouldPrecede }; + +static std::unordered_set +ExtractComputationOpFromLastLivedVar(details::VarHandle *var, size_t scope_idx, + const std::string &var_name, + const ShrinkDepsOpFunctor &shrink_func, + LastLiveOpSearchStatus *status) { + // stage one. Get last op for variable. + std::unordered_set candidates; + { + if (var->PendingOps().empty() && var->GeneratedOp()) { + // No operator depends on this variable. So the last operator is the op + // who generates this variable. + candidates.emplace(var->GeneratedOp()); + } else { + candidates = var->PendingOps(); + } + + // No pending ops or generated op is nullptr + if (candidates.empty()) { + *status = LastLiveOpSearchStatus::kFailure; + return {}; + } + } + + // stage two. Try to cast them to computation op. + // return (*status=kFailure) when failed. + // + // The reason why we cannot make any types of op handle to be the last lived + // op is: + // some op handle may operate on many DeviceContext, however, our garbage + // collector can only wait one DeviceContext for now. So currently, we wait + // the nearest compute op. + std::unordered_set computation_op; + { + for (auto *op : candidates) { + auto *compute_op = + FindNextComputationOpHandleOrReturnItself(op, scope_idx); + if (compute_op == nullptr) { + *status = LastLiveOpSearchStatus::kFailure; + return {}; + } + computation_op.emplace(compute_op); + } + } + + // stage three. Try to shrink computation op if any of them does + // not need the buffer of var_name. + // If all computation ops do not need the buffer of var_name, + // return empty computation op set, and mark the status as kShouldPrecede, + // which means that the last living ops of var_name should be + // found in the previous version of var_name. + if (ShrinkNoNeedBufferVarOpDependency(var_name, &computation_op)) { + *status = LastLiveOpSearchStatus::kShouldPrecede; + return {}; + } + + PADDLE_ENFORCE(!computation_op.empty(), + "Computation ops should not be empty"); + + // stage four. Try to shrink computation op if they depend on each other. + // Get the smallest set of the most ops. + *status = LastLiveOpSearchStatus::kSuccess; + return shrink_func(computation_op); +} + +void ReferenceCountPass::ApplyImpl(ir::Graph *graph) const { + auto &var_infos = Get(kMemOptVarInfoMapList); + auto &last_live_ops_of_vars = + Get>(kLastLiveOpsOfVars); + + PADDLE_ENFORCE(last_live_ops_of_vars.empty() && var_infos.empty(), + "Last Live Ops and Reference Counts of vars should be " + "initialized at here."); + + const auto &vars = graph->Get(details::kGraphVars); + + last_live_ops_of_vars.resize(vars.size()); + var_infos.resize(vars.size()); + + ShrinkDepsOpFunctor shrink_func( + ir::FilterByNodeWrapper(*graph)); + + VLOG(1) << "Place number: " << vars.size(); + for (size_t i = 0; i < vars.size(); ++i) { + for (auto &name_var_pair : vars[i]) { + // Whether this variable can be reused or deleted? If not, we do not + // compute reference counts and dependencies. + VarDesc *var_desc = TryGetLatestVarDesc(name_var_pair.second); + if (var_desc == nullptr || var_desc->Persistable()) { + continue; + } + + auto var_type = var_desc->Proto()->type().type(); + if (var_type != proto::VarType::LOD_TENSOR && + var_type != proto::VarType::SELECTED_ROWS && + var_type != proto::VarType::LOD_TENSOR_ARRAY) { + // Var type cannot be deleted + continue; + } + + auto &var_name = name_var_pair.first; + auto &var_handles = name_var_pair.second; + + PADDLE_ENFORCE_EQ(var_desc->Name(), var_name); + + for (auto iter = var_handles.rbegin(); iter != var_handles.rend(); + ++iter) { + VLOG(10) << "Try to find last living ops of " << var_name << " " + << (iter - var_handles.rbegin()) << " time"; + LastLiveOpSearchStatus status = LastLiveOpSearchStatus::kFailure; + auto result = ExtractComputationOpFromLastLivedVar( + *iter, i, var_name, shrink_func, &status); + + // Seldomly, some vars may have no pending or preceding computation ops + // Just break; + if (status == LastLiveOpSearchStatus::kFailure) { + break; + } + + if (status == LastLiveOpSearchStatus::kShouldPrecede) { + VLOG(10) << "Try to precede reference count computing at var " + << var_name; + continue; + } + + PADDLE_ENFORCE_EQ(status, LastLiveOpSearchStatus::kSuccess); + PADDLE_ENFORCE(!result.empty(), "Last living ops of %s cannot be empty", + var_name); + + VLOG(10) << "Extract " << result.size() << " ops of var " << var_name; + var_infos[i][var_name].reset( + new MemOptVarInfo(var_name, result.size())); + last_live_ops_of_vars[i].emplace(var_name, std::move(result)); + break; + } + + // Seldomly, all preceding trying failed. + // Just skip this corner case + } + } +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(reference_count_pass, paddle::framework::ir::ReferenceCountPass) + .RequirePassAttr(paddle::framework::ir::kMemOptVarInfoMapList) + .RequirePassAttr(paddle::framework::ir::kLastLiveOpsOfVars); diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.cc b/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.cc new file mode 100644 index 00000000..ed87f73a --- /dev/null +++ b/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" +#include "paddle/fluid/framework/details/var_handle.h" +#include "paddle/fluid/framework/var_desc.h" + +namespace paddle { +namespace framework { +namespace ir { + +VarDesc *TryGetLatestVarDesc(const std::vector &vars) { + VarDesc *var_desc = nullptr; + std::find_if(vars.rbegin(), vars.rend(), + [&](details::VarHandle *var_handle) -> bool { + var_desc = var_handle->Node()->Var(); + return var_desc != nullptr; + }); + return var_desc; +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h b/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h new file mode 100644 index 00000000..3433694b --- /dev/null +++ b/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h @@ -0,0 +1,52 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/var_handle.h" +#include "paddle/fluid/framework/garbage_collector.h" + +namespace paddle { +namespace framework { + +class VarDesc; + +namespace ir { + +using GarbageCollectorMap = + std::map>; + +const char kMemOptVarInfoMapList[] = "mem_opt_var_info_map_list"; +const char kGarbageCollector[] = "garbage_collector"; +const char kAllPlaces[] = "all_places"; + +using LastLiveOpsOfVars = + std::unordered_map>; +const char kLastLiveOpsOfVars[] = "last_live_ops_of_var"; + +VarDesc *TryGetLatestVarDesc(const std::vector &vars); + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/while_op_eager_deletion_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/while_op_eager_deletion_pass.cc new file mode 100644 index 00000000..63f996ad --- /dev/null +++ b/paddle/fluid/framework/ir/memory_optimize_pass/while_op_eager_deletion_pass.cc @@ -0,0 +1,60 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/operators/controlflow/while_op_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +class WhileOpEagerDeletionPass : public ir::Pass { + protected: + void ApplyImpl(ir::Graph *graph) const override { + auto all_ops = ir::FilterByNodeWrapper(*graph); + + // Find all while_op and while_grad_op + std::unordered_map, + std::vector>> + target_ops; + for (auto *op : all_ops) { + auto compute_op = dynamic_cast(op); + if (compute_op == nullptr) continue; + + if (compute_op->Name() == "while") { + target_ops[compute_op->GetScopeIdx()].first.emplace_back( + compute_op->GetOp()); + } else if (compute_op->Name() == "while_grad") { + target_ops[compute_op->GetScopeIdx()].second.emplace_back( + compute_op->GetOp()); + } + } + + for (auto &ops_pair : target_ops) { + auto &while_ops = ops_pair.second.first; + auto &while_grad_ops = ops_pair.second.second; + operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( + while_ops, while_grad_ops); + } + } +}; + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(while_op_eager_deletion_pass, + paddle::framework::ir::WhileOpEagerDeletionPass); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc new file mode 100644 index 00000000..bbfc8c00 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc @@ -0,0 +1,139 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h" +#include +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +template +LoDTensor tensor_apply_eltwise(const LoDTensor& vec_a, const LoDTensor& vec_b, + BinaryOperation f) { + PADDLE_ENFORCE_EQ(vec_a.dims(), vec_b.dims()); + LoDTensor vec_y; + vec_y.Resize(vec_a.dims()); + const float* a = vec_a.data(); + const float* b = vec_b.data(); + float* y = vec_y.mutable_data(platform::CPUPlace()); + for (int i = 0; i < vec_a.numel(); i++) { + y[i] = f(a[i], b[i]); + } + return vec_y; +} + +void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init(name_scope_, graph); + + auto* scope = param_scope(); + PADDLE_ENFORCE(scope); + + GraphPatternDetector gpd; + auto* conv_input = + gpd.mutable_pattern() + ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) + ->AsInput() + ->assert_is_op_input(type(), "Input"); + patterns::ConvBias conv_bias_pattern(gpd.mutable_pattern(), name_scope_); + conv_bias_pattern(conv_input, type()); + int found_conv_bias_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle ConvBias fuse"; + GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, + conv_bias_pattern); // Filter + GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_bias_pattern); // tmp + GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_bias_pattern); // CONV op + // bias + GET_IR_NODE_FROM_SUBGRAPH(eltwise_bias, eltwise_bias, conv_bias_pattern); + // output + GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_bias_pattern); + // elementwise_add op + GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_bias_pattern); + + PADDLE_ENFORCE(subgraph.count(conv_input)); + + // check if fuse can be done and if MKL-DNN should be used + FuseOptions fuse_option = FindFuseOption(*conv, *eltwise); + if (fuse_option == DO_NOT_FUSE || fuse_option == FUSE_NATIVE) { + VLOG(3) << "do not perform " + type() + "+bias fuse"; + return; + } + + auto* eltwise_bias_tensor = + scope->FindVar(eltwise_bias->Name())->GetMutable(); + + auto input_names = conv->Op()->InputNames(); + bool has_bias = std::find(input_names.begin(), input_names.end(), "Bias") != + input_names.end(); + if (has_bias && conv->Op()->Input("Bias").size() > 0) { + auto conv_bias_names = conv->Op()->Input("Bias"); + // add eltwise bias to existing conv bias + PADDLE_ENFORCE_EQ(conv_bias_names.size(), 1); + auto* conv_bias_var = scope->FindVar(conv_bias_names[0]); + auto* conv_bias_tensor = conv_bias_var->GetMutable(); + PADDLE_ENFORCE_EQ(conv_bias_tensor->dims(), eltwise_bias_tensor->dims()); + *conv_bias_tensor = tensor_apply_eltwise( + *conv_bias_tensor, *eltwise_bias_tensor, std::plus()); + + conv->Op()->SetOutput("Output", + std::vector({eltwise_out->Name()})); + + GraphSafeRemoveNodes(graph, {eltwise, conv_out}); + + IR_NODE_LINK_TO(conv, eltwise_out); + } else { + // take eltwise bias as conv bias + OpDesc desc; + + desc.SetInput( + "Input", std::vector({subgraph.at(conv_input)->Name()})); + desc.SetInput("Filter", std::vector({conv_weight->Name()})); + desc.SetInput("Bias", std::vector({eltwise_bias->Name()})); + desc.SetOutput("Output", std::vector({eltwise_out->Name()})); + desc.SetType(type()); + + for (auto& attr : conv->Op()->GetAttrMap()) { + desc.SetAttr(attr.first, attr.second); + } + auto conv_bias_node = g->CreateOpNode(&desc); + + IR_NODE_LINK_TO(subgraph.at(conv_input), conv_bias_node); + IR_NODE_LINK_TO(conv_weight, conv_bias_node); + IR_NODE_LINK_TO(eltwise_bias, conv_bias_node); + IR_NODE_LINK_TO(conv_bias_node, eltwise_out); + + GraphSafeRemoveNodes(graph, {conv, eltwise, conv_out}); + } + + found_conv_bias_count++; + }; + gpd(graph, handler); + AddStatis(found_conv_bias_count); +} +} // namespace ir +} // namespace framework +} // namespace paddle +REGISTER_PASS(conv_bias_mkldnn_fuse_pass, + paddle::framework::ir::ConvBiasFusePass); +REGISTER_PASS(conv_transpose_bias_mkldnn_fuse_pass, + paddle::framework::ir::Conv2DTransposeBiasFusePass); +REGISTER_PASS(conv3d_bias_mkldnn_fuse_pass, + paddle::framework::ir::Conv3DBiasFusePass); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h new file mode 100644 index 00000000..833fbc74 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h @@ -0,0 +1,49 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" +namespace paddle { +namespace framework { +namespace ir { +/* +* Fuse the Conv and Elementwise_add to a ConvBiasOp. +*/ +class ConvBiasFusePass : public FusePassBase { + public: + virtual ~ConvBiasFusePass() {} + virtual std::string type() const { return "conv2d"; } + + protected: + void ApplyImpl(ir::Graph* graph) const override; + const std::string name_scope_{"conv_bias_mkldnn_fuse"}; +}; +/* +* Fuse the Conv3D and Elementwise_add to a Conv3DBiasOp. +*/ +class Conv2DTransposeBiasFusePass : public ConvBiasFusePass { + public: + std::string type() const override { return "conv2d_transpose"; } +}; + +class Conv3DBiasFusePass : public ConvBiasFusePass { + public: + std::string type() const override { return "conv3d"; } +}; +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc new file mode 100644 index 00000000..427d7bc9 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc @@ -0,0 +1,155 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h" +#include +#include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/platform/place.h" + +#include "paddle/fluid/framework/op_proto_maker.h" + +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, + const std::vector& inputs, + const std::vector& outputs) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + if (type == "conv2d") { + op->SetAttr("use_mkldnn", true); + op->SetAttr("name", name); + op->SetInput("Input", {inputs[0]}); + op->SetInput("Filter", {inputs[1]}); + if (inputs.size() > 2) + op->SetInput("Bias", {inputs[2]}); + else + op->SetInput("Bias", {}); + } else if (type == "elementwise_add") { + op->SetAttr("use_mkldnn", true); + op->SetInput("X", {inputs[0]}); + op->SetInput("Y", {inputs[1]}); + } + op->SetOutput("Out", outputs); + op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), + static_cast(OpRole::kForward)); +} + +// (c, weights)->conv->f +// (f)->elementwise_add->g +ProgramDesc BuildProgramDesc(bool convWithExistingBias) { + ProgramDesc prog; + std::vector nodes{"c", "weights", "f", "eltwise_bias", "g"}; + if (convWithExistingBias) nodes.push_back("conv_bias"); + for (auto& v : nodes) { + auto* var = prog.MutableBlock(0)->Var(v); + var->SetType(proto::VarType::LOD_TENSOR); + if (v == "weights" || v == "conv_bias" || v == "eltwise_bias") { + var->SetPersistable(true); + } + } + + // conv+bias, both with MKL-DNN + if (convWithExistingBias) { + SetOp(&prog, "conv2d", "conv", + std::vector({"c", "weights", "conv_bias"}), + std::vector({"f"})); + } else { + SetOp(&prog, "conv2d", "conv", std::vector({"c", "weights"}), + std::vector({"f"})); + } + SetOp(&prog, "elementwise_add", "eltwise", + std::vector({"f", "eltwise_bias"}), + std::vector({"g"})); + + return prog; +} + +void InitTensorHolder(Scope* scope, const paddle::platform::Place& place, + const char* var_name) { + auto x = scope->Var(var_name); + auto tensor = x->GetMutable(); + tensor->mutable_data(place, proto::VarType::FP32, 1); +} + +void MainTest(bool convWithExistingBias) { + auto prog = BuildProgramDesc(convWithExistingBias); + std::unique_ptr graph(new ir::Graph(prog)); + auto place = paddle::platform::CPUPlace(); + NaiveExecutor exe{place}; + Scope scope; + // Init scope, as it is used in pass + exe.CreateVariables(prog, 0, true, &scope); + if (convWithExistingBias) { + InitTensorHolder(&scope, place, "conv_bias"); + InitTensorHolder(&scope, place, "eltwise_bias"); + } + graph->SetNotOwned(kParamScopeAttr, &scope); + + auto pass = PassRegistry::Instance().Get("conv_bias_mkldnn_fuse_pass"); + + int original_nodes_num = graph->Nodes().size(); + + graph.reset(pass->Apply(graph.release())); + + int current_nodes_num = graph->Nodes().size(); + + // Remove 3 Nodes: Conv, Bias, conv_out + // Add 1 Node: ConvBias + EXPECT_EQ(original_nodes_num - 2, current_nodes_num); + + // Assert conv_bias op in newly generated graph + int conv_bias_count = 0; + + for (auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op()->Type() == "conv2d") { + auto* op = node->Op(); + ASSERT_TRUE(op->HasAttr("use_mkldnn")); + EXPECT_TRUE(boost::get(op->GetAttr("use_mkldnn"))); + // check if "conv" convolution is fused + auto op_name = boost::get(op->GetAttr("name")); + if (op_name == "conv") { + auto input_names = op->InputNames(); + ASSERT_TRUE(std::find(input_names.begin(), input_names.end(), "Bias") != + input_names.end()); + auto bias = boost::get>(op->Input("Bias")); + if (bias.size()) { + ++conv_bias_count; + } + } + } + } + EXPECT_EQ(conv_bias_count, 1); +} + +TEST(ConvBiasFusePass, bias_free_conv) { MainTest(false); } + +TEST(ConvBiasFusePass, conv_with_existing_bias) { MainTest(true); } + +TEST(ConvBiasFusePass, conv3d) { + Conv3DBiasFusePass pass; + ASSERT_EQ(pass.type(), std::string("conv3d")); +} + +TEST(ConvBiasFusePass, conv2d_transpose) { + Conv2DTransposeBiasFusePass pass; + ASSERT_EQ(pass.type(), std::string("conv2d_transpose")); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(conv_bias_mkldnn_fuse_pass); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass.cc new file mode 100644 index 00000000..dd9d4486 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass.cc @@ -0,0 +1,71 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass.h" +#include +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +void ConvBReLUFusePass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init("conv_bounded_relu_mkldnn_fuse", graph); + + GraphPatternDetector gpd; + auto* conv_input = gpd.mutable_pattern() + ->NewNode("conv_bounded_relu_mkldnn_fuse/conv_input") + ->AsInput() + ->assert_is_op_input("conv2d", "Input"); + patterns::ConvBReLU conv_brelu_pattern(gpd.mutable_pattern(), + "conv_bounded_relu_mkldnn_fuse"); + conv_brelu_pattern(conv_input); + + int found_conv_brelu_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle ConvBoundedReLUFusePass fuse"; + GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, + conv_brelu_pattern); // Filter + GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_brelu_pattern); // tmp + GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_brelu_pattern); // CONV op + GET_IR_NODE_FROM_SUBGRAPH(brelu_out, brelu_out, conv_brelu_pattern); // Out + GET_IR_NODE_FROM_SUBGRAPH(brelu, brelu, conv_brelu_pattern); // ReLU op + + // Transform Conv node into ConvBReLU node. + OpDesc* desc = conv->Op(); + desc->SetOutput("Output", std::vector({brelu_out->Name()})); + desc->SetAttr("fuse_brelu", true); + desc->SetAttr("fuse_brelu_threshold", brelu->Op()->GetAttr("threshold")); + + GraphSafeRemoveNodes(graph, {brelu, conv_out}); + + PADDLE_ENFORCE(subgraph.count(conv_input)); + IR_NODE_LINK_TO(conv, brelu_out); + found_conv_brelu_count++; + }; + + gpd(graph, handler); + + AddStatis(found_conv_brelu_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_brelu_mkldnn_fuse_pass, + paddle::framework::ir::ConvBReLUFusePass); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass.h new file mode 100644 index 00000000..c898be69 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass.h @@ -0,0 +1,39 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Fuse the CONV and ReLU6 to a ConvReLU6Op. + */ +class ConvBReLUFusePass : public FusePassBase { + public: + virtual ~ConvBReLUFusePass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass_tester.cc new file mode 100644 index 00000000..5a546bfa --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass_tester.cc @@ -0,0 +1,135 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass.h" + +#include +#include "paddle/fluid/framework/op_proto_maker.h" + +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, + const std::vector& inputs, + const std::vector& outputs, bool use_mkldnn = false) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + if (type == "conv2d") { + op->SetAttr("use_mkldnn", use_mkldnn); + op->SetAttr("name", name); + op->SetInput("Input", {inputs[0]}); + op->SetInput("Filter", {inputs[1]}); + op->SetInput("Bias", {inputs[2]}); + } else if (type == "relu6") { + op->SetAttr("use_mkldnn", use_mkldnn); + if (use_mkldnn) { + op->SetAttr("threshold", 6.0f); + } + op->SetInput("X", inputs); + } + op->SetOutput("Out", outputs); + op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), + static_cast(OpRole::kForward)); +} + +// a->OP0->b +// b->OP1->c +// (c, weights, bias)->conv->f +// (f)->brelu->g +ProgramDesc BuildProgramDesc() { + ProgramDesc prog; + for (auto& v : + std::vector({"a", "b", "c", "weights", "bias", "f", "g", + "h", "weights2", "bias2", "k", "l"})) { + auto* var = prog.MutableBlock(0)->Var(v); + var->SetType(proto::VarType::SELECTED_ROWS); + if (v == "weights" || v == "bias") { + var->SetPersistable(true); + } + } + + SetOp(&prog, "OP0", "op0", std::vector({"a"}), + std::vector({"b"})); + SetOp(&prog, "OP1", "op1", std::vector({"b"}), + std::vector({"c"})); + // conv+brelu, both with MKL-DNN + SetOp(&prog, "conv2d", "conv1", + std::vector({"c", "weights", "bias"}), + std::vector({"f"}), true); + SetOp(&prog, "relu6", "relu1", std::vector({"f"}), + std::vector({"g"}), true); + SetOp(&prog, "OP3", "op3", std::vector({"g"}), + std::vector({"h"})); + // conv+brelu, only one with MKL-DNN + SetOp(&prog, "conv2d", "conv2", + std::vector({"h", "weights2", "bias2"}), + std::vector({"k"}), true); + SetOp(&prog, "relu6", "relu2", std::vector({"k"}), + std::vector({"l"})); + + return prog; +} + +TEST(ConvBReLUFusePass, basic) { + auto prog = BuildProgramDesc(); + + std::unique_ptr graph(new ir::Graph(prog)); + + auto pass = PassRegistry::Instance().Get("conv_brelu_mkldnn_fuse_pass"); + + int original_nodes_num = graph->Nodes().size(); + + graph.reset(pass->Apply(graph.release())); + + int current_nodes_num = graph->Nodes().size(); + + // Remove 3 Nodes: CONV, BRELU, conv_out + // Add 1 Node: ConvBReLU + EXPECT_EQ(original_nodes_num - 2, current_nodes_num); + + // Assert conv_brelu op in newly generated graph + int conv_brelu_count = 0; + + for (auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op()->Type() == "conv2d") { + auto* op = node->Op(); + ASSERT_TRUE(op->HasAttr("use_mkldnn")); + EXPECT_TRUE(boost::get(op->GetAttr("use_mkldnn"))); + // check if only "conv1" convolution is fused + auto op_name = boost::get(op->GetAttr("name")); + if (op_name == "conv1") { + ASSERT_TRUE(op->HasAttr("fuse_brelu")); + ASSERT_TRUE(op->HasAttr("fuse_brelu_threshold")); + + bool fuse_brelu = boost::get(op->GetAttr("fuse_brelu")); + if (fuse_brelu) { + ++conv_brelu_count; + float fuse_brelu_threshold = + boost::get(op->GetAttr("fuse_brelu_threshold")); + EXPECT_EQ(fuse_brelu_threshold, 6.0f); + } + } else if (op_name == "conv2") { + ASSERT_FALSE(op->HasAttr("fuse_brelu")); + } + } + } + EXPECT_EQ(conv_brelu_count, 1); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(conv_brelu_mkldnn_fuse_pass); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc new file mode 100644 index 00000000..a037a6bf --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc @@ -0,0 +1,119 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h" +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +void ConvConcatReLUFusePass::FindConcatWithConvs( + ir::Graph* graph, + std::unordered_map* concat_with_convs_counter) const { + GraphPatternDetector gpd; + patterns::ConcatReLU concat_relu_pattern{gpd.mutable_pattern(), + "concat_relu"}; + concat_relu_pattern(); + + int found_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "Find Concats with Convs"; + GET_IR_NODE_FROM_SUBGRAPH(concat_op, concat_op, concat_relu_pattern); + GET_IR_NODE_FROM_SUBGRAPH(relu_op, relu_op, concat_relu_pattern); + + auto concat_inputs = concat_op->inputs; + + for (auto node : concat_inputs) { + auto prev_op_node = node->inputs; + PADDLE_ENFORCE_EQ(prev_op_node.size(), 1); + auto* conv_op = prev_op_node[0]; + if (conv_op->Op()->Type() != "conv2d") return; + + FuseOptions fuse_option = FindFuseOption(*conv_op, *relu_op); + if (fuse_option == DO_NOT_FUSE) { + return; + } + } + + (*concat_with_convs_counter)[concat_op] = concat_inputs.size(); + found_count++; + }; + gpd(graph, handler); + AddStatis(found_count); +} + +void ConvConcatReLUFusePass::FuseConvConcatReLU( + ir::Graph* graph, + std::unordered_map* concat_with_convs_counter) const { + GraphPatternDetector gpd; + auto pattern = gpd.mutable_pattern(); + patterns::ConvConcatReLU conv_concat_relu(pattern, name_scope_); + conv_concat_relu(); + + int found_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle ConvConcatReLU fuse"; + + GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_concat_relu); + GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_concat_relu); + GET_IR_NODE_FROM_SUBGRAPH(concat_op, concat_op, conv_concat_relu); + GET_IR_NODE_FROM_SUBGRAPH(concat_out, concat_out, conv_concat_relu); + GET_IR_NODE_FROM_SUBGRAPH(relu_op, relu_op, conv_concat_relu); + GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, conv_concat_relu); + + if (!concat_with_convs_counter->count(concat_op)) { + VLOG(4) << "this concat has input from non-conv2d operator"; + return; + } + + // Transform Conv node into ConvReLU node. + OpDesc* conv_desc = conv_op->Op(); + conv_desc->SetAttr("fuse_relu", true); + + // Remove ReLU when all Convs were transformed. + auto number_of_unfused_convs_left = + --(*concat_with_convs_counter)[concat_op]; + if (number_of_unfused_convs_left == 0) { + OpDesc* concat_desc = concat_op->Op(); + concat_desc->SetOutput("Out", + std::vector({relu_out->Name()})); + GraphSafeRemoveNodes(graph, {relu_op, concat_out}); + IR_NODE_LINK_TO(concat_op, relu_out); + } + + found_count++; + }; + gpd(graph, handler); + AddStatis(found_count); +} + +void ConvConcatReLUFusePass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init(name_scope_, graph); + + std::unordered_map concat_with_convs_counter; + FindConcatWithConvs(graph, &concat_with_convs_counter); + FuseConvConcatReLU(graph, &concat_with_convs_counter); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_concat_relu_mkldnn_fuse_pass, + paddle::framework::ir::ConvConcatReLUFusePass); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h new file mode 100644 index 00000000..91ff0760 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h @@ -0,0 +1,53 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Fuse the (multi conv) -> Concat -> ReLU -> next_op + * to a: + * (multi ConvReLU) -> Concat -> next_op. + */ +class ConvConcatReLUFusePass : public FusePassBase { + public: + virtual ~ConvConcatReLUFusePass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; + + void FindConcatWithConvs( + Graph* graph, + std::unordered_map* concat_with_convs_counter) const; + + void FuseConvConcatReLU( + Graph* graph, + std::unordered_map* concat_with_convs_counter) const; + + const std::string name_scope_{"conv_concat_relu_mkldnn_fuse"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc new file mode 100644 index 00000000..0d7ddac8 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc @@ -0,0 +1,157 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.h" + +#include +#include "paddle/fluid/framework/op_proto_maker.h" + +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, + const std::vector& inputs, + const std::vector& outputs, bool use_mkldnn = true) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + if (type == "conv2d") { + op->SetAttr("use_mkldnn", use_mkldnn); + op->SetAttr("fuse_relu", false); + op->SetInput("Input", {inputs[0]}); + op->SetInput("Filter", {inputs[1]}); + if (inputs.size() > 2) { + op->SetInput("Bias", {inputs[2]}); + } + op->SetOutput("Output", outputs); + } else if (type == "relu") { + op->SetAttr("use_mkldnn", use_mkldnn); + op->SetInput("X", inputs); + op->SetOutput("Out", outputs); + } else if (type == "pool2d") { + op->SetAttr("use_mkldnn", use_mkldnn); + op->SetInput("X", inputs); + op->SetOutput("Out", outputs); + } else if (type == "concat") { + op->SetAttr("use_mkldnn", use_mkldnn); + op->SetInput("X", inputs); + op->SetOutput("Out", outputs); + } + op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), + static_cast(OpRole::kForward)); +} + +// (a1,w1)->conv1->c1 +// (a2,w2,b2)->conv2->c2 +// if put_only_convs_before_concat=true +// (a3,w3)->conv3->c3 +// else +// a3->pool1->c3 +// +// (c1,c2,c3)->concat1->d +// d->relu1->e +ProgramDesc BuildProgramDesc(bool put_only_convs_before_concat, + bool all_convs_use_mkldnn) { + ProgramDesc prog; + for (auto& v : + std::initializer_list({"a1", "w1", "c1", "a2", "w2", "b2", + "c2", "a3", "w3", "c3", "d", "e"})) { + auto* var = prog.MutableBlock(0)->Var(v); + var->SetType(proto::VarType::SELECTED_ROWS); + if (v.find("w") == 0 || v.find("b") == 0) { + var->SetPersistable(true); + } + } + + SetOp(&prog, "conv2d", {"a1", "w1", "b1"}, {"c1"}, all_convs_use_mkldnn); + SetOp(&prog, "conv2d", {"a2", "w2", "b2"}, {"c2"}); + if (put_only_convs_before_concat) { + SetOp(&prog, "conv2d", {"a3", "w3", "b3"}, {"c3"}); + } else { + SetOp(&prog, "pool2d", {"a3"}, {"c3"}); + } + SetOp(&prog, "concat", {"c1", "c2", "c3"}, {"d"}); + SetOp(&prog, "relu", {"d"}, {"e"}); + + return prog; +} + +void MainTest(const ProgramDesc& prog, bool fuse_relu) { + std::unique_ptr graph(new ir::Graph(prog)); + + int original_nodes_num = graph->Nodes().size(); + + auto pass = PassRegistry::Instance().Get("conv_concat_relu_mkldnn_fuse_pass"); + graph.reset(pass->Apply(graph.release())); + + int current_nodes_num = graph->Nodes().size(); + + if (fuse_relu) { + // Remove 2 nodes: concat_out, relu + EXPECT_EQ(original_nodes_num - 2, current_nodes_num); + } else { + EXPECT_EQ(original_nodes_num, current_nodes_num); + } + + int relu_count = 0; + for (auto* node : graph->Nodes()) { + if (node->IsOp()) { + auto* op = node->Op(); + if (op->Type() == "conv2d") { + ASSERT_TRUE(op->HasAttr("fuse_relu")); + bool fuse_relu_attr = boost::get(op->GetAttr("fuse_relu")); + EXPECT_EQ(fuse_relu, fuse_relu_attr); + } else if (op->Type() == "relu") { + relu_count++; + } + } + } + EXPECT_EQ(relu_count, fuse_relu ? 0 : 1); +} + +TEST(ConvConcatReLUFusePass, only_convs_before_concat) { + bool all_convs_use_mkldnn = true; + bool put_only_convs_before_concat = true; + auto prog = + BuildProgramDesc(put_only_convs_before_concat, all_convs_use_mkldnn); + + bool expect_relu_fuse = true; + MainTest(prog, expect_relu_fuse); +} + +TEST(ConvConcatReLUFusePass, only_convs_before_concat_but_one_non_mkldnn) { + bool all_convs_use_mkldnn = false; + bool put_only_convs_before_concat = true; + auto prog = + BuildProgramDesc(put_only_convs_before_concat, all_convs_use_mkldnn); + + bool expect_relu_fuse = false; + MainTest(prog, expect_relu_fuse); +} + +TEST(ConvConcatReLUFusePass, convs_and_pool_before_concat) { + bool all_convs_use_mkldnn = true; + bool put_only_convs_before_concat = false; + auto prog = + BuildProgramDesc(put_only_convs_before_concat, all_convs_use_mkldnn); + + bool expect_relu_fuse = false; + MainTest(prog, expect_relu_fuse); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(conv_concat_relu_mkldnn_fuse_pass); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc new file mode 100644 index 00000000..ef7874c1 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc @@ -0,0 +1,345 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h" +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/ir/graph_traits.h" + +namespace paddle { +namespace framework { +namespace ir { + +bool IsReachable(ir::Graph* graph, Node* from, Node* to) { + auto find_node = [](ir::Graph* graph, const Node* node) -> Node* { + for (auto n : graph->Nodes()) { + if (n == node) { + return n; + } + } + + return nullptr; + }; + + if (from == to) { + return true; + } + + std::map visited; + + for (auto& node : GraphTraits::DFS(*graph)) { + visited[&node] = false; + } + + visited[from] = true; + + std::list queue; + queue.push_back(from); + + while (!queue.empty()) { + auto cur = find_node(graph, queue.front()); + queue.pop_front(); + + if (!cur) return false; + + for (auto n : cur->outputs) { + if (n == to) { + return true; + } + + if (!visited[n]) { + visited[n] = true; + queue.push_back(n); + } + } + } + return false; +} + +template +boost::optional HasAttribute(const Node& op, const std::string& attr) { + if (op.Op()->HasAttr(attr)) + return boost::get(op.Op()->GetAttr(attr)); + else + return boost::none; +} + +ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::IdentityFuseHandle( + const ResidualConnectionMKLDNNFusePass::CanFuseFunc& can_fuse_func, + const ResidualConnectionMKLDNNFusePass::IdentityConvFunc& + get_node_from_conv_op, + const ResidualConnectionMKLDNNFusePass::IdentityElementwiseAddFunc& + get_node_from_elementwise_add_op) + : fusion_stats{std::make_shared(0)}, + can_fuse_func{can_fuse_func}, + get_node_from_conv_op{get_node_from_conv_op}, + get_node_from_elementwise_add_op{get_node_from_elementwise_add_op} {} + +void ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::operator()( + const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { + Node* conv_op; + Node* conv_input; + Node* conv_filter; + Node* conv_output; + + Node* elementwise_add_op; + Node* elementwise_add_identity; + Node* elementwise_add_out; + + std::tie(conv_op, conv_input, conv_filter, conv_output) = + get_node_from_conv_op(subgraph); + std::tie(elementwise_add_op, elementwise_add_identity, elementwise_add_out) = + get_node_from_elementwise_add_op(subgraph); + + if (!can_fuse_func(conv_op, elementwise_add_op)) return; + + if (!IsReachable(graph, elementwise_add_identity, conv_output)) return; + + auto fuse_relu = HasAttribute(*conv_op, "fuse_relu"); + if (fuse_relu && *fuse_relu) return; + + conv_op->Op()->SetInput("ResidualData", {elementwise_add_identity->Name()}); + conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); + conv_op->Op()->SetAttr("fuse_residual_connection", true); + + GraphSafeRemoveNodes(graph, {conv_output, elementwise_add_op}); + + IR_NODE_LINK_TO(elementwise_add_identity, conv_op); + IR_NODE_LINK_TO(conv_op, elementwise_add_out); + + (*fusion_stats)++; +} + +ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::ProjectionFuseHandle( + const ResidualConnectionMKLDNNFusePass::CanFuseFunc& can_fuse_func, + const ResidualConnectionMKLDNNFusePass::ProjectionConvFunc& + get_node_from_conv_x_op, + const ResidualConnectionMKLDNNFusePass::ProjectionConvFunc& + get_node_from_conv_y_op, + const ResidualConnectionMKLDNNFusePass::ProjectionElementwiseAddFunc& + get_node_from_elementwise_add_op) + : fusion_stats{std::make_shared(0)}, + can_fuse_func{can_fuse_func}, + get_node_from_conv_x_op{get_node_from_conv_x_op}, + get_node_from_conv_y_op{get_node_from_conv_y_op}, + get_node_from_elementwise_add_op{get_node_from_elementwise_add_op} {} + +void ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::operator()( + const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { + Node* conv_x_op; + Node* conv_x_input; + Node* conv_x_filter; + Node* conv_x_output; + + Node* conv_y_op; + Node* conv_y_input; + Node* conv_y_filter; + Node* conv_y_output; + + Node* elementwise_add_op; + Node* elementwise_add_out; + + std::tie(conv_x_op, conv_x_input, conv_x_filter, conv_x_output) = + get_node_from_conv_x_op(subgraph); + std::tie(conv_y_op, conv_y_input, conv_y_filter, conv_y_output) = + get_node_from_conv_y_op(subgraph); + std::tie(elementwise_add_op, elementwise_add_out) = + get_node_from_elementwise_add_op(subgraph); + + if (!can_fuse_func(conv_x_op, elementwise_add_op)) return; + if (!can_fuse_func(conv_y_op, elementwise_add_op)) return; + + Node* projection_node; + Node* residual_conv_op; + Node* residual_conv_output; + + if (IsReachable(graph, conv_x_input, conv_y_output)) { + projection_node = conv_x_output; + residual_conv_op = conv_y_op; + residual_conv_output = conv_y_output; + } else if (IsReachable(graph, conv_y_input, conv_x_output)) { + projection_node = conv_y_output; + residual_conv_op = conv_x_op; + residual_conv_output = conv_x_output; + } else { + return; + } + + auto fuse_relu = HasAttribute(*residual_conv_op, "fuse_relu"); + if (fuse_relu && *fuse_relu) return; + + residual_conv_op->Op()->SetInput("ResidualData", {projection_node->Name()}); + residual_conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); + + residual_conv_op->Op()->SetAttr("fuse_residual_connection", true); + + GraphSafeRemoveNodes(graph, {residual_conv_output, elementwise_add_op}); + + IR_NODE_LINK_TO(projection_node, residual_conv_op); + IR_NODE_LINK_TO(residual_conv_op, elementwise_add_out); + + (*fusion_stats)++; +} + +std::tuple +ResidualConnectionMKLDNNFusePass::GetNodesFromConv( + const patterns::Conv& conv_pattern, + const GraphPatternDetector::subgraph_t& subgraph) const { + GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern); + + return std::make_tuple(conv_op, conv_input, conv_filter, conv_output); +} + +GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsX( + const std::string& name_scope, + const GraphWithStats& graph_with_stats) const { + ir::Graph* graph; + int stats; + + std::tie(graph, stats) = graph_with_stats; + + GraphPatternDetector gpd; + auto pattern = gpd.mutable_pattern(); + + patterns::Conv conv_pattern{pattern, name_scope}; + auto conv_output = conv_pattern(); + + patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope}; + elementwise_add_pattern( + conv_output, + pattern->NewNode(elementwise_add_pattern.elementwise_add_y_repr())); + conv_output->AsIntermediate(); + + auto get_node_from_elementwise_add = [&elementwise_add_pattern]( + const GraphPatternDetector::subgraph_t& subgraph) + -> std::tuple { + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, + elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_y, elementwise_add_y, + elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, + elementwise_add_pattern); + + return std::make_tuple(elementwise_add_op, elementwise_add_y, + elementwise_add_out); + }; + + return ExecuteHandleOnGraph( + &gpd, graph_with_stats, + [this, &conv_pattern](const GraphPatternDetector::subgraph_t& subgraph) { + return GetNodesFromConv(conv_pattern, subgraph); + }, + get_node_from_elementwise_add); +} + +GraphWithStats ResidualConnectionMKLDNNFusePass::FuseConvAsY( + const std::string& name_scope, + const GraphWithStats& graph_with_stats) const { + GraphPatternDetector gpd; + auto pattern = gpd.mutable_pattern(); + + patterns::Conv conv_pattern{pattern, name_scope}; + auto conv_output = conv_pattern(); + + patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope}; + elementwise_add_pattern( + pattern->NewNode(elementwise_add_pattern.elementwise_add_x_repr()), + conv_output); + conv_output->AsIntermediate(); + + auto get_node_from_elementwise_add = [&elementwise_add_pattern]( + const GraphPatternDetector::subgraph_t& subgraph) + -> std::tuple { + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, + elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_x, elementwise_add_x, + elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, + elementwise_add_pattern); + + return std::make_tuple(elementwise_add_op, elementwise_add_x, + elementwise_add_out); + }; + + return ExecuteHandleOnGraph( + &gpd, graph_with_stats, + [this, &conv_pattern](const GraphPatternDetector::subgraph_t& subgraph) { + return GetNodesFromConv(conv_pattern, subgraph); + }, + get_node_from_elementwise_add); +} + +GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( + const std::string& name_scope, + const GraphWithStats& graph_with_stats) const { + GraphPatternDetector gpd; + auto pattern = gpd.mutable_pattern(); + + patterns::Conv conv_x_pattern{pattern, name_scope}; + auto conv_x_output = conv_x_pattern(); + + patterns::Conv conv_y_pattern{pattern, name_scope}; + auto conv_y_output = conv_y_pattern(); + + patterns::ElementwiseAdd elementwise_add_pattern{pattern, name_scope}; + elementwise_add_pattern(conv_x_output, conv_y_output); + conv_x_output->AsIntermediate(); + conv_y_output->AsIntermediate(); + + auto get_node_from_elementwise_add = [&elementwise_add_pattern]( + const GraphPatternDetector::subgraph_t& subgraph) + -> std::tuple { + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_op, elementwise_add_op, + elementwise_add_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out, + elementwise_add_pattern); + + return std::make_tuple(elementwise_add_op, elementwise_add_out); + }; + + return ExecuteHandleOnGraph( + &gpd, graph_with_stats, + [this, + &conv_x_pattern](const GraphPatternDetector::subgraph_t& subgraph) { + return GetNodesFromConv(conv_x_pattern, subgraph); + }, + [this, + &conv_y_pattern](const GraphPatternDetector::subgraph_t& subgraph) { + return GetNodesFromConv(conv_y_pattern, subgraph); + }, + get_node_from_elementwise_add); +} + +void ResidualConnectionMKLDNNFusePass::ApplyImpl(graph_ptr graph) const { + FusePassBase::Init(name_scope_, graph); + auto fused_graph_with_stats = FuseConvAsY( + name_scope_, + FuseConvAsX(name_scope_, + FuseProjectionConv(name_scope_, std::make_pair(graph, 0)))); + + std::cout << "Fused graph " << fused_graph_with_stats.second << std::endl; + AddStatis(fused_graph_with_stats.second); +} +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_elementwise_add_mkldnn_fuse_pass, + paddle::framework::ir::ResidualConnectionMKLDNNFusePass); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h new file mode 100644 index 00000000..9bf1ae60 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h @@ -0,0 +1,134 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +#include + +namespace paddle { +namespace framework { +namespace ir { + +using graph_ptr = ir::Graph*; +using GraphWithStats = std::pair; + +void CorrectGraphEdges(Graph* graph, Node* from, Node* to); +bool IsReachable(ir::Graph* graph, Node* from, Node* to); +boost::optional HasBias(const Node& op, const std::string& bias_name); + +class ResidualConnectionMKLDNNFusePass : public FusePassBase { + private: + GraphWithStats FuseConvAsX(const std::string& name_scope, + const GraphWithStats& graph_with_stats) const; + GraphWithStats FuseConvAsY(const std::string& name_scope, + const GraphWithStats& graph_with_stats) const; + GraphWithStats FuseProjectionConv( + const std::string& name_scope, + const GraphWithStats& graph_with_stats) const; + + template + using GetNodeFunc = + std::function; + using IdentityConvFunc = GetNodeFunc>; + using IdentityElementwiseAddFunc = + GetNodeFunc>; + + using ProjectionConvFunc = IdentityConvFunc; + using ProjectionElementwiseAddFunc = GetNodeFunc>; + + using CanFuseFunc = std::function; + + std::tuple GetNodesFromConv( + const patterns::Conv& conv_pattern, + const GraphPatternDetector::subgraph_t& subgraph) const; + + std::tuple GetNodesFromProjectionConv( + const patterns::Conv& conv_pattern, + const GraphPatternDetector::subgraph_t& subgraph) const; + + template + GraphWithStats ExecuteHandleOnGraph(GraphPatternDetector* gpd, + const GraphWithStats& graph_with_stats, + OpFuncs&&... op_funcs) const { + ir::Graph* graph; + int stats; + + std::tie(graph, stats) = graph_with_stats; + + auto can_fuse = [this](Node* op1, Node* op2) -> bool { + return this->FindFuseOption(*op1, *op2) == FUSE_MKLDNN; + }; + + auto fuse_handle = HandleType{can_fuse, std::forward(op_funcs)...}; + + (*gpd)(graph, fuse_handle); + + return std::make_pair(graph, stats + fuse_handle.get_stats()); + } + + struct IdentityFuseHandle { + IdentityFuseHandle( + const CanFuseFunc& can_fuse_func, + const IdentityConvFunc& get_node_from_conv_op, + const IdentityElementwiseAddFunc& get_node_from_elementwise_add_op); + + void operator()(const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph); + int get_stats() const { return *fusion_stats; } + + private: + std::shared_ptr fusion_stats; + CanFuseFunc can_fuse_func; + IdentityConvFunc get_node_from_conv_op; + IdentityElementwiseAddFunc get_node_from_elementwise_add_op; + }; + + struct ProjectionFuseHandle { + ProjectionFuseHandle( + const CanFuseFunc& can_fuse_func, + const ProjectionConvFunc& get_node_from_conv_x_op, + const ProjectionConvFunc& get_node_from_conv_y_op, + const ProjectionElementwiseAddFunc& get_node_from_elementwise_add_op); + + void operator()(const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph); + int get_stats() const { return *fusion_stats; } + + private: + std::shared_ptr fusion_stats; + CanFuseFunc can_fuse_func; + ProjectionConvFunc get_node_from_conv_x_op; + ProjectionConvFunc get_node_from_conv_y_op; + ProjectionElementwiseAddFunc get_node_from_elementwise_add_op; + }; + + public: + virtual ~ResidualConnectionMKLDNNFusePass() {} + + protected: + void ApplyImpl(graph_ptr graph) const; + + const std::string name_scope_{"residual_connection_fuse_pass"}; +}; +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc new file mode 100644 index 00000000..8a13596c --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc @@ -0,0 +1,274 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "paddle/fluid/framework/ir/graph_traits.h" +#include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +namespace { +constexpr int nodes_removed = 3; +constexpr int nodes_added = 1; + +void SetOp(ProgramDesc* prog, const std::string& type, + const std::vector>& inputs, + const std::pair& output) { + auto op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + op->SetAttr("use_mkldnn", true); + + for (const auto& input : inputs) { + op->SetInput(input.first, {input.second}); + } + + op->SetOutput(output.first, {output.second}); +} + +struct TestIsReachable { + using func = std::function; + + auto operator()(const std::unique_ptr& graph) -> func { + auto hash = [](const Node* node) -> std::string { + return node->Name() + std::to_string(node->id()); + }; + + auto find_node = [&](const std::unique_ptr& graph, + const std::string& name) -> Node* { + for (auto& node : GraphTraits::DFS(*graph)) { + if (name == hash(&node)) { + return &node; + } + } + + return nullptr; + }; + + // update the from and to strings to hashed equivs in loop from graph traits + return [&](std::string from, std::string to) -> bool { + if (from == to) return true; + + std::map visited; + + for (auto& node : GraphTraits::DFS(*graph)) { + auto hashed = hash(&node); + if (node.Name() == from) from = hashed; + if (node.Name() == to) to = hashed; + visited[hashed] = false; + } + + visited[from] = true; + + std::list queue; + queue.push_back(from); + + while (!queue.empty()) { + auto cur = find_node(graph, queue.front()); + queue.pop_front(); + if (cur == nullptr) return false; + + for (auto n : cur->outputs) { + auto hashed_name = hash(n); + if (hashed_name == to) return true; + + if (!visited[hashed_name]) { + visited[hashed_name] = true; + queue.push_back(hashed_name); + } + } + } + return false; + }; + } +}; + +void AssertOpsCount(const std::unique_ptr& graph, + int expected_conv_count, + int expected_elementwise_add_count = 0) { + int conv_count = 0; + int elementwise_add_count = 0; + + for (auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op()->Type() == "conv2d") { + ++conv_count; + } + if (node->IsOp() && node->Op()->Type() == "elementwise_add") { + ++elementwise_add_count; + } + } + EXPECT_EQ(conv_count, expected_conv_count); + EXPECT_EQ(elementwise_add_count, expected_elementwise_add_count); +} + +ProgramDesc BuildProgramDesc(const std::vector& transient_vars, + const std::vector& persistent_vars) { + ProgramDesc prog; + + auto add_var_to_prog = [&prog](const std::string& var_name) -> VarDesc* { + auto var = prog.MutableBlock(0)->Var(var_name); + var->SetType(proto::VarType::LOD_TENSOR); + + return var; + }; + + for (const auto& v : transient_vars) { + add_var_to_prog(v); + } + + for (const auto& v : persistent_vars) { + auto var = add_var_to_prog(v); + var->SetPersistable(true); + } + + return prog; +} + +void RunPassAndAssert(ProgramDesc* prog, const std::string& from, + const std::string& to, int expected_conv_num) { + std::unique_ptr graph(new ir::Graph(*prog)); + + TestIsReachable is_reachable; + EXPECT_TRUE(is_reachable(graph)(from, to)); + + auto pass = + PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass"); + int original_nodes_num = graph->Nodes().size(); + graph.reset(pass->Apply(graph.release())); + int current_nodes_num = graph->Nodes().size(); + + EXPECT_TRUE(is_reachable(graph)(from, to)); + + EXPECT_EQ(original_nodes_num - nodes_removed + nodes_added, + current_nodes_num); + + AssertOpsCount(graph, expected_conv_num); +} +} // namespace + +TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionAsYWithElementwiseAddRelu) { + auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"bias", "weights"}); + + SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"}); + SetOp(&prog, "conv2d", + {{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}}, + {"Output", "c"}); + + SetOp(&prog, "elementwise_add", {{"X", "a"}, {"Y", "c"}}, {"Out", "d"}); + SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"}); + + RunPassAndAssert(&prog, "a", "relu", 1); +} + +TEST(ConvElementwiseAddMKLDNNFusePass, + ConvolutionProjectionAsYWithElementwiseAddRelu) { + auto prog = BuildProgramDesc({"a", "b", "c", "d", "e", "f"}, + {"bias", "weights", "bias2", "weights2"}); + + SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"}); + // right branch + SetOp(&prog, "conv2d", + {{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}}, + {"Output", "c"}); + + // left branch + SetOp(&prog, "conv2d", + {{"Input", "a"}, {"Bias", "bias2"}, {"Filter", "weights2"}}, + {"Output", "f"}); + + SetOp(&prog, "elementwise_add", {{"X", "f"}, {"Y", "c"}}, {"Out", "d"}); + SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"}); + + RunPassAndAssert(&prog, "a", "relu", 2); +} + +TEST(ConvElementwiseAddMKLDNNFusePass, + ConvolutionAsYWithElementwiseAddReluNoBias) { + auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"weights"}); + + SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"}); + SetOp(&prog, "conv2d", {{"Input", "b"}, {"Filter", "weights"}}, + {"Output", "c"}); + SetOp(&prog, "elementwise_add", {{"X", "a"}, {"Y", "c"}}, {"Out", "d"}); + SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"}); + + RunPassAndAssert(&prog, "a", "relu", 1); +} + +TEST(ConvElementwiseAddMKLDNNFusePass, ConvolutionAsXWithElementwiseAddRelu) { + auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"bias", "weights"}); + + SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"}); + SetOp(&prog, "conv2d", + {{"Input", "b"}, {"Bias", "bias"}, {"Filter", "weights"}}, + {"Output", "c"}); + + SetOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "a"}}, {"Out", "d"}); + SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"}); + + RunPassAndAssert(&prog, "a", "relu", 1); +} + +TEST(ConvElementwiseAddMKLDNNFusePass, + ConvolutionAsXWithElementwiseAddReluNoBias) { + auto prog = BuildProgramDesc({"a", "b", "c", "d", "e"}, {"weights"}); + + SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"}); + SetOp(&prog, "conv2d", {{"Input", "b"}, {"Filter", "weights"}}, + {"Output", "c"}); + SetOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "a"}}, {"Out", "d"}); + SetOp(&prog, "relu", {{"X", "d"}}, {"Out", "e"}); + + RunPassAndAssert(&prog, "a", "relu", 1); +} + +TEST(ConvElementwiseAddMKLDNNFusePass, NoFusion) { + auto prog = + BuildProgramDesc({"a", "b", "c", "d", "e", "f", "g"}, {"weights"}); + + SetOp(&prog, "sigmoid", {{"X", "a"}}, {"Out", "b"}); + SetOp(&prog, "conv2d", {{"Input", "b"}, {"Filter", "weights"}}, + {"Output", "c"}); + + SetOp(&prog, "conv2d", {{"Input", "d"}, {"Filter", "weights"}}, + {"Output", "e"}); + + SetOp(&prog, "elementwise_add", {{"X", "c"}, {"Y", "e"}}, {"Out", "f"}); + SetOp(&prog, "relu", {{"X", "f"}}, {"Out", "g"}); + + std::unique_ptr graph(new ir::Graph(prog)); + + TestIsReachable is_reachable; + EXPECT_TRUE(is_reachable(graph)("a", "g")); + + auto pass = + PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass"); + int original_nodes_num = graph->Nodes().size(); + graph.reset(pass->Apply(graph.release())); + int current_nodes_num = graph->Nodes().size(); + + EXPECT_TRUE(is_reachable(graph)("a", "g")); + EXPECT_EQ(original_nodes_num, current_nodes_num); + + AssertOpsCount(graph, 2, 1); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(conv_elementwise_add_mkldnn_fuse_pass); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.cc new file mode 100644 index 00000000..dd0fb456 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.cc @@ -0,0 +1,76 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h" +#include +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +void ConvReLUFusePass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init("conv_relu_mkldnn_fuse", graph); + + GraphPatternDetector gpd; + auto* conv_input = gpd.mutable_pattern() + ->NewNode("conv_relu_mkldnn_fuse/conv_input") + ->AsInput() + ->assert_is_op_input("conv2d", "Input"); + patterns::ConvReLU conv_relu_pattern(gpd.mutable_pattern(), + "conv_relu_mkldnn_fuse"); + conv_relu_pattern(conv_input); + + int found_conv_relu_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle ConvReLU fuse"; + GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, + conv_relu_pattern); // Filter + GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_relu_pattern); // tmp + GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_relu_pattern); // CONV op + GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, conv_relu_pattern); // Out + GET_IR_NODE_FROM_SUBGRAPH(relu, relu, conv_relu_pattern); // ReLU op + + FuseOptions fuse_option = FindFuseOption(*conv, *relu); + if (fuse_option == DO_NOT_FUSE) { + VLOG(3) << "do not perform conv+relu fuse"; + return; + } + + // Transform Conv node into ConvReLU node. + OpDesc* desc = conv->Op(); + desc->SetOutput("Output", std::vector({relu_out->Name()})); + desc->SetAttr("fuse_relu", true); + GraphSafeRemoveNodes(graph, {relu, conv_out}); + + PADDLE_ENFORCE(subgraph.count(conv_input)); + IR_NODE_LINK_TO(conv, relu_out); + + found_conv_relu_count++; + }; + + gpd(graph, handler); + + AddStatis(found_conv_relu_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_relu_mkldnn_fuse_pass, + paddle::framework::ir::ConvReLUFusePass); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h new file mode 100644 index 00000000..2174c22d --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Fuse the CONV and ReLU to a ConvReLUOp. + */ +class ConvReLUFusePass : public FusePassBase { + public: + virtual ~ConvReLUFusePass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc new file mode 100644 index 00000000..67a99570 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc @@ -0,0 +1,127 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h" + +#include +#include "paddle/fluid/framework/op_proto_maker.h" + +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, + const std::vector& inputs, + const std::vector& outputs, bool use_mkldnn = false) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + if (type == "conv2d") { + op->SetAttr("use_mkldnn", use_mkldnn); + op->SetAttr("name", name); + op->SetInput("Input", {inputs[0]}); + op->SetInput("Filter", {inputs[1]}); + op->SetInput("Bias", {inputs[2]}); + } else if (type == "relu") { + op->SetAttr("use_mkldnn", use_mkldnn); + op->SetInput("X", inputs); + } + op->SetOutput("Out", outputs); + op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), + static_cast(OpRole::kForward)); +} + +// a->OP0->b +// b->OP1->c +// (c, weights, bias)->conv->f +// (f)->relu->g +ProgramDesc BuildProgramDesc() { + ProgramDesc prog; + for (auto& v : + std::vector({"a", "b", "c", "weights", "bias", "f", "g", + "h", "weights2", "bias2", "k", "l"})) { + auto* var = prog.MutableBlock(0)->Var(v); + var->SetType(proto::VarType::SELECTED_ROWS); + if (v == "weights" || v == "bias") { + var->SetPersistable(true); + } + } + + SetOp(&prog, "OP0", "op0", std::vector({"a"}), + std::vector({"b"})); + SetOp(&prog, "OP1", "op1", std::vector({"b"}), + std::vector({"c"})); + // conv+relu, both with MKL-DNN + SetOp(&prog, "conv2d", "conv1", + std::vector({"c", "weights", "bias"}), + std::vector({"f"}), true); + SetOp(&prog, "relu", "relu1", std::vector({"f"}), + std::vector({"g"}), true); + SetOp(&prog, "OP3", "op3", std::vector({"g"}), + std::vector({"h"})); + // conv+relu, only one with MKL-DNN + SetOp(&prog, "conv2d", "conv2", + std::vector({"h", "weights2", "bias2"}), + std::vector({"k"}), true); + SetOp(&prog, "relu", "relu2", std::vector({"k"}), + std::vector({"l"})); + + return prog; +} + +TEST(ConvReLUFusePass, basic) { + auto prog = BuildProgramDesc(); + + std::unique_ptr graph(new ir::Graph(prog)); + + auto pass = PassRegistry::Instance().Get("conv_relu_mkldnn_fuse_pass"); + + int original_nodes_num = graph->Nodes().size(); + + graph.reset(pass->Apply(graph.release())); + + int current_nodes_num = graph->Nodes().size(); + + // Remove 3 Nodes: CONV, RELU, conv_out + // Add 1 Node: ConvReLU + EXPECT_EQ(original_nodes_num - 2, current_nodes_num); + + // Assert conv_relu op in newly generated graph + int conv_relu_count = 0; + + for (auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op()->Type() == "conv2d") { + auto* op = node->Op(); + ASSERT_TRUE(op->HasAttr("use_mkldnn")); + EXPECT_TRUE(boost::get(op->GetAttr("use_mkldnn"))); + // check if only "conv1" convolution is fused + auto op_name = boost::get(op->GetAttr("name")); + if (op_name == "conv1") { + ASSERT_TRUE(op->HasAttr("fuse_relu")); + bool fuse_relu = boost::get(op->GetAttr("fuse_relu")); + if (fuse_relu) { + ++conv_relu_count; + } + } else if (op_name == "conv2") { + ASSERT_FALSE(op->HasAttr("fuse_relu")); + } + } + } + EXPECT_EQ(conv_relu_count, 1); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(conv_relu_mkldnn_fuse_pass); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc new file mode 100644 index 00000000..89f51bfa --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -0,0 +1,365 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h" +#include +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace framework { +namespace ir { + +namespace { + +void UnlinkNodes(ir::Node* a, ir::Node* b) { + a->outputs.erase(std::remove(a->outputs.begin(), a->outputs.end(), b), + a->outputs.end()); + b->inputs.erase(std::remove(b->inputs.begin(), b->inputs.end(), a), + b->inputs.end()); +} + +} // namespace + +enum { U8_MAX = 255, S8_MAX = 127 }; + +using EigenVectorArrayMap = Eigen::Map>; +using string::PrettyLogDetail; + +void CPUQuantizePass::QuantizeInput(Graph* g, Node* op, Node* input, + std::string input_name, double scale_to_one, + bool is_unsigned, + std::string scale_attr_name) const { + unsigned max = is_unsigned ? U8_MAX : S8_MAX; + float scale = scale_to_one * max; + + // Create quantize output variable + VarDesc quantize_out_desc(patterns::PDNodeName("quantize", "out")); + auto* quantize_out_node = g->CreateVarNode(&quantize_out_desc); + + // create a quantize op node + OpDesc q_desc; + q_desc.SetType("quantize"); + q_desc.SetInput("Input", std::vector({input->Name()})); + q_desc.SetOutput("Output", + std::vector({quantize_out_node->Name()})); + q_desc.SetAttr("Scale", scale); + q_desc.SetAttr("is_negative_input", !is_unsigned); + auto quantize_op = g->CreateOpNode(&q_desc); // OpDesc will be copied. + + // update op's input + op->Op()->SetInput(input_name, + std::vector({quantize_out_node->Name()})); + + // link quantize op + UnlinkNodes(input, op); + IR_NODE_LINK_TO(input, quantize_op); + IR_NODE_LINK_TO(quantize_op, quantize_out_node); + IR_NODE_LINK_TO(quantize_out_node, op); + + if (!scale_attr_name.empty()) op->Op()->SetAttr(scale_attr_name, scale); +} + +void CPUQuantizePass::QuantizeInputs(Graph* g, Node* op, std::string input_name, + VarQuantScale* scales, bool are_unsigned, + std::string scale_attr_name) const { + auto inputs = op->inputs; + auto output = op->outputs[0]; + PADDLE_ENFORCE_GE(inputs.size(), 1); + PADDLE_ENFORCE_EQ(op->outputs.size(), 1); + + // create a quantize op desc prototype + OpDesc q_desc; + q_desc.SetType("quantize"); + + std::vector quantize_out_nodes(inputs.size()); + std::vector quantize_out_node_names(inputs.size()); + + double scale_out = (*scales)[output->Name()].second.data()[0]; + unsigned max = are_unsigned ? U8_MAX : S8_MAX; + float scale = scale_out * max; + + for (size_t i = 0; i < inputs.size(); i++) { + // Create quantize output variable + VarDesc quantize_out_desc(patterns::PDNodeName("quantize", "out")); + quantize_out_nodes[i] = g->CreateVarNode(&quantize_out_desc); + quantize_out_node_names[i] = quantize_out_nodes[i]->Name(); + + q_desc.SetAttr("Scale", scale); + q_desc.SetInput("Input", std::vector({inputs[i]->Name()})); + q_desc.SetOutput("Output", + std::vector({quantize_out_node_names[i]})); + q_desc.SetAttr("is_negative_input", !are_unsigned); + auto quantize_op = g->CreateOpNode(&q_desc); // OpDesc will be copied. + + // link quantize op + UnlinkNodes(inputs[i], op); + IR_NODE_LINK_TO(inputs[i], quantize_op); + IR_NODE_LINK_TO(quantize_op, quantize_out_nodes[i]); + IR_NODE_LINK_TO(quantize_out_nodes[i], op); + } + + // update op's input + op->Op()->SetInput(input_name, quantize_out_node_names); + + if (!scale_attr_name.empty()) op->Op()->SetAttr(scale_attr_name, scale); +} + +void CPUQuantizePass::DequantizeOutput(Graph* g, Node* op, Node* output, + std::string output_name, + double scale_to_one, bool is_unsigned, + std::string scale_attr_name) const { + unsigned max = is_unsigned ? U8_MAX : S8_MAX; + float scale = scale_to_one * max; + + // Create dequantize input variable + VarDesc dequantize_in_desc(patterns::PDNodeName("dequantize", "in")); + auto* dequantize_in_node = g->CreateVarNode(&dequantize_in_desc); + + // create a dequantize op node for output. + OpDesc deq_desc; + deq_desc.SetType("dequantize"); + deq_desc.SetInput("Input", + std::vector({dequantize_in_node->Name()})); + deq_desc.SetOutput("Output", std::vector({output->Name()})); + deq_desc.SetAttr("Scale", scale); + auto dequantize_op = g->CreateOpNode(&deq_desc); // OpDesc will be copied. + + // update op's output + op->Op()->SetOutput(output_name, + std::vector({dequantize_in_node->Name()})); + + // link dequantize op + UnlinkNodes(op, output); + IR_NODE_LINK_TO(op, dequantize_in_node); + IR_NODE_LINK_TO(dequantize_in_node, dequantize_op); + IR_NODE_LINK_TO(dequantize_op, output); + + if (!scale_attr_name.empty()) op->Op()->SetAttr(scale_attr_name, scale); +} + +void CPUQuantizePass::QuantizeConv(Graph* graph, + bool with_residual_data) const { + GraphPatternDetector gpd; + auto pattern = gpd.mutable_pattern(); + patterns::ConvResidual conv_pattern{pattern, name_scope_}; + conv_pattern(with_residual_data); + + int quantize_conv_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "Quantize conv2d op"; + GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern); + auto* conv_op_desc = conv_op->Op(); + + // skip if should not be quantized + if (!conv_op_desc->HasAttr("use_quantizer") || + !boost::get(conv_op_desc->GetAttr("use_quantizer"))) + return; + + GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern); + + // get scales calculated after warmup, they scale variables to MAX=1.0 + auto scales = Get("quant_var_scales"); + + auto input_scale = scales[conv_input->Name()].second.data()[0]; + bool is_input_unsigned = scales[conv_input->Name()].first; + QuantizeInput(g, conv_op, conv_input, "Input", input_scale, + is_input_unsigned, "Scale_in"); + + auto filter_scale_tensor = scales[conv_filter->Name()].second; + EigenVectorArrayMap eigen_tensor{filter_scale_tensor.data(), + filter_scale_tensor.numel(), 1}; + eigen_tensor *= static_cast(S8_MAX); + std::vector filter_scale{ + filter_scale_tensor.data(), + filter_scale_tensor.data() + filter_scale_tensor.numel()}; + + conv_op->Op()->SetAttr("Scale_weights", filter_scale); + + if (with_residual_data) { + GET_IR_NODE_FROM_SUBGRAPH(conv_residual_data, conv_residual_data, + conv_pattern); + auto residual_scale = + scales[conv_residual_data->Name()].second.data()[0]; + bool is_residual_unsigned = scales[conv_residual_data->Name()].first; + + QuantizeInput(g, conv_op, conv_residual_data, "ResidualData", + residual_scale, is_residual_unsigned, "Scale_in_eltwise"); + } + + auto output_scale = scales[conv_output->Name()].second.data()[0]; + bool is_output_unsigned = scales[conv_output->Name()].first; + DequantizeOutput(g, conv_op, conv_output, "Output", output_scale, + is_output_unsigned, "Scale_out"); + + ++quantize_conv_count; + }; + + gpd(graph, handler); + AddStatis(quantize_conv_count); + + std::stringstream msg_ss; + msg_ss << "--- quantized " << quantize_conv_count << " conv2d ops"; + if (with_residual_data) msg_ss << " with residual connection"; + PrettyLogDetail(msg_ss.str().c_str()); +} + +void CPUQuantizePass::QuantizePool(Graph* graph) const { + GraphPatternDetector gpd; + auto pattern = gpd.mutable_pattern(); + patterns::Pool pool_pattern{pattern, name_scope_}; + pool_pattern(); + + int quantize_pool_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "Quantize pool2d op"; + GET_IR_NODE_FROM_SUBGRAPH(pool_op, pool_op, pool_pattern); + auto* pool_op_desc = pool_op->Op(); + + // skip if should not be quantized + if (!pool_op_desc->HasAttr("use_quantizer") || + !boost::get(pool_op_desc->GetAttr("use_quantizer"))) + return; + + GET_IR_NODE_FROM_SUBGRAPH(pool_input, pool_input, pool_pattern); + GET_IR_NODE_FROM_SUBGRAPH(pool_output, pool_output, pool_pattern); + + // get scales calculated after warmup, they scale variables to MAX=1.0 + auto scales = Get("quant_var_scales"); + + auto input_scale = scales[pool_input->Name()].second.data()[0]; + bool is_input_unsigned = scales[pool_input->Name()].first; + QuantizeInput(g, pool_op, pool_input, "X", input_scale, is_input_unsigned); + + auto output_scale = scales[pool_output->Name()].second.data()[0]; + bool is_output_unsigned = scales[pool_output->Name()].first; + DequantizeOutput(g, pool_op, pool_output, "Out", output_scale, + is_output_unsigned); + + ++quantize_pool_count; + }; + + gpd(graph, handler); + AddStatis(quantize_pool_count); + + PrettyLogDetail("--- quantized %d pool2d ops", quantize_pool_count); +} + +void CPUQuantizePass::QuantizeConcat(Graph* graph) const { + GraphPatternDetector gpd; + auto pattern = gpd.mutable_pattern(); + patterns::Concat concat_pattern{pattern, name_scope_}; + concat_pattern(); + + int quantize_concat_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "Quantize concat op"; + GET_IR_NODE_FROM_SUBGRAPH(concat_op, concat_op, concat_pattern); + auto* concat_op_desc = concat_op->Op(); + + // skip if should not be quantized + if (!concat_op_desc->HasAttr("use_quantizer") || + !boost::get(concat_op_desc->GetAttr("use_quantizer"))) + return; + + GET_IR_NODE_FROM_SUBGRAPH(concat_out, concat_out, concat_pattern); + + // get scales calculated after warmup, they scale variables to MAX=1.0 + auto scales = Get("quant_var_scales"); + + // if all inputs were unsigned, then the output was set to unsigned + // during the scale calculation step + bool are_all_inputs_unsigned = scales[concat_out->Name()].first; + QuantizeInputs(g, concat_op, "X", &scales, are_all_inputs_unsigned); + + auto output_scale = scales[concat_out->Name()].second.data()[0]; + + DequantizeOutput(g, concat_op, concat_out, "Out", output_scale, + are_all_inputs_unsigned); + + ++quantize_concat_count; + }; + + gpd(graph, handler); + AddStatis(quantize_concat_count); + + PrettyLogDetail("--- quantized %d concat ops", quantize_concat_count); +} + +void CPUQuantizePass::QuantizePriorBox(Graph* graph) const { + GraphPatternDetector gpd; + auto pattern = gpd.mutable_pattern(); + patterns::PriorBox prior_box_pattern{pattern, name_scope_}; + prior_box_pattern(); + + int quantize_prior_box_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "Quantize prior_box op"; + GET_IR_NODE_FROM_SUBGRAPH(prior_box_op, prior_box_op, prior_box_pattern); + auto* prior_box_op_desc = prior_box_op->Op(); + + // skip if should not be quantized + if (!prior_box_op_desc->HasAttr("use_quantizer") || + !boost::get(prior_box_op_desc->GetAttr("use_quantizer"))) + return; + + GET_IR_NODE_FROM_SUBGRAPH(prior_box_input, prior_box_input, + prior_box_pattern); + + // get scales calculated after warmup, they scale variables to MAX=1.0 + auto scales = Get("quant_var_scales"); + + auto input_scale = scales[prior_box_input->Name()].second.data()[0]; + bool is_input_unsigned = scales[prior_box_input->Name()].first; + QuantizeInput(g, prior_box_op, prior_box_input, "Input", input_scale, + is_input_unsigned); + + ++quantize_prior_box_count; + }; + + gpd(graph, handler); + AddStatis(quantize_prior_box_count); + + PrettyLogDetail("--- quantized %d prior_box ops", + quantize_prior_box_count); +} + +void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { + VLOG(3) << "Quantizing the graph."; + PADDLE_ENFORCE(graph); + FusePassBase::Init(name_scope_, graph); + + PADDLE_ENFORCE(param_scope()); + + QuantizeConv(graph, false /* with_residual_data */); + QuantizeConv(graph, true /* with_residual_data */); + QuantizePool(graph); + QuantizeConcat(graph); + QuantizePriorBox(graph); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(cpu_quantize_pass, paddle::framework::ir::CPUQuantizePass) + .RequirePassAttr("quant_var_scales"); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h new file mode 100644 index 00000000..ec4db662 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h @@ -0,0 +1,74 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Map variable name to tensor of scaling factors scaling it to MAX=1.0. + * bool denotes whether quantization of the variable should be done to unsigned + * type. + */ +using VarQuantScale = + std::unordered_map>; + +/* + * Quantize all supported operators. + */ +class CPUQuantizePass : public FusePassBase { + public: + virtual ~CPUQuantizePass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; + + void QuantizeConv(Graph* graph, bool with_residual_data = false) const; + + void QuantizePool(Graph* graph) const; + + void QuantizeConcat(Graph* graph) const; + + void QuantizePriorBox(Graph* graph) const; + + void QuantizeInput(Graph* g, Node* op, Node* input, std::string input_name, + double scale_to_one, bool is_unsigned, + std::string scale_attr_name = "") const; + + // quantize all inputs of given name with the same (minimum) scale + void QuantizeInputs(Graph* g, Node* op, std::string input_name, + VarQuantScale* scales, bool are_unsigned, + std::string scale_attr_name = "") const; + + void DequantizeOutput(Graph* g, Node* op, Node* output, + std::string output_name, double scale_to_one, + bool is_unsigned, + std::string scale_attr_name = "") const; + + const std::string name_scope_{"quantize"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc new file mode 100644 index 00000000..0a689441 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -0,0 +1,310 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h" +#include +#include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, + const std::vector& inputs, + const std::vector& outputs, bool use_mkldnn, + bool use_quantizer = false) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + op->SetAttr("use_mkldnn", use_mkldnn); + op->SetAttr("name", name); + if (type == "conv2d") { + op->SetInput("Input", {inputs[0]}); + op->SetInput("Filter", {inputs[1]}); + if (inputs.size() > 2) + op->SetInput("Bias", {inputs[2]}); + else + op->SetInput("Bias", {}); + if (inputs.size() > 3) { + op->SetInput("ResidualData", {inputs[3]}); + op->SetAttr("fuse_residual_connection", true); + } else { + op->SetInput("ResidualData", {}); + op->SetAttr("fuse_residual_connection", false); + } + op->SetOutput("Output", {outputs[0]}); + op->SetAttr("use_quantizer", use_quantizer); + op->SetAttr("Scale_in", 1.0f); + op->SetAttr("Scale_out", 1.0f); + op->SetAttr("Scale_weights", std::vector{1.0f}); + } else if (type == "pool2d") { + op->SetInput("X", {inputs[0]}); + op->SetOutput("Out", {outputs[0]}); + op->SetAttr("use_quantizer", use_quantizer); + } else if (type == "dropout") { + op->SetInput("X", {inputs[0]}); + op->SetOutput("Out", {outputs[0]}); + } else if (type == "fc") { + op->SetInput("Input", {inputs[0]}); + if (inputs.size() > 1) op->SetInput("W", {inputs[1]}); + if (inputs.size() > 2) op->SetInput("Bias", {inputs[2]}); + op->SetOutput("Out", {outputs[0]}); + } else if (type == "concat") { + op->SetInput("X", inputs); + op->SetOutput("Out", outputs); + op->SetAttr("use_quantizer", use_quantizer); + } +} + +namespace { +static const std::initializer_list variable_names{ + "a", "w1", "c", "d", "w2", "e", "f", "g", + "h", "w3", "b1", "i", "j", "w4", "b2"}; +// (a,w1)->Conv1->c and c->Pool1->d +// +// (d,w2)->Conv2->e and e->Pool2->f +// +// d->Dropout1->g and g->Fc1->h and (h,w3,b1,i)->Conv3->j +// +// (d,w4, b2)->Conv4->i +ProgramDesc BuildProgramDesc(bool use_mkldnn, bool use_quantizer) { + ProgramDesc prog; + for (auto& v : variable_names) { + auto* var = prog.MutableBlock(0)->Var(v); + if (v.find("w") == 0 || v.find("b") == 0) { + var->SetPersistable(true); + } + } + + SetOp(&prog, "conv2d", "Conv1", {"a", "w1"}, {"c"}, use_mkldnn, + use_quantizer); + SetOp(&prog, "pool2d", "Pool1", {"c"}, {"d"}, use_mkldnn, use_quantizer); + + SetOp(&prog, "conv2d", "Conv2", {"d", "w2"}, {"e"}, use_mkldnn, + use_quantizer); + SetOp(&prog, "pool2d", "Pool2", {"e"}, {"f"}, use_mkldnn, use_quantizer); + + SetOp(&prog, "dropout", "Dropout1", {"d"}, {"g"}, use_mkldnn); + SetOp(&prog, "fc", "Fc1", {"g"}, {"h"}, use_mkldnn); + SetOp(&prog, "conv2d", "Conv3", {"h", "w3", "b1", "i"}, {"j"}, use_mkldnn, + use_quantizer); + + SetOp(&prog, "conv2d", "Conv4", {"c", "w4", "b2"}, {"i"}, use_mkldnn, + use_quantizer); + + return prog; +} + +void InitTensorHolder(Scope* scope, const paddle::platform::Place& place, + const char* var_name) { + auto x = scope->Var(var_name); + auto tensor = x->GetMutable(); + tensor->mutable_data(place, proto::VarType::FP32, 1); +} + +void MainTest(const ProgramDesc& prog, int conv_count, int pool_count, + int quant_count, int dequant_count, int added_nodes_count, + float scale) { + std::unique_ptr graph(new ir::Graph(prog)); + + // Init scope, as it is used in pass + auto place = paddle::platform::CPUPlace(); + NaiveExecutor exe{place}; + Scope scope; + exe.CreateVariables(prog, 0, true, &scope); + + auto* scales = new VarQuantScale(); + + for (auto& v : variable_names) { + InitTensorHolder(&scope, place, v.c_str()); + LoDTensor tensor; + tensor.Resize({1}); + auto* ptr = tensor.mutable_data(place); + ptr[0] = 2.0; + + (*scales)[v] = std::make_pair(false, std::move(tensor)); + } + + graph->SetNotOwned(kParamScopeAttr, &scope); + + auto pass = PassRegistry::Instance().Get("cpu_quantize_pass"); + pass->Set("quant_var_scales", scales); + + int original_nodes_num = graph->Nodes().size(); + + graph.reset(pass->Apply(graph.release())); + + int current_nodes_num = graph->Nodes().size(); + + int quantize_nodes_count = 0; + int dequantize_nodes_count = 0; + int conv2d_nodes_count = 0; + int pool2d_nodes_count = 0; + for (auto* node : graph->Nodes()) { + if (node->IsOp()) { + auto* op = node->Op(); + if (op->Type() == "conv2d") { + conv2d_nodes_count++; + auto op_name = boost::get(op->GetAttr("name")); + EXPECT_EQ(boost::get(op->GetAttr("Scale_in")), scale) + << "Scale_in for node '" + op_name + "'."; + EXPECT_EQ(boost::get(op->GetAttr("Scale_out")), scale) + << "Scale_out for node '" + op_name + "'."; + EXPECT_EQ( + boost::get>(op->GetAttr("Scale_weights"))[0], + scale) + << "Scale_weights for node '" + op_name + "'."; + } else if (op->Type() == "pool2d") { + pool2d_nodes_count++; + } else if (op->Type() == "quantize") { + quantize_nodes_count++; + } else if (op->Type() == "dequantize") { + dequantize_nodes_count++; + } + } + } + EXPECT_EQ(conv2d_nodes_count, conv_count); + EXPECT_EQ(pool2d_nodes_count, pool_count); + EXPECT_EQ(quantize_nodes_count, quant_count); + EXPECT_EQ(dequantize_nodes_count, dequant_count); + EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num); +} + +TEST(CpuQuantizePass, quantize) { + bool use_mkldnn = true; + bool use_quantizer = true; + // (a->QUANT1->IN1,w1)->Conv1->OUT1->DEQUANT1->c and + // c->QUANT2->IN2->Pool1->OUT2->DEQUANT2->d + // + // (d->QUANT3->IN3,w2)->Conv2->OUT3->DEQUANT3->e and + // e->QUANT4->IN4->Pool2->OUT4->DEQUANT4->f + // + // d->Dropout1->g and g->Fc1->h and + // (h->QUANT5->IN5,w3,b1,i->QUANT6->IN6)->Conv3->OUT5->DEQUANT5->j + // + // (d->QUANT7->IN7,w4, b2)->Conv4->DEQUANT6->OUT6->i + // Insert nodes: 7 Quant + 7 IN + 6 OUT + 6 DEQUANT + int added_nodes = 7 + 7 + 6 + 6; + MainTest(BuildProgramDesc(use_mkldnn, use_quantizer), 4, 2, 7, 6, added_nodes, + 2.0f * 127); +} + +TEST(CpuQuantizePass, do_not_quantize) { + bool use_mkldnn = true; + bool use_quantizer = false; + int added_nodes = 0; + MainTest(BuildProgramDesc(use_mkldnn, use_quantizer), 4, 2, 0, 0, added_nodes, + 1.0f); +} + +} // namespace + +namespace { +static const std::initializer_list variable_names_concat = { + "a1", "b1", "a2", "b2", "c", "d"}; + +// a1->Pool1->b1 +// a2->Pool2->b2 +// (b1,b2)->Concat->c +// c->Pool3->d +ProgramDesc BuildProgramDescConcat() { + ProgramDesc prog; + + SetOp(&prog, "pool2d", "Pool1", {"a1"}, {"b1"}, true, false); + SetOp(&prog, "pool2d", "Pool2", {"a2"}, {"b2"}, true, false); + SetOp(&prog, "concat", "Concat", {"b1", "b2"}, {"c"}, true, true); + SetOp(&prog, "pool2d", "Pool3", {"c"}, {"d"}, true, false); + + return prog; +} + +void MainTestConcat(const ProgramDesc& prog, int pool_count, int concat_count, + int quant_count, int dequant_count, int added_nodes_count) { + std::unique_ptr graph(new ir::Graph(prog)); + + // Init scope, as it is used in pass + auto place = paddle::platform::CPUPlace(); + NaiveExecutor exe{place}; + Scope scope; + exe.CreateVariables(prog, 0, true, &scope); + + auto* scales = new VarQuantScale(); + + for (auto& v : variable_names_concat) { + InitTensorHolder(&scope, place, v.c_str()); + LoDTensor tensor; + tensor.Resize({1}); + auto* ptr = tensor.mutable_data(place); + ptr[0] = 2.0; + + (*scales)[v] = std::make_pair(false, std::move(tensor)); + } + + graph->SetNotOwned(kParamScopeAttr, &scope); + + auto pass = PassRegistry::Instance().Get("cpu_quantize_pass"); + pass->Set("quant_var_scales", scales); + + int original_nodes_num = graph->Nodes().size(); + + graph.reset(pass->Apply(graph.release())); + + int current_nodes_num = graph->Nodes().size(); + + int quantize_nodes_count = 0; + int dequantize_nodes_count = 0; + int concat_nodes_count = 0; + int pool2d_nodes_count = 0; + for (auto* node : graph->Nodes()) { + if (node->IsOp()) { + auto* op = node->Op(); + if (op->Type() == "concat") { + concat_nodes_count++; + } else if (op->Type() == "pool2d") { + pool2d_nodes_count++; + } else if (op->Type() == "quantize") { + quantize_nodes_count++; + } else if (op->Type() == "dequantize") { + dequantize_nodes_count++; + } + } + } + EXPECT_EQ(concat_nodes_count, concat_count); + EXPECT_EQ(pool2d_nodes_count, pool_count); + EXPECT_EQ(quantize_nodes_count, quant_count); + EXPECT_EQ(dequantize_nodes_count, dequant_count); + EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num); +} + +TEST(CpuQuantizePass, concat) { + // a1->Pool1->b1 + // a2->Pool2->b2 + // (b1->QUANT1->IN1, b2->QUANT2->IN2)->Concat->c + // c->OUT1->DEQUANT1->Pool3->d + int pool_count = 3; + int concat_count = 1; + int quant_count = 2; + int dequant_count = 1; + int added_nodes_count = 6; + MainTestConcat(BuildProgramDescConcat(), pool_count, concat_count, + quant_count, dequant_count, added_nodes_count); +} + +} // namespace + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(cpu_quantize_pass); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc new file mode 100644 index 00000000..79a8ac68 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc @@ -0,0 +1,56 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h" +#include +#include + +namespace paddle { +namespace framework { +namespace ir { + +void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const { + VLOG(3) << "Marks operators which are to be quantized."; + const auto& excluded_ids_list = + Get>("quantize_excluded_op_ids"); + const auto& op_types_list = + Get>("quantize_enabled_op_types"); + for (const Node* n : graph->Nodes()) { + if (n->IsOp()) { + if (std::find(excluded_ids_list.begin(), excluded_ids_list.end(), + n->id()) != excluded_ids_list.end()) + continue; + auto* op = n->Op(); + if (op->HasAttr("use_quantizer") || op->HasProtoAttr("use_quantizer")) { + if (op_types_list.empty()) { + op->SetAttr("use_quantizer", true); + } else if (std::find(op_types_list.begin(), op_types_list.end(), + n->Name()) != op_types_list.end()) { + op->SetAttr("use_quantizer", true); + } + } + } + } +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(cpu_quantize_placement_pass, + paddle::framework::ir::CPUQuantizePlacementPass) + // a vector of operator type names to be quantized ("conv2d" etc.) + .RequirePassAttr("quantize_enabled_op_types") + // a vector of operator ids that are to be excluded from quantization + .RequirePassAttr("quantize_excluded_op_ids"); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h new file mode 100644 index 00000000..008a462d --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { +/* + * Specifies which operators should be quantized. + */ +class CPUQuantizePlacementPass : public Pass { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc new file mode 100644 index 00000000..ba4d281f --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc @@ -0,0 +1,129 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h" + +#include +#include + +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, + const std::vector& inputs, + const std::vector& outputs, + boost::tribool use_quantizer) { + auto* op = prog->MutableBlock(0)->AppendOp(); + + op->SetType(type); + + if (!boost::indeterminate(use_quantizer)) + op->SetAttr("use_quantizer", use_quantizer); + + if (type == "conv2d") { + op->SetAttr("name", name); + op->SetInput("Input", {inputs[0]}); + op->SetInput("Filter", {inputs[1]}); + op->SetInput("Bias", {inputs[2]}); + } else if (type == "relu") { + op->SetInput("X", inputs); + } else if (type == "concat") { + op->SetAttr("axis", 1); + op->SetInput("X", {inputs[0], inputs[1]}); + } else if (type == "pool2d") { + op->SetInput("X", {inputs[0]}); + } else { + FAIL() << "Unexpected operator type."; + } + op->SetOutput("Out", {outputs[0]}); +} + +// operator use_quantizer +// --------------------------------------- +// (a,b)->concat->c none +// (c,weights,bias)->conv->f false +// f->relu->g none +// g->pool->h false +// (h,weights2,bias2)->conv->k false +// k->pool->l false +ProgramDesc BuildProgramDesc() { + ProgramDesc prog; + + for (auto& v : + std::vector({"a", "b", "c", "weights", "bias", "f", "g", + "h", "weights2", "bias2", "k", "l"})) { + auto* var = prog.MutableBlock(0)->Var(v); + var->SetType(proto::VarType::SELECTED_ROWS); + if (v == "weights" || v == "bias") { + var->SetPersistable(true); + } + } + + SetOp(&prog, "concat", "concat1", {"a", "b"}, {"c"}, boost::indeterminate); + SetOp(&prog, "conv2d", "conv1", {"c", "weights", "bias"}, {"f"}, false); + SetOp(&prog, "relu", "relu1", {"f"}, {"g"}, boost::indeterminate); + SetOp(&prog, "pool2d", "pool1", {"g"}, {"h"}, false); + SetOp(&prog, "conv2d", "conv2", {"h", "weights2", "bias2"}, {"k"}, false); + SetOp(&prog, "pool2d", "pool2", {"k"}, {"l"}, false); + + return prog; +} + +void MainTest(std::initializer_list quantize_enabled_op_types, + std::initializer_list quantize_excluded_op_ids, + unsigned expected_use_quantizer_true_count) { + auto prog = BuildProgramDesc(); + + std::unique_ptr graph(new ir::Graph(prog)); + + auto pass = PassRegistry::Instance().Get("cpu_quantize_placement_pass"); + pass->Set("quantize_enabled_op_types", + new std::unordered_set(quantize_enabled_op_types)); + pass->Set("quantize_excluded_op_ids", + new std::unordered_set(quantize_excluded_op_ids)); + + graph.reset(pass->Apply(graph.release())); + + unsigned use_quantizer_true_count = 0; + + for (auto* node : graph->Nodes()) { + if (node->IsOp()) { + auto* op = node->Op(); + if (op->HasAttr("use_quantizer") && + boost::get(op->GetAttr("use_quantizer"))) { + ++use_quantizer_true_count; + } + } + } + + EXPECT_EQ(use_quantizer_true_count, expected_use_quantizer_true_count); +} + +TEST(QuantizerPlacementPass, enabled_pool) { MainTest({"pool2d"}, {}, 2); } + +TEST(QuantizerPlacementPass, enabled_conv_excluded_one) { + MainTest({"conv2d"}, {4}, 1); +} + +TEST(QuantizerPlacementPass, excluded_none) { + // 2 conv + 2 pool + MainTest({}, {}, 4); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(cpu_quantize_placement_pass); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc new file mode 100644 index 00000000..2270e2b5 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc @@ -0,0 +1,142 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file eint8_outcept in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either eint8_outpress or +// implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h" +#include +#include +#include +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace framework { +namespace ir { + +using string::PrettyLogDetail; + +void CPUQuantizeSquashPass::FindNodesToKeep( + Graph* graph, + std::unordered_map* nodes_keep_counter) const { + GraphPatternDetector gpd; + patterns::DequantAny deq_any_pattern{gpd.mutable_pattern(), "deqant_any"}; + deq_any_pattern(); + + int found_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(dequant_out, dequant_out, deq_any_pattern); + + if (nodes_keep_counter->find(dequant_out) == nodes_keep_counter->end()) + (*nodes_keep_counter)[dequant_out] = 1; + else + (*nodes_keep_counter)[dequant_out] += 1; + + found_count++; + }; + gpd(graph, handler); + AddStatis(found_count); +} + +void CPUQuantizeSquashPass::Squash( + Graph* graph, + std::unordered_map* nodes_keep_counter) const { + GraphPatternDetector gpd; + patterns::DequantQuantAny squash_pattern{gpd.mutable_pattern(), "squash"}; + squash_pattern(); + + int found_squash_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "squash requantize-quantize ops pair"; + + GET_IR_NODE_FROM_SUBGRAPH(dequant_in, dequant_in, squash_pattern); + GET_IR_NODE_FROM_SUBGRAPH(dequant_op, dequant_op, squash_pattern); + GET_IR_NODE_FROM_SUBGRAPH(dequant_out, dequant_out, squash_pattern); + GET_IR_NODE_FROM_SUBGRAPH(quant_op, quant_op, squash_pattern); + GET_IR_NODE_FROM_SUBGRAPH(quant_out, quant_out, squash_pattern); + GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, squash_pattern); + + auto* next_op_desc = next_op->Op(); + float dequant_scale = boost::get(dequant_op->Op()->GetAttr("Scale")); + float quant_scale = boost::get(quant_op->Op()->GetAttr("Scale")); + PADDLE_ENFORCE(nodes_keep_counter->find(dequant_out) != + nodes_keep_counter->end()); + + // check if dequantize op should be kept or removed, decrease the counter + bool keep_dequant = (*nodes_keep_counter)[dequant_out]-- > 1; + + if (dequant_scale == quant_scale) { + // squash dequantize-quantize to nothing + auto quant_out_var_name = quant_out->Name(); + auto next_op_inputs = next_op_desc->InputNames(); + for (const auto& name : next_op_inputs) { + auto input_names = next_op_desc->Input(name); + std::replace(input_names.begin(), input_names.end(), quant_out_var_name, + dequant_in->Name()); + next_op_desc->SetInput(name, input_names); + } + + if (keep_dequant) + GraphSafeRemoveNodes(graph, {quant_op, quant_out}); + else + GraphSafeRemoveNodes(graph, + {dequant_op, quant_op, dequant_out, quant_out}); + + IR_NODE_LINK_TO(dequant_in, next_op); + + found_squash_count++; + } else { + // squash dequantize-quantize to requantize op + OpDesc desc; + desc.SetType("requantize"); + desc.SetInput("Input", std::vector({dequant_in->Name()})); + desc.SetOutput("Output", std::vector({quant_out->Name()})); + desc.SetAttr("Scale_in", dequant_scale); + desc.SetAttr("Scale_out", quant_scale); + + auto requant_op = g->CreateOpNode(&desc); + + if (keep_dequant) + GraphSafeRemoveNodes(graph, {quant_op}); + else + GraphSafeRemoveNodes(graph, {dequant_op, quant_op, dequant_out}); + + IR_NODE_LINK_TO(dequant_in, requant_op); + IR_NODE_LINK_TO(requant_op, quant_out); + + found_squash_count++; + } + }; + gpd(graph, handler); + AddStatis(found_squash_count); + PrettyLogDetail("--- squashed %d dequantize-quantize pairs", + found_squash_count); +} + +void CPUQuantizeSquashPass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init("cpu_quantize_squash_pass", graph); + + std::unordered_map nodes_keep_counter; + FindNodesToKeep(graph, &nodes_keep_counter); + Squash(graph, &nodes_keep_counter); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(cpu_quantize_squash_pass, + paddle::framework::ir::CPUQuantizeSquashPass); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h new file mode 100644 index 00000000..e873994c --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h @@ -0,0 +1,57 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Squash dequantize->quantize pair pattern into requantize op + */ +class CPUQuantizeSquashPass : public FusePassBase { + public: + virtual ~CPUQuantizeSquashPass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; + + /* + * For each dequantize's output find the number of operators it is an input to + */ + void FindNodesToKeep( + Graph* graph, + std::unordered_map* nodes_keep_counter) const; + + /* + * Squash dequantize-quantize ops pairs into requantize or nothing + */ + void Squash(Graph* graph, + std::unordered_map* nodes_keep_counter) const; + + const std::string name_scope_{"squash"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc new file mode 100644 index 00000000..057a790c --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc @@ -0,0 +1,178 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h" +#include +#include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, + const std::vector& inputs, + const std::vector& outputs, bool use_mkldnn, + float scale = 0) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + op->SetAttr("use_mkldnn", use_mkldnn); + op->SetAttr("name", name); + if (type == "conv2d") { + op->SetInput("Input", {inputs[0]}); + if (inputs.size() > 1) op->SetInput("Filter", {inputs[1]}); + if (inputs.size() > 2) op->SetInput("Bias", {inputs[2]}); + op->SetOutput("Output", {outputs[0]}); + } else if (type == "quantize") { + op->SetInput("Input", {inputs[0]}); + op->SetOutput("Output", {outputs[0]}); + op->SetAttr("Scale", scale); + } else if (type == "dequantize") { + op->SetInput("Input", {inputs[0]}); + op->SetOutput("Output", {outputs[0]}); + op->SetAttr("Scale", scale); + } +} + +// (a,w1,b1)->Conv1->d +// d->Dequant->e +// e->Quant->f +// (f,w2,b2)->Conv2->i +ProgramDesc BuildProgramDesc(bool use_mkldnn, float scale1, float scale2) { + ProgramDesc prog; + for (auto& v : std::initializer_list( + {"a", "w1", "b1", "d", "e", "f", "w2", "b2", "i"})) { + auto* var = prog.MutableBlock(0)->Var(v); + if (v.find("w") == 0 || v.find("b") == 0) { + var->SetPersistable(true); + } + } + + SetOp(&prog, "conv2d", "Conv1", {"a", "w1", "b1"}, {"d"}, use_mkldnn); + SetOp(&prog, "dequantize", "Dequant", {"d"}, {"e"}, use_mkldnn, scale1); + SetOp(&prog, "quantize", "Quant", {"e"}, {"f"}, use_mkldnn, scale2); + SetOp(&prog, "conv2d", "Conv2", {"f", "w2", "b2"}, {"i"}, use_mkldnn); + return prog; +} + +static const std::initializer_list variable_names{ + "a", "b", "c", "d", "e", "f", "g", "h"}; +// a->Conv1->b +// b->Dequant->c +// +// c->Quant1->d and d->Conv2->e +// +// c->Conv3->f +// +// c->Quant2->g and g->Conv4->h +// +ProgramDesc BuildProgramDesc2(bool use_mkldnn, float scale1, float scale2, + float scale3) { + ProgramDesc prog; + for (auto& v : variable_names) { + prog.MutableBlock(0)->Var(v); + } + + SetOp(&prog, "conv2d", "Conv1", {"a"}, {"b"}, use_mkldnn); + SetOp(&prog, "dequantize", "Dequant", {"b"}, {"c"}, use_mkldnn, scale1); + + SetOp(&prog, "quantize", "Quant1", {"c"}, {"d"}, use_mkldnn, scale2); + SetOp(&prog, "conv2d", "Conv2", {"d"}, {"e"}, use_mkldnn); + + SetOp(&prog, "conv2d", "Conv3", {"c"}, {"f"}, use_mkldnn); + + SetOp(&prog, "quantize", "Quant2", {"c"}, {"g"}, use_mkldnn, scale3); + SetOp(&prog, "conv2d", "Conv4", {"g"}, {"h"}, use_mkldnn); + + return prog; +} + +void InitTensorHolder(Scope* scope, const paddle::platform::Place& place, + const char* var_name) { + auto x = scope->Var(var_name); + auto tensor = x->GetMutable(); + tensor->mutable_data(place, proto::VarType::FP32, 1); +} + +void MainTest(const ProgramDesc& prog, int removed_nodes_num) { + std::unique_ptr graph(new ir::Graph(prog)); + + // Init scope, as it is used in pass + auto place = paddle::platform::CPUPlace(); + NaiveExecutor exe{place}; + Scope scope; + exe.CreateVariables(prog, 0, true, &scope); + + for (auto& v : variable_names) { + InitTensorHolder(&scope, place, v.c_str()); + } + + graph->SetNotOwned(kParamScopeAttr, &scope); + + auto pass = PassRegistry::Instance().Get("cpu_quantize_squash_pass"); + + int original_nodes_num = graph->Nodes().size(); + + graph.reset(pass->Apply(graph.release())); + + int current_nodes_num = graph->Nodes().size(); + + EXPECT_EQ(original_nodes_num - removed_nodes_num, current_nodes_num); +} + +TEST(CpuQuantizeSquashPass, equal_scales) { + auto scale = 1.2345f; + auto use_mkldnn = true; + // Remove 4 nodes: Dequant, Quant, e, f + auto remove_nodes = 4; + MainTest(BuildProgramDesc(use_mkldnn, scale, scale), remove_nodes); + + use_mkldnn = !use_mkldnn; + MainTest(BuildProgramDesc(use_mkldnn, scale, scale), remove_nodes); +} + +TEST(CpuQuantizeSquashPass, inequal_scales) { + auto scale1 = 1.2345f; + auto scale2 = 21.0f; + auto use_mkldnn = true; + // Remove 3 nodes: Dequant, Quant, e + // Insert 1 node: requantize + auto remove_nodes = 2; + MainTest(BuildProgramDesc(use_mkldnn, scale1, scale2), remove_nodes); + + use_mkldnn = !use_mkldnn; + MainTest(BuildProgramDesc(use_mkldnn, scale1, scale2), remove_nodes); +} + +TEST(CpuQuantizeSquashPass, branch_to_equal_inequal_and_fp32) { + // Delete both quantize ops, + // bypass dequantize in both branches, + // insert requantize on one branch + auto scale = 1.2345f; + auto scale2 = 21.0f; + auto use_mkldnn = true; + // Remove 3 nodes: Quant1, Quant2, g + // Insert 1 node: requantize + auto remove_nodes = 2; + MainTest(BuildProgramDesc2(use_mkldnn, scale, scale, scale2), remove_nodes); + + use_mkldnn = !use_mkldnn; + MainTest(BuildProgramDesc2(use_mkldnn, scale, scale, scale2), remove_nodes); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(cpu_quantize_squash_pass); diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc new file mode 100644 index 00000000..e854559a --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc @@ -0,0 +1,56 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_NODE(id, pattern) \ + PADDLE_ENFORCE(subgraph.count(pattern.RetrieveNode(#id)), \ + "pattern has no Node called %s", #id); \ + auto* id = subgraph.at(pattern.RetrieveNode(#id)); \ + PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id); + +void DepthwiseConvMKLDNNPass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init("depthwise_conv_mkldnn_pass", graph); + GraphPatternDetector gpd; + + auto* pattern = gpd.mutable_pattern(); + pattern->NewNode("depthwise_conv") + ->assert_is_op("depthwise_conv2d") + ->assert_op_attr("use_mkldnn", true); + + int found_depthwise_conv_mkldnn_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(3) << "handle DepthwiseConvMKLDNN fuse"; + GET_NODE(depthwise_conv, (*pattern)); + depthwise_conv->Op()->SetType("conv2d"); + found_depthwise_conv_mkldnn_count++; + }; + + gpd(graph, handler); + AddStatis(found_depthwise_conv_mkldnn_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(depthwise_conv_mkldnn_pass, + paddle::framework::ir::DepthwiseConvMKLDNNPass); diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h new file mode 100644 index 00000000..ca314afd --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class DepthwiseConvMKLDNNPass : public FusePassBase { + public: + virtual ~DepthwiseConvMKLDNNPass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc new file mode 100644 index 00000000..f2dfbc84 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc @@ -0,0 +1,123 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h" + +#include + +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, + const std::vector& inputs, + const std::vector& outputs, bool use_mkldnn = false) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + op->SetAttr("use_mkldnn", use_mkldnn); + op->SetAttr("name", name); + op->SetInput("Input", {inputs[0]}); + op->SetInput("Filter", {inputs[1]}); + op->SetInput("Bias", {inputs[2]}); + op->SetOutput("Out", outputs); +} + +// (a, weights, bias)->depthwise conv mkldnn->b +// (b, weights2, bias2)->depthwise conv no mkldnn->c +// (c, weights3, bias3)->conv mkldnn->d +// (d, weights3, bias3)->conv no mkldnn->e +ProgramDesc BuildProgramDesc() { + ProgramDesc prog; + for (auto& v : std::vector( + {"a", "b", "c", "d", "e", "weights", "bias", "weights2", "bias2", + "weights3", "bias3", "weights4", "bias4"})) { + auto* var = prog.MutableBlock(0)->Var(v); + var->SetType(proto::VarType::SELECTED_ROWS); + if (v == "weights" || v == "bias" || v == "weights2" || v == "bias2" || + v == "weights3" || v == "bias3" || v == "weights4" || v == "bias4") { + var->SetPersistable(true); + } + } + + // depthwise conv with MKL-DNN + SetOp(&prog, "depthwise_conv2d", "conv1", + std::vector({"a", "weights", "bias"}), + std::vector({"b"}), true); + // depthwise conv without MKL-DNN + SetOp(&prog, "depthwise_conv2d", "conv2", + std::vector({"b", "weights2", "bias2"}), + std::vector({"c"}), false); + // conv with MKL-DNN + SetOp(&prog, "conv2d", "conv3", + std::vector({"c", "weights3", "bias3"}), + std::vector({"d"}), true); + // conv without MKL-dNN + SetOp(&prog, "conv2d", "conv4", + std::vector({"d", "weights4", "bias4"}), + std::vector({"e"}), false); + + return prog; +} + +TEST(DepthwiseConvMKLDNNPass, basic) { + auto prog = BuildProgramDesc(); + + std::unique_ptr graph(new ir::Graph(prog)); + + auto pass = PassRegistry::Instance().Get("depthwise_conv_mkldnn_pass"); + + struct counters { + int mkldnn_depthwise_conv_nodes; + int other_depthwise_conv_nodes; + int mkldnn_conv_nodes; + int other_conv_nodes; + }; + + counters before{1, 1, 1, 1}; + + graph.reset(pass->Apply(graph.release())); + + // initialize counters before loop + counters after{0, 0, 0, 0}; + + for (auto* node : graph->Nodes()) { + if (node->IsOp()) { + auto* op = node->Op(); + if (op->Type() == "conv2d") { + if (boost::get(op->GetAttr("use_mkldnn"))) + after.mkldnn_conv_nodes++; + else + after.other_conv_nodes++; + } else if (op->Type() == "depthwise_conv2d") { + if (boost::get(op->GetAttr("use_mkldnn"))) + after.mkldnn_depthwise_conv_nodes++; + else + after.other_depthwise_conv_nodes++; + } + } + } + + EXPECT_EQ(after.other_depthwise_conv_nodes, + before.other_depthwise_conv_nodes); + EXPECT_EQ(after.other_conv_nodes, before.other_conv_nodes); + EXPECT_EQ(after.mkldnn_depthwise_conv_nodes, + before.mkldnn_depthwise_conv_nodes - 1); + EXPECT_EQ(after.mkldnn_conv_nodes, before.mkldnn_conv_nodes + 1); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(depthwise_conv_mkldnn_pass); diff --git a/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc new file mode 100644 index 00000000..9cc2d3da --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.h" +#include +#include +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +void FCMKLDNNPass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE(graph); + Init("fc_mkldnn_pass", graph); + + auto* scope = param_scope(); + PADDLE_ENFORCE(scope); + + GraphPatternDetector gpd; + auto* x = gpd.mutable_pattern() + ->NewNode("fc_mkldnn_pass/x") + ->AsInput() + ->assert_is_op_input("fc", "Input"); + patterns::FCMKLDNN fc_pattern(gpd.mutable_pattern(), "fc_mkldnn_pass"); + fc_pattern(x, true /*with bias*/); + + int found_fc_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "Handle FC MKL-DNN pass"; + if (!(graph->Has("use_mkldnn") && graph->Get("use_mkldnn"))) { + VLOG(3) << "do not perform fc fuse"; + return; + } + GET_IR_NODE_FROM_SUBGRAPH(fc, fc, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(weights, weights, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(bias, bias, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(output, output, fc_pattern); + + OpDesc* desc = fc->Op(); + auto in_size = fc->inputs[0]->Var()->GetShape().size(); + if (in_size != 2 && in_size != 4) { + VLOG(3) << "Do not enable FC MKL-DNN for dimensions different than 2 & 4"; + return; + } + desc->SetAttr("use_mkldnn", true); + PADDLE_ENFORCE(subgraph.count(x)); + + found_fc_count++; + }; + + gpd(graph, handler); + + AddStatis(found_fc_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fc_mkldnn_pass, paddle::framework::ir::FCMKLDNNPass); diff --git a/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.h new file mode 100644 index 00000000..97c6b242 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.h @@ -0,0 +1,38 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Transpose weights of FC to comply with MKL-DNN interface + */ +class FCMKLDNNPass : public FusePassBase { + public: + virtual ~FCMKLDNNPass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc new file mode 100644 index 00000000..a2092a50 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc @@ -0,0 +1,51 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h" +#include +#include +#include + +namespace paddle { +namespace framework { +namespace ir { + +void MKLDNNPlacementPass::ApplyImpl(ir::Graph* graph) const { + VLOG(3) << "Applies MKL-DNN placement strategy."; + const auto& op_types_list = + Get>("mkldnn_enabled_op_types"); + if (!graph->Has("use_mkldnn")) { + graph->Set("use_mkldnn", new bool(true)); + } + for (const Node* n : graph->Nodes()) { + if (n->IsOp()) { + auto* op = n->Op(); + if (op->HasAttr("use_mkldnn") || op->HasProtoAttr("use_mkldnn")) { + if (op_types_list.empty()) { + op->SetAttr("use_mkldnn", true); + } else if (std::find(op_types_list.begin(), op_types_list.end(), + n->Name()) != op_types_list.end()) { + op->SetAttr("use_mkldnn", true); + } + } + } + } +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(mkldnn_placement_pass, paddle::framework::ir::MKLDNNPlacementPass) + .RequirePassAttr("mkldnn_enabled_op_types"); diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h new file mode 100644 index 00000000..ffa62273 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h @@ -0,0 +1,34 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Specifies which operators should use MKLDNN. + */ +class MKLDNNPlacementPass : public Pass { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc new file mode 100644 index 00000000..5885f327 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc @@ -0,0 +1,136 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h" + +#include +#include + +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, + const std::vector& inputs, + const std::vector& outputs, boost::tribool use_mkldnn) { + auto* op = prog->MutableBlock(0)->AppendOp(); + + op->SetType(type); + + if (!boost::indeterminate(use_mkldnn)) op->SetAttr("use_mkldnn", use_mkldnn); + + if (type == "conv2d") { + op->SetAttr("name", name); + op->SetInput("Input", {inputs[0]}); + op->SetInput("Filter", {inputs[1]}); + op->SetInput("Bias", {inputs[2]}); + } else if (type == "relu") { + op->SetInput("X", inputs); + } else if (type == "concat") { + op->SetAttr("axis", 1); + op->SetInput("X", {inputs[0], inputs[1]}); + } else if (type == "pool2d") { + op->SetInput("X", {inputs[0]}); + } else { + FAIL() << "Unexpected operator type."; + } + op->SetOutput("Out", {outputs[0]}); +} + +// operator use_mkldnn +// --------------------------------------- +// (a,b)->concat->c none +// (c,weights,bias)->conv->f none +// f->relu->g false +// g->pool->h false +// (h,weights2,bias2)->conv->k true +// k->relu->l true +ProgramDesc BuildProgramDesc() { + ProgramDesc prog; + + for (auto& v : + std::vector({"a", "b", "c", "weights", "bias", "f", "g", + "h", "weights2", "bias2", "k", "l"})) { + auto* var = prog.MutableBlock(0)->Var(v); + var->SetType(proto::VarType::SELECTED_ROWS); + if (v == "weights" || v == "bias") { + var->SetPersistable(true); + } + } + + SetOp(&prog, "concat", "concat1", std::vector({"a", "b"}), + std::vector({"c"}), boost::indeterminate); + SetOp(&prog, "conv2d", "conv1", + std::vector({"c", "weights", "bias"}), + std::vector({"f"}), boost::indeterminate); + SetOp(&prog, "relu", "relu1", std::vector({"f"}), + std::vector({"g"}), false); + SetOp(&prog, "pool2d", "pool1", std::vector({"g"}), + std::vector({"h"}), false); + SetOp(&prog, "conv2d", "conv2", + std::vector({"h", "weights2", "bias2"}), + std::vector({"k"}), true); + SetOp(&prog, "relu", "relu2", std::vector({"k"}), + std::vector({"l"}), true); + + return prog; +} + +void MainTest(std::initializer_list mkldnn_enabled_op_types, + unsigned expected_use_mkldnn_true_count) { + auto prog = BuildProgramDesc(); + + std::unique_ptr graph(new ir::Graph(prog)); + + auto pass = PassRegistry::Instance().Get("mkldnn_placement_pass"); + pass->Set("mkldnn_enabled_op_types", + new std::unordered_set(mkldnn_enabled_op_types)); + + graph.reset(pass->Apply(graph.release())); + + unsigned use_mkldnn_true_count = 0; + + for (auto* node : graph->Nodes()) { + if (node->IsOp()) { + auto* op = node->Op(); + if (op->HasAttr("use_mkldnn") && + boost::get(op->GetAttr("use_mkldnn"))) { + ++use_mkldnn_true_count; + } + } + } + + EXPECT_EQ(use_mkldnn_true_count, expected_use_mkldnn_true_count); +} + +TEST(MKLDNNPlacementPass, enable_conv_relu) { + // 1 conv (1 conv is always true) + 2 relu (1 relu is always true) + 0 pool + MainTest({"conv2d", "relu"}, 3); +} + +TEST(MKLDNNPlacementPass, enable_relu_pool) { + // 1 conv (1 conv is always true) + 2 relu (1 relu is always true) + 1 pool + MainTest({"relu", "pool2d"}, 4); +} + +TEST(MKLDNNPlacementPass, enable_all) { + // 1 conv (1 conv is always true) + 2 relu (1 relu is always true) + 1 pool + MainTest({}, 4); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(mkldnn_placement_pass); diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc new file mode 100644 index 00000000..a8720ff4 --- /dev/null +++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc @@ -0,0 +1,336 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/multi_batch_merge_pass.h" + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_proto_maker.h" + +namespace paddle { +namespace framework { +namespace ir { + +static const char kNumRepeats[] = "num_repeats"; +typedef std::unordered_map> SSAVarList; + +ir::Node* SameNameVar(std::unordered_set all, ir::Node* target) { + for (auto n : all) { + if (target->IsVar() && target->Name() == n->Name()) { + return n; + } + } + return nullptr; +} + +VarDesc CopyVarDesc(VarDesc* var_desc) { + VarDesc repeated_var(var_desc->Name()); + // copy other variable attributes + if (var_desc->GetType() != proto::VarType::READER) { + repeated_var.SetType(var_desc->GetType()); + repeated_var.SetShape(var_desc->GetShape()); + repeated_var.SetDataType(var_desc->GetDataType()); + repeated_var.SetLoDLevel(var_desc->GetLoDLevel()); + repeated_var.SetPersistable(var_desc->Persistable()); + } else { + // TODO(typhoonzero): copy reader var + } + return repeated_var; +} + +VarDesc UpdateGradVarDesc( + VarDesc* var_desc, int repeat, + const std::unordered_set& grad_names, + const std::unordered_set& bn_vars_need_rename) { + if (grad_names.find(var_desc->Name()) != grad_names.end() || + bn_vars_need_rename.find(var_desc->Name()) != bn_vars_need_rename.end()) { + std::string new_gname = + string::Sprintf("%s.repeat.%d", var_desc->Name(), repeat); + VarDesc repeated_var = CopyVarDesc(var_desc); + repeated_var.SetName(new_gname); + VLOG(3) << "update " << var_desc->Name() << " to repeat " << repeat; + return repeated_var; + } + return *var_desc; +} + +void BatchMergePass::ApplyImpl(ir::Graph* graph) const { + int num_repeats = Get(kNumRepeats); + std::vector forward_backward_ops; + std::vector optimize_ops; + std::vector lr_ops; // ops other than forward/backward/optimize + std::unordered_set grad_names; + std::unordered_map gradname2paramname; + + std::vector nodes = TopologySortOperations(*graph); + auto origin_nodes = graph->ReleaseNodes(); + VLOG(3) << "origin nodes count: " << origin_nodes.size(); + ir::Graph& result = *graph; + + // 1. record op nodes of different roles + for (auto node : nodes) { + if (!node->IsOp()) continue; + PADDLE_ENFORCE(node->Op(), "must find opdesc"); + int op_role = boost::get(node->Op()->GetAttr( + framework::OpProtoAndCheckerMaker::OpRoleAttrName())); + if ((op_role == static_cast(framework::OpRole::kForward)) || + (op_role & static_cast(framework::OpRole::kBackward)) || + (op_role & static_cast(framework::OpRole::kLoss))) { + forward_backward_ops.push_back(node); + } else if ((op_role & static_cast(framework::OpRole::kOptimize)) || + (op_role & static_cast(framework::OpRole::kDist)) || + (op_role & static_cast(framework::OpRole::kRPC))) { + optimize_ops.push_back(node); + auto op_role_var = node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName()); + auto op_role_vars = boost::get>(op_role_var); + for (size_t i = 0; i < op_role_vars.size(); i += 2) { + grad_names.insert(op_role_vars[i + 1]); + gradname2paramname[op_role_vars[i + 1]] = op_role_vars[i]; + } + } else if (op_role & static_cast(framework::OpRole::kLRSched)) { + lr_ops.push_back(node); + } else { // NOLINT + PADDLE_THROW("Invalid op_role: %d", static_cast(op_role)); + } + } + + // 2. copy forward backward + ir::Node* prev_repeat_last_op_node = nullptr; + // record origin_grad -> repeated_grad_list map. + std::map> grad_repeated_map; + std::map> created; + std::unordered_set bn_vars_need_rename; + for (int i = 0; i < num_repeats; ++i) { + std::unordered_set copied; + for (size_t node_idx = 0; node_idx < forward_backward_ops.size(); + ++node_idx) { + auto node = forward_backward_ops[node_idx]; + OpDesc repeated_op(*(node->Op()), node->Op()->Block()); + // 3. rename grad outputs to current repeat. + for (auto outname : repeated_op.OutputArgumentNames()) { + if (grad_names.find(outname) != grad_names.end()) { + std::string new_gname = string::Sprintf("%s.repeat.%d", outname, i); + repeated_op.RenameOutput(outname, new_gname); + // remove op_role_var for backward ops that outputs grad for a + // parameter. + repeated_op.SetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName(), + std::vector()); + } + } + // 3.5 let batch_norm ops use independent vars, note batch_norm_grad do + // not need this update, because only moving mean and variance should be + // differ, trainable parameter scale and bias is the same as other + // parameters. + if (node->Name() == "batch_norm") { + // NOTE: assume bn op created by layers use save var as output mean and + // variance + std::string new_mean_name = + string::Sprintf("%s.repeat.%d", repeated_op.Input("Mean")[0], i); + std::string new_var_name = string::Sprintf( + "%s.repeat.%d", repeated_op.Input("Variance")[0], i); + bn_vars_need_rename.insert(repeated_op.Input("Mean")[0]); + bn_vars_need_rename.insert(repeated_op.Input("Variance")[0]); + VLOG(3) << "renaming " << repeated_op.Input("Mean")[0] << " to " + << new_mean_name; + repeated_op.RenameInput(repeated_op.Input("Mean")[0], new_mean_name); + repeated_op.RenameInput(repeated_op.Input("Variance")[0], new_var_name); + repeated_op.RenameOutput(repeated_op.Output("MeanOut")[0], + new_mean_name); + repeated_op.RenameOutput(repeated_op.Output("VarianceOut")[0], + new_var_name); + } + + // 3.9 do copy + auto repeated_node = result.CreateOpNode(&repeated_op); + copied.insert(node); + + // 4. add deps between repeats + if (node_idx == forward_backward_ops.size() - 1) { + prev_repeat_last_op_node = repeated_node; + } + if (node_idx == 0 && prev_repeat_last_op_node) { + auto* depvar = result.CreateControlDepVar(); + prev_repeat_last_op_node->outputs.push_back(depvar); + depvar->inputs.push_back(prev_repeat_last_op_node); + repeated_node->inputs.push_back(depvar); + depvar->outputs.push_back(repeated_node); + } + + for (auto in_node : node->inputs) { + if (in_node->IsCtrlVar()) { + continue; + } + ir::Node* var = nullptr; + auto updated_var = UpdateGradVarDesc(in_node->Var(), i, grad_names, + bn_vars_need_rename); + // should be initialized by startup, how to initilize tensor in the + // scope? + if (node->Name() == "batch_norm" && + bn_vars_need_rename.find(in_node->Name()) != + bn_vars_need_rename.end()) { + // Create bn mean/variance for each repeat + var = result.CreateVarNode(&updated_var); + created[updated_var.Name()].push_back(var); + copied.insert(in_node); + repeated_node->inputs.push_back(var); + var->outputs.push_back(repeated_node); + continue; + } + + // for other ops + if (in_node->inputs.empty() && i > 0) { + // do not copy head vars (inputs, params) in repeats > 0 + var = created.at(in_node->Name()).back(); + } else { + if (copied.find(in_node) == copied.end()) { + var = result.CreateVarNode(&updated_var); + if (grad_names.find(in_node->Var()->Name()) != grad_names.end()) { + grad_repeated_map[in_node].push_back(var); + } + copied.insert(in_node); + created[updated_var.Name()].push_back(var); + } else { + var = created.at(updated_var.Name()).back(); + } + } + repeated_node->inputs.push_back(var); + var->outputs.push_back(repeated_node); + } + for (auto out_node : node->outputs) { + if (out_node->IsCtrlVar()) { + continue; + } + ir::Node* var = nullptr; + auto updated_var = UpdateGradVarDesc(out_node->Var(), i, grad_names, + bn_vars_need_rename); + if (copied.find(out_node) == copied.end()) { + var = result.CreateVarNode(&updated_var); + if (grad_names.find(out_node->Var()->Name()) != grad_names.end()) { + grad_repeated_map[out_node].push_back(var); + } + copied.insert(out_node); + created[updated_var.Name()].push_back(var); + } else { + var = created.at(updated_var.Name()).back(); + } + repeated_node->outputs.push_back(var); + var->inputs.push_back(repeated_node); + } + } + } // end copy forward backward + + // 5. create GRAD merge op node: sum(repeat.0...repeat.n) -> + // scale(1/num_repeats) + for (auto kv : grad_repeated_map) { + OpDesc sum_op; + sum_op.SetType("sum"); + std::vector repeated_grad_names; + std::vector param_grad_op_role_var; + for (auto r : kv.second) { + repeated_grad_names.push_back(r->Var()->Name()); + } + // NOTE: use op_role_var to control allreduce op appending in + // multi_devices_graph_pass, we want to append op_role_var + // only once for the merged gradient, so break after first call. + param_grad_op_role_var.push_back( + gradname2paramname.at(kv.first->Var()->Name())); // param + param_grad_op_role_var.push_back(kv.first->Var()->Name()); // grad + + sum_op.SetInput("X", repeated_grad_names); + sum_op.SetOutput("Out", {kv.first->Var()->Name()}); + sum_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), + static_cast(OpRole::kBackward)); + auto sum_op_node = result.CreateOpNode(&sum_op); + for (auto r : kv.second) { + sum_op_node->inputs.push_back(r); + r->outputs.push_back(sum_op_node); + } + auto sum_out_var_node = result.CreateVarNode(kv.first->Var()); + sum_op_node->outputs.push_back(sum_out_var_node); + sum_out_var_node->inputs.push_back(sum_op_node); + created[sum_out_var_node->Name()].push_back(sum_out_var_node); + + OpDesc scale_op; + scale_op.SetType("scale"); + scale_op.SetInput("X", {sum_out_var_node->Var()->Name()}); + // NOTE: inplace scale. + scale_op.SetOutput("Out", {sum_out_var_node->Var()->Name()}); + scale_op.SetAttr("scale", static_cast(1.0f / num_repeats)); + scale_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), + static_cast(OpRole::kBackward)); + + scale_op.SetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName(), + param_grad_op_role_var); + + auto scale_op_node = result.CreateOpNode(&scale_op); + scale_op_node->inputs.push_back(sum_out_var_node); + sum_out_var_node->outputs.push_back(scale_op_node); + auto scale_out_var_node = result.CreateVarNode(sum_out_var_node->Var()); + scale_op_node->outputs.push_back(scale_out_var_node); + scale_out_var_node->inputs.push_back(scale_op_node); + created[scale_out_var_node->Name()].push_back(scale_out_var_node); + } + // 6. add optimize ops + { + auto copy_node = [&result, &created](ir::Node* node) { + auto op_node = result.CreateOpNode(node->Op()); + // copy op ins/outs + // NOTE: for send/recv ops, the OpDesc uses ctrldepvar to describe + // dependencies, so create those depvars if OpDesc have in/outs. + for (auto in_node : node->inputs) { + if (in_node->IsCtrlVar() && !in_node->Var()) { + continue; + } + ir::Node* var = nullptr; + if (created.find(in_node->Name()) == created.end()) { + var = result.CreateVarNode(in_node->Var()); + created[in_node->Name()].push_back(var); + } else { + var = created.at(in_node->Name()).back(); + } + op_node->inputs.push_back(var); + var->outputs.push_back(op_node); + } + for (auto out_node : node->outputs) { + if (out_node->IsCtrlVar() && !out_node->Var()) { + continue; + } + auto var = result.CreateVarNode(out_node->Var()); + created[out_node->Name()].push_back(var); + op_node->outputs.push_back(var); + var->inputs.push_back(op_node); + } + }; + for (auto node : lr_ops) { + copy_node(node); + } + for (auto node : optimize_ops) { + copy_node(node); + } + } + + result.ResolveHazard(created); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(multi_batch_merge_pass, paddle::framework::ir::BatchMergePass) + .RequirePassAttr(paddle::framework::ir::kNumRepeats); diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.h b/paddle/fluid/framework/ir/multi_batch_merge_pass.h new file mode 100644 index 00000000..a8961668 --- /dev/null +++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.h @@ -0,0 +1,44 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +// BatchMergePass is used to copy forward and backward ops for several +// times to run several batches to simulate large batch size training +// as if we have more than 1 GPUs. +// User can define how many batches to run, gradients will be merged +// through those repeats, and then do optimization using merged gradients. +// This pass is extremely useful when doing large batch-size distributed +// sync training, we can simulate even large batch size as if we have more +// GPUs. + +class BatchMergePass : public Pass { + public: + virtual ~BatchMergePass() {} + + protected: + void ApplyImpl(Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt b/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt new file mode 100644 index 00000000..4cdb6a7d --- /dev/null +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt @@ -0,0 +1,17 @@ +cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) + +cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper) +cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc DEPS multi_devices_helper) + +set(ALL_REDUCE_OP_HANDLES all_reduce_op_handle) +if(WITH_GPU AND WITH_DGC) + list(APPEND ALL_REDUCE_OP_HANDLES sparse_all_reduce_op_handle) +endif() + +cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle + scale_loss_grad_op_handle rpc_op_handle fetch_barrier_op_handle ${ALL_REDUCE_OP_HANDLES} reduce_op_handle broadcast_op_handle fused_broadcast_op_handle) +cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass) + +cc_library(fuse_all_reduce_op_pass SRCS fuse_all_reduce_op_pass.cc DEPS graph graph_helper fused_all_reduce_op_handle) +cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS all_reduce_op_handle graph graph_helper pass) +cc_library(backward_optimizer_op_deps_pass SRCS backward_optimizer_op_deps_pass.cc DEPS graph graph_helper pass) diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc new file mode 100644 index 00000000..1019c4f8 --- /dev/null +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc @@ -0,0 +1,219 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/details/all_reduce_op_handle.h" +#include "paddle/fluid/framework/details/container_cast.h" +#include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/op_proto_maker.h" + +namespace paddle { +namespace framework { +namespace ir { + +class AllReduceDepsPass : public ir::Pass { + protected: + void ApplyImpl(ir::Graph* graph) const override { + std::vector all_reduce_op_handles = + GetSortedAllReduceOps(*graph); + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + auto use_hierarchical_allreduce = + Get(details::kUseHierarchicalAllReduce); + for (size_t i = 0; i < all_reduce_op_handles.size(); ++i) { + auto op_handle = + dynamic_cast(all_reduce_op_handles[i]); + PADDLE_ENFORCE(op_handle, "op_handle must be NCCLOpHandleBase"); + op_handle->SetRunEnv(i, use_hierarchical_allreduce); + } +#endif + + for (size_t i = 1; i < all_reduce_op_handles.size(); ++i) { + auto* dep_var = new details::DummyVarHandle(graph->CreateControlDepVar()); + graph->Get(details::kGraphDepVars) + .emplace(dep_var); + all_reduce_op_handles[i - 1]->AddOutput(dep_var); + all_reduce_op_handles[i]->AddInput(dep_var); + } + + if (VLOG_IS_ON(10)) { + DebugString(*graph, all_reduce_op_handles); + } + } + + std::vector GetSortedAllReduceOps( + const ir::Graph& graph) const { + std::vector all_reduce_op_handles; + std::unordered_map pending_ops; + std::unordered_set ready_ops; + std::unordered_set next_ready_ops; + auto op_handles = ir::FilterByNodeWrapper(graph); + size_t num_of_ops = op_handles.size(); + for (details::OpHandleBase* op : op_handles) { + size_t not_ready_vars = op->NotReadyInputSize(); + if (not_ready_vars) { + pending_ops.insert({op, not_ready_vars}); + } else { + ready_ops.insert(op); + } + } + + GetSortedAllReduceOps(ready_ops, &all_reduce_op_handles); + + size_t has_run_ops = ready_ops.size(); + while (has_run_ops != num_of_ops) { + for (auto* op : ready_ops) { + for (auto& ready_var : op->Outputs()) { + for (auto* pend_op : ready_var->PendingOps()) { + auto& deps = --pending_ops[pend_op]; + if (deps == 0) { + next_ready_ops.insert(pend_op); + } + } + } + } + + PADDLE_ENFORCE_NE(next_ready_ops.size(), 0, "There maybe have a cycle."); + ready_ops.clear(); + std::swap(ready_ops, next_ready_ops); + GetSortedAllReduceOps(ready_ops, &all_reduce_op_handles); + has_run_ops += ready_ops.size(); + } + return all_reduce_op_handles; + } + + void GetSortedAllReduceOps( + const std::unordered_set& ready_ops, + std::vector* all_reduce_op_handles) const { + std::vector current_all_reduce_op_handles; + for (auto& op_handle : ready_ops) { + auto all_reduce_op_handle = + dynamic_cast(op_handle); + auto fused_all_reduce_op_handle = + dynamic_cast(op_handle); + + if (all_reduce_op_handle || fused_all_reduce_op_handle) { + current_all_reduce_op_handles.emplace_back(op_handle); + } + } + + // NOTE(zcd): For distributed training, it is important to keep the order of + // allReduce on each node consistent. Otherwise, hang may occur. + // Sort the current_all_reduce_op_handles according to the name of input. + sort(current_all_reduce_op_handles.begin(), + current_all_reduce_op_handles.end(), + [](const details::OpHandleBase* left, + const details::OpHandleBase* right) -> bool { + auto left_in_vars = + details::DynamicCast(left->Inputs()); + auto right_in_vars = + details::DynamicCast(right->Inputs()); + PADDLE_ENFORCE_GT(left_in_vars.size(), 0); + PADDLE_ENFORCE_EQ(left_in_vars.size(), right_in_vars.size()); + return left_in_vars[0]->Name() > right_in_vars[0]->Name(); + }); + + all_reduce_op_handles->insert(all_reduce_op_handles->end(), + current_all_reduce_op_handles.begin(), + current_all_reduce_op_handles.end()); + } + + void DebugString( + const ir::Graph& graph, + const std::vector& all_reduce_op_handles) const { + // get vars order + std::map> vars = + GetSoredGradientsFromStaleProgram(graph); + std::stringstream out; + size_t grads_of_stale_program = 0; + out << "Get Order From details::kStaleProgramOpDescs: "; + for (auto& var : vars) { + out << "Order " << var.first << " ["; + for (auto& var_name : var.second) { + out << var_name << ", "; + ++grads_of_stale_program; + } + out << "], "; + } + VLOG(10) << out.str(); + + std::stringstream out2; + out2 << "Get Order From Topological order: "; + for (auto& op : all_reduce_op_handles) { + bool find_valid_input = false; + for (auto& in_var : op->Inputs()) { + if (dynamic_cast(in_var)) { + out2 << in_var->Name() << ", "; + find_valid_input = true; + break; + } + } + PADDLE_ENFORCE(find_valid_input, "Doesn't find valid input."); + } + VLOG(10) << out2.str(); + if (grads_of_stale_program != all_reduce_op_handles.size()) { + VLOG(10) + << "The gradients number of stale program and graph is not equal."; + } + } + + std::map> GetSoredGradientsFromStaleProgram( + const ir::Graph& graph) const { + std::map> vars; + auto ops = + graph.Get>(details::kStaleProgramOpDescs); + int order = 0; + for (auto* op_desc : ops) { + try { + bool is_bk_op = + static_cast(boost::get(op_desc->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) & + static_cast(OpRole::kBackward)); + if (!is_bk_op) continue; + + auto backward_vars = + boost::get>(op_desc->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + if (backward_vars.empty()) continue; + + PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); + for (size_t i = 1; i < backward_vars.size(); i += 2) { + vars[order].emplace_back(backward_vars[i]); + VLOG(1) << "get parameter and gradient: " << backward_vars[i - 1] + << ", " << backward_vars[i]; + } + order++; + } catch (boost::bad_get e) { + } + } + return vars; + } +}; +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(all_reduce_deps_pass, paddle::framework::ir::AllReduceDepsPass) + .RequireGraphAttr(paddle::framework::details::kStaleProgramOpDescs); diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/backward_optimizer_op_deps_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/backward_optimizer_op_deps_pass.cc new file mode 100644 index 00000000..c7ab32a2 --- /dev/null +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/backward_optimizer_op_deps_pass.cc @@ -0,0 +1,223 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/details/container_cast.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { +namespace ir { + +class BackWardOpDepsPass : public ir::Pass { + protected: + void AddDep(ir::Graph* graph, details::OpHandleBase* l, + details::OpHandleBase* r) const { + auto* dep_var = new details::DummyVarHandle(graph->CreateControlDepVar()); + graph->Get(details::kGraphDepVars).emplace(dep_var); + l->AddOutput(dep_var); + r->AddInput(dep_var); + VLOG(10) << "add deps:" << l->DebugString() << " and " << r->DebugString(); + } + + void ApplyImpl(ir::Graph* graph) const override { + // NOTE: The operator nodes should be in topology order. + std::vector backward_op_handles; + std::vector all_opt_handles; + details::ParamsAndGrads params_grads; + std::vector topo_nodes = ir::TopologySortOperations(*graph); + for (auto& node : topo_nodes) { + if (!node->Op()) continue; + + GetBackWardOpHandles(node, &backward_op_handles, ¶ms_grads); + GetOptimizerOpHandles(node, &all_opt_handles); + } + + VLOG(10) << "backward_op_handles size:" << backward_op_handles.size() + << ", opt_handles size:" << all_opt_handles.size(); + + if (backward_op_handles.size() <= 1 || all_opt_handles.size() <= 1) { + VLOG(10) << "need not backward_op_deps_pass"; + return; + } + + std::vector opt_handles; + GetOptimizerHandlesRoot(all_opt_handles, &opt_handles, params_grads); + + if (opt_handles.size() <= 1) { + VLOG(10) << "need not backward_op_deps_pass"; + return; + } + + VLOG(10) << "add optimize deps"; + for (size_t i = 1; i < opt_handles.size(); ++i) { + AddDep(graph, opt_handles[i - 1], opt_handles[i]); + } + + VLOG(10) << "add deps between backward and optimze:"; + AddDep(graph, backward_op_handles[backward_op_handles.size() - 1], + opt_handles[0]); + } + + /* + * When the backward ophandles complete, the optimizer ophandle's inputs var + * are ready.Since the optimizer ophandles can be seen as graphs which each of + * them doesn't connect to each other, they can run parallelly or by a + * specified order, such as by the grads generated order. This function will + * get these graphs' root. + */ + void GetOptimizerHandlesRoot( + const std::vector& ops, + std::vector* result, + const details::ParamsAndGrads& params_grads) const { + std::unordered_set visit; + for (auto op : ops) { + if (visit.find(op) != visit.end()) { + continue; + } + + VLOG(10) << "visiting all_opt_handles:" << op->DebugString(); + + result->emplace_back(op); + visit.insert(op); + VisitChildrens(op, &visit); + } + + for (size_t i = 0; i < result->size(); i++) { + VLOG(10) << "get potential head op:" << (*result)[i]->DebugString(); + } + + // sort by param_grad order + std::unordered_map pg_order; + int order = 0; + for (auto& p_g : params_grads) { + pg_order[p_g.second] = order++; + } + + std::vector> op_handles; + for (auto op : *result) { + int order = 0; + for (auto input : op->Inputs()) { + if (dynamic_cast(input) == nullptr) continue; + + if (pg_order.find(input->Name()) == pg_order.end()) { + VLOG(10) << "not find input " << input->Name() << " in grad"; + continue; + } + + if (order < pg_order.at(input->Name())) { + order = pg_order.at(input->Name()); + } + } + op_handles.emplace_back(std::make_pair(op, order)); + } + + sort(op_handles.begin(), op_handles.end(), + [](const std::pair& left, + const std::pair& right) -> bool { + return left.second < right.second; + }); + + result->clear(); + for (auto p : op_handles) { + result->emplace_back(p.first); + } + + for (size_t i = 0; i < result->size(); i++) { + VLOG(10) << "get head op:" << (*result)[i]->DebugString(); + } + } + + void VisitChildrens(details::OpHandleBase* op, + std::unordered_set* visit) const { + for (auto out : op->Outputs()) { + for (auto* pending_op : out->PendingOps()) { + if (visit->find(pending_op) != visit->end()) { + continue; + } + + VLOG(10) << "visiting:" << pending_op->DebugString(); + + visit->insert(pending_op); + VisitChildrens(pending_op, visit); + } + } + } + + void GetBackWardOpHandles( + ir::Node* node, std::vector* backward_op_handles, + details::ParamsAndGrads* params_grads) const { + try { + bool is_bk_op = + static_cast(boost::get(node->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) & + static_cast(OpRole::kBackward)); + if (!is_bk_op) return; + + // Currently, we assume that once gradient is generated, it can be + // broadcast, and each gradient is only broadcast once. + auto backward_vars = + boost::get>(node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + PADDLE_ENFORCE_EQ(backward_vars.size() % 2, static_cast(0)); + PADDLE_ENFORCE(node->IsWrappedBy()); + + backward_op_handles->emplace_back( + &node->Wrapper()); + + for (size_t i = 0; i < backward_vars.size(); i += 2) { + VLOG(10) << "Trainable parameter: " << backward_vars[i] + << ", gradient: " << backward_vars[i + 1]; + + params_grads->emplace_back(std::make_pair( + backward_vars[i] /*param*/, backward_vars[i + 1] /*grad*/)); + } + } catch (boost::bad_get e) { + } + } + + void GetOptimizerOpHandles( + ir::Node* node, std::vector* opt_handles) const { + try { + bool is_opt_op = + static_cast(boost::get(node->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) & + static_cast(OpRole::kOptimize)); + if (!is_opt_op) return; + + opt_handles->emplace_back(&node->Wrapper()); + } catch (boost::bad_get e) { + } + } +}; +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(backward_optimizer_op_deps_pass, + paddle::framework::ir::BackWardOpDepsPass); diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc new file mode 100644 index 00000000..d0afebcb --- /dev/null +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc @@ -0,0 +1,206 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "paddle/fluid/framework/details/all_reduce_op_handle.h" +#include "paddle/fluid/framework/details/container_cast.h" +#include "paddle/fluid/framework/details/fused_all_reduce_op_handle.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +class FuseAllReduceOpPass : public ir::Pass { + protected: + void ApplyImpl(ir::Graph *graph) const override { + ir::Graph &result = *graph; + auto &places = Get>(details::kPlaces); + auto &local_scopes = Get>(details::kLocalScopes); +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + auto *multi_nccl_ctxs = + &Get(details::kNCCLCtxs); +#endif + + auto ¶ms_grads = + result.Get(details::kParamsAndDenseGrads); + size_t num_of_all_reduce = params_grads.size(); + std::unordered_set grads; + grads.reserve(num_of_all_reduce); + for (auto p_g : params_grads) { + grads.insert(p_g.second); + } + + std::unordered_map all_reduce_ops = + GetAllReduceOps(result, places, grads); + + VLOG(10) << "Find all_reduce_ops: " << all_reduce_ops.size(); + if (all_reduce_ops.size() == 0) { + return; + } + + PADDLE_ENFORCE_EQ(all_reduce_ops.size(), grads.size(), + "The number of all_reduce OpHandle is not equal to the " + "number of grads. Maybe some gradients are sparse type, " + "it is not supported currently."); + VLOG(10) << "Insert fused_all_reduce"; + + auto &group_params_grads = graph->Get( + details::kGroupParamsAndDenseGrads); + + for (auto &group_p_g : group_params_grads) { + size_t group_size = group_p_g.size(); + PADDLE_ENFORCE_GT(group_size, static_cast(0)); + std::vector group_all_reduce_ops; + group_all_reduce_ops.reserve(group_size); + for (auto &p_g : group_p_g) { + group_all_reduce_ops.emplace_back(all_reduce_ops.at(p_g.second)); + } +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + InsertFusedAllReduce(places, local_scopes, group_size, + group_all_reduce_ops, multi_nccl_ctxs, &result); +#else + InsertFusedAllReduce(places, local_scopes, group_size, + group_all_reduce_ops, &result); +#endif + } + } + + std::unordered_map GetAllReduceOps( + const Graph &result, const std::vector &places, + const std::unordered_set &grads) const { + size_t num_place = places.size(); + std::unordered_map all_reduce_ops; + all_reduce_ops.reserve(grads.size()); + for (auto &node : result.Nodes()) { + if (node->IsOp()) { + PADDLE_ENFORCE(node->IsWrappedBy()); + auto *all_reduce_op_handle = dynamic_cast( + &node->Wrapper()); + if (all_reduce_op_handle) { + auto inputs = details::DynamicCast( + all_reduce_op_handle->Inputs()); + PADDLE_ENFORCE_EQ(inputs.size(), num_place); + // The inputs' name should be the same. + auto &grad_name = inputs[0]->name(); + for (size_t i = 1; i < inputs.size(); ++i) { + PADDLE_ENFORCE_EQ(inputs[i]->name(), grad_name, + "The input name should be the same."); + } + PADDLE_ENFORCE_NE(grads.count(grad_name), static_cast(0)); + all_reduce_ops.emplace(grad_name, node); + } + } + } + return all_reduce_ops; + } + + void InsertFusedAllReduce(const std::vector &places, + const std::vector &local_scopes, + const size_t num_of_all_reduce, + const std::vector &all_reduce_ops, +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + const platform::NCCLCommunicator *multi_nccl_ctxs, +#endif + ir::Graph *result) const { + std::vector inputs; + std::vector outputs; + for (auto &op : all_reduce_ops) { + auto &op_handle = op->Wrapper(); + inputs.insert(inputs.end(), op_handle.Inputs().begin(), + op_handle.Inputs().end()); + // Remove output + for_each(op_handle.Inputs().begin(), op_handle.Inputs().end(), + [&op_handle](details::VarHandleBase *var_handle) { + var_handle->RemoveOutput(&op_handle, op_handle.Node()); + }); + + outputs.insert(outputs.end(), op_handle.Outputs().begin(), + op_handle.Outputs().end()); + // Remove Input + for_each(op_handle.Outputs().begin(), op_handle.Outputs().end(), + [](details::VarHandleBase *var_handle) { + var_handle->ClearGeneratedOp(); + }); + + result->RemoveNode(op_handle.Node()); + } + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places, + local_scopes, multi_nccl_ctxs, result); +#else + CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places, + local_scopes, result); +#endif + } + + private: + void CreateFusedAllReduceOp( + const std::vector &inputs, + const std::vector &outputs, + const size_t num_of_all_reduce, + const std::vector &places, + const std::vector &local_scopes, +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + const platform::NCCLCommunicator *multi_nccl_ctxs, +#endif + ir::Graph *result) const { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + auto *op_handle = new details::FusedAllReduceOpHandle( + result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation), + local_scopes, places, num_of_all_reduce, multi_nccl_ctxs); +#else + auto *op_handle = new details::FusedAllReduceOpHandle( + result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation), + local_scopes, places, num_of_all_reduce); +#endif + + for (auto in : inputs) { + op_handle->AddInput(in); + } + + for (auto out : outputs) { + op_handle->AddOutput(out); + } + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + if (!multi_nccl_ctxs) { + SetCommunicationContext(places, op_handle); + } +#else + SetCommunicationContext(places, op_handle); +#endif + } + + void SetCommunicationContext( + const std::vector &places, + details::FusedAllReduceOpHandle *op_handle) const { + for (size_t i = 0; i < places.size(); ++i) { + op_handle->SetDeviceContext( + places[i], platform::DeviceContextPool::Instance().Get(places[i])); + } + } +}; + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fuse_all_reduce_op_pass, + paddle::framework::ir::FuseAllReduceOpPass); diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/modify_op_lock_and_record_event_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/modify_op_lock_and_record_event_pass.cc new file mode 100644 index 00000000..e9b35aef --- /dev/null +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/modify_op_lock_and_record_event_pass.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h" + +namespace paddle { +namespace framework { +namespace ir { + +static bool IsLockAndRecordEventFreeComputationOpHandle( + details::ComputationOpHandle *op, const OpGraphView &graph_view) { + if (!platform::is_gpu_place(op->GetPlace())) return false; + for (auto &pending_op : graph_view.PendingOps(op)) { + auto *tmp = dynamic_cast(pending_op); + if (tmp == nullptr || !(tmp->GetPlace() == op->GetPlace())) { + return false; + } + } + return true; +} + +class ModifyOpLockAndRecordEventPass : public ir::Pass { + protected: + void ApplyImpl(ir::Graph *graph) const override { + auto all_ops = ir::FilterByNodeWrapper(*graph); + OpGraphView graph_view(all_ops); + for (auto &op : all_ops) { + auto *compute_op = dynamic_cast(op); + if (compute_op == nullptr) continue; + bool is_lock_and_record_event_free = + IsLockAndRecordEventFreeComputationOpHandle(compute_op, graph_view); + compute_op->SetLockAndRecordEventFree(is_lock_and_record_event_free); + if (is_lock_and_record_event_free) { + VLOG(10) << "Set is_lock_and_record_event_free be true in op " + << compute_op->DebugString(); + } + } + } +}; +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(modify_op_lock_and_record_event_pass, + paddle::framework::ir::ModifyOpLockAndRecordEventPass); diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_check_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_check_pass.cc new file mode 100644 index 00000000..8cc33a6c --- /dev/null +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_check_pass.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +class SSAGraghBuilderWithChecker : public ir::Pass { + protected: + void ApplyImpl(ir::Graph *graph) const override { + PADDLE_ENFORCE(IsValidGraph(graph)); + } + + bool IsValidGraph(const ir::Graph *graph) const { + std::unordered_map pending_ops; + std::unordered_set pending_vars; + std::unordered_set ready_vars; + std::unordered_set ready_ops; + + auto insert_pending_var = [&](details::VarHandleBase *var) { + pending_vars.insert(var); + if (var->GeneratedOp() == nullptr) { + ready_vars.emplace(var); + } + }; + + for (auto &var_map : graph->Get(details::kGraphVars)) { + for (auto &name_pair : var_map) { + for (auto &version_pair : name_pair.second) { + insert_pending_var(version_pair); + } + } + } + + for (auto &var : + graph->Get(details::kGraphDepVars)) { + insert_pending_var(var); + } + + for (auto *op : ir::FilterByNodeWrapper(*graph)) { + if (op->Inputs().empty()) { + ready_ops.insert(op); + } else { + pending_ops.insert({op, op->NoDupInputSize()}); + } + } + + auto run_all_ops = [&](std::unordered_set &set) { + for (auto *op : set) { + for (auto out : op->Outputs()) { + ready_vars.emplace(out); + } + } + set.clear(); + }; + + while (!pending_vars.empty()) { + run_all_ops(ready_ops); + + if (ready_vars.empty()) { + return false; + } + + for (auto ready_var : ready_vars) { + pending_vars.erase(ready_var); + for (auto *op : ready_var->PendingOps()) { + auto &deps = --pending_ops[op]; + if (deps == 0) { + ready_ops.insert(op); + } + } + } + ready_vars.clear(); + } + return true; + } +}; + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(multi_devices_check_pass, + paddle::framework::ir::SSAGraghBuilderWithChecker) + .RequireGraphAttr(paddle::framework::details::kGraphVars) + .RequireGraphAttr(paddle::framework::details::kGraphDepVars); diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc new file mode 100644 index 00000000..d6d9c8bb --- /dev/null +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc @@ -0,0 +1,1118 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/details/all_reduce_op_handle.h" +#include "paddle/fluid/framework/details/broadcast_op_handle.h" +#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/fetch_barrier_op_handle.h" +#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h" +#include "paddle/fluid/framework/details/reduce_op_handle.h" +#include "paddle/fluid/framework/details/rpc_op_handle.h" +#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/operators/math/math_function.h" + +#if defined(PADDLE_WITH_DGC) +#include "paddle/fluid/framework/details/sparse_all_reduce_op_handle.h" +#endif + +namespace paddle { +namespace framework { +namespace ir { + +namespace { +// TODO(panyx0718): Clean this up as well. +// all operators. NOTE that even we use a vector here, the operators is +// unordered. +typedef std::vector GraphOps; +const char kGraphOps[] = "ops"; + +bool OpHaveRole(const ir::Node &node, const framework::OpRole &role) { + return boost::get( + node.Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == + static_cast(role); +} + +void PolishGraphToSupportDataHazards(ir::Graph *graph) { + for (auto &var_map : graph->Get(details::kGraphVars)) { + for (auto &name_pair : var_map) { + if (name_pair.second.size() <= 1) { + continue; + } + auto it_new = name_pair.second.rbegin(); + auto it_old = name_pair.second.rbegin(); + ++it_old; + for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) { + details::OpHandleBase *write_op = (*it_new)->GeneratedOp(); + const auto &read_ops = (*it_old)->PendingOps(); + + for (auto *read_op : read_ops) { + // Manually add a dependency var from read_op to write_op; + if (read_op == write_op) { + // Read Write is the same op. + continue; + } + bool has_dep = false; + for (auto *r_out : read_op->Outputs()) { + for (auto *w_in : write_op->Inputs()) { + if (r_out->Node() == w_in->Node()) { + has_dep = true; + break; + } + } + } + if (has_dep) continue; + + auto *dep_var = + new details::DummyVarHandle(graph->CreateControlDepVar()); + read_op->AddOutput(dep_var); + write_op->AddInput(dep_var); + graph->Get(details::kGraphDepVars) + .emplace(dep_var); + } + } + } + } +} + +details::VarHandle *CreateOrGetLatestVarHandle(ir::Graph *graph, ir::Node *node, + const platform::Place &place, + size_t place_offset) { + auto &var_holders = + graph->Get(details::kGraphVars)[place_offset]; + auto &var_holder = var_holders[node->Name()]; + details::VarHandle *var = nullptr; + if (var_holder.empty()) { + if (node->Var()) { + var = new details::VarHandle(graph->CreateVarNode(node->Var()), 0, + place_offset, node->Name(), place); + } else { + var = new details::VarHandle( + graph->CreateEmptyNode(node->Name(), ir::Node::Type::kVariable), 0, + place_offset, node->Name(), place); + } + var_holder.emplace_back(var); + } else { + var = *var_holder.rbegin(); + } + return var; +} + +void CreateOpOutput(ir::Graph *graph, details::OpHandleBase *op_handle, + ir::Node *new_node, const platform::Place &place, + size_t place_offset) { + auto &vars = graph->Get( + details::kGraphVars)[place_offset][new_node->Name()]; + size_t version = vars.size(); + auto var = new details::VarHandle(new_node, version, place_offset, + new_node->Name(), place); + vars.emplace_back(var); + op_handle->AddOutput(var); +} + +void AddOutputToLeafOps(ir::Graph *graph) { + for (auto &op : graph->Get(kGraphOps)) { + if (!op->Outputs().empty()) { + continue; + } + auto *dummy_leaf = + new details::DummyVarHandle(graph->CreateControlDepVar()); + graph->Get(details::kGraphDepVars) + .emplace(dummy_leaf); + op->AddOutput(dummy_leaf); + } +} +} // namespace + +void MultiDevSSAGraphBuilderBase::CheckGraph(const ir::Graph &graph) const {} + +void MultiDevSSAGraphBuilderBase::Init() const { + all_vars_.clear(); + + loss_var_name_ = Get(kLossVarName); + VLOG(10) << "Init MultiDevSSAGraphBuilder, loss name: " << loss_var_name_; + places_ = Get>(details::kPlaces); + local_scopes_ = Get>(details::kLocalScopes); + strategy_ = Get(kStrategy); +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + multi_nccl_ctxs_ = &Get(details::kNCCLCtxs); + nccl_ctxs_ = nullptr; + if (multi_nccl_ctxs_) { + nccl_ctxs_ = multi_nccl_ctxs_->DefaultFlatCtx(); + } +#endif + PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); +} + +void MultiDevSSAGraphBuilderBase::ApplyImpl(ir::Graph *graph) const { + Init(); + CheckGraph(*graph); + std::vector sorted_ops = SortOperations(*graph); + + auto nodes = graph->ReleaseNodes(); + ir::Graph &result = *graph; + + for (auto &node : nodes) { + if (node->IsVar() && node->Var()) { + all_vars_.emplace(node->Name(), node->Var()); + } + } + + // We cannot invoke resize. It is a bug of GCC 4.8 + result.Set(details::kGraphVars, new details::GraphVars(places_.size())); + result.Set(details::kGraphDepVars, new details::GraphDepVars); + result.Set(kGraphOps, new GraphOps); + + bool is_forwarding = true; + + for (ir::Node *node : sorted_ops) { + if (DealWithSpecialOp(&result, node)) { + continue; + } else { + // This op runs on all devices + if (IsScaleLossOp(node)) { + // user can customize loss@grad if not use_default_grad_scale_ + InsertScaleLossGradOp(&result, node); + // This assumes the backward generating code will ensure IsScaleLossOp + // is true only for the op that scale the final scalar loss. + // It also assumes backward op will always follow the forward op in + // the block. + is_forwarding = false; + } else { + CreateComputationalOps(&result, node, places_.size()); + } + + // Insert collective ops if nranks > 1 + if (!is_forwarding && Get(kNRanks) > 1) { + try { + bool is_bk_op = + static_cast(boost::get(node->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) & + static_cast(OpRole::kBackward)); + // optimize op is already processed in DealWithSpecialOp, + // here we only consider backward op + if (!is_bk_op) continue; + + /* + * the op that will generate the gradient of on parameter will have + one attr op_role_var + * to record the parameter and gradient, like: + attrs { + name: "op_role_var" + type: STRINGS + strings: "fc_1.b_0" + strings: "fc_1.b_0@GRAD" + } + */ + + // Currently, we assume that once gradient is generated, it can be + // broadcast, and each gradient is only broadcast once. + auto backward_vars = + boost::get>(node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); + for (size_t i = 0; i < backward_vars.size(); i += 2) { + auto &p_name = backward_vars[i]; + auto &g_name = backward_vars[i + 1]; + VLOG(10) << "Bcast " << g_name << " for parameter " << p_name + << " op_type " << node->Op()->Type(); + if (NeedCollectiveForGrad(g_name, sorted_ops)) { + InsertCollectiveOp(&result, p_name, g_name); + } + } + } catch (boost::bad_get e) { + } + } + } + } + + InsertPostprocessOps(&result); + + /* + Dependency graph has been constructed. However, there are still data + hazards need to be handled. + */ + PolishGraphToSupportDataHazards(&result); + + /* + * Only variables should be the leaves of graph. + */ + AddOutputToLeafOps(&result); + + result.Erase(kGraphOps); +} + +void MultiDevSSAGraphBuilderBase::InsertScaleLossGradOp( + ir::Graph *result, const ir::Node *node) const { + // user can customize loss@grad if not use_default_grad_scale_ + size_t loss_scale = 0; + switch (this->strategy_.gradient_scale_) { + case details::BuildStrategy::GradientScaleStrategy::kOne: + loss_scale = 1; + break; + case details::BuildStrategy::GradientScaleStrategy::kCoeffNumDevice: + loss_scale = Get(kNRanks); + break; + case details::BuildStrategy::GradientScaleStrategy::kCustomized: + loss_scale = 0; + break; + default: + LOG(FATAL) << "Unknown gradient scale strategy."; + break; + } + + VLOG(3) << "loss_scale: " << loss_scale; + + if (loss_scale) { + // TODO(paddle-dev): Why is there no input for this op_handle? + auto loss_grad_name = node->Op()->OutputArgumentNames()[0]; + auto out_dtype = this->all_vars_.at(loss_grad_name)->GetDataType(); + this->CreateScaleLossGradOp(result, loss_grad_name, node->outputs[0], + loss_scale, out_dtype); + } +} + +bool MultiDevSSAGraphBuilderBase::DealWithSpecialOp(ir::Graph *result, + ir::Node *node) const { + return false; +} + +std::vector MultiDevSSAGraphBuilderBase::SortOperations( + const ir::Graph &graph) const { + return ir::TopologySortOperations(graph); +} + +bool MultiDevSSAGraphBuilderBase::UseGPU() const { + bool use_gpu = false; +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + use_gpu = nccl_ctxs_ != nullptr; +#endif + return use_gpu; +} + +bool MultiDevSSAGraphBuilderBase::NeedCollectiveForGrad( + const std::string &grad_name, std::vector ops) const { + // if we have allreduce_op for current gradient variable in the graph, + // then we don't need to add allreduce_op_handle for this gradient + // NOTE: This is for the case that all gradients should add collective ops + for (auto *node : ops) { + if (node->Op()->Type() != "allreduce") continue; + for (auto in_name : node->Op()->InputArgumentNames()) { + if (in_name == grad_name) { + return false; + } + } + } + return true; +} + +void MultiDevSSAGraphBuilderBase::CreateOpHandleIOs(ir::Graph *result, + ir::Node *node, + size_t place_id) const { + auto p = places_[place_id]; + auto *op_handle = result->Get(kGraphOps).back(); + op_handle->SetDeviceContext(p, + platform::DeviceContextPool::Instance().Get(p)); + + for (ir::Node *input : node->inputs) { + details::VarHandle *var = + CreateOrGetLatestVarHandle(result, input, p, place_id); + op_handle->AddInput(var); + } + + for (ir::Node *output : node->outputs) { + ir::Node *new_node = nullptr; + if (output->Var()) { + new_node = result->CreateVarNode(output->Var()); + } else { + new_node = + result->CreateEmptyNode(output->Name(), ir::Node::Type::kVariable); + } + CreateOpOutput(result, op_handle, new_node, p, place_id); + } +} + +void MultiDevSSAGraphBuilderBase::SetCommunicationContext( + details::OpHandleBase *op_handle, const platform::Place &p) const { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + if (nccl_ctxs_ == nullptr) { + op_handle->SetDeviceContext(p, + platform::DeviceContextPool::Instance().Get(p)); + } +#else + op_handle->SetDeviceContext(p, + platform::DeviceContextPool::Instance().Get(p)); +#endif +} + +void MultiDevSSAGraphBuilderBase::CreateBroadcastOp(ir::Graph *result, + const std::string &p_name, + size_t src_dev_id) const { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + auto *op_handle = new details::BroadcastOpHandle( + result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation), + local_scopes_, places_, nccl_ctxs_); +#else + auto *op_handle = new details::BroadcastOpHandle( + result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation), + local_scopes_, places_); +#endif + result->Get(kGraphOps).emplace_back(op_handle); + + auto *in = result->Get(details::kGraphVars) + .at(src_dev_id) + .at(p_name) + .back(); + op_handle->AddInput(in); + + for (size_t i = 0; i < places_.size(); ++i) { + auto &p = places_[i]; + SetCommunicationContext(op_handle, p); + auto &vars = + result->Get(details::kGraphVars).at(i).at(p_name); + auto *out_var = new details::VarHandle( + result->CreateEmptyNode(p_name, ir::Node::Type::kVariable), vars.size(), + i, p_name, p); + vars.emplace_back(out_var); + op_handle->AddOutput(out_var); + } +} + +void MultiDevSSAGraphBuilderBase::CreateFusedBroadcastOp( + ir::Graph *result, + const std::vector> &bcast_varnames) const { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + auto *op_handle = new details::FusedBroadcastOpHandle( + result->CreateEmptyNode("fused_broadcast", ir::Node::Type::kOperation), + local_scopes_, places_, nccl_ctxs_); +#else + auto *op_handle = new details::FusedBroadcastOpHandle( + result->CreateEmptyNode("fused_broadcast", ir::Node::Type::kOperation), + local_scopes_, places_); +#endif + result->Get(kGraphOps).emplace_back(op_handle); + + for (size_t i = 0; i < places_.size(); ++i) { + auto &p = places_[i]; + SetCommunicationContext(op_handle, p); + } + + for (size_t dev_id = 0; dev_id < bcast_varnames.size(); ++dev_id) { + for (auto &p_name : bcast_varnames[dev_id]) { + auto *in = result->Get(details::kGraphVars) + .at(dev_id) + .at(p_name) + .back(); + op_handle->AddInput(in); + for (size_t out_dev_id = 0; out_dev_id < places_.size(); ++out_dev_id) { + auto &p = places_[out_dev_id]; + auto &vars = result->Get(details::kGraphVars) + .at(out_dev_id) + .at(p_name); + auto *out_var = new details::VarHandle( + result->CreateEmptyNode(p_name, ir::Node::Type::kVariable), + vars.size(), out_dev_id, p_name, p); + vars.emplace_back(out_var); + op_handle->AddOutput(out_var); + } + } + } +} + +void MultiDevSSAGraphBuilderBase::CreateComputationalOp(ir::Graph *result, + ir::Node *node, + size_t dev_id) const { + result->Get(kGraphOps).emplace_back( + new details::ComputationOpHandle(result->CreateOpNode(node->Op()), + local_scopes_[dev_id], places_[dev_id], + dev_id)); + CreateOpHandleIOs(result, node, dev_id); +} + +void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result, + const std::string &og, + bool is_encoded) const { + details::OpHandleBase *op_handle = nullptr; + + auto append_allreduce_op = [&]( + const std::vector &scopes, + const std::vector &places) -> details::OpHandleBase * { +#if defined(PADDLE_WITH_DGC) + if (is_encoded) { + result->Get(kGraphOps).emplace_back( + new details::SparseAllReduceOpHandle( + result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), + scopes, places, multi_nccl_ctxs_, is_encoded, + static_cast(strategy_.trainers_endpoints_.size()) * + places_.size())); + } else { + result->Get(kGraphOps).emplace_back( + new details::AllReduceOpHandle( + result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), + scopes, places, multi_nccl_ctxs_)); + } +#elif defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + result->Get(kGraphOps).emplace_back( + new details::AllReduceOpHandle( + result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), + scopes, places, multi_nccl_ctxs_)); +#else + result->Get(kGraphOps).emplace_back( + new details::AllReduceOpHandle( + result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), + scopes, places)); +#endif + return result->Get(kGraphOps).back(); + }; + + if (!strategy_.enable_parallel_graph_) + op_handle = append_allreduce_op(local_scopes_, places_); + + for (size_t i = 0; i < places_.size(); ++i) { + if (strategy_.enable_parallel_graph_) { + op_handle = append_allreduce_op({local_scopes_[i]}, {places_[i]}); + } + + SetCommunicationContext(op_handle, places_[i]); + auto &vars = result->Get(details::kGraphVars)[i][og]; + PADDLE_ENFORCE(!vars.empty()); + auto &prev_grad = vars.back(); + op_handle->AddInput(prev_grad); + VLOG(10) << "all_reduce_op_handle add input " << prev_grad->DebugString(); + + auto var = new details::VarHandle( + result->CreateEmptyNode(og, ir::Node::Type::kVariable), vars.size(), i, + og, places_[i]); + vars.emplace_back(var); + op_handle->AddOutput(var); + VLOG(10) << "all_reduce_op_handle add output " << og + << ", handle:" << var->DebugString(); + } +} + +void MultiDevSSAGraphBuilderBase::CreateScaleLossGradOp( + ir::Graph *result, const std::string &loss_grad_name, + ir::Node *out_var_node, size_t loss_scale, + proto::VarType::Type dtype) const { + for (size_t i = 0; i < places_.size(); ++i) { + auto *dev_ctx = platform::DeviceContextPool::Instance().Get(places_[i]); + auto *op_handle = new details::ScaleLossGradOpHandle( + result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation), + loss_scale, local_scopes_[i], places_[i], dev_ctx, dtype); + result->Get(kGraphOps).emplace_back(op_handle); + + // FIXME: Currently ScaleLossGradOp only use device_count as scale + // factor. So it does not depend on any other operators. + // VarHandle *loss = GetVarHandle(loss_var_name, place); + // loss->pending_ops_.emplace_back(op_handle); + // op_handle->inputs_.emplace_back(loss); + + CreateOpOutput(result, op_handle, + result->CreateVarNode(out_var_node->Var()), places_[i], i); + } +} + +void MultiDevSSAGraphBuilderBase::CreateComputationalOps( + ir::Graph *result, ir::Node *node, size_t num_places) const { + for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) { + auto p = places_[scope_idx]; + auto s = local_scopes_[scope_idx]; + result->Get(kGraphOps).emplace_back( + new details::ComputationOpHandle(result->CreateOpNode(node->Op()), s, p, + scope_idx)); + CreateOpHandleIOs(result, node, scope_idx); + } +} + +details::VarHandle *MultiDevSSAGraphBuilderBase::CreateReduceOp( + ir::Graph *result, const std::string &og, size_t dst_dev_id) const { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + result->Get(kGraphOps).emplace_back(new details::ReduceOpHandle( + result->CreateEmptyNode("reduce", ir::Node::Type::kOperation), + local_scopes_, places_, nccl_ctxs_)); +#else + result->Get(kGraphOps).emplace_back(new details::ReduceOpHandle( + result->CreateEmptyNode("reduce", ir::Node::Type::kOperation), + local_scopes_, places_)); +#endif + auto *op_handle = result->Get(kGraphOps).back(); + + for (size_t i = 0; i < places_.size(); ++i) { + auto &p = places_[i]; + SetCommunicationContext(op_handle, p); + auto &vars = result->Get(details::kGraphVars)[i][og]; + PADDLE_ENFORCE(!vars.empty()); + auto &prev_grad = vars.back(); + op_handle->AddInput(prev_grad); + } + auto &vars = + result->Get(details::kGraphVars)[dst_dev_id][og]; + auto var = new details::VarHandle( + result->CreateEmptyNode(og, ir::Node::Type::kVariable), vars.size(), + dst_dev_id, og, places_[dst_dev_id]); + vars.emplace_back(var); + op_handle->AddOutput(var); + return var; +} + +bool MultiDevSSAGraphBuilderBase::IsScaleLossOp(ir::Node *node) const { + return !loss_var_name_.empty() && node->Op() && + boost::get( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == + (static_cast(OpRole::kBackward) | + static_cast(OpRole::kLoss)); +} + +bool MultiDevSSAGraphBuilderBase::IsSparseGradient( + const std::string &og) const { + PADDLE_ENFORCE(all_vars_.count(og) != 0); + return all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS; +} + +void AllReduceSSAGraphBuilder::InsertCollectiveOp( + ir::Graph *result, const std::string &p_name, + const std::string &g_name) const { + if (IsSparseGradient(g_name)) { + CreateReduceOp(result, g_name, 0); + CreateBroadcastOp(result, g_name, 0); + } else { +#if defined(PADDLE_WITH_DGC) + CreateAllReduceOp(result, g_name, IsEncoded(p_name)); +#else + CreateAllReduceOp(result, g_name); +#endif + } +} + +int BalanceVarSSAGraphBuilder::GetVarDeviceID( + const std::string &varname) const { + auto got = sharded_var_device_.find(varname); + if (got == sharded_var_device_.end()) { + auto pos = varname.find(framework::kNewGradSuffix); + if (pos != std::string::npos) { + got = sharded_var_device_.find(varname.substr(0, pos)); + } + } + return got == sharded_var_device_.end() ? -1 : got->second; +} + +int BalanceVarSSAGraphBuilder::GetOpDeviceID(ir::Node *node) const { + if (strategy_.reduce_ != details::BuildStrategy::ReduceStrategy::kReduce) { + return -1; + } + if (!OpHaveRole(*node, framework::OpRole::kOptimize)) { + return -1; + } + auto param_grad = boost::get>( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); + + PADDLE_ENFORCE_EQ(param_grad.size(), 2U); + int dev_id = GetVarDeviceID(param_grad[1]); + PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s, %s]", + node->Op()->Type(), param_grad[0], param_grad[1]); + return dev_id; +} + +size_t BalanceVarSSAGraphBuilder::GetAppropriateDeviceID( + const std::vector &var_names) const { + int64_t numel_sum = 0; + for (auto var_name : var_names) { + if (all_vars_.find(var_name) == all_vars_.end()) continue; + auto var_desc = all_vars_.at(var_name); + PADDLE_ENFORCE_NOT_NULL(var_desc); + auto dim = framework::make_ddim(var_desc->GetShape()); + int64_t numel = framework::product(dim); + PADDLE_ENFORCE_GT(numel, 0); + numel_sum += numel; + } + + auto smallest = + std::min_element(std::begin(balance_vars_), std::end(balance_vars_)); + size_t dev_id = + static_cast(std::distance(std::begin(balance_vars_), smallest)); + balance_vars_[dev_id] += numel_sum; + return dev_id; +} + +void BalanceVarSSAGraphBuilder::ResetState() const { + balance_vars_.clear(); + sharded_var_device_.clear(); + + balance_vars_.resize(places_.size(), 0); +} + +void ReduceSSAGraphBuilder::Init() const { + MultiDevSSAGraphBuilderBase::Init(); + ResetState(); +} + +void ReduceSSAGraphBuilder::ResetState() const { + BalanceVarSSAGraphBuilder::ResetState(); + bcast_var_name_set_.clear(); + bcast_var_name_set_.resize(places_.size()); +} + +void ReduceSSAGraphBuilder::InsertCollectiveOp( + ir::Graph *result, const std::string &p_name, + const std::string &g_name) const { + size_t cur_device_id = GetAppropriateDeviceID({g_name}); + CreateReduceOp(result, g_name, cur_device_id); + sharded_var_device_.emplace(g_name, cur_device_id); + bcast_var_name_set_[cur_device_id].emplace(p_name); +} + +bool ReduceSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result, + ir::Node *node) const { + int op_dev_id = BalanceVarSSAGraphBuilder::GetOpDeviceID(node); + if (op_dev_id != -1) { + // This op only runs on one specific device. + CreateComputationalOp(result, node, op_dev_id); + for (ir::Node *n : node->outputs) { + sharded_var_device_.emplace(n->Name(), op_dev_id); + } + return true; + } + return false; +} + +void ReduceSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const { + if (UseGPU()) { + if (strategy_.fuse_broadcast_ops_) { + CreateFusedBroadcastOp(result, bcast_var_name_set_); + } else { + for (size_t dev_id = 0; dev_id < bcast_var_name_set_.size(); ++dev_id) { + auto &to_bcast_set = bcast_var_name_set_[dev_id]; + for (auto &bcast_name : to_bcast_set) { + CreateBroadcastOp(result, bcast_name, dev_id); + } + } + } + } +} + +int ReduceSSAGraphBuilder::GetOpDeviceID( + ir::Node *node, + std::unordered_map> *delay_ops) const { + if (!OpHaveRole(*node, framework::OpRole::kOptimize)) { + return -1; + } + + auto param_grad = boost::get>( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); + + PADDLE_ENFORCE_EQ(param_grad.size(), 2U); + int dev_id = GetVarDeviceID(param_grad[1]); + + if (dev_id == -1) { + (*delay_ops)[param_grad[1]].push_back(node); + return -2; + } + return dev_id; +} + +std::vector ReduceSSAGraphBuilder::SortOperations( + const ir::Graph &graph) const { + std::vector sorted_ops = ir::TopologySortOperations(graph); + return SortForReduceMode(sorted_ops); +} + +std::vector ReduceSSAGraphBuilder::SortForReduceMode( + const std::vector &topo_ops) const { + std::vector sorted_ops; + std::unordered_map> delayed_op; + sorted_ops.reserve(topo_ops.size()); + ResetState(); + + auto insert_delayed_op = [&](const std::string &var_name, int dev_id) { + sharded_var_device_.emplace(var_name, dev_id); + if (delayed_op.count(var_name)) { + auto &ops = delayed_op.at(var_name); + sorted_ops.insert(sorted_ops.end(), ops.begin(), ops.end()); + delayed_op.at(var_name).clear(); + } + }; + + for (ir::Node *node : topo_ops) { + int op_dev_id = GetOpDeviceID(node, &delayed_op); + if (op_dev_id > -1) { + // This op only runs on one specific device. + sorted_ops.emplace_back(node); + for (ir::Node *n : node->outputs) { + insert_delayed_op(n->Name(), op_dev_id); + } + } else if (op_dev_id == -1) { + // This op runs on all devices, and its output may have parameter's + // gradients. + sorted_ops.emplace_back(node); + bool is_bk_op = + static_cast(boost::get(node->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) & + static_cast(OpRole::kBackward)); + if (!is_bk_op) continue; + // Currently, we assume that once gradient is generated, it can be + // broadcast, and each gradient is only broadcast once. + std::vector backward_vars; + try { + backward_vars = + boost::get>(node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + } catch (boost::bad_get e) { + } + PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); + + for (size_t i = 0; i < backward_vars.size(); i += 2) { + auto &g_name = backward_vars[i + 1]; + size_t cur_device_id = GetAppropriateDeviceID({g_name}); + insert_delayed_op(g_name, static_cast(cur_device_id)); + } + } else if (op_dev_id == -2) { + // The Op on which the Op depends has not yet been generated. + } + } + + PADDLE_ENFORCE_EQ(sorted_ops.size(), topo_ops.size()); + + ResetState(); + return sorted_ops; +} + +void DistSSAGraphBuilder::Init() const { + MultiDevSSAGraphBuilderBase::Init(); + ResetState(); +} + +void DistSSAGraphBuilder::ResetState() const { + BalanceVarSSAGraphBuilder::ResetState(); + bcast_var_name_set_.clear(); + bcast_var_name_set_.resize(places_.size()); +} + +bool DistSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result, + ir::Node *node) const { + bool insert_op = false; + if (OpHaveRole(*node, OpRole::kRPC)) { + int op_dev_id = CreateRPCOp(result, node); + PADDLE_ENFORCE(op_dev_id != -1, + "Can not schedule the RPC operator to the right place."); + if (node->Op()->Type() == "recv") { + auto recv_vars_attr = + boost::get>(node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + PADDLE_ENFORCE(recv_vars_attr.size() == 2UL); // [parameter, gradient] + if (recv_vars_attr[0].find(".block") == std::string::npos) { + bcast_var_name_set_[op_dev_id].emplace(recv_vars_attr[0]); + } + } + insert_op = true; + need_broadcast_var_ = true; + } else if (OpHaveRole(*node, OpRole::kDist)) { + int op_dev_id = CreateDistTrainOp(result, node); + if (node->Op()->Type() == "concat") { + // the input(block of parameter) of concat is on different device, + // the output(parameter) will on one device. + auto origin_param_name = node->Op()->OutputArgumentNames()[0]; + bcast_var_name_set_[op_dev_id].emplace(origin_param_name); + } + insert_op = true; + } else { + int op_dev_id = GetOpDeviceID(node); + if (op_dev_id != -1) { // This op only runs on one specific device. + // optimize op will be processed here. + CreateComputationalOp(result, node, op_dev_id); + for (ir::Node *n : node->outputs) { + sharded_var_device_.emplace(n->Name(), op_dev_id); + } + insert_op = true; + } + } + return insert_op; +} + +void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) { + auto *op_handle = result->Get(kGraphOps).back(); + for (ir::Node *input : node->inputs) { + details::VarHandle *var = nullptr; + for (int place_offset = 0; place_offset < num_places; ++place_offset) { + auto &var_holders = + result->Get(details::kGraphVars)[place_offset]; + auto &var_holder = var_holders[input->Name()]; + if (!var_holder.empty()) { + var = *var_holder.rbegin(); + op_handle->AddInput(var); + } + } + } +} + +// Create RPC related op handles that connects its in ops and out ops. +int DistSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ir::Node *node) const { + int op_dev_id = -1; + if (node->Op()->Type() == "send") { + // TODO(paddle-dev): getting the first var is not safe. + op_dev_id = GetVarDeviceID(node->inputs[0]->Name()); + PADDLE_ENFORCE(!ir::IsControlDepVar(*node->inputs[0]), + "This hack no longer holds, please fix."); + // the variable name which contains .block means it was splited by + // split_byref op + if (strategy_.reduce_ == + details::BuildStrategy::ReduceStrategy::kAllReduce && + node->inputs[0]->Name().find(".block") == std::string::npos) { + std::vector input_var_names; + for (ir::Node *n : node->inputs) { + input_var_names.push_back(n->Name()); + } + auto send_param_grad = boost::get>( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); + PADDLE_ENFORCE_EQ(send_param_grad.size(), 2U); + op_dev_id = GetAppropriateDeviceID({send_param_grad[1]}); + VLOG(10) << "send grad " << input_var_names[0] << " origin " + << send_param_grad[1] << " place: " << op_dev_id; + for (auto &varname : input_var_names) { + sharded_var_device_.emplace(varname, op_dev_id); + } + sharded_var_device_.emplace(send_param_grad[1], op_dev_id); + } + } else if (node->Op()->Type() == "recv") { + std::vector output_var_names; + for (ir::Node *n : node->outputs) { + output_var_names.push_back(n->Name()); + } + auto recv_param_grad = boost::get>( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); + if (recv_param_grad.size() == 2U) { + op_dev_id = GetVarDeviceID(recv_param_grad[1]); + VLOG(10) << "recv param " << recv_param_grad[0] + << " get grad place: " << recv_param_grad[1] + << " place: " << op_dev_id; + } else { + op_dev_id = GetAppropriateDeviceID(output_var_names); + } + for (auto &varname : output_var_names) { + sharded_var_device_.emplace(varname, op_dev_id); + } + } else { + // send_barrier, fetch_barrier will run on place 0; + op_dev_id = 0; + } + + PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s", + node->Op()->Type()); + + // Create fetch_barrier op handle to enable output on all devices. + // **NOTE** fetch_barrier should output variables list same as recv op does. + if (node->Op()->Type() == "fetch_barrier") { + result->Get(kGraphOps).emplace_back( + new details::FetchBarrierOpHandle(result->CreateOpNode(node->Op()), + local_scopes_, places_)); + } else { + result->Get(kGraphOps).emplace_back(new details::RPCOpHandle( + result->CreateOpNode(node->Op()), *node->Op(), local_scopes_[op_dev_id], + node->Op()->Type(), places_[op_dev_id])); + } + + if (node->Op()->Type() == "send") { + CreateOpHandleIOs(result, node, op_dev_id); + } else { + // send_barrier, recv, fetch_barrier's inputs are deps var, get them from + // all places + auto p = places_[op_dev_id]; + auto *op_handle = result->Get(kGraphOps).back(); + op_handle->SetDeviceContext(p, + platform::DeviceContextPool::Instance().Get(p)); + + SetOpInputsAllPlaces(result, node, places_.size()); + for (ir::Node *output : node->outputs) { + int outvar_dev_id = op_dev_id; + if (node->Op()->Type() == "fetch_barrier") { + outvar_dev_id = GetVarDeviceID(output->Name()); + PADDLE_ENFORCE_NE(outvar_dev_id, -1, "output name %s", output->Name()); + } + p = places_[outvar_dev_id]; + ir::Node *new_node = nullptr; + if (output->Var()) { + new_node = result->CreateVarNode(output->Var()); + } else { + new_node = + result->CreateEmptyNode(output->Name(), ir::Node::Type::kVariable); + } + CreateOpOutput(result, op_handle, new_node, p, outvar_dev_id); + } + } + return op_dev_id; +} + +int DistSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, + ir::Node *node) const { + int op_dev_id = -1; + std::vector input_var_names; + std::vector output_var_names; + for (ir::Node *input : node->inputs) { + input_var_names.push_back(input->Name()); + } + for (ir::Node *output : node->outputs) { + output_var_names.push_back(output->Name()); + } + + if (node->Op()->Type() == "split_byref" || + node->Op()->Type() == "split_selected_rows" || + node->Op()->Type() == "split_ids") { + // TODO(paddle-dev): getting the first var is not safe. + op_dev_id = GetVarDeviceID(input_var_names[0]); + if (strategy_.reduce_ == + details::BuildStrategy::ReduceStrategy::kAllReduce) { + op_dev_id = GetAppropriateDeviceID(input_var_names); + for (auto &varname : input_var_names) { + sharded_var_device_.emplace(varname, op_dev_id); + } + } + for (auto &varname : output_var_names) { + sharded_var_device_.emplace(varname, op_dev_id); + } + } else if (node->Op()->Type() == "concat") { + op_dev_id = GetVarDeviceID(input_var_names[0]); + for (auto &varname : output_var_names) { + sharded_var_device_.emplace(varname, op_dev_id); + } + } else { + LOG(ERROR) << "got unexpected dist op: " << node->Op()->Type(); + PADDLE_THROW( + "the distribute training related op should be in [split_byref, " + "concat]."); + } + + PADDLE_ENFORCE(op_dev_id != -1, + "can not find right place for distributed op: %s", + node->Op()->Type()); + + CreateComputationalOp(result, node, op_dev_id); + return op_dev_id; +} + +#if defined(PADDLE_WITH_DGC) +bool AllReduceSSAGraphBuilder::IsEncoded(const std::string &p_name) const { + auto u_name = p_name + details::g_dgc_u; + auto it = all_vars_.find(u_name); + if (it == all_vars_.end()) { + VLOG(10) << "can't find u_name, so it's not encoded:" << u_name; + return false; + } + + return true; +} +#else +bool AllReduceSSAGraphBuilder::IsEncoded(const std::string &p_name) const { + return false; +} +#endif + +void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result, + const std::string &p_name, + const std::string &g_name) const { + // collective gradient to each device + size_t cur_device_id = 0; + switch (strategy_.reduce_) { + case details::BuildStrategy::ReduceStrategy::kReduce: + cur_device_id = GetAppropriateDeviceID({g_name}); + CreateReduceOp(result, g_name, cur_device_id); + sharded_var_device_.emplace(g_name, cur_device_id); + break; + case details::BuildStrategy::ReduceStrategy::kAllReduce: + if (IsSparseGradient(g_name)) { + CreateReduceOp(result, g_name, 0); + CreateBroadcastOp(result, g_name, 0); + } else { + CreateAllReduceOp(result, g_name); + } + break; + default: + LOG(FATAL) << "Unknown reduce strategy."; + break; + } +} + +void DistSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const { + // broad cast received parameters when training in parameter server mode. + if (need_broadcast_var_) { + // There are 4 conditions: + // 1. GPU && Reduce: Reduce gradient then broadcast gradient to other GPUS. + // Need to broadcast received parameters to other GPU. + // 2. GPU && AllReduce: AllReduce all graident to each GPU. Need to + // broadcast received parameters to other GPU. + // 3. CPU && AllReduce: AllReduce all gradient to each thread. Need to + // broadcast received parameters to other scope. + // 4. CPU && Reduce: because all parameters share the same memory, did not + // broadcast received parameters. + if (!UseGPU() && + strategy_.reduce_ == details::BuildStrategy::ReduceStrategy::kReduce) { + return; + } + if (strategy_.fuse_broadcast_ops_) { + CreateFusedBroadcastOp(result, bcast_var_name_set_); + } else { + for (size_t dev_id = 0; dev_id < bcast_var_name_set_.size(); ++dev_id) { + auto &to_bcast_set = bcast_var_name_set_[dev_id]; + for (auto &bcast_name : to_bcast_set) { + CreateBroadcastOp(result, bcast_name, dev_id); + } + } + } + } +} + +std::unordered_set &MultiDevSSAGraphBuilder() { + static std::unordered_set regs; + return regs; +} + +static int MultiDevSSAGraphBuilderRegister(const std::string &builder_mode) { + MultiDevSSAGraphBuilder().insert(builder_mode); + return 0; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +#define REGISTER_MULTI_DEVICES_PASS(pass_name, pass_class) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + _reg_ssa_graph_builder_##pass_name, \ + "REGISTER_MULTI_DEVICES_PASS must be called in global namespace."); \ + int _reg_ssa_graph_builder_entry_##pass_name = \ + paddle::framework::ir::MultiDevSSAGraphBuilderRegister(#pass_name); \ + REGISTER_PASS(pass_name, pass_class) \ + .RequirePassAttr(paddle::framework::ir::kLossVarName) \ + .RequirePassAttr(paddle::framework::details::kPlaces) \ + .RequirePassAttr(paddle::framework::details::kLocalScopes) \ + .RequirePassAttr(paddle::framework::ir::kStrategy) \ + .RequirePassAttr(paddle::framework::ir::kNRanks) + +REGISTER_MULTI_DEVICES_PASS(reduce_mode_multi_devices_pass, + paddle::framework::ir::ReduceSSAGraphBuilder); +REGISTER_MULTI_DEVICES_PASS(all_reduce_mode_multi_devices_pass, + paddle::framework::ir::AllReduceSSAGraphBuilder); +REGISTER_MULTI_DEVICES_PASS(dist_multi_devices_pass, + paddle::framework::ir::DistSSAGraphBuilder); +REGISTER_MULTI_DEVICES_PASS(async_multi_devices_pass, + paddle::framework::ir::AsyncSSAGraphBuilder); diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h new file mode 100644 index 00000000..9b36d231 --- /dev/null +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h @@ -0,0 +1,215 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/details/build_strategy.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace platform { +class NCCLContextMap; +} + +namespace framework { +class Scope; +namespace ir { + +constexpr char kLossVarName[] = "loss_var_name"; +constexpr char kStrategy[] = "strategy"; +constexpr char kNRanks[] = "nranks"; + +class MultiDevSSAGraphBuilderBase : public ir::Pass { + protected: + void ApplyImpl(ir::Graph *graph) const override; + + virtual void Init() const; + + virtual void CheckGraph(const ir::Graph &graph) const; + + virtual std::vector SortOperations(const ir::Graph &graph) const; + + virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, + const std::string &g_name) const = 0; + + virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const; + + virtual void InsertPostprocessOps(ir::Graph *result) const = 0; + + bool UseGPU() const; + + virtual bool NeedCollectiveForGrad(const std::string &grad_name, + std::vector ops) const; + + bool IsScaleLossOp(ir::Node *node) const; + + void CreateComputationalOps(ir::Graph *result, ir::Node *node, + size_t num_places) const; + + void CreateScaleLossGradOp(ir::Graph *result, + const std::string &loss_grad_name, + ir::Node *out_var_node, size_t loss_scale, + proto::VarType::Type dtype) const; + + details::VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og, + size_t dst_dev_id) const; + + void CreateComputationalOp(ir::Graph *result, ir::Node *node, + size_t dev_id) const; + + bool IsSparseGradient(const std::string &og) const; + + void CreateAllReduceOp(ir::Graph *result, const std::string &og, + bool is_encoded = false) const; + + void CreateBroadcastOp(ir::Graph *result, const std::string &p_name, + size_t src_dev_id) const; + + void InsertScaleLossGradOp(ir::Graph *result, const ir::Node *node) const; + + void CreateFusedBroadcastOp( + ir::Graph *result, + const std::vector> &bcast_varnames) const; + + void SetCommunicationContext(details::OpHandleBase *op_handle, + const platform::Place &p) const; + + void CreateOpHandleIOs(ir::Graph *result, ir::Node *node, + size_t device_id) const; + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + mutable platform::NCCLContextMap *nccl_ctxs_{nullptr}; + mutable platform::NCCLCommunicator *multi_nccl_ctxs_{nullptr}; +#endif + + mutable std::string loss_var_name_; + mutable std::vector places_; + mutable std::vector local_scopes_; + + mutable details::BuildStrategy strategy_; + mutable std::unordered_map all_vars_; +}; + +class AllReduceSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { + protected: + virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, + const std::string &g_name) const; + + virtual void InsertPostprocessOps(ir::Graph *result) const {} + + bool IsEncoded(const std::string &p_name) const; +}; + +class AsyncSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { + protected: + void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, + const std::string &g_name) const override {} + + bool NeedCollectiveForGrad(const std::string &grad_name, + std::vector ops) const { + return false; + } + + bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const override { + if (node->Op()->Type() == "recv") { + VLOG(1) << "set recv op do_not_run to true"; + node->Op()->SetAttr("do_not_run", 1); + node->Op()->Flush(); + } else if (node->Name() == "lookup_table" || node->Name() == "nce" || + node->Name() == "hierarchical_sigmoid") { + // in async_mode, we do not need remote prefetch, because communicator + // will do async parameter recv. + VLOG(1) << "set " << node->Name() << " op remote_prefetch to false"; + node->Op()->SetAttr("remote_prefetch", false); + node->Op()->Flush(); + } + return false; + } + + void InsertPostprocessOps(ir::Graph *result) const override {} +}; + +class BalanceVarSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { + protected: + int GetVarDeviceID(const std::string &varname) const; + + int GetOpDeviceID(ir::Node *node) const; + + size_t GetAppropriateDeviceID( + const std::vector &var_names) const; + + virtual void ResetState() const; + + mutable std::unordered_map sharded_var_device_; + mutable std::vector balance_vars_; +}; + +class ReduceSSAGraphBuilder : public BalanceVarSSAGraphBuilder { + protected: + virtual void Init() const; + + virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, + const std::string &g_name) const; + + virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const; + + virtual void InsertPostprocessOps(ir::Graph *result) const; + + virtual std::vector SortOperations(const ir::Graph &graph) const; + + virtual void ResetState() const; + + int GetOpDeviceID(ir::Node *node, + std::unordered_map> + *delay_ops) const; + + std::vector SortForReduceMode( + const std::vector &topo_ops) const; + + mutable std::vector> bcast_var_name_set_; +}; + +class DistSSAGraphBuilder : public BalanceVarSSAGraphBuilder { + protected: + virtual void Init() const; + + virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const; + + virtual void InsertPostprocessOps(ir::Graph *result) const; + + virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, + const std::string &g_name) const; + + virtual void ResetState() const; + + int CreateRPCOp(ir::Graph *result, ir::Node *node) const; + + int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const; + + mutable std::vector> bcast_var_name_set_; + mutable bool need_broadcast_var_{false}; +}; + +std::unordered_set &MultiDevSSAGraphBuilder(); + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.cc new file mode 100644 index 00000000..a6c2b282 --- /dev/null +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.cc @@ -0,0 +1,101 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.h" +#include +#include +#include +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +class SSAGraghBuilderWithPrinterPass : public ir::Pass { + protected: + void ApplyImpl(ir::Graph *graph) const override { + std::unique_ptr fout( + new std::ofstream(Get(kGraphvizPath))); + PADDLE_ENFORCE(fout->good()); + Get("graph_printer").Print(*graph, *fout); + } +}; + +template +static inline void IterAllVar(const ir::Graph &graph, Callback callback) { + for (auto &each : graph.Get(details::kGraphVars)) { + for (auto &pair1 : each) { + for (auto &pair2 : pair1.second) { + callback(*pair2); + } + } + } + + for (auto &var : graph.Get(details::kGraphDepVars)) { + callback(*var); + } +} + +void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph, + std::ostream &sout) const { + size_t var_id = 0; + std::unordered_map vars; + + sout << "digraph G {\n"; + + IterAllVar(graph, [&](const details::VarHandleBase &var) { + auto *var_ptr = &var; + auto *var_handle_ptr = dynamic_cast(var_ptr); + auto *dummy_ptr = dynamic_cast(var_ptr); + + size_t cur_var_id = var_id++; + vars[var_ptr] = cur_var_id; + + if (var_handle_ptr) { + sout << "var_" << cur_var_id << " [label=\"" << var_handle_ptr->name() + << "\\n" + << var_handle_ptr->place() << "\\n" + << "scope: " << var_handle_ptr->scope_idx() << "\\n" + << "v" << var_handle_ptr->version() << "\"]" << std::endl; + } else if (dummy_ptr) { + sout << "var_" << cur_var_id << " [label=\"dummy\"]" << std::endl; + } + }); + + size_t op_id = 0; + for (auto &op : ir::FilterByNodeWrapper(graph)) { + std::string op_name = "op_" + std::to_string(op_id++); + sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]" + << std::endl; + for (auto in : op->Inputs()) { + std::string var_name = "var_" + std::to_string(vars[in]); + sout << var_name << " -> " << op_name << std::endl; + } + + for (auto out : op->Outputs()) { + std::string var_name = "var_" + std::to_string(vars[out]); + sout << op_name << " -> " << var_name << std::endl; + } + } + + sout << "}\n"; +} +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(multi_devices_print_pass, + paddle::framework::ir::SSAGraghBuilderWithPrinterPass) + .RequirePassAttr(paddle::framework::ir::kGraphvizPath); diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.h b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.h new file mode 100644 index 00000000..8562856e --- /dev/null +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.h @@ -0,0 +1,44 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/details/multi_devices_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +constexpr char kGraphvizPath[] = "debug_graphviz_path"; + +class SSAGraphPrinter { + public: + virtual ~SSAGraphPrinter() {} + virtual void Print(const ir::Graph& graph, std::ostream& sout) const = 0; +}; + +class GraphvizSSAGraphPrinter : public SSAGraphPrinter { + public: + void Print(const ir::Graph& graph, std::ostream& sout) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc new file mode 100644 index 00000000..3a5333d0 --- /dev/null +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc @@ -0,0 +1,113 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/op_proto_maker.h" + +namespace paddle { +namespace framework { +namespace ir { + +static bool IsSameOpDesc(OpDesc *op1, OpDesc *op2) { + return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() && + op1->Outputs() == op2->Outputs(); +} + +class SequentialExecutionPass : public ir::Pass { + protected: + void ApplyImpl(ir::Graph *graph) const override { + // FIXME(zjl): Insert dependencies between some distributed ops may cause + // the multi_devices_graph_pass fails. So we skip these ops here. + // Indeed, maybe we should not insert dependencies between these ops + // casually, which may cause deadlock easily. + // We should add more skipped distributed ops when found errors in + // multi_devices_graph_pass + static std::unordered_set skip_dist_ops{ + "send", "recv", "send_barrier", "fetch_barrier"}; + + auto &ops = + graph->Get>(details::kStaleProgramOpDescs); + std::vector op_node_list; + op_node_list.reserve(ops.size()); + + std::unordered_map op_deps; + std::unordered_map> pending_ops; + std::unordered_set ready_ops; + + for (ir::Node *node : graph->Nodes()) { + if (!node->IsOp()) continue; + std::unordered_set preceding_ops; + for (auto *in : node->inputs) { + PADDLE_ENFORCE(in->IsVar(), + "Preceding Node of Op Nodes must be Var Node"); + if (in->inputs.empty()) continue; + PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp(), + "Preceding Op Node of Var Node must be unique"); + preceding_ops.insert(in->inputs[0]); + pending_ops[in->inputs[0]].insert(node); + } + op_deps[node] = preceding_ops.size(); + if (preceding_ops.empty()) { + ready_ops.insert(node); + } + } + + for (auto *op_desc : ops) { + ir::Node *found_node = nullptr; + for (auto *node : ready_ops) { + if (IsSameOpDesc(op_desc, node->Op())) { + PADDLE_ENFORCE(found_node == nullptr, + "Found multiple op_desc in graph: %s", + op_desc->Type()); + found_node = node; + } + } + + PADDLE_ENFORCE_NOT_NULL(found_node, "Cannot find op_desc in graph: %s", + op_desc->Type()); + for (auto *pending_op : pending_ops[found_node]) { + if (--op_deps.at(pending_op) == 0) { + ready_ops.insert(pending_op); + } + } + ready_ops.erase(found_node); + if (skip_dist_ops.count(op_desc->Type()) == 0) { + op_node_list.push_back(found_node); + } + } + + for (size_t i = 1; i < op_node_list.size(); ++i) { + auto *dep_var = graph->CreateControlDepVar(); + op_node_list[i]->inputs.push_back(dep_var); + op_node_list[i - 1]->outputs.push_back(dep_var); + dep_var->outputs.push_back(op_node_list[i]); + dep_var->inputs.push_back(op_node_list[i - 1]); + VLOG(10) << "Add dependencies between " << op_node_list[i - 1]->Name() + << " and " << op_node_list[i]->Name(); + } + } +}; +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(sequential_execution_pass, + paddle::framework::ir::SequentialExecutionPass) + .RequireGraphAttr(paddle::framework::details::kStaleProgramOpDescs); diff --git a/paddle/fluid/framework/ir/ngraph_subgraph_pass.cc b/paddle/fluid/framework/ir/ngraph_subgraph_pass.cc new file mode 100644 index 00000000..4664d13d --- /dev/null +++ b/paddle/fluid/framework/ir/ngraph_subgraph_pass.cc @@ -0,0 +1,158 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/ngraph_subgraph_pass.h" +#include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" +#include "paddle/fluid/operators/ngraph/ngraph_bridge.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace framework { +namespace ir { + +namespace ANAT = paddle::inference::analysis; + +std::string GenerateEngineKey(const std::set &engine_inputs, + const std::set &engine_outputs, + const std::string &size) { + std::string engine_hash_key = ""; + for (auto name : engine_inputs) { + engine_hash_key += name; + } + for (auto name : engine_outputs) { + engine_hash_key += name; + } + engine_hash_key += size; + auto engine_key = std::to_string(std::hash()(engine_hash_key)); + return engine_key; +} + +void NgraphSubgraphPass::ApplyImpl(ir::Graph *graph) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init("ngraph_subgraph_pass", graph); + + std::unordered_set nodes2delete; + + auto teller = [](const Node *node) { + if (!node->IsOp() || !node->Op()) return false; + auto op_type = node->Op()->Type(); + return !paddle::operators::NgraphBridge::isRegister(op_type); + }; + + ANAT::SubGraphFuser fuser(graph, teller, 0, "ngraph_engine"); + fuser(); + + for (auto *node : graph->Nodes()) { + if (node->IsOp() && !ANAT::Agent(node).subgraph()->empty()) { + OpDesc *op_desc = node->Op(); + op_desc->SetType("ngraph_engine"); + for (auto it = ANAT::Agent(node).subgraph()->begin(); + it != ANAT::Agent(node).subgraph()->end(); ++it) { + } + + CreateNgraphEngineOp(node, graph); + + std::unordered_set nodes2remove( + ANAT::Agent(node).subgraph()->begin(), + ANAT::Agent(node).subgraph()->end()); + GraphSafeRemoveNodes(graph, nodes2remove); + } + } + + std::unordered_set nodes2remove; + for (auto *node : graph->Nodes()) { + if (node->IsOp() && ANAT::Agent(node).deleted()) { + nodes2remove.insert(node); + } + } + framework::ir::GraphSafeRemoveNodes(graph, nodes2remove); + std::vector nodes = ir::TopologySortOperations(*graph); +} + +void NgraphSubgraphPass::CreateNgraphEngineOp(framework::ir::Node *node, + Graph *graph) const { + auto *op_desc = node->Op(); + auto &subgraph = *ANAT::Agent(node).subgraph(); + PADDLE_ENFORCE(!subgraph.empty()); + + framework::ProgramDesc *program_desc = + Get("program"); + const framework::BlockDesc &main_block = + program_desc->Block(framework::kRootBlockIndex); + framework::BlockDesc *new_block = program_desc->AppendBlock(main_block); + + framework::proto::BlockDesc block_proto; + framework::BlockDesc block_desc(nullptr, &block_proto); + block_desc.Proto()->set_parent_idx(-1); + block_desc.Proto()->set_idx(0); + for (auto *node : subgraph) { + auto *new_block_op = new_block->AppendOp(); + auto *op = block_desc.AppendOp(); + *new_block_op->Proto() = *node->Op()->Proto(); + *op->Proto() = *node->Op()->Proto(); + } + + std::set input_names; + std::set input_names_with_id; + for (auto *x : node->inputs) { + input_names.insert(x->Name()); + input_names_with_id.insert(x->Name() + std::to_string(x->id())); + } + op_desc->SetInput( + "Xs", std::vector(input_names.begin(), input_names.end())); + + std::set output_names; + std::set output_names_with_id; + + for (auto *x : node->outputs) { + output_names.insert(x->Name()); + output_names_with_id.insert(x->Name() + std::to_string(x->id())); + } + op_desc->SetOutput( + "Ys", std::vector(output_names.begin(), output_names.end())); + auto *vars = block_desc.Proto()->mutable_vars(); + for (framework::ir::Node *node : graph->Nodes()) { + if (node->IsVar() && node->Var()) { + *vars->Add() = *node->Var()->Proto(); + } + } + + PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(), + "the block has no var-desc"); + + op_desc->SetType("ngraph_engine"); + + int sgs = subgraph.size(); + std::string engine_key = GenerateEngineKey( + input_names_with_id, output_names_with_id, std::to_string(sgs)); + std::vector interval{0, sgs}; + op_desc->SetAttr("interval", interval); + op_desc->SetAttr("graph", block_desc.Proto()->SerializeAsString()); + op_desc->SetAttr("engine_key", engine_key); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(ngraph_subgraph_pass, paddle::framework::ir::NgraphSubgraphPass); diff --git a/paddle/fluid/framework/ir/ngraph_subgraph_pass.h b/paddle/fluid/framework/ir/ngraph_subgraph_pass.h new file mode 100644 index 00000000..09f06267 --- /dev/null +++ b/paddle/fluid/framework/ir/ngraph_subgraph_pass.h @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Fuse supported ops to a NgraphEngineOp. + */ +class NgraphSubgraphPass : public FusePassBase { + public: + void ApplyImpl(ir::Graph *graph) const override; + + virtual ~NgraphSubgraphPass() {} + + private: + void CreateNgraphEngineOp(framework::ir::Node *x, + framework::ir::Graph *graph) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc new file mode 100644 index 00000000..45d81b93 --- /dev/null +++ b/paddle/fluid/framework/ir/node.cc @@ -0,0 +1,43 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/op_info.h" + +namespace paddle { +namespace framework { +namespace ir { +// msvc15 don't support constexpr in correct way. +#if !defined(_WIN32) +constexpr char Node::kControlDepVarName[]; +#else +const char Node::kControlDepVarName[] = "__control_var"; +#endif + +std::unique_ptr CreateNodeForTest(const std::string &name, + Node::Type type) { + return std::unique_ptr(new Node(name, type)); +} + +std::unique_ptr CreateNodeForTest(VarDesc *var_desc) { + return std::unique_ptr(new Node(var_desc)); +} + +std::unique_ptr CreateNodeForTest(OpDesc *op_desc) { + return std::unique_ptr(new Node(op_desc)); +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h new file mode 100644 index 00000000..276e6a5b --- /dev/null +++ b/paddle/fluid/framework/ir/node.h @@ -0,0 +1,174 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace framework { +namespace ir { + +// Node should only created by Graph::CreateXXXNode(). +// 1. Every Node should be part of a graph. No dangling Node exists. +// 2. Node only contains members necessary for building graph structure. +// It doesn't contain other unrelated members, such as device, etc. +// +// Sometimes, for specific usages, Node needs to have additional members, +// such as device_placement, version in order to be executed. It is suggested +// to use composition pattern. +// +// class RunnableOp { +// RunnableOp(ir::Node* n) : n_(n) { n_.WrappedBy(this); } +// +// int any_thing_; +// } +// +// RunnableOp is owned by the ir::Node that composes it. In other words. +// ir::Node will be responsible for deleting RunnableOp, say, when ir::Node +// is deleted from the graph. +class Node { + public: + virtual ~Node() { + if (!wrapper_.empty()) { + VLOG(10) << "ir::Node deleting a wrapper node " << Name(); + wrapper_deleter_(); + } + } + + enum class Type { kOperation, kVariable }; +#if !defined(_WIN32) // msvc not support constexpr correctly. + static constexpr char kControlDepVarName[] = "__control_var"; +#else + static const char kControlDepVarName[]; +#endif + + Type NodeType() const { return type_; } + + std::string Name() const { return name_; } + + VarDesc* Var() const { + PADDLE_ENFORCE(IsVar()); + return var_desc_.get(); + } + + OpDesc* Op() const { + PADDLE_ENFORCE(IsOp()); + return op_desc_.get(); + } + + // Set the `wrapper` that wraps the Node. `wrapper` is owned by Node. + template + void WrappedBy(T* wrapper) { + if (!wrapper_.empty()) { + wrapper_deleter_(); + } + wrapper_ = wrapper; + wrapper_deleter_ = [wrapper]() { delete wrapper; }; + wrapper_type_ = std::type_index(typeid(T)); + } + + // Return a reference to the `wrapper`. + template + T& Wrapper() { + try { + return *boost::any_cast(wrapper_); + } catch (boost::bad_any_cast&) { + PADDLE_THROW("Invalid wrapper type error, expected %s, actual %s", + typeid(T).name(), wrapper_type_.name()); + } + } + + // Test if the Node is wrapped by type T. + template + bool IsWrappedBy() { + return std::type_index(typeid(T)) == wrapper_type_; + } + + // Please don't use this API! + int id() const { return id_; } + + bool IsOp() const { return type_ == Type::kOperation; } + bool IsVar() const { return type_ == Type::kVariable; } + bool IsCtrlVar() const { + return type_ == Type::kVariable && + Name().find(ir::Node::kControlDepVarName) != std::string::npos; + } + + void RenameVar(const std::string& new_name) { + PADDLE_ENFORCE(type_ == Type::kVariable && var_desc_, + "Must be type of variable"); + name_ = new_name; + var_desc_->SetName(new_name); + } + + std::vector inputs; + std::vector outputs; + + protected: + std::string name_; + std::unique_ptr var_desc_; + std::unique_ptr op_desc_; + Type type_; + int id_; + + private: + // ID can only set by a Graph. + void SetId(int id) { id_ = id; } + + friend class Graph; + friend std::unique_ptr CreateNodeForTest(const std::string& name, + Node::Type type); + friend std::unique_ptr CreateNodeForTest(VarDesc* var_desc); + friend std::unique_ptr CreateNodeForTest(OpDesc* op_desc); + + explicit Node(const std::string& name, Type type) + : name_(name), var_desc_(nullptr), op_desc_(nullptr), type_(type) {} + + explicit Node(VarDesc* var_desc) + : name_(var_desc->Name()), + var_desc_(new VarDesc(*var_desc)), + op_desc_(nullptr), + type_(Type::kVariable) {} + + explicit Node(OpDesc* op_desc) + : name_(op_desc->Type()), + var_desc_(nullptr), + op_desc_(new OpDesc(*op_desc, op_desc->Block())), + type_(Type::kOperation) {} + + Node() = delete; + + boost::any wrapper_; + std::function wrapper_deleter_; + std::type_index wrapper_type_ = std::type_index(typeid(void)); + + DISABLE_COPY_AND_ASSIGN(Node); +}; + +std::unique_ptr CreateNodeForTest(const std::string& name, + Node::Type type); +std::unique_ptr CreateNodeForTest(VarDesc* var_desc); + +std::unique_ptr CreateNodeForTest(OpDesc* op_desc); +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/node_test.cc b/paddle/fluid/framework/ir/node_test.cc new file mode 100644 index 00000000..694efadd --- /dev/null +++ b/paddle/fluid/framework/ir/node_test.cc @@ -0,0 +1,80 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class RunnableOp { + public: + RunnableOp(Node* node, bool* alive) : node_(node), alive_(alive) { + node_->WrappedBy(this); + } + + virtual ~RunnableOp() { *alive_ = false; } + + private: + Node* node_; + bool* alive_; +}; + +class RunnableOp2 { + public: + RunnableOp2(Node* node, bool* alive) : node_(node), alive_(alive) { + node_->WrappedBy(this); + } + + virtual ~RunnableOp2() { *alive_ = false; } + + private: + Node* node_; + bool* alive_; +}; + +TEST(NodeTest, Basic) { + bool alive1 = true; + bool alive2 = true; + std::unique_ptr n1(CreateNodeForTest("n1", Node::Type::kVariable)); + std::unique_ptr n2(CreateNodeForTest("n2", Node::Type::kVariable)); + + EXPECT_FALSE(n1->IsWrappedBy()); + EXPECT_FALSE(n1->IsWrappedBy()); + EXPECT_FALSE(n2->IsWrappedBy()); + EXPECT_FALSE(n2->IsWrappedBy()); + + new RunnableOp(n1.get(), &alive1); + new RunnableOp2(n2.get(), &alive2); + + EXPECT_TRUE(n1->IsWrappedBy()); + EXPECT_FALSE(n1->IsWrappedBy()); + EXPECT_FALSE(n2->IsWrappedBy()); + EXPECT_TRUE(n2->IsWrappedBy()); + + EXPECT_TRUE(alive1); + EXPECT_TRUE(alive2); + + n1.reset(nullptr); + n2.reset(nullptr); + EXPECT_FALSE(alive1); + EXPECT_FALSE(alive2); +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc new file mode 100644 index 00000000..ca8e8299 --- /dev/null +++ b/paddle/fluid/framework/ir/pass.cc @@ -0,0 +1,52 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/pass.h" + +#include +#include + +#include "paddle/fluid/framework/ir/graph_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +Graph* Pass::Apply(Graph* graph) const { + PADDLE_ENFORCE(graph, "graph passed to Pass::Apply() cannot be empty."); + for (const std::string& attr : required_pass_attrs_) { + PADDLE_ENFORCE(attrs_.find(attr) != attrs_.end(), + "Required pass atrribute %s not set.", attr); + } + for (const std::string& attr : required_graph_attrs_) { + PADDLE_ENFORCE(graph->Has(attr), "Required graph atrribute %s not set.", + attr); + } + ApplyImpl(graph); + // TODO(panyx0718): Add more verifications. + PADDLE_ENFORCE(!HasCircle(*graph), + "Illegal Pass. Generated graph shouldn't has cycle."); + PADDLE_ENFORCE(VarDescIsConsistency(*graph), + "The VarDescs of persistable variable are not consistency."); + applied_ = true; + return graph; +} + +PassRegistry& PassRegistry::Instance() { + static PassRegistry g_pass_info_map; + return g_pass_info_map; +} +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h new file mode 100644 index 00000000..6cbe9a82 --- /dev/null +++ b/paddle/fluid/framework/ir/pass.h @@ -0,0 +1,232 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/platform/variant.h" + +namespace paddle { +namespace framework { +namespace ir { +template +struct PassRegistrar; + +class Pass { + public: + Pass() = default; + virtual ~Pass() { + for (auto &attr : attrs_) { + if (attr_dels_.find(attr.first) != attr_dels_.end()) { + attr_dels_[attr.first](); + } + } + attrs_.clear(); + attr_dels_.clear(); + } + + std::string Type() const { return type_; } + + Graph *Apply(Graph *graph) const; + + // Get a reference to the attributed previously set. + template + AttrType &Get(const std::string &attr_name) const { + PADDLE_ENFORCE(attrs_.find(attr_name) != attrs_.end(), + "%s attr not registered for pass.", attr_name); + try { + return *boost::any_cast(attrs_.at(attr_name)); + } catch (boost::bad_any_cast &) { + PADDLE_THROW( + "Invalid attribute type of %s error, expected: %s, actual: %s", + attr_name, typeid(AttrType *).name(), + attrs_.at(attr_name).type().name()); + } + } + + bool Has(const std::string &attr_name) const { + return attrs_.count(attr_name) > 0; + } + + void Erase(const std::string &attr_name) { + if (!Has(attr_name)) { + return; + } + if (attr_dels_.find(attr_name) != attr_dels_.end()) { + attr_dels_[attr_name](); + attr_dels_.erase(attr_name); + } + attrs_.erase(attr_name); + } + + // Set a pointer to the attribute. Pass takes ownership of the attribute. + template + void Set(const std::string &attr_name, AttrType *attr) { + PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the pass", + attr_name); + attrs_[attr_name] = attr; + attr_dels_[attr_name] = [attr, attr_name]() { + VLOG(3) << "deleting " << attr_name; + delete attr; + }; + } + + // Set a pointer to the attribute. Pass doesn't take ownership. Caller + // should delete the attribute. + template + void SetNotOwned(const std::string &attr_name, AttrType *attr) { + PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the pass", + attr_name); + attrs_[attr_name] = attr; + } + + protected: + virtual void ApplyImpl(Graph *graph) const { + LOG(FATAL) << "Calling virtual Pass not implemented."; + } + + private: + template + friend struct PassRegistrar; + + void RegisterRequiredPassAttrs(const std::unordered_set &attrs) { + required_pass_attrs_.insert(attrs.begin(), attrs.end()); + } + + void RegisterRequiredGraphAttrs( + const std::unordered_set &attrs) { + required_graph_attrs_.insert(attrs.begin(), attrs.end()); + } + + void RegisterType(const std::string &type) { type_ = type; } + + mutable bool applied_{false}; + std::string type_; + std::unordered_set required_pass_attrs_; + std::unordered_set required_graph_attrs_; + std::map attrs_; + std::map> attr_dels_; +}; + +using PassCreator = std::function()>; + +class Registrar { + public: + // In our design, various kinds of passes, + // have their corresponding registry and registrar. The action of + // registration is in the constructor of a global registrar variable, which + // are not used in the code that calls package framework, and would + // be removed from the generated binary file by the linker. To avoid such + // removal, we add Touch to all registrar classes and make USE_PASS macros to + // call this method. So, as long as the callee code calls USE_PASS, the global + // registrar variable won't be removed by the linker. + void Touch() {} +}; + +class PassRegistry { + public: + static PassRegistry &Instance(); + + bool Has(const std::string &pass_type) const { + return map_.find(pass_type) != map_.end(); + } + + void Insert(const std::string &pass_type, const PassCreator &pass_creator) { + PADDLE_ENFORCE(!Has(pass_type), "Pass %s has been registered", pass_type); + map_.insert({pass_type, pass_creator}); + } + + std::unique_ptr Get(const std::string &pass_type) const { + PADDLE_ENFORCE(Has(pass_type), "Pass %s has not been registered", + pass_type); + return map_.at(pass_type)(); + } + + private: + PassRegistry() = default; + std::unordered_map map_; + + DISABLE_COPY_AND_ASSIGN(PassRegistry); +}; + +template +struct PassRegistrar : public Registrar { + explicit PassRegistrar(const char *pass_type) { + PADDLE_ENFORCE(!PassRegistry::Instance().Has(pass_type), + "'%s' is registered more than once.", pass_type); + PassRegistry::Instance().Insert( + pass_type, [this, pass_type]() -> std::unique_ptr { + std::unique_ptr pass(new PassType()); + pass->RegisterRequiredPassAttrs(this->required_pass_attrs_); + pass->RegisterRequiredGraphAttrs(this->required_graph_attrs_); + pass->RegisterType(pass_type); + return pass; + }); + } + + PassRegistrar &RequirePassAttr(const std::string &attr) { + required_pass_attrs_.insert(attr); + return *this; + } + + PassRegistrar &RequireGraphAttr(const std::string &attr) { + required_graph_attrs_.insert(attr); + return *this; + } + + private: + std::unordered_set required_pass_attrs_; + std::unordered_set required_graph_attrs_; +}; + +#define STATIC_ASSERT_PASS_GLOBAL_NAMESPACE(uniq_name, msg) \ + struct __test_global_namespace_##uniq_name##__ {}; \ + static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ + __test_global_namespace_##uniq_name##__>::value, \ + msg) + +// Register a new pass that can be applied on the IR. +#define REGISTER_PASS(pass_type, pass_class) \ + STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ + __reg_pass__##pass_type, \ + "REGISTER_PASS must be called in global namespace"); \ + static ::paddle::framework::ir::PassRegistrar \ + __pass_registrar_##pass_type##__(#pass_type); \ + int TouchPassRegistrar_##pass_type() { \ + __pass_registrar_##pass_type##__.Touch(); \ + return 0; \ + } \ + static ::paddle::framework::ir::PassRegistrar \ + &__pass_tmp_registrar_##pass_type##__ UNUSED = \ + __pass_registrar_##pass_type##__ + +#define USE_PASS(pass_type) \ + STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ + __use_pass_itself_##pass_type, \ + "USE_PASS must be called in global namespace"); \ + extern int TouchPassRegistrar_##pass_type(); \ + static int use_pass_itself_##pass_type##_ UNUSED = \ + TouchPassRegistrar_##pass_type() + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/pass_builder.cc b/paddle/fluid/framework/ir/pass_builder.cc new file mode 100644 index 00000000..e0719867 --- /dev/null +++ b/paddle/fluid/framework/ir/pass_builder.cc @@ -0,0 +1,43 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/pass_builder.h" + +namespace paddle { +namespace framework { +namespace ir { + +std::shared_ptr PassBuilder::AppendPass(const std::string& pass_type) { + auto pass = ir::PassRegistry::Instance().Get(pass_type); + passes_.emplace_back(pass.release()); + return passes_.back(); +} + +void PassBuilder::RemovePass(size_t idx) { + PADDLE_ENFORCE(passes_.size() > idx); + passes_.erase(passes_.begin() + idx); +} + +std::shared_ptr PassBuilder::InsertPass(size_t idx, + const std::string& pass_type) { + PADDLE_ENFORCE(passes_.size() >= idx); + std::shared_ptr pass( + ir::PassRegistry::Instance().Get(pass_type).release()); + passes_.insert(passes_.begin() + idx, std::move(pass)); + return passes_[idx]; +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/pass_builder.h b/paddle/fluid/framework/ir/pass_builder.h new file mode 100644 index 00000000..733d3a3a --- /dev/null +++ b/paddle/fluid/framework/ir/pass_builder.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class PassBuilder { + public: + PassBuilder() {} + + virtual ~PassBuilder() {} + + // Append a new pass to the end. + std::shared_ptr AppendPass(const std::string& pass_type); + + // Insert a new pass after `idx`. + std::shared_ptr InsertPass(size_t idx, const std::string& pass_type); + + // Remove a new pass at `idx`. + void RemovePass(size_t idx); + + // Returns a list of all passes. + std::vector> AllPasses() const { return passes_; } + + protected: + std::vector> passes_; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/pass_test.cc b/paddle/fluid/framework/ir/pass_test.cc new file mode 100644 index 00000000..87e3c964 --- /dev/null +++ b/paddle/fluid/framework/ir/pass_test.cc @@ -0,0 +1,111 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/pass.h" +#include +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace framework { +namespace ir { +void BuildCircleGraph(Graph* g) { + ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation); + ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation); + ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable); + ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable); + + o1->outputs.push_back(v1); + o2->inputs.push_back(v1); + v1->inputs.push_back(o1); + v1->outputs.push_back(o2); + + o2->outputs.push_back(v2); + o1->inputs.push_back(v2); + v2->inputs.push_back(o2); + v2->outputs.push_back(o1); +} + +class TestPass : public Pass { + protected: + void ApplyImpl(ir::Graph* graph) const { + graph->Set("copy_test_pass_attr", new int); + graph->Set("copy_test_graph_attr", new int); + + int test_pass_attr = this->Get("test_pass_attr"); + graph->Get("copy_test_pass_attr") = test_pass_attr + 1; + + int test_graph_attr = graph->Get("test_graph_attr"); + graph->Get("copy_test_graph_attr") = test_graph_attr + 1; + } +}; + +TEST(PassTest, TestPassAttrCheck) { + ProgramDesc prog; + auto pass = PassRegistry::Instance().Get("test_pass"); + std::unique_ptr graph(new Graph(prog)); + std::string exception; + try { + graph.reset(pass->Apply(graph.release())); + } catch (paddle::platform::EnforceNotMet e) { + exception = std::string(e.what()); + } + ASSERT_TRUE(exception.find("test_pass_attr not set") != exception.npos); + + int val = 1; + graph.reset(new Graph(prog)); + pass->SetNotOwned("test_pass_attr", &val); + + try { + graph.reset(pass->Apply(graph.release())); + } catch (paddle::platform::EnforceNotMet e) { + exception = std::string(e.what()); + } + ASSERT_TRUE(exception.find("test_graph_attr not set") != exception.npos); + + graph.reset(new Graph(prog)); + graph->Set("test_graph_attr", new int); + graph->Get("test_graph_attr") = 1; + graph.reset(pass->Apply(graph.release())); + ASSERT_EQ(graph->Get("copy_test_pass_attr"), 2); + ASSERT_EQ(graph->Get("copy_test_graph_attr"), 2); + + // Allow apply more than once. + graph.reset(new Graph(prog)); + graph->Set("test_graph_attr", new int); + graph.reset(pass->Apply(graph.release())); + + pass = PassRegistry::Instance().Get("test_pass"); + pass->SetNotOwned("test_pass_attr", &val); + graph.reset(new Graph(prog)); + BuildCircleGraph(graph.get()); + graph->Set("test_graph_attr", new int); + graph->Get("test_graph_attr") = 2; + try { + pass->Apply(graph.release()); + } catch (paddle::platform::EnforceNotMet e) { + exception = std::string(e.what()); + } + ASSERT_TRUE(exception.find("shouldn't has cycle") != exception.npos); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(test_pass, paddle::framework::ir::TestPass) + .RequirePassAttr("test_pass_attr") + .RequireGraphAttr("test_graph_attr"); diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc new file mode 100644 index 00000000..62fba440 --- /dev/null +++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc @@ -0,0 +1,208 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph_viz_pass.h" +#include "paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +void RunQuantDequant(ir::Graph* graph, Scope* scope, int times, + const std::string& op_type, const std::string& quant_type, + const std::string& dequant_type) { + const std::string pattern_name = "quant_dequant_fuse"; + int kNumFields = 5; + const int kQuantizedWeightOffset = 0; + const int kQuantizedOpOffset = 1; + const int kQuantizedOpOutOffset = 2; + const int kDequantOpOffset = 3; + const int kDequantOpOutOffset = 4; + const int kDequantOpWeightScaleOffset = 5; + + if (dequant_type == "fake_channel_wise_dequantize_max_abs") { + kNumFields += 1; + } + + GraphPatternDetector gpd; + auto* x = gpd.mutable_pattern() + ->NewNode("x") + ->assert_is_op_input(quant_type, "X") + ->AsInput(); + + std::string quantized_op_type = op_type; + std::string weight_name = ""; + if (op_type == "conv2d" || op_type == "depthwise_conv2d" || + op_type == "conv2d_fusion") { + weight_name = "Filter"; + } else if (op_type == "mul") { + weight_name = "Y"; + } else if (op_type == "fc") { + weight_name = "W"; + } else { + PADDLE_ENFORCE( + "QuantDequantFuse: We only support conv2d, conv2d_fusion, fc, mul for " + "now."); + } + + patterns::QuantDequantOpFuse pattern(gpd.mutable_pattern(), pattern_name); + pattern(x, quantized_op_type, weight_name, times, quant_type, dequant_type); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + PADDLE_ENFORCE(subgraph.count(x)); + auto* input_node = subgraph.at(x); + Node* quant_op_in_scale = + subgraph.at(pattern.GetPDNode("quant_op_in_scale")); + Node* quant_op = subgraph.at(pattern.GetPDNode("quant_op")); + Node* quant_op_out_scale = + subgraph.at(pattern.GetPDNode("quant_op_out_scale")); + Node* quant_op_out = subgraph.at(pattern.GetPDNode("quant_op_out")); + + std::vector nodes; + for (int i = 0; i < times; i++) { + nodes.push_back(subgraph.at( + pattern.GetPDNode("quantized_op_weight" + std::to_string(i)))); + nodes.push_back( + subgraph.at(pattern.GetPDNode("quantized_op" + std::to_string(i)))); + nodes.push_back(subgraph.at( + pattern.GetPDNode("quantized_op_out" + std::to_string(i)))); + nodes.push_back( + subgraph.at(pattern.GetPDNode("dequant_op" + std::to_string(i)))); + nodes.push_back( + subgraph.at(pattern.GetPDNode("dequant_op_out" + std::to_string(i)))); + if (dequant_type == "fake_channel_wise_dequantize_max_abs") { + nodes.push_back(subgraph.at( + pattern.GetPDNode("dequant_channel_scale" + std::to_string(i)))); + } + } + + int bit_length = boost::get(quant_op->Op()->GetAttr("bit_length")); + int range = ((1 << (bit_length - 1)) - 1); + // Prepare input scale + std::string input_scale_var_name = quant_op->Op()->Input("InScale").front(); + PADDLE_ENFORCE(scope); + const LoDTensor& input_scale_tensor = + scope->FindVar(input_scale_var_name)->Get(); + + PADDLE_ENFORCE(paddle::platform::is_cpu_place(input_scale_tensor.place())); + const float* input_scale_data = input_scale_tensor.data(); + float input_scale = input_scale_data[0]; + std::unordered_set delete_nodes; + + for (int i = 0; i < times; i++) { + std::vector weight_scale; + + // Get weight scale from dequant op. + if (dequant_type == "fake_channel_wise_dequantize_max_abs") { + auto scales_name = + nodes[i * kNumFields + kDequantOpOffset]->Op()->Input("Scales"); + PADDLE_ENFORCE(scales_name.size() == 2); + const LoDTensor& channel_scale_tensor = + scope->FindVar(scales_name[0])->Get(); + PADDLE_ENFORCE( + paddle::platform::is_cpu_place(channel_scale_tensor.place())); + const float* channel_scale_data = channel_scale_tensor.data(); + for (int i = 0; i < channel_scale_tensor.numel(); i++) { + weight_scale.push_back(channel_scale_data[i]); + } + delete_nodes.insert( + nodes[i * kNumFields + kDequantOpWeightScaleOffset]); + } else { + float max_range = boost::get( + nodes[i * kNumFields + kDequantOpOffset]->Op()->GetAttr( + "max_range")); + weight_scale.push_back((range * range) / max_range); + } + + // create new op_desc + auto base_op_desc = + *nodes[i * kNumFields + kQuantizedOpOffset]->Op()->Proto(); + std::string new_input = input_node->Name(); + std::string new_output = + nodes[i * kNumFields + kDequantOpOutOffset]->Name(); + + framework::OpDesc new_op_desc(base_op_desc, nullptr); + new_op_desc.SetType(quantized_op_type); + + if (quantized_op_type == "conv2d" || + quantized_op_type == "conv2d_fusion" || + quantized_op_type == "depthwise_conv2d") { + new_op_desc.SetInput("Input", {new_input}); + new_op_desc.SetOutput("Output", {new_output}); + } else if (quantized_op_type == "fc") { + new_op_desc.SetInput("Input", {new_input}); + new_op_desc.SetOutput("Out", {new_output}); + } else if (quantized_op_type == "mul") { + new_op_desc.SetInput("X", {new_input}); + new_op_desc.SetOutput("Out", {new_output}); + } + + new_op_desc.SetAttr("enable_int8", true); + new_op_desc.SetAttr("input_scale", input_scale); + new_op_desc.SetAttr("weight_scale", weight_scale); + new_op_desc.Flush(); + auto* new_op = graph->CreateOpNode(&new_op_desc); + IR_NODE_LINK_TO(input_node, new_op); + IR_NODE_LINK_TO(nodes[i * kNumFields + kQuantizedWeightOffset], new_op); + IR_NODE_LINK_TO(new_op, nodes[i * kNumFields + kDequantOpOutOffset]); + + delete_nodes.insert(nodes[i * kNumFields + kQuantizedOpOffset]); + delete_nodes.insert(nodes[i * kNumFields + kQuantizedOpOutOffset]); + delete_nodes.insert(nodes[i * kNumFields + kDequantOpOffset]); + } + + delete_nodes.insert(quant_op_in_scale); + delete_nodes.insert(quant_op); + delete_nodes.insert(quant_op_out); + delete_nodes.insert(quant_op_out_scale); + // Delete the unneeded nodes. + GraphSafeRemoveNodes(graph, delete_nodes); + }; + gpd(graph, handler); +} + +void QuantDequantFusePass::ApplyImpl(ir::Graph* graph) const { + const std::string pattern_name = "quant_dequant_fuse"; + FusePassBase::Init(pattern_name, graph); + + std::unordered_set dequant_types = { + "fake_dequantize_max_abs", "fake_channel_wise_dequantize_max_abs"}; + std::unordered_set quant_types = { + "fake_quantize_range_abs_max", "fake_quantize_moving_average_abs_max"}; + std::unordered_set quantized_op_types = {"conv2d", "mul", + "depthwise_conv2d"}; + auto* scope = param_scope(); + for (auto& dequant_type : dequant_types) { + for (auto& quant_type : quant_types) { + for (auto& op_type : quantized_op_types) { + for (int i = 6; i >= 1; i--) { + RunQuantDequant(graph, scope, i, op_type, quant_type, dequant_type); + } + } + } + } +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(quant_conv2d_dequant_fuse_pass, + paddle::framework::ir::QuantDequantFusePass); diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.h b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.h new file mode 100644 index 00000000..a61b3456 --- /dev/null +++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.h @@ -0,0 +1,35 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +class QuantDequantFusePass : public FusePassBase { + public: + virtual ~QuantDequantFusePass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc new file mode 100644 index 00000000..00263b8a --- /dev/null +++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc @@ -0,0 +1,384 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h" +#include // for max +#include +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" + +#define MAX_NUM_FC 10 + +namespace paddle { +namespace framework { +namespace ir { + +PDNode* BuildRepeatedFCReluPattern(PDPattern* pattern, + const std::string& name_scope, int num_fc) { + auto var_next_is_fc_act = [=](Node* x, const std::string& act_type = "relu", + bool check_in_has_only_one_out = true, + int fc_idx = 0) -> bool { + bool next_is_fc = x && x->IsVar() && VarLinksToOp(x, "fc"); + if (check_in_has_only_one_out) { + next_is_fc = next_is_fc && x->outputs.size() == 1; + } + if (!next_is_fc) { + return false; + } + auto* fc_op = x->outputs[fc_idx]; + bool next_is_act = fc_op && fc_op->IsOp() && fc_op->outputs.size() == 1 && + fc_op->outputs[0] && fc_op->outputs[0]->IsVar() && + VarLinksToOp(fc_op->outputs[0], act_type) && + fc_op->outputs[0]->outputs.size() == 1; + if (!next_is_act) { + return false; + } + auto* act_op = fc_op->outputs[0]->outputs[0]; + return act_op && act_op->IsOp() && act_op->outputs.size() == 1; + }; + + auto find_fc_idx = [=](Node* x, const std::string& act_type = "relu") -> int { + bool next_is_fc = x && x->IsVar() && VarLinksToOp(x, "fc"); + if (!next_is_fc) { + return 0; + } + for (size_t k = 0; k < x->outputs.size(); ++k) { + auto* fc_op = x->outputs[k]; + bool next_is_act = fc_op && fc_op->IsOp() && fc_op->outputs.size() == 1 && + fc_op->outputs[0] && fc_op->outputs[0]->IsVar() && + VarLinksToOp(fc_op->outputs[0], act_type) && + fc_op->outputs[0]->outputs.size() == 1; + if (!next_is_act) { + continue; + } + auto* act_op = fc_op->outputs[0]->outputs[0]; + if (act_op && act_op->IsOp() && act_op->outputs.size() == 1) { + return k; + } + } + return 0; + }; + + auto next_var_of_part = [=](Node* x, int fc_idx = 0) -> Node* { + return x->outputs[fc_idx]->outputs[0]->outputs[0]->outputs[0]; + }; + auto var_next_is_fc_act_repeated_n_times = [=]( + Node* x, int repeated_times, const std::string& act_type = "relu", + bool check_in_has_only_one_out = true) -> bool { + for (int i = 0; i < repeated_times; ++i) { + if (!var_next_is_fc_act(x, act_type, + i == 0 && check_in_has_only_one_out)) { + return false; + } + x = next_var_of_part(x); + } + return true; + }; + + auto var_before_is_fc_act = [=](Node* x, const std::string& act_type = "relu", + bool at_top = false) -> bool { + bool before_is_act = + x && x->IsVar() && x->inputs.size() == 1 && VarLinksFromOp(x, "relu"); + if (!before_is_act) { + return false; + } + auto* relu_op = x->inputs[0]; + bool before_is_fc = relu_op->IsOp() && relu_op->inputs.size() == 1 && + relu_op->inputs[0]->IsVar() && + VarLinksFromOp(relu_op->inputs[0], "fc") && + relu_op->inputs[0]->inputs.size() == 1; + + if (!before_is_fc) { + return false; + } + auto* fc_op = relu_op->inputs[0]->inputs[0]; + bool is_fc = fc_op->IsOp() && fc_op->inputs.size() == 3; + if (!is_fc) { + return false; + } + for (auto* fc_i : fc_op->inputs) { + if (!fc_i->inputs.empty()) { + if (at_top) { + return true; + } else { + return VarLinksFromOp(fc_i, "relu"); + } + } + } + return false; + }; + + auto before_var_of_part = [=](Node* x) -> Node* { + auto* fc_op = x->inputs[0]->inputs[0]; + for (auto* fc_i : fc_op->inputs) { + if (!fc_i->inputs.empty()) { + return fc_i->inputs[0]; + } + } + return nullptr; + }; + + auto var_before_is_fc_act_repeated_n_times = [=]( + Node* x, int repeated_times, + const std::string& act_type = "relu") -> bool { + for (int i = 0; i < repeated_times; ++i) { + if (!var_before_is_fc_act(x, act_type, i == repeated_times - 1)) { + return false; + } + x = before_var_of_part(x); + } + return true; + }; + + std::vector fc_input_var(num_fc); + std::vector fc_output_var(num_fc); + std::vector fc_weight_var(num_fc); + std::vector fc_bias_var(num_fc); + std::vector fc_ops(num_fc); + std::vector relu_ops(num_fc); + + for (int i = 0; i < num_fc; ++i) { + fc_input_var[i] = pattern->NewNode( + [=](Node* x) { + if (i == 0 && x->outputs.size() > 0) { + bool ok = x->inputs.size() > 0; + if (!ok) { + return false; + } + int idx = find_fc_idx(x); + if (idx == 0) { + return var_next_is_fc_act_repeated_n_times(x, num_fc - i, "relu"); + } else { + x = next_var_of_part(x, idx); + return var_next_is_fc_act_repeated_n_times( + x, std::max(1, num_fc - i - 1), "relu"); + } + } else { + return var_next_is_fc_act_repeated_n_times(x, num_fc - i, "relu") && + x->inputs.size() > 0 && + var_before_is_fc_act_repeated_n_times(x, i, "relu"); + } + }, + name_scope + "/fc_in_" + std::to_string(i)); + + fc_weight_var[i] = pattern->NewNode( + [=](Node* x) { + return var_next_is_fc_act_repeated_n_times(x, num_fc - i, "relu") && + x->inputs.empty() && + var_before_is_fc_act_repeated_n_times(x->outputs[0]->inputs[0], + i, "relu") && + x->Name() == x->outputs[0]->Op()->Input("W")[0]; + }, + name_scope + "/fc_weight_" + std::to_string(i)); + + fc_bias_var[i] = pattern->NewNode( + [=](Node* x) { + return var_next_is_fc_act_repeated_n_times(x, num_fc - i, "relu") && + x->inputs.empty() && + var_before_is_fc_act_repeated_n_times(x->outputs[0]->inputs[0], + i, "relu") && + x->Name() == x->outputs[0]->Op()->Input("Bias")[0]; + }, + name_scope + "/fc_bias_" + std::to_string(i)); + + fc_output_var[i] = pattern->NewNode( + [=](Node* x) { + bool basic = x && x->IsVar() && VarLinksFromOp(x, "fc") && + VarLinksToOp(x, "relu") && x->inputs.size() == 1 && + x->inputs[0]->inputs.size() == 3; + if (!basic) { + return false; + } + x = x->inputs[0]->inputs[0]; + if (i == 0 && x->outputs.size() > 0) { + bool ok = x->inputs.size() > 0; + if (!ok) { + return false; + } + int idx = find_fc_idx(x); + if (idx == 0) { + return var_next_is_fc_act_repeated_n_times(x, num_fc - i, "relu"); + } else { + x = next_var_of_part(x, idx); + return var_next_is_fc_act_repeated_n_times( + x, std::max(1, num_fc - i - 1), "relu"); + } + } else { + return var_next_is_fc_act_repeated_n_times(x, num_fc - i, "relu") && + x->inputs.size() > 0 && + var_before_is_fc_act_repeated_n_times(x, i, "relu"); + } + }, + name_scope + "/fc_out_" + std::to_string(i)); + + fc_ops[i] = pattern->NewNode( + [=](Node* x) { + bool basic = x && x->IsOp() && x->Op()->Type() == "fc" && + x->inputs.size() == 3 && x->outputs.size() == 1; + if (!basic) { + return false; + } + auto* fc_out_var = x->outputs[0]; + return fc_out_var && fc_out_var->IsVar() && + fc_out_var->outputs.size() == 1 && + VarLinksToOp(fc_out_var, "relu") && + fc_out_var->outputs[0]->outputs.size() == 1 && + var_next_is_fc_act_repeated_n_times( + fc_out_var->outputs[0]->outputs[0], num_fc - i - 1, + "relu") && + var_before_is_fc_act_repeated_n_times( + fc_out_var->outputs[0]->outputs[0], i + 1, "relu"); + }, + name_scope + "/fc_op_" + std::to_string(i)); + + relu_ops[i] = pattern->NewNode( + [=](Node* x) { + return x && x->IsOp() && x->Op()->Type() == "relu" && + x->inputs.size() == 1 && x->outputs.size() == 1 && + x->inputs[0]->IsVar() && VarLinksFromOp(x->inputs[0], "fc") && + x->outputs[0]->IsVar() && + var_next_is_fc_act_repeated_n_times(x->outputs[0], + num_fc - i - 1, "relu") && + var_before_is_fc_act_repeated_n_times(x->outputs[0], i + 1, + "relu"); + }, + name_scope + "/act_op_" + std::to_string(i)); + + fc_ops[i] + ->LinksFrom({fc_input_var[i], fc_weight_var[i], fc_bias_var[i]}) + .LinksTo({fc_output_var[i]}); + relu_ops[i]->LinksFrom({fc_output_var[i]}); + } + + auto* last_out_var = pattern->NewNode( + [=](Node* x) { + return var_before_is_fc_act_repeated_n_times(x, num_fc, "relu"); + }, + name_scope + "/act_out"); + for (int i = 0; i < num_fc - 1; ++i) { + relu_ops[i]->LinksTo({fc_input_var[i + 1]}); + } + relu_ops[num_fc - 1]->LinksTo({last_out_var}); + return last_out_var; +} + +static int BuildFusion(Graph* graph, const std::string& name_scope, + int num_fc) { + GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + BuildRepeatedFCReluPattern(pattern, name_scope, num_fc); + + auto retrieve_node = [](const std::string& name, + const GraphPatternDetector::subgraph_t& subgraph, + const PDPattern& pat) -> Node* { + PADDLE_ENFORCE(subgraph.count(pat.RetrieveNode(name)), + "pattern has no Node called %s", name.c_str()); + Node* p = subgraph.at(pat.RetrieveNode(name)); + PADDLE_ENFORCE_NOT_NULL(p, "subgraph has no node %s", name.c_str()); + return p; + }; + + int fusion_count{0}; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + LOG(INFO) << "handle Repeated FC Act fuse"; + std::vector weights_vars(num_fc); + std::vector bias_vars(num_fc); + std::vector relu_vars(num_fc - 1); + + std::vector weight_names(num_fc); + std::vector bias_names(num_fc); + std::vector relu_names(num_fc - 1); + + auto& fused_pattern = gpd.pattern(); + for (int i = 0; i < num_fc; ++i) { + if (i >= 1) { + relu_vars[i - 1] = + retrieve_node(name_scope + "/fc_in_" + std::to_string(i), subgraph, + fused_pattern); + relu_names[i - 1] = relu_vars[i - 1]->Name(); + } + + weights_vars[i] = + retrieve_node(name_scope + "/fc_weight_" + std::to_string(i), + subgraph, fused_pattern); + weight_names[i] = weights_vars[i]->Name(); + + bias_vars[i] = retrieve_node(name_scope + "/fc_bias_" + std::to_string(i), + subgraph, fused_pattern); + bias_names[i] = bias_vars[i]->Name(); + } + + auto* input_var = + retrieve_node(name_scope + "/fc_in_0", subgraph, fused_pattern); + auto* last_out_var = + retrieve_node(name_scope + "/act_out", subgraph, fused_pattern); + + // Create New OpDesc + OpDesc op_desc; + op_desc.SetType("fusion_repeated_fc_relu"); + op_desc.SetInput("X", {input_var->Name()}); + op_desc.SetInput("W", weight_names); + op_desc.SetInput("Bias", bias_names); + op_desc.SetOutput("ReluOut", relu_names); + op_desc.SetOutput("Out", {last_out_var->Name()}); + auto* op = graph->CreateOpNode(&op_desc); + IR_NODE_LINK_TO(input_var, op); + for (size_t i = 0; i < weights_vars.size(); ++i) { + IR_NODE_LINK_TO(weights_vars[i], op); + IR_NODE_LINK_TO(bias_vars[i], op); + } + for (size_t i = 0; i < relu_vars.size(); ++i) { + IR_NODE_LINK_TO(op, relu_vars[i]); + } + IR_NODE_LINK_TO(op, last_out_var); + + std::unordered_set marked_nodes; + for (auto& item : subgraph) { + marked_nodes.insert(item.second); + } + for (size_t i = 0; i < weights_vars.size(); ++i) { + marked_nodes.erase(weights_vars[i]); + marked_nodes.erase(bias_vars[i]); + } + for (size_t i = 0; i < relu_vars.size(); ++i) { + marked_nodes.erase(relu_vars[i]); + } + marked_nodes.erase(input_var); + marked_nodes.erase(last_out_var); + GraphSafeRemoveNodes(graph, marked_nodes); + ++fusion_count; + }; + + gpd(graph, handler); + return fusion_count; +} + +void RepeatedFCReluFusePass::ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init(name_scope_, graph); + int fusion_count = 0; + for (int i = MAX_NUM_FC; i > 1; --i) { + fusion_count += + BuildFusion(graph, name_scope_ + "/" + std::to_string(i), i); + } + AddStatis(fusion_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(repeated_fc_relu_fuse_pass, + paddle::framework::ir::RepeatedFCReluFusePass); diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h new file mode 100644 index 00000000..ae777bcc --- /dev/null +++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +/** + * Fuse Repeated FC Relu + */ +class RepeatedFCReluFusePass : public FusePassBase { + public: + virtual ~RepeatedFCReluFusePass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; + + const std::string name_scope_{"repeated_fc_relu_fuse"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/runtime_context_cache_pass.cc b/paddle/fluid/framework/ir/runtime_context_cache_pass.cc new file mode 100644 index 00000000..566b654f --- /dev/null +++ b/paddle/fluid/framework/ir/runtime_context_cache_pass.cc @@ -0,0 +1,37 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/runtime_context_cache_pass.h" +#include +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +namespace ir { + +void RuntimeContextCachePass::ApplyImpl(ir::Graph* graph) const { + VLOG(3) << "Applies Runtime Context Cache strategy."; + for (const Node* n : graph->Nodes()) { + if (n->IsOp() && n->Op()) { + n->Op()->SetAttr(kEnableCacheRuntimeContext, true); + } + } +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(runtime_context_cache_pass, + paddle::framework::ir::RuntimeContextCachePass); diff --git a/paddle/fluid/framework/ir/runtime_context_cache_pass.h b/paddle/fluid/framework/ir/runtime_context_cache_pass.h new file mode 100644 index 00000000..e4783166 --- /dev/null +++ b/paddle/fluid/framework/ir/runtime_context_cache_pass.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class RuntimeContextCachePass : public Pass { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc new file mode 100644 index 00000000..b230c501 --- /dev/null +++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc @@ -0,0 +1,255 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h" +#include +#include +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/graph_viz_pass.h" +#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace framework { +namespace ir { + +struct FuseExpr {}; + +// sequence expand, concat fuse pattern, return concat's output +PDNode* BuildSeqExpandConcatPattern(PDPattern* pattern) { + // The following operators will be fused: + // concat + // sequence_expand + // sequence_expand + + // The following variables will be treat as inputs: + // concat mid input, 0th input for fused op + // sequence_expand input, 1th input for fused op + // sequence_expand input, 2th input for fused op + + // The following variables will be treat as outputs: + // concat output + + // So the following variables will be removed: + // sequence-expand output + // sequence-expand output + + // Three operators + auto* sequence_expand0 = pattern->NewNode( + [](Node* x) { + return x && x->IsOp() && x->Op()->Type() == "sequence_expand"; + }, + "sequence_expand0"); + + auto* sequence_expand1 = pattern->NewNode( + [](Node* x) { + return x && x->IsOp() && x->Op()->Type() == "sequence_expand"; + }, + "sequence_expand1"); + + auto* concat = pattern->NewNode( + [](Node* x) { + return x && x->IsOp() && x->Op()->Type() == "concat" && // basic check + x->Op()->Input("X").size() == 3; // Special case + }, + "concat"); + + auto* sequence_expand0_in = pattern->NewNode( + [](Node* x) { + return x && x->IsVar() && VarLinksToOp(x, "sequence_expand"); + }, + "sequence_expand0_in"); + auto* sequence_expand1_in = pattern->NewNode( + [](Node* x) { + return x && x->IsVar() && VarLinksToOp(x, "sequence_expand"); + }, + "sequence_expand1_in"); + + // The variables + auto* sequence_expand0_out = pattern->NewNode( + [](Node* x) { + return x && x->IsVar() && + VarLinksFromOp(x, "sequence_expand") && // basic check + VarLinksToOp(x, "concat") && // is concat's input + IsNthInput(x, x->outputs[0], "X", 1); // X[0] + }, + "sequence_expand0_out"); + + auto* sequence_expand1_out = pattern->NewNode( + [](Node* x) { + return x && x->IsVar() && + VarLinksFromOp(x, "sequence_expand") && // basic check + VarLinksToOp(x, "concat") && // is concat's input + IsNthInput(x, x->outputs[0], "X", 2); // x[2] + }, + "sequence_expand1_out"); + + auto* concat_in0 = pattern->NewNode( + [](Node* x) { return x && x->IsVar() && VarLinksToOp(x, "concat"); }, + "concat_in0"); + + auto* concat_out = pattern->NewNode( + [](Node* x) { return x && x->IsVar() && VarLinksFromOp(x, "concat"); }, + "concat_out"); + + // Links + sequence_expand0->LinksFrom({sequence_expand0_in}) + .LinksTo({sequence_expand0_out}); + sequence_expand1->LinksFrom({sequence_expand1_in}) + .LinksTo({sequence_expand1_out}); + concat->LinksFrom({sequence_expand0_out, sequence_expand1_out, concat_in0}) + .LinksTo({concat_out}); + return concat_out; +} + +PDNode* BuildFCPattern(PDPattern* pattern, PDNode* fc_x) { + PDNode* fc_w = pattern->NewNode( + [](Node* x) { + return x && x->IsVar() && // basic + VarLinksToOp(x, "mul") && // link + x->Var()->Proto()->persistable(); // is a parameter + }, + "fc_w"); + + PDNode* mul_out = pattern->NewNode( + [](Node* x) { + return x && x->IsVar() && // basic + VarLinksFromOp(x, "mul") && // link + VarLinksToOp(x, "elementwise_add") && // + !x->Var()->Proto()->persistable(); // is a parameter + }, + "mul_out"); + + PDNode* fc_mul = pattern->NewNode( + [](Node* x) { + return x && x->IsOp() && x->Op()->Type() == "mul"; // basic + }, + "fc_mul"); + + PDNode* fc_bias = pattern->NewNode( + [](Node* x) { + return x && x->IsVar() && // basic + VarLinksToOp(x, "elementwise_add") && // link + x->Var()->Proto()->persistable(); // is a parameter + }, + "fc_bias"); + + PDNode* elementwise_add = pattern->NewNode( + [](Node* x) { + return x && x->IsOp() && x->Op()->Type() == "elementwise_add"; + }, + "elementwise_add"); + + PDNode* add_out = pattern->NewNode( + [](Node* x) { + return x && x->IsVar() && // basic + VarLinksFromOp(x, "elementwise_add") && // link + !x->Var()->Proto()->persistable(); // is a parameter + }, + "add_out"); + + std::set acts({"sigmoid", "tanh", "relu", "identity"}); + PDNode* act = pattern->NewNode( + [=](Node* x) { return x && x->IsOp() && acts.count(x->Op()->Type()); }, + "act"); + + PDNode* fc_out = pattern->NewNode( + [](Node* x) { + return x && x->IsVar() && // basic + !x->Var()->Proto()->persistable(); // is a parameter + }, + "fc_out"); + + fc_mul->LinksFrom({fc_w, fc_x}).LinksTo({mul_out}); + elementwise_add->LinksFrom({mul_out, fc_bias}).LinksTo({add_out}); + act->LinksFrom({add_out}).LinksTo({fc_out}); + return fc_out; +} + +void SeqConcatFcFusePass::ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init("seq_concat_fc_fuse", graph); + GraphPatternDetector detector; + auto* pattern = detector.mutable_pattern(); + auto* concat_out = BuildSeqExpandConcatPattern(pattern); + BuildFCPattern(pattern, concat_out); + +#define GET_NODE(id, pattern) \ + PADDLE_ENFORCE(subgraph.count(pattern.RetrieveNode(#id)), \ + "pattern has no Node called %s", #id); \ + auto* id = subgraph.at(pattern.RetrieveNode(#id)); \ + PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id); + + int fuse_count{0}; + + detector(graph, [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + VLOG(4) << "get one concat pattern"; + // fc + GET_NODE(fc_w, detector.pattern()); + GET_NODE(fc_bias, detector.pattern()); + GET_NODE(act, detector.pattern()); + GET_NODE(fc_out, detector.pattern()); + + // concat + GET_NODE(concat_in0, detector.pattern()); + GET_NODE(sequence_expand0_in, detector.pattern()); + GET_NODE(sequence_expand1_in, detector.pattern()); + + OpDesc op_desc; + op_desc.SetType("fusion_seqexpand_concat_fc"); + op_desc.SetInput("X", {concat_in0->Name(), sequence_expand0_in->Name(), + sequence_expand1_in->Name()}); + op_desc.SetInput("FCWeight", {fc_w->Name()}); + op_desc.SetInput("FCBias", {fc_bias->Name()}); + const std::string fc_out_tmp = fc_out->Name() + ".tmp"; + param_scope()->Var(fc_out_tmp)->GetMutable(); + op_desc.SetOutput("FCOut", {fc_out_tmp}); + op_desc.SetOutput("Out", {fc_out->Name()}); + op_desc.SetAttr("fc_activation", act->Op()->Type()); + + auto* op_node = graph->CreateOpNode(&op_desc); + // Add links + IR_NODE_LINK_TO(fc_w, op_node); + IR_NODE_LINK_TO(fc_bias, op_node); + IR_NODE_LINK_TO(concat_in0, op_node); + IR_NODE_LINK_TO(sequence_expand0_in, op_node); + IR_NODE_LINK_TO(sequence_expand1_in, op_node); + IR_NODE_LINK_TO(op_node, fc_out); + + // Clean nodes. + std::unordered_set marked_nodes; + for (auto& item : subgraph) { + marked_nodes.insert(item.second); + } + marked_nodes.erase(fc_w); + marked_nodes.erase(fc_bias); + marked_nodes.erase(concat_in0); + marked_nodes.erase(sequence_expand0_in); + marked_nodes.erase(sequence_expand1_in); + marked_nodes.erase(fc_out); + GraphSafeRemoveNodes(graph, marked_nodes); + + ++fuse_count; + }); + + AddStatis(fuse_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(seq_concat_fc_fuse_pass, + paddle::framework::ir::SeqConcatFcFusePass); diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h new file mode 100644 index 00000000..d68840a5 --- /dev/null +++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h @@ -0,0 +1,35 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class SeqConcatFcFusePass : public FusePassBase { + public: + virtual ~SeqConcatFcFusePass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc new file mode 100644 index 00000000..556d28a4 --- /dev/null +++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h" +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace framework { +namespace ir { + +int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope) { + GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + + PDNode* x = pattern->NewNode(patterns::PDNodeName(name_scope, "X")) + ->assert_is_op_input("sequence_conv") + ->assert_var_not_persistable(); + patterns::SeqConvEltAddRelu fuse_pattern(pattern, name_scope); + fuse_pattern(x); + + // Create New OpDesc + auto fuse_creator = [&](Node* seqconv, Node* input, Node* seqconv_weight, + Node* eltadd_bias, Node* relu_out) { + OpDesc op_desc; + op_desc.SetType("fusion_seqconv_eltadd_relu"); + op_desc.SetInput("X", {input->Name()}); + op_desc.SetInput("Filter", {seqconv_weight->Name()}); + op_desc.SetInput("Bias", {eltadd_bias->Name()}); + op_desc.SetAttr("contextLength", seqconv->Op()->GetAttr("contextLength")); + op_desc.SetAttr("contextStart", seqconv->Op()->GetAttr("contextStart")); + op_desc.SetAttr("contextStride", seqconv->Op()->GetAttr("contextStride")); + PADDLE_ENFORCE(graph->Has(kParamScopeAttr)); + auto& scope = graph->Get(kParamScopeAttr); + const std::string ColMat = patterns::UniqueKey("SeqConvColMat"); + op_desc.SetOutput("ColMat", {ColMat}); + op_desc.SetOutput("Out", {relu_out->Name()}); + scope.Var(ColMat)->GetMutable(); + + auto* op = graph->CreateOpNode(&op_desc); + IR_NODE_LINK_TO(input, op); + IR_NODE_LINK_TO(seqconv_weight, op); + IR_NODE_LINK_TO(eltadd_bias, op); + IR_NODE_LINK_TO(op, relu_out); + return op; + }; + + int fusion_count{0}; + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle SeqConv EltAdd Relu fuse"; + GET_IR_NODE_FROM_SUBGRAPH(seqconv, seqconv, fuse_pattern); + GET_IR_NODE_FROM_SUBGRAPH(seqconv_weight, seqconv_weight, fuse_pattern); + GET_IR_NODE_FROM_SUBGRAPH(seqconv_out, seqconv_out, fuse_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd, eltadd, fuse_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd_bias, eltadd_bias, fuse_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd_out, eltadd_out, fuse_pattern); + GET_IR_NODE_FROM_SUBGRAPH(relu, relu, fuse_pattern); + GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, fuse_pattern); + + fuse_creator(seqconv, subgraph.at(x), seqconv_weight, eltadd_bias, + relu_out); + std::unordered_set marked_nodes( + {seqconv, seqconv_out, eltadd, eltadd_out, relu}); + GraphSafeRemoveNodes(graph, marked_nodes); + ++fusion_count; + }; + + gpd(graph, handler); + + return fusion_count; +} + +void SeqConvEltAddReluFusePass::ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init(name_scope_, graph); + + int fusion_count = BuildFusion(graph, name_scope_, param_scope()); + AddStatis(fusion_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(seqconv_eltadd_relu_fuse_pass, + paddle::framework::ir::SeqConvEltAddReluFusePass); diff --git a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h new file mode 100644 index 00000000..fde9b586 --- /dev/null +++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h @@ -0,0 +1,38 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +class SeqConvEltAddReluFusePass : public FusePassBase { + public: + virtual ~SeqConvEltAddReluFusePass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; + + const std::string name_scope_{"seqconv_eltadd_relu_fuse"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc new file mode 100644 index 00000000..4ac379eb --- /dev/null +++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc @@ -0,0 +1,213 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h" +#include +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" + +#define MAX_CONCAT_INPUTS 200 + +namespace paddle { +namespace framework { +namespace ir { + +PDNode* BuildSeqPoolConcatPattern(PDPattern* pattern, + const std::string& name_scope, + int num_inputs) { + auto is_concat_op_with_inputs = [](Node* x, int num) -> bool { + return x && x->IsOp() && x->Op()->Type() == "concat" && + x->Op()->Input("X").size() == static_cast(num); + }; + + auto is_nth_input_var_of_concat = [=](Node* x, int idx) -> bool { + return x && x->IsVar() && VarLinksToOp(x, "concat") && + x->outputs.size() == 1 && IsNthInput(x, x->outputs[0], "X", idx) && + is_concat_op_with_inputs(x->outputs[0], num_inputs); + }; + + auto is_seqpool_op_with_pootype_of_nth_input_of_concat = [=]( + Node* x, const std::string& type, int idx) -> bool { + bool this_is_seqpool_op = + x && x->IsOp() && x->Op()->Type() == "sequence_pool" && + x->Op()->HasAttr("pooltype") && + boost::get(x->Op()->GetAttr("pooltype")) == type && + x->outputs.size() == 2; // seqpool should only have 2 outputs + bool satisfied_all = this_is_seqpool_op; + if (this_is_seqpool_op) { + // Only one output of seqpool_op is nth_input_var of concat, + // the other one should be unused empty var. + if (is_nth_input_var_of_concat(x->outputs[0], idx)) { + satisfied_all = satisfied_all && x->outputs[1]->IsVar() && + x->outputs[1]->outputs.empty(); + } else { + satisfied_all = + satisfied_all && is_nth_input_var_of_concat(x->outputs[1], idx) && + x->outputs[0]->IsVar() && x->outputs[0]->outputs.size() == 0; + } + } + return satisfied_all; + }; + + auto* concat_op = pattern->NewNode( + [=](Node* x) { return is_concat_op_with_inputs(x, num_inputs); }, + name_scope + "/concat_op"); + concat_op->assert_op_attr("axis", 1); + + auto* concat_out_var = pattern->NewNode( + [=](Node* x) { + return x && x->IsVar() && VarLinksFromOp(x, "concat") && + x->inputs.size() == 1 && + is_concat_op_with_inputs(x->inputs[0], num_inputs); + }, + name_scope + "/concat_out_var"); + concat_out_var->assert_is_only_output_of_op("concat"); + + std::vector seqpool_ops_input_var(num_inputs); + std::vector seqpool_ops_output_var(num_inputs); + std::vector seqpool_ops_output_unused_var(num_inputs); + std::vector seqpool_ops(num_inputs); + + for (int i = 0; i < num_inputs; ++i) { + seqpool_ops_output_var[i] = pattern->NewNode( + [=](Node* x) { + return x && x->IsVar() && is_nth_input_var_of_concat(x, i) && + x->inputs.size() == 1 && + is_seqpool_op_with_pootype_of_nth_input_of_concat(x->inputs[0], + "SUM", i); + }, + name_scope + "/sequence_pool_out_" + std::to_string(i)); + + seqpool_ops_output_unused_var[i] = pattern->NewNode( + [=](Node* x) { + return x && x->IsVar() && x->inputs.size() == 1 && + x->outputs.size() == 0 && + is_seqpool_op_with_pootype_of_nth_input_of_concat(x->inputs[0], + "SUM", i); + }, + name_scope + "/sequence_pool_unused_out_" + std::to_string(i)); + + seqpool_ops[i] = pattern->NewNode( + [=](Node* x) { + return x && x->IsOp() && + is_seqpool_op_with_pootype_of_nth_input_of_concat(x, "SUM", i); + }, + name_scope + "/sequence_pool_op_" + std::to_string(i)); + + seqpool_ops_input_var[i] = pattern->NewNode( + [=](Node* x) { + bool basic = x && x->IsVar() && x->outputs.size() >= 1; + bool next_is_fine = false; + for (auto* o : x->outputs) { + if (is_seqpool_op_with_pootype_of_nth_input_of_concat(o, "SUM", + i)) { + next_is_fine = true; + break; + } + } + return basic && next_is_fine; + }, + name_scope + "/sequence_pool_in_" + std::to_string(i)); + + // Links + seqpool_ops[i] + ->LinksFrom({seqpool_ops_input_var[i]}) + .LinksTo({seqpool_ops_output_var[i], seqpool_ops_output_unused_var[i]}); + } + concat_op->LinksFrom(seqpool_ops_output_var).LinksTo({concat_out_var}); + return concat_out_var; +} + +static int BuildFusion(Graph* graph, const std::string& name_scope, + int num_inputs) { + GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + BuildSeqPoolConcatPattern(pattern, name_scope, num_inputs); + + auto retrieve_node = [](const std::string& name, + const GraphPatternDetector::subgraph_t& subgraph, + const PDPattern& pat) -> Node* { + PADDLE_ENFORCE(subgraph.count(pat.RetrieveNode(name)), + "pattern has no Node called %s", name.c_str()); + Node* p = subgraph.at(pat.RetrieveNode(name)); + PADDLE_ENFORCE_NOT_NULL(p, "subgraph has no node %s", name.c_str()); + return p; + }; + + int fusion_count{0}; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle SeqPool Concat fuse"; + std::vector input_names(num_inputs); + std::vector input_vars(num_inputs); + auto& fused_pattern = gpd.pattern(); + for (int i = 0; i < num_inputs; ++i) { + input_vars[i] = + retrieve_node(name_scope + "/sequence_pool_in_" + std::to_string(i), + subgraph, fused_pattern); + input_names[i] = input_vars[i]->Name(); + } + auto* concat_op = + retrieve_node(name_scope + "/concat_op", subgraph, fused_pattern); + auto* concat_out_var = + retrieve_node(name_scope + "/concat_out_var", subgraph, fused_pattern); + auto* seqpool_op0 = retrieve_node(name_scope + "/sequence_pool_op_0", + subgraph, fused_pattern); + + // Create New OpDesc + OpDesc op_desc; + op_desc.SetType("fusion_seqpool_concat"); + op_desc.SetInput("X", input_names); + op_desc.SetAttr("pooltype", seqpool_op0->Op()->GetAttr("pooltype")); + op_desc.SetAttr("axis", concat_op->Op()->GetAttr("axis")); + op_desc.SetOutput("Out", {concat_out_var->Name()}); + auto* op = graph->CreateOpNode(&op_desc); + for (size_t i = 0; i < input_vars.size(); ++i) { + IR_NODE_LINK_TO(input_vars[i], op); + } + IR_NODE_LINK_TO(op, concat_out_var); + + std::unordered_set marked_nodes; + for (auto& item : subgraph) { + marked_nodes.insert(item.second); + } + for (size_t i = 0; i < input_vars.size(); ++i) { + marked_nodes.erase(input_vars[i]); + } + marked_nodes.erase(concat_out_var); + GraphSafeRemoveNodes(graph, marked_nodes); + ++fusion_count; + }; + + gpd(graph, handler); + return fusion_count; +} + +void SeqPoolConcatFusePass::ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init(name_scope_, graph); + int fusion_count = 0; + for (int i = MAX_CONCAT_INPUTS; i > 0; --i) { + fusion_count += + BuildFusion(graph, name_scope_ + "/" + std::to_string(i), i); + } + AddStatis(fusion_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(seqpool_concat_fuse_pass, + paddle::framework::ir::SeqPoolConcatFusePass); diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h new file mode 100644 index 00000000..40a9edc5 --- /dev/null +++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h @@ -0,0 +1,52 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +/** + * Fuse SequencePool(with sum pooltype yet) and Concat; + * + * Before fuse: + * | | | + * seq_pool, seq_pool, ... seq_pool + * \ | ... / + * concat + * | + * After fuse: + * \ | / + * FusionSeqPoolConcat + * | + */ +class SeqPoolConcatFusePass : public FusePassBase { + public: + virtual ~SeqPoolConcatFusePass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; + + const std::string name_scope_{"seqpool_concat_fuse"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc new file mode 100644 index 00000000..d3668038 --- /dev/null +++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc @@ -0,0 +1,198 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h" +#include +#include "paddle/fluid/framework/op_proto_maker.h" + +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, + const std::vector& inputs, + const std::vector& outputs) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + if (type == "sequence_pool") { + op->SetInput("X", {inputs[0]}); + std::string pooltype = "SUM"; + op->SetAttr("pooltype", pooltype); + op->SetOutput("MaxIndex", {outputs[0]}); + op->SetOutput("Out", {outputs[1]}); + } else if (type == "concat") { + op->SetInput("X", inputs); + op->SetAttr("axis", 1); + op->SetOutput("Out", {outputs[0]}); + } else { + op->SetInput("X", inputs); + op->SetOutput("Out", outputs); + } + op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), + static_cast(OpRole::kForward)); +} + +int CountOpType(const ir::Graph* graph, + const std::string& op_type = "fusion_seqpool_concat") { + int count = 0; + for (auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op()->Type() == op_type) { + ++count; + } + } + return count; +} + +std::unique_ptr GetNumNodesOfBeforeAfter( + std::unique_ptr graph, int* before, int* after, + const std::string& pass_type = "seqpool_concat_fuse_pass") { + auto pass = PassRegistry::Instance().Get(pass_type); + *before = graph->Nodes().size(); + graph.reset(pass->Apply(graph.release())); + *after = graph->Nodes().size(); + return graph; +} + +/* + * Before fuse: + * a b c + * | | | + * op1 op2 op3 + * / \ / \ / \ + * d e f g h i + * \ | / + * concat + * | + * j + * Type of op1, op2 and op3 are sequence_pool, with "SUM" pooltype attr + * + * After fuse: + * a b c + * \ | / + * fusion_seqpool_concat + * | + * j + */ +TEST(SeqPoolConcatFusePass, basic) { + ProgramDesc prog; + for (auto& v : std::vector( + {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j"})) { + auto* var = prog.MutableBlock(0)->Var(v); + var->SetType(proto::VarType::LOD_TENSOR); + } + + SetOp(&prog, "sequence_pool", std::vector({"a"}), + std::vector({"d", "e"})); + SetOp(&prog, "sequence_pool", std::vector({"b"}), + std::vector({"f", "g"})); + SetOp(&prog, "sequence_pool", std::vector({"c"}), + std::vector({"h", "i"})); + SetOp(&prog, "concat", std::vector({"e", "g", "i"}), + std::vector({"j"})); + + std::unique_ptr graph(new ir::Graph(prog)); + int before, after; + graph = GetNumNodesOfBeforeAfter(std::move(graph), &before, &after); + // Remove 10 Nodes: op1, op2, op3, d, e, f, g, h, i, concat_op + // Add 1 Node: fusion_seqpool_concat + EXPECT_EQ(after, before - 9); + EXPECT_EQ(CountOpType(graph.get()), 1); +} + +/* + * Before fuse: + * a b + * | / \ + * op1 op2 op3 + * / \ / \ \ + * c d e f g + * \ / + * concat + * | + * h + * Type of op1 and op2 are sequence_pool, with "SUM" pooltype attr + * + * After fuse: + * a b + * \ / \ + * fusion_seqpool_concat op3 + * | | + * h g + */ +TEST(SeqPoolConcatFusePass, advanced) { + ProgramDesc prog; + for (auto& v : + std::vector({"a", "b", "c", "d", "e", "f", "g", "h"})) { + auto* var = prog.MutableBlock(0)->Var(v); + var->SetType(proto::VarType::LOD_TENSOR); + } + + SetOp(&prog, "sequence_pool", std::vector({"a"}), + std::vector({"c", "d"})); + SetOp(&prog, "sequence_pool", std::vector({"b"}), + std::vector({"e", "f"})); + SetOp(&prog, "op3", std::vector({"b"}), + std::vector({"g"})); + SetOp(&prog, "concat", std::vector({"d", "f"}), + std::vector({"h"})); + + std::unique_ptr graph(new ir::Graph(prog)); + int before, after; + graph = GetNumNodesOfBeforeAfter(std::move(graph), &before, &after); + // Remove 7 Nodes: op1, op2, c, d, e, f concat_op + // Add 1 Node: fusion_seqpool_concat + EXPECT_EQ(after, before - 6); + EXPECT_EQ(CountOpType(graph.get()), 1); +} + +ProgramDesc BuildProgramDesc(int num_inputs_of_concat) { + ProgramDesc prog; + auto new_var = [&](const std::string& name) { + auto* var = prog.MutableBlock(0)->Var(name); + var->SetType(proto::VarType::LOD_TENSOR); + }; + std::vector concat_inputs; + for (int i = 0; i < num_inputs_of_concat; ++i) { + std::string prefix = "seqpool_op_" + std::to_string(i); + new_var(prefix + "in"); + new_var(prefix + "out"); + new_var(prefix + "out_unused"); + SetOp(&prog, "sequence_pool", std::vector({prefix + "in"}), + std::vector({prefix + "out", prefix + "out_unused"})); + concat_inputs.push_back(prefix + "out"); + } + SetOp(&prog, "concat", concat_inputs, + std::vector({"concat_out"})); + return prog; +} + +// test more inputs of concat +TEST(SeqPoolConcatFusePass, more_inputs) { + for (int num : {1, 2, 10}) { + ProgramDesc prog = BuildProgramDesc(num); + std::unique_ptr graph(new ir::Graph(prog)); + int before, after; + graph = GetNumNodesOfBeforeAfter(std::move(graph), &before, &after); + // Remove Nodes: n * (seqpool_op, out, out_unused), and concat_op + // Add Node: fusion_seqpool_concat op + EXPECT_EQ(after, before - num * 3); + EXPECT_EQ(CountOpType(graph.get()), 1); + } +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(seqpool_concat_fuse_pass); diff --git a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc new file mode 100644 index 00000000..e5578363 --- /dev/null +++ b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/fluid/framework/ir/graph_viz_pass.h" +#include "paddle/fluid/framework/ir/shuffle_channel_detect_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); +#define GET_NODES \ + GET_IR_NODE(reshape1_op); \ + GET_IR_NODE(reshape1_out); \ + GET_IR_NODE(transpose_op); \ + GET_IR_NODE(transpose_out); \ + GET_IR_NODE(reshape2_op); \ + GET_IR_NODE(reshape2_out); + +void ShuffleChannelDetectPass::ApplyImpl(ir::Graph* graph) const { + const std::string pattern_name = "shufflechannel_pattern"; + FusePassBase::Init(pattern_name, graph); + + GraphPatternDetector gpd; + auto* x = gpd.mutable_pattern() + ->NewNode("x") + ->assert_is_op_input("reshape2", "X") + ->AsInput(); + + patterns::ShuffleChannelPattern pattern(gpd.mutable_pattern(), pattern_name); + pattern(x); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_NODES; + + PADDLE_ENFORCE(subgraph.count(x)); + auto* input_node = subgraph.at(x); + auto reshape1_desc = reshape1_op->Op(); + auto reshape2_desc = reshape2_op->Op(); + std::string input_name = input_node->Name(); + std::string output_name = reshape2_out->Name(); + + auto reshape1_shape = + boost::get>(reshape1_desc->GetAttr("shape")); + auto reshape2_shape = + boost::get>(reshape2_desc->GetAttr("shape")); + + int i_c = reshape1_shape[2]; + int o_c = reshape2_shape[1]; + int group = o_c / i_c; + + framework::OpDesc new_op_desc; + new_op_desc.SetType("shuffle_channel"); + new_op_desc.SetInput("X", {input_name}); + new_op_desc.SetOutput("Out", {output_name}); + + new_op_desc.SetAttr("group", group); + new_op_desc.Flush(); + + // Create a new node for the fused op. + auto* new_op = graph->CreateOpNode(&new_op_desc); + + IR_NODE_LINK_TO(input_node, new_op); + IR_NODE_LINK_TO(new_op, reshape2_out); + + // Delete the unneeded nodes. + GraphSafeRemoveNodes(graph, {reshape1_op, reshape1_out, transpose_op, + transpose_out, reshape2_op}); + }; + + gpd(graph, handler); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(shuffle_channel_detect_pass, + paddle::framework::ir::ShuffleChannelDetectPass); diff --git a/paddle/fluid/framework/ir/shuffle_channel_detect_pass.h b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.h new file mode 100644 index 00000000..008f8013 --- /dev/null +++ b/paddle/fluid/framework/ir/shuffle_channel_detect_pass.h @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +class ShuffleChannelDetectPass : public FusePassBase { + public: + virtual ~ShuffleChannelDetectPass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/simplify_anakin_priorbox_detection_out_pass.cc b/paddle/fluid/framework/ir/simplify_anakin_priorbox_detection_out_pass.cc new file mode 100644 index 00000000..b3606e4d --- /dev/null +++ b/paddle/fluid/framework/ir/simplify_anakin_priorbox_detection_out_pass.cc @@ -0,0 +1,233 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "paddle/fluid/framework/ir/graph_viz_pass.h" +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/ir/simplify_anakin_priorbox_detection_out_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +void RunSimplifyAnakinDetection(ir::Graph *graph, int times, bool is_density, + bool is_reshape) { + const std::string pattern_name = + "simplify_anakin_detection_pattern_pass" + std::to_string(times); + std::string priorbox_type = is_density ? "density_prior_box" : "prior_box"; + + GraphPatternDetector gpd; + std::vector input_nodes; + for (int i = 0; i < times; i++) { + input_nodes.push_back(gpd.mutable_pattern() + ->NewNode("x" + std::to_string(i)) + ->assert_is_op_input(priorbox_type, "Input") + ->AsInput()); + } + input_nodes.push_back(gpd.mutable_pattern() + ->NewNode("x" + std::to_string(times)) + ->assert_is_op_input("box_coder", "TargetBox") + ->AsInput()); + + input_nodes.push_back(gpd.mutable_pattern() + ->NewNode("x" + std::to_string(times + 1)) + ->assert_is_op_input("transpose2") + ->AsInput()); + + patterns::AnakinDetectionPattern pattern(gpd.mutable_pattern(), pattern_name); + pattern(input_nodes, times, priorbox_type, is_reshape); + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + const int kNumFields = 7; + const int kPriorBoxLocOffset = 1; + const int kReshape1Offset = 2; + const int kReshape1OutOffset = 3; + const int kPriorBoxVarOffset = 4; + const int kReshape2Offset = 5; + const int kReshape2OutOffset = 6; + std::vector nodes; + + for (int i = 0; i < times; i++) { + PADDLE_ENFORCE( + subgraph.at(pattern.GetPDNode("prior_box" + std::to_string(i)))); + PADDLE_ENFORCE( + subgraph.at(pattern.GetPDNode("box_out" + std::to_string(i)))); + PADDLE_ENFORCE( + subgraph.at(pattern.GetPDNode("reshape1" + std::to_string(i)))); + PADDLE_ENFORCE( + subgraph.at(pattern.GetPDNode("reshape1_out" + std::to_string(i)))); + PADDLE_ENFORCE( + subgraph.at(pattern.GetPDNode("reshape2" + std::to_string(i)))); + PADDLE_ENFORCE( + subgraph.at(pattern.GetPDNode("reshape2_out" + std::to_string(i)))); + + PADDLE_ENFORCE( + subgraph.at(pattern.GetPDNode("box_var_out" + std::to_string(i)))); + + nodes.push_back( + subgraph.at(pattern.GetPDNode("prior_box" + std::to_string(i)))); + nodes.push_back( + subgraph.at(pattern.GetPDNode("box_out" + std::to_string(i)))); + nodes.push_back( + subgraph.at(pattern.GetPDNode("reshape1" + std::to_string(i)))); + nodes.push_back( + subgraph.at(pattern.GetPDNode("reshape1_out" + std::to_string(i)))); + nodes.push_back( + subgraph.at(pattern.GetPDNode("box_var_out" + std::to_string(i)))); + nodes.push_back( + subgraph.at(pattern.GetPDNode("reshape2" + std::to_string(i)))); + nodes.push_back( + subgraph.at(pattern.GetPDNode("reshape2_out" + std::to_string(i)))); + } + + Node *concat_op1 = subgraph.at(pattern.GetPDNode("concat1")); + Node *concat_out1 = subgraph.at(pattern.GetPDNode("concat1_out")); + + Node *concat_op2 = subgraph.at(pattern.GetPDNode("concat2")); + Node *concat_out2 = subgraph.at(pattern.GetPDNode("concat2_out")); + + Node *box_coder_third_input = subgraph.at(input_nodes[times]); + Node *box_coder_op = subgraph.at(pattern.GetPDNode("box_coder")); + Node *box_coder_out = subgraph.at(pattern.GetPDNode("box_coder_out")); + + Node *multiclass_nms_second_input = subgraph.at(input_nodes[times + 1]); + Node *transpose_before_nms = + subgraph.at(pattern.GetPDNode("transpose_before_nms")); + Node *transpose_before_nms_out = + subgraph.at(pattern.GetPDNode("transpose_before_nms_out")); + + Node *multiclass_nms = subgraph.at(pattern.GetPDNode("multiclass_nms")); + Node *multiclass_nms_out = + subgraph.at(pattern.GetPDNode("multiclass_nms_out")); + + std::string code_type = + boost::get(box_coder_op->Op()->GetAttr("code_type")); + bool box_normalized = + boost::get(box_coder_op->Op()->GetAttr("box_normalized")); + + int background_label = + boost::get(multiclass_nms->Op()->GetAttr("background_label")); + float score_threshold = + boost::get(multiclass_nms->Op()->GetAttr("score_threshold")); + int nms_top_k = boost::get(multiclass_nms->Op()->GetAttr("nms_top_k")); + float nms_threshold = + boost::get(multiclass_nms->Op()->GetAttr("nms_threshold")); + float nms_eta = boost::get(multiclass_nms->Op()->GetAttr("nms_eta")); + int keep_top_k = + boost::get(multiclass_nms->Op()->GetAttr("keep_top_k")); + + std::vector concat1_input_names; + for (int i = 0; i < times; i++) { + concat1_input_names.push_back( + nodes[i * kNumFields + kPriorBoxLocOffset]->Name()); + } + + framework::OpDesc concat1_desc; + concat1_desc.SetType("concat"); + concat1_desc.SetInput("X", concat1_input_names); + concat1_desc.SetAttr("axis", 2); + concat1_desc.SetOutput("Out", {concat_out1->Name()}); + + auto *new_add_concat_op = graph->CreateOpNode(&concat1_desc); + + for (int i = 0; i < times; i++) { + nodes[i * kNumFields + kPriorBoxLocOffset]->outputs.push_back( + new_add_concat_op); + new_add_concat_op->inputs.push_back( + nodes[i * kNumFields + kPriorBoxLocOffset]); + } + + framework::OpDesc new_op_desc; + new_op_desc.SetType("detection_out"); + new_op_desc.SetInput("PriorBox", {concat_out1->Name()}); + new_op_desc.SetInput("TargetBox", {box_coder_third_input->Name()}); + new_op_desc.SetInput("Scores", {multiclass_nms_second_input->Name()}); + new_op_desc.SetAttr("code_type", code_type); + new_op_desc.SetAttr("box_normalized", box_normalized); + new_op_desc.SetAttr("background_label", background_label); + new_op_desc.SetAttr("score_threshold", score_threshold); + new_op_desc.SetAttr("nms_top_k", nms_top_k); + new_op_desc.SetAttr("nms_threshold", nms_threshold); + new_op_desc.SetAttr("nms_eta", nms_eta); + new_op_desc.SetAttr("keep_top_k", keep_top_k); + new_op_desc.SetOutput("Out", {multiclass_nms_out->Name()}); + new_op_desc.Flush(); + + // Create a new node for the fused op. + auto *detection_out_op = graph->CreateOpNode(&new_op_desc); + + std::unordered_set delete_nodes; + + for (int i = 0; i < times; i++) { + nodes[i * kNumFields + kPriorBoxLocOffset]->outputs.push_back(concat_op1); + delete_nodes.insert(nodes[i * kNumFields + kReshape1Offset]); + delete_nodes.insert(nodes[i * kNumFields + kReshape1OutOffset]); + delete_nodes.insert(nodes[i * kNumFields + kPriorBoxVarOffset]); + delete_nodes.insert(nodes[i * kNumFields + kReshape2Offset]); + delete_nodes.insert(nodes[i * kNumFields + kReshape2OutOffset]); + } + + delete_nodes.insert(concat_op1); + delete_nodes.insert(concat_op2); + delete_nodes.insert(concat_out2); + delete_nodes.insert(box_coder_op); + delete_nodes.insert(box_coder_out); + delete_nodes.insert(transpose_before_nms); + delete_nodes.insert(transpose_before_nms_out); + delete_nodes.insert(multiclass_nms); + + new_add_concat_op->outputs.push_back(concat_out1); + concat_out1->inputs.push_back(new_add_concat_op); + + detection_out_op->inputs.push_back(concat_out1); + detection_out_op->inputs.push_back(box_coder_third_input); + detection_out_op->inputs.push_back(multiclass_nms_second_input); + detection_out_op->outputs.push_back(multiclass_nms_out); + + concat_out1->outputs.push_back(detection_out_op); + box_coder_third_input->outputs.push_back(detection_out_op); + multiclass_nms_second_input->outputs.push_back(detection_out_op); + multiclass_nms_out->inputs.push_back(detection_out_op); + + // Delete the unneeded nodes. + GraphSafeRemoveNodes(graph, delete_nodes); + }; + + gpd(graph, handler); +} + +void SimplifyAnakinDetectionPatternPass::ApplyImpl(ir::Graph *graph) const { + const int pattern_nums = 6; + const std::string pattern_name = "simplify_anakin_detection_pattern_pass"; + FusePassBase::Init(pattern_name, graph); + std::vector options = {true, false}; + for (const auto &is_density : options) { + for (const auto &is_reshape : options) { + for (int i = 1; i <= pattern_nums; i++) { + RunSimplifyAnakinDetection(graph, i, is_density, is_reshape); + } + } + } +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +typedef paddle::framework::ir::SimplifyAnakinDetectionPatternPass + priorbox_pattern; +REGISTER_PASS(simplify_anakin_priorbox_detection_out_pass, priorbox_pattern); diff --git a/paddle/fluid/framework/ir/simplify_anakin_priorbox_detection_out_pass.h b/paddle/fluid/framework/ir/simplify_anakin_priorbox_detection_out_pass.h new file mode 100644 index 00000000..e882b9dc --- /dev/null +++ b/paddle/fluid/framework/ir/simplify_anakin_priorbox_detection_out_pass.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +// There may be many transpose-flatten structures in a model, and the output of +// these structures will be used as inputs to the concat Op. This pattern will +// be detected by our pass. The times here represents the repeat times of this +// structure. +class SimplifyAnakinDetectionPatternPass : public FusePassBase { + public: + virtual ~SimplifyAnakinDetectionPatternPass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc new file mode 100644 index 00000000..42f4a91a --- /dev/null +++ b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc @@ -0,0 +1,377 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h" +#include +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace framework { +namespace ir { + +PDNode* BuildSquaredMatSubPattern(PDPattern* pattern, + const std::string& name_scope) { + auto var_is_op_input = [=](Node* x, const std::string& op_type, + const std::string& arg_name = "") -> bool { + if (!(x && x->IsVar())) { + return false; + } + for (auto* op : x->outputs) { + if (op && op->IsOp() && op->Op() && op->Op()->Type() == op_type) { + if (arg_name.empty()) { + return true; + } + for (auto& name : op->Op()->Input(arg_name)) { + if (name == x->Name()) { + return true; + } + } + } + } + return false; + }; + + auto var_is_op_only_output = [](Node* x, const std::string& op_type) -> bool { + return x && x->IsVar() && x->inputs.size() == 1 && x->inputs[0] && + x->inputs[0]->IsOp() && x->inputs[0]->Op()->Type() == op_type && + x->inputs[0]->outputs.size() == 1; + }; + + auto next_op = [=](Node* x, const std::string& op_type) -> Node* { + if (!(x && x->IsVar())) { + return nullptr; + } + for (auto* op : x->outputs) { + if (op && op->IsOp() && op->Op() && op->Op()->Type() == op_type) { + return op; + } + } + return nullptr; + }; + + auto get_op_input_var = [=](Node* x, const std::string& arg_name) -> Node* { + if (!(x && x->IsOp())) { + return nullptr; + } + for (auto* var : x->inputs) { + for (auto name : x->Op()->Input(arg_name)) { + if (var->Name() == name) { + return var; + } + } + } + return nullptr; + }; + + auto is_fusion_input_var = [=](Node* x, const std::string& arg_name) { + bool basic = var_is_op_input(x, "matmul", arg_name) && + var_is_op_input(x, "square", "X"); + if (!basic) { + return false; + } + auto* squared_x_op = next_op(x, "square"); + if (!(squared_x_op && squared_x_op->outputs.size() == 1)) { + return false; + } + auto* squared_x = squared_x_op->outputs[0]; + bool next_is_matmul_from_arg = + var_is_op_input(squared_x, "matmul", arg_name) && + squared_x->outputs.size() == 1 && + squared_x->outputs[0]->outputs.size() == 1; + if (!next_is_matmul_from_arg) { + return false; + } + auto* sub_y_in = squared_x->outputs[0]->outputs[0]; + return var_is_op_input(sub_y_in, "elementwise_sub", "Y") && + sub_y_in->outputs[0]->outputs.size() == 1 && + var_is_op_input(sub_y_in->outputs[0]->outputs[0], "elementwise_mul"); + }; + + auto is_fusion_first_mul_out = [=](Node* x) -> bool { + bool input_is_matmul_op = x && x->inputs.size() == 1 && + x->inputs[0]->IsOp() && + x->inputs[0]->Op()->Type() == "matmul"; + if (!input_is_matmul_op) { + return false; + } + auto* mat_x = get_op_input_var(x->inputs[0], "X"); + auto* mat_y = get_op_input_var(x->inputs[0], "Y"); + bool input_mul_is_valid = mat_x && is_fusion_input_var(mat_x, "X") && + mat_y && is_fusion_input_var(mat_y, "Y"); + if (!input_mul_is_valid) { + return false; + } + + bool next_is_square = var_is_op_input(x, "square", "X") && + x->outputs.size() == 1 && + x->outputs[0]->outputs.size() == 1; + if (!next_is_square) { + return false; + } + auto* sub_x_in = x->outputs[0]->outputs[0]; + return var_is_op_input(sub_x_in, "elementwise_sub", "X") && + sub_x_in->outputs[0]->outputs.size() == 1 && + var_is_op_input(sub_x_in->outputs[0]->outputs[0], "elementwise_mul"); + }; + + auto* x = pattern->NewNode( + [=](Node* x) { return is_fusion_input_var(x, "X"); }, name_scope + "/x"); + + auto* y = pattern->NewNode( + [=](Node* x) { return is_fusion_input_var(x, "Y"); }, name_scope + "/y"); + + auto* square_x_op = pattern->NewNode( + [=](Node* x) { + return x && x->IsOp() && x->Op()->Type() == "square" && + is_fusion_input_var(x->inputs[0], "X"); + }, + name_scope + "/squared_x_op"); + + auto* square_y_op = pattern->NewNode( + [=](Node* x) { + return x && x->IsOp() && x->Op()->Type() == "square" && + is_fusion_input_var(x->inputs[0], "Y"); + }, + name_scope + "/squared_y_op"); + + auto* squared_x = pattern->NewNode( + [=](Node* x) { + return x && x->inputs.size() == 1 && x->inputs[0]->inputs.size() == 1 && + is_fusion_input_var(x->inputs[0]->inputs[0], "X"); + }, + name_scope + "/squared_x"); + + auto* squared_y = pattern->NewNode( + [=](Node* x) { + return x && x->inputs.size() == 1 && x->inputs[0]->inputs.size() == 1 && + is_fusion_input_var(x->inputs[0]->inputs[0], "Y"); + }, + name_scope + "/squared_y"); + + auto* matmuled_xy = + pattern->NewNode([=](Node* x) { return is_fusion_first_mul_out(x); }, + name_scope + "/matmuled_xy"); + + auto* matmul_xy_op = pattern->NewNode( + [=](Node* x) { + return x && x->IsOp() && x->Op()->Type() == "matmul" && + is_fusion_first_mul_out(x->outputs[0]); + }, + name_scope + "/matmul_xy_op"); + + auto* square_matmuled_xy_op = pattern->NewNode( + [=](Node* x) { + return x && x->IsOp() && x->Op()->Type() == "square" && + is_fusion_first_mul_out(x->inputs[0]); + }, + name_scope + "/square_matmuled_xy_op"); + + auto* squared_xmuly = pattern->NewNode( + [=](Node* x) { + return x && x->IsVar() && x->inputs.size() == 1 && + x->inputs[0]->IsOp() && x->inputs[0]->Op()->Type() == "square" && + is_fusion_first_mul_out(x->inputs[0]->inputs[0]); + }, + name_scope + "/squared_xmuly"); + + auto is_fusion_mat_squared_x_y_op_out = [=](Node* x) -> bool { + bool basic = x && x->IsVar() && x->inputs.size() == 1 && + x->inputs[0]->IsOp() && x->inputs[0]->Op()->Type() == "matmul"; + if (!basic) { + return false; + } + auto* sqx = get_op_input_var(x->inputs[0], "X"); + auto* sqy = get_op_input_var(x->inputs[0], "Y"); + + return var_is_op_only_output(sqx, "square") && + var_is_op_only_output(sqy, "square") && sqx->inputs[0] && + sqx->inputs[0]->inputs.size() == 1 && + is_fusion_input_var(sqx->inputs[0]->inputs[0], "X") && + sqy->inputs[0] && sqy->inputs[0]->inputs.size() == 1 && + is_fusion_input_var(sqy->inputs[0]->inputs[0], "Y"); + }; + + auto* matmul_squared_x_y_op = pattern->NewNode( + [=](Node* x) { + return x && x->IsOp() && x->Op()->Type() == "matmul" && + is_fusion_mat_squared_x_y_op_out(x->outputs[0]); + }, + name_scope + "/matmul_squared_x_y_op"); + + auto* mat_squared_x_y_op_out = pattern->NewNode( + [=](Node* x) { return is_fusion_mat_squared_x_y_op_out(x); }, + name_scope + "/mat_squared_x_y_op_out"); + + auto is_fusion_sub_op = [=](Node* x) -> bool { + bool is_sub_op = x && x->IsOp() && x->Op()->Type() == "elementwise_sub"; + if (!is_sub_op) { + return false; + } + auto* matmul_sqx_sqy_var = get_op_input_var(x, "Y"); + return is_fusion_mat_squared_x_y_op_out(matmul_sqx_sqy_var); + }; + + auto* sub_op = pattern->NewNode([=](Node* x) { return is_fusion_sub_op(x); }, + name_scope + "/sub_op"); + + auto* sub_op_out = pattern->NewNode( + [=](Node* x) { + return x && x->IsVar() && x->inputs.size() == 1 && + is_fusion_sub_op(x->inputs[0]); + }, + name_scope + "/sub_op_out"); + + auto is_fusion_element_op = [=](Node* x) -> bool { + bool is_elemul_op = x && x->IsOp() && x->Op()->Type() == "elementwise_mul"; + if (!is_elemul_op) { + return false; + } + for (auto* in : x->inputs) { + if (in && in->inputs[0] && is_fusion_sub_op(in->inputs[0])) { + return true; + } + } + return false; + }; + + auto* elementmul_op = + pattern->NewNode([=](Node* x) { return is_fusion_element_op(x); }, + name_scope + "/elementmul_op"); + + auto* constant_op = pattern->NewNode( + [=](Node* x) { + return x && x->IsOp() && x->Op()->Type() == "fill_constant" && + x->outputs.size() == 1 && + is_fusion_element_op(x->outputs[0]->outputs[0]); + }, + name_scope + "/fill_constant_op"); + + auto* constant_op_out = pattern->NewNode( + [=](Node* x) { + return x && x->IsVar() && var_is_op_input(x, "elementwise_mul") && + x->inputs[0] && x->inputs[0]->IsOp() && + x->inputs[0]->Op()->Type() == "fill_constant" && x->outputs[0] && + is_fusion_element_op(x->outputs[0]); + }, + name_scope + "/constant_op_out"); + + auto* last_out_var = pattern->NewNode( + [=](Node* x) { + return var_is_op_only_output(x, "elementwise_mul") && + is_fusion_element_op(x->inputs[0]); + }, + name_scope + "/out"); + + square_x_op->LinksFrom({x}).LinksTo({squared_x}); + square_y_op->LinksFrom({y}).LinksTo({squared_y}); + matmul_xy_op->LinksFrom({x, y}).LinksTo({matmuled_xy}); + matmul_squared_x_y_op->LinksFrom({squared_x, squared_y}) + .LinksTo({mat_squared_x_y_op_out}); + square_matmuled_xy_op->LinksFrom({matmuled_xy}).LinksTo({squared_xmuly}); + sub_op->LinksFrom({squared_xmuly, mat_squared_x_y_op_out}) + .LinksTo({sub_op_out}); + constant_op->LinksFrom({}).LinksTo({constant_op_out}); + elementmul_op->LinksFrom({constant_op_out, sub_op_out}) + .LinksTo({last_out_var}); + + return last_out_var; +} + +static int BuildFusion(Graph* graph, const std::string& name_scope) { + GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + + BuildSquaredMatSubPattern(pattern, name_scope); + + auto retrieve_node = [](const std::string& name, + const GraphPatternDetector::subgraph_t& subgraph, + const PDPattern& pat) -> Node* { + PADDLE_ENFORCE(subgraph.count(pat.RetrieveNode(name)), + "pattern has no Node called %s", name.c_str()); + Node* p = subgraph.at(pat.RetrieveNode(name)); + PADDLE_ENFORCE_NOT_NULL(p, "subgraph has no node %s", name.c_str()); + return p; + }; + + int fusion_count{0}; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + LOG(INFO) << "handle sqaure mat sub fuse"; + auto& fused_pattern = gpd.pattern(); + + auto* matx = retrieve_node(name_scope + "/x", subgraph, fused_pattern); + auto* maty = retrieve_node(name_scope + "/y", subgraph, fused_pattern); + auto* squaredx = + retrieve_node(name_scope + "/squared_x", subgraph, fused_pattern); + auto* squaredy = + retrieve_node(name_scope + "/squared_y", subgraph, fused_pattern); + auto* squaredxy = + retrieve_node(name_scope + "/squared_xmuly", subgraph, fused_pattern); + auto* last_out_var = + retrieve_node(name_scope + "/out", subgraph, fused_pattern); + auto* fill_constant_op = retrieve_node(name_scope + "/fill_constant_op", + subgraph, fused_pattern); + + // Create New OpDesc + OpDesc op_desc; + op_desc.SetType("fusion_squared_mat_sub"); + op_desc.SetInput("X", {matx->Name()}); + op_desc.SetInput("Y", {maty->Name()}); + op_desc.SetOutput("SquaredX", {squaredx->Name()}); + op_desc.SetOutput("SquaredY", {squaredy->Name()}); + op_desc.SetOutput("SquaredXY", {squaredxy->Name()}); + op_desc.SetOutput("Out", {last_out_var->Name()}); + op_desc.SetAttr("scalar", fill_constant_op->Op()->GetAttr("value")); + + auto* op = graph->CreateOpNode(&op_desc); + IR_NODE_LINK_TO(matx, op); + IR_NODE_LINK_TO(maty, op); + IR_NODE_LINK_TO(op, squaredx); + IR_NODE_LINK_TO(op, squaredy); + IR_NODE_LINK_TO(op, squaredxy); + IR_NODE_LINK_TO(op, last_out_var); + + std::unordered_set marked_nodes; + for (auto& item : subgraph) { + marked_nodes.insert(item.second); + } + + marked_nodes.erase(matx); + marked_nodes.erase(maty); + marked_nodes.erase(squaredx); + marked_nodes.erase(squaredy); + marked_nodes.erase(squaredxy); + marked_nodes.erase(last_out_var); + GraphSafeRemoveNodes(graph, marked_nodes); + ++fusion_count; + }; + + gpd(graph, handler); + return fusion_count; +} + +void SquaredMatSubFusePass::ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init(name_scope_, graph); + int fusion_count = BuildFusion(graph, name_scope_); + AddStatis(fusion_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(squared_mat_sub_fuse_pass, + paddle::framework::ir::SquaredMatSubFusePass); diff --git a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h new file mode 100644 index 00000000..b6165a51 --- /dev/null +++ b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +/** + * Fuse ( (A.^2 * B.^2) - (A * B).^2 ) .* scalar + */ +class SquaredMatSubFusePass : public FusePassBase { + public: + virtual ~SquaredMatSubFusePass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; + + const std::string name_scope_{"squared_mat_sub_fuse"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/sync_batch_norm_pass.cc b/paddle/fluid/framework/ir/sync_batch_norm_pass.cc new file mode 100644 index 00000000..25207ffc --- /dev/null +++ b/paddle/fluid/framework/ir/sync_batch_norm_pass.cc @@ -0,0 +1,45 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class SyncBatchNormPass : public Pass { + protected: + void ApplyImpl(ir::Graph *graph) const override { + VLOG(3) << "Use synchronous batch norm"; + for (const Node *n : graph->Nodes()) { + if (n->IsOp()) { + auto *op = n->Op(); + if (op->Type() == "batch_norm") { + op->SetType("sync_batch_norm"); + } + if (op->Type() == "batch_norm_grad") { + op->SetType("sync_batch_norm_grad"); + } + } + } + } +}; +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(sync_batch_norm_pass, paddle::framework::ir::SyncBatchNormPass); diff --git a/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc b/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc new file mode 100644 index 00000000..90d21411 --- /dev/null +++ b/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/program_desc.h" +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, + const std::vector& inputs, + const std::vector& outputs) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + op->SetAttr("name", name); + op->SetInput("X", inputs); + op->SetOutput("Out", outputs); +} + +// (a, conv_w)->conv2d->b +// (b, bn_scale, bn_bias, mean, var)->batch_norm +// ->(c, mean, var, save_mean, save_inv_var) +ProgramDesc BuildProgramDesc() { + ProgramDesc prog; + for (auto& v : std::vector({"a", "conv_w", "b", "bn_scale", + "bn_bias", "mean", "var", "c", + "save_mean", "save_inv_var"})) { + auto* var = prog.MutableBlock(0)->Var(v); + if (v == "conv_w" || v == "bn_scale" || v == "bn_bias" || v == "mean" || + v == "var") { + var->SetPersistable(true); + } + } + + SetOp(&prog, "conv2d", "conv", std::vector({"a", "conv_w"}), + std::vector({"b"})); + SetOp(&prog, "batch_norm", "bn", + std::vector({"b", "bn_scale", "bn_bias", "mean", "var"}), + std::vector( + {"c", "mean", "var", "save_mean", "save_inv_var"})); + return prog; +} + +TEST(IsTestPass, basic) { + auto prog = BuildProgramDesc(); + + std::unique_ptr graph(new ir::Graph(prog)); + + auto pass = PassRegistry::Instance().Get("sync_batch_norm_pass"); + + graph.reset(pass->Apply(graph.release())); + + for (auto* node : graph->Nodes()) { + if (node->IsOp()) { + auto* op = node->Op(); + auto op_name = boost::get(op->GetAttr("name")); + if (op_name == "bn") { + ASSERT_EQ(op->Type(), "sync_batch_norm"); + } + } + } +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(sync_batch_norm_pass); diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc new file mode 100644 index 00000000..a984a494 --- /dev/null +++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc @@ -0,0 +1,137 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph_viz_pass.h" +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +void RunTransposeFlattenConcatFuse(ir::Graph *graph, int times) { + const std::string pattern_name = + "transpose_flatten" + std::to_string(times) + "_concat_fuse"; + + GraphPatternDetector gpd; + std::vector input_nodes; + for (int i = 0; i < times; i++) { + input_nodes.push_back(gpd.mutable_pattern() + ->NewNode("x" + std::to_string(i)) + ->assert_is_op_input("transpose2", "X") + ->AsInput()); + } + + patterns::TransposeFlattenConcat pattern(gpd.mutable_pattern(), pattern_name); + pattern(input_nodes, times); + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + const int kNumFields = 5; + const int kTransOffset = 1; + const int kTransOutOffset = 2; + const int kFlattenOffset = 3; + const int kFlattenOutOffset = 4; + std::vector nodes; + + for (int i = 0; i < times; i++) { + PADDLE_ENFORCE( + subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i)))); + PADDLE_ENFORCE( + subgraph.at(pattern.GetPDNode("transpose_out" + std::to_string(i)))); + PADDLE_ENFORCE( + subgraph.at(pattern.GetPDNode("flatten" + std::to_string(i)))); + PADDLE_ENFORCE( + subgraph.at(pattern.GetPDNode("flatten_out" + std::to_string(i)))); + PADDLE_ENFORCE(subgraph.at(input_nodes[i])); + + nodes.push_back(subgraph.at(input_nodes[i])); + nodes.push_back( + subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i)))); + nodes.push_back( + subgraph.at(pattern.GetPDNode("transpose_out" + std::to_string(i)))); + nodes.push_back( + subgraph.at(pattern.GetPDNode("flatten" + std::to_string(i)))); + nodes.push_back( + subgraph.at(pattern.GetPDNode("flatten_out" + std::to_string(i)))); + } + + Node *concat_op = subgraph.at(pattern.GetPDNode("concat")); + Node *concat_out = subgraph.at(pattern.GetPDNode("concat_out")); + std::vector input_names; + std::vector trans_axis = boost::get>( + nodes[kTransOffset]->Op()->GetAttr("axis")); + int flatten_axis = + boost::get(nodes[kFlattenOffset]->Op()->GetAttr("axis")); + int concat_axis = boost::get(concat_op->Op()->GetAttr("axis")); + std::string output_name = concat_out->Name(); + + for (int i = 0; i < times; i++) { + input_names.push_back(nodes[i * kNumFields]->Name()); + } + + framework::OpDesc new_op_desc; + new_op_desc.SetType("fusion_transpose_flatten_concat"); + new_op_desc.SetInput("X", input_names); + new_op_desc.SetAttr("trans_axis", trans_axis); + new_op_desc.SetAttr("flatten_axis", flatten_axis); + new_op_desc.SetAttr("concat_axis", concat_axis); + new_op_desc.SetOutput("Out", {output_name}); + new_op_desc.Flush(); + + // Create a new node for the fused op. + auto *new_conv_op = graph->CreateOpNode(&new_op_desc); + + std::unordered_set delete_nodes; + + for (int i = 0; i < times; i++) { + nodes[i * kNumFields]->outputs.push_back(new_conv_op); + new_conv_op->inputs.push_back(nodes[i * kNumFields]); + delete_nodes.insert(nodes[i * kNumFields + kTransOffset]); + delete_nodes.insert(nodes[i * kNumFields + kTransOutOffset]); + delete_nodes.insert(nodes[i * kNumFields + kFlattenOffset]); + delete_nodes.insert(nodes[i * kNumFields + kFlattenOutOffset]); + } + delete_nodes.insert(concat_op); + + new_conv_op->outputs.push_back(concat_out); + concat_out->inputs.push_back(new_conv_op); + + // Delete the unneeded nodes. + GraphSafeRemoveNodes(graph, delete_nodes); + }; + + gpd(graph, handler); +} + +void TransposeFlattenConcatFusePass::ApplyImpl(ir::Graph *graph) const { + const int pattern_nums = 6; + const std::string pattern_name = "transpose_flatten_concat_fuse"; + FusePassBase::Init(pattern_name, graph); + for (int i = 1; i <= pattern_nums; i++) { + RunTransposeFlattenConcatFuse(graph, i); + } +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(transpose_flatten_concat_fuse_pass, + paddle::framework::ir::TransposeFlattenConcatFusePass); diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h new file mode 100644 index 00000000..939a8c31 --- /dev/null +++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +// There may be many transpose-flatten structures in a model, and the output of +// these structures will be used as inputs to the concat Op. This pattern will +// be detected by our pass. The times here represents the repeat times of this +// structure. +class TransposeFlattenConcatFusePass : public FusePassBase { + public: + virtual ~TransposeFlattenConcatFusePass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/library_type.h b/paddle/fluid/framework/library_type.h new file mode 100644 index 00000000..904cc013 --- /dev/null +++ b/paddle/fluid/framework/library_type.h @@ -0,0 +1,72 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include + +namespace paddle { +namespace framework { + +// For more details about the design of LibraryType, Please refer to +// https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md#library + +enum class LibraryType { + kPlain = 0, + kMKLDNN = 1, + kCUDNN = 2, +}; + +inline std::string LibraryTypeToString(const LibraryType& library_type) { + switch (library_type) { + case LibraryType::kPlain: + return "PLAIN"; + case LibraryType::kMKLDNN: + return "MKLDNN"; + case LibraryType::kCUDNN: + return "CUDNN"; + default: + PADDLE_THROW("unknown LibraryType %d", static_cast(library_type)); + } +} + +inline LibraryType StringToLibraryType(const char* ctype) { + std::string s(ctype); + for (size_t i = 0; i < s.size(); ++i) { + s[i] = toupper(s[i]); + } + if (s == std::string("PLAIN")) { + return LibraryType::kPlain; + } else if (s == std::string("MKLDNN")) { + return LibraryType::kMKLDNN; + } else if (s == std::string("CUDNN")) { + return LibraryType::kCUDNN; + // To be compatible with register macro. + // CPU, CUDA, PLAIN are same library type. + } else if (s == std::string("CPU")) { + return LibraryType::kPlain; + } else if (s == std::string("CUDA")) { + return LibraryType::kPlain; + } else { + PADDLE_THROW("Unknown LibraryType %s", s.c_str()); + } +} + +inline std::ostream& operator<<(std::ostream& out, LibraryType l) { + out << LibraryTypeToString(l); + return out; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/lod_rank_table.cc b/paddle/fluid/framework/lod_rank_table.cc new file mode 100644 index 00000000..6bc795b6 --- /dev/null +++ b/paddle/fluid/framework/lod_rank_table.cc @@ -0,0 +1,58 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/lod_rank_table.h" + +namespace paddle { +namespace framework { +void LoDRankTable::Reset(const LoD& lod, size_t level) { + this->coarse_lod_.clear(); + this->items_.clear(); + PADDLE_ENFORCE(level < lod.size(), + "Cannot rank lod since the level %d is less than lod size %d", + level, lod.size()); + coarse_lod_.reserve(level); + for (size_t i = 0; i < level; ++i) { + coarse_lod_.push_back(lod[i]); + } + auto& vec = lod[level]; + for (size_t i = 0; i < vec.size() - 1; ++i) { + TableItem item; + item.index = i; + item.length = vec[i + 1] - vec[i]; + VLOG(10) << "Add item to rank table " << item.index << " " << item.length; + items_.emplace_back(item); + } + // NOTE(yuyang18): + // + // The time complexity of stable_sort is O(N*log(N)) if additional memory is + // available. It is easy to debug and unit test when using `stable_sort` + // instead of `sort`. Also, the items of a rank table will not be too large. + std::stable_sort(items_.begin(), items_.end(), + [](const TableItem& a, const TableItem& b) { + return a.length > b.length; + }); +} + +} // namespace framework + +std::ostream& operator<<(std::ostream& out, + const framework::LoDRankTable& table) { + out << "NumOfSequence " << table.items().size() << "\n"; + for (auto& each_item : table.items()) { + out << "\tSeq #" << each_item.index << ", Len=" << each_item.length << "\n"; + } + return out; +} +} // namespace paddle diff --git a/paddle/fluid/framework/lod_rank_table.h b/paddle/fluid/framework/lod_rank_table.h new file mode 100644 index 00000000..8c6e8b0c --- /dev/null +++ b/paddle/fluid/framework/lod_rank_table.h @@ -0,0 +1,61 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace framework { + +// LoD Rank Table stores the `level` of `lod` which is ordered by sequence +// length in descending order. It is useful when implement dynamic RNN and is +// shared by dynamic RNN memory, dynamic RNN slice input and dynamic RNN slice +// output operators. +// +// The table item contains two element. The length of sequence and the index of +// sequence in that level. +// +// LoDRankTable also stores the coarse_lod, which is the lod information whose +// level is less than input level, in order to restore the output LoD +// information. +class LoDRankTable { + public: + struct TableItem { + size_t index; + size_t length; + }; + + LoDRankTable() {} + + void Reset(const LoD& lod, size_t level); + + const std::vector& items() const { return this->items_; } + + const LoD& coarse_lod() const { return this->coarse_lod_; } + + size_t level() const { return coarse_lod_.size(); } + + private: + LoD coarse_lod_; + std::vector items_; +}; + +} // namespace framework + +std::ostream& operator<<(std::ostream& out, + const framework::LoDRankTable& table); + +} // namespace paddle diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc new file mode 100644 index 00000000..9883a194 --- /dev/null +++ b/paddle/fluid/framework/lod_tensor.cc @@ -0,0 +1,432 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/framework/version.h" + +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/memory/memory.h" + +#include "paddle/fluid/recordio/scanner.h" +#include "paddle/fluid/recordio/writer.h" + +namespace paddle { +namespace framework { + +std::ostream &operator<<(std::ostream &os, const LoD &lod) { + os << "{"; + for (auto &v : lod) { + os << "{"; + bool is_first = true; + for (auto &i : v) { + if (is_first) { + os << i; + is_first = false; + } else { + os << ", " << i; + } + } + os << "}"; + } + os << "}"; + + return os; +} + +std::ostream &operator<<(std::ostream &os, const LoDTensor &t) { + os << "\tlod: " << t.lod() << "\n"; + os << static_cast(t) << "\n"; + + return os; +} + +std::string LoDToString(const LoD &lod) { + std::ostringstream stream; + stream << lod; + return stream.str(); +} + +LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin, + size_t elem_end) { + PADDLE_ENFORCE_LT(level, in.size()); + PADDLE_ENFORCE_LT(elem_begin, elem_end); + PADDLE_ENFORCE_LT(elem_end, in[level].size()); + + LoD res; + res.resize(in.size() - level); + // copy the first level + res[0].assign(in[level].begin() + elem_begin, + in[level].begin() + elem_end + 1); + for (size_t lvl = 1; lvl < res.size(); lvl++) { + const auto &in_level = in[level + lvl]; + const auto &above_level = res[lvl - 1]; + auto &out_level = res[lvl]; + out_level.assign(in_level.begin() + above_level.front(), + in_level.begin() + above_level.back() + 1); + } + for (size_t lvl = 0; lvl < res.size(); lvl++) { + // to make the first offset equals 0, all the elements minus the first + // element + size_t front = res[lvl].front(); + for (auto &ele : res[lvl]) { + ele -= front; + } + } + return res; +} + +LoD ToAbsOffset(const LoD &in) { + // the lowest level stores relative offsets + if (in.empty() || in.size() == 1) return in; + LoD result = in; + for (auto level = static_cast(in.size() - 2); level >= 0; level--) { + for (size_t i = 0; i < in[level].size(); ++i) { + size_t index = in[level][i]; + result[level][i] = result[level + 1][index]; + } + } + return result; +} + +bool operator==(const LoD &a, const LoD &b) { + if (a.size() != b.size()) { + return false; + } + + for (size_t i = 0; i < a.size(); i++) { + const auto &a_level = a[i]; + const auto &b_level = b[i]; + if (a_level.size() != b_level.size()) { + return false; + } + for (size_t j = 0; j < a_level.size(); j++) { + if (a_level[j] != b_level[j]) { + return false; + } + } + } + return true; +} + +bool CheckLoD(const LoD &in, int tensor_height) { + if (in.empty()) return true; + for (const auto &level : in) { + // check: there should be more than 2 offsets existing in each level. + if (level.size() < 2) return false; + // check: the first offset(the begin offset) of each level should be 0. + if (level.front() != 0) return false; + // check: all the offsets in a level should be non-descending + if (!std::is_sorted(level.begin(), level.end())) { + return false; + } + } + // check: the lowest level's last offset should equals `tensor_height` if + // tensor_height>0. + if (tensor_height > 0 && (size_t)tensor_height != in.back().back()) + return false; + + // check: the higher level's last offset should equals the lower level's + // size-1. + // NOTE LoD store the levels from top to bottom, so the higher level goes + // first. + for (size_t level = 0; level < in.size() - 1; level++) { + if (in[level].back() != in[level + 1].size() - 1) return false; + } + return true; +} + +bool CheckAbsLoD(const LoD &in, int tensor_height) { + if (in.empty()) return true; + for (const auto &level : in) { + // check: all the offsets in a level should be ascending(no same items + // allowed). + if (!std::is_sorted(level.begin(), level.begin(), [](size_t a, size_t b) { + if (a < b) return true; + return false; + })) { + return false; + } + + // check: there should be more than 2 offsets existing in each level. + if (level.size() < 2) return false; + + // check: the first offset of each level should be 0, and the last should be + // the same(the height of underlying tensor). + if (level.front() != 0) return false; + if (tensor_height < 0) { + tensor_height = level.back(); + } else if ((size_t)tensor_height != level.back()) { + return false; + } + } + return true; +} + +using LoDAndOffset = std::pair>; +LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx, + size_t end_idx, size_t start_level) { + LoD sub_lod; + + for (size_t level_idx = start_level; level_idx < lod.size(); ++level_idx) { + PADDLE_ENFORCE_LE(start_idx, end_idx); + PADDLE_ENFORCE_LT(end_idx, lod[level_idx].size()); + std::vector level_lens; + for (size_t i = start_idx; i < end_idx; ++i) { + level_lens.push_back(lod[level_idx][i + 1] - lod[level_idx][i]); + } + sub_lod.emplace_back(level_lens); + start_idx = lod[level_idx][start_idx]; + end_idx = lod[level_idx][end_idx]; + } + + return LoDAndOffset{sub_lod, {start_idx, end_idx}}; +} + +void AppendLoD(LoD *lod, const LoD &lod_length) { + PADDLE_ENFORCE( + lod->empty() || lod->size() == lod_length.size(), + "The lod_length should has the same size with the appended lod."); + if (lod->empty()) { + for (size_t i = 0; i < lod_length.size(); ++i) { + lod->emplace_back(1, 0); // size = 1, value = 0; + } + *lod = LoD(lod_length.size(), std::vector({0})); + } + for (size_t i = 0; i < lod->size(); ++i) { + auto &level = (*lod)[i]; + for (size_t len : lod_length[i]) { + level.push_back(level.back() + len); + } + } +} + +void SerializeToStream(std::ostream &os, const LoDTensor &tensor, + const platform::DeviceContext &dev_ctx) { + { // the 1st field, uint32_t version for LoDTensor + os.write(reinterpret_cast(&kCurTensorVersion), + sizeof(kCurTensorVersion)); + } + { + // the 2st field, LoD information + // uint64_t lod_level + // uint64_t lod_level_1 size in byte. + // int* lod_level_1 data + // ... + auto lod = tensor.lod(); + uint64_t size = lod.size(); + os.write(reinterpret_cast(&size), sizeof(size)); + + for (auto &each : lod) { + size = each.size() * sizeof(framework::LoD::value_type::value_type); + os.write(reinterpret_cast(&size), sizeof(size)); + os.write(reinterpret_cast(each.data()), + static_cast(size)); + } + } + // the 3st field, Tensor + TensorToStream(os, static_cast(tensor), dev_ctx); +} + +void DeserializeFromStream(std::istream &is, LoDTensor *tensor, + const platform::DeviceContext &dev_ctx) { + { + // the 1st field, unit32_t version for LoDTensor + uint32_t version; + is.read(reinterpret_cast(&version), sizeof(version)); + PADDLE_ENFORCE(framework::IsTensorVersionSupported(version), + "tensor version %u is not supported.", version); + PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); + } + { + // the 2st field, LoD information + uint64_t lod_level; + is.read(reinterpret_cast(&lod_level), sizeof(lod_level)); + auto &lod = *tensor->mutable_lod(); + lod.resize(lod_level); + for (uint64_t i = 0; i < lod_level; ++i) { + uint64_t size; + is.read(reinterpret_cast(&size), sizeof(size)); + std::vector tmp(size / sizeof(size_t)); + is.read(reinterpret_cast(tmp.data()), + static_cast(size)); + lod[i] = tmp; + } + } + // the 3st filed, Tensor + TensorFromStream(is, static_cast(tensor), dev_ctx); +} + +void WriteToRecordIO(recordio::Writer *writer, + const std::vector &tensor, + const platform::DeviceContext &dev_ctx) { + std::stringstream buffer; + size_t sz = tensor.size(); + buffer.write(reinterpret_cast(&sz), sizeof(uint32_t)); + for (auto &each : tensor) { + SerializeToStream(buffer, each, dev_ctx); + } + writer->Write(buffer.str()); +} + +bool ReadFromRecordIO(recordio::Scanner *scanner, + const platform::DeviceContext &dev_ctx, + std::vector *result_ptr) { + if (!scanner->HasNext()) { + return false; + } + std::istringstream sin(scanner->Next()); + uint32_t sz; + sin.read(reinterpret_cast(&sz), sizeof(uint32_t)); + auto &result = *result_ptr; + result.resize(sz); + for (uint32_t i = 0; i < sz; ++i) { + DeserializeFromStream(sin, &result[i], dev_ctx); + } + + return true; +} + +std::vector LoDTensor::SplitLoDTensor( + const std::vector places) const { + check_memory_size(); + int batch_size = + lod().empty() ? dims()[0] : static_cast(lod()[0].size()) - 1; + size_t result_size = std::min(static_cast(batch_size), places.size()); + size_t remainder = batch_size % places.size(); + + std::vector results; + results.reserve(result_size); + + int step_width = static_cast(batch_size / result_size); + for (size_t i = 0; i < result_size; ++i) { + int begin = static_cast(i * step_width); + int end = static_cast((i + 1) * step_width); + if (i + 1 == places.size()) { // last + end += remainder; + } + + LoDTensor dst; + if (lod().empty()) { + auto src = Slice(begin, end); + auto &dst_place = places[i]; + framework::TensorCopy(src, dst_place, &dst); + } else { + auto lod_and_offset = GetSubLoDAndAbsoluteOffset(lod(), begin, end, 0); + + auto &offset = lod_and_offset.second; + auto src = Slice(offset.first, offset.second); + auto &dst_place = places[i]; + framework::TensorCopy(src, dst_place, &dst); + + LoD my_lod; + for (auto &l : lod_and_offset.first) { + std::vector v{0}; + for (auto &ll : l) { + v.push_back(ll + v.back()); + } + my_lod.emplace_back(v); + } + dst.set_lod(my_lod); + } + results.emplace_back(dst); + } + + return results; +} + +void LoDTensor::MergeLoDTensor( + const std::vector &lod_tensors, + platform::Place dst_place) { + PADDLE_ENFORCE(!lod_tensors.empty()); + + framework::DDim new_dim = lod_tensors[0]->dims(); + auto new_type = lod_tensors[0]->type(); + framework::DataLayout new_layout = lod_tensors[0]->layout(); + LoD new_lod = lod_tensors[0]->lod(); + for (size_t i = 1; i < lod_tensors.size(); ++i) { + auto *t = lod_tensors[i]; + PADDLE_ENFORCE_EQ(new_type, t->type()); + PADDLE_ENFORCE_EQ(new_layout, t->layout()); + + PADDLE_ENFORCE_EQ(framework::product(new_dim) / new_dim[0], + framework::product(t->dims()) / t->dims()[0]); + new_dim[0] += t->dims()[0]; + + auto &lod = t->lod(); + PADDLE_ENFORCE_EQ(new_lod.size(), lod.size()); + for (size_t j = 0; j < lod.size(); ++j) { + auto &sub_lod = new_lod[j]; + size_t offset = sub_lod.back(); + for (size_t k = 1; k < lod[j].size(); ++k) { + sub_lod.push_back(lod[j][k] + offset); + } + } + } + Resize(new_dim); + set_layout(new_layout); + set_lod(new_lod); + mutable_data(dst_place, new_type); + + int begin = 0; + for (auto *src : lod_tensors) { + int end = begin + src->dims()[0]; + auto dst = Slice(begin, end); + framework::TensorCopy(*src, dst_place, &dst); + begin = end; + } +} + +LoD ConvertToLengthBasedLoD(const LoD &offset_lod) { + LoD length_lod; + length_lod.reserve(offset_lod.size()); + for (size_t lvl = 0; lvl < offset_lod.size(); ++lvl) { + std::vector level; + if (offset_lod[lvl].size() > 0) { + level.reserve(offset_lod[lvl].size() - 1); + } + for (size_t idx = 0; idx < offset_lod[lvl].size() - 1; ++idx) { + level.push_back(offset_lod[lvl][idx + 1] - offset_lod[lvl][idx]); + } + length_lod.push_back(level); + } + return length_lod; +} + +LoD ConvertToOffsetBasedLoD(const LoD &length_lod) { + LoD offset_lod; + offset_lod.reserve(length_lod.size()); + for (size_t lvl = 0; lvl < length_lod.size(); ++lvl) { + std::vector level; + level.reserve(length_lod[lvl].size() + 1); + size_t tmp = 0; + level.push_back(tmp); + for (size_t idx = 0; idx < length_lod[lvl].size(); ++idx) { + tmp += length_lod[lvl][idx]; + level.push_back(tmp); + } + offset_lod.push_back(level); + } + return offset_lod; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h new file mode 100644 index 00000000..5e20ba7c --- /dev/null +++ b/paddle/fluid/framework/lod_tensor.h @@ -0,0 +1,242 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#ifdef PADDLE_WITH_CUDA +#include +#include +#endif + +#include +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/mixed_vector.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { + +namespace recordio { +class Writer; +class Scanner; +} + +namespace framework { + +/* + * LoD is short for Level of Details. + * + * - in a level, each element indicates relative offset of the lower level + * - the first element should be 0 and that indicates that this sequence start + * from 0 + * - each sequence's begin and end(no-inclusive) is level[id, id+1] + * + * For example: + * 3-level LoD stores + * + * 0 2 3 + * 0 2 4 7 + * 0 2 5 7 10 12 15 20 + */ +using LoD = std::vector>; + +std::ostream& operator<<(std::ostream& os, const LoD& lod); +std::ostream& operator<<(std::ostream& os, const LoDTensor& t); + +std::string LoDToString(const LoD& lod); + +LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin, + size_t elem_end); +/* + * Transform an LoD from relative offsets to absolute offsets. + */ +LoD ToAbsOffset(const LoD& in); + +bool operator==(const LoD& a, const LoD& b); + +/* + * Check whether this lod's format is valid. + * + * ATTENTION: + * - Empty lod is treated as valid. + * + * It will check two things: + * + * 1. all the offsets in a level should be non-descending. + * 2. there should be more than 2 offsets existing in each level. + * 3. the higher level's last offset should equals the lower level's size-1. + * 4. the first offset(the begin offset) of each level should be 0. + * 5. the lowest level's last offset should equals `tensor_height` if + * tensor_height>0. + */ + +bool CheckLoD(const LoD& in, int tensor_height = -1); +/* + * Check whether this absolute lod's format is valid. + * + * ATTENTION: + * - Empty lod is treated as valid. + * + * It will check two things: + * 1. all the offsets in a level should be ascending(no same items allowed). + * 2. there should be more than 2 offsets existing in each level. + * 3. the first offset of each level should be 0, and the last should be the + * same(the height of underlying tensor) or `tensor_height` if + * tensor_height>0. + */ +bool CheckAbsLoD(const LoD& in, int tensor_height = -1); + +/* + * LoDTensor (Level of details Tensor) + * see https://en.wikipedia.org/wiki/Level_of_details for reference. + */ +class LoDTensor : public Tensor { + public: + LoDTensor() : Tensor() {} + + explicit LoDTensor(const LoD& lod) : lod_(lod) {} + + void set_lod(const LoD& lod) { lod_ = lod; } + + const LoD& lod() const { return lod_; } + + LoD* mutable_lod() { return &lod_; } + + /* + * Get the start offset and end offset of an element from LoD. + */ + std::pair lod_element(size_t level, size_t elem) const { + PADDLE_ENFORCE_LT(level, NumLevels()); + PADDLE_ENFORCE_LT(elem, NumElements(level)); + return std::make_pair((lod_)[level][elem], (lod_)[level][elem + 1]); + } + + /* + * Number of LoDTensor's levels, each level has units of data, for example, + * in the sentence's view, article, paragraph, sentence are 3 levels. + */ + size_t NumLevels() const { return lod_.size(); } + /* + * Number of elements in a level. + */ + size_t NumElements(size_t level = 0) const { + PADDLE_ENFORCE_LT(level, NumLevels()); + // the last offset is the end of last element + return (lod_)[level].size() - 1; + } + + // Split LoDTensor and copy to each place specified in places. + std::vector SplitLoDTensor( + const std::vector places) const; + + void MergeLoDTensor(const std::vector& lod_tensors, + platform::Place place); + + private: + LoD lod_; +}; + +/* + * Expand the `source` to fit the LoD of `lod`. For example, a `source` + * LoDTensor is + * - LoD: [0, 2] + * - tensor: [a0, a1] + * a `lod` is + * - LoD: [0 3 5] + * returns a new LoDTensor + * - [a0 a0 a0 a1 a1] + */ +template +LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level, + const platform::Place& place) { + LoD abs_lod = ToAbsOffset(lod); + const auto& lod_level = lod[level]; + size_t num_instances = source.dims()[0]; + + // new tensor + LoDTensor tensor; + tensor.set_lod(lod); + auto dims = source.dims(); + dims[0] = lod_level.back(); + tensor.Resize(dims); + tensor.mutable_data(place); + + PADDLE_ENFORCE_EQ(num_instances, lod_level.size() - 1); + for (size_t ins = 0; ins < num_instances; ins++) { + for (size_t elem = lod_level[ins]; elem < lod_level[ins + 1]; elem++) { + auto slice = tensor.Slice(elem, elem + 1); + TensorCopy(source.Slice(ins, ins + 1), platform::CPUPlace(), + platform::CPUDeviceContext(), &slice); + } + } + return tensor; +} + +// Get the absolute offset of a lod[start_level][start_idx:end_idx] and +// relative length of details for every levels(i.e., [start_level: ]). +// +// For example, +// lod = [[0, 3, 4, 8], [0, 9, 10, 11, 13, 17, 19, 22, 24]] +// start_level = 0 +// start_idx = 1 +// end_idx = 3 +// +// Returns: +// LoD = [[1, 4], [2, 4, 2, 3, 2]] +// pair = {11, 24} +std::pair> GetSubLoDAndAbsoluteOffset( + const LoD& lod, size_t start_idx, size_t end_idx, size_t start_level); + +void AppendLoD(LoD* lod, const LoD& lod_length); + +/* + * Serialize/Desiralize LoDTensor to std::ostream + * You can pass ofstream or ostringstream to serilize to file + * or to a in memory string. GPU tensor will be copied to CPU. + */ +void SerializeToStream(std::ostream& os, const LoDTensor& tensor, + const platform::DeviceContext& dev_ctx); +void DeserializeFromStream(std::istream& is, LoDTensor* tensor, + const platform::DeviceContext& dev_ctx); + +extern void WriteToRecordIO(recordio::Writer* writer, + const std::vector& tensor, + const platform::DeviceContext& dev_ctx); + +extern bool ReadFromRecordIO(recordio::Scanner* scanner, + const platform::DeviceContext& dev_ctx, + std::vector* result_ptr); + +/* + * Convert between length-based LoD and offset-based LoD. + * The implementation of LoDTensor class use offset-based LoD. + * However, we want to expose the more user-friendly length-based + * LoD to the Python side instead. + * + * Example: + * If offset_lod = [[0, 2, 3],[0, 3, 5, 9]] + * then length_lod = [[2, 1], [3, 2, 4]] + */ +LoD ConvertToLengthBasedLoD(const LoD& offset_lod); + +LoD ConvertToOffsetBasedLoD(const LoD& length_lod); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/lod_tensor_array.h b/paddle/fluid/framework/lod_tensor_array.h new file mode 100644 index 00000000..36a5c3c5 --- /dev/null +++ b/paddle/fluid/framework/lod_tensor_array.h @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace framework { + +using LoDTensorArray = std::vector; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc new file mode 100644 index 00000000..d1554113 --- /dev/null +++ b/paddle/fluid/framework/lod_tensor_test.cc @@ -0,0 +1,332 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/lod_tensor.h" + +#include "paddle/fluid/recordio/scanner.h" +#include "paddle/fluid/recordio/writer.h" + +namespace paddle { +namespace framework { + +TEST(LoD, PrintLoDTensor) { + LoDTensor tensor1; + tensor1.Resize({2}); + tensor1.mutable_data(platform::CPUPlace()); + tensor1.data()[0] = 0.2; + tensor1.data()[1] = 0.5; + LOG(INFO) << tensor1; + + LoDTensor tensor2; + tensor2.Resize({2}); + tensor2.mutable_data(platform::CPUPlace()); + tensor2.data()[0] = 1; + tensor2.data()[1] = 2; + LOG(INFO) << tensor2; +} + +TEST(LoD, data) { + LoD lod{{0, 1, 2}}; + lod.push_back({0, 2, 4, 5}); + lod.push_back(std::vector({0, 1, 6, 8, 10, 11})); + + auto& v = lod[0]; + for (size_t i = 0; i < v.size(); ++i) { + EXPECT_EQ(v[i], i); + } +} + +TEST(LoD, ExpandLoD) { + LoD lod{{0, 2}}; + LoDTensor tensor; + tensor.set_lod(lod); + tensor.Resize({2, 1}); + tensor.mutable_data(platform::CPUPlace()); + tensor.data()[0] = 0; + tensor.data()[1] = 1; + + LoD target; + target.emplace_back(std::vector{0, 3, 5}); + auto new_tensor = LodExpand(tensor, target, 0UL, platform::CPUPlace()); + std::vector result{{0, 0, 0, 1, 1}}; + for (size_t i = 0; i < 5; i++) { + ASSERT_EQ(new_tensor.data()[i], result[i]); + } +} + +TEST(LoD, GetFineGrainedLoDLength) { + LoD lod; + lod.push_back(std::vector({0, 2, 4, 5})); + lod.push_back(std::vector({0, 1, 6, 8, 10, 11})); + lod.push_back( + std::vector({0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26, 29})); + + auto lod_and_offset = + paddle::framework::GetSubLoDAndAbsoluteOffset(lod, 1, 2, 0); + LoD lod_length = lod_and_offset.first; + size_t start_offset = lod_and_offset.second.first; + size_t end_offset = lod_and_offset.second.second; + + LoD expected; + expected.push_back(std::vector{2}); + expected.push_back(std::vector{2, 2}); + expected.push_back(std::vector{2, 3, 4, 2}); + EXPECT_EQ(lod_length, expected); + EXPECT_EQ(start_offset, 15UL); + EXPECT_EQ(end_offset, 26UL); +} + +TEST(LoD, AppendLoD) { + LoD lod_lens; + lod_lens.push_back(std::vector({2})); + lod_lens.push_back(std::vector({2, 2})); + lod_lens.push_back(std::vector({2, 3, 4, 2})); + + LoD origin; + origin.push_back(std::vector({0, 2})); + origin.push_back(std::vector({0, 1, 6})); + origin.push_back(std::vector({0, 2, 5, 7, 10, 12, 15})); + + paddle::framework::AppendLoD(&origin, lod_lens); + + LoD expected; + expected.push_back(std::vector({0, 2, 4})); + expected.push_back(std::vector({0, 1, 6, 8, 10})); + expected.push_back( + std::vector({0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26})); + EXPECT_EQ(origin, expected); +} + +TEST(LoD, ToAbsOffset) { + LoD relative_lod; + relative_lod.push_back(std::vector({0, 2})); + relative_lod.push_back(std::vector({0, 1, 3})); + relative_lod.push_back(std::vector({0, 2, 4, 5})); + + LoD abs_lod = paddle::framework::ToAbsOffset(relative_lod); + + LoD expected; + expected.push_back(std::vector({0, 5})); + expected.push_back(std::vector({0, 2, 5})); + expected.push_back(std::vector({0, 2, 4, 5})); + + EXPECT_EQ(abs_lod, expected); +} + +TEST(LoD, SplitLoDTensor) { + LoD lod; + lod.push_back(std::vector({0, 2, 4, 5, 6})); + lod.push_back(std::vector({0, 1, 6, 8, 13, 15, 20})); + + platform::CPUPlace place; + LoDTensor lod_tensor; + lod_tensor.Resize({20, 1}); + float* dst_ptr = lod_tensor.mutable_data(place); + for (int i = 0; i < lod_tensor.numel(); ++i) { + dst_ptr[i] = i; + } + lod_tensor.set_lod(lod); + + std::vector places{platform::CPUPlace(), + platform::CPUPlace()}; + LoD lod0; + lod0.push_back(std::vector({0, 2, 4})); + lod0.push_back(std::vector({0, 1, 6, 8, 13})); + LoD lod1; + lod1.push_back(std::vector({0, 1, 2})); + lod1.push_back(std::vector({0, 2, 7})); + + auto lods = lod_tensor.SplitLoDTensor(places); + EXPECT_EQ(lods[0].lod(), lod0); + EXPECT_EQ(lods[1].lod(), lod1); +} + +TEST(LoD, MergeLoDTensor) { + LoD lod; + lod.push_back(std::vector({0, 2, 4, 5, 6})); + lod.push_back(std::vector({0, 1, 6, 8, 13, 15, 20})); + + platform::CPUPlace place; + + LoDTensor lod_tensor0; + LoD lod0; + lod0.push_back(std::vector({0, 2, 4})); + lod0.push_back(std::vector({0, 1, 6, 8, 13})); + lod_tensor0.set_lod(lod0); + + lod_tensor0.Resize({13, 1}); + float* dst_ptr = lod_tensor0.mutable_data(place); + for (int i = 0; i < lod_tensor0.numel(); ++i) { + dst_ptr[i] = i; + } + + LoDTensor lod_tensor1; + LoD lod1; + lod1.push_back(std::vector({0, 1, 2})); + lod1.push_back(std::vector({0, 2, 7})); + lod_tensor1.set_lod(lod1); + lod_tensor1.Resize({7, 1}); + dst_ptr = lod_tensor1.mutable_data(place); + for (int i = 0; i < lod_tensor1.numel(); ++i) { + dst_ptr[i] = i; + } + + std::vector lods{&lod_tensor0, &lod_tensor1}; + + LoDTensor lod_tensor; + lod_tensor.MergeLoDTensor(lods, place); + EXPECT_EQ(lod_tensor.lod(), lod); +} + +TEST(LoD, CheckLoD) { + LoD relative_lod; + relative_lod.push_back(std::vector({0, 2})); + relative_lod.push_back(std::vector({0, 1, 3})); + relative_lod.push_back(std::vector({0, 2, 4, 5})); + + // check compatible + ASSERT_TRUE(CheckLoD(relative_lod)); + relative_lod[1].back()++; + ASSERT_FALSE(CheckLoD(relative_lod)); + relative_lod[1].back()--; // recover it + + // check empty + LoD empty_lod; + ASSERT_TRUE(CheckLoD(empty_lod)); + + // check less than 2 offsets in a level + LoD some_lod0; + some_lod0.push_back(std::vector({0})); + ASSERT_FALSE(CheckLoD(some_lod0)); + + // check with underlying tensor storage. + ASSERT_TRUE(CheckLoD(relative_lod, 5)); + ASSERT_FALSE(CheckLoD(relative_lod, 9)); + + // check whether lod is ascending-sorted (allow same items) + ASSERT_TRUE(CheckLoD({{0, 1, 2, 3, 4, 5}}, 5)); + ASSERT_TRUE(CheckLoD({{0, 1, 3, 3, 4, 5}}, 5)); + ASSERT_FALSE(CheckLoD({{0, 1, 3, 2, 5}}, 5)); +} + +TEST(LoD, CheckAbsLoD) { + LoD relative_lod; + relative_lod.push_back(std::vector({0, 2})); + relative_lod.push_back(std::vector({0, 1, 3})); + relative_lod.push_back(std::vector({0, 2, 4, 5})); + + auto abs_lod = ToAbsOffset(relative_lod); + + ASSERT_TRUE(CheckAbsLoD(abs_lod)); + + // check less than 2 offsets in a level. + + // check the last item should be compatible with tensor height. + abs_lod.back().back()++; + ASSERT_FALSE(CheckAbsLoD(abs_lod)); + abs_lod.back().back()--; // restore + + // check less than 2 offsets in a lod. + LoD abs_lod0; + abs_lod0.push_back(std::vector({0})); + ASSERT_FALSE(CheckAbsLoD(abs_lod0)); +} + +TEST(LoD, ConvertToLengthBasedLoD) { + LoD offset_lod; + offset_lod.push_back(std::vector({0, 2})); + offset_lod.push_back(std::vector({0, 1, 3})); + offset_lod.push_back(std::vector({0, 2, 4, 5})); + + LoD length_lod = ConvertToLengthBasedLoD(offset_lod); + + LoD expected; + expected.push_back(std::vector({2})); + expected.push_back(std::vector({1, 2})); + expected.push_back(std::vector({2, 2, 1})); + + EXPECT_EQ(length_lod, expected); +} + +TEST(LoD, ConvertToOffsetBasedLoD) { + LoD length_lod; + length_lod.push_back(std::vector({2})); + length_lod.push_back(std::vector({1, 2})); + length_lod.push_back(std::vector({2, 2, 1})); + + LoD offset_lod = ConvertToOffsetBasedLoD(length_lod); + + LoD expected; + expected.push_back(std::vector({0, 2})); + expected.push_back(std::vector({0, 1, 3})); + expected.push_back(std::vector({0, 2, 4, 5})); + + EXPECT_EQ(offset_lod, expected); +} + +template +static void TestRecordIO() { + LoDTensor tensor; + T* tmp = tensor.mutable_data(make_ddim({4, 5}), platform::CPUPlace()); + for (int i = 0; i < 20; ++i) { + tmp[i] = static_cast(i); + } + + std::stringstream* stream = new std::stringstream(); + auto& ctx = + *platform::DeviceContextPool::Instance().Get(platform::CPUPlace()); + { + recordio::Writer writer(stream, recordio::Compressor::kSnappy); + WriteToRecordIO(&writer, {tensor, tensor}, ctx); + WriteToRecordIO(&writer, {tensor, tensor}, ctx); + writer.Flush(); + } + + auto assert_tensor_ok = [](const LoDTensor& tensor) { + for (int i = 0; i < 20; ++i) { + ASSERT_EQ(tensor.data()[i], static_cast(i)); + } + }; + + { + std::unique_ptr stream_ptr(stream); + recordio::Scanner scanner(std::move(stream_ptr)); + std::vector tensors; + ASSERT_TRUE(ReadFromRecordIO(&scanner, ctx, &tensors)); + ASSERT_EQ(tensors.size(), static_cast(2)); + assert_tensor_ok(tensors[0]); + assert_tensor_ok(tensors[1]); + ASSERT_TRUE(ReadFromRecordIO(&scanner, ctx, &tensors)); + ASSERT_EQ(tensors.size(), static_cast(2)); + assert_tensor_ok(tensors[0]); + assert_tensor_ok(tensors[1]); + } +} + +TEST(LoDTensor, RecordIO) { + TestRecordIO(); + TestRecordIO(); + TestRecordIO(); + TestRecordIO(); + TestRecordIO(); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/lod_tensor_test.cu b/paddle/fluid/framework/lod_tensor_test.cu new file mode 100644 index 00000000..b9950627 --- /dev/null +++ b/paddle/fluid/framework/lod_tensor_test.cu @@ -0,0 +1,72 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/init.h" +#include "paddle/fluid/platform/place.h" + +__global__ void test(size_t* a, int size) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; + i += blockDim.x * gridDim.x) { + a[i] *= 2; + } +} + +TEST(LoD, data) { + paddle::framework::InitDevices(true); + + paddle::framework::LoD lod{{0, 1, 2}}; + lod.push_back({0, 2, 4, 5}); + lod.push_back(std::vector({0, 1, 6, 8, 10, 11})); + + auto& v = lod[0]; + paddle::platform::CUDAPlace gpu(0); + test<<<1, 1>>>(v.CUDAMutableData(gpu), v.size()); + cudaDeviceSynchronize(); + for (size_t i = 0; i < v.size(); ++i) { + EXPECT_EQ(v[i], i * 2); + } +} + +TEST(LoDTensor, LoDInGPU) { + paddle::framework::InitDevices(true); + + paddle::framework::LoDTensor lod_tensor; + paddle::platform::CUDAPlace place(0); + + paddle::framework::LoD src_lod; + src_lod.push_back(std::vector{0, 2, 4, 6, 8, 10, 12, 14}); + + lod_tensor.Resize({14, 16}); + lod_tensor.mutable_data(place); + + lod_tensor.set_lod(src_lod); + EXPECT_EQ(lod_tensor.lod_element(0, 2).first, 4UL); + EXPECT_EQ(lod_tensor.lod_element(0, 4).first, 8UL); + + auto lod = lod_tensor.lod(); + + test<<<1, 8>>>(lod[0].CUDAMutableData(place), lod[0].size()); + cudaDeviceSynchronize(); + + for (size_t i = 0; i < src_lod[0].size(); ++i) { + EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2); + } +} diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h new file mode 100644 index 00000000..01ba743b --- /dev/null +++ b/paddle/fluid/framework/mixed_vector.h @@ -0,0 +1,537 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include // NOLINT +#include +#include +#include "paddle/fluid/framework/details/cow_ptr.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/memory/memcpy.h" + +#include "glog/logging.h" + +namespace paddle { +namespace framework { + +#if defined(PADDLE_WITH_CUDA) +// Vector implements the std::vector interface, and can get Data or +// MutableData from any place. The data will be synced implicitly inside. +template +class Vector { + public: + using value_type = T; + using iterator = typename std::vector::iterator; + using const_iterator = typename std::vector::const_iterator; + + private: + // The actual class to implement vector logic + class VectorData { + public: + VectorData() : flag_(kDataInCPU) {} + VectorData(size_t count, const T &value) + : cpu_(count, value), flag_(kDataInCPU) {} + VectorData(std::initializer_list init) : cpu_(init), flag_(kDataInCPU) {} + template + explicit VectorData(const std::vector &dat) + : cpu_(dat), flag_(kDataInCPU) {} + ~VectorData() {} + + VectorData(const VectorData &o) { + o.ImmutableCPU(); + cpu_ = o.cpu_; + flag_ = kDataInCPU; + } + + VectorData &operator=(const VectorData &o) { + o.ImmutableCPU(); + cpu_ = o.cpu_; + flag_ = kDataInCPU; + return *this; + } + + T &operator[](size_t i) { + MutableCPU(); + return cpu_[i]; + } + + const T &operator[](size_t i) const { + ImmutableCPU(); + return cpu_[i]; + } + + size_t size() const { return cpu_.size(); } + + iterator begin() { + MutableCPU(); + return cpu_.begin(); + } + + iterator end() { + MutableCPU(); + return cpu_.end(); + } + + T &front() { + MutableCPU(); + return cpu_.front(); + } + + T &back() { + MutableCPU(); + return cpu_.back(); + } + + const_iterator begin() const { + ImmutableCPU(); + return cpu_.begin(); + } + + const_iterator end() const { + ImmutableCPU(); + return cpu_.end(); + } + + const T &back() const { + ImmutableCPU(); + return cpu_.back(); + } + + T *data() { return &(*this)[0]; } + + const T *data() const { return &(*this)[0]; } + + const T &front() const { + ImmutableCPU(); + return cpu_.front(); + } + + // assign this from iterator. + // NOTE: the iterator must support `end-begin` + template + void assign(Iter begin, Iter end) { + MutableCPU(); + cpu_.assign(begin, end); + } + + // push_back. If the previous capacity is not enough, the memory will + // double. + void push_back(T elem) { + MutableCPU(); + cpu_.push_back(elem); + } + + // extend a vector by iterator. + // NOTE: the iterator must support end-begin + template + void Extend(It begin, It end) { + MutableCPU(); + auto out_it = std::back_inserter>(this->cpu_); + std::copy(begin, end, out_it); + } + + // resize the vector + void resize(size_t size) { + MutableCPU(); + cpu_.resize(size); + } + + // get cuda ptr. immutable + const T *CUDAData(platform::Place place) const { + PADDLE_ENFORCE(platform::is_gpu_place(place), + "CUDA Data must on CUDA place"); + ImmutableCUDA(place); + return reinterpret_cast(gpu_->ptr()); + } + + // get cuda ptr. mutable + T *CUDAMutableData(platform::Place place) { + const T *ptr = CUDAData(place); + flag_ = kDirty | kDataInCUDA; + return const_cast(ptr); + } + + // clear + void clear() { + cpu_.clear(); + flag_ = kDirty | kDataInCPU; + } + + size_t capacity() const { return cpu_.capacity(); } + + // reserve data + void reserve(size_t size) const { cpu_.reserve(size); } + + // implicit cast operator. Vector can be cast to std::vector implicitly. + operator std::vector() const { + ImmutableCPU(); + return cpu_; + } + + bool operator==(const VectorData &other) const { + ImmutableCPU(); + other.ImmutableCPU(); + return cpu_ == other.cpu_; + } + + std::mutex &Mutex() const { return mtx_; } + + boost::optional CUDAPlace() const { + return gpu_ == nullptr + ? boost::none + : boost::optional( + boost::get(gpu_->place())); + } + + private: + enum DataFlag { + kDataInCPU = 0x01, + kDataInCUDA = 0x02, + // kDirty means the data has been changed in one device. + kDirty = 0x10 + }; + + void CopyToCPU() const { + // COPY GPU Data To CPU + auto *dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(gpu_->place())); + auto stream = dev_ctx->stream(); + void *src = gpu_->ptr(); + void *dst = cpu_.data(); + paddle::memory::Copy(platform::CPUPlace(), dst, CUDAPlace().get(), src, + gpu_memory_size_, stream); + dev_ctx->Wait(); + } + + void MutableCPU() { + if (IsInCUDA() && IsDirty()) { + CopyToCPU(); + } + flag_ = kDirty | kDataInCPU; + } + + void ImmutableCUDA(platform::Place place) const { + if (IsDirty()) { + if (IsInCPU()) { + CopyCPUDataToCUDA(place); + UnsetFlag(kDirty); + SetFlag(kDataInCUDA); + } else if (IsInCUDA() && !(place == gpu_->place())) { + PADDLE_THROW("This situation should not happen"); + // Still dirty + } else { + // Dirty && DataInCUDA && Device is same + // Do nothing + } + } else { + if (!IsInCUDA()) { + // Even data is not dirty. However, data is not in CUDA. Copy data. + CopyCPUDataToCUDA(place); + SetFlag(kDataInCUDA); + } else if (!(place == gpu_->place())) { + PADDLE_THROW("This situation should not happen."); + } else { + // Not Dirty && DataInCUDA && Device is same + // Do nothing. + } + } + } + + void CopyCPUDataToCUDA(const platform::Place &place) const { + void *src = cpu_.data(); + gpu_memory_size_ = cpu_.size() * sizeof(T); + gpu_ = memory::Alloc(place, gpu_memory_size_); + void *dst = gpu_->ptr(); + auto *dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + auto stream = dev_ctx->stream(); + paddle::memory::Copy(CUDAPlace().get(), dst, platform::CPUPlace(), src, + gpu_memory_size_, stream); + } + + void ImmutableCPU() const { + if (IsDirty() && !IsInCPU()) { // If data has been changed in CUDA, or + // CPU has no data. + CopyToCPU(); + UnsetFlag(kDirty); + } + SetFlag(kDataInCPU); + } + + void UnsetFlag(int flag) const { flag_ &= ~flag; } + void SetFlag(int flag) const { flag_ |= flag; } + + bool IsDirty() const { return flag_ & kDirty; } + + bool IsInCUDA() const { return flag_ & kDataInCUDA; } + + bool IsInCPU() const { return flag_ & kDataInCPU; } + + mutable std::vector cpu_; + mutable paddle::memory::AllocationPtr gpu_; + mutable size_t gpu_memory_size_{0}; + mutable int flag_; + + mutable std::mutex mtx_; + }; + + public: + // Default ctor. Create empty Vector + Vector() : m_(new VectorData()) {} + + // Fill vector with value. The vector size is `count`. + explicit Vector(size_t count, const T &value = T()) + : m_(new VectorData(count, value)) {} + + // Ctor with init_list + Vector(std::initializer_list init) : m_(new VectorData(init)) {} + + // implicit cast from std::vector. + template + Vector(const std::vector &dat) : m_(new VectorData(dat)) { // NOLINT + } + + // Copy ctor + Vector(const Vector &other) { m_ = other.m_; } + + // Copy operator + Vector &operator=(const Vector &other) { + m_ = other.m_; + return *this; + } + + // Move ctor + Vector(Vector &&other) { m_ = std::move(other.m_); } + + // CPU data access method. Mutable. + T &operator[](size_t i) { return (*m_.MutableData())[i]; } + + // CPU data access method. Immutable. + const T &operator[](size_t i) const { return m_.Data()[i]; } + + // std::vector iterator methods. Based on CPU data access method + size_t size() const { return m_.Data().size(); } + + iterator begin() { return m_.MutableData()->begin(); } + + iterator end() { return m_.MutableData()->end(); } + + T &front() { return m_.MutableData()->front(); } + + T &back() { return m_.MutableData()->back(); } + + const_iterator begin() const { return m_.Data().begin(); } + + const_iterator end() const { return m_.Data().end(); } + + const_iterator cbegin() const { return begin(); } + + const_iterator cend() const { return end(); } + + const T &back() const { return m_.Data().back(); } + + T *data() { return m_.MutableData()->data(); } + + const T *data() const { return m_.Data().data(); } + + const T &front() const { return m_.Data().front(); } + // end of std::vector iterator methods + + // assign this from iterator. + // NOTE: the iterator must support `end-begin` + template + void assign(Iter begin, Iter end) { + m_.MutableData()->assign(begin, end); + } + + // push_back. If the previous capacity is not enough, the memory will + // double. + void push_back(T elem) { m_.MutableData()->push_back(elem); } + + // extend a vector by iterator. + // NOTE: the iterator must support end-begin + template + void Extend(It begin, It end) { + m_.MutableData()->Extend(begin, end); + } + + // resize the vector + void resize(size_t size) { + if (m_.Data().size() != size) { + m_.MutableData()->resize(size); + } + } + + // get cuda ptr. immutable + const T *CUDAData(platform::Place place) const { + { + auto &mtx = m_.Data().Mutex(); + std::lock_guard guard(mtx); + auto cuda_place = m_.Data().CUDAPlace(); + if (cuda_place == boost::none || + cuda_place == boost::get(place)) { + return m_.Data().CUDAData(place); + } + } + // If m_ contains CUDAData in a different place. Detach manually. + m_.Detach(); + return CUDAData(place); + } + + // get cuda ptr. mutable + T *CUDAMutableData(platform::Place place) { + { + auto &mtx = m_.Data().Mutex(); + std::lock_guard guard(mtx); + auto cuda_place = m_.Data().CUDAPlace(); + if (cuda_place == boost::none || + cuda_place == boost::get(place)) { + return m_.MutableData()->CUDAMutableData(place); + } + } + // If m_ contains CUDAData in a different place. Detach manually. + m_.Detach(); + return CUDAMutableData(place); + } + + // clear + void clear() { m_.MutableData()->clear(); } + + size_t capacity() const { return m_.Data().capacity(); } + + // reserve data + void reserve(size_t size) { m_.Data().reserve(size); } + + // the unify method to access CPU or CUDA data. immutable. + const T *Data(platform::Place place) const { + if (platform::is_gpu_place(place)) { + return CUDAData(place); + } else { + return data(); + } + } + + // the unify method to access CPU or CUDA data. mutable. + T *MutableData(platform::Place place) { + if (platform::is_gpu_place(place)) { + return CUDAMutableData(place); + } else { + return data(); + } + } + + // implicit cast operator. Vector can be cast to std::vector implicitly. + operator std::vector() const { return m_.Data(); } + + bool operator==(const Vector &other) const { + if (size() != other.size()) return false; + auto it1 = cbegin(); + auto it2 = other.cbegin(); + for (; it1 < cend(); ++it1, ++it2) { + if (*it1 != *it2) { + return false; + } + } + return true; + } + + const void *Handle() const { return &m_.Data(); } + + private: + // Vector is an COW object. + mutable details::COWPtr m_; +}; + +#else // PADDLE_WITH_CUDA + +template +class CPUVector : public std::vector> { + public: + CPUVector() : std::vector() {} + CPUVector(size_t count, const T &value = T()) // NOLINT + : std::vector(count, value) {} + CPUVector(std::initializer_list init) : std::vector(init) {} + CPUVector(const std::vector &other) : std::vector(other) {} // NOLINT + CPUVector(const CPUVector &other) : std::vector(other) {} + CPUVector(CPUVector &&other) : std::vector(std::move(other)) {} + CPUVector(std::vector &&other) // NOLINT + : std::vector(std::move(other)) {} + CPUVector &operator=(const CPUVector &other) { + this->assign(other.begin(), other.end()); + return *this; + } + CPUVector &operator=(const std::vector &other) { + this->assign(other.begin(), other.end()); + return *this; + } + + friend std::ostream &operator<<(std::ostream &os, const CPUVector &other) { + std::stringstream ss; + for (auto v : other) { + os << v << " "; + } + return os; + } + + T &operator[](size_t id) { return this->at(id); } + + const T &operator[](size_t id) const { return this->at(id); } + + template + void Extend(const D &begin, const D &end) { + this->reserve(this->size() + size_t(end - begin)); + this->insert(this->end(), begin, end); + } + + const T *CUDAData(platform::Place place) const { + PADDLE_THROW( + "Vector::CUDAData() method is not supported in CPU-only version"); + } + + T *CUDAMutableData(platform::Place place) { + PADDLE_THROW( + "Vector::CUDAMutableData() method is not supported in CPU-only " + "version"); + } + + const T *Data(platform::Place place) const { + PADDLE_ENFORCE( + platform::is_cpu_place(place), + "Vector::Data() method is not supported when not in CPUPlace"); + return this->data(); + } + + T *MutableData(platform::Place place) { + PADDLE_ENFORCE( + platform::is_cpu_place(place), + "Vector::MutableData() method is not supported when not in CPUPlace"); + return this->data(); + } + + const void *Handle() const { return static_cast(this); } +}; + +template +using Vector = CPUVector; + +#endif // PADDLE_WITH_CUDA + +}; // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/mixed_vector_test.cc b/paddle/fluid/framework/mixed_vector_test.cc new file mode 100644 index 00000000..0599c8d3 --- /dev/null +++ b/paddle/fluid/framework/mixed_vector_test.cc @@ -0,0 +1,72 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include + +#include "glog/logging.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/mixed_vector.h" + +template +using vec = paddle::framework::Vector; + +TEST(mixed_vector, CPU_VECTOR) { + vec tmp; + for (int i = 0; i < 10; ++i) { + tmp.push_back(i); + } + ASSERT_EQ(tmp.size(), 10UL); + vec tmp2; + tmp2 = tmp; + ASSERT_EQ(tmp2.size(), 10UL); + for (int i = 0; i < 10; ++i) { + ASSERT_EQ(tmp2[i], i); + ASSERT_EQ(tmp2[i], tmp[i]); + } + int cnt = 0; + for (auto& t : tmp2) { + ASSERT_EQ(t, cnt); + ++cnt; + } +} + +TEST(mixed_vector, InitWithCount) { + paddle::framework::Vector vec(10, 10); + for (int i = 0; i < 10; ++i) { + ASSERT_EQ(vec[i], 10); + } +} + +TEST(mixed_vector, ForEach) { + vec tmp; + for (auto& v : tmp) { + VLOG(3) << v; + } +} + +TEST(mixed_vector, Reserve) { + paddle::framework::Vector vec; + vec.reserve(1); + vec.push_back(0); + vec.push_back(0); + vec.push_back(0); +} + +TEST(mixed_vector, Resize) { + paddle::framework::Vector vec; + vec.resize(1); + vec.push_back(0); + vec.push_back(0); + vec.push_back(0); +} diff --git a/paddle/fluid/framework/mixed_vector_test.cu b/paddle/fluid/framework/mixed_vector_test.cu new file mode 100644 index 00000000..4b0caa8d --- /dev/null +++ b/paddle/fluid/framework/mixed_vector_test.cu @@ -0,0 +1,75 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include + +#include "glog/logging.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/mixed_vector.h" +#include "paddle/fluid/platform/gpu_info.h" + +template +using vec = paddle::framework::Vector; + +static __global__ void multiply_10(int* ptr) { + for (int i = 0; i < 10; ++i) { + ptr[i] *= 10; + } +} + +cudaStream_t GetCUDAStream(paddle::platform::CUDAPlace place) { + return reinterpret_cast( + paddle::platform::DeviceContextPool::Instance().Get(place)) + ->stream(); +} + +TEST(mixed_vector, GPU_VECTOR) { + vec tmp; + for (int i = 0; i < 10; ++i) { + tmp.push_back(i); + } + ASSERT_EQ(tmp.size(), 10UL); + paddle::platform::CUDAPlace gpu(0); + + multiply_10<<<1, 1, 0, GetCUDAStream(gpu)>>>(tmp.MutableData(gpu)); + + for (int i = 0; i < 10; ++i) { + ASSERT_EQ(tmp[i], i * 10); + } +} + +TEST(mixed_vector, MultiGPU) { + if (paddle::platform::GetCUDADeviceCount() < 2) { + LOG(WARNING) << "Skip mixed_vector.MultiGPU since there are not multiple " + "GPUs in your machine."; + return; + } + + vec tmp; + for (int i = 0; i < 10; ++i) { + tmp.push_back(i); + } + ASSERT_EQ(tmp.size(), 10UL); + paddle::platform::CUDAPlace gpu0(0); + paddle::platform::SetDeviceId(0); + multiply_10<<<1, 1, 0, GetCUDAStream(gpu0)>>>(tmp.MutableData(gpu0)); + paddle::platform::CUDAPlace gpu1(1); + auto* gpu1_ptr = tmp.MutableData(gpu1); + paddle::platform::SetDeviceId(1); + multiply_10<<<1, 1, 0, GetCUDAStream(gpu1)>>>(gpu1_ptr); + for (int i = 0; i < 10; ++i) { + ASSERT_EQ(tmp[i], i * 100); + } +} diff --git a/paddle/fluid/framework/multi_trainer.cc b/paddle/fluid/framework/multi_trainer.cc new file mode 100644 index 00000000..8cbf2efa --- /dev/null +++ b/paddle/fluid/framework/multi_trainer.cc @@ -0,0 +1,80 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/framework/data_feed_factory.h" +#include "paddle/fluid/framework/device_worker_factory.h" +#include "paddle/fluid/framework/trainer.h" + +namespace paddle { +namespace framework { + +void MultiTrainer::Initialize(const TrainerDesc& trainer_desc, + Dataset* dataset) { + thread_num_ = trainer_desc.thread_num(); + SetDataset(dataset); + // get filelist from trainer_desc here + const std::vector readers = + dataset->GetReaders(); + VLOG(3) << "readers num: " << readers.size(); + // change thread num to readers num + thread_num_ = readers.size(); + VLOG(3) << "worker thread num: " << thread_num_; + workers_.resize(thread_num_); + for (int i = 0; i < thread_num_; ++i) { + workers_[i] = DeviceWorkerFactory::CreateDeviceWorker( + trainer_desc.device_worker_name()); + workers_[i]->Initialize(trainer_desc); + workers_[i]->SetDeviceIndex(i); + workers_[i]->SetDataFeed(readers[i]); + } + + // set debug here + SetDebug(trainer_desc.debug()); +} + +// call only after all resources are set in current trainer +void MultiTrainer::InitTrainerEnv(const ProgramDesc& main_program, + const platform::Place& place) { + for (int i = 0; i < thread_num_; ++i) { + workers_[i]->SetPlace(place); + workers_[i]->SetRootScope(root_scope_); + workers_[i]->CreateDeviceResource(main_program); // Program + workers_[i]->BindingDataFeedMemory(); + } +} + +void MultiTrainer::Run() { + VLOG(3) << "Going to run"; + for (int thidx = 0; thidx < thread_num_; ++thidx) { + if (!debug_) { + threads_.push_back( + std::thread(&DeviceWorker::TrainFiles, workers_[thidx].get())); + } else { + threads_.push_back(std::thread(&DeviceWorker::TrainFilesWithProfiler, + workers_[thidx].get())); + } + } +} + +void MultiTrainer::Finalize() { + for (auto& th : threads_) { + th.join(); + } + root_scope_->DropKids(); +} + +} // end namespace framework +} // end namespace paddle diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc new file mode 100644 index 00000000..a37bb6f4 --- /dev/null +++ b/paddle/fluid/framework/naive_executor.cc @@ -0,0 +1,131 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace framework { +void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc, + int block_id, bool with_feed_fetch_ops) { + if (!scope) { + scope_ = new framework::Scope; + } else { + scope_ = scope; + } + + VLOG(3) << "NaiveExecutor init with scope " << scope; + CreateOps(program_desc, block_id, with_feed_fetch_ops); +} + +void NaiveExecutor::Run() { +#ifndef PADDLE_ON_INFERENCE + LOG_FIRST_N(WARNING, 5) << "The NaiveExecutor can not work properly if the " + "cmake flag ON_INFER is not set."; + LOG_FIRST_N(WARNING, 5) << "Unlike the training phase, all the scopes and " + "variables will be reused to save the allocation " + "overhead."; + LOG_FIRST_N(WARNING, 5) << "Please re-compile the inference library by " + "setting the cmake flag ON_INFER=ON if you are " + "running Paddle Inference"; +#endif // PADDLE_ON_INFERENCE + for (auto &op : ops_) { + VLOG(4) << std::this_thread::get_id() << " run " + << op->DebugStringEx(scope_) << " on scope " << scope_; + op->SetIsCalledByExecutor(false); + op->Run(*scope_, place_); + } +} + +void NaiveExecutor::CreateVariables(const ProgramDesc &desc, int block_id, + bool persistable, Scope *scope) { + PADDLE_ENFORCE_NOT_NULL(scope); + + auto &global_block = desc.Block(block_id); + + const auto *anc = scope; + PADDLE_ENFORCE(anc->parent() != anc); + while (anc->parent()) { + anc = anc->parent(); + } + + int num_vars = 0; + for (auto &var : global_block.AllVars()) { + if (var->Name() == framework::kEmptyVarName) { + continue; + } + num_vars++; + + if (persistable == var->Persistable()) { + if (persistable) { + if (!anc->FindVar(var->Name())) { + auto *ptr = const_cast(anc)->Var(var->Name()); + VLOG(3) << scope << " Create persistable variable " << var->Name() + << ", which pointer is " << ptr; + InitializeVariable(ptr, var->GetType()); + } + } else { + auto *ptr = const_cast(scope)->Var(var->Name()); + VLOG(3) << scope << " Create variable " << var->Name() + << ", which pointer is " << ptr; + InitializeVariable(ptr, var->GetType()); + } + } + } + VLOG(4) << "naive executor create " << num_vars << " vars"; +} + +void NaiveExecutor::CreateOps(const ProgramDesc &desc, int block_id, + bool with_feed_fetch_ops) { + for (const auto &op_desc : desc.Block(block_id).AllOps()) { + if (!with_feed_fetch_ops && + (op_desc->Type() == "feed" || op_desc->Type() == "fetch")) { + string::PrettyLogEndl(string::Style::detail(), "--- skip [%s], %s -> %s", + op_desc->Input("X")[0], op_desc->Type(), + op_desc->Output("Out")[0]); + continue; + } + ops_.emplace_back(OpRegistry::CreateOp(*op_desc)); + } +} + +LoDTensor *NaiveExecutor::FindTensor(const std::string &name) { + PADDLE_ENFORCE(scope_, "Need to init scope first"); + auto *var = scope_->FindVar(name); + PADDLE_ENFORCE(var, "No variable [%s] in the scope"); + auto *tensor = const_cast(&var->Get()); + return tensor; +} + +void NaiveExecutor::CleanFeedFetchOps() { + std::vector> ops; + for (auto &op : ops_) { + if (op->Type() != "feed" && op->Type() != "fetch") { + ops.emplace_back(std::move(op)); + } + } + ops_.swap(ops); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h new file mode 100644 index 00000000..5e673f68 --- /dev/null +++ b/paddle/fluid/framework/naive_executor.h @@ -0,0 +1,69 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { + +/* + * Simple, intuitive and effective. Only single thread is supported, and + * currently designed for inference. + */ +class NaiveExecutor { + public: + explicit NaiveExecutor(const platform::Place& place) : place_(place) {} + + // Create child scope. + // Create variables. + // @with_feed_fetch_ops: whether to work with the feed and fetch operators. + void Prepare(Scope* scope, const ProgramDesc& program_desc, int block_id, + bool with_feed_fetch_ops); + + // Create variables before head. + // Create parameters if persistable is ture, or create the temporary variables + // instead. + void CreateVariables(const ProgramDesc& desc, int block_id, bool persistable, + Scope* scope); + + // Run all the operators. + void Run(); + + // Get an tensor to operating directly, without the need for feed_ops. + LoDTensor* FindTensor(const std::string& name); + + Scope* scope() { return scope_; } + + void CleanFeedFetchOps(); + + protected: + void CreateOps(const ProgramDesc& desc, int block_id, + bool with_feed_fetch_ops); + + private: + const platform::Place place_; + // Catch the required resource to avoid recreate. + std::vector> ops_; + Scope* scope_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/naive_executor_test.cc b/paddle/fluid/framework/naive_executor_test.cc new file mode 100644 index 00000000..c9176306 --- /dev/null +++ b/paddle/fluid/framework/naive_executor_test.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/naive_executor.h" +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { + +TEST(NaiveExecutor, Basic) { + ProgramDesc program; + auto* main_block = program.MutableBlock(0); + auto* a = main_block->Var("a"); // input + auto* b = main_block->Var("b"); // input + auto* c = main_block->Var("c"); // input + a->SetType(proto::VarType::LOD_TENSOR); + b->SetType(proto::VarType::LOD_TENSOR); + c->SetType(proto::VarType::LOD_TENSOR); + + auto* add = main_block->AppendOp(); + add->SetType("elementwise_add"); + add->SetInput("X", {"a"}); + add->SetInput("Y", {"b"}); + add->SetOutput("Out", {"c"}); + + auto place = platform::CPUPlace(); + NaiveExecutor exe(place); + exe.Prepare(nullptr, program, 0, false); + auto* a_tensor = exe.FindTensor("a"); + auto* b_tensor = exe.FindTensor("b"); + auto* c_tensor = exe.FindTensor("c"); + + a_tensor->Resize({1, 4}); + b_tensor->Resize({1, 4}); + c_tensor->Resize({1, 4}); + b_tensor->mutable_data(place); + a_tensor->mutable_data(place); + + float a_arr[] = {0, 1, 2, 3}; + float b_arr[] = {0.0, .1, .2, .3}; + + std::copy_n(a_arr, 4, a_tensor->mutable_data(place)); + std::copy_n(b_arr, 4, b_tensor->mutable_data(place)); + + exe.Run(); + + auto* c_data = c_tensor->mutable_data(place); + for (int i = 0; i < 4; i++) { + EXPECT_NEAR(c_data[i], 1.1 * i, 1e-3); + } +} + +} // namespace framework +} // namespace paddle + +USE_OP(elementwise_add); diff --git a/paddle/fluid/framework/no_need_buffer_vars_inference.h b/paddle/fluid/framework/no_need_buffer_vars_inference.h new file mode 100644 index 00000000..2c933659 --- /dev/null +++ b/paddle/fluid/framework/no_need_buffer_vars_inference.h @@ -0,0 +1,60 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/op_desc.h" + +namespace paddle { +namespace framework { + +class NoNeedBufferVarsInference { + public: + NoNeedBufferVarsInference(const VariableNameMap &inputs, + const VariableNameMap &outputs, + const AttributeMap &attrs) + : inputs_(inputs), outputs_(outputs), attrs_(attrs) {} + + virtual ~NoNeedBufferVarsInference() = default; + + const VariableNameMap &Inputs() const { return inputs_; } + + const VariableNameMap &Outputs() const { return outputs_; } + + const AttributeMap &Attrs() const { return attrs_; } + + virtual std::unordered_set operator()() const = 0; + + private: + const VariableNameMap &inputs_; + const VariableNameMap &outputs_; + const AttributeMap &attrs_; +}; + +#define DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(class_type, ...) \ + class class_type : public ::paddle::framework::NoNeedBufferVarsInference { \ + public: \ + using ::paddle::framework::NoNeedBufferVarsInference:: \ + NoNeedBufferVarsInference; \ + \ + std::unordered_set operator()() const override { \ + return {__VA_ARGS__}; \ + } \ + } + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc new file mode 100644 index 00000000..1ea93b76 --- /dev/null +++ b/paddle/fluid/framework/op_desc.cc @@ -0,0 +1,822 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_desc.h" +#include +#include +#include // NOLINT +#include +#include +#include "glog/logging.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/shape_inference.h" +#include "paddle/fluid/framework/var_type_inference.h" + +namespace paddle { +namespace framework { + +class OpDesc; +class BlockDesc; +class CompileTimeInferShapeContext : public InferShapeContext { + public: + CompileTimeInferShapeContext(const OpDesc &op, const BlockDesc &block); + + bool HasInput(const std::string &name) const override; + + bool HasOutput(const std::string &name) const override; + + bool HasInputs(const std::string &name) const override; + + bool HasOutputs(const std::string &name) const override; + + AttrReader Attrs() const override; + + const std::vector &Inputs( + const std::string &name) const override; + + const std::vector &Outputs( + const std::string &name) const override; + + void ShareDim(const std::string &in, const std::string &out, size_t i = 0, + size_t j = 0) override { + PADDLE_ENFORCE_LT(i, Inputs(in).size()); + PADDLE_ENFORCE_LT(j, Outputs(out).size()); + const std::string &input_n = Inputs(in)[i]; + const std::string &output_n = Outputs(out)[j]; + + PADDLE_ENFORCE(input_n != framework::kEmptyVarName, "The %s[%d] is @EMPTY@", + in, i); + PADDLE_ENFORCE(output_n != framework::kEmptyVarName, + "The %s[%d] is @EMPTY@", out, j); + + auto *in_var = block_.FindVarRecursive(input_n); + auto *out_var = block_.FindVarRecursive(output_n); + + PADDLE_ENFORCE(in_var->GetType() == out_var->GetType(), + "The type of %s and %s is not the same.", input_n, output_n); + + SetDim(output_n, GetDim(input_n)); + } + + void ShareLoD(const std::string &in, const std::string &out, size_t i = 0, + size_t j = 0) const override { + PADDLE_ENFORCE_LT(i, Inputs(in).size()); + PADDLE_ENFORCE_LT(j, Outputs(out).size()); + PADDLE_ENFORCE(Inputs(in)[i] != framework::kEmptyVarName, + "The %s[%d] is @EMPTY@", in, i); + PADDLE_ENFORCE(Outputs(out)[j] != framework::kEmptyVarName, + "The %s[%d] is @EMPTY@", out, j); + auto *in_var = block_.FindVarRecursive(Inputs(in)[i]); + auto *out_var = block_.FindVarRecursive(Outputs(out)[j]); + if (in_var->GetType() != proto::VarType::LOD_TENSOR && + in_var->GetType() != proto::VarType::LOD_TENSOR_ARRAY) { + VLOG(3) << "input " << in << " is not LodTensor or LodTensorArray."; + return; + } + out_var->SetLoDLevel(in_var->GetLoDLevel()); + } + + void DecreaseLoDLevel(const std::string &in, const std::string &out, + size_t i = 0, size_t j = 0) const override { + PADDLE_ENFORCE_LT(i, Inputs(in).size()); + PADDLE_ENFORCE_LT(j, Outputs(out).size()); + PADDLE_ENFORCE(Inputs(in)[i] != framework::kEmptyVarName, + "The %s[%d] is @EMPTY@", in, i); + PADDLE_ENFORCE(Outputs(out)[j] != framework::kEmptyVarName, + "The %s[%d] is @EMPTY@", out, j); + auto *in_var = block_.FindVarRecursive(Inputs(in)[i]); + auto *out_var = block_.FindVarRecursive(Outputs(out)[j]); + PADDLE_ENFORCE(out_var->GetType() == proto::VarType::LOD_TENSOR_ARRAY || + out_var->GetType() == proto::VarType::LOD_TENSOR, + "The input %s should be LodTensorArray or LodTensor.", + out_var->Name()); + PADDLE_ENFORCE(in_var->GetType() == proto::VarType::LOD_TENSOR, + "The input %s should be LodTensor.", in_var->Name()); + if (in_var->GetLoDLevel() > 0) { + out_var->SetLoDLevel(in_var->GetLoDLevel() - 1); + } + } + + std::vector GetInputVarPtrs( + const std::string &name) override { + const std::vector arg_names = Inputs(name); + std::vector res; + res.reserve(arg_names.size()); + std::transform(arg_names.begin(), arg_names.end(), std::back_inserter(res), + [this](const std::string &name) { + return block_.FindVarRecursive(name); + }); + return res; + } + + std::vector GetOutputVarPtrs( + const std::string &name) override { + const std::vector arg_names = Outputs(name); + std::vector res; + res.reserve(arg_names.size()); + std::transform(arg_names.begin(), arg_names.end(), std::back_inserter(res), + [this](const std::string &name) { + return block_.FindVarRecursive(name); + }); + return res; + } + + DDim GetInputDim(const std::string &name) const override { + const std::vector &arg_names = Inputs(name); + PADDLE_ENFORCE_EQ(arg_names.size(), 1UL, + "Input(%s) should hold one element, but now it holds %d", + name, arg_names.size()); + return this->GetDim(arg_names[0]); + } + + std::vector GetInputsDim(const std::string &name) const override { + const std::vector &arg_names = Inputs(name); + return GetDims(arg_names); + } + + bool IsRuntime() const override; + + std::vector GetInputsVarType( + const std::string &name) const override { + return GetVarTypes(Inputs(name)); + } + + std::vector GetOutputsVarType( + const std::string &name) const override { + return GetVarTypes(Outputs(name)); + } + + void SetOutputDim(const std::string &name, const DDim &dim) override { + auto &arg_names = Outputs(name); + PADDLE_ENFORCE_EQ(arg_names.size(), 1UL, + "Output(%s) should hold one element, but now it holds %d", + name, arg_names.size()); + SetDim(arg_names[0], dim); + } + + void SetOutputsDim(const std::string &name, + const std::vector &dims) override { + auto &names = Outputs(name); + SetDims(names, dims); + } + + protected: + std::vector GetVarTypes( + const std::vector &names) const { + std::vector retv; + retv.resize(names.size()); + std::transform( + names.begin(), names.end(), retv.begin(), + std::bind(std::mem_fn(&CompileTimeInferShapeContext::GetVarType), this, + std::placeholders::_1)); + return retv; + } + + proto::VarType::Type GetVarType(const std::string &name) const; + + DDim GetDim(const std::string &name) const { + auto var = block_.FindVarRecursive(name); + PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name); + DDim res; + try { + auto shape = var->GetShape(); + res = shape.empty() ? make_ddim({0UL}) : make_ddim(shape); + } catch (...) { + VLOG(5) << "GetDim of variable " << name << " error"; + std::rethrow_exception(std::current_exception()); + } + return res; + } + + std::vector GetDims(const std::vector &names) const { + std::vector ret; + ret.reserve(names.size()); + std::transform( + names.begin(), names.end(), std::back_inserter(ret), + [this](const std::string &name) { return this->GetDim(name); }); + return ret; + } + + void SetDim(const std::string &name, const DDim &dim); + + void SetDims(const std::vector &names, + const std::vector &dims) { + size_t length = names.size(); + PADDLE_ENFORCE_EQ(length, dims.size()); + for (size_t i = 0; i < length; ++i) { + if (names[i] == framework::kEmptyVarName) { + continue; + } + SetDim(names[i], dims[i]); + } + } + + std::vector GetRepeatedDims(const std::string &name) const override; + + void SetRepeatedDims(const std::string &name, + const std::vector &dims) override; + + const OpDesc &op_; + const BlockDesc &block_; +}; + +OpDesc::OpDesc(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs) { + desc_.set_type(type); + inputs_ = inputs; + outputs_ = outputs; + attrs_ = attrs; + need_update_ = true; + block_ = nullptr; +} + +OpDesc::OpDesc(const OpDesc &other, BlockDesc *block) { + CopyFrom(other); + block_ = block; + need_update_ = true; +} + +void OpDesc::CopyFrom(const OpDesc &op_desc) { + desc_.set_type(op_desc.Type()); + inputs_ = op_desc.inputs_; + outputs_ = op_desc.outputs_; + attrs_ = op_desc.attrs_; + need_update_ = true; +} + +OpDesc::OpDesc(const proto::OpDesc &desc, BlockDesc *block) + : desc_(desc), need_update_(false) { + // restore inputs_ + int input_size = desc_.inputs_size(); + for (int i = 0; i < input_size; ++i) { + const proto::OpDesc::Var &var = desc_.inputs(i); + std::vector &args = inputs_[var.parameter()]; + int argu_size = var.arguments_size(); + args.reserve(argu_size); + for (int j = 0; j < argu_size; ++j) { + args.push_back(var.arguments(j)); + } + } + // restore outputs_ + int output_size = desc_.outputs_size(); + for (int i = 0; i < output_size; ++i) { + const proto::OpDesc::Var &var = desc_.outputs(i); + std::vector &args = outputs_[var.parameter()]; + int argu_size = var.arguments_size(); + args.reserve(argu_size); + for (int j = 0; j < argu_size; ++j) { + args.push_back(var.arguments(j)); + } + } + // restore attrs_ + for (const proto::OpDesc::Attr &attr : desc_.attrs()) { + std::string attr_name = attr.name(); + // The sub_block referred to by the BLOCK attr hasn't been added + // to ProgramDesc class yet, we skip setting BLOCK/BLOCKS attr here. + if (attr.type() != proto::AttrType::BLOCK && + attr.type() != proto::AttrType::BLOCKS) { + attrs_[attr_name] = GetAttrValue(attr); + } + } + this->block_ = block; +} + +proto::OpDesc *OpDesc::Proto() { + Flush(); + return &desc_; +} + +const std::vector &OpDesc::Input(const std::string &name) const { + auto it = inputs_.find(name); + PADDLE_ENFORCE(it != inputs_.end(), "Input %s cannot be found in Op %s", name, + Type()); + return it->second; +} + +std::vector OpDesc::InputArgumentNames() const { + std::vector retv; + for (auto &ipt : this->inputs_) { + retv.insert(retv.end(), ipt.second.begin(), ipt.second.end()); + } + return retv; +} + +void OpDesc::SetInput(const std::string ¶m_name, + const std::vector &args) { + need_update_ = true; + inputs_[param_name] = args; +} + +const std::vector &OpDesc::Output(const std::string &name) const { + auto it = outputs_.find(name); + PADDLE_ENFORCE(it != outputs_.end(), "Output %s cannot be found in Op %s", + name, Type()); + return it->second; +} + +std::vector OpDesc::OutputArgumentNames() const { + std::vector retv; + for (auto &ipt : this->outputs_) { + retv.insert(retv.end(), ipt.second.begin(), ipt.second.end()); + } + return retv; +} + +void OpDesc::SetOutput(const std::string ¶m_name, + const std::vector &args) { + need_update_ = true; + this->outputs_[param_name] = args; +} + +bool OpDesc::HasProtoAttr(const std::string &name) const { + auto &op_info = OpInfoMap::Instance(); + if (op_info.Has(desc_.type())) { + auto op_info_ptr = op_info.Get(desc_.type()); + if (op_info_ptr.HasOpProtoAndChecker()) { + const proto::OpProto &proto = op_info_ptr.Proto(); + for (int i = 0; i != proto.attrs_size(); ++i) { + const proto::OpProto::Attr &attr = proto.attrs(i); + if (attr.name() == name) { + return true; + } + } + } + } + return false; +} + +proto::AttrType OpDesc::GetAttrType(const std::string &name) const { + auto it = attrs_.find(name); + PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); + return static_cast(it->second.which() - 1); +} + +std::vector OpDesc::AttrNames() const { + std::vector retv; + retv.reserve(attrs_.size()); + for (auto &attr : attrs_) { + retv.push_back(attr.first); + } + return retv; +} + +void OpDesc::RemoveAttr(const std::string &name) { + attrs_.erase(name); + need_update_ = true; +} + +void OpDesc::SetAttr(const std::string &name, const Attribute &v) { + // NOTICE(minqiyang): pybind11 will take the empty list in python as + // the std::vector type in C++; so we have to change the attr's type + // here if we meet this issue + proto::AttrType attr_type = static_cast(v.which() - 1); + if (attr_type == proto::AttrType::INTS && + boost::get>(v).size() == 0u) { + // Find current attr via attr name and set the correct attribute value + const proto::OpProto::Attr &attr = GetProtoAttr(name); + switch (attr.type()) { + case proto::AttrType::BOOLEANS: { + VLOG(11) << "SetAttr: " << Type() << ", " << name + << " from INTS to BOOLEANS"; + this->attrs_[name] = std::vector(); + break; + } + case proto::AttrType::INTS: { + VLOG(11) << "SetAttr: " << Type() << ", " << name + << " from INTS to INTS"; + this->attrs_[name] = std::vector(); + break; + } + case proto::AttrType::LONGS: { + VLOG(11) << "SetAttr: " << Type() << ", " << name + << " from LONGS to LONGS"; + this->attrs_[name] = std::vector(); + break; + } + case proto::AttrType::FLOATS: { + VLOG(11) << "SetAttr: " << Type() << ", " << name + << " from INTS to FLOATS"; + this->attrs_[name] = std::vector(); + break; + } + case proto::AttrType::STRINGS: { + VLOG(11) << "SetAttr: " << Type() << ", " << name + << " from INTS to STRINGS"; + this->attrs_[name] = std::vector(); + break; + } + case proto::AttrType::BLOCKS: { + VLOG(11) << "SetAttr: " << Type() << ", " << name + << " from INTS to BLOCKS"; + this->SetBlocksAttr(name, std::vector()); + return; + } + default: + PADDLE_THROW("Wrong attr type %d", attr.type()); + } + need_update_ = true; + return; + } + + this->attrs_[name] = v; + need_update_ = true; +} + +void OpDesc::SetBlockAttr(const std::string &name, BlockDesc *block) { + this->attrs_[name] = block; + need_update_ = true; +} + +void OpDesc::SetBlocksAttr(const std::string &name, + std::vector blocks) { + this->attrs_[name] = blocks; + need_update_ = true; +} + +void OpDesc::SetAttrMap( + const std::unordered_map &attr_map) { + attrs_ = attr_map; + need_update_ = true; +} + +Attribute OpDesc::GetAttr(const std::string &name) const { + auto it = attrs_.find(name); + PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); + return it->second; +} + +const proto::OpProto::Attr &OpDesc::GetProtoAttr( + const std::string &name) const { + const proto::OpProto &proto = OpInfoMap::Instance().Get(Type()).Proto(); + for (int i = 0; i != proto.attrs_size(); ++i) { + const proto::OpProto::Attr &attr = proto.attrs(i); + if (attr.name() == name) { + return attr; + } + } + + PADDLE_THROW("Attribute %s is not found in proto %s", name, proto.type()); +} + +Attribute OpDesc::GetNullableAttr(const std::string &name) const { + auto it = attrs_.find(name); + if (it != attrs_.end()) { + return it->second; + } else { + return Attribute(); + } +} + +std::vector OpDesc::GetBlocksAttrIds(const std::string &name) const { + auto it = attrs_.find(name); + PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); + auto blocks = boost::get>(it->second); + + std::vector ids; + for (auto n : blocks) { + ids.push_back(n->ID()); + } + + return ids; +} + +int OpDesc::GetBlockAttrId(const std::string &name) const { + auto it = attrs_.find(name); + PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); + return boost::get(it->second)->ID(); +} + +const std::unordered_map &OpDesc::GetAttrMap() const { + return attrs_; +} + +void OpDesc::Rename(const std::string &old_name, const std::string &new_name) { + RenameInput(old_name, new_name); + RenameOutput(old_name, new_name); + need_update_ = true; +} + +void OpDesc::RenameOutput(const std::string &old_name, + const std::string &new_name) { + for (auto &output : outputs_) { + std::replace(output.second.begin(), output.second.end(), old_name, + new_name); + } + + auto it = attrs_.find(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName()); + if (it != attrs_.end()) { + auto &op_vars = boost::get>(it->second); + std::replace(op_vars.begin(), op_vars.end(), old_name, new_name); + } + + need_update_ = true; +} + +void OpDesc::RenameInput(const std::string &old_name, + const std::string &new_name) { + for (auto &input : inputs_) { + std::replace(input.second.begin(), input.second.end(), old_name, new_name); + } + + auto it = attrs_.find(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName()); + if (it != attrs_.end()) { + auto &op_vars = boost::get>(it->second); + std::replace(op_vars.begin(), op_vars.end(), old_name, new_name); + } + + need_update_ = true; +} + +struct SetAttrDescVisitor : public boost::static_visitor { + explicit SetAttrDescVisitor(proto::OpDesc::Attr *attr) : attr_(attr) {} + mutable proto::OpDesc::Attr *attr_; + void operator()(int v) const { attr_->set_i(v); } + void operator()(float v) const { attr_->set_f(v); } + void operator()(const std::string &v) const { attr_->set_s(v); } + + // Please refer to https://github.com/PaddlePaddle/Paddle/issues/7162 + template ::value>::type> + void operator()(T b) const { + attr_->set_b(b); + } + + void operator()(const std::vector &v) const { + VectorToRepeated(v, attr_->mutable_ints()); + } + void operator()(const std::vector &v) const { + VectorToRepeated(v, attr_->mutable_floats()); + } + void operator()(const std::vector &v) const { + VectorToRepeated(v, attr_->mutable_strings()); + } + void operator()(const std::vector &v) const { + VectorToRepeated(v, attr_->mutable_bools()); + } + void operator()(const std::vector &v) const { + std::vector blocks_idx; + for (auto blk : v) { + blocks_idx.push_back(blk->ID()); + } + VectorToRepeated(blocks_idx, attr_->mutable_blocks_idx()); + } + + void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->ID()); } + + void operator()(int64_t v) const { attr_->set_l(v); } + + void operator()(const std::vector &v) const { + VectorToRepeated(v, attr_->mutable_longs()); + } + + void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); } +}; + +void OpDesc::Flush() { + if (need_update_) { + this->desc_.mutable_inputs()->Clear(); + for (auto &ipt : inputs_) { + auto *input = desc_.add_inputs(); + input->set_parameter(ipt.first); + VectorToRepeated(ipt.second, input->mutable_arguments()); + } + + this->desc_.mutable_outputs()->Clear(); + for (auto &opt : outputs_) { + auto *output = desc_.add_outputs(); + output->set_parameter(opt.first); + VectorToRepeated(opt.second, output->mutable_arguments()); + } + + this->desc_.mutable_attrs()->Clear(); + for (auto &attr : attrs_) { + auto *attr_desc = desc_.add_attrs(); + attr_desc->set_name(attr.first); + attr_desc->set_type( + static_cast(attr.second.which() - 1)); + SetAttrDescVisitor visitor(attr_desc); + boost::apply_visitor(visitor, attr.second); + } + + need_update_ = false; + } +} + +static std::once_flag init_infer_shape_funcs; + +/** + * NOTE(paddle-dev): Very tricky code here. Maybe we should find a + * better way to register compile-time infershape method gentlely. + * + * Normally, we can register a class derived from InferShapeBase, so that + * we can set the field of `infer_shape_` inside OpInfo when registering op. + * + * However, there is another way we can set the field of `infer_shape_` inside + * OpInfo. Usually, we overload InferShape method of OperatorWithKernel. After + * running the following method InitInferShapeFuncs, `infer_shape_` would be set + * to be the InferShape method of OperatorWithKernel. That is to say, we borrow + * the run-time InferShape method of OperatorWithKernel to be the compile-time + * InferShape method. + * + * However, during compiling time, we may not know inputs, outputs and attrs of + * run-time OperatorWithKernel. So the following code creates a fake + * OperatorWithKernel object. That is why the field info_ of OperatorBase + * would be null. + */ +static void InitInferShapeFuncs() { + std::call_once(init_infer_shape_funcs, [] { + auto &map = OpInfoMap::Instance(); + auto &info_map = *map.mutable_map(); + + for (auto &kern_pair : OperatorWithKernel::AllOpKernels()) { + auto op_type = kern_pair.first; + auto it = info_map.find(op_type); + PADDLE_ENFORCE(it != info_map.end(), "%s has not been registered", + op_type); + auto &op_info = it->second; + if (op_info.infer_shape_) { // infer_shape has been registered. + continue; + } + + auto op = dynamic_cast(op_info.Creator()( + "", VariableNameMap{}, VariableNameMap{}, AttributeMap{})); + + PADDLE_ENFORCE_NOT_NULL( + op, "InferShapeBase is not registered to Operator %s", op_type); + + op_info.infer_shape_ = [op](InferShapeContext *ctx) { + op->InferShape(ctx); + }; + } + }); +} + +void OpDesc::CheckAttrs() { + PADDLE_ENFORCE(!Type().empty(), + "CheckAttr() can not be called before type is setted."); + auto *checker = OpInfoMap::Instance().Get(Type()).Checker(); + if (checker == nullptr) { + // checker is not configured. That operator could be generated by Paddle, + // not by users. + return; + } + VLOG(10) << "begin to check attribute of " << Type(); + checker->Check(&attrs_); +} + +void OpDesc::InferShape(const BlockDesc &block) const { + VLOG(3) << "CompileTime infer shape on " << Type(); + InitInferShapeFuncs(); + auto &infer_shape = OpInfoMap::Instance().Get(this->Type()).infer_shape_; + PADDLE_ENFORCE(static_cast(infer_shape), + "%s's infer_shape has not been registered", this->Type()); + CompileTimeInferShapeContext ctx(*this, block); + if (VLOG_IS_ON(10)) { + std::ostringstream sout; + auto inames = this->InputArgumentNames(); + sout << " From ["; + std::copy(inames.begin(), inames.end(), + std::ostream_iterator(sout, ", ")); + sout << "] to ["; + auto onames = this->OutputArgumentNames(); + std::copy(onames.begin(), onames.end(), + std::ostream_iterator(sout, ", ")); + sout << "]"; + VLOG(10) << sout.str(); + } + infer_shape(&ctx); +} + +void OpDesc::InferVarType(BlockDesc *block) const { + // There are a few places that var type can be set. + // When VarDesc is created, default set to LOD_TENSOR. + // When output variable is created, default is defaut set to LOD_TENSOR. + // We limit here to be the only place that operator defines its customized + // var type inference. Hence, we don't do any "default" setting here. + auto &info = OpInfoMap::Instance().Get(this->Type()); + if (info.infer_var_type_) { + InferVarTypeContext context(this, block); + info.infer_var_type_(&context); + } +} + +CompileTimeInferShapeContext::CompileTimeInferShapeContext( + const OpDesc &op, const BlockDesc &block) + : op_(op), block_(block) {} + +bool CompileTimeInferShapeContext::HasInput(const std::string &name) const { + const std::vector &input_names = op_.Input(name); + auto length = input_names.size(); + if (length == 0) { + return false; + } + PADDLE_ENFORCE_EQ(length, 1UL, + "Input(%s) should have only one value, " + "but it have %d now", + name, length); + return block_.HasVarRecursive(input_names[0]); +} + +bool CompileTimeInferShapeContext::HasOutput(const std::string &name) const { + const std::vector &output_names = op_.Output(name); + auto length = output_names.size(); + if (length == 0) { + return false; + } + PADDLE_ENFORCE_EQ(length, 1UL, + "Output(%s) should have only one value, " + "but it have %d now", + name, length); + return block_.HasVarRecursive(output_names[0]); +} + +bool CompileTimeInferShapeContext::HasInputs(const std::string &name) const { + const std::vector &input_names = op_.Input(name); + if (input_names.empty()) { + return false; + } + for (auto &input : input_names) { + if (!block_.HasVarRecursive(input)) return false; + } + return true; +} + +bool CompileTimeInferShapeContext::HasOutputs(const std::string &name) const { + const std::vector &output_names = op_.Output(name); + if (output_names.empty()) { + return false; + } + for (auto &output : output_names) { + if (!block_.HasVarRecursive(output)) return false; + } + return true; +} + +AttrReader CompileTimeInferShapeContext::Attrs() const { + return AttrReader(op_.GetAttrMap()); +} + +const std::vector &CompileTimeInferShapeContext::Inputs( + const std::string &name) const { + return op_.Input(name); +} + +const std::vector &CompileTimeInferShapeContext::Outputs( + const std::string &name) const { + return op_.Output(name); +} + +std::vector CompileTimeInferShapeContext::GetRepeatedDims( + const std::string &name) const { + auto var = block_.FindVarRecursive(name); + PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name); + std::vector res; + try { + auto shapes = var->GetShapes(); + for (const auto &s : shapes) { + res.push_back(s.empty() ? make_ddim({0UL}) : make_ddim(s)); + } + } catch (...) { + VLOG(5) << "GetRepeatedDim of variable " << name << " error."; + std::rethrow_exception(std::current_exception()); + } + return res; +} + +void CompileTimeInferShapeContext::SetDim(const std::string &name, + const DDim &dim) { + block_.FindVarRecursive(name)->SetShape(vectorize(dim)); +} + +void CompileTimeInferShapeContext::SetRepeatedDims( + const std::string &name, const std::vector &dims) { + auto var = block_.FindVarRecursive(name); + PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name); + std::vector> dim_vec(dims.size()); + std::transform(dims.begin(), dims.end(), dim_vec.begin(), vectorize); + var->SetShapes(dim_vec); +} + +bool CompileTimeInferShapeContext::IsRuntime() const { return false; } + +proto::VarType::Type CompileTimeInferShapeContext::GetVarType( + const std::string &name) const { + return block_.FindVarRecursive(name)->GetType(); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h new file mode 100644 index 00000000..dedaf243 --- /dev/null +++ b/paddle/fluid/framework/op_desc.h @@ -0,0 +1,153 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/framework/var_desc.h" + +namespace paddle { +namespace framework { + +class BlockDesc; +class ProgramDesc; +class OpDesc { + public: + OpDesc() {} + + OpDesc(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs); + + OpDesc(const proto::OpDesc &desc, BlockDesc *block); + + explicit OpDesc(BlockDesc *block) : block_(block) {} + + OpDesc(const OpDesc &other, BlockDesc *block); + + void CopyFrom(const OpDesc &op_desc); + + proto::OpDesc *Proto(); + + std::string Type() const { return desc_.type(); } + + void SetType(const std::string &type) { desc_.set_type(type); } + + const std::vector &Input(const std::string &name) const; + + std::vector InputArgumentNames() const; + + void SetInput(const std::string ¶m_name, + const std::vector &args); + + const std::vector &Output(const std::string &name) const; + + std::vector OutputArgumentNames() const; + + void SetOutput(const std::string ¶m_name, + const std::vector &args); + + bool HasAttr(const std::string &name) const { + return attrs_.find(name) != attrs_.end(); + } + + bool HasProtoAttr(const std::string &name) const; + + proto::AttrType GetAttrType(const std::string &name) const; + + std::vector AttrNames() const; + + void SetAttr(const std::string &name, const Attribute &v); + void RemoveAttr(const std::string &name); + + void SetBlockAttr(const std::string &name, BlockDesc *block); + + void SetBlocksAttr(const std::string &name, std::vector blocks); + + Attribute GetAttr(const std::string &name) const; + + const proto::OpProto::Attr &GetProtoAttr(const std::string &name) const; + + Attribute GetNullableAttr(const std::string &name) const; + + int GetBlockAttrId(const std::string &name) const; + + std::vector GetBlocksAttrIds(const std::string &name) const; + + void Rename(const std::string &old_name, const std::string &new_name); + + void RenameOutput(const std::string &old_name, const std::string &new_name); + + void RenameInput(const std::string &old_name, const std::string &new_name); + + // Only be used in C++ + const AttributeMap &GetAttrMap() const; + + // Only be used in C++ + void SetAttrMap(const AttributeMap &attr_map); + + std::vector InputNames() const { return MapKeys(inputs_); } + std::vector OutputNames() const { return MapKeys(outputs_); } + + const VariableNameMap &Inputs() const { return inputs_; } + + const VariableNameMap &Outputs() const { return outputs_; } + + AttributeMap *MutableAttrMap() { + this->need_update_ = true; + return &this->attrs_; + } + + void CheckAttrs(); + + void InferShape(const BlockDesc &block) const; + + void InferVarType(BlockDesc *block) const; + + void SetIsTarget(bool is_target) { desc_.set_is_target(is_target); } + + void Flush(); + + BlockDesc *Block() { return this->block_; } + + const BlockDesc *Block() const { return this->block_; } + + private: + template + static std::vector MapKeys(const MapType &map) { + std::vector ret_val; + ret_val.reserve(map.size()); + std::transform( + map.begin(), map.end(), std::back_inserter(ret_val), + [](const typename MapType::value_type &pair) { return pair.first; }); + return ret_val; + } + + proto::OpDesc desc_; + BlockDesc *block_; // not_own + // input arg name => input variable names + VariableNameMap inputs_; + // output arg name => output variable names + VariableNameMap outputs_; + AttributeMap attrs_; + + // need_update_ indicate there some local changes not be synchronized. If + // local changes should be synchronized, need_update_ should be set to true. + bool need_update_{false}; +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_info.cc b/paddle/fluid/framework/op_info.cc new file mode 100644 index 00000000..c815e194 --- /dev/null +++ b/paddle/fluid/framework/op_info.cc @@ -0,0 +1,43 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_info.h" +#include +#include +#include + +namespace paddle { +namespace framework { + +// C++11 removes the need for manual locking. Concurrent execution shall wait if +// a static local variable is already being initialized. +// https://stackoverflow.com/questions/11711920/how-to-implement-multithread-safe-singleton-in-c11-without-using-mutex +OpInfoMap& OpInfoMap::Instance() { + static OpInfoMap g_op_info_map; + return g_op_info_map; +} + +std::vector OpInfoMap::GetUseDefaultGradOpDescMakerOps() const { + // Use set to sort op names + std::set result_ops; + for (auto& pair : map_) { + if (pair.second.use_default_grad_op_desc_maker_) { + result_ops.insert(pair.first); + } + } + return std::vector(result_ops.begin(), result_ops.end()); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_info.h b/paddle/fluid/framework/op_info.h new file mode 100644 index 00000000..daa72769 --- /dev/null +++ b/paddle/fluid/framework/op_info.h @@ -0,0 +1,123 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/no_need_buffer_vars_inference.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace framework { + +class InferShapeBase { + public: + virtual ~InferShapeBase() = default; + virtual void operator()(InferShapeContext*) const = 0; +}; + +struct OpInfo { + OpCreator creator_; + GradOpMakerFN grad_op_maker_; + proto::OpProto* proto_{nullptr}; + OpAttrChecker* checker_{nullptr}; + InferVarTypeFN infer_var_type_; + InferShapeFN infer_shape_; + InferInplaceOpFN infer_inplace_; + InferNoNeedBufferVarsFN infer_no_need_buffer_vars_; + + // NOTE(zjl): this flag is added to check whether + // the grad maker is the default one. + bool use_default_grad_op_desc_maker_{false}; + + bool HasOpProtoAndChecker() const { + return proto_ != nullptr && checker_ != nullptr; + } + + const proto::OpProto& Proto() const { + PADDLE_ENFORCE_NOT_NULL(proto_, "Operator Proto has not been registered"); + PADDLE_ENFORCE(proto_->IsInitialized(), + "Operator Proto must be initialized in op info"); + return *proto_; + } + + const OpCreator& Creator() const { + PADDLE_ENFORCE_NOT_NULL(creator_, + "Operator Creator has not been registered"); + return creator_; + } + + const GradOpMakerFN& GradOpMaker() const { + PADDLE_ENFORCE_NOT_NULL(grad_op_maker_, + "Operator GradOpMaker has not been registered."); + return grad_op_maker_; + } + + const OpAttrChecker* Checker() const { return checker_; } + + const InferNoNeedBufferVarsFN& NoNeedBufferVarsInferer() const { + return infer_no_need_buffer_vars_; + } +}; + +class OpInfoMap { + public: + static OpInfoMap& Instance(); + + bool Has(const std::string& op_type) const { + return map_.find(op_type) != map_.end(); + } + + void Insert(const std::string& type, const OpInfo& info) { + PADDLE_ENFORCE(!Has(type), "Operator %s has been registered", type); + map_.insert({type, info}); + } + + const OpInfo& Get(const std::string& type) const { + auto op_info_ptr = GetNullable(type); + PADDLE_ENFORCE_NOT_NULL(op_info_ptr, "Operator %s has not been registered", + type); + return *op_info_ptr; + } + + const OpInfo* GetNullable(const std::string& type) const { + auto it = map_.find(type); + if (it == map_.end()) { + return nullptr; + } else { + return &it->second; + } + } + + const std::unordered_map& map() const { return map_; } + + std::unordered_map* mutable_map() { return &map_; } + + std::vector GetUseDefaultGradOpDescMakerOps() const; + + private: + OpInfoMap() = default; + std::unordered_map map_; + + DISABLE_COPY_AND_ASSIGN(OpInfoMap); +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_kernel_type.cc b/paddle/fluid/framework/op_kernel_type.cc new file mode 100644 index 00000000..6d4801e4 --- /dev/null +++ b/paddle/fluid/framework/op_kernel_type.cc @@ -0,0 +1,54 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_kernel_type.h" + +namespace paddle { +namespace framework { + +size_t OpKernelType::Hash::operator()(const OpKernelType& key) const { + int cur_loc = 0; + + int place = key.place_.which(); + cur_loc += OpKernelType::kPlaceBits; + + int data_type = static_cast(key.data_type_) << cur_loc; + cur_loc += OpKernelType::kPrimaryDTypeBits; + + int data_layout = static_cast(key.data_layout_) << cur_loc; + cur_loc += OpKernelType::kLayoutBits; + + int library_type = static_cast(key.library_type_) << cur_loc; + cur_loc += OpKernelType::kLibBits; + + int customized_value = key.customized_type_value_; + PADDLE_ENFORCE(customized_value < (1 << OpKernelType::kCustomizeBits)); + customized_value = customized_value << cur_loc; + cur_loc += OpKernelType::kCustomizeBits; + PADDLE_ENFORCE(cur_loc < 64); + + std::hash hasher; + return hasher(place + data_type + data_layout + library_type + + customized_value); +} + +bool OpKernelType::operator==(const OpKernelType& o) const { + return platform::places_are_same_class(place_, o.place_) && + data_type_ == o.data_type_ && data_layout_ == o.data_layout_ && + library_type_ == o.library_type_ && + customized_type_value_ == o.customized_type_value_; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_kernel_type.h b/paddle/fluid/framework/op_kernel_type.h new file mode 100644 index 00000000..9edc1a3e --- /dev/null +++ b/paddle/fluid/framework/op_kernel_type.h @@ -0,0 +1,110 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/library_type.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace framework { + +class OpKernelType { + public: + constexpr static int kDefaultCustomizedTypeValue = 0; + + // In total should be smaller than 64. + constexpr static int kPlaceBits = 4; + constexpr static int kPrimaryDTypeBits = 8; + constexpr static int kLayoutBits = 4; + constexpr static int kLibBits = 4; + constexpr static int kCustomizeBits = 4; + + OpKernelType(proto::VarType::Type data_type, platform::Place place, + DataLayout data_layout = DataLayout::kAnyLayout, + LibraryType library_type = LibraryType::kPlain, + int customized_type_value = kDefaultCustomizedTypeValue) + : data_type_(data_type), + data_layout_(data_layout), + place_(place), + library_type_(library_type), + customized_type_value_(customized_type_value) {} + + OpKernelType(proto::VarType::Type data_type, + const platform::DeviceContext& dev_ctx, + DataLayout data_layout = DataLayout::kAnyLayout, + LibraryType library_type = LibraryType::kPlain, + int customized_type_value = kDefaultCustomizedTypeValue) + : data_type_(data_type), + data_layout_(data_layout), + place_(dev_ctx.GetPlace()), + library_type_(library_type), + customized_type_value_(customized_type_value) {} + + virtual ~OpKernelType() {} + + struct Hash { + size_t operator()(const OpKernelType& key) const; + }; + + size_t hash_key() const { return Hash()(*this); } + + bool operator==(const OpKernelType& o) const; + + bool operator!=(const OpKernelType& o) const { return !(*this == o); } + + proto::VarType::Type data_type_; + DataLayout data_layout_; + platform::Place place_; + LibraryType library_type_; + int customized_type_value_; +}; + +inline std::ostream& operator<<(std::ostream& os, + const OpKernelType& kernel_key) { + os << "data_type[" << kernel_key.data_type_ << "]:data_layout[" + << kernel_key.data_layout_ << "]:place[" << kernel_key.place_ + << "]:library_type[" << kernel_key.library_type_ << "]"; + return os; +} + +inline std::string KernelTypeToString(const OpKernelType& kernel_key) { + std::ostringstream stream; + stream << kernel_key; + return stream.str(); +} + +inline bool NeedTransformLayout(const DataLayout& l, const DataLayout& r) { + bool ret = + (l != DataLayout::kAnyLayout && r != DataLayout::kAnyLayout && l != r); +#ifdef PADDLE_WITH_MKLDNN + // Layout transform needed for either non-MKLDNN to MKLDNN or vice versa + ret |= (l != DataLayout::kMKLDNN && r == DataLayout::kMKLDNN); + ret |= (l == DataLayout::kMKLDNN && r != DataLayout::kMKLDNN); +#endif + return ret; +} + +inline bool NeedTransform(const OpKernelType& l, const OpKernelType& r) { + return (!platform::places_are_same_class(l.place_, r.place_)) || + (l.data_type_ != r.data_type_) || + NeedTransformLayout(l.data_layout_, r.data_layout_); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_kernel_type_test.cc b/paddle/fluid/framework/op_kernel_type_test.cc new file mode 100644 index 00000000..40db8540 --- /dev/null +++ b/paddle/fluid/framework/op_kernel_type_test.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_kernel_type.h" +#include +#include + +TEST(OpKernelType, ToString) { + using OpKernelType = paddle::framework::OpKernelType; + using DataType = paddle::framework::proto::VarType; + using CPUPlace = paddle::platform::CPUPlace; + using DataLayout = paddle::framework::DataLayout; + using LibraryType = paddle::framework::LibraryType; + + OpKernelType op_kernel_type(DataType::FP32, CPUPlace(), DataLayout::kNCHW, + LibraryType::kCUDNN); + + ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type), + "data_type[float]:data_layout[NCHW]:place[CPUPlace]:library_type[" + "CUDNN]"); + + using CUDAPlace = paddle::platform::CUDAPlace; + OpKernelType op_kernel_type2(DataType::FP16, CUDAPlace(0), DataLayout::kNCHW, + LibraryType::kCUDNN); + ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type2), + "data_type[::paddle::platform::float16]:data_layout[NCHW]:place[" + "CUDAPlace(0)]:library_" + "type[CUDNN]"); +} + +TEST(OpKernelType, Hash) { + using OpKernelType = paddle::framework::OpKernelType; + using DataType = paddle::framework::proto::VarType; + using CPUPlace = paddle::platform::CPUPlace; + using CUDAPlace = paddle::platform::CUDAPlace; + using DataLayout = paddle::framework::DataLayout; + using LibraryType = paddle::framework::LibraryType; + + OpKernelType op_kernel_type_1(DataType::FP32, CPUPlace(), DataLayout::kNCHW, + LibraryType::kCUDNN); + OpKernelType op_kernel_type_2(DataType::FP32, CUDAPlace(0), DataLayout::kNCHW, + LibraryType::kCUDNN); + + OpKernelType::Hash hasher; + ASSERT_NE(hasher(op_kernel_type_1), hasher(op_kernel_type_2)); +} diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc new file mode 100644 index 00000000..b502ef7a --- /dev/null +++ b/paddle/fluid/framework/op_proto_maker.cc @@ -0,0 +1,94 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_proto_maker.h" +#include +#include +#include + +namespace paddle { +namespace framework { + +void OpProtoAndCheckerMaker::Validate() { + validated_ = true; + CheckNoDuplicatedInOutAttrs(); +} + +OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddInput( + const std::string& name, const std::string& comment) { + auto* input = proto_->add_inputs(); + input->set_name(name); + input->set_comment(comment); + return OpProtoAndCheckerMaker::VariableBuilder{input}; +} + +OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddOutput( + const std::string& name, const std::string& comment) { + auto* output = proto_->add_outputs(); + output->set_name(name); + output->set_comment(comment); + return OpProtoAndCheckerMaker::VariableBuilder{output}; +} + +void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() { + std::unordered_set names; + auto checker = [&](const std::string& name) { + PADDLE_ENFORCE(!names.count(name), "[%s] is duplicated", name); + names.insert(name); + }; + for (auto& attr : proto_->attrs()) { + checker(attr.name()); + } + for (auto& input : proto_->inputs()) { + checker(input.name()); + } + for (auto& output : proto_->outputs()) { + checker(output.name()); + } +} + +void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto, + OpAttrChecker* attr_checker) { + proto_ = proto; + op_checker_ = attr_checker; + Make(); + + AddAttr(OpRoleAttrName(), "The role of this operator") + .InEnum( + {static_cast(OpRole::kForward), + static_cast(OpRole::kBackward), + static_cast(OpRole::kOptimize), static_cast(OpRole::kRPC), + static_cast(OpRole::kDist), static_cast(OpRole::kLRSched), + static_cast(OpRole::kLoss) | static_cast(OpRole::kForward), + static_cast(OpRole::kLoss) | + static_cast(OpRole::kBackward), + static_cast(OpRole::kOptimize) | + static_cast(OpRole::kLRSched), + static_cast(OpRole::kNotSpecified)}) + .SetDefault(static_cast(OpRole::kNotSpecified)); + AddAttr>(OpRoleVarAttrName(), + "Optimized for variable") + .SetDefault({}); + + AddAttr(OpNamescopeAttrName(), "Operator name with namesope.") + .SetDefault(""); + + AddAttr>(OpCreationCallstackAttrName(), + "Callstack for Op Creatation.") + .SetDefault({}); + + Validate(); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h new file mode 100644 index 00000000..5f3ce60e --- /dev/null +++ b/paddle/fluid/framework/op_proto_maker.h @@ -0,0 +1,108 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "glog/logging.h" +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/framework.pb.h" +namespace paddle { +namespace framework { + +////////////////////////// +// Don't add more roles to make this too complicated! +////////////////////////// +enum class OpRole { + kForward = 0x0000, + kBackward = 0x0001, + kOptimize = 0x0002, + // RPC role is for send/recv related op + kRPC = 0x0004, + // Dist role is for split_byref/split_selected_rows/concat + // used for distributed training. + kDist = 0x0008, + // Tag all learning rate scheduler operators. + kLRSched = 0x0010, + + kLoss = 0x0100, + // The default value of op's role. This should be only used for unittests and + // CreateOp inside a operator. + kNotSpecified = 0x1000, +}; + +// this class not only make proto but also init attribute checkers. +class OpProtoAndCheckerMaker { + public: + static const char *OpRoleAttrName() { return "op_role"; } + static const char *OpRoleVarAttrName() { return "op_role_var"; } + static const char *OpNamescopeAttrName() { return "op_namescope"; } + static const char *OpCreationCallstackAttrName() { return "op_callstack"; } + + void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker); + + virtual void Make() = 0; + + virtual ~OpProtoAndCheckerMaker() { + CHECK(validated_) << "should call Validate after build"; + } + + protected: + struct VariableBuilder { + proto::OpProto::Var *var_; + + VariableBuilder &AsDuplicable() { + var_->set_duplicable(true); + return *this; + } + + VariableBuilder &AsIntermediate() { + var_->set_intermediate(true); + return *this; + } + + VariableBuilder &AsDispensable() { + var_->set_dispensable(true); + return *this; + } + }; + + VariableBuilder AddInput(const std::string &name, const std::string &comment); + + VariableBuilder AddOutput(const std::string &name, + const std::string &comment); + + template + TypedAttrChecker &AddAttr(const std::string &name, + const std::string &comment, + bool generated = false) { + auto *attr = proto_->add_attrs(); + attr->set_name(name); + attr->set_comment(comment); + attr->set_generated(generated); + attr->set_type(AttrTypeID()); + return op_checker_->AddAttrChecker(name); + } + + void AddComment(const std::string &comment) { proto_->set_comment(comment); } + + private: + void CheckNoDuplicatedInOutAttrs(); + void Validate(); + + proto::OpProto *proto_; + OpAttrChecker *op_checker_; + bool validated_{false}; +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_proto_maker_test.cc b/paddle/fluid/framework/op_proto_maker_test.cc new file mode 100644 index 00000000..a8030d37 --- /dev/null +++ b/paddle/fluid/framework/op_proto_maker_test.cc @@ -0,0 +1,49 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_proto_maker.h" + +#include "gtest/gtest.h" + +class TestAttrProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddAttr("scale", "scale of test op"); + AddAttr("scale", "scale of test op"); + } +}; + +TEST(ProtoMaker, DuplicatedAttr) { + paddle::framework::proto::OpProto op_proto; + paddle::framework::OpAttrChecker op_checker; + TestAttrProtoMaker proto_maker; + ASSERT_THROW(proto_maker(&op_proto, &op_checker), + paddle::platform::EnforceNotMet); +} + +class TestInOutProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("input", "input of test op"); + AddInput("input", "input of test op"); + } +}; + +TEST(ProtoMaker, DuplicatedInOut) { + paddle::framework::proto::OpProto op_proto; + paddle::framework::OpAttrChecker op_checker; + TestAttrProtoMaker proto_maker; + ASSERT_THROW(proto_maker(&op_proto, &op_checker), + paddle::platform::EnforceNotMet); +} diff --git a/paddle/fluid/framework/op_registry.cc b/paddle/fluid/framework/op_registry.cc new file mode 100644 index 00000000..346d14d4 --- /dev/null +++ b/paddle/fluid/framework/op_registry.cc @@ -0,0 +1,68 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" + +#include + +namespace paddle { +namespace framework { + +std::unique_ptr OpRegistry::CreateOp( + const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, AttributeMap attrs) { + auto& info = OpInfoMap::Instance().Get(type); + if (info.Checker() != nullptr) { + info.Checker()->Check(&attrs); + } + auto op = info.Creator()(type, inputs, outputs, attrs); + return std::unique_ptr(op); +} + +static VariableNameMap ConvertOpDescVarsToVarNameMap( + const google::protobuf::RepeatedPtrField& + op_desc_vars) { + VariableNameMap ret_val; + for (auto& var : op_desc_vars) { + auto& var_names = ret_val[var.parameter()]; + auto& var_names_in_proto = var.arguments(); + var_names.reserve(static_cast(var_names_in_proto.size())); + std::copy(var_names_in_proto.begin(), var_names_in_proto.end(), + std::back_inserter(var_names)); + } + return ret_val; +} + +std::unique_ptr OpRegistry::CreateOp( + const proto::OpDesc& op_desc) { + VLOG(1) << "CreateOp directly from OpDesc is deprecated. It should only be" + "used in unit tests. Use CreateOp(const OpDesc& op_desc) " + "instead."; + VariableNameMap inputs = ConvertOpDescVarsToVarNameMap(op_desc.inputs()); + VariableNameMap outputs = ConvertOpDescVarsToVarNameMap(op_desc.outputs()); + AttributeMap attrs; + for (auto& attr : op_desc.attrs()) { + attrs[attr.name()] = GetAttrValue(attr); + } + + return CreateOp(op_desc.type(), inputs, outputs, attrs); +} + +std::unique_ptr OpRegistry::CreateOp(const OpDesc& op_desc) { + return CreateOp(op_desc.Type(), op_desc.Inputs(), op_desc.Outputs(), + op_desc.GetAttrMap()); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h new file mode 100644 index 00000000..a53a81c2 --- /dev/null +++ b/paddle/fluid/framework/op_registry.h @@ -0,0 +1,326 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#include "glog/logging.h" // For VLOG() +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/details/op_registry.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/grad_op_desc_maker.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/shape_inference.h" + +namespace paddle { +namespace framework { + +class Registrar { + public: + // In our design, various kinds of classes, e.g., operators and kernels, + // have their corresponding registry and registrar. The action of + // registration is in the constructor of a global registrar variable, which + // are not used in the code that calls package framework, and would + // be removed from the generated binary file by the linker. To avoid such + // removal, we add Touch to all registrar classes and make USE_OP macros to + // call this method. So, as long as the callee code calls USE_OP, the global + // registrar variable won't be removed by the linker. + void Touch() {} +}; + +template +struct OperatorRegistrar : public Registrar { + explicit OperatorRegistrar(const char* op_type) { + PADDLE_ENFORCE(!OpInfoMap::Instance().Has(op_type), + "'%s' is registered more than once.", op_type); + static_assert(sizeof...(ARGS) != 0, + "OperatorRegistrar should be invoked at least by OpClass"); + OpInfo info; + details::OperatorRegistrarRecursive<0, false, ARGS...>(op_type, &info); + OpInfoMap::Instance().Insert(op_type, info); + } +}; + +class OpRegistry { + public: + static std::unique_ptr CreateOp(const std::string& type, + const VariableNameMap& inputs, + const VariableNameMap& outputs, + AttributeMap attrs); + + static std::unique_ptr CreateOp(const proto::OpDesc& op_desc); + + static std::unique_ptr CreateOp(const OpDesc& op_desc); +}; + +template +struct OpKernelRegistrarFunctor; + +template +inline void RegisterKernelClass(const char* op_type, const char* library_type, + int customized_type_value, Func func) { + std::string library(library_type); + std::string data_layout = "ANYLAYOUT"; + if (library == "MKLDNN") { + data_layout = "MKLDNNLAYOUT"; + } + OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(), + StringToDataLayout(data_layout), + StringToLibraryType(library_type), customized_type_value); + OperatorWithKernel::AllOpKernels()[op_type][key] = func; +} + +template +struct OpKernelRegistrarFunctor { + using KERNEL_TYPE = + typename std::tuple_element>::type; + + void operator()(const char* op_type, const char* library_type, + int customized_type_value) const { + using T = typename KERNEL_TYPE::ELEMENT_TYPE; + RegisterKernelClass( + op_type, library_type, customized_type_value, + + [](const framework::ExecutionContext& ctx) { + KERNEL_TYPE().Compute(ctx); + }); + constexpr auto size = std::tuple_size>::value; + OpKernelRegistrarFunctor + func; + func(op_type, library_type, customized_type_value); + } +}; + +template +struct OpKernelRegistrarFunctor { + void operator()(const char* op_type, const char* library_type, + int customized_type_value) const {} +}; + +// User can register many kernel in one place. The data type could be +// different. +template +class OpKernelRegistrar : public Registrar { + public: + explicit OpKernelRegistrar(const char* op_type, const char* library_type, + int customized_type_value) { + OpKernelRegistrarFunctor func; + func(op_type, library_type, customized_type_value); + } +}; + +template +struct OpKernelRegistrarFunctorEx; + +template +class OpKernelRegistrarEx : public Registrar { + public: + explicit OpKernelRegistrarEx(const char* op_type, const char* library_type, + int customized_type_value) { + OpKernelRegistrarFunctorEx + func; + func(op_type, library_type, customized_type_value); + } +}; + +template +struct OpKernelRegistrarFunctorEx { + void operator()(const char* op_type, const char* library_type, + int customized_type_value) const {} +}; + +template +struct OpKernelRegistrarFunctorEx { + using Functor = + typename std::tuple_element>::type; + using T = + typename std::tuple_element>::type; + + void operator()(const char* op_type, const char* library_type, + int customized_type_value) const { + RegisterKernelClass(op_type, library_type, + customized_type_value, Functor()); + + constexpr auto size = + std::tuple_size>::value; + OpKernelRegistrarFunctorEx= size, I + 2, + DataTypeAndKernelType...> + func; + func(op_type, library_type, customized_type_value); + } +}; + +// clang-format off +/** + * check if MACRO is used in GLOBAL NAMESPACE. + */ +#define STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg) \ + struct __test_global_namespace_##uniq_name##__ {}; \ + static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ + __test_global_namespace_##uniq_name##__>::value, \ + msg) + +/* + The variadic arguments should be class types derived from one of the + following classes: + OpProtoAndCheckerMaker + GradOpDescMakerBase + VarTypeInference + InferShapeBase +*/ +#define REGISTER_OPERATOR(op_type, op_class, ...) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __reg_op__##op_type, \ + "REGISTER_OPERATOR must be called in global namespace"); \ + static ::paddle::framework::OperatorRegistrar \ + __op_registrar_##op_type##__(#op_type); \ + int TouchOpRegistrar_##op_type() { \ + __op_registrar_##op_type##__.Touch(); \ + return 0; \ + } + +#define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) \ + REGISTER_OPERATOR(op_type, op_class, op_maker_class) + +/** + * Macro to register OperatorKernel. + */ +#define REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(op_type, library_type, \ + place_class, customized_name, \ + customized_type_value, ...) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __reg_op_kernel_##op_type##_##library_type##_##customized_name##__, \ + "REGISTER_OP_KERNEL must be called in " \ + "global namespace"); \ + static ::paddle::framework::OpKernelRegistrar \ + __op_kernel_registrar_##op_type##_##library_type##_##customized_name##__(\ + #op_type, #library_type, customized_type_value); \ + int TouchOpKernelRegistrar_##op_type##_##library_type##_##customized_name() {\ + __op_kernel_registrar_##op_type##_##library_type##_##customized_name##__ \ + .Touch(); \ + return 0; \ + } + +#define REGISTER_OP_KERNEL(op_type, library_type, place_class, ...) \ + REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE( \ + op_type, library_type, place_class, DEFAULT_TYPE, \ + ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \ + __VA_ARGS__) + +#define REGISTER_OP_CUDA_KERNEL(op_type, ...) \ + REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::CUDAPlace, __VA_ARGS__) + +#define REGISTER_OP_CPU_KERNEL(op_type, ...) \ + REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__) + +#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, \ + customized_name, \ + customized_type_value, \ + ...) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __reg_op_kernel_##op_type##_##library_type##_##customized_name##__, \ + "REGISTER_OP_KERNEL_EX must be called in " \ + "global namespace"); \ + static ::paddle::framework::OpKernelRegistrarEx \ + __op_kernel_registrar_##op_type##_##library_type##_##customized_name##__(\ + #op_type, #library_type, customized_type_value); \ + int TouchOpKernelRegistrar_##op_type##_##library_type##_##customized_name() {\ + __op_kernel_registrar_##op_type##_##library_type##_##customized_name##__ \ + .Touch(); \ + return 0; \ + } + +#define REGISTER_OP_CUDA_KERNEL_FUNCTOR(op_type, ...) \ + REGISTER_OP_KERNEL_EX( \ + op_type, CUDA, ::paddle::platform::CUDAPlace, DEFAULT_TYPE, \ + ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \ + __VA_ARGS__) + +#define REGISTER_OP_CPU_KERNEL_FUNCTOR(op_type, ...) \ + REGISTER_OP_KERNEL_EX( \ + op_type, CPU, ::paddle::platform::CPUPlace, DEFAULT_TYPE, \ + ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \ + __VA_ARGS__) + +/** + * Macro to mark what Operator and Kernel + * we will use and tell the compiler to + * link them into target. + */ +#define USE_OP_ITSELF(op_type) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __use_op_itself_##op_type, \ + "USE_OP_ITSELF must be called in global namespace"); \ + extern int TouchOpRegistrar_##op_type(); \ + UNUSED static int use_op_itself_##op_type##_ = TouchOpRegistrar_##op_type() + +#define USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(op_type, \ + LIBRARY_TYPE, \ + customized_name) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __use_op_kernel_##op_type##_##LIBRARY_TYPE##_##customized_name##__, \ + "USE_OP_DEVICE_KERNEL must be in global namespace"); \ + extern int \ + TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE##_##customized_name(); \ + UNUSED static int use_op_kernel_##op_type##_##LIBRARY_TYPE##_##customized_name##_ = /* NOLINT */ \ + TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE##_##customized_name() + +#define USE_OP_DEVICE_KERNEL(op_type, LIBRARY_TYPE) \ + USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(op_type, LIBRARY_TYPE, DEFAULT_TYPE) + +// TODO(fengjiayi): The following macros +// seems ugly, do we have better method? + +#ifndef PADDLE_WITH_CUDA +#define USE_OP_KERNEL(op_type) USE_OP_DEVICE_KERNEL(op_type, CPU) +#else +#define USE_OP_KERNEL(op_type) \ + USE_OP_DEVICE_KERNEL(op_type, CPU); \ + USE_OP_DEVICE_KERNEL(op_type, CUDA) +#endif + +#define USE_NO_KERNEL_OP(op_type) USE_OP_ITSELF(op_type); + +#define USE_CPU_ONLY_OP(op_type) \ + USE_OP_ITSELF(op_type); \ + USE_OP_DEVICE_KERNEL(op_type, CPU); + +#define USE_CUDA_ONLY_OP(op_type) \ + USE_OP_ITSELF(op_type); \ + USE_OP_DEVICE_KERNEL(op_type, CUDA) + +#define USE_OP(op_type) \ + USE_OP_ITSELF(op_type); \ + USE_OP_KERNEL(op_type) +// clang-format on + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_registry_test.cc b/paddle/fluid/framework/op_registry_test.cc new file mode 100644 index 00000000..04996d7b --- /dev/null +++ b/paddle/fluid/framework/op_registry_test.cc @@ -0,0 +1,370 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include + +#include "paddle/fluid/framework/op_registry.h" + +namespace pd = paddle::framework; + +namespace paddle { +namespace framework { + +class CosineOp : public OperatorBase { + public: + using OperatorBase::OperatorBase; + + private: + void RunImpl(const Scope& scope, + const platform::Place& place) const override {} +}; + +class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("input", "input of cosine op"); + AddOutput("output", "output of cosine op"); + AddAttr("scale", "scale of cosine op") + .SetDefault(1.0) + .GreaterThan(0.0); + AddComment("This is cos op"); + } +}; + +class MyTestOp : public OperatorBase { + public: + using OperatorBase::OperatorBase; + + private: + void RunImpl(const Scope& scope, + const platform::Place& place) const override {} +}; + +class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("input", "input of cosine op").AsDuplicable(); + AddOutput("output", "output of cosine op").AsIntermediate(); + auto my_checker = [](int i) { + PADDLE_ENFORCE(i % 2 == 0, "'test_attr' must be even!"); + }; + AddAttr("test_attr", "a simple test attribute") + .AddCustomChecker(my_checker); + AddComment("This is my_test op"); + } +}; +} // namespace framework +} // namespace paddle + +static void BuildVar(const std::string& param_name, + std::initializer_list arguments, + paddle::framework::proto::OpDesc::Var* var) { + var->set_parameter(param_name); + for (auto& arg_name : arguments) { + var->add_arguments(arg_name); + } +} +REGISTER_OP_WITHOUT_GRADIENT(cos_sim, paddle::framework::CosineOp, + paddle::framework::CosineOpProtoAndCheckerMaker); +REGISTER_OP_WITHOUT_GRADIENT(my_test_op, paddle::framework::MyTestOp, + paddle::framework::MyTestOpProtoAndCheckerMaker); + +TEST(OpRegistry, CreateOp) { + paddle::framework::proto::OpDesc op_desc; + op_desc.set_type("cos_sim"); + BuildVar("input", {"aa"}, op_desc.add_inputs()); + BuildVar("output", {"bb"}, op_desc.add_outputs()); + + float scale = 3.3; + auto attr = op_desc.mutable_attrs()->Add(); + attr->set_name("scale"); + attr->set_type(paddle::framework::proto::AttrType::FLOAT); + attr->set_f(scale); + + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + paddle::framework::Scope scope; + paddle::platform::CPUPlace cpu_place; + op->Run(scope, cpu_place); + float scale_get = op->Attr("scale"); + ASSERT_EQ(scale_get, scale); +} + +TEST(OpRegistry, IllegalAttr) { + paddle::framework::proto::OpDesc op_desc; + op_desc.set_type("cos_sim"); + BuildVar("input", {"aa"}, op_desc.add_inputs()); + BuildVar("output", {"bb"}, op_desc.add_outputs()); + + auto attr = op_desc.mutable_attrs()->Add(); + attr->set_name("scale"); + attr->set_type(paddle::framework::proto::AttrType::FLOAT); + attr->set_f(-2.0); + + bool caught = false; + try { + paddle::framework::OpRegistry::CreateOp(op_desc); + } catch (paddle::platform::EnforceNotMet err) { + caught = true; + std::string msg = "larger_than check fail"; + const char* err_msg = err.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(err_msg[i], msg[i]); + } + } + ASSERT_TRUE(caught); +} + +TEST(OpRegistry, DefaultValue) { + paddle::framework::proto::OpDesc op_desc; + op_desc.set_type("cos_sim"); + BuildVar("input", {"aa"}, op_desc.add_inputs()); + BuildVar("output", {"bb"}, op_desc.add_outputs()); + + ASSERT_TRUE(op_desc.IsInitialized()); + + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + paddle::framework::Scope scope; + paddle::platform::CPUPlace cpu_place; + op->Run(scope, cpu_place); + ASSERT_EQ(op->Attr("scale"), 1.0); +} + +TEST(OpRegistry, CustomChecker) { + paddle::framework::proto::OpDesc op_desc; + op_desc.set_type("my_test_op"); + BuildVar("input", {"ii"}, op_desc.add_inputs()); + BuildVar("output", {"oo"}, op_desc.add_outputs()); + + // attr 'test_attr' is not set + bool caught = false; + try { + paddle::framework::OpRegistry::CreateOp(op_desc); + } catch (paddle::platform::EnforceNotMet err) { + caught = true; + std::string msg = "Attribute 'test_attr' is required!"; + const char* err_msg = err.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(err_msg[i], msg[i]); + } + } + ASSERT_TRUE(caught); + + // set 'test_attr' set to an illegal value + auto attr = op_desc.mutable_attrs()->Add(); + attr->set_name("test_attr"); + attr->set_type(paddle::framework::proto::AttrType::INT); + attr->set_i(3); + caught = false; + try { + paddle::framework::OpRegistry::CreateOp(op_desc); + } catch (paddle::platform::EnforceNotMet err) { + caught = true; + std::string msg = "'test_attr' must be even!"; + const char* err_msg = err.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(err_msg[i], msg[i]); + } + } + ASSERT_TRUE(caught); + + // set 'test_attr' set to a legal value + op_desc.mutable_attrs()->Clear(); + attr = op_desc.mutable_attrs()->Add(); + attr->set_name("test_attr"); + attr->set_type(paddle::framework::proto::AttrType::INT); + attr->set_i(4); + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + paddle::platform::CPUPlace cpu_place; + paddle::framework::Scope scope; + op->Run(scope, cpu_place); + int test_attr = op->Attr("test_attr"); + ASSERT_EQ(test_attr, 4); +} + +TEST(OperatorRegistrar, Test) { + paddle::framework::OperatorRegistrar< + paddle::framework::CosineOp, + paddle::framework::CosineOpProtoAndCheckerMaker> + reg("cos"); +} + +namespace paddle { +namespace framework { + +class OpKernelTestMaker : public OpProtoAndCheckerMaker { + public: + void Make() { AddComment("NoGradOp, same input output. no Grad"); } +}; + +class OpWithKernelTest : public OperatorWithKernel { + public: + using OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(InferShapeContext* ctx) const override {} + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(proto::VarType::FP32, ctx.device_context()); + } +}; + +template +class OpKernelTest : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const {} +}; + +} // namespace framework +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT(op_with_kernel, + paddle::framework::OpWithKernelTest, + paddle::framework::OpKernelTestMaker); +REGISTER_OP_CPU_KERNEL( + op_with_kernel, + paddle::framework::OpKernelTest); + +REGISTER_OP_CUDA_KERNEL(op_with_kernel, + paddle::framework::OpKernelTest< + paddle::platform::CUDADeviceContext, float>); + +TEST(OperatorRegistrar, CPU) { + paddle::framework::proto::OpDesc op_desc; + paddle::platform::CPUPlace cpu_place; + paddle::framework::Scope scope; + + op_desc.set_type("op_with_kernel"); + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + + op->Run(scope, cpu_place); +} + +TEST(OperatorRegistrar, CUDA) { + paddle::framework::proto::OpDesc op_desc; + paddle::platform::CUDAPlace cuda_place(0); + paddle::framework::Scope scope; + + op_desc.set_type("op_with_kernel"); + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + + op->Run(scope, cuda_place); +} + +static int op_test_value = 0; + +using paddle::platform::CPUDeviceContext; +using paddle::platform::CUDADeviceContext; +using paddle::platform::DeviceContext; + +namespace paddle { +namespace framework { + +class OpWithMultiKernelTest : public OperatorWithKernel { + public: + using OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(InferShapeContext* ctx) const override {} + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(proto::VarType::FP32, platform::CUDAPlace(0), + DataLayout::kAnyLayout, + framework::LibraryType::kCUDNN); + } +}; + +template +class OpMultiKernelTest : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const; +}; + +template +class OpMultiKernelTest + : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const { + ++op_test_value; + } +}; + +template +class OpMultiKernelTest + : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const { + --op_test_value; + } +}; + +template +class OpMultiKernelTest2 : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const; +}; + +template +class OpMultiKernelTest2 + : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const { + op_test_value += 10; + } +}; + +template +class OpMultiKernelTest2 + : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const { + op_test_value -= 10; + } +}; + +} // namespace framework +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT(op_with_multi_kernel, + paddle::framework::OpWithMultiKernelTest, + paddle::framework::OpKernelTestMaker); +REGISTER_OP_KERNEL( + op_with_multi_kernel, CPU, paddle::platform::CPUPlace, + paddle::framework::OpMultiKernelTest); +REGISTER_OP_KERNEL( + op_with_multi_kernel, MKLDNN, paddle::platform::CPUPlace, + paddle::framework::OpMultiKernelTest2); +REGISTER_OP_KERNEL( + op_with_multi_kernel, CUDA, paddle::platform::CUDAPlace, + paddle::framework::OpMultiKernelTest); +REGISTER_OP_KERNEL( + op_with_multi_kernel, CUDNN, paddle::platform::CUDAPlace, + paddle::framework::OpMultiKernelTest2); + +TEST(OperatorRegistrar, OpWithMultiKernel) { + paddle::framework::proto::OpDesc op_desc; + paddle::platform::CUDAPlace cuda_place(0); + paddle::platform::CPUPlace cpu_place; + paddle::framework::Scope scope; + + op_desc.set_type("op_with_multi_kernel"); + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + + // TODO(qiao) add priority back + // use all available kernels + op->Run(scope, cuda_place); + EXPECT_EQ(op_test_value, -10); +} diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc new file mode 100644 index 00000000..2f7476aa --- /dev/null +++ b/paddle/fluid/framework/operator.cc @@ -0,0 +1,1214 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/data_transform.h" +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/shape_inference.h" +#include "paddle/fluid/framework/transfer_scope_cache.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/platform/profiler.h" + +DECLARE_bool(benchmark); +DEFINE_bool(check_nan_inf, false, + "Checking whether operator produce NAN/INF or not. It will be " + "extremely slow so please use this flag wisely."); +DEFINE_int32(inner_op_parallelism, 0, "number of threads for inner op"); +DEFINE_bool(fast_check_nan_inf, false, + "Fast checking NAN/INF after each operation. It will be a little" + "bit slow, much faster than check_nan_inf"); + +namespace paddle { +namespace framework { + +std::vector> kKernelPriority = { + std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN), + std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain), + std::make_tuple(platform::CPUPlace(), LibraryType::kMKLDNN), + std::make_tuple(platform::CPUPlace(), LibraryType::kPlain), +}; + +proto::VarType::Type GetDataTypeOfVar(const Variable* var) { + if (var->IsType()) { + return var->Get().type(); + } else if (var->IsType()) { + return var->Get().value().type(); + } else { + PADDLE_THROW("Var should be LoDTensor or SelectedRows"); + } +} + +static DDim GetDimsDebug(const Scope& scope, const std::string& name, + bool get_actual_dim = false) { + Variable* var = scope.FindVar(name); + if (var == nullptr) { + return DDim({-1}); + } + + if (var->IsType()) { + const LoDTensor& tensor = var->Get(); + if (UNLIKELY(!tensor.IsInitialized())) { + return DDim({-1}); + } + return tensor.dims(); + } else if (var->IsType()) { + if (get_actual_dim) { + return var->Get().value().dims(); + } else { + return var->Get().GetCompleteDims(); + } + } else { + return DDim({-1}); + } +} + +static bool VarInited(const Scope& scope, const std::string& name) { + Variable* var = scope.FindVar(name); + if (var == nullptr) return false; + return var->IsInitialized(); +} + +static std::string GetDtype(const Scope& scope, const std::string& name) { + Variable* var = scope.FindVar(name); + if (var == nullptr) { + return ""; + } + + if (var->IsType()) { + const LoDTensor& tensor = var->Get(); + if (UNLIKELY(!tensor.IsInitialized())) { + return ""; + } + return DataTypeToString(tensor.type()); + } else if (var->IsType()) { + auto tensor = var->Get().value(); + if (UNLIKELY(!tensor.IsInitialized())) { + return "uninited"; + } else { + return DataTypeToString(tensor.type()); + } + } else { + return ""; + } +} + +static int GetRowSize(const Scope& scope, const std::string& name) { + Variable* var = scope.FindVar(name); + if (var == nullptr) { + return -1; + } + + if (var->IsType()) { + return var->Get().rows().size(); + } + + return -1; +} + +static LoD GetLoDDebug(const Scope& scope, const std::string& name) { + Variable* var = scope.FindVar(name); + auto default_lod = LoD({{}}); + + if (var == nullptr) { + return default_lod; + } + + if (var->IsType()) { + const LoDTensor& tensor = var->Get(); + if (UNLIKELY(!tensor.IsInitialized())) { + return default_lod; + } + return tensor.lod(); + } else { + return default_lod; + } +} + +RuntimeContext::RuntimeContext(const VariableNameMap& innames, + const VariableNameMap& outnames, + const Scope& scope) { + for (auto& var_name_item : innames) { + std::vector& input_vars = inputs[var_name_item.first]; + input_vars.reserve(var_name_item.second.size()); + for (auto& var_name : var_name_item.second) { + input_vars.push_back(scope.FindVar(var_name)); + } + } + for (auto& var_name_item : outnames) { + std::vector& output_vars = outputs[var_name_item.first]; + output_vars.reserve(var_name_item.second.size()); + for (auto& var_name : var_name_item.second) { + output_vars.push_back(scope.FindVar(var_name)); + } + } +} + +void OperatorBase::Run(const Scope& scope, const platform::Place& place) { + try { + VLOG(4) << place << " " << DebugStringEx(&scope); + if (platform::is_gpu_place(place)) { +#ifndef PADDLE_WITH_CUDA + PADDLE_THROW("Cannot run operator on place %s", place); +#else + auto dev_id = boost::get(place).device; + platform::SetDeviceId(dev_id); +#endif + } + + // The profile has a process-wide mutex, results in serious performance + // issue + // in concurrency scenerio. Here use an `if` to fix this issue. + // Please not remove the `if`, ask @Superjomn if there are any concern. + if (platform::IsProfileEnabled()) { + platform::RecordEvent record_event(Type()); + RunImpl(scope, place); + } else { + RunImpl(scope, place); + } + + VLOG(3) << place << " " << DebugStringEx(&scope); + } catch (platform::EnforceNotMet exception) { + if (Attrs().count("sub_block") != 0) { + throw std::move(exception); + } + + auto& callstack = Attr>( + OpProtoAndCheckerMaker::OpCreationCallstackAttrName()); + + if (callstack.empty()) { + throw std::move(exception); + } + std::ostringstream sout; + sout << "Invoke operator " << Type() << " error.\n"; + sout << "Python Callstacks: \n"; + for (auto& line : callstack) { + sout << line; + } + sout << "C++ Callstacks: \n"; + sout << exception.err_str_; + exception.err_str_ = sout.str(); + throw std::move(exception); + } catch (...) { + std::rethrow_exception(std::current_exception()); + } +} + +bool OperatorBase::HasInputs(const std::string& name) const { + return inputs_.find(name) != inputs_.end(); +} + +std::string OperatorBase::Input(const std::string& name) const { + auto& ins = Inputs(name); + PADDLE_ENFORCE_LE(ins.size(), 1UL, + "Operator %s's input %s should contain only one variable.", + type_, name); + return ins.empty() ? kEmptyVarName : ins[0]; +} + +const std::vector& OperatorBase::Inputs( + const std::string& name) const { + auto it = inputs_.find(name); + PADDLE_ENFORCE(it != inputs_.end(), "Operator %s does not have the input %s.", + type_, name); + return it->second; +} + +bool OperatorBase::HasOutputs(const std::string& name) const { + if (outputs_.find(name) != outputs_.end()) { + return true; + } else { + return false; + } +} + +std::string OperatorBase::Output(const std::string& name) const { + auto& outs = Outputs(name); + PADDLE_ENFORCE_LE(outs.size(), 1UL, + "Operator %s's output %s should contain only one variable.", + type_, name); + return outs.empty() ? kEmptyVarName : outs[0]; +} + +const std::vector& OperatorBase::Outputs( + const std::string& name) const { + auto it = outputs_.find(name); + PADDLE_ENFORCE(it != outputs_.end(), + "Operator %s does not have an output called %s.", type_, name); + return it->second; +} + +std::string OperatorBase::DebugStringEx(const Scope* scope) const { + std::stringstream ss; + ss << "Op(" << type_ << "), inputs:{"; + for (auto it = inputs_.begin(); it != inputs_.end();) { + auto& input = *it; + ss << input.first << "["; + for (size_t i = 0; i < input.second.size(); ++i) { + auto var_name = input.second[i]; + ss << var_name; + if (scope) { + if (!VarInited(*scope, var_name)) { + ss << "[uninited]"; + } else { + int row_size = GetRowSize(*scope, var_name); + if (row_size >= 0) { + ss << "[row_size=" << row_size << "]"; + } + std::string dtype = GetDtype(*scope, var_name); + ss << ":" << dtype; + ss << "[" << GetDimsDebug(*scope, var_name, true) << "]"; + ss << "(" << GetLoDDebug(*scope, var_name) << ")"; + } + } + if (i != input.second.size() - 1) { + ss << ", "; + } + } + ss << "]"; + ++it; + if (it != inputs_.end()) { + ss << ", "; + } + } + ss << "}, outputs:{"; + for (auto it = outputs_.begin(); it != outputs_.end();) { + auto& output = *it; + ss << output.first << "["; + for (size_t i = 0; i < output.second.size(); ++i) { + auto var_name = output.second[i]; + ss << var_name; + if (scope) { + if (!VarInited(*scope, var_name)) { + ss << "[uninited]"; + } else { + int row_size = GetRowSize(*scope, output.second[i]); + if (row_size >= 0) { + ss << "[row_size=" << row_size << "]"; + } + std::string dtype = GetDtype(*scope, output.second[i]); + ss << ":" << dtype; + ss << "[" << GetDimsDebug(*scope, var_name, true) << "]"; + ss << "(" << GetLoDDebug(*scope, var_name) << ")"; + } + } + if (i != output.second.size() - 1) { + ss << ", "; + } + } + ss << "]"; + ++it; + if (it != outputs_.end()) { + ss << ", "; + } + } + ss << "}."; + return ss.str(); +} + +OperatorBase::OperatorBase(const std::string& type, + const VariableNameMap& inputs, + const VariableNameMap& outputs, + const AttributeMap& attrs) + : type_(type), + inputs_(inputs), + outputs_(outputs), + attrs_(attrs), + // NOTE(zjl): why op_info may be nullptr? + info_(OpInfoMap::Instance().GetNullable(type)) { + GenerateTemporaryNames(); + CheckAllInputOutputSet(); +} + +std::vector OperatorBase::InputVars() const { + std::vector ret_val; + for (auto& o : inputs_) { + ret_val.reserve(ret_val.size() + o.second.size()); + ret_val.insert(ret_val.end(), o.second.begin(), o.second.end()); + } + return ret_val; +} + +std::vector OperatorBase::OutputVars(bool has_intermediate) const { + std::vector ret_val; + if (has_intermediate) { + // push all outputs into ret_val + for (auto& o : outputs_) { + ret_val.reserve(ret_val.size() + o.second.size()); + ret_val.insert(ret_val.end(), o.second.begin(), o.second.end()); + } + return ret_val; + } + auto& info = Info(); + + // get all OpProto::Var for outputs + for (auto& o : info.Proto().outputs()) { + // ignore all intermediate output + if (o.intermediate()) continue; + auto out = outputs_.find(o.name()); + if (out != outputs_.end()) { + ret_val.reserve(ret_val.size() + out->second.size()); + ret_val.insert(ret_val.end(), out->second.begin(), out->second.end()); + } + } + return ret_val; +} + +void OperatorBase::CheckAllInputOutputSet() const { + if (info_ == nullptr || info_->proto_ == nullptr) return; + + for (auto& in : info_->Proto().inputs()) { + if (!in.dispensable()) { + PADDLE_ENFORCE(inputs_.find(in.name()) != inputs_.end(), + "Operator %s's input, %s, is not set", Type(), in.name()); + } + } + + for (auto& out : info_->Proto().outputs()) { + if (!out.dispensable()) { + PADDLE_ENFORCE(outputs_.find(out.name()) != outputs_.end(), + "Operator %s's output, %s, is not set", Type(), + out.name()); + } + } +} + +void OperatorBase::GenerateTemporaryNames() { + static std::atomic gUniqId(0UL); + for (auto& output : outputs_) { + for (auto& output_name : output.second) { + if (output_name == kTempVarName) { + output_name += type_; + output_name += "@"; + output_name += std::to_string(gUniqId.fetch_add(1)); + } + } + } +} + +static bool VarIsTensor(const Variable& var) { + return var.IsType() || var.IsType(); +} + +const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var) { + if (var.IsType()) { + return static_cast(&(var.Get())); + } else if (var.IsType()) { + return &(var.Get().value()); + } else { + PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", + ToTypeName(var.Type())); + } +} + +Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var) { + if (var->IsType()) { + return var->GetMutable(); + } else if (var->IsType()) { + return var->GetMutable()->mutable_value(); + } else { + PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", + ToTypeName(var->Type())); + } +} + +bool ExecutionContext::HasInput(const std::string& name) const { + if (!op_.HasInputs(name)) { + return false; + } + auto& ins = Inputs(name); + size_t length = ins.size(); + if (length == 0) { + return false; + } + PADDLE_ENFORCE_EQ(length, 1UL, + "Input %s should not have more than one inputs", name); + auto arg = ins[0]; + auto* var = arg == kEmptyVarName ? nullptr : scope_.FindVar(arg); + return var != nullptr; +} + +bool ExecutionContext::HasOutput(const std::string& name) const { + if (!op_.HasOutputs(name)) { + return false; + } + auto& outs = Outputs(name); + size_t length = outs.size(); + if (length == 0) { + return false; + } + PADDLE_ENFORCE_EQ(length, 1UL, + "Output %s should not have more than one inputs", name); + auto arg = outs[0]; + auto* var = arg == kEmptyVarName ? nullptr : scope_.FindVar(arg); + return var != nullptr; +} + +const Variable* ExecutionContext::InputVar(const std::string& name) const { + auto it = ctx_.inputs.find(name); + if (it == ctx_.inputs.end()) return nullptr; + + PADDLE_ENFORCE_LE(it->second.size(), 1UL, + "Operator %s's input %s should contain only one variable.", + op_.Type(), name); + return it->second.empty() ? nullptr : it->second[0]; +} + +Variable* ExecutionContext::OutputVar(const std::string& name) const { + auto it = ctx_.outputs.find(name); + if (it == ctx_.outputs.end()) return nullptr; + + PADDLE_ENFORCE_LE(it->second.size(), 1UL, + "Operator %s's output %s should contain only one variable.", + op_.Type(), name); + return it->second.empty() ? nullptr : it->second[0]; +} + +template <> +const Tensor* ExecutionContext::Input(const std::string& name) const { + return Input(name); +} + +template <> +const std::vector ExecutionContext::MultiInput( + const std::string& name) const { + auto it = ctx_.inputs.find(name); + if (it == ctx_.inputs.end()) { + return {}; + } + const std::vector& vars = it->second; + std::vector res; + res.reserve(vars.size()); + std::transform(vars.begin(), vars.end(), std::back_inserter(res), + [&](Variable* var) -> const Tensor* { + if (var == nullptr) return nullptr; + PADDLE_ENFORCE( + var->IsType(), + "should be LoDTensor, but the received type is %s", + ToTypeName(var->Type())); + return &(var->Get()); + }); + return res; +} + +template <> +Tensor* ExecutionContext::Output(const std::string& name) const { + return Output(name); +} + +template <> +std::vector ExecutionContext::MultiOutput( + const std::string& name) const { + auto it = ctx_.outputs.find(name); + if (it == ctx_.outputs.end()) { + return {}; + } + const std::vector& vars = it->second; + std::vector res; + res.reserve(vars.size()); + std::transform(vars.begin(), vars.end(), std::back_inserter(res), + [&](Variable* var) -> Tensor* { + return var == nullptr ? nullptr + : var->GetMutable(); + }); + return res; +} + +bool OpSupportGPU(const std::string& op_type) { + auto& all_kernels = OperatorWithKernel::AllOpKernels(); + auto it = all_kernels.find(op_type); + if (it == all_kernels.end()) { + // All control operator must support GPU + return true; + } + for (auto& kern_pair : it->second) { + if (platform::is_gpu_place(kern_pair.first.place_)) { + return true; + } + } + return false; +} + +class RuntimeInferShapeContext : public InferShapeContext { + public: + RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope, + const RuntimeContext& ctx) + : op_(op), ctx_(ctx) {} + + bool HasInput(const std::string& name) const override { + // has only one input + const auto& ins = ctx_.inputs; + auto it = ins.find(name); + if (it == ins.end()) { + return false; + } + const auto& in = it->second; + if (in.size() == 0) return false; + PADDLE_ENFORCE_EQ(in.size(), 1UL, + "Input %s should not have more than one inputs", name); + return in[0] != nullptr; + } + + bool HasOutput(const std::string& name) const override { + // has only one output + const auto& outs = ctx_.outputs; + auto it = outs.find(name); + if (it == outs.end()) { + return false; + } + const auto& out = it->second; + if (out.size() == 0) { + return false; + } + PADDLE_ENFORCE_EQ(out.size(), 1UL, + "Output %s should not have more than one outputs", name); + return out[0] != nullptr; + } + + bool HasInputs(const std::string& name) const override { + const auto& ins = ctx_.inputs; + auto it = ins.find(name); + if (it == ins.end() || it->second.empty()) { + return false; + } + for (auto& input : it->second) { + if (input == nullptr) { + return false; + } + } + return true; + } + + bool HasOutputs(const std::string& name) const override { + const auto& outs = ctx_.outputs; + auto it = outs.find(name); + if (it == outs.end() || it->second.empty()) { + return false; + } + for (auto& output : it->second) { + if (output == nullptr) { + return false; + } + } + return true; + } + + AttrReader Attrs() const override { return AttrReader(op_.Attrs()); } + + const std::vector& Inputs( + const std::string& name) const override { + return op_.Inputs(name); + } + + const std::vector& Outputs( + const std::string& name) const override { + return op_.Outputs(name); + } + + void ShareDim(const std::string& in, const std::string& out, size_t i = 0, + size_t j = 0) override { + auto in_it = ctx_.inputs.find(in); + auto out_it = ctx_.outputs.find(out); + PADDLE_ENFORCE(in_it != ctx_.inputs.end() && in_it->second.size() > i, + "Inputs %s should have %llu argument", in, i); + PADDLE_ENFORCE(out_it != ctx_.outputs.end() && out_it->second.size() > j, + "Outputs %s should have %llu argument", out, j); + + Variable* in_var = in_it->second[i]; + Variable* out_var = out_it->second[j]; + + PADDLE_ENFORCE(in_var->Type() == out_var->Type(), + "The type of %s and %s is not the same.", in, out); + + if (in_var->IsType()) { + auto& in_sele_rows = in_var->Get(); + auto out_sele_rows = out_var->GetMutable(); + out_sele_rows->mutable_value()->Resize(in_sele_rows.value().dims()); + out_sele_rows->set_rows(in_sele_rows.rows()); + out_sele_rows->set_height(in_sele_rows.height()); + } else if (in_var->IsType()) { + auto& in_lod_tensor = in_var->Get(); + auto* out_lod_tensor = out_var->GetMutable(); + out_lod_tensor->Resize(in_lod_tensor.dims()); + } else { + PADDLE_THROW( + "Currently, the input type of ShareDim only can be LoDTensor " + "or SelectedRows."); + } + } + + void ShareLoD(const std::string& in, const std::string& out, size_t i = 0, + size_t j = 0) const override { + auto in_it = ctx_.inputs.find(in); + auto out_it = ctx_.outputs.find(out); + PADDLE_ENFORCE(in_it != ctx_.inputs.end() && in_it->second.size() > i, + "Inputs %s should have %llu argument", in, i); + PADDLE_ENFORCE(out_it != ctx_.outputs.end() && out_it->second.size() > j, + "Outputs %s should have %llu argument", out, j); + + Variable* in_var = in_it->second.at(i); + if (!in_var->IsType()) return; + Variable* out_var = out_it->second.at(j); + PADDLE_ENFORCE(out_var->IsType(), + "The %d-th output of Output(%s) must be LoDTensor.", j, out); + auto in_tensor = in_var->Get(); + auto* out_tensor = out_var->GetMutable(); + out_tensor->set_lod(in_tensor.lod()); + +// TODO(dzhwinter) : reuse ShareLoD in most operators. +// Need to call ShareLayout explicitly in sequence related ops. +// Shall we have a better method to shared info between in/out Tensor? +#ifdef PADDLE_WITH_MKLDNN + // Fix me: ugly workaround below + // Correct solution: + // set_layout() should NOT be called here (i.e. ShareLoD). Instead, + // layout of output tensor should be set "manually" in Compute() + // of each OPKernel. The reason layout should NOT be shared between + // input and output "automatically" (now by InferShape()->ShareLoD()) + // is that layout transform may occur after InferShape(). + // Workaround: + // Skip set_layout() when input layout is kMKLDNN + // This is to avoid kMKLDNN is populated wrongly into a non-MKLDNN + // OPKernel. In all MKLDNN OPkernel, set_layout(kMKLDNN) should be called + // in Compute() + if (in_tensor.layout() != DataLayout::kMKLDNN) +#endif + out_tensor->set_layout(in_tensor.layout()); + } + + void DecreaseLoDLevel(const std::string& in, const std::string& out, + size_t i = 0, size_t j = 0) const override { + PADDLE_THROW("DecreaseLoDLevel is only used in compile time."); + } + + bool IsRuntime() const override { return true; } + + // TODO(paddle-dev): Can this be template? + std::vector GetInputVarPtrs( + const std::string& name) override { + const std::vector& vars = InputVars(name); + std::vector res; + res.reserve(vars.size()); + res.insert(res.begin(), vars.begin(), vars.end()); + return res; + } + + std::vector GetOutputVarPtrs( + const std::string& name) override { + const std::vector& vars = OutputVars(name); + std::vector res; + res.reserve(vars.size()); + res.insert(res.begin(), vars.begin(), vars.end()); + return res; + } + + DDim GetInputDim(const std::string& name) const override { + const std::vector& vars = InputVars(name); + PADDLE_ENFORCE_EQ(vars.size(), 1UL, + "Input(%s) should hold one element, but now it holds %d", + name, vars.size()); + return this->GetDim(vars[0]); + } + + std::vector GetInputsDim(const std::string& name) const override { + const std::vector& vars = InputVars(name); + return GetDims(vars); + } + + std::vector GetInputsVarType( + const std::string& name) const override { + return GetVarTypes(InputVars(name)); + } + + std::vector GetOutputsVarType( + const std::string& name) const override { + return GetVarTypes(OutputVars(name)); + } + + void SetOutputDim(const std::string& name, const DDim& dim) override { + auto& vars = OutputVars(name); + PADDLE_ENFORCE_EQ(vars.size(), 1UL, + "Output(%s) should hold one element, but now it holds %d", + name, vars.size()); + SetDim(vars[0], dim); + } + + void SetOutputsDim(const std::string& name, + const std::vector& dims) override { + auto& vars = OutputVars(name); + SetDims(vars, dims); + } + + protected: + DDim GetDim(Variable* var) const { + PADDLE_ENFORCE_NOT_NULL(var); + if (var->IsType()) { + return var->Get().dims(); + } else if (var->IsType()) { + return var->Get().GetCompleteDims(); + } else { + PADDLE_THROW( + "Only LoDTensor/SelectedRows support 'GetDim', but Variables " + "type_id is %s.", + ToTypeName(var->Type())); + } + } + + std::vector GetDims(const std::vector& vars) const { + std::vector ret; + ret.reserve(vars.size()); + std::transform(vars.begin(), vars.end(), std::back_inserter(ret), + [this](Variable* var) { return this->GetDim(var); }); + return ret; + } + + std::vector GetRepeatedDims(const std::string& name) const override { + PADDLE_THROW("Only compile time support this method"); + } + + void SetDim(Variable* var, const DDim& dim) { + if (var->IsType()) { + var->GetMutable()->Resize(dim); + } else if (var->IsType()) { + var->GetMutable()->set_height(dim[0]); + } else { + PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", + ToTypeName(var->Type())); + } + } + + void SetDims(const std::vector& vars, + const std::vector& dims) { + size_t length = vars.size(); + PADDLE_ENFORCE_EQ(length, dims.size()); + for (size_t i = 0; i < length; ++i) { + if (vars[i] == nullptr) { + continue; + } + SetDim(vars[i], dims[i]); + } + } + + void SetRepeatedDims(const std::string& name, + const std::vector& dims) override { + PADDLE_THROW("Only compile time support this method"); + } + + std::vector GetVarTypes( + const std::vector& vars) const { + std::vector retv; + retv.resize(vars.size()); + std::transform(vars.begin(), vars.end(), retv.begin(), + std::bind(std::mem_fn(&RuntimeInferShapeContext::GetVarType), + this, std::placeholders::_1)); + return retv; + } + + proto::VarType::Type GetVarType(Variable* var) const { + return ToVarType(var->Type()); + } + + private: + const std::vector& InputVars(const std::string& name) const { + auto it = ctx_.inputs.find(name); + PADDLE_ENFORCE(it != ctx_.inputs.end(), + "Operator %s does not have the input %s.", op_.Type(), name); + return it->second; + } + + const std::vector& OutputVars(const std::string& name) const { + auto it = ctx_.outputs.find(name); + PADDLE_ENFORCE(it != ctx_.outputs.end(), + "Operator %s does not have the outputs %s.", op_.Type(), + name); + return it->second; + } + + const OperatorBase& op_; + const RuntimeContext& ctx_; +}; + +static void CheckTensorNANOrInf(const std::string& op_type, + const std::string& name, + const framework::Tensor& tensor) { + if (tensor.memory_size() == 0) { + return; + } + if (tensor.type() != proto::VarType::FP32 && + tensor.type() != proto::VarType::FP64) { + return; + } + PADDLE_ENFORCE(!framework::TensorContainsInf(tensor), + "Operator %s output Tensor %s contains Inf", op_type, name); + PADDLE_ENFORCE(!framework::TensorContainsNAN(tensor), + "Operator %s output Tensor %s contains NAN", op_type, name); +} + +void OperatorWithKernel::RuntimeInferShape(const Scope& scope, + const platform::Place& place, + const RuntimeContext& ctx) const { + RuntimeInferShapeContext infer_shape_ctx(*this, scope, ctx); + this->InferShape(&infer_shape_ctx); +} + +std::vector* OperatorWithKernel::GetKernelConfig( + const OpKernelType& key) const { + auto config_iter = kernel_configs_map_.find(key); + std::vector* kernel_configs = nullptr; + if (config_iter != kernel_configs_map_.end()) { + kernel_configs = &(config_iter->second); + } + return kernel_configs; +} + +void OperatorWithKernel::RunImpl(const Scope& scope, + const platform::Place& place) const { + // To reduce the elapsed time of HasAttr, we use bool variable to record the + // result of HasAttr. + if (!enable_cache_runtime_context_ && HasAttr(kEnableCacheRuntimeContext)) + enable_cache_runtime_context_ = true; + if (!all_kernels_must_compute_runtime_shape_ && + HasAttr(kAllKernelsMustComputeRuntimeShape)) + all_kernels_must_compute_runtime_shape_ = true; + if (!enable_cache_runtime_context_) { + RuntimeContext ctx(Inputs(), Outputs(), scope); + RunImpl(scope, place, &ctx); + } else { + const Scope* cur_scope = &scope; + if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) { + std::lock_guard lock(cache_update_mutex_); + if (runtime_ctx_.get() == nullptr || pre_scope_ != cur_scope) { + runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope)); + pre_scope_ = cur_scope; + } + } + RunImpl(scope, place, runtime_ctx_.get()); + } +} + +void OperatorWithKernel::RunImpl(const Scope& scope, + const platform::Place& place, + RuntimeContext* runtime_ctx) const { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(place); + + if (kernel_type_.get() == nullptr || kernel_func_.get() == nullptr) { + ChooseKernel(*runtime_ctx, scope, place); + } + + std::vector* kernel_configs = GetKernelConfig(*kernel_type_); + + // do data transformScope &transfer_scope; + std::vector transfered_inplace_vars; + auto* transfer_scope = + PrepareData(scope, *kernel_type_, &transfered_inplace_vars, runtime_ctx); + + // exec scope is the scope that kernel actually executed on. + const Scope& exec_scope = + (transfer_scope == nullptr ? scope : *transfer_scope); + + if (!(kernel_type_->place_ == dev_ctx->GetPlace())) { + dev_ctx = pool.Get(kernel_type_->place_); + } + + if (!all_kernels_must_compute_runtime_shape_) { + RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, *runtime_ctx); + this->InferShape(&infer_shape_ctx); + } + // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext + // not Scope. Imperative mode only pass inputs and get outputs. + (*kernel_func_)(ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx, + kernel_configs)); + + if (!transfered_inplace_vars.empty()) { + // there is inplace variable has been transfered. + TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope); + } + + /*For profiling/benchmark only*/ + if (FLAGS_benchmark) { + dev_ctx->Wait(); + } + + if (FLAGS_fast_check_nan_inf) { + for (auto& vname : OutputVars(true)) { + // only check inserted vars, + // please see executor.py for details of fast_check_nan_inf + if (vname.rfind("debug_var") == 0) { + VLOG(3) << "debugging nan/inf in var " << vname; + + auto* var = exec_scope.FindVar(vname); + if (var == nullptr) continue; + if (var->IsType()) { + CheckTensorNANOrInf(type_, vname, var->Get()); + } else if (var->IsType()) { + CheckTensorNANOrInf(type_, vname, + var->Get().value()); + } + } + } + } + + if (FLAGS_check_nan_inf) { + for (auto& vname : OutputVars(true)) { + auto* var = exec_scope.FindVar(vname); + if (var == nullptr) continue; + if (var->IsType()) { + CheckTensorNANOrInf(type_, vname, var->Get()); + } else if (var->IsType()) { + CheckTensorNANOrInf(type_, vname, + var->Get().value()); + } + } + } + + // To solve issue #15032, have a discussion with @Luotao for cpu inference, + // do not cache transfer scope, hence in this case delete transfer scope + // after run to avoid memory leak + if (transfer_scope && !run_by_executor_ && !enable_cache_transfer_scope_) { + scope.DeleteScope(transfer_scope); + } +} + +void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx, + const Scope& scope, + const platform::Place& place) const { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(place); + + // check if op[type] has kernel registered. + auto& all_op_kernels = AllOpKernels(); + auto kernels_iter = all_op_kernels.find(type_); + if (kernels_iter == all_op_kernels.end()) { + PADDLE_THROW( + "There are no kernels which are registered in the %s operator.", type_); + } + + OpKernelMap& kernels = kernels_iter->second; + + auto expected_kernel_key = this->GetExpectedKernelType( + ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr)); + VLOG(3) << "expected_kernel_key:" << expected_kernel_key; + + auto kernel_iter = kernels.find(expected_kernel_key); +#ifdef PADDLE_WITH_MKLDNN + // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set + if (kernel_iter == kernels.end() && + expected_kernel_key.library_type_ == LibraryType::kMKLDNN) { + VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one"; + expected_kernel_key.library_type_ = LibraryType::kPlain; + expected_kernel_key.data_layout_ = DataLayout::kAnyLayout; + kernel_iter = kernels.find(expected_kernel_key); + } +#endif + if (kernel_iter == kernels.end()) { + PADDLE_THROW("op %s does not have kernel for %s", type_, + KernelTypeToString(expected_kernel_key)); + } + + std::lock_guard lock(cache_update_mutex_); + if (kernel_type_.get() == nullptr || kernel_func_.get() == nullptr) { + kernel_type_.reset(new OpKernelType(expected_kernel_key)); + kernel_func_.reset(new OpKernelFunc(kernel_iter->second)); + } +} + +void OperatorWithKernel::TransferInplaceVarsBack( + const Scope& scope, const std::vector& inplace_vars, + const Scope& transfer_scope) const { + for (auto& var_name : inplace_vars) { + VLOG(3) << "share inplace var " + var_name + " back to it's original scope"; + auto* origin_var = scope.FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL(origin_var, "The var[%s] should not be nullptr.", + var_name); + auto* original_tensor = + GetMutableLoDTensorOrSelectedRowsValueFromVar(origin_var); + auto* var = transfer_scope.FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL(var, "The var[%s] should not be nullptr.", + var_name); + auto* transformed_tensor = GetLoDTensorOrSelectedRowsValueFromVar(*var); + original_tensor->ShareDataWith(*transformed_tensor); + } +} + +Scope* OperatorWithKernel::PrepareData( + const Scope& scope, const OpKernelType& expected_kernel_key, + std::vector* transfered_inplace_vars, + RuntimeContext* ctx) const { + Scope* new_scope = nullptr; + + std::unordered_set no_buffer_ins; + if (info_) { + auto& no_buffer_inferer = info_->NoNeedBufferVarsInferer(); + // Some op may not register NoNeedBufferVarsInferer + if (no_buffer_inferer) { + no_buffer_ins = no_buffer_inferer(Inputs(), Outputs(), Attrs()); + } + } + + for (auto& var_name_item : Inputs()) { + // NOTE(zjl): STL does not guarantee fast std::unordered_set::count when set + // is empty. At least STL implemented on my mac does calculate hash code + // of search key even though the set is empty. + if (!no_buffer_ins.empty() && + no_buffer_ins.count(var_name_item.first) > 0) { + VLOG(7) << "Skip scanning input " << var_name_item.first + << " in Operator " << type_; + continue; + } + + std::vector& input_vars = ctx->inputs[var_name_item.first]; + + for (size_t i = 0; i < var_name_item.second.size(); ++i) { + auto& var_name = var_name_item.second[i]; + auto* var = input_vars[i]; + + // Only tensor can be tranfer to another device. + if (var == nullptr || !VarIsTensor(*var)) { + continue; + } + + auto* tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var); + if (!tensor_in->IsInitialized()) { + continue; + } + + auto kernel_type_for_var = GetKernelTypeForVar( + var_name_item.first, *tensor_in, expected_kernel_key); + + if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) { + continue; + } + + auto out_var_names = OutputVars(true); + if (std::find(out_var_names.begin(), out_var_names.end(), var_name) != + out_var_names.end()) { + transfered_inplace_vars->emplace_back(var_name); + } + + VLOG(3) << "Transform Variable " << var_name << " from " + << kernel_type_for_var << " to " << expected_kernel_key; + + // In the inference scenerio, the scopes will be reused across the + // batches, so the `new_scope` here will result in GPU memroy explosion + // over the running of operators. + // We use a thread_local cache to fix that issue, the key in the cache is + // the combination of the `scope` argument, from_kernel_type, + // target_kernel_type. + // Have a discussion with @Superjomn or the inference developers if some + // changes on this logic for this macro might not tested on the other + // scenerios. + // If this op is not called by an Executor or ParallelExecutor, it should + // called by a NaiveExecutor, the NaiveExecutor will cache the scopes and + // variables, that behavior a lot different. + // + // To solve issue #15032, have a discussion with @Luotao for cpu + // inference, for all cpu kernels cases without GPU participation, here + // not do transfer scope caching, and cpu inference performance is not + // impacted by test. + enable_cache_transfer_scope_ = false; + if (!run_by_executor_ && + (platform::is_gpu_place(kernel_type_for_var.place_) || + platform::is_gpu_place(expected_kernel_key.place_))) { + new_scope = TryCreateTransferScope(kernel_type_for_var, + expected_kernel_key, &scope); + enable_cache_transfer_scope_ = true; + } + if (!new_scope) { + new_scope = &scope.NewScope(); + } + // For inference, if a gpu model has an op which could only run on CPU, + // each result of different input will be the same with the first one. + // The reason is that if a gpu tensor is the input of a cpu kernel, + // we will create a new cpu tensor in new scope. + // However, if enable_cache_runtime_context_, we get the cpu tensor each + // time, not the gpu tensor. + // Thus, we set pre_scope_ = nullptr to trigger `new RuntimeContext()` in + // RunImpl(). + if (enable_cache_runtime_context_) { + pre_scope_ = nullptr; + } + + auto* trans_var = new_scope->Var(var_name); + input_vars[i] = trans_var; + + Tensor out; + TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out); + SetTensorToVariable(*var, out, trans_var); + } + } + + return new_scope; +} + +proto::VarType::Type OperatorWithKernel::IndicateDataType( + const ExecutionContext& ctx) const { + proto::VarType::Type dafault_data_type = + static_cast(-1); + proto::VarType::Type data_type = dafault_data_type; + for (auto& input : this->inputs_) { + const std::vector vars = ctx.MultiInputVar(input.first); + for (size_t i = 0; i < vars.size(); ++i) { + const Variable* var = vars[i]; + if (var != nullptr) { + const Tensor* t = nullptr; + if (var->IsType()) { + t = &var->Get(); + } else if (var->IsType()) { + t = &var->Get(); + } else if (var->IsType()) { + t = &(var->Get().value()); + } + if (t != nullptr) { + PADDLE_ENFORCE(t->IsInitialized(), "Input %s(%lu) is not initialized", + input.first, i); + proto::VarType::Type tmp = t->type(); + PADDLE_ENFORCE( + tmp == data_type || data_type == dafault_data_type, + "DataType of Paddle Op %s %s must be the same. Get (%s) != (%s)", + Type(), input.first, DataTypeToString(data_type), + DataTypeToString(tmp)); + data_type = tmp; + } + } + } + } + PADDLE_ENFORCE(data_type != dafault_data_type, + "DataType should be indicated by input"); + return data_type; +} + +OpKernelType OperatorWithKernel::GetExpectedKernelType( + const ExecutionContext& ctx) const { + return OpKernelType(IndicateDataType(ctx), ctx.GetPlace()); +} + +OpKernelType OperatorWithKernel::GetKernelTypeForVar( + const std::string& var_name, const Tensor& tensor, + const OpKernelType& expected_kernel_type) const { + return OpKernelType(expected_kernel_type.data_type_, tensor.place(), + tensor.layout()); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h new file mode 100644 index 00000000..07e7abd5 --- /dev/null +++ b/paddle/fluid/framework/operator.h @@ -0,0 +1,511 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include // NOLINT +#include +#include +#include +#include +#include + +#include "glog/logging.h" // For VLOG +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_kernel_type.h" +#include "paddle/fluid/framework/operator_kernel_configs.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/variant.h" + +DECLARE_int32(inner_op_parallelism); + +namespace paddle { +namespace framework { + +/// If a variable is a empty variable, that name will be used. +constexpr char kEmptyVarName[] = "@EMPTY@"; + +/// If a variable is a temporary variable, that name will be set in Python, +/// but it will be convert to a unique name in scope after OpCreator. +constexpr char kTempVarName[] = "@TEMP@"; + +/// If a variable's name has a certain suffix, it means that the +/// variable is the gradient of another varibale. +/// e.g. Variable "x@GRAD" is the gradient of varibale "x". +constexpr char kGradVarSuffix[] = "@GRAD"; + +constexpr size_t kGradVarSuffixSize = 5U; + +/// Variables with this suffix are supposed to be filled up with zeros. +constexpr char kZeroVarSuffix[] = "@ZERO"; + +/// Variables with this suffix are the new Gradient. +constexpr char kNewGradSuffix[] = "@NEWGRAD@"; + +/// RuntimeContext is used to relate input/output names of Operator with +/// the corresponding variables in name scope. +/// If an Op has attribute kEnableCacheRuntimeContext, it means that in a same +/// name scope, since the input/output names of this Op do not change in the +/// execution, RuntimeContext could be created only at the first iteration of +/// this Op's execution to save the elapsed time. +constexpr char kEnableCacheRuntimeContext[] = "@ENABLE_CACHE_RUNTIME_CONTEXT@"; + +/// If an Op has this attribute, all its kernels should calculate output +/// variable's shape in the corresponding Compute() function. And +/// OperatorWithKernel::RunImpl() would skip call this Op's InferShape() +/// function in its runtime for speedup. +/// TODO(luotao): Note that this temporal attribute would be deleted after all +/// ops contain it. +constexpr char kAllKernelsMustComputeRuntimeShape[] = + "@ALL_KERNELS_MUST_COMPUTE_RUNTIME_SHAPE@"; + +// define some kernel priority +/* Define multiple kernel type fallback order*/ +extern std::vector> kKernelPriority; + +inline std::string GradVarName(const std::string& var_name) { + std::string result; + result.reserve(var_name.size() + kGradVarSuffixSize); + result += var_name; + result += kGradVarSuffix; + return result; +} + +inline std::string GradOriginalVarName(const std::string& grad_var_name) { + std::size_t pos = grad_var_name.rfind(kGradVarSuffix); + if (pos == std::string::npos) { + return grad_var_name; + } else { + return grad_var_name.substr(0, pos); + } +} + +proto::VarType::Type GetDataTypeOfVar(const Variable* var); +const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var); +Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var); + +class OperatorBase; +class ExecutionContext; + +class RuntimeContext { + public: + RuntimeContext(const VariableNameMap& innames, + const VariableNameMap& outnames, const Scope& scope); + + RuntimeContext(const VariableValueMap& invars, + const VariableValueMap& outvars) + : inputs(invars), outputs(outvars) {} + + VariableValueMap inputs; + VariableValueMap outputs; +}; + +/** + * OperatorBase has the basic elements that Net will call to do computation. + * Only CreateOperator from OpRegistry will new Operator directly. User + * should always construct a proto message OpDesc and call + * OpRegistry::CreateOp(op_desc) to get an Operator instance. + */ +class OperatorBase { + public: + OperatorBase(const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, const AttributeMap& attrs); + + virtual ~OperatorBase() {} + + /// Executor will call this interface function to Run an op. + // The implementation should be written at RunImpl + void Run(const Scope& scope, const platform::Place& place); + + // FIXME(typhoonzero): this is only used for recv_op to stop event_loop. + virtual void Stop() {} + + /// if scope is not null, also show dimensions of arguments + virtual std::string DebugStringEx(const Scope* scope) const; + std::string DebugString() const { return DebugStringEx(nullptr); } + + virtual bool SupportGPU() const { return false; } + + const std::string& Type() const { return type_; } + + bool HasAttr(const std::string& name) const { return attrs_.count(name); } + template + inline const T& Attr(const std::string& name) const { + PADDLE_ENFORCE(attrs_.find(name) != attrs_.end(), + "%s should be in AttributeMap", name); + return boost::get(attrs_.at(name)); + } + const AttributeMap& Attrs() const { return attrs_; } + + const VariableNameMap& Inputs() const { return inputs_; } + const VariableNameMap& Outputs() const { return outputs_; } + + const OpInfo& Info() const { + PADDLE_ENFORCE_NOT_NULL(info_, "OpInfo of %s is not found", type_); + return *info_; + } + + bool HasInputs(const std::string& name) const; + //! Get a input with argument's name described in `op_proto` + std::string Input(const std::string& name) const; + //! Get a input which has multiple variables. + const std::vector& Inputs(const std::string& name) const; + //! Get all inputs variable names + std::vector InputVars() const; + + bool HasOutputs(const std::string& name) const; + //! Get a output with argument's name described in `op_proto` + std::string Output(const std::string& name) const; + //! Get an output which has multiple variables. + //! TODO add a vector_view to prevent memory copy. + const std::vector& Outputs(const std::string& name) const; + //! Get all outputs variable names + virtual std::vector OutputVars(bool has_intermediate) const; + + void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; } + virtual void RuntimeInferShape(const Scope& scope, + const platform::Place& place, + const RuntimeContext& ctx) const {} + + protected: + std::string type_; + // NOTE: in case of OpGrad, inputs_ contains: + // I (Inputs) + // O (Outputs) + // OG (Output Gradients) + VariableNameMap inputs_; + + // NOTE: in case of OpGrad, outputs_ contains + // IG (Inputs Gradients) + VariableNameMap outputs_; + AttributeMap attrs_; + + // OpInfo + const OpInfo* info_; + + // Whether this operator executes in an Executor. + bool run_by_executor_{true}; + + private: + void GenerateTemporaryNames(); + void CheckAllInputOutputSet() const; + virtual void RunImpl(const Scope& scope, + const platform::Place& place) const = 0; +}; + +#ifdef PADDLE_WITH_CUDA +using KernelConfig = boost::variant< + std::shared_ptr>, + std::shared_ptr>, + std::shared_ptr>>; +#else +using KernelConfig = boost::variant; +#endif + +using OpKernelConfigsMap = + std::unordered_map, + OpKernelType::Hash>; + +class ExecutionContext { + public: + ExecutionContext(const OperatorBase& op, const Scope& scope, + const platform::DeviceContext& device_context, + const RuntimeContext& ctx, + std::vector* configs) + : op_(op), + scope_(scope), + device_context_(device_context), + ctx_(ctx), + kernel_configs_(configs) {} + + const OperatorBase& op() const { return op_; } + + const Scope& scope() const { return scope_; } + + template + inline const T& Attr(const std::string& name) const { + return op_.Attr(name); + } + + bool HasInput(const std::string& name) const; + + bool HasOutput(const std::string& name) const; + + size_t InputSize(const std::string& name) const { + return op_.Inputs(name).size(); + } + + size_t OutputSize(const std::string& name) const { + return op_.Outputs(name).size(); + } + + const Variable* InputVar(const std::string& name) const; + + Variable* OutputVar(const std::string& name) const; + + const std::vector MultiInputVar( + const std::string& name) const { + auto it = ctx_.inputs.find(name); + if (it == ctx_.inputs.end()) { + return {}; + } + return {it->second.begin(), it->second.end()}; + } + + std::vector MultiOutputVar(const std::string& name) const { + auto names = op_.Outputs(name); + auto it = ctx_.outputs.find(name); + if (it == ctx_.outputs.end()) { + return {}; + } + return it->second; + } + + template + const T* Input(const std::string& name) const { + auto* var = InputVar(name); + return var == nullptr ? nullptr : &var->Get(); + } + + template + T* Output(const std::string& name) const { + auto var = OutputVar(name); + return var == nullptr ? nullptr : var->GetMutable(); + } + + template + const std::vector MultiInput(const std::string& name) const { + auto it = ctx_.inputs.find(name); + if (it == ctx_.inputs.end()) { + return {}; + } + const std::vector& vars = it->second; + std::vector res; + res.reserve(vars.size()); + std::transform(vars.begin(), vars.end(), std::back_inserter(res), + [&](Variable* var) -> const T* { + return var == nullptr ? nullptr : &var->Get(); + }); + return res; + } + + template + std::vector MultiOutput(const std::string& name) const { + auto it = ctx_.outputs.find(name); + if (it == ctx_.outputs.end()) { + return {}; + } + const std::vector& vars = it->second; + std::vector res; + res.reserve(vars.size()); + std::transform(vars.begin(), vars.end(), std::back_inserter(res), + [&](Variable* var) -> T* { + return var == nullptr ? nullptr : var->GetMutable(); + }); + return res; + } + + platform::Place GetPlace() const { return device_context_.GetPlace(); } + + template + const DeviceContextType& device_context() const { + return *reinterpret_cast(&device_context_); + } + + const platform::DeviceContext& device_context() const { + return device_context_; + } + +#ifdef PADDLE_WITH_CUDA + const inline platform::CUDADeviceContext& cuda_device_context() const { + PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace())); + return *reinterpret_cast( + &device_context_); + } +#endif + + //! Get actual name vector for this input. + const std::vector& Inputs(const std::string& name) const { + return op_.Inputs(name); + } + + //! Get actual name vector for this output. + const std::vector& Outputs(const std::string& name) const { + return op_.Outputs(name); + } + + template + Tensor AllocateTmpTensor(const framework::DDim& dim, + const DevContext& dev_ctx) const { + auto tmp_allocation_ptr = platform::DeviceTemporaryAllocator::Instance() + .Get(dev_ctx) + .Allocate(product(dim) * sizeof(T)); + auto& deleter = tmp_allocation_ptr.get_deleter(); + auto* allocation_ptr = tmp_allocation_ptr.release(); + auto shared_allocation = std::shared_ptr( + allocation_ptr, deleter); + + PADDLE_ENFORCE_GE(allocation_ptr->size(), + framework::product(dim) * sizeof(T)); + + paddle::framework::Tensor temp_tensor( + framework::ToDataType(std::type_index(typeid(T)))); + temp_tensor.Resize(dim); + temp_tensor.ResetHolder(std::move(shared_allocation)); + return temp_tensor; + } + + template + T& GetKernelConfig(size_t idx) const { + PADDLE_ENFORCE( + kernel_configs_ && kernel_configs_->size() > static_cast(idx), + "%s selected kernel doesn't have kernel config %lu <= %lu", + op_.Type().c_str(), kernel_configs_->size(), idx); + return *boost::get>((*kernel_configs_)[idx]); + } + + private: + const OperatorBase& op_; + const Scope& scope_; + const platform::DeviceContext& device_context_; + const RuntimeContext& ctx_; + mutable std::vector* kernel_configs_; +}; + +template <> +const Tensor* ExecutionContext::Input(const std::string& name) const; + +template <> +const std::vector ExecutionContext::MultiInput( + const std::string& name) const; + +template <> +Tensor* ExecutionContext::Output(const std::string& name) const; + +template <> +std::vector ExecutionContext::MultiOutput( + const std::string& name) const; + +class OpKernelBase { + public: + /** + * ExecutionContext is the only parameter of Kernel Run function. + * Run will get input/output variables, state such as momentum and + * device resource such as CUDA stream, cublas handle, etc. from + * ExecutionContext. User should construct it before run the Operator. + */ + + virtual void Compute(const ExecutionContext& context) const = 0; + + virtual ~OpKernelBase() = default; +}; + +template +class OpKernel : public OpKernelBase { + public: + using ELEMENT_TYPE = T; +}; + +class OperatorWithKernel : public OperatorBase { + public: + using OpKernelFunc = std::function; + using OpKernelMap = + std::unordered_map; + + OperatorWithKernel(const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, const AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + static std::unordered_map& + AllOpKernels() { + static std::unordered_map g_all_op_kernels; + return g_all_op_kernels; + } + + bool SupportGPU() const override { + auto& op_kernels = OperatorWithKernel::AllOpKernels().at(type_); + return std::any_of(op_kernels.begin(), op_kernels.end(), + [](OpKernelMap::const_reference kern_pair) { + return platform::is_gpu_place(kern_pair.first.place_); + }); + } + + virtual void InferShape(InferShapeContext* ctx) const { + Info().infer_shape_(ctx); + } + + void RuntimeInferShape(const Scope& scope, const platform::Place& place, + const RuntimeContext& ctx) const override; + + virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const; + + std::vector* GetKernelConfig(const OpKernelType& key) const; + + protected: + virtual OpKernelType GetKernelTypeForVar( + const std::string& var_name, const Tensor& tensor, + const OpKernelType& expected_kernel_type) const; + + private: + // indicate kernel DataType by input data. By default all input data must be + // same. + proto::VarType::Type IndicateDataType(const ExecutionContext& ctx) const; + void RunImpl(const Scope& scope, const platform::Place& place) const final; + void RunImpl(const Scope& scope, const platform::Place& place, + RuntimeContext* runtime_ctx) const; + + /** + * Transfer data from scope to a transfered scope. If there is no data need to + * be tranfered, it returns nullptr. + * + * * transfered_inplace_vars is a output vector. + */ + Scope* PrepareData(const Scope& scope, + const OpKernelType& expected_kernel_key, + std::vector* transfered_inplace_vars, + RuntimeContext* ctx) const; + + void TransferInplaceVarsBack(const Scope& scope, + const std::vector& inplace_vars, + const Scope& exec_scope) const; + + void ChooseKernel(const RuntimeContext& ctx, const Scope& scope, + const platform::Place& place) const; + + protected: + mutable OpKernelConfigsMap kernel_configs_map_; + mutable std::unique_ptr kernel_type_; + mutable std::unique_ptr kernel_func_; + mutable std::unique_ptr runtime_ctx_; + mutable const Scope* pre_scope_ = nullptr; + mutable bool enable_cache_runtime_context_ = false; + mutable bool all_kernels_must_compute_runtime_shape_ = false; + mutable std::mutex cache_update_mutex_; + mutable bool enable_cache_transfer_scope_ = false; +}; + +extern bool OpSupportGPU(const std::string& op_type); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/operator_kernel_configs.h b/paddle/fluid/framework/operator_kernel_configs.h new file mode 100644 index 00000000..a350b895 --- /dev/null +++ b/paddle/fluid/framework/operator_kernel_configs.h @@ -0,0 +1,118 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +namespace paddle { +namespace framework { + +// Not thread-safe. Should be owned per-kernel. +template +class AlgorithmsCache { + public: + AlgorithmsCache() : search_times_(0) { hash_.clear(); } + // Caches the best algorithm for a given + // combination of tensor dimensions & compute data type. + TAlgorithm GetAlgorithm( + const std::vector& dims1, const std::vector& dims2, + const std::vector& strides, const std::vector& paddings, + const std::vector& dilations, + int algorithmFlags, // can set for different data type + std::function gen_func); + + TAlgorithm GetAlgorithm(int64_t area, int search_times, int algorithmFlags, + std::function gen_func); + + private: + std::unordered_map hash_; + int search_times_; +}; + +template +TAlgorithm framework::AlgorithmsCache::GetAlgorithm( + const std::vector& dims1, const std::vector& dims2, + const std::vector& strides, const std::vector& paddings, + const std::vector& dilations, int algorithmFlags, + std::function gen_func) { + int64_t seed = 0; + // Hash all of the inputs, use to try and look up a previously + // discovered algorithm, or fall back to generating a new one. + std::hash hashFn; + // do hash like boost + // https://stackoverflow.com/questions/2590677/how-do-i-combine-hash-values-in-c0x + for (const auto num : dims1) { + seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2); + } + + for (const auto num : dims2) { + seed ^= hashFn(num) + 0x9e3779b9 + (seed << 6) + (seed >> 2) + 1; + } + + for (const auto num : strides) { + seed ^= hashFn(static_cast(num)) + 0x9e3779b9 + (seed << 6) + + (seed >> 2) + 2; + } + + for (const auto num : paddings) { + seed ^= hashFn(static_cast(num)) + 0x9e3779b9 + (seed << 6) + + (seed >> 2) + 3; + } + + for (const auto num : dilations) { + seed ^= hashFn(static_cast(num)) + 0x9e3779b9 + (seed << 6) + + (seed >> 2) + 4; + } + + seed ^= hashFn(static_cast(algorithmFlags)) + 0x9e3779b9 + + (seed << 6) + (seed >> 2) + 5; + + if (seed == 0) return gen_func(); + + if (hash_.find(seed) == hash_.end()) { + TAlgorithm value = gen_func(); + hash_[seed] = value; + } + return hash_[seed]; +} + +template +TAlgorithm AlgorithmsCache::GetAlgorithm( + int64_t area, int search_times, int algorithmFlags, + std::function gen_func) { + if (hash_.find(area) != hash_.end()) { + return hash_[area]; + } + if (search_times_ < search_times) { + auto algo = gen_func(); + hash_[area] = algo; + ++search_times_; + return algo; + } + TAlgorithm algo{}; + int64_t min = static_cast(INT_MAX); + for (const auto& m : hash_) { + if (m.first < min) { + min = m.first; + algo = m.second; + } + } + return algo; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc new file mode 100644 index 00000000..fe4804ac --- /dev/null +++ b/paddle/fluid/framework/operator_test.cc @@ -0,0 +1,317 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/init.h" + +namespace paddle { +namespace framework { + +static int op_run_num = 0; + +class OpWithoutKernelTest : public OperatorBase { + public: + OpWithoutKernelTest(const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, const AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs), x(1) {} + + private: + void RunImpl(const Scope& scope, + const platform::Place& place) const override { + ++op_run_num; + ASSERT_EQ(static_cast(inputs_.size()), 1); + ASSERT_EQ(static_cast(outputs_.size()), 1); + ASSERT_EQ(scope.FindVar(inputs_.at("input")[0]), nullptr); + ASSERT_EQ(x, 1); + ASSERT_NE(scope.FindVar(outputs_.at("output")[0]), nullptr); + } + + public: + int x{0}; +}; + +class OpWithoutKernelCheckerMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("input", "input of test op"); + AddOutput("output", "output of test op"); + AddAttr("scale", "scale of cosine op"); + AddAttr("kernel_sub_type", "kernels with different implementations.") + .SetDefault(0); + AddComment("This is test op"); + } +}; + +} // namespace framework +} // namespace paddle + +static void BuildVar(const std::string& param_name, + std::initializer_list arguments, + paddle::framework::proto::OpDesc::Var* var) { + var->set_parameter(param_name); + for (auto& arg_name : arguments) { + *var->mutable_arguments()->Add() = arg_name; + } +} + +REGISTER_OP_WITHOUT_GRADIENT(test_operator, + paddle::framework::OpWithoutKernelTest, + paddle::framework::OpWithoutKernelCheckerMaker); + +TEST(OperatorBase, all) { + paddle::framework::InitDevices(true); + paddle::framework::proto::OpDesc op_desc; + op_desc.set_type("test_operator"); + BuildVar("input", {"IN1"}, op_desc.add_inputs()); + BuildVar("output", {"OUT1"}, op_desc.add_outputs()); + + auto attr = op_desc.mutable_attrs()->Add(); + attr->set_name("scale"); + attr->set_type(paddle::framework::proto::AttrType::FLOAT); + attr->set_f(3.14); + + paddle::platform::CPUPlace cpu_place; + paddle::framework::Scope scope; + + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + scope.Var("OUT1"); + ASSERT_EQ(paddle::framework::op_run_num, 0); + op->Run(scope, cpu_place); + ASSERT_EQ(paddle::framework::op_run_num, 1); +} + +namespace paddle { +namespace framework { + +static int special_type_value = 1; + +class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("x", "input of test op"); + AddOutput("y", "output of test op"); + AddAttr("scale", "scale of cosine op") + .SetDefault(1.0) + .GreaterThan(0.0); + AddAttr("kernel_sub_type", "kernels with different implementations.") + .SetDefault(0); + AddComment("This is test op"); + } +}; + +static int cpu_kernel_run_num = 0; +static int cpu_kernel2_run_num = 0; + +class OpWithKernelTest : public OperatorWithKernel { + public: + using OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override {} + OpKernelType GetExpectedKernelType( + const ExecutionContext& ctx) const override { + int sub_type = ctx.Attr("kernel_sub_type"); + return OpKernelType(proto::VarType::FP32, ctx.GetPlace(), + framework::DataLayout::kAnyLayout, + framework::LibraryType::kPlain, sub_type); + } +}; + +template +class CPUKernelTest : public OpKernel { + public: + void Compute(const ExecutionContext& ctx) const { + std::cout << ctx.op().DebugString() << std::endl; + cpu_kernel_run_num++; + ASSERT_EQ(ctx.op().Input("x"), "IN1"); + ASSERT_EQ(ctx.op().Output("y"), "OUT1"); + } +}; + +template +class CPUKernel2Test : public OpKernel { + public: + void Compute(const ExecutionContext& ctx) const { + std::cout << ctx.op().DebugString() << std::endl; + cpu_kernel2_run_num++; + ASSERT_EQ(ctx.op().Input("x"), "IN1"); + ASSERT_EQ(ctx.op().Output("y"), "OUT1"); + } +}; + +class OpKernelTestMultiInputsProtoAndCheckerMaker + : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("xs", "inputs of test op").AsDuplicable(); + AddInput("k", "input of test op"); + AddOutput("ys", "outputs of test op").AsDuplicable(); + AddAttr("scale", "scale of cosine op") + .SetDefault(1.0) + .GreaterThan(0.0); + AddAttr("kernel_sub_type", "kernels with different implementations.") + .SetDefault(0); + AddComment("This is test op"); + } +}; + +class CPUKernalMultiInputsTest : public OpKernel { + public: + void Compute(const ExecutionContext& ctx) const { + auto xs = ctx.op().Inputs("xs"); + ASSERT_EQ(xs.size(), 3UL); + ASSERT_EQ(xs[0], "x0"); + ASSERT_EQ(xs[1], "x1"); + ASSERT_EQ(xs[2], "x2"); + + auto inVar0 = ctx.MultiInputVar("xs"); + ASSERT_EQ(inVar0.size(), 3U); + + auto intVar1 = ctx.InputVar("k"); + ASSERT_NE(intVar1, nullptr); + + auto outVar0 = ctx.MultiOutputVar("ys"); + ASSERT_EQ(outVar0.size(), 2U); + + auto inTensor0 = ctx.MultiInput("xs"); + ASSERT_EQ(inTensor0.size(), 3U); + + auto intTensor1 = ctx.Input("k"); + ASSERT_NE(intTensor1, nullptr); + + auto outTensor0 = ctx.MultiOutput("ys"); + ASSERT_EQ(outTensor0.size(), 2U); + + auto k = ctx.op().Input("k"); + ASSERT_EQ(k, "k0"); + + auto ys = ctx.op().Outputs("ys"); + ASSERT_EQ(ys.size(), 2UL); + ASSERT_EQ(ys[0], "y0"); + ASSERT_EQ(ys[1], "y1"); + } +}; + +} // namespace framework +} // namespace paddle + +REGISTER_OP_WITHOUT_GRADIENT( + op_with_kernel, paddle::framework::OpWithKernelTest, + paddle::framework::OpKernelTestProtoAndCheckerMaker); + +REGISTER_OP_CPU_KERNEL(op_with_kernel, + paddle::framework::CPUKernelTest); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE( + op_with_kernel, CPU, paddle::platform::CPUPlace, MY_SPECIAL_NAME, + paddle::framework::special_type_value, + paddle::framework::CPUKernel2Test); + +// test with single input +TEST(OpKernel, all) { + paddle::framework::InitDevices(true); + paddle::framework::proto::OpDesc op_desc; + op_desc.set_type("op_with_kernel"); + BuildVar("x", {"IN1"}, op_desc.add_inputs()); + BuildVar("y", {"OUT1"}, op_desc.add_outputs()); + + auto attr = op_desc.mutable_attrs()->Add(); + attr->set_name("scale"); + attr->set_type(paddle::framework::proto::AttrType::FLOAT); + attr->set_f(3.14); + + paddle::platform::CPUPlace cpu_place; + paddle::framework::Scope scope; + + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0); + op->Run(scope, cpu_place); + // kerne_sub_type = 0, hence cpu_kernel is called, cpu_kernel2 is not called. + ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1); + ASSERT_EQ(paddle::framework::cpu_kernel2_run_num, 0); + + attr = op_desc.mutable_attrs()->Add(); + attr->set_name("kernel_sub_type"); + attr->set_type(paddle::framework::proto::AttrType::INT); + attr->set_i(1); + auto op2 = paddle::framework::OpRegistry::CreateOp(op_desc); + op2->Run(scope, cpu_place); + // kerne_sub_type = 1, hence cpu_kernel2 is called, cpu_kernel is not called. + ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1); + ASSERT_EQ(paddle::framework::cpu_kernel2_run_num, 1); +} + +REGISTER_OP_WITHOUT_GRADIENT( + op_multi_inputs_with_kernel, paddle::framework::OpWithKernelTest, + paddle::framework::OpKernelTestMultiInputsProtoAndCheckerMaker); +REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel, + paddle::framework::CPUKernalMultiInputsTest); + +// test with multi inputs +TEST(OpKernel, multi_inputs) { + paddle::framework::InitDevices(true); + paddle::framework::proto::OpDesc op_desc; + + op_desc.set_type("op_multi_inputs_with_kernel"); + BuildVar("xs", {"x0", "x1", "x2"}, op_desc.add_inputs()); + BuildVar("k", {"k0"}, op_desc.add_inputs()); + BuildVar("ys", {"y0", "y1"}, op_desc.add_outputs()); + + auto attr = op_desc.mutable_attrs()->Add(); + attr->set_name("scale"); + attr->set_type(paddle::framework::proto::AttrType::FLOAT); + attr->set_f(3.14); + + paddle::platform::CPUPlace cpu_place; + paddle::framework::Scope scope; + scope.Var("x0")->GetMutable(); + scope.Var("x1")->GetMutable(); + scope.Var("x2")->GetMutable(); + scope.Var("k0")->GetMutable(); + scope.Var("y0")->GetMutable(); + scope.Var("y1")->GetMutable(); + + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + op->Run(scope, cpu_place); +} + +TEST(VarNameTest, all) { + std::string var_name("X"); + std::string grad_var_name = paddle::framework::GradVarName(var_name); + ASSERT_EQ(grad_var_name, "X@GRAD"); + std::string original_var_name = + paddle::framework::GradOriginalVarName(grad_var_name); + ASSERT_EQ(original_var_name, "X"); + original_var_name = paddle::framework::GradOriginalVarName(original_var_name); + ASSERT_EQ(original_var_name, "X"); + + std::string var_name_2("XYZ"); + grad_var_name = paddle::framework::GradVarName(var_name_2); + ASSERT_EQ(grad_var_name, "XYZ@GRAD"); + original_var_name = paddle::framework::GradOriginalVarName(grad_var_name); + ASSERT_EQ(original_var_name, "XYZ"); + original_var_name = paddle::framework::GradOriginalVarName(original_var_name); + ASSERT_EQ(original_var_name, "XYZ"); + + std::string var_name_3(""); + grad_var_name = paddle::framework::GradVarName(var_name_3); + ASSERT_EQ(grad_var_name, "@GRAD"); + original_var_name = paddle::framework::GradOriginalVarName(grad_var_name); + ASSERT_EQ(original_var_name, ""); + original_var_name = paddle::framework::GradOriginalVarName(original_var_name); + ASSERT_EQ(original_var_name, ""); +} diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc new file mode 100644 index 00000000..e45b5925 --- /dev/null +++ b/paddle/fluid/framework/parallel_executor.cc @@ -0,0 +1,782 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/parallel_executor.h" +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/details/async_ssa_graph_executor.h" +#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" +#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" +#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" +#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" +#include "paddle/fluid/platform/profiler.h" + +#ifdef WITH_GPERFTOOLS +#include "gperftools/profiler.h" +#endif +DEFINE_string(pe_profile_fname, "", + "Profiler filename for PE, which generated by gperftools." + "Only valid when compiled `WITH_PRIFILER=ON`. Empty if disable."); +DEFINE_bool(enable_parallel_graph, false, + "Force disable parallel graph execution mode if set false."); + +namespace paddle { +namespace framework { + +static std::once_flag gProfileOnce; +#ifdef WITH_GPERFTOOLS +static bool gProfileStarted = false; +#endif + +class ParallelExecutorPrivate { + public: + explicit ParallelExecutorPrivate(const std::vector &places) + : places_(places) { + if (!FLAGS_pe_profile_fname.empty()) { + std::call_once(gProfileOnce, [] { +#ifdef WITH_GPERFTOOLS + ProfilerStart(FLAGS_pe_profile_fname.c_str()); + gProfileStarted = true; +#else + LOG(WARNING) << "Paddle is not compiled with gperftools. " + "FLAGS_pe_profile_fname will be ignored"; +#endif + }); + } + } + + ~ParallelExecutorPrivate() { + if (own_local_scope_) { + for (size_t i = 1; i < local_scopes_.size(); ++i) { + // Skip the first scope, since it is the global scope. + Scope *local_scope = local_scopes_[i]; + if (global_scope_->HasKid(local_scope)) { + global_scope_->DeleteScope(local_scope); + } + } + } + } + + ir::Graph *ApplyMemoryOptimizePass(ir::Graph *graph); + + inline bool HasGarbageCollectors() const { return !gcs_.empty(); } + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + void InitNCCLCtxs(framework::Scope *scope, const BuildStrategy &bst) { + VLOG(1) << "nccl comm num:" << bst.nccl_comm_num_ << ", nranks:" << nranks_ + << ", num_trainers:" << bst.num_trainers_ + << ", trainer_id:" << bst.trainer_id_; + + if (bst.use_hierarchical_allreduce_) { + VLOG(1) << ", use_hierarchical_allreduce:" + << bst.use_hierarchical_allreduce_ << ", inter_trainers_num:" + << bst.hierarchical_allreduce_inter_nranks_ + << ", exter_trainers_num:" + << bst.hierarchical_allreduce_exter_nranks_; + } + + std::vector flat_nccl_ids; + if (nranks_ == 1) { + // FIXME(gongwb): need not to create ncclid when nranks==1 + nccl_ctxs_->InitFlatCtxs(places_, flat_nccl_ids, bst.num_trainers_, + bst.trainer_id_); + return; + } + + if (bst.enable_parallel_graph_) { + VLOG(1) << "use only one ncclid in pg model"; + + ncclUniqueId *nccl_id = nullptr; + + std::string var_name = platform::GetFlatNCCLVarName(0); + auto nccl_id_var = scope->FindVar(var_name); + if (nccl_id_var) { + nccl_id = nccl_id_var->GetMutable(); + VLOG(10) << "find nccl_id_var:" << var_name << ", nccl_id:" << nccl_id; + } else { + nccl_id = new ncclUniqueId(); + PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(nccl_id)); + VLOG(10) << "can't find nccl_id_var:" << var_name + << ", nccl_id:" << nccl_id; + } + + flat_nccl_ids.push_back(nccl_id); + + nccl_ctxs_->InitFlatCtxs(places_, flat_nccl_ids, bst.num_trainers_, + bst.trainer_id_); + VLOG(1) << "init bst nccl context complete!"; + return; + } + + // num_trainers ==1 && places > 1 + if (bst.num_trainers_ == 1) { + nccl_ctxs_->InitFlatCtxs(places_, flat_nccl_ids, bst.num_trainers_, + bst.trainer_id_); + return; + } + + for (int i = 0; i < static_cast(bst.nccl_comm_num_); i++) { + std::string var_name = platform::GetFlatNCCLVarName(i); + auto nccl_id_var = scope->FindVar(var_name); + PADDLE_ENFORCE(nccl_id_var, "can't find %s nccl_id_var", var_name); + auto nccl_id = nccl_id_var->GetMutable(); + flat_nccl_ids.push_back(nccl_id); + } + + nccl_ctxs_->InitFlatCtxs(places_, flat_nccl_ids, bst.num_trainers_, + bst.trainer_id_); + + if (bst.use_hierarchical_allreduce_) { + std::vector inter_nccl_ids; + for (int i = 0; i < static_cast(bst.nccl_comm_num_); i++) { + std::string var_name = platform::GetHierarchicalInterNCCLVarName(i); + auto nccl_id_var = scope->FindVar(var_name); + PADDLE_ENFORCE(nccl_id_var, "can't find %s nccl_id_var", var_name); + auto inter_nccl_id = nccl_id_var->GetMutable(); + inter_nccl_ids.push_back(inter_nccl_id); + } + + std::vector exter_nccl_ids; + for (int i = 0; i < static_cast(bst.nccl_comm_num_); i++) { + std::string var_name = platform::GetHierarchicalExterNCCLVarName(i); + auto nccl_id_var = scope->FindVar(var_name); + PADDLE_ENFORCE(nccl_id_var, "can't find %s nccl_id_var", var_name); + auto nccl_id = nccl_id_var->GetMutable(); + exter_nccl_ids.push_back(nccl_id); + } + + nccl_ctxs_->InitHierarchicalCtxs( + places_, inter_nccl_ids, exter_nccl_ids, bst.num_trainers_, + bst.trainer_id_, bst.hierarchical_allreduce_inter_nranks_, + bst.hierarchical_allreduce_exter_nranks_); + } + } + + void InitOrGetNCCLCommunicator(framework::Scope *scope, BuildStrategy *bst) { + const std::string var_name = "NCCLCommunicator"; + auto var = scope->FindVar(var_name); + if (var != nullptr) { + PADDLE_ENFORCE(var->IsInitialized(), + "if %s exists, it must be initialized", var_name); + VLOG(1) << "find " << var_name + << " in scope, so use it and does not recreate!"; + nccl_ctxs_ = var->GetMutable(); + return; + } + + if (bst->use_hierarchical_allreduce_) { + PADDLE_ENFORCE(bst->num_trainers_ > 1, "num_trainers:%llu < 1", + bst->num_trainers_); + PADDLE_ENFORCE(bst->hierarchical_allreduce_inter_nranks_ > 1, + "inter_nranks:%d < 1", + bst->hierarchical_allreduce_inter_nranks_); + PADDLE_ENFORCE( + (bst->num_trainers_ % bst->hierarchical_allreduce_inter_nranks_ == 0), + "num_trainers:%llu mod inter_nranks:%d != 0", bst->num_trainers_, + bst->hierarchical_allreduce_inter_nranks_); + + bst->hierarchical_allreduce_exter_nranks_ = + bst->num_trainers_ / bst->hierarchical_allreduce_inter_nranks_; + } + + VLOG(1) << "not find " << var_name << " in scope, so recreate it!"; + nccl_ctxs_ = scope->Var(var_name)->GetMutable(); + InitNCCLCtxs(scope, *bst); + } +#endif + + inline bool IsPersistable(const std::string &name) const { + auto iter = is_persistable_.find(name); + return iter != is_persistable_.end() && iter->second; + } + + BuildStrategy build_strategy_; + std::vector places_; + std::vector local_scopes_; + std::vector local_exec_scopes_; + Scope *global_scope_; // not owned + std::unique_ptr executor_; + + std::unordered_map is_persistable_; + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + platform::NCCLCommunicator *nccl_ctxs_{nullptr}; +#endif + bool own_local_scope_; + bool use_cuda_; + bool use_all_reduce_; + size_t nranks_; + + ir::MemOptVarInfoMapList mem_opt_var_infos_; + ir::GarbageCollectorMap gcs_; +}; + +ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { + std::vector last_live_ops_of_vars; + + auto ref_cnt_pass = ir::PassRegistry::Instance().Get("reference_count_pass"); + ref_cnt_pass->SetNotOwned(ir::kMemOptVarInfoMapList, &mem_opt_var_infos_); + ref_cnt_pass->SetNotOwned(ir::kLastLiveOpsOfVars, &last_live_ops_of_vars); + graph = ref_cnt_pass->Apply(graph); + VLOG(10) << "ReferenceCountPass Applied"; + + if (build_strategy_.enable_inplace_) { + auto inplace_pass = + ir::PassRegistry::Instance().Get("buffer_shared_inplace_pass"); + inplace_pass->SetNotOwned(ir::kMemOptVarInfoMapList, &mem_opt_var_infos_); + inplace_pass->SetNotOwned(ir::kLastLiveOpsOfVars, &last_live_ops_of_vars); + inplace_pass->SetNotOwned(ir::kUseCuda, &use_cuda_); + VLOG(10) << "Start to apply buffer_shared_inplace_pass"; + graph = inplace_pass->Apply(graph); + VLOG(10) << "buffer_shared_inplace_pass Applied"; + } + + // TODO(zjl): refactor MemoryOptimizePass as well!!! + + if (GetEagerDeletionThreshold() < 0) { + return graph; + } + size_t max_memory_size = static_cast(GetEagerDeletionThreshold()); + + for (size_t i = 0; i < places_.size(); ++i) { + auto &place = places_[i]; + if (gcs_.count(place) > 0) { + continue; + } + std::unique_ptr gc; +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(place)) { + if (IsFastEagerDeletionModeEnabled()) { + gc.reset(new UnsafeFastGPUGarbageCollector( + boost::get(place), max_memory_size)); + } else { + gc.reset(new StreamGarbageCollector( + boost::get(place), max_memory_size)); + } + VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; + } else { +#endif + if (platform::is_cpu_place(place)) { + gc.reset(new CPUGarbageCollector(boost::get(place), + max_memory_size)); + VLOG(10) << "Created GarbageCollector at " << place; + } else { + PADDLE_THROW("Unsupported place for garbage collection"); + } +#ifdef PADDLE_WITH_CUDA + } +#endif + + gcs_.emplace(place, std::move(gc)); + } + + if (!gcs_.empty()) { + auto eager_deletion_pass = + ir::PassRegistry::Instance().Get("eager_deletion_pass"); + eager_deletion_pass->SetNotOwned(ir::kMemOptVarInfoMapList, + &mem_opt_var_infos_); + eager_deletion_pass->SetNotOwned(ir::kGarbageCollector, &gcs_); + eager_deletion_pass->SetNotOwned(ir::kLastLiveOpsOfVars, + &last_live_ops_of_vars); + eager_deletion_pass->SetNotOwned(ir::kAllPlaces, &places_); + graph = eager_deletion_pass->Apply(graph); + VLOG(10) << "EagerDeletionPass Applied"; + } + return graph; +} + +std::vector &ParallelExecutor::GetLocalScopes() { + return member_->local_scopes_; +} + +void ParallelExecutor::DropLocalExeScopes() { + auto executor = dynamic_cast( + member_->executor_.get()); + if (executor) { + executor->DropLocalExeScopes(); + } +} + +bool ParallelExecutor::NeedCreateLocalExeScope() { + auto executor = dynamic_cast( + member_->executor_.get()); + return executor && executor->NeedCreateLocalExeScope(); +} + +ParallelExecutor::ParallelExecutor(const std::vector &places, + const std::vector &bcast_vars, + const std::string &loss_var_name, + Scope *scope, + const std::vector &local_scopes, + const ExecutionStrategy &exec_strategy, + const BuildStrategy &build_strategy, + ir::Graph *graph) + : member_(new ParallelExecutorPrivate(places)) { + member_->global_scope_ = scope; + member_->use_cuda_ = exec_strategy.use_cuda_; + member_->build_strategy_ = build_strategy; + member_->use_all_reduce_ = member_->build_strategy_.reduce_ == + BuildStrategy::ReduceStrategy::kAllReduce; + member_->nranks_ = build_strategy.num_trainers_ * places.size(); + if (!member_->use_all_reduce_ && member_->nranks_ == 1) { + LOG(INFO) << "If you set build_strategy.reduce with 'Reduce'," + "the number of places should be greater than 1."; + member_->build_strategy_.reduce_ = + BuildStrategy::ReduceStrategy::kAllReduce; + member_->use_all_reduce_ = true; + } +#if defined(PADDLE_WITH_CUDA) && defined(_WIN32) + if (member_->use_cuda_) { + PADDLE_ENFORCE(places.size() == 1, "Windows can support Single GPU only."); + } +#endif + + LOG(INFO) << string::Sprintf( + "The number of %s, which is used in ParallelExecutor, is %lu. And " + "the Program will be copied %lu copies", + (member_->use_cuda_ ? "CUDAPlace" : "CPUPlace"), places.size(), + places.size()); + + // Step 1. Bcast the bcast_vars to devs. + // Create local scopes + if (local_scopes.empty()) { + member_->own_local_scope_ = true; + member_->local_scopes_.emplace_back(member_->global_scope_); + for (size_t i = 1; i < member_->places_.size(); ++i) { + member_->local_scopes_.emplace_back(&scope->NewScope()); + } + } else { + member_->own_local_scope_ = false; + PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size()); + for (size_t i = 0; i < member_->places_.size(); ++i) { + member_->local_scopes_.emplace_back(&local_scopes[i]->NewScope()); + } + } + + std::vector graphs; + if (member_->build_strategy_.async_mode_) { + PADDLE_ENFORCE(!member_->use_cuda_, + "gpu mode does not support async_mode_ now!"); + graphs.push_back(graph); + for (size_t i = 1; i < places.size(); ++i) { + auto *tmp_graph = new ir::Graph(graph->OriginProgram()); + async_graphs_.emplace_back(tmp_graph); + graphs.push_back(tmp_graph); + } + } + + // FIXME(Yancey1989): parallel graph mode get better performance + // in GPU allreduce distributed training. Need an elegant way to + // choice the execution strategy. + member_->build_strategy_.enable_parallel_graph_ = + EnableParallelGraphExecution(*graph, exec_strategy, + member_->build_strategy_); + if (member_->build_strategy_.enable_parallel_graph_) { + LOG(INFO) << "The Executor would execute the graph by ParallelGraph " + "Execution which can get better performance," + << "you can force it off by env FLAGS_enable_parallel_graph=0"; + } + + if (member_->use_cuda_ && member_->nranks_ > 1) { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + member_->InitOrGetNCCLCommunicator(scope, &member_->build_strategy_); + + // Initialize device context's nccl comm, will be used by normal + // Operators like sync_batch_norm, and collective ops. + // NOTE: more than one ParallelExecutor with same place, the nccl comm will + // be rewrite and there will be some problem. + // NOTE: NCCL group-calls and non-group-calls can not use the same + // NCCL communicator, so for ParallelGraph and Multi-Process mode, re-use + // same communicators. + auto *nccl_ctxs = + member_->nccl_ctxs_->GetSyncBatchNormCtx(scope, member_->places_); + auto &pool = platform::DeviceContextPool::Instance(); + for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { + auto *dev_ctx = static_cast( + pool.Get(member_->places_[dev_id])); + auto &nccl_ctx = nccl_ctxs->at(member_->places_[dev_id]); + dev_ctx->set_nccl_comm(nccl_ctx.comm()); + } +#endif + } + // broadcast parameters from the 0th device to others: + auto need_broadcast = [&]() -> bool { + if (member_->build_strategy_.num_trainers_ > 1) { + // 1. num_tariners would be grater than 1 for nccl distributed training. + return true; + } else if (member_->local_scopes_.size() != 1 && local_scopes.empty()) { + // 2. Only one trainer process, but ParallelExecutor hold multiple + // devices. + return true; + } + return false; + }; + // Bcast Parameters to all GPUs + if (need_broadcast()) { + BCastParamsToDevices(bcast_vars, member_->build_strategy_.trainer_id_); + } + + // Startup Program has been run. All local scopes has correct parameters. + + // Step 2. Convert main_program to SSA form and dependency graph. Also, insert + // ncclOp + std::vector async_graphs(places.size()); +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + if (member_->build_strategy_.async_mode_) { + VLOG(3) << "use local async mode"; + graph = member_->build_strategy_.Apply( + graph, {member_->places_[0]}, loss_var_name, + {member_->local_scopes_[0]}, 1, member_->use_cuda_, + member_->nccl_ctxs_); + for (size_t i = 1; i < member_->places_.size(); ++i) { + graphs[i] = member_->build_strategy_.Apply( + graphs[i], {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, 1, member_->use_cuda_, + member_->nccl_ctxs_); + async_graphs[i] = graphs[i]; + } + } else { + graph = member_->build_strategy_.Apply( + graph, member_->places_, loss_var_name, member_->local_scopes_, + member_->nranks_, member_->use_cuda_, member_->nccl_ctxs_); + } +#else + if (member_->build_strategy_.async_mode_) { + VLOG(3) << "use local async mode"; + graph = member_->build_strategy_.Apply( + graph, {member_->places_[0]}, loss_var_name, + {member_->local_scopes_[0]}, 1, member_->use_cuda_); + for (size_t i = 1; i < member_->places_.size(); ++i) { + graphs[i] = member_->build_strategy_.Apply( + graphs[i], {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, 1, member_->use_cuda_); + async_graphs[i] = graphs[i]; + } + } else { + graph = member_->build_strategy_.Apply( + graph, member_->places_, loss_var_name, member_->local_scopes_, + member_->nranks_, member_->use_cuda_); + } +#endif + + graph = member_->ApplyMemoryOptimizePass(graph); + + async_graphs[0] = graph; + + // Step 3. Create vars in each scope. Passes may also create new vars. + // skip control vars and empty vars + std::vector var_infos; + for (auto &node : graph->Nodes()) { + if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { + var_infos.emplace_back(); + var_infos.back().name_ = node->Var()->Name(); + var_infos.back().type_ = node->Var()->GetType(); + var_infos.back().persistable_ = node->Var()->Persistable(); + + member_->is_persistable_.emplace(node->Var()->Name(), + node->Var()->Persistable()); + } + } + + // If the loss_var_name is given, the number of graph should be only one. + if (loss_var_name.size()) { + size_t graph_num = ir::GraphNum(*graph); + if (graph_num > 1) { + LOG(WARNING) + << "The number of graph should be only one, " + "but the current graph has " + << ir::GraphNum(*graph) + << " sub_graphs. If you want to see the nodes of the " + "sub_graphs, you should use 'FLAGS_print_sub_graph_dir' " + "to specify the output dir. NOTES: if you not do training, " + "please don't pass loss_var_name."; + } + } + + std::unordered_map scope_map; + for (auto *scope : member_->local_scopes_) { + auto &local_exec_scope = scope->NewScope(); + member_->local_exec_scopes_.emplace_back(&local_exec_scope); + scope_map.emplace(scope, &local_exec_scope); + } + + PADDLE_ENFORCE_EQ(member_->local_scopes_.size(), + member_->local_exec_scopes_.size()); + + std::vector final_graphs; + + if (member_->build_strategy_.async_mode_) { + VLOG(3) << "use AsyncSSAGraphExecutor"; + member_->executor_.reset(new details::AsyncSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, async_graphs)); + final_graphs = async_graphs; + } else if (member_->build_strategy_.enable_parallel_graph_) { + VLOG(3) << "use ParallelSSAGraphExecutor"; +#ifdef PADDLE_WITH_CUDA + // TODO(Yancey1989): Remove passing in the main_program when + // allreduce_seq_pass doesn't need it as the attr. + auto *pg_exe = new details::ParallelSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, graph); + final_graphs = pg_exe->Graphs(); + member_->executor_.reset(pg_exe); +#else + PADDLE_THROW( + "Paddle should be compiled with CUDA for ParallelGraph Execution."); +#endif + } else { + if (exec_strategy.type_ == ExecutionStrategy::kDefault) { + VLOG(3) << "use ThreadedSSAGraphExecutor"; + member_->executor_.reset(new details::ThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, graph)); + } else { + VLOG(3) << "use FastThreadedSSAGraphExecutor"; + member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, graph)); + } + final_graphs.emplace_back(graph); + } + + VLOG(3) << "use ScopeBufferedSSAGraphExecutor"; + if (!member_->build_strategy_.async_mode_) { + member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + std::move(var_infos), member_->places_, std::move(member_->executor_))); + } + + for (auto *g : final_graphs) { + auto ops = ir::FilterByNodeWrapper(*g); + for (auto *op : ops) { + op->SetLocalExecScopes(scope_map); + } + } +} + +void ParallelExecutor::BCastParamsToDevices( + const std::vector &vars, int trainer_id) const { + VLOG(3) << "BCastParamsToDevices"; + // the initializing bcast, all vars would be bcast from device(0). + for (auto &var : vars) { + framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var); + if (main_var == nullptr || !main_var->IsType()) { + continue; + } + + auto &main_tensor = main_var->Get(); + if (!main_tensor.IsInitialized()) { + VLOG(3) << "one in var not inited, return!"; + continue; + } + auto &dims = main_tensor.dims(); + if (paddle::platform::is_gpu_place(main_tensor.place())) { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + std::vector buffers; + buffers.reserve(member_->places_.size()); + size_t numel = main_tensor.numel(); + ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); + for (size_t i = 0; i < member_->places_.size(); ++i) { + auto place = member_->places_[i]; + void *buffer; + + if (i == 0 && trainer_id == 0) { + buffer = const_cast(main_tensor.data()); + } else { + auto local_scope = member_->local_scopes_[i]; + auto *t = local_scope->Var(var)->GetMutable(); + t->Resize(dims); + buffer = t->mutable_data(place, main_tensor.type()); + } + buffers.push_back(buffer); + } + + PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(), + "variables' buffer size to bcast NOT equal to places"); + { + auto *nccl_ctxs = member_->nccl_ctxs_->DefaultFlatCtx(); + platform::NCCLGroupGuard guard; + for (size_t i = 0; i < member_->places_.size(); ++i) { + auto &nccl_ctx = nccl_ctxs->at(member_->places_[i]); + platform::dynload::ncclBcast(buffers[i], numel, data_type, 0, + nccl_ctx.comm_, nccl_ctx.stream()); + } + nccl_ctxs->WaitAll(); + } +#endif + } else { + platform::CPUPlace cpu; + for (size_t i = 1; i < member_->places_.size(); ++i) { + auto local_scope = member_->local_scopes_[i]; + auto *t = local_scope->Var(var)->GetMutable(); + + auto copy_memory = [&] { + t->Resize(dims); + t->mutable_data(cpu, main_tensor.type()); + paddle::framework::TensorCopy(main_tensor, cpu, t); + }; + + auto share_memory = [&] { t->ShareDataWith(main_tensor); }; + + // FIXME(zcd): LR_DECAY_COUNTER should not be shared. This is a hot fix. + if (member_->build_strategy_.async_mode_) { + share_memory(); + } else if (member_->use_all_reduce_ || member_->use_cuda_ || + var == "@LR_DECAY_COUNTER@") { + copy_memory(); + } else { + share_memory(); + } + } + } + } +} + +void ParallelExecutor::Run(const std::vector &fetch_tensors, + const std::string &fetched_var_name) { + VLOG(3) << "enter ParallelExecutor Run"; +#ifdef WITH_GPERFTOOLS + if (gProfileStarted) { + ProfilerFlush(); + } +#endif + + platform::RecordBlock b(0); + + ir::SkipMemOptVarsGuard guard(&(member_->mem_opt_var_infos_), fetch_tensors, + member_->HasGarbageCollectors()); + + VLOG(3) << "ParallelExecutor begin to run member_->executor_->Run"; + auto fetch_data = member_->executor_->Run(fetch_tensors); + *member_->global_scope_->Var(fetched_var_name)->GetMutable() = + fetch_data; +} + +void ParallelExecutor::FeedTensorsIntoLocalScopes( + const std::vector> &tensors) { + PADDLE_ENFORCE_EQ(member_->local_scopes_.size(), tensors.size()); + + for (size_t i = 0; i < tensors.size(); ++i) { + auto &map = tensors[i]; + for (auto &pair : map) { + bool is_persistable = member_->IsPersistable(pair.first); + auto *feed_scope = is_persistable ? member_->local_scopes_[i] + : member_->local_exec_scopes_[i]; + auto *feed_var = feed_scope->Var(pair.first); + + auto *trg = feed_var->GetMutable(); + trg->ShareDataWith(pair.second); + trg->set_lod(pair.second.lod()); + } + } +} + +void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( + const std::unordered_map &tensors) { + for (auto &pair : tensors) { + auto lod_tensors = pair.second.SplitLoDTensor(member_->places_); + if (member_->places_.size() != lod_tensors.size()) { + bool is_cpu_place = platform::is_cpu_place(member_->places_.front()); + auto error_info = string::Sprintf( + "The number(%d) of samples of " + "current batch is less than the count(%d) of " + "devices(%s), currently, it is not allowed. ", + lod_tensors.size(), member_->places_.size(), + (is_cpu_place ? "CPU" : "GPU")); + if (is_cpu_place) { + error_info += + "You should set the environment variable CPU_NUM in the system " + "to determine the number of devices you need."; + } + PADDLE_THROW(error_info); + } + + bool is_persistable = member_->IsPersistable(pair.first); + for (size_t j = 0; j < member_->places_.size(); ++j) { + auto *feed_scope = is_persistable ? member_->local_scopes_[j] + : member_->local_exec_scopes_[j]; + auto *feed_var = feed_scope->Var(pair.first); + + auto t = feed_var->GetMutable(); + t->ShareDataWith(lod_tensors[j]); + t->set_lod(lod_tensors[j].lod()); + } + } +} + +ParallelExecutor::~ParallelExecutor() { + for (auto &p : member_->places_) { + platform::DeviceContextPool::Instance().Get(p)->Wait(); + } + delete member_; +} + +bool ParallelExecutor::EnableParallelGraphExecution( + const ir::Graph &graph, const ExecutionStrategy &exec_strategy, + const BuildStrategy &build_strategy) const { + if (!FLAGS_enable_parallel_graph) { + return false; + } + + bool enable_parallel_graph = true; + + for (ir::Node *node : graph.Nodes()) { + if (node->IsVar() && node->Var()) { + // TODO(Yancey1989): support sparse update in ParallelGraph mode. + if (node->Var()->GetType() == proto::VarType::SELECTED_ROWS) { + enable_parallel_graph = false; + break; + } + } else if (node->IsOp() && node->Op()) { + // TODO(Yancey1989): support pserver mode + if (node->Op()->Type() == "send" || node->Op()->Type() == "recv") { + enable_parallel_graph = false; + break; + } + } + } + + if (!member_->use_all_reduce_ || !member_->use_cuda_) { + if (build_strategy.enable_sequential_execution_ || + exec_strategy.type_ == ExecutionStrategy::ExecutorType::kExperimental) { + enable_parallel_graph = false; + } + } + +#ifdef WIN32 + VLOG(1) << "Windows has no support to parallel graph, enable_parallel_graph " + "would be forced to false."; + enable_parallel_graph = false; +#endif + + return enable_parallel_graph; +} + +} // namespace framework +} // namespace paddle + +USE_PASS(reference_count_pass); +USE_PASS(eager_deletion_pass); +USE_PASS(buffer_shared_inplace_pass); diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h new file mode 100644 index 00000000..1ac800c9 --- /dev/null +++ b/paddle/fluid/framework/parallel_executor.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/details/build_strategy.h" +#include "paddle/fluid/framework/details/execution_strategy.h" +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace framework { + +class ParallelExecutorPrivate; + +using details::BuildStrategy; +using details::ExecutionStrategy; + +class ParallelExecutor { + DISABLE_COPY_AND_ASSIGN(ParallelExecutor); + + public: + explicit ParallelExecutor(const std::vector &places, + const std::vector &bcast_vars, + const std::string &loss_var_name, Scope *scope, + const std::vector &local_scopes, + const ExecutionStrategy &exec_strategy, + const BuildStrategy &build_strategy, + ir::Graph *graph); + + ~ParallelExecutor(); + + std::vector &GetLocalScopes(); + + void DropLocalExeScopes(); + + // This API is used to check whether DropLocalExeScopes work. + bool NeedCreateLocalExeScope(); + + /** + * Feed tensors to local scopes. The size of tensors should be equal to the + * size of local scopes. + */ + void FeedTensorsIntoLocalScopes( + const std::vector> &tensors); + + void FeedAndSplitTensorIntoLocalScopes( + const std::unordered_map &tensors); + + void Run(const std::vector &fetch_tensors, + const std::string &fetched_var_name); + + private: + // broadcast the parameters from the 0th device. + // trainer_id the trainer index in nccl distributed training. + void BCastParamsToDevices(const std::vector &vars, + int trainer_id = 0) const; + bool EnableParallelGraphExecution(const ir::Graph &graph, + const ExecutionStrategy &exec_strategy, + const BuildStrategy &build_strategy) const; + + ParallelExecutorPrivate *member_; + std::vector> async_graphs_; +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/pipeline_trainer.cc b/paddle/fluid/framework/pipeline_trainer.cc new file mode 100644 index 00000000..916359ab --- /dev/null +++ b/paddle/fluid/framework/pipeline_trainer.cc @@ -0,0 +1,265 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#include "paddle/fluid/framework/data_feed_factory.h" +#include "paddle/fluid/framework/device_worker_factory.h" +#include "paddle/fluid/framework/trainer.h" +#include "paddle/fluid/framework/trainer_desc.pb.h" + +namespace paddle { +namespace framework { + +void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc, + Dataset* dataset) { + pipeline_num_ = trainer_desc.thread_num(); + VLOG(3) << "pipeline num: " << pipeline_num_; + + SetDataset(dataset); + // get filelist from trainer_desc here + const std::vector readers = + dataset->GetReaders(); + VLOG(3) << "readers num: " << readers.size(); + + pipeline_config_ = trainer_desc.section_param(); + scope_queue_size_ = pipeline_config_.queue_size(); + sync_steps_ = pipeline_config_.sync_steps(); + section_num_ = pipeline_config_.section_config_size(); + + VLOG(3) << "scope_queue_size: " << scope_queue_size_; + VLOG(3) << "section num: " << section_num_; + VLOG(3) << "sync_steps: " << sync_steps_; + + workers_.resize(section_num_); + in_var_names_.resize(section_num_); + out_var_names_.resize(section_num_); + worker_count_.resize(section_num_); + worker_count_mutex_.resize(section_num_); + param_need_sync_.reset(new std::vector); + + int reader_index = 0; + for (int i = 0; i < section_num_; ++i) { + const auto& section_config = pipeline_config_.section_config(i); + int concurrency = section_config.concurrency(); + VLOG(3) << "the thread num of each pipeline in section " << i + << " is: " << concurrency; + in_var_names_[i].reset(new std::vector( + section_config.section_in_var_names().begin(), + section_config.section_in_var_names().end())); + out_var_names_[i].reset(new std::vector( + section_config.section_out_var_names().begin(), + section_config.section_out_var_names().end())); + worker_count_[i].resize(pipeline_num_); + worker_count_mutex_[i].resize(pipeline_num_); + for (int j = 0; j < pipeline_num_; ++j) { + worker_count_[i][j] = new int(concurrency); + worker_count_mutex_[i][j].reset(new std::mutex); + } + + platform::Place place; + workers_[i].resize(pipeline_num_); + for (int j = 0; j < pipeline_num_; ++j) { + workers_[i][j].resize(concurrency); + + switch (section_config.place()) { + case SectionConfig::CPUPlace: + place = platform::CPUPlace(); + break; + case SectionConfig::CUDAPlace: + // Note that one section has at most one GPU place in one pipeline + place = platform::CUDAPlace(j); + break; + case SectionConfig::CUDAPinnedPlace: + place = platform::CUDAPinnedPlace(); + break; + default: + PADDLE_ENFORCE(false, "Unkown place type in SectionConfig: %d", + section_config.place()); + } + + for (int k = 0; k < concurrency; ++k) { + workers_[i][j][k] = DeviceWorkerFactory::CreateDeviceWorker( + trainer_desc.device_worker_name()); + auto this_worker = + std::dynamic_pointer_cast( + workers_[i][j][k]); + this_worker->SetSectionIndex(i); + this_worker->SetDeviceIndex(j); + this_worker->SetThreadIndex(k); + this_worker->SetSectionNum(section_num_); + this_worker->SetPipelineNum(pipeline_num_); + if (i == 0) { + this_worker->SetDataFeed(readers[reader_index++]); + } + this_worker->SetPlace(place); + this_worker->Initialize(trainer_desc); + } + } + } + param_need_sync_.reset( + new std::vector(pipeline_config_.param_need_sync().begin(), + pipeline_config_.param_need_sync().end())); + VLOG(3) << "param_need_sync_ have: "; + for (const std::string& name : *param_need_sync_) { + VLOG(3) << name; + } + // set debug here + SetDebug(trainer_desc.debug()); +} + +void PipelineTrainer::InitFirstScopeQueue(ScopeQueue* scope_queue, + int pipeline_id, + const ProgramDesc& main_program) { + for (int i = 0; i < scope_queue_size_; ++i) { + Scope* scope = &pipeline_scopes_[pipeline_id]->NewScope(); + for (auto& var : main_program.Block(0).AllVars()) { + if (!var->Persistable()) { + auto* ptr = scope->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + } + } + scope_queue->Send(scope); + } +} + +void PipelineTrainer::CopyParameters(const Scope& root_scope, int pipeline_id) { + for (const std::string& name : *param_need_sync_) { + const LoDTensor& root_tensor = root_scope.FindVar(name)->Get(); + + // TODO(hutxian): check a new var of the same name is created in + // pipeline_scope + LoDTensor* gpu_tensor = + pipeline_scopes_[pipeline_id]->Var(name)->GetMutable(); + platform::Place place = platform::CUDAPlace(pipeline_id); + TensorCopy(*static_cast(&root_tensor), place, + static_cast(gpu_tensor)); + } +} + +void PipelineTrainer::InitTrainerEnv(const ProgramDesc& main_program, + const platform::Place& place) { + PADDLE_ENFORCE(root_scope_, "Null root_scope pointer"); + SectionWorker::cpu_id_.store(pipeline_config_.start_cpu_core_id()); + scope_queues_.resize(section_num_); + pipeline_scopes_.resize(pipeline_num_); + + VLOG(3) << "Init ScopeQueues and create all scopes"; + for (int i = 0; i < section_num_; ++i) { + for (int j = 0; j < pipeline_num_; ++j) { + scope_queues_[i].emplace_back(new ScopeQueue(scope_queue_size_)); + if (i == 0) { + pipeline_scopes_[j] = &root_scope_->NewScope(); + CopyParameters(*root_scope_, j); + InitFirstScopeQueue(scope_queues_[0].back().get(), j, main_program); + } + } + } + + for (int i = 0; i < section_num_; ++i) { + for (int j = 0; j < pipeline_num_; ++j) { + for (size_t k = 0; k < workers_[i][j].size(); ++k) { + auto this_worker = + std::dynamic_pointer_cast( + workers_[i][j][k]); + this_worker->SetRootScope(root_scope_); + this_worker->SetCountMutex(worker_count_mutex_[i][j].get()); + this_worker->SetWorkerCount(worker_count_[i][j]); + this_worker->SetScopeQueue(scope_queues_[i][j].get(), + (i == section_num_ - 1) + ? scope_queues_[0][j].get() + : scope_queues_[i + 1][j].get()); + this_worker->SetVarNames(*in_var_names_[i], *out_var_names_[i]); + if (i != section_num_ - 1) { + // For data copy in adjacent different place + this_worker->SetNextSectionPlace( + std::dynamic_pointer_cast( + workers_[i + 1][j][0]) + ->place()); + } + } + } + } + + if (pipeline_num_ > 1) { + construct_sync_functor(); + } +} + +void PipelineTrainer::construct_sync_functor() { + std::vector cuda_places; + for (int i = 0; i < pipeline_num_; ++i) { + cuda_places.emplace_back(platform::CUDAPlace(i)); + } + nccl_ctx_map_.reset(new platform::NCCLContextMap(cuda_places)); + sync_functors_.resize(pipeline_num_); + SyncFunctor::sync_flag_ = 0; + SyncFunctor::pipeline_scopes_.resize(0); + + for (int j = 0; j < pipeline_num_; ++j) { + SyncFunctor* sync_function = new SyncFunctor(j, pipeline_num_, sync_steps_); + sync_function->SetSyncParam(*param_need_sync_); + sync_function->SetNcclCtxMap(nccl_ctx_map_.get()); + SyncFunctor::pipeline_scopes_.push_back(this->pipeline_scopes_[j]); + sync_functors_[j].reset(sync_function); + } + for (int i = section_num_ - 1; i >= 0; --i) { + if (SectionConfig::CUDAPlace == + pipeline_config_.section_config(i).place()) { + for (int j = 0; j < pipeline_num_; ++j) { + for (size_t k = 0; k < workers_[i][j].size(); ++k) { + auto this_worker = + std::dynamic_pointer_cast( + workers_[i][j][k]); + this_worker->SetSyncFunctor(sync_functors_[j].get()); + } + } + break; + } + } +} + +void PipelineTrainer::Run() { + VLOG(3) << "Going to run"; + for (int i = 0; i < section_num_; ++i) { + for (int j = 0; j < pipeline_num_; ++j) { + for (size_t k = 0; k < workers_[i][j].size(); ++k) { + if (!debug_) { + section_threads_.push_back( + std::thread(&DeviceWorker::TrainFiles, workers_[i][j][k].get())); + } else { + section_threads_.push_back(std::thread( + &DeviceWorker::TrainFilesWithProfiler, workers_[i][j][k].get())); + } + } + } + } +} + +void PipelineTrainer::Finalize() { + for (auto& th : section_threads_) { + th.join(); + } + for (const auto& var : *param_need_sync_) { + auto* root_tensor = root_scope_->Var(var)->GetMutable(); + // TODO(hutuxian): Add a final all-reduce? + const auto& thread_tensor = + pipeline_scopes_[0]->FindVar(var)->Get(); + TensorCopySync(thread_tensor, platform::CPUPlace(), root_tensor); + } + root_scope_->DropKids(); +} + +} // end namespace framework +} // end namespace paddle +#endif diff --git a/paddle/fluid/framework/program_desc.cc b/paddle/fluid/framework/program_desc.cc new file mode 100644 index 00000000..4b966711 --- /dev/null +++ b/paddle/fluid/framework/program_desc.cc @@ -0,0 +1,195 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/version.h" + +namespace paddle { +namespace framework { + +BlockDesc *ProgramDesc::AppendBlock(const BlockDesc &parent) { + auto *b = desc_.add_blocks(); + b->set_parent_idx(parent.ID()); + b->set_idx(desc_.blocks_size() - 1); + blocks_.emplace_back(new BlockDesc(this, b)); + return blocks_.back().get(); +} + +void ProgramDesc::Flush() { + for (auto &block : blocks_) { + block->Flush(); + } +} + +proto::ProgramDesc *ProgramDesc::Proto() { + Flush(); + return &desc_; +} + +int64_t ProgramDesc::Version() const { return desc_.version().version(); } + +ProgramDesc::ProgramDesc() { + desc_.mutable_version()->set_version(kCurProgramVersion); + auto *block = desc_.mutable_blocks()->Add(); + block->set_idx(kRootBlockIndex); + block->set_parent_idx(kNoneBlockIndex); + blocks_.emplace_back(new BlockDesc(this, block)); +} + +ProgramDesc::ProgramDesc(const ProgramDesc &o) { + desc_ = o.desc_; + for (int i = 0; i < desc_.blocks_size(); ++i) { + auto *block = desc_.mutable_blocks(i); + blocks_.emplace_back(new BlockDesc(*o.blocks_[i], block, this)); + } + for (size_t block_id = 0; block_id < blocks_.size(); ++block_id) { + auto all_ops = blocks_[block_id]->AllOps(); + for (size_t op_id = 0; op_id < all_ops.size(); ++op_id) { + auto &op = all_ops[op_id]; + + for (const std::string &attr_name : op->AttrNames()) { + if (op->GetAttrType(attr_name) == proto::AttrType::BLOCK) { + int sub_block_id = + o.Block(block_id).Op(op_id)->GetBlockAttrId(attr_name); + op->SetBlockAttr(attr_name, MutableBlock(sub_block_id)); + } else if (op->GetAttrType(attr_name) == proto::AttrType::BLOCKS) { + std::vector sub_block_ids = + o.Block(block_id).Op(op_id)->GetBlocksAttrIds(attr_name); + std::vector block_descs; + for (int block_id : sub_block_ids) { + block_descs.push_back(MutableBlock(block_id)); + } + op->SetBlocksAttr(attr_name, block_descs); + } + } + } + } +} + +ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) { + desc_ = desc; + InitFromProto(); +} + +void ProgramDesc::CopyFrom(const proto::ProgramDesc &desc) { + blocks_.clear(); + desc_ = desc; + InitFromProto(); +} + +ProgramDesc::ProgramDesc(const std::string &binary_str) { + PADDLE_ENFORCE(desc_.ParseFromString(binary_str), + "Fail to parse program_desc from binary string."); + InitFromProto(); +} + +void ProgramDesc::InitFromProto() { + for (auto &block_desc : *desc_.mutable_blocks()) { + blocks_.emplace_back(new BlockDesc(this, &block_desc)); + } + for (auto &block : blocks_) { + for (auto *op : block->AllOps()) { + for (const auto &attr : op->Proto()->attrs()) { + if (attr.type() == proto::AttrType::BLOCK) { + size_t blk_idx = attr.block_idx(); + op->SetBlockAttr(attr.name(), this->MutableBlock(blk_idx)); + } else if (attr.type() == proto::AttrType::BLOCKS) { + auto blks_idx = attr.blocks_idx(); + std::vector block_descs; + for (int blk_idx : blks_idx) { + block_descs.push_back(this->MutableBlock(blk_idx)); + } + op->SetBlocksAttr(attr.name(), block_descs); + } + } + } + } +} + +const std::vector ProgramDesc::GetFeedTargetNames() { + auto &global_block = Block(0); + // The order of feed_target_names must follow the index specified in `col`. + // since feed operator's order doesn't necessary follow 'col'. + std::vector feed_target_names; + for (auto *op : global_block.AllOps()) { + if (op->Type() == kFeedOpType) { + size_t col = boost::get(op->GetAttr("col")); + if (col >= feed_target_names.size()) { + feed_target_names.resize(col + 1); + } + feed_target_names[col] = op->Output("Out")[0]; + } + } + return feed_target_names; +} + +const std::vector ProgramDesc::GetFetchTargetNames() { + auto &global_block = Block(0); + // The order of fetch_target_names must follow the index specified in `col`. + // since fetch operator's order doesn't necessary follow 'col'. + std::vector fetch_target_names; + for (auto *op : global_block.AllOps()) { + if (op->Type() == kFetchOpType) { + size_t col = boost::get(op->GetAttr("col")); + if (col >= fetch_target_names.size()) { + fetch_target_names.resize(col + 1); + } + fetch_target_names[col] = op->Input("X")[0]; + } + } + return fetch_target_names; +} + +void ProgramDesc::SetFeedHolderName(const std::string &feed_holder_name) { + auto *global_block = MutableBlock(0); + int index = 0; + for (auto *op : global_block->AllOps()) { + if (op->Type() == kFeedOpType) { + // Unify the input's name of all feed_ops to feed_holder_name + global_block->RemoveVar(op->Input("X")[0]); + op->SetInput("X", {feed_holder_name}); + op->SetAttr("col", {index}); + op->CheckAttrs(); + index++; + } + } + + auto *feed_holder = global_block->Var(feed_holder_name); + feed_holder->SetType(proto::VarType::FEED_MINIBATCH); + feed_holder->SetPersistable(true); +} + +void ProgramDesc::SetFetchHolderName(const std::string &fetch_holder_name) { + auto *global_block = MutableBlock(0); + int index = 0; + for (auto *op : global_block->AllOps()) { + if (op->Type() == kFetchOpType) { + // Unify the output's name of all fetch_ops to fetch_holder_name + global_block->RemoveVar(op->Output("Out")[0]); + op->SetOutput("Out", {fetch_holder_name}); + op->SetAttr("col", {index}); + op->CheckAttrs(); + index++; + } + } + + auto *fetch_holder = global_block->Var(fetch_holder_name); + fetch_holder->SetType(proto::VarType::FETCH_LIST); + fetch_holder->SetPersistable(true); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h new file mode 100644 index 00000000..2ec0e9d7 --- /dev/null +++ b/paddle/fluid/framework/program_desc.h @@ -0,0 +1,90 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/proto_desc.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace framework { + +class BlockDesc; + +class ProgramDesc { + public: + ProgramDesc(); + + explicit ProgramDesc(const proto::ProgramDesc &desc); + + ProgramDesc(const ProgramDesc &o); + + explicit ProgramDesc(const std::string &binary_str); + + BlockDesc *AppendBlock(const BlockDesc &parent); + + BlockDesc *MutableBlock(size_t idx) { + if (idx == static_cast(kNoneBlockIndex)) { + return nullptr; + } else { + return blocks_[idx].get(); + } + } + + const BlockDesc &Block(size_t idx) const { return *blocks_[idx]; } + + size_t Size() const { return blocks_.size(); } + + void Flush(); + + void CopyFrom(const proto::ProgramDesc &desc); + + proto::ProgramDesc *Proto(); + + int64_t Version() const; + + // The output variable of feed_op is referenced as feed_target. + // This function is used to collect the output variable's name of all + // feed_ops. + const std::vector GetFeedTargetNames(); + + // The input variable of fetch_op is referenced as fetch_target. + // This function is used to collect the input variable's name of all + // fetch_ops. + const std::vector GetFetchTargetNames(); + + // The input variable of feed_op that holds input Tensor provided by users is + // referenced as feed_holder. + // This function is used to change or unify the feed_holder variables' name. + void SetFeedHolderName(const std::string &feed_holder_name); + + // The output variable of fetch_op that holds output Tensor needed by users is + // referenced as fetch_holder. + // This function is used to change or unify the fetch_holder variables' name. + void SetFetchHolderName(const std::string &fetch_holder_name); + + private: + void InitFromProto(); + + proto::ProgramDesc desc_; + + std::vector> blocks_; +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/program_desc_test.cc b/paddle/fluid/framework/program_desc_test.cc new file mode 100644 index 00000000..48bde278 --- /dev/null +++ b/paddle/fluid/framework/program_desc_test.cc @@ -0,0 +1,178 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/program_desc.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/block_desc.h" + +namespace paddle { +namespace framework { +TEST(ProgramDesc, copy_ctor) { + ProgramDesc program; + auto* global_block = program.MutableBlock(0); + auto* x = global_block->Var("X"); + x->SetType(proto::VarType::LOD_TENSOR); + x->SetLoDLevel(0); + x->SetDataType(proto::VarType::FP32); + x->SetShape({1000, 784}); + + auto* y = global_block->Var("Y"); + y->SetType(proto::VarType::LOD_TENSOR); + y->SetLoDLevel(0); + y->SetDataType(proto::VarType::FP32); + y->SetShape({784, 100}); + + auto* op = global_block->AppendOp(); + op->SetType("mul"); + op->SetInput("X", {x->Name()}); + op->SetInput("Y", {y->Name()}); + + auto* out = global_block->Var("Out"); + out->SetType(proto::VarType::LOD_TENSOR); + op->SetOutput("Y", {out->Name()}); + + BlockDesc* new_block = program.AppendBlock(*global_block); + op = new_block->AppendOp(); + op->SetType("mul"); + + op = global_block->AppendOp(); + op->SetType("op_with_subblock"); + op->SetAttr("sub_block", new_block); + + std::vector sub_blocks; + sub_blocks.push_back(program.AppendBlock(*global_block)); + sub_blocks.push_back(program.AppendBlock(*global_block)); + op->SetAttr("sub_blocks", sub_blocks); + + ProgramDesc program_copy(program); + + auto* global_block_copy = program_copy.MutableBlock(0); + ASSERT_NE(global_block, global_block_copy); + + auto assert_same_var = [&](const std::string& name, VarDesc* var_before) { + ASSERT_TRUE(global_block_copy->HasVar(name)); + auto* copy = global_block_copy->Var(name); + ASSERT_NE(copy, var_before); + ASSERT_EQ(copy->Name(), var_before->Name()); + ASSERT_EQ(copy->GetType(), var_before->GetType()); + ASSERT_EQ(copy->GetShape(), var_before->GetShape()); + ASSERT_EQ(copy->Proto()->SerializeAsString(), + var_before->Proto()->SerializeAsString()); + }; + + ASSERT_EQ(global_block->LocalVarNames(), global_block_copy->LocalVarNames()); + ASSERT_EQ(3UL, global_block_copy->LocalVarNames().size()); + assert_same_var("X", x); + assert_same_var("Y", y); + assert_same_var("Out", out); + + bool found_sub_block = false; + bool found_sub_blocks = false; + for (size_t i = 0; i < global_block->OpSize(); ++i) { + auto op_origin = global_block->Op(i); + auto op_copy = global_block_copy->Op(i); + + ASSERT_EQ(op_origin->Type(), op_copy->Type()); + ASSERT_EQ(op_origin->Inputs(), op_copy->Inputs()); + ASSERT_EQ(op_origin->Outputs(), op_copy->Outputs()); + + ASSERT_EQ(op_origin->Proto()->attrs().size(), + op_copy->Proto()->attrs().size()); + for (auto it = op_origin->Proto()->attrs().begin(); + it != op_origin->Proto()->attrs().end(); ++it) { + for (auto it_2 = op_copy->Proto()->attrs().begin(); + it_2 != op_copy->Proto()->attrs().end(); ++it_2) { + if (it->name() == it_2->name()) { + ASSERT_TRUE(it_2->SerializeAsString() == it->SerializeAsString()); + } + } + } + + if (op->Type() == "op_with_subblock") { + ASSERT_EQ(1, op->GetBlockAttrId("sub_block")); + found_sub_block = true; + + ASSERT_EQ(2UL, op->GetBlocksAttrIds("sub_blocks").size()); + found_sub_blocks = true; + } + } + ASSERT_TRUE(found_sub_block); + ASSERT_TRUE(found_sub_blocks); + // Not check block's protostr are same it because the order of vars could be + // different and it is correct. +} + +TEST(ProgramDescBind, serialize_and_deserialize) { + ProgramDesc program_origin; + auto* global_block = program_origin.MutableBlock(0); + auto* x = global_block->Var("X"); + x->SetType(proto::VarType::LOD_TENSOR); + x->SetLoDLevel(0); + x->SetDataType(proto::VarType::FP32); + x->SetShape({1000, 784}); + + auto* y = global_block->Var("Y"); + y->SetType(proto::VarType::LOD_TENSOR); + y->SetLoDLevel(0); + y->SetDataType(proto::VarType::FP32); + y->SetShape({784, 100}); + + auto* op = global_block->AppendOp(); + op->SetType("mul"); + op->SetInput("X", {x->Name()}); + op->SetInput("Y", {y->Name()}); + + auto* out = global_block->Var("Out"); + out->SetType(proto::VarType::LOD_TENSOR); + op->SetOutput("Y", {out->Name()}); + + std::string binary_str; + program_origin.Proto()->SerializeToString(&binary_str); + + ProgramDesc program_restored(binary_str); + auto* global_block_restored = program_restored.MutableBlock(0); + ASSERT_NE(global_block, global_block_restored); + + auto assert_same_var = [&](const std::string& name, VarDesc* var_before) { + ASSERT_TRUE(global_block_restored->HasVar(name)); + auto* restored = global_block_restored->Var(name); + ASSERT_NE(restored, var_before); + ASSERT_EQ(restored->Name(), var_before->Name()); + ASSERT_EQ(restored->GetType(), var_before->GetType()); + ASSERT_EQ(restored->GetShape(), var_before->GetShape()); + ASSERT_EQ(restored->Proto()->SerializeAsString(), + var_before->Proto()->SerializeAsString()); + }; + + ASSERT_EQ(global_block->LocalVarNames(), + global_block_restored->LocalVarNames()); + ASSERT_EQ(3UL, global_block_restored->LocalVarNames().size()); + assert_same_var("X", x); + assert_same_var("Y", y); + assert_same_var("Out", out); + + for (size_t i = 0; i < global_block->OpSize(); ++i) { + auto op_origin = global_block->Op(i); + auto op_restored = global_block_restored->Op(i); + + ASSERT_EQ(op_origin->Type(), op_restored->Type()); + ASSERT_EQ(op_origin->Inputs(), op_restored->Inputs()); + ASSERT_EQ(op_origin->Outputs(), op_restored->Outputs()); + + ASSERT_EQ(op_restored->Proto()->SerializeAsString(), + op_origin->Proto()->SerializeAsString()); + } +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/proto_desc.h b/paddle/fluid/framework/proto_desc.h new file mode 100644 index 00000000..40521c07 --- /dev/null +++ b/paddle/fluid/framework/proto_desc.h @@ -0,0 +1,26 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +namespace paddle { +namespace framework { + +// The Index of first Block in Program. also called root block. +constexpr int kRootBlockIndex = 0; +// The Parent Index of root Block, this block does not exist. +constexpr int kNoneBlockIndex = -1; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/prune.cc b/paddle/fluid/framework/prune.cc new file mode 100644 index 00000000..0afcd85f --- /dev/null +++ b/paddle/fluid/framework/prune.cc @@ -0,0 +1,187 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/prune.h" + +#include + +#include +#include +#include +#include +#include + +namespace paddle { +namespace framework { + +const char kFeedOpType[] = "feed"; +const char kFetchOpType[] = "fetch"; + +bool HasDependentVar(const proto::OpDesc& op_desc, + const std::set& dependent_vars) { + for (auto& var : op_desc.outputs()) { + for (auto& argu : var.arguments()) { + if (dependent_vars.count(argu) != 0) { + return true; + } + } + } + return false; +} + +bool IsTarget(const proto::OpDesc& op_desc) { + if (op_desc.has_is_target()) { + return op_desc.is_target(); + } + return false; +} + +int GetSubBlockIndex(const proto::OpDesc& op_desc) { + for (auto& attr : op_desc.attrs()) { + if (attr.type() == proto::AttrType::BLOCK) { + PADDLE_ENFORCE(attr.has_block_idx()); + return attr.block_idx(); + } + } + return -1; +} + +bool HasSubBlock(const proto::OpDesc& op_desc) { + return GetSubBlockIndex(op_desc) > 0; +} + +// block_id is the idx of the current block in the input desc +// parent_block_id is the idx of the parent of the current block +// in the output desc, -1 means the current block is global block +// dependent_vars is passed recursively from the parent block to +// the child block to help pruning +void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output, + int block_id, int parent_block_id, + std::set* dependent_vars) { + auto& block = input.blocks(block_id); + auto& ops = block.ops(); + + bool expect_feed = true; + for (auto& op_desc : ops) { + PADDLE_ENFORCE(op_desc.type() != kFeedOpType || expect_feed, + "All FeedOps are at the beginning of the ProgramDesc"); + expect_feed = (op_desc.type() == kFeedOpType); + } + + bool expect_fetch = true; + for (auto op_iter = ops.rbegin(); op_iter != ops.rend(); ++op_iter) { + auto& op_desc = *op_iter; + PADDLE_ENFORCE(op_desc.type() != kFetchOpType || expect_fetch, + "All FetchOps must at the end of the ProgramDesc"); + expect_fetch = (op_desc.type() == kFetchOpType); + } + + std::vector should_run; + for (auto op_iter = ops.rbegin(); op_iter != ops.rend(); ++op_iter) { + auto& op_desc = *op_iter; + if (IsTarget(op_desc) || HasDependentVar(op_desc, *dependent_vars)) { + // insert its input to the dependency graph + for (auto& var : op_desc.inputs()) { + for (auto& argu : var.arguments()) { + dependent_vars->insert(argu); + } + } + should_run.push_back(true); + } else { + should_run.push_back(false); + } + } + + // since we are traversing the ProgramDesc in reverse order + // we reverse the should_run vector + std::reverse(should_run.begin(), should_run.end()); + + // copy the current block from input to output + auto* block_field = output->mutable_blocks(); + *block_field->Add() = input.blocks(block_id); + + int output_block_id = output->blocks_size() - 1; + auto* output_block = output->mutable_blocks(output_block_id); + output_block->set_idx(output_block_id); + output_block->set_parent_idx(parent_block_id); + + auto* op_field = output_block->mutable_ops(); + op_field->Clear(); + for (size_t i = 0; i < should_run.size(); ++i) { + if (should_run[i]) { + auto* op = op_field->Add(); + *op = input.blocks(block_id).ops(i); + if (HasSubBlock(*op)) { + // create sub_block_dependent_vars here to help prune the sub block + std::set sub_block_dependent_vars; + for (auto& var : op->inputs()) { + for (auto& argu : var.arguments()) { + sub_block_dependent_vars.insert(argu); + } + } + for (auto& var : op->outputs()) { + for (auto& argu : var.arguments()) { + sub_block_dependent_vars.insert(argu); + } + } + // GetSubBlockIndex(*op) is the idx of the sub_block in the input desc + // output_block_id is the idx of the current block in the output desc + prune_impl(input, output, GetSubBlockIndex(*op), output_block_id, + &sub_block_dependent_vars); + } + } + } + + // remove the VarDescs in BlockDesc that are not referenced in + // the pruned OpDescs + std::unordered_map var_map; + auto* var_field = output->mutable_blocks(output_block_id)->mutable_vars(); + for (const auto& var : *var_field) { + var_map[var.name()] = var; + } + + std::set var_names; + for (const auto& op : *op_field) { + auto& input_field = op.inputs(); + for (auto& input_var : input_field) { + for (auto& arg : input_var.arguments()) { + if (var_map.count(arg) != 0) { + var_names.insert(arg); + } + } + } + auto& output_field = op.outputs(); + for (auto& output_var : output_field) { + for (auto& arg : output_var.arguments()) { + if (var_map.count(arg) != 0) { + var_names.insert(arg); + } + } + } + } + + var_field->Clear(); + for (const auto& name : var_names) { + *var_field->Add() = var_map[name]; + } +} + +// TODO(fengjiayi): Prune() could be inplaced to avoid unnecessary copies +void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output) { + std::set dependent_vars; + output->clear_blocks(); + prune_impl(input, output, 0, -1, &dependent_vars); +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/prune.h b/paddle/fluid/framework/prune.h new file mode 100644 index 00000000..1be7cd25 --- /dev/null +++ b/paddle/fluid/framework/prune.h @@ -0,0 +1,26 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/prune_test.cc b/paddle/fluid/framework/prune_test.cc new file mode 100644 index 00000000..8af7d2d5 --- /dev/null +++ b/paddle/fluid/framework/prune_test.cc @@ -0,0 +1,151 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/prune.h" + +#include +#include + +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/operator.h" + +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace f = paddle::framework; + +void AddOp(const std::string &type, const f::VariableNameMap &inputs, + const f::VariableNameMap &outputs, f::AttributeMap attrs, + paddle::framework::BlockDesc *block) { + // insert output + for (auto kv : outputs) { + for (auto v : kv.second) { + auto var = block->Var(v); + var->SetDataType(paddle::framework::proto::VarType::FP32); + } + } + + // insert op + auto op = block->AppendOp(); + op->SetType(type); + for (auto &kv : inputs) { + op->SetInput(kv.first, kv.second); + } + for (auto &kv : outputs) { + op->SetOutput(kv.first, kv.second); + } + op->SetAttrMap(attrs); +} + +TEST(Prune, one_operator) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + + AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{}, + block); + + f::proto::ProgramDesc *pdesc = program.Proto(); + f::proto::ProgramDesc pruned; + + f::Prune(*pdesc, &pruned); + PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 0); + + pdesc->mutable_blocks(0)->mutable_ops(0)->set_is_target(true); + f::Prune(*pdesc, &pruned); + PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 1); +} + +TEST(Prune, forward) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + + AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"b"}}}, {{"output", {"c"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"c"}}}, {{"output", {"d"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"d"}}}, {{"output", {"e"}}}, f::AttributeMap{}, + block); + + f::proto::ProgramDesc *pdesc = program.Proto(); + + for (int i = 0; i < pdesc->blocks(0).ops_size(); ++i) { + f::proto::ProgramDesc pruned; + pdesc->mutable_blocks(0)->mutable_ops(i)->set_is_target(true); + f::Prune(*pdesc, &pruned); + PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), i + 1); + } +} + +TEST(Prune, multi_input_op) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + + AddOp("one_one", {{"input", {"a0"}}}, {{"output", {"b0"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"a1"}}}, {{"output", {"b1"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"a2"}}}, {{"output", {"b2"}}}, f::AttributeMap{}, + block); + AddOp("three_one", {{"input", {"b0", "b1", "b2"}}}, {{"output", {"c"}}}, + f::AttributeMap{}, block); + + f::proto::ProgramDesc *pdesc = program.Proto(); + pdesc->mutable_blocks(0)->mutable_ops(3)->set_is_target(true); + + f::proto::ProgramDesc pruned; + f::Prune(*pdesc, &pruned); + PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 4); +} + +TEST(Prune, multi_output_op) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + + AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, + f::AttributeMap{}, block); + AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, f::AttributeMap{}, + block); + + f::proto::ProgramDesc *pdesc = program.Proto(); + pdesc->mutable_blocks(0)->mutable_ops(2)->set_is_target(true); + + f::proto::ProgramDesc pruned; + f::Prune(*pdesc, &pruned); + PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 2); +} + +TEST(Prune, multi_target) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + + AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, + f::AttributeMap{}, block); + AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, f::AttributeMap{}, + block); + + f::proto::ProgramDesc *pdesc = program.Proto(); + pdesc->mutable_blocks(0)->mutable_ops(1)->set_is_target(true); + pdesc->mutable_blocks(0)->mutable_ops(2)->set_is_target(true); + + f::proto::ProgramDesc pruned; + f::Prune(*pdesc, &pruned); + PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 3); +} diff --git a/paddle/fluid/framework/pull_dense_worker.cc b/paddle/fluid/framework/pull_dense_worker.cc new file mode 100644 index 00000000..20d7f98e --- /dev/null +++ b/paddle/fluid/framework/pull_dense_worker.cc @@ -0,0 +1,142 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include +#include "paddle/fluid/framework/device_worker.h" + +namespace paddle { +namespace framework { + +std::shared_ptr PullDenseWorker::s_instance_ = NULL; +std::mutex PullDenseWorker::mutex_for_version_; +std::map PullDenseWorker::last_versions_; +std::map PullDenseWorker::current_version_; +std::map> PullDenseWorker::training_versions_; +std::map> + PullDenseWorker::dense_value_names_; + +void PullDenseWorker::Initialize(const TrainerDesc& param) { + running_ = false; + param_ = param.pull_dense_param(); + dwp_param_ = param.downpour_param(); + threshold_ = param_.threshold(); + thread_num_ = param_.device_num(); + sleep_time_ms_ = param_.sleep_time_ms(); + for (size_t i = 0; + i < dwp_param_.program_config(0).pull_dense_table_id_size(); ++i) { + uint64_t tid = static_cast( + dwp_param_.program_config(0).pull_dense_table_id(i)); + TableParameter table; + for (auto i : param_.dense_table()) { + if (i.table_id() == tid) { + table = i; + break; + } + } + // setup dense variables for each table + int var_num = table.dense_value_name_size(); + dense_value_names_[tid].resize(var_num); + for (int j = 0; j < var_num; ++j) { + dense_value_names_[tid][j] = table.dense_value_name(j); + } + // setup training version for each table + training_versions_[tid].resize(thread_num_, 0); + last_versions_[tid] = 0; + current_version_[tid] = 0; + } + fleet_ptr_ = FleetWrapper::GetInstance(); +} + +void PullDenseWorker::Wait(std::vector<::std::future>* status_vec) { + for (auto& t : *status_vec) { + t.wait(); + auto status = t.get(); + if (status != 0) { + LOG(WARNING) << "Current Pull Dense Thread Failed Times" + << ++pull_dense_fail_times_; + } + } + + int MAX_FAIL_NUM = 20; + if (pull_dense_fail_times_ > MAX_FAIL_NUM) { + LOG(FATAL) << "Pull Dense Failed Times More Than " << MAX_FAIL_NUM + << " Times"; + exit(-1); + } + status_vec->resize(0); +} + +void PullDenseWorker::Stop() { + if (running_) { + running_ = false; + t_.join(); + } +} + +void PullDenseWorker::PullDense(bool force_update) { + pull_dense_status_.resize(0); + for (size_t i = 0; + i < dwp_param_.program_config(0).pull_dense_table_id_size(); ++i) { + uint64_t tid = static_cast( + dwp_param_.program_config(0).pull_dense_table_id(i)); + if (force_update || CheckUpdateParam(tid)) { + fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, dense_value_names_[tid], + &pull_dense_status_); + ResetThreadVersion(tid); + } + } + if (pull_dense_status_.size() != 0) { + Wait(&pull_dense_status_); + } +} + +int PullDenseWorker::Start() { + running_ = true; + // before training, we can pull dense from pserver first. + PullDense(true); + t_ = std::thread(&PullDenseWorker::Run, this); + return 0; +} + +void PullDenseWorker::Run() { + while (running_) { + PullDense(false); +#ifndef _WIN32 + usleep(sleep_time_ms_ * 1000); +#endif + } +} + +void PullDenseWorker::IncreaseThreadVersion(int thread_id, uint64_t table_id) { + std::lock_guard lock(mutex_for_version_); + training_versions_[table_id][thread_id]++; +} + +bool PullDenseWorker::CheckUpdateParam(uint64_t table_id) { + std::lock_guard lock(mutex_for_version_); + auto& version = training_versions_[table_id]; + current_version_[table_id] = + *(std::min_element(version.begin(), version.end())); + if (current_version_[table_id] - last_versions_[table_id] < threshold_) { + return false; + } + return true; +} + +void PullDenseWorker::ResetThreadVersion(uint64_t table_id) { + std::lock_guard lock(mutex_for_version_); + last_versions_[table_id] = current_version_[table_id]; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/python_headers.h b/paddle/fluid/framework/python_headers.h new file mode 100644 index 00000000..8f9e3fad --- /dev/null +++ b/paddle/fluid/framework/python_headers.h @@ -0,0 +1,34 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +// workaround for Python 2 issue: https://bugs.python.org/issue17120 +#pragma push_macro("_XOPEN_SOURCE") +#pragma push_macro("_POSIX_C_SOURCE") +#undef _XOPEN_SOURCE +#undef _POSIX_C_SOURCE + +#include "pybind11/pybind11.h" + +#pragma pop_macro("_XOPEN_SOURCE") +#pragma pop_macro("_POSIX_C_SOURCE") + +#if !defined(PYBIND11_HIDDEN) +#ifdef _WIN32 +#define PYBIND11_HIDDEN __declspec(dllexport) +#else +#define PYBIND11_HIDDEN __attribute__((visibility("hidden"))) +#endif +#endif diff --git a/paddle/fluid/framework/reader.cc b/paddle/fluid/framework/reader.cc new file mode 100644 index 00000000..d3513fb7 --- /dev/null +++ b/paddle/fluid/framework/reader.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/reader.h" +#include + +namespace paddle { +namespace framework { + +void ReaderBase::ReadNext(std::vector *out) { + std::lock_guard lock(mu_); + PADDLE_ENFORCE_EQ(status_, ReaderStatus::kRunning); + ReadNextImpl(out); +} + +void ReaderBase::InsertDecoratedReader( + const std::shared_ptr &decorated_reader) { + std::lock_guard guard(mu_); + decorated_readers_.emplace_back(decorated_reader); +} + +std::unordered_set ReaderBase::GetEndPoints() { + std::unordered_set result; + std::deque queue; + queue.emplace_back(this); + while (!queue.empty()) { // BFS search + auto *front = queue.front(); + queue.pop_front(); + if (front->decorated_readers_.empty()) { + result.emplace(front); + } else { + for (auto &reader : front->decorated_readers_) { + if (auto *reader_ptr = reader.lock().get()) { + queue.emplace_back(reader_ptr); + } + } + } + } + + return result; +} + +void ReaderBase::Shutdown() { + std::lock_guard lock(mu_); + if (status_ != ReaderStatus::kStopped) { + ShutdownImpl(); + status_ = ReaderStatus::kStopped; + } +} + +void ReaderBase::Start() { + std::lock_guard lock(mu_); + if (status_ != ReaderStatus::kRunning) { + StartImpl(); + status_ = ReaderStatus::kRunning; + } +} + +ReaderBase::~ReaderBase() {} + +DecoratedReader::~DecoratedReader() { + VLOG(1) << "~DecoratedReader"; + reader_->Shutdown(); +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h new file mode 100644 index 00000000..4b400e72 --- /dev/null +++ b/paddle/fluid/framework/reader.h @@ -0,0 +1,151 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace framework { + +class ReaderBase { + public: + virtual void ReadNext(std::vector* out); + + virtual void Shutdown(); + + virtual void Start(); + + // Return the readers which are the end of decorating chain. Basically + // they are readers just before read op. + std::unordered_set GetEndPoints(); + + virtual ~ReaderBase(); + + protected: + virtual void ReadNextImpl(std::vector* out) {} + + virtual void ShutdownImpl() {} + + virtual void StartImpl() {} + + enum ReaderStatus { kRunning, kStopped }; + + ReaderStatus status_{kRunning}; + + mutable std::mutex mu_; + + private: + friend class DecoratedReader; + // These methods can be only invoked inside DecoratedReader to record the + // decorating chain. + void InsertDecoratedReader( + const std::shared_ptr& decorated_reader); + // A set of which readers that decorated this reader. + std::vector> decorated_readers_; +}; + +class DecoratedReader : public ReaderBase, + public std::enable_shared_from_this { + public: + explicit DecoratedReader(const std::shared_ptr& reader) + : ReaderBase(), reader_(reader) { + PADDLE_ENFORCE_NOT_NULL(reader_); + } + + void RegisterDecorateChain() { + reader_->InsertDecoratedReader(shared_from_this()); + } + + ~DecoratedReader(); + + protected: + void ShutdownImpl() override { + VLOG(1) << "ShutdownImpl"; + reader_->Shutdown(); + } + + void StartImpl() override { reader_->Start(); } + + std::shared_ptr reader_; +}; + +// FileReader is just a conceptual class. +class FileReader : public ReaderBase {}; + +// The ReaderHolder is used as reader' unified wrapper, +// making it easier to access different type reader in Variables. +class ReaderHolder { + public: + template + void Reset(const std::shared_ptr& reader) { + auto reader_base = std::dynamic_pointer_cast(reader); + PADDLE_ENFORCE_NOT_NULL(reader_base); + reader_ = reader_base; + } + + ~ReaderHolder() { VLOG(1) << "~ReaderHolder"; } + + const std::shared_ptr& Get() const { return reader_; } + + void ReadNext(std::vector* out) { + PADDLE_ENFORCE_NOT_NULL(reader_); + reader_->ReadNext(out); + } + + void ResetAll() { + VLOG(1) << "ResetAll"; + auto end_readers = reader_->GetEndPoints(); + for (auto* reader : end_readers) { + reader->Shutdown(); + } + for (auto* reader : end_readers) { + reader->Start(); + } + } + + void Shutdown() { + VLOG(1) << "Shutdown"; + PADDLE_ENFORCE_NOT_NULL(reader_); + reader_->Shutdown(); + } + + void Start() { + VLOG(1) << "start"; + PADDLE_ENFORCE_NOT_NULL(reader_); + reader_->Start(); + } + + operator const std::shared_ptr&() const { return this->reader_; } + + private: + std::shared_ptr reader_; +}; + +template +inline std::shared_ptr MakeDecoratedReader(ARGS&&... args) { + std::shared_ptr reader(new T(std::forward(args)...)); + reader->RegisterDecorateChain(); + return reader; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/reader_test.cc b/paddle/fluid/framework/reader_test.cc new file mode 100644 index 00000000..d812417a --- /dev/null +++ b/paddle/fluid/framework/reader_test.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/reader.h" +#include +#include "gtest/gtest.h" + +class StubDecoratedReader : public paddle::framework::DecoratedReader { + public: + explicit StubDecoratedReader(const std::shared_ptr &reader) + : DecoratedReader(reader) {} + + void ReadNextImpl(std::vector *out) override {} +}; + +class StubRootReader : public paddle::framework::ReaderBase { + public: + void ReadNextImpl(std::vector *out) override {} +}; + +TEST(READER, decorate_chain) { + auto root = std::make_shared(); + auto end_point1 = + paddle::framework::MakeDecoratedReader(root); + auto end_point2 = + paddle::framework::MakeDecoratedReader(root); + + { + auto endpoints = root->GetEndPoints(); + ASSERT_EQ(endpoints.size(), 2U); + ASSERT_NE(endpoints.count(end_point1.get()), 0UL); + ASSERT_NE(endpoints.count(end_point2.get()), 0UL); + } + + { + auto end_point3 = + paddle::framework::MakeDecoratedReader(root); + ASSERT_EQ(root->GetEndPoints().size(), 3U); + } + { ASSERT_EQ(root->GetEndPoints().size(), 2U); } +} diff --git a/paddle/fluid/framework/revision.cc b/paddle/fluid/framework/revision.cc new file mode 100644 index 00000000..28693a5e --- /dev/null +++ b/paddle/fluid/framework/revision.cc @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/revision.h" + +namespace paddle { +namespace framework { +#ifdef PADDLE_REVISION +const std::string kPaddleRevision(PADDLE_REVISION); +#else +const std::string kPaddleRevision("baidu/paddlepaddle/paddle@null@null"); +#endif + +const std::string GetPaddleRevision() { + return kPaddleRevision; +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/revision.h b/paddle/fluid/framework/revision.h new file mode 100644 index 00000000..11588ea5 --- /dev/null +++ b/paddle/fluid/framework/revision.h @@ -0,0 +1,23 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#pragma once + +namespace paddle { +namespace framework { +const std::string GetPaddleRevision(); +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/rw_lock.h b/paddle/fluid/framework/rw_lock.h new file mode 100644 index 00000000..f8aa8751 --- /dev/null +++ b/paddle/fluid/framework/rw_lock.h @@ -0,0 +1,99 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#if !defined(_WIN32) +#include +#else +#include // NOLINT +#endif // !_WIN32 + +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +#if !defined(_WIN32) +struct RWLock { + RWLock() { pthread_rwlock_init(&lock_, nullptr); } + + ~RWLock() { pthread_rwlock_destroy(&lock_); } + + inline void RDLock() { + PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_), 0, + "acquire read lock failed"); + } + + inline void WRLock() { + PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0, + "acquire write lock failed"); + } + + inline void UNLock() { + PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed"); + } + + private: + pthread_rwlock_t lock_; +}; +// TODO(paddle-dev): Support RWLock for WIN32 for correctness. +#else +// https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive +// In windows, rw_lock seems like a hack. Use empty object and do nothing. +struct RWLock { + // FIXME(minqiyang): use mutex here to do fake lock + inline void RDLock() { mutex_.lock(); } + + inline void WRLock() { mutex_.lock(); } + + inline void UNLock() { mutex_.unlock(); } + + private: + std::mutex mutex_; +}; +#endif + +class AutoWRLock { + public: + explicit AutoWRLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); } + + ~AutoWRLock() { UnLock(); } + + private: + inline void Lock() { lock_->WRLock(); } + + inline void UnLock() { lock_->UNLock(); } + + private: + RWLock* lock_; +}; + +class AutoRDLock { + public: + explicit AutoRDLock(RWLock* rw_lock) : lock_(rw_lock) { Lock(); } + + ~AutoRDLock() { UnLock(); } + + private: + inline void Lock() { lock_->RDLock(); } + + inline void UnLock() { lock_->UNLock(); } + + private: + RWLock* lock_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/rw_lock_test.cc b/paddle/fluid/framework/rw_lock_test.cc new file mode 100644 index 00000000..16f9cbb6 --- /dev/null +++ b/paddle/fluid/framework/rw_lock_test.cc @@ -0,0 +1,81 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/rw_lock.h" +#include +#include // NOLINT +#include // NOLINT +#include + +namespace f = paddle::framework; + +void f1(f::RWLock *lock) { + lock->RDLock(); + lock->UNLock(); +} + +TEST(RWLOCK, read_read) { + f::RWLock lock; + lock.RDLock(); + std::thread t1(f1, &lock); + std::thread t2(f1, &lock); + t1.join(); + t2.join(); + lock.UNLock(); +} + +void f2(f::RWLock *lock, std::vector *result) { + lock->RDLock(); + ASSERT_EQ(result->size(), 0UL); + lock->UNLock(); +} + +void f3(f::RWLock *lock, std::vector *result) { + lock->WRLock(); + result->push_back(1); + lock->UNLock(); +} + +TEST(RWLOCK, read_write) { + f::RWLock lock; + std::vector result; + + lock.RDLock(); + std::thread t1(f2, &lock, &result); + t1.join(); + std::thread t2(f3, &lock, &result); + std::this_thread::sleep_for(std::chrono::seconds(1)); + ASSERT_EQ(result.size(), 0UL); + lock.UNLock(); + t2.join(); + ASSERT_EQ(result.size(), 1UL); +} + +void f4(f::RWLock *lock, std::vector *result) { + lock->RDLock(); + ASSERT_EQ(result->size(), 1UL); + lock->UNLock(); +} + +TEST(RWLOCK, write_read) { + f::RWLock lock; + std::vector result; + + lock.WRLock(); + std::thread t1(f4, &lock, &result); + std::this_thread::sleep_for(std::chrono::seconds(1)); + result.push_back(1); + lock.UNLock(); + t1.join(); +} diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc new file mode 100644 index 00000000..afafff52 --- /dev/null +++ b/paddle/fluid/framework/scope.cc @@ -0,0 +1,256 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/scope.h" + +#include // for unique_ptr +#include +#include +#include +#include "glog/logging.h" +#include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/string/printf.h" + +DECLARE_bool(benchmark); + +DEFINE_bool( + eager_delete_scope, true, + "Delete local scope eagerly. It will reduce GPU memory usage but " + "slow down the destruction of variables.(around 1% performance harm)"); + +// When in inference scenario, the scopes will not be written by two threads in +// a mean time, but a scope may be read by multiple threads concurrently, and +// the mutex will cause serious performance issue. +// So the mutex is disabled when `ON_INFER`. +#ifdef PADDLE_ON_INFERENCE +#define SCOPE_KIDS_READER_LOCK +#define SCOPE_KIDS_WRITER_LOCK +#define SCOPE_VARS_READER_LOCK +#define SCOPE_VARS_WRITER_LOCK +#else +#define SCOPE_KIDS_READER_LOCK AutoRDLock auto_lock(&kids_lock_); +#define SCOPE_KIDS_WRITER_LOCK AutoWRLock auto_lock(&kids_lock_); +#define SCOPE_VARS_READER_LOCK AutoRDLock auto_lock(&vars_lock_); +#define SCOPE_VARS_WRITER_LOCK AutoWRLock auto_lock(&vars_lock_); +#endif + +namespace paddle { +namespace framework { + +Scope::~Scope() { DropKids(); } + +Scope& Scope::NewScope() const { + Scope* child = new Scope(this); + { + SCOPE_KIDS_WRITER_LOCK + kids_.push_back(child); + } + return *child; +} + +std::unique_ptr Scope::NewTmpScope() const { + return std::unique_ptr(new Scope(this)); +} + +Variable* Scope::Var(const std::string& name) { + SCOPE_VARS_WRITER_LOCK + return VarInternal(name); +} + +Variable* Scope::Var(std::string* name) { + SCOPE_VARS_WRITER_LOCK + auto new_name = std::to_string(reinterpret_cast(this)) + "." + + std::to_string(vars_.size()); + if (name != nullptr) { + *name = new_name; + } + return VarInternal(new_name); +} + +Variable* Scope::FindVar(const std::string& name) const { + SCOPE_VARS_READER_LOCK + return FindVarInternal(name); +} + +Variable* Scope::FindLocalVar(const std::string& name) const { + SCOPE_VARS_READER_LOCK + return FindVarLocally(name); +} + +const Scope* Scope::FindScope(const Variable* var) const { + SCOPE_VARS_READER_LOCK + return FindScopeInternal(var); +} + +void Scope::DropKids() { + SCOPE_KIDS_WRITER_LOCK + for (Scope* s : kids_) delete s; + kids_.clear(); +} + +bool Scope::HasKid(const Scope* scope) const { + SCOPE_KIDS_READER_LOCK + auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); + return it != this->kids_.end(); +} + +std::vector Scope::LocalVarNames() const { + std::vector known_vars; + { + SCOPE_VARS_READER_LOCK + known_vars.reserve(this->vars_.size()); + for (auto& p : vars_) { + known_vars.emplace_back(p.first); + } + } + return known_vars; +} + +void Scope::DeleteScope(Scope* scope) const { + SCOPE_KIDS_WRITER_LOCK + auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); + PADDLE_ENFORCE(it != this->kids_.end(), "%p Cannot find %p as kid scope", + this, scope); + this->kids_.erase(it); + // When making memory benchmark on Fluid, we have to delete scope sync. + if (FLAGS_benchmark || FLAGS_eager_delete_scope) { + delete scope; + } else { + Async([scope] { delete scope; }); + } +} + +void Scope::EraseVars(const std::vector& var_names) { + std::set var_set(var_names.begin(), var_names.end()); + SCOPE_VARS_WRITER_LOCK + for (auto it = vars_.begin(); it != vars_.end();) { + if (var_set.find(it->first) != var_set.end()) { + it = vars_.erase(it); + } else { + ++it; + } + } +} + +void Scope::Rename(const std::string& origin_name, + const std::string& new_name) const { + SCOPE_VARS_WRITER_LOCK + RenameInternal(origin_name, new_name); +} + +std::string Scope::Rename(const std::string& origin_name) const { + SCOPE_VARS_WRITER_LOCK + auto new_name = string::Sprintf("%p.%d", this, vars_.size()); + RenameInternal(origin_name, new_name); + return new_name; +} + +Variable* Scope::VarInternal(const std::string& name) { + auto* v = FindVarLocally(name); + if (v != nullptr) return v; + v = new Variable(); + vars_.emplace(name, std::unique_ptr(v)); + VLOG(3) << "Create variable " << name; + return v; +} + +const Scope* Scope::FindScopeInternal(const Variable* var) const { + for (auto& kv : vars_) { + if (kv.second.get() == var) { + return this; + } + } + return (parent_ == nullptr) ? nullptr : parent_->FindScope(var); +} + +void Scope::RenameInternal(const std::string& origin_name, + const std::string& new_name) const { + auto origin_it = vars_.find(origin_name); + PADDLE_ENFORCE(origin_it != vars_.end(), + "Cannot find original variable with name %s", origin_name); + auto new_it = vars_.find(new_name); + PADDLE_ENFORCE(new_it == vars_.end(), + "The variable with name %s is already in the scope", new_name); + vars_[new_name].reset(origin_it->second.release()); + vars_.erase(origin_it); +} + +Variable* Scope::FindVarInternal(const std::string& name) const { + auto var = FindVarLocally(name); + if (var != nullptr) { + return var; + } + return (parent_ == nullptr) ? nullptr : parent_->FindVar(name); +} + +Variable* Scope::FindVarLocally(const std::string& name) const { + auto it = vars_.find(name); + if (it != vars_.end()) return it->second.get(); + return nullptr; +} + +void Scope::EraseVarsExcept(const std::unordered_set& vars) { + SCOPE_VARS_WRITER_LOCK + for (auto iter = vars_.begin(); iter != vars_.end();) { + if (vars.count(iter->second.get()) != 0) { + ++iter; + } else { + vars_.erase(iter++); + } + } +} + +std::string GenScopeTreeDebugInfo(Scope* root) { + std::stringstream os; + + if (!root) return ""; + + // level traversal + std::queue queue; + queue.push(root); + + std::vector scopes; + + while (!queue.empty()) { + auto* end = queue.back(); + Scope* q = nullptr; + while (q != end) { + q = queue.front(); + queue.pop(); + os << q << " "; + scopes.push_back(q); + + for (auto* c : q->kids()) { + queue.push(c); + } + } + // end of a level + os << "\n------------------------------------------\n"; + } + + os << "\nDetails:\n\n"; + + for (Scope* q : scopes) { + os << "====\n"; + os << q << ":\n"; + for (auto& var : q->LocalVarNames()) { + os << " - " << var << "\n"; + } + } + + return os.str(); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h new file mode 100644 index 00000000..9de29632 --- /dev/null +++ b/paddle/fluid/framework/scope.h @@ -0,0 +1,154 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +extern "C" { +#include +} + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/rw_lock.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace framework { + +class Scope; + +/** + * @brief Scope that manage all variables. + * + * Scope is an association of a name to Variable. All variables belong to + * Scope. You need to specify a scope to run a Net, i.e., `net.Run(&scope)`. + * One net can run in different scopes and update different variable in the + * scope. + */ +class Scope { + public: + Scope() {} + ~Scope(); + + /// Create a sub-scope. Returns a reference other than a pointer so + /// to prevent from manual deletion. + /// Mark it to const because that new kid scope cannot change parent scope. + Scope& NewScope() const; + + /// Create a sub-scope for current scope but do not record it in the kids to + /// avoid performance problems. + std::unique_ptr NewTmpScope() const; + + /// Create a variable with given name if it doesn't exist. + /// Caller doesn't own the returned Variable. + Variable* Var(const std::string& name); + + /// Create a variable with a scope-unique name. + /// Caller doesn't own the returned Variable. + Variable* Var(std::string* name = nullptr); + + void EraseVars(const std::vector& var_names); + + // Erase all variables except the given `vars` + void EraseVarsExcept(const std::unordered_set& vars); + + /// Find a variable in the scope or any of its ancestors. Returns + /// nullptr if cannot find. + /// Caller doesn't own the returned Variable. + Variable* FindVar(const std::string& name) const; + + /// Find a variable in the current scope. + /// Return nullptr if cannot find. + /// Caller doesn't own the returned Variable. + Variable* FindLocalVar(const std::string& name) const; + + const Scope* parent() const { return parent_; } + + /// Find the scope or an ancestor scope that contains the given variable. + const Scope* FindScope(const Variable* var) const; + + void DeleteScope(Scope* scope) const; + + /// Drop all kids scopes belonged to this scope. + void DropKids(); + + /// Find if a scope exists in the kid scopes + bool HasKid(const Scope* scope) const; + + const std::list& kids() const { return kids_; } + + // enumerate all the variables current contains. + std::vector LocalVarNames() const; + + // Rename variable to a new name + void Rename(const std::string& origin_name, + const std::string& new_name) const; + + // Rename variable to a new name and return the new name + std::string Rename(const std::string& origin_name) const; + + protected: + struct KeyHasher { + std::size_t operator()(const std::string& key) const { + return XXH32(key.c_str(), key.size(), 1); + } + }; + + mutable std::unordered_map, KeyHasher> + vars_; + + private: + // Call Scope::NewScope for a sub-scope. + explicit Scope(Scope const* parent) : parent_(parent) {} + + // Called by Var. + Variable* VarInternal(const std::string& name); + + // Called by FindScope. + const Scope* FindScopeInternal(const Variable* var) const; + + // Called by Rename. + void RenameInternal(const std::string& origin_name, + const std::string& new_name) const; + + // Called by FindVar recursively. + Variable* FindVarInternal(const std::string& name) const; + + // Called by FindVarInternal and Var. + Variable* FindVarLocally(const std::string& name) const; + + // Scope in `kids_` are owned by this class. + mutable std::list kids_; + const Scope* parent_{nullptr}; + + DISABLE_COPY_AND_ASSIGN(Scope); + + private: + mutable RWLock kids_lock_; + mutable RWLock vars_lock_; +}; + +// Generate some debug string about the inherience structure of scope, quite +// naive. +std::string GenScopeTreeDebugInfo(Scope*); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/scope_pool.cc b/paddle/fluid/framework/scope_pool.cc new file mode 100644 index 00000000..5cb241a7 --- /dev/null +++ b/paddle/fluid/framework/scope_pool.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/scope_pool.h" +#include "paddle/fluid/framework/threadpool.h" + +namespace paddle { +namespace framework { + +ScopePool &ScopePool::Instance() { // NOLINT + static ScopePool pool; + return pool; +} + +void ScopePool::DeleteScope(Scope *scope) { delete scope; } + +void ScopePool::Insert(std::unique_ptr &&s) { + std::lock_guard guard(mtx_); + scopes_.insert(s.release()); +} + +void ScopePool::Remove(Scope *s) { + size_t has_scope; + { + std::lock_guard guard(mtx_); + has_scope = scopes_.erase(s); + } + PADDLE_ENFORCE(has_scope > 0, "Delete non-existing global scope"); + DeleteScope(s); +} + +ScopePool::~ScopePool() { Clear(); } + +void ScopePool::Clear() { + std::lock_guard guard(mtx_); + for (auto *s : scopes_) { + DeleteScope(s); + } + scopes_.clear(); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/scope_pool.h b/paddle/fluid/framework/scope_pool.h new file mode 100644 index 00000000..a8b46869 --- /dev/null +++ b/paddle/fluid/framework/scope_pool.h @@ -0,0 +1,46 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include // NOLINT +#include +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { + +class ScopePool { + public: + static ScopePool &Instance(); // NOLINT + + void Insert(std::unique_ptr &&s); + + void Remove(Scope *s); + + void Clear(); + + ~ScopePool(); + + private: + ScopePool() = default; + + static void DeleteScope(Scope *scope); + + std::unordered_set scopes_; + std::mutex mtx_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/scope_test.cc b/paddle/fluid/framework/scope_test.cc new file mode 100644 index 00000000..ebf8178a --- /dev/null +++ b/paddle/fluid/framework/scope_test.cc @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/scope.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +using paddle::framework::Scope; +using paddle::framework::Variable; + +TEST(Scope, VarsShadowing) { + Scope s; + Scope& ss1 = s.NewScope(); + Scope& ss2 = s.NewScope(); + + Variable* v0 = s.Var("a"); + Variable* v1 = ss1.Var("a"); + + EXPECT_NE(v0, v1); + + EXPECT_EQ(v0, s.FindVar("a")); + EXPECT_EQ(v1, ss1.FindVar("a")); + EXPECT_EQ(v0, ss2.FindVar("a")); +} + +TEST(Scope, FindVar) { + Scope s; + Scope& ss = s.NewScope(); + + EXPECT_EQ(nullptr, s.FindVar("a")); + EXPECT_EQ(nullptr, ss.FindVar("a")); + + ss.Var("a"); + + EXPECT_EQ(nullptr, s.FindVar("a")); + EXPECT_NE(nullptr, ss.FindVar("a")); +} + +TEST(Scope, FindScope) { + Scope s; + Scope& ss = s.NewScope(); + Variable* v = s.Var("a"); + + EXPECT_EQ(&s, s.FindScope(v)); + EXPECT_EQ(&s, ss.FindScope(v)); +} + +TEST(Scope, GetAllNames) { + Scope s; + Variable* v = s.Var("a"); + EXPECT_EQ(&s, s.FindScope(v)); + + std::vector ans = s.LocalVarNames(); + std::string str; + for (auto& var : ans) { + str += var; + } + + EXPECT_STREQ("a", str.c_str()); +} diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc new file mode 100644 index 00000000..c1a404c1 --- /dev/null +++ b/paddle/fluid/framework/section_worker.cc @@ -0,0 +1,411 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#include "google/protobuf/io/zero_copy_stream_impl.h" +#include "google/protobuf/message.h" +#include "google/protobuf/text_format.h" + +#include "paddle/fluid/framework/device_worker.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/trainer_desc.pb.h" +#include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/lodtensor_printer.h" + +namespace paddle { +namespace framework { + +uint64_t SyncFunctor::sync_flag_ = 0; +std::vector SyncFunctor::pipeline_scopes_; + +SyncFunctor::SyncFunctor(int rank_id, int rank_num, int sync_steps) + : rank_id_(rank_id), rank_num_(rank_num), sync_steps_(sync_steps) { + PADDLE_ENFORCE(rank_num > 1, "rank_num should larger than 1"); + counter_ = 0; + sync_signal_ = 0; + uint8_t* ptr = reinterpret_cast(&sync_signal_); + for (int i = 0; i < rank_num_; ++i) { + ptr[i] = 0xFF; + } +} + +int SyncFunctor::operator()(Scope* scope) { + ++counter_; + if (counter_ < sync_steps_) { + return 0; + } + if (counter_ == sync_steps_) { + reinterpret_cast(&sync_flag_)[rank_id_] = 0xFF; + } + + if (sync_flag_ == sync_signal_) { + static std::mutex mutex; + if (mutex.try_lock()) { + if (sync_flag_ == sync_signal_) { + Synchronize(); + sync_flag_ = 0; + } + mutex.unlock(); + } + } + + if (sync_flag_ == 0) { + counter_ = 0; + } + return 0; +} + +void SyncFunctor::Synchronize() { + for (const std::string& name : *sync_param_) { + platform::NCCLGroupGuard guard; + for (int i = 0; i < rank_num_; ++i) { + const platform::NCCLContext& nccl_ctx = nccl_ctx_map_->at(i); + LoDTensor* tensor = + pipeline_scopes_[i]->Var(name)->GetMutable(); + // TODO(hutuxian): do not depend on data type explicitly + float* data = + tensor->mutable_data(nccl_ctx_map_->DevCtx(i)->GetPlace()); + const int numel = tensor->numel(); + + paddle::framework::AttributeMap attrs; + attrs.insert({"scale", static_cast(1. / rank_num_)}); + auto scale_op = framework::OpRegistry::CreateOp("scale", {{"X", {name}}}, + {{"Out", {name}}}, attrs); + scale_op->Run(*(pipeline_scopes_[i]), + nccl_ctx_map_->DevCtx(i)->GetPlace()); + PADDLE_ENFORCE(platform::dynload::ncclAllReduce( + data, data, numel, ncclFloat, ncclSum, nccl_ctx.comm(), + dynamic_cast( + platform::DeviceContextPool::Instance().Get( + platform::CUDAPlace(i))) + ->stream())); + } + } + nccl_ctx_map_->WaitAll(); +} + +std::atomic SectionWorker::cpu_id_(0); +void SectionWorker::Initialize(const TrainerDesc& trainer_desc) { + dev_ctx_ = platform::DeviceContextPool::Instance().Get(place_); + std::shared_ptr program; + program.reset(new ProgramDesc( + trainer_desc.section_param().section_config(section_id_).program_desc())); + for (auto& op_desc : program->Block(0).AllOps()) { + ops_.push_back(OpRegistry::CreateOp(*op_desc)); + } +} + +void SectionWorker::AutoSetCPUAffinity(bool reuse) { + int thread_cpu_id = cpu_id_.fetch_add(1); + + unsigned concurrency_cap = std::thread::hardware_concurrency(); + unsigned proc = thread_cpu_id; + + if (proc >= concurrency_cap) { + if (reuse) { + proc %= concurrency_cap; + } else { + LOG(INFO) << "All " << concurrency_cap + << " CPUs have been set affinities. Fail to set " + << thread_cpu_id << "th thread"; + return; + } + } + + cpu_set_t mask; + CPU_ZERO(&mask); + CPU_SET(proc, &mask); + + if (-1 == sched_setaffinity(0, sizeof(mask), &mask)) { + LOG(WARNING) << "Fail to set thread affinity to CPU " << proc; + return; + } + + CPU_ZERO(&mask); + if ((0 != sched_getaffinity(0, sizeof(mask), &mask)) || + (0 == CPU_ISSET(proc, &mask))) { + LOG(WARNING) << "Fail to set thread affinity to CPU " << proc; + } + SEC_LOG << "Set " << thread_cpu_id << "th thread affinity to CPU " << proc; +} + +void SectionWorker::TrainFiles() { + SEC_LOG << "begin section_worker TrainFiles"; + AutoSetCPUAffinity(true); + + int64_t step_cnt = 0; + int64_t accum_num = 0; + int batch_size = 0; + Scope* scope = nullptr; + while (in_scope_queue_->Receive(&scope)) { + if (device_reader_ != nullptr) { + device_reader_->AssignFeedVar(*scope); + batch_size = device_reader_->Next(); + if (batch_size <= 0) { + break; + } + SEC_LOG << "read batch size: " << batch_size; + } else { + // TODO(hutuxian): Keep batch_size in scope? Or is there a better way to + // fetch batch_size? Some variables may not have batch_size. + PADDLE_ENFORCE( + in_var_names_->size(), + "Section without a reader or in variable is not supported by now"); + const LoDTensor& tensor = + scope->FindVar(in_var_names_->at(0))->Get(); + batch_size = + tensor.lod().size() ? tensor.lod()[0].size() - 1 : tensor.dims()[0]; + SEC_LOG << "input batch size: " << batch_size; + } + + Scope* exe_scope = scope; + if (section_id_ > 0 && platform::is_gpu_place(place_)) { + SEC_LOG << "CPU2GPU memory copy"; + + if (scope->kids().empty()) { + exe_scope = &scope->NewScope(); + } else { + exe_scope = scope->kids().front(); + PADDLE_ENFORCE(scope->kids().size() == 1, "scope->kids().size(): %zu", + scope->kids().size()); + } + + for (const std::string& name : *in_var_names_) { + const LoDTensor& src_tensor = scope->FindVar(name)->Get(); + if (platform::is_gpu_place(src_tensor.place())) { + continue; + } + LoDTensor* gpu_tensor = exe_scope->Var(name)->GetMutable(); + gpu_tensor->set_lod(src_tensor.lod()); + TensorCopy(*static_cast(&src_tensor), place_, *dev_ctx_, + static_cast(gpu_tensor)); + } + } + + SEC_LOG << "begin running ops"; + + for (auto& op : ops_) { + op->Run(*exe_scope, place_); + } + exe_scope->DropKids(); + // Wait for GPU calc finising, as the cudaMemcpy and GPU calc may be in + // different streams + // No effect when it is a CPUDeviceContext + dev_ctx_->Wait(); + + if (section_id_ != section_num_ - 1 && platform::is_gpu_place(place_)) { + // FIXME: Temporarily we assume two adjacent sections are in different + // places, + // and we do data transformation only in sections in GPU place, so the + // data is + // transform from GPU to CPU + // A better way to handle such a data transformation is to record each + // place of + // joint-out variables, and do transform as required + + SEC_LOG << "GPU2CPU memory copy"; + + for (const std::string& name : *out_var_names_) { + const LoDTensor& src_tensor = + exe_scope->FindVar(name)->Get(); + LoDTensor* dst_tensor = scope->Var(name)->GetMutable(); + dst_tensor->set_lod(src_tensor.lod()); + TensorCopy(*static_cast(&src_tensor), + next_section_place_, *dev_ctx_, + static_cast(dst_tensor)); + } + } + + out_scope_queue_->Send(scope); + + if (sync_func_) { + (*sync_func_)(scope); + } + + ++step_cnt; + accum_num += batch_size; + } + + worker_count_mutex_->lock(); + --(*worker_count_); + worker_count_mutex_->unlock(); + + if (*worker_count_ <= 0) { + while (section_id_ < section_num_ - 1 && out_scope_queue_->Size()) { + sleep(1); + } + out_scope_queue_->Close(); + } +} + +void SectionWorker::TrainFilesWithProfiler() { + SEC_LOG << "begin section_worker TrainFiles with profiler"; + AutoSetCPUAffinity(true); + + int64_t step_cnt = 0; + int64_t accum_num = 0; + int batch_size = 0; + Scope* scope = nullptr; + + platform::Timer reader_timer; + platform::Timer cal_timer; + platform::Timer trans_timer; + platform::Timer sync_timer; + platform::Timer main_timer; + platform::Timer outer_timer; + + std::vector op_total_time; + std::vector op_name; + for (auto& op : ops_) { + op_name.push_back(op->Type()); + } + op_total_time.resize(ops_.size()); + for (size_t i = 0; i < op_total_time.size(); ++i) { + op_total_time[i] = 0.0; + } + platform::Timer timeline; + + bool started = false; + while (in_scope_queue_->Receive(&scope)) { + if (UNLIKELY(!started)) { + outer_timer.Start(); + started = true; + } + main_timer.Resume(); + + if (device_reader_ != nullptr) { + reader_timer.Resume(); + device_reader_->AssignFeedVar(*scope); + batch_size = device_reader_->Next(); + reader_timer.Pause(); + if (batch_size <= 0) { + break; + } + SEC_LOG << "read batch size: " << batch_size; + } else { + PADDLE_ENFORCE( + in_var_names_->size(), + "Section without a reader or in variable is not supported by now"); + const LoDTensor& tensor = + scope->FindVar(in_var_names_->at(0))->Get(); + batch_size = + tensor.lod().size() ? tensor.lod()[0].size() - 1 : tensor.dims()[0]; + SEC_LOG << "input batch size: " << batch_size; + } + + Scope* exe_scope = scope; + if (section_id_ > 0 && platform::is_gpu_place(place_)) { + SEC_LOG << "CPU2GPU memory copy"; + trans_timer.Resume(); + if (scope->kids().empty()) { + exe_scope = &scope->NewScope(); + } else { + exe_scope = scope->kids().front(); + PADDLE_ENFORCE(scope->kids().size() == 1, "scope->kids().size(): %zu", + scope->kids().size()); + } + + for (const std::string& name : *in_var_names_) { + const LoDTensor& src_tensor = scope->FindVar(name)->Get(); + if (platform::is_gpu_place(src_tensor.place())) { + continue; + } + LoDTensor* gpu_tensor = exe_scope->Var(name)->GetMutable(); + gpu_tensor->set_lod(src_tensor.lod()); + TensorCopy(*static_cast(&src_tensor), place_, *dev_ctx_, + static_cast(gpu_tensor)); + } + trans_timer.Pause(); + } + + SEC_LOG << "begin running ops"; + cal_timer.Resume(); + int op_id = 0; + for (auto& op : ops_) { + timeline.Start(); + op->Run(*exe_scope, place_); + timeline.Pause(); + op_total_time[op_id++] += timeline.ElapsedUS(); + } + exe_scope->DropKids(); + // Wait for GPU calc finising, as the cudaMemcpy and GPU calc may be in + // different streams + // No effect when it is a CPUDeviceContext + dev_ctx_->Wait(); + cal_timer.Pause(); + + if (section_id_ != section_num_ - 1 && platform::is_gpu_place(place_)) { + // FIXME: Temporarily we assume two adjacent sections are in different + // places, + // and we do data transformation only in sections in GPU place, so the + // data is + // transform from GPU to CPU + // A better way to handle such a data transformation is to record each + // place of + // joint-out variables, and do transform as required + + SEC_LOG << "GPU2CPU memory copy"; + trans_timer.Resume(); + for (const std::string& name : *out_var_names_) { + const LoDTensor& src_tensor = + exe_scope->FindVar(name)->Get(); + LoDTensor* dst_tensor = scope->Var(name)->GetMutable(); + dst_tensor->set_lod(src_tensor.lod()); + TensorCopy(*static_cast(&src_tensor), + next_section_place_, *dev_ctx_, + static_cast(dst_tensor)); + } + trans_timer.Pause(); + } + + out_scope_queue_->Send(scope); + + if (sync_func_) { + sync_timer.Resume(); + (*sync_func_)(scope); + sync_timer.Pause(); + } + + ++step_cnt; + accum_num += batch_size; + main_timer.Pause(); + } + outer_timer.Pause(); + + worker_count_mutex_->lock(); + --(*worker_count_); + worker_count_mutex_->unlock(); + + if (*worker_count_ <= 0) { + while (section_id_ < section_num_ - 1 && out_scope_queue_->Size()) { + sleep(1); + } + out_scope_queue_->Close(); + } + LOG(ERROR) << "log_for_profile" + << " card:" << pipeline_id_ << " thread:" << thread_id_ + << " section:" << section_id_ << " step_count:" << step_cnt + << " batch_count:" << accum_num + << " read_time:" << reader_timer.ElapsedUS() + << " trans_time:" << trans_timer.ElapsedUS() + << " cal_time:" << cal_timer.ElapsedUS() + << " sync_time:" << sync_timer.ElapsedUS() + << " main_time:" << main_timer.ElapsedUS() + << " outer_time:" << outer_timer.ElapsedUS(); + for (size_t i = 0; i < ops_.size(); ++i) { + LOG(ERROR) << "op: " << op_name[i] + << ", mean time: " << op_total_time[i] / accum_num; + } +} +} // namespace framework +} // namespace paddle +#endif diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc new file mode 100644 index 00000000..54a81825 --- /dev/null +++ b/paddle/fluid/framework/selected_rows.cc @@ -0,0 +1,234 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/selected_rows.h" + +namespace paddle { +namespace framework { + +struct ReAllocateVisitor { + ReAllocateVisitor(const framework::DDim& dims, framework::Tensor* tensor) + : dims_(dims), tensor_(tensor) {} + + template + void operator()() const { + framework::Tensor cpu_tensor; + platform::CPUPlace cpu; + T* ptr = cpu_tensor.mutable_data(dims_, cpu); + const T* old_ptr = + tensor_->memory_size() == 0 ? nullptr : tensor_->data(); + if (old_ptr != nullptr) { + std::copy(old_ptr, old_ptr + tensor_->numel(), ptr); + } + tensor_->ShareDataWith(cpu_tensor); + } + + framework::DDim dims_; + framework::Tensor* tensor_; +}; + +struct TensorCopyVisitor { + TensorCopyVisitor(framework::Tensor* dst, int64_t dst_offset, + const framework::Tensor src, int64_t src_offset, + int64_t size) + : dst_(dst), + dst_offset_(dst_offset), + src_(src), + src_offset_(src_offset), + size_(size) {} + + template + void apply() const { + // TODO(Yancey1989): support other place + platform::CPUPlace cpu; + memory::Copy(cpu, dst_->mutable_data(cpu) + dst_offset_, cpu, + src_.data() + src_offset_, size_ * sizeof(T)); + } + + framework::Tensor* dst_; + int64_t dst_offset_; + framework::Tensor src_; + int64_t src_offset_; + int64_t size_; +}; + +struct TensorFillVisitor { + TensorFillVisitor(framework::Tensor* dst, int64_t dst_offset, int64_t size, + float value) + : dst_(dst), dst_offset_(dst_offset), size_(size) {} + + template + void apply() const { + // TODO(qiao): support other place + platform::CPUPlace cpu; + auto* tensor_data = dst_->mutable_data(cpu); + auto* start = tensor_data + dst_offset_; + auto* end = start + size_; + std::fill(start, end, static_cast(0.0)); + } + + framework::Tensor* dst_; + int64_t dst_offset_; + int64_t size_; +}; + +void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, + const platform::DeviceContext& dev_ctx) { + { // the 1st field, uint32_t version + constexpr uint32_t version = 0; + os.write(reinterpret_cast(&version), sizeof(version)); + } + { + // the 2st field, rows information + auto& rows = selected_rows.rows(); + uint64_t size = rows.size(); + os.write(reinterpret_cast(&size), sizeof(size)); + for (uint64_t i = 0; i < size; ++i) { + os.write(reinterpret_cast(&rows[i]), sizeof(rows[i])); + } + } + { + // the 3st field, the height of SelectedRows + int64_t height = selected_rows.height(); + os.write(reinterpret_cast(&height), sizeof(height)); + } + // the 4st field, Tensor data + TensorToStream(os, selected_rows.value(), dev_ctx); +} + +void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, + const platform::DeviceContext& dev_ctx) { + { + // the 1st field, unit32_t version for SelectedRows + uint32_t version; + is.read(reinterpret_cast(&version), sizeof(version)); + PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); + } + { + // the 2st field, rows information + uint64_t size; + is.read(reinterpret_cast(&size), sizeof(size)); + auto& rows = *selected_rows->mutable_rows(); + rows.resize(size); + for (uint64_t i = 0; i < size; ++i) { + is.read(reinterpret_cast(&rows[i]), sizeof(int64_t)); + } + } + { + // the 3st field, the height of the SelectedRows + int64_t height; + is.read(reinterpret_cast(&height), sizeof(int64_t)); + selected_rows->set_height(height); + } + // the 4st field, tensor which contains the data + TensorFromStream(is, selected_rows->mutable_value(), dev_ctx); +} + +bool SelectedRows::HasKey(int64_t key) const { + return std::find(rows_.begin(), rows_.end(), key) == rows_.end() ? false + : true; +} + +int64_t SelectedRows::AutoGrownIndex(int64_t key, bool auto_grown, + bool is_test) { + if (is_test) { + auto iter = id_to_index_.find(key); + if (iter == id_to_index_.end()) { + return -1; + } else { + return iter->second; + } + } + + rwlock_->RDLock(); + auto iter = id_to_index_.find(key); + if (iter == id_to_index_.end()) { + rwlock_->UNLock(); + if (!auto_grown) { + PADDLE_THROW("key %d not found", key); + } + rwlock_->WRLock(); + auto map_size = id_to_index_.size(); + auto vector_size = rows_.size(); + if (map_size != vector_size) { + rwlock_->UNLock(); + PADDLE_THROW( + "id_to_index_ size %d should have the same size with rows_ %d", + map_size, vector_size); + } + auto write_iter = id_to_index_.find(key); + if (write_iter == id_to_index_.end()) { + int row_num = rows_.size(); + if (row_num == value_->dims()[0]) { + rwlock_->UNLock(); + PADDLE_THROW("selected rows is full, then length exceed %d", row_num); + } + // key logic to put a key into id_to_index_ + rows_.push_back(key); + auto index = static_cast(rows_.size() - 1); + id_to_index_[key] = index; + rwlock_->UNLock(); + return index; + } else { + auto index = write_iter->second; + rwlock_->UNLock(); + return index; + } + } else { + auto index = iter->second; + rwlock_->UNLock(); + return index; + } +} + +void SelectedRows::SyncIndex() { + rwlock_->WRLock(); + id_to_index_.clear(); + for (size_t i = 0; i < rows_.size(); ++i) { + id_to_index_[rows_[i]] = i; + } + rwlock_->UNLock(); +} + +void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value, + bool auto_grown, bool is_test) { + PADDLE_ENFORCE(value->IsInitialized(), + "The value tensor should be initialized."); + if (ids.numel() == 0) { + VLOG(3) << "keys is empty, please check data!"; + } else { + int64_t value_width = value_->numel() / value_->dims()[0]; + PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0], + "output tensor should have the same shape with table " + "except the dims[0]."); + for (int i = 0; i < ids.numel(); ++i) { + auto id = ids.data()[i]; + int64_t index = AutoGrownIndex(id, auto_grown, is_test); + if (index < 0) { + VLOG(5) << "id " << id << " not in the table, return 0"; + framework::VisitDataType( + value_->type(), + TensorFillVisitor(value, i * value_width, value_width, 0.0)); + } else { + framework::VisitDataType( + value_->type(), + TensorCopyVisitor(value, i * value_width, *value_.get(), + index * value_width, value_width)); + } + } + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h new file mode 100644 index 00000000..e1bdba9b --- /dev/null +++ b/paddle/fluid/framework/selected_rows.h @@ -0,0 +1,167 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include // NOLINT +#include +#include +#include + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/rw_lock.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/memory/memcpy.h" + +namespace paddle { +namespace framework { + +class SelectedRows { + /* + * @brief We can use the SelectedRows structure to reproduce a sparse table. + * A sparse table is a key-value structure that the key is an `int64_t`, + * and the value is a Tensor which the first dimension is 0. + * You can use the following interface to operate the sparse table, and you + * can find + * some detail information from the comments of each interface: + * + * HasKey(key), whether the sparse table has the specified key. + * Set(key, value), set a key-value pair into the sparse table. + * Get(keys, value*), get value by given key list and apply it to the given + * value pointer + * with the specified offset. + * + */ + public: + SelectedRows(const std::vector& rows, const int64_t& height) + : rows_(rows), height_(height) { + value_.reset(new Tensor()); + rwlock_.reset(new RWLock); + } + + SelectedRows() { + height_ = 0; + value_.reset(new Tensor()); + rwlock_.reset(new RWLock); + } + + platform::Place place() const { return value_->place(); } + + const Tensor& value() const { return *value_; } + + Tensor* mutable_value() { return value_.get(); } + + int64_t height() const { return height_; } + + void set_height(int64_t height) { height_ = height; } + + const Vector& rows() const { return rows_; } + + Vector* mutable_rows() { return &rows_; } + + void set_rows(const Vector& rows) { rows_ = rows; } + + /* + * @brief Get the index of key in rows + * + * @return -1 if the key does not exists. + */ + int64_t Index(int64_t key) const { + auto it = std::find(rows_.begin(), rows_.end(), key); + if (it == rows_.end()) { + PADDLE_THROW("id %s not in table", key); + } + return static_cast(std::distance(rows_.begin(), it)); + } + + /* + * @brief whether has the specified key in the table. + * + * @return true if the key is exists. + */ + bool HasKey(int64_t key) const; + + /* + * @brief Get value by the key list. + * Note!!! this interface is only used when selected_rows is used as + * parameters + * for distribute lookup table. + * + * @return a list of pair which contains the non-exists key and the index in + * the value + */ + void Get(const framework::Tensor& ids, framework::Tensor* value, + bool auto_grown = false, bool is_test = false); + + /* + * @brief Get the index of the key from id_to_index_ map. If the key not + * exist, + * add the key into id_to_index_. + * + * Note!!! this interface is only used when selected_rows is used as + * parameters + * for distribute lookup table. + * + * @return index of the key. + */ + int64_t AutoGrownIndex(int64_t key, bool auto_grown, bool is_test = false); + + /* + * @brief Get the index of the key from id_to_index_ map. + */ + inline int64_t GetIndexFromId(int64_t key) { + auto iter = id_to_index_.find(key); + if (iter == id_to_index_.end()) { + return -1; + } else { + return iter->second; + } + } + + void SyncIndex(); + /* + * @brief Get complete Dims before + */ + DDim GetCompleteDims() const { + std::vector dims = vectorize(value_->dims()); + dims[0] = height_; + return make_ddim(dims); + } + + private: + // Notice: rows can be duplicate. We can have {0, 4, 7, 0, 5, 7, 9} here. + // SelectedRows are simply concated when adding together. Until a + // SelectedRows add a Tensor, will the duplicate rows be handled. + Vector rows_; + std::unordered_map + id_to_index_; // should not be used when rows_ has duplicate member + std::unique_ptr value_{nullptr}; + int64_t height_; // height indicates the underline tensor's height + std::unique_ptr rwlock_{nullptr}; +}; + +/* + * Serialize/Desiralize SelectedRows to std::ostream + * You can pass ofstream or ostringstream to serilize to file + * or to a in memory string. GPU tensor will be copied to CPU. + */ +void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, + const platform::DeviceContext& dev_ctx); +void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, + const platform::DeviceContext& dev_ctx); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/selected_rows_test.cc b/paddle/fluid/framework/selected_rows_test.cc new file mode 100644 index 00000000..3b0509e0 --- /dev/null +++ b/paddle/fluid/framework/selected_rows_test.cc @@ -0,0 +1,202 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include // NOLINT + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/selected_rows.h" + +namespace paddle { +namespace framework { + +class SelectedRowsTester : public ::testing::Test { + public: + void SetUp() override { + std::vector rows{0, 4, 7}; + int64_t height = 10; + int64_t row_numel = 100; + selected_rows_.reset(new SelectedRows(rows, height)); + + Tensor* value = selected_rows_->mutable_value(); + auto* data = value->mutable_data( + make_ddim({static_cast(rows.size()), row_numel}), place_); + for (int64_t i = 0; i < value->numel(); ++i) { + data[i] = static_cast(i); + } + } + + protected: + platform::CPUPlace place_; + std::unique_ptr selected_rows_{nullptr}; +}; + +TEST_F(SelectedRowsTester, height) { ASSERT_EQ(selected_rows_->height(), 10); } + +TEST_F(SelectedRowsTester, dims) { + ASSERT_EQ(selected_rows_->value().dims(), make_ddim({3, 100})); +} + +TEST_F(SelectedRowsTester, complete_dims) { + ASSERT_EQ(selected_rows_->GetCompleteDims(), make_ddim({10, 100})); +} + +TEST_F(SelectedRowsTester, SerializeAndDeseralize) { + SelectedRows dst_tensor; + platform::CPUDeviceContext cpu_ctx(place_); + std::ostringstream oss; + + SerializeToStream(oss, *selected_rows_, cpu_ctx); + + std::istringstream iss(oss.str()); + DeserializeFromStream(iss, &dst_tensor, cpu_ctx); + + ASSERT_EQ(selected_rows_->rows(), dst_tensor.rows()); + ASSERT_EQ(selected_rows_->height(), dst_tensor.height()); + ASSERT_EQ(selected_rows_->value().dims(), dst_tensor.value().dims()); + ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims()); + auto* dst_data = dst_tensor.value().data(); + for (int64_t i = 0; i < dst_tensor.value().numel(); ++i) { + ASSERT_EQ(dst_data[i], static_cast(i)); + } +} + +TEST(SelectedRows, SparseTable) { + platform::CPUPlace cpu; + SelectedRows table; + + int64_t table_size = 100; + int64_t embedding_width = 8; + // initialize a sparse table + table.mutable_value()->Resize( + framework::make_ddim({table_size, embedding_width})); + auto* data = table.mutable_value()->mutable_data(cpu); + for (int64_t i = 0; i < table_size; ++i) { + for (int64_t j = 0; j < embedding_width; ++j) { + data[i * embedding_width + j] = static_cast(i); + } + } + ASSERT_EQ(table.AutoGrownIndex(10, true, false), 0); + ASSERT_EQ(table.AutoGrownIndex(8, true, false), 1); + ASSERT_EQ(table.AutoGrownIndex(8, true, false), 1); + ASSERT_EQ(table.AutoGrownIndex(6, true, false), 2); + for (int64_t i = 11; i < 20; i++) { + ASSERT_EQ(table.AutoGrownIndex(i, true, true), -1); + ASSERT_TRUE(!table.HasKey(i)); + } + ASSERT_TRUE(table.HasKey(10)); + ASSERT_TRUE(table.HasKey(8)); + ASSERT_TRUE(table.HasKey(6)); + ASSERT_EQ(table.rows().size(), 3UL); + + framework::Tensor ids; + ids.Resize(framework::make_ddim({4})); + auto* ids_data = ids.mutable_data(cpu); + ids_data[0] = static_cast(6); + ids_data[1] = static_cast(6); + ids_data[2] = static_cast(8); + ids_data[3] = static_cast(10); + + framework::Tensor get_value; + auto* value_data = get_value.mutable_data( + framework::make_ddim({4, embedding_width}), cpu); + table.Get(ids, &get_value); + + for (int j = 0; j < embedding_width; ++j) { + ASSERT_EQ(value_data[0 * embedding_width + j], 2); + } + for (int j = 0; j < embedding_width; ++j) { + ASSERT_EQ(value_data[1 * embedding_width + j], 2); + } + for (int j = 0; j < embedding_width; ++j) { + ASSERT_EQ(value_data[2 * embedding_width + j], 1); + } + for (int j = 0; j < embedding_width; ++j) { + ASSERT_EQ(value_data[3 * embedding_width + j], 0); + } +} + +void f1(SelectedRows* table, int table_size) { + for (int i = 1000000; i > 0; --i) { + auto id = i % table_size; + int64_t index1 = table->AutoGrownIndex(id, true); + int64_t index2 = table->AutoGrownIndex(id, false); + int64_t index3 = table->AutoGrownIndex(id, true); + ASSERT_EQ(index1, index2); + ASSERT_EQ(index2, index3); + } +} + +void f2(SelectedRows* table, int table_size) { + for (int i = 0; i < 1000000; ++i) { + auto id = i % table_size; + int64_t index1 = table->AutoGrownIndex(id, true); + int64_t index2 = table->AutoGrownIndex(id, false); + int64_t index3 = table->AutoGrownIndex(id, true); + ASSERT_EQ(index1, index2); + ASSERT_EQ(index2, index3); + } +} + +void f3(SelectedRows* table, int table_size) { + clock_t t1 = clock(); + for (int i = 100000; i > 0; --i) { + auto id1 = table->AutoGrownIndex(i % table_size, true); + auto id2 = table->Index(i % table_size); + ASSERT_EQ(id1, id2); + } + clock_t t2 = clock(); + std::cout << "f3 run time:" << t2 - t1 << std::endl; +} + +void f4(SelectedRows* table, int table_size) { + clock_t t1 = clock(); + for (int i = 0; i < 100000; ++i) { + auto id1 = table->AutoGrownIndex(i % table_size, true); + auto id2 = table->Index(i % table_size); + ASSERT_EQ(id1, id2); + } + clock_t t2 = clock(); + std::cout << "f4 run time:" << t2 - t1 << std::endl; +} + +TEST(SelectedRows, MultiThreadAutoIndex) { + platform::CPUPlace cpu; + SelectedRows table; + + int64_t table_size = 100000; + int64_t embedding_width = 8; + // initialize a sparse table + table.mutable_value()->Resize( + framework::make_ddim({table_size, embedding_width})); + auto* data = table.mutable_value()->mutable_data(cpu); + for (int64_t i = 0; i < table_size; ++i) { + for (int64_t j = 0; j < embedding_width; ++j) { + data[i * embedding_width + j] = static_cast(i); + } + } + + std::thread t1(f1, &table, table_size); + std::thread t11(f1, &table, table_size); + std::thread t2(f2, &table, table_size); + std::thread t22(f2, &table, table_size); + t1.join(); + t11.join(); + t2.join(); + t22.join(); + std::thread t3(f3, &table, table_size); + std::thread t4(f4, &table, table_size); + t3.join(); + t4.join(); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/shape_inference.cc b/paddle/fluid/framework/shape_inference.cc new file mode 100644 index 00000000..4ac872ac --- /dev/null +++ b/paddle/fluid/framework/shape_inference.cc @@ -0,0 +1,46 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/shape_inference.h" +#include +#include +#include +#include "paddle/fluid/framework/grad_op_desc_maker.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { + +std::vector InferShapeContext::GetReaderDims( + const std::string &name) const { + const std::vector &arg_names = Inputs(name); + PADDLE_ENFORCE_EQ( + arg_names.size(), 1UL, + "Reader input '%s' should hold one element, but now it holds %d", name, + arg_names.size()); + return this->GetRepeatedDims(arg_names[0]); +} + +void InferShapeContext::SetReaderDims(const std::string &name, + const std::vector &dims) { + const std::vector &arg_names = Outputs(name); + PADDLE_ENFORCE_EQ( + arg_names.size(), 1UL, + "Reader output '%s' should hold one element, but now it holds %d", name, + arg_names.size()); + return this->SetRepeatedDims(arg_names[0], dims); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/shape_inference.h b/paddle/fluid/framework/shape_inference.h new file mode 100644 index 00000000..e0a84827 --- /dev/null +++ b/paddle/fluid/framework/shape_inference.h @@ -0,0 +1,85 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/framework/variable.h" + +namespace paddle { +namespace framework { + +class OperatorBase; + +using InferShapeVarPtr = boost::variant; + +class InferShapeContext { + public: + virtual ~InferShapeContext() = default; + virtual bool HasInput(const std::string &name) const = 0; + virtual bool HasOutput(const std::string &name) const = 0; + + virtual std::vector GetInputsVarType( + const std::string &name) const = 0; + virtual std::vector GetOutputsVarType( + const std::string &name) const = 0; + + virtual bool HasInputs(const std::string &name) const = 0; + virtual bool HasOutputs(const std::string &name) const = 0; + + virtual DDim GetInputDim(const std::string &name) const = 0; + virtual std::vector GetInputsDim(const std::string &name) const = 0; + virtual std::vector GetReaderDims(const std::string &name) const; + + virtual void SetOutputDim(const std::string &name, const DDim &dim) = 0; + virtual void SetOutputsDim(const std::string &name, + const std::vector &dims) = 0; + virtual void SetReaderDims(const std::string &name, + const std::vector &dims); + + virtual AttrReader Attrs() const = 0; + virtual const std::vector &Inputs( + const std::string &name) const = 0; + virtual const std::vector &Outputs( + const std::string &name) const = 0; + + virtual void ShareDim(const std::string &in, const std::string &out, + size_t i = 0, size_t j = 0) = 0; + + virtual void ShareLoD(const std::string &in, const std::string &out, + size_t i = 0, size_t j = 0) const = 0; + + virtual void DecreaseLoDLevel(const std::string &in, const std::string &out, + size_t i = 0, size_t j = 0) const = 0; + + virtual bool IsRuntime() const = 0; + + virtual std::vector GetInputVarPtrs( + const std::string &name) = 0; + virtual std::vector GetOutputVarPtrs( + const std::string &name) = 0; + + protected: + virtual std::vector GetRepeatedDims(const std::string &name) const = 0; + virtual void SetRepeatedDims(const std::string &name, + const std::vector &dims) = 0; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc new file mode 100644 index 00000000..565b7d9d --- /dev/null +++ b/paddle/fluid/framework/tensor.cc @@ -0,0 +1,113 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/var_type.h" + +namespace paddle { +namespace framework { +extern size_t SizeOfType(proto::VarType::Type type); +void Tensor::check_memory_size() const { + PADDLE_ENFORCE_NOT_NULL( + holder_, "Tensor holds no memory. Call Tensor::mutable_data first."); + PADDLE_ENFORCE_LE( + numel() * SizeOfType(type()), memory_size(), + "Tensor's dims_ is out of bound. Call Tensor::mutable_data " + "first to re-allocate memory.\n" + "or maybe the required data-type mismatches the data already stored."); +} + +Tensor::Tensor(const proto::VarType::Type& dtype) : type_(dtype), offset_(0) {} + +size_t Tensor::memory_size() const { + return holder_ == nullptr ? 0UL : holder_->size() - offset_; +} + +void* Tensor::mutable_data(platform::Place place, proto::VarType::Type type, + size_t requested_size) { + type_ = type; + PADDLE_ENFORCE_GE(numel(), 0, + "When calling this method, the Tensor's numel must be " + "equal or larger than zero. " + "Please check Tensor::Resize has been called first."); + size_t size = numel() * SizeOfType(type); + if (requested_size) { + PADDLE_ENFORCE_GE(requested_size, size); + size = requested_size; + } + /* some versions of boost::variant don't have operator!= */ + if (holder_ == nullptr || !(holder_->place() == place) || + holder_->size() < size + offset_) { + holder_ = memory::AllocShared(place, size); + offset_ = 0; + } + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + offset_); +} + +void* Tensor::mutable_data(platform::Place place, size_t requested_size) { + PADDLE_ENFORCE(this->holder_ != nullptr, + "Cannot invoke mutable data if current hold nothing."); + return mutable_data(place, type_, requested_size); +} + +Tensor& Tensor::ShareDataWith(const Tensor& src) { + src.check_memory_size(); + *this = src; + return *this; +} + +Tensor Tensor::Slice(int64_t begin_idx, int64_t end_idx) const { + check_memory_size(); + PADDLE_ENFORCE_GE(begin_idx, 0, + "The start row index must be greater than 0."); + PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is out of bound."); + PADDLE_ENFORCE_LT( + begin_idx, end_idx, + "The start row index must be lesser than the end row index."); + + if (dims_[0] == 1) { + return *this; + } else { + size_t base = numel() / dims_[0]; + Tensor dst; + dst.holder_ = holder_; + dst.set_layout(layout_); + dst.type_ = type_; + DDim dst_dims = dims_; + dst_dims[0] = end_idx - begin_idx; + dst.Resize(dst_dims); + dst.offset_ = offset_ + begin_idx * base * SizeOfType(type()); + return dst; + } +} + +Tensor& Tensor::Resize(const DDim& dims) { + dims_ = dims; + return *this; +} + +const DDim& Tensor::dims() const { return dims_; } + +int64_t Tensor::numel() const { return product(dims_); } + +void Tensor::ResetHolder(std::shared_ptr holder) { + if (holder_) { + PADDLE_ENFORCE_EQ(numel() * SizeOfType(type()), holder->size()); + } + holder_ = holder; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h new file mode 100644 index 00000000..8fffecfa --- /dev/null +++ b/paddle/fluid/framework/tensor.h @@ -0,0 +1,211 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { + +namespace framework { + +class LoDTensor; + +class Tensor { +#ifdef PADDLE_WITH_MKLDNN + + public: + inline mkldnn::memory::format format() const { return format_; } + + inline void set_format(const mkldnn::memory::format format) { + format_ = format; + } + + protected: + /** + * @brief the detail format of memory block which have layout as kMKLDNN + * + * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C, + * nChw16c, etc. For a MKLDNN memory block, layout will be set as + * DataLayout::kMKLDNN meanwhile detail memory format will be kept in + * this field. + */ + + mkldnn::memory::format format_ = mkldnn::memory::format::format_undef; +#endif + + public: + template + friend struct EigenTensor; + + template + friend struct EigenMatrix; + + template + friend struct EigenVector; + + public: + Tensor() : type_(proto::VarType::FP32), offset_(0) {} + + explicit Tensor(const proto::VarType::Type&); + + /*! Return a pointer to mutable memory block. */ + template + T* data(); + + /*! Return a pointer to constant memory block. */ + template + const T* data() const; + + inline bool IsInitialized() const; + + /** + * @brief Return a pointer to mutable memory block. + * @note If not exist, then allocation. + */ + template + T* mutable_data(platform::Place place, size_t requested_size = 0); + + void* mutable_data(platform::Place place, proto::VarType::Type type, + size_t requested_size = 0); + + void* mutable_data(platform::Place place, size_t requested_size = 0); + + /** + * @brief Return a pointer to mutable memory block. + * + * @param[in] dims The dimensions of the memory block. + * @param[in] place The place of the memory block. + * @param[in] requested_size The size of the block in bytes. + * + * @note If not exist, then allocation. + */ + template + T* mutable_data(DDim dims, platform::Place place, size_t requested_size = 0); + + /*! Return the dimensions of the memory block. */ + const DDim& dims() const; + + /*! Return the numel of the memory block. */ + int64_t numel() const; + + /*! Resize the dimensions of the memory block. */ + Tensor& Resize(const DDim& dims); + + /*! The internal of two tensors share the same memory block. */ + Tensor& ShareDataWith(const Tensor& src); + + /** + * @brief Return a sub-tensor of the given tensor. + * + * @param[in] begin_idx The index of the start row(inclusive) to slice. + * The index number begins from 0. + * @param[in] end_idx The index of the end row(exclusive) to slice. + * The index number begins from 0. + */ + Tensor Slice(int64_t begin_idx, int64_t end_idx) const; + + platform::Place place() const { + PADDLE_ENFORCE_NOT_NULL( + holder_, "Tensor not initialized yet when Tensor::place() is called."); + return holder_->place(); + } + + proto::VarType::Type type() const { + PADDLE_ENFORCE_NOT_NULL( + holder_, "Tensor not initialized yet when Tensor::type() is called."); + return type_; + } + + // memory size returns the holding memory size in byte. + size_t memory_size() const; + + void check_memory_size() const; + + DataLayout layout() const { return layout_; } + + void set_layout(const DataLayout layout) { layout_ = layout; } + + void clear() { + holder_ = nullptr; + offset_ = 0; + } + + void ShareBufferWith(const Tensor& tensor) { + holder_ = tensor.holder_; + offset_ = tensor.offset_; + } + + const std::shared_ptr& Holder() const { return holder_; } + size_t offset() const { return offset_; } + + std::shared_ptr MoveMemoryHolder() { + return std::move(holder_); + } + + void ResetHolder(std::shared_ptr holder); + + private: + /*! holds the memory block if allocated. */ + std::shared_ptr holder_; + proto::VarType::Type type_; + /** + * @brief points to elements dimensions. + * + * @note dims_ do not indicate the memory block size. + */ + + DDim dims_; + + /** + * @brief the layout of memory block, default is NHWC. + * + * @note the memory allocation order, describe how weight/data is stored + * For example, in 4-D Tensor(rank=4), there are three commonly + * used layout. They are + * NCHW, NHWC, CHWN. + * N,C,H,W for respectively the batch size, the number of + * feature maps, the height. + */ + // Fix me: here just change the default layout to kNCHW + // it doesn't fix the real issue, i.e. feeder should set up tensor layout + // according to actual input data + DataLayout layout_ = DataLayout::kNCHW; + + /** + * @brief A PlaceHolder may be shared by more than one tensor. + * + * @note Some of them may be slices of the others. So the offset_ + * is introduced here to indicate the byte offset between + * PlaceHolder::ptr_ and where the tensor data really begins. + */ + size_t offset_; +}; + +} // namespace framework +} // namespace paddle + +#include "paddle/fluid/framework/tensor_impl.h" diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h new file mode 100644 index 00000000..a5c39b7e --- /dev/null +++ b/paddle/fluid/framework/tensor_impl.h @@ -0,0 +1,81 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace framework { +template +inline const T* Tensor::data() const { + check_memory_size(); + bool valid = + std::is_same::value || type_ == DataTypeTrait::DataType(); + PADDLE_ENFORCE( + valid, "Tensor holds the wrong type, it holds %s, but desires to be %s", + DataTypeToString(type_), DataTypeToString(DataTypeTrait::DataType())); + + return reinterpret_cast( + reinterpret_cast(holder_->ptr()) + offset_); +} + +inline bool Tensor::IsInitialized() const { return holder_ != nullptr; } + +template +inline T* Tensor::data() { + check_memory_size(); + bool valid = + std::is_same::value || type_ == DataTypeTrait::DataType(); + PADDLE_ENFORCE( + valid, "Tensor holds the wrong type, it holds %s, but desires to be %s", + DataTypeToString(type_), DataTypeToString(DataTypeTrait::DataType())); + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + offset_); +} + +template +inline T* Tensor::mutable_data(DDim dims, platform::Place place, + size_t requested_size) { + static_assert(std::is_pod::value, "T must be POD"); + Resize(dims); + return mutable_data(place, requested_size); +} + +template +inline T* Tensor::mutable_data(platform::Place place, size_t requested_size) { + static_assert(std::is_pod::value, "T must be POD"); + return reinterpret_cast( + mutable_data(place, DataTypeTrait::DataType(), requested_size)); +} + +inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) { + int rank = src.dims().size(); + PADDLE_ENFORCE_GE( + rank, 2, + "'ReshapeToMatrix()' is only used for flatten high rank " + "tensors to matrixs. Can not be used in reshaping vectors."); + if (rank == 2) { + return src; + } + Tensor res; + res.ShareDataWith(src); + res.Resize(flatten_to_2d(src.dims(), num_col_dims)); + return res; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc new file mode 100644 index 00000000..238af9ba --- /dev/null +++ b/paddle/fluid/framework/tensor_test.cc @@ -0,0 +1,259 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/tensor.h" +#include +#include +#include "paddle/fluid/platform/float16.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; + +TEST(Tensor, Dims) { + framework::Tensor tt; + tt.Resize({2, 3, 4}); + framework::DDim dims = tt.dims(); + ASSERT_EQ(arity(dims), 3); + for (int i = 0; i < 3; ++i) { + EXPECT_EQ(i + 2, dims[i]); + } +} + +TEST(Tensor, DataAssert) { + framework::Tensor src_tensor; + + bool caught = false; + try { + src_tensor.data(); + } catch (platform::EnforceNotMet err) { + caught = true; + std::string msg = + "holder_ should not be null\nTensor holds no memory. Call " + "Tensor::mutable_data first."; + const char* what = err.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(what[i], msg[i]); + } + } + ASSERT_TRUE(caught); +} + +TEST(Tensor, MutableData) { + { + framework::Tensor src_tensor; + float* p1 = nullptr; + float* p2 = nullptr; + // initialization + p1 = src_tensor.mutable_data(framework::make_ddim({1, 2, 3}), + platform::CPUPlace()); + EXPECT_NE(p1, nullptr); + // set src_tensor a new dim with large size + // momery is supposed to be re-allocated + p2 = src_tensor.mutable_data(framework::make_ddim({3, 4}), + platform::CPUPlace()); + EXPECT_NE(p2, nullptr); + EXPECT_NE(p1, p2); + // set src_tensor a new dim with same size + // momery block is supposed to be unchanged + p1 = src_tensor.mutable_data(framework::make_ddim({2, 2, 3}), + platform::CPUPlace()); + EXPECT_EQ(p1, p2); + // set src_tensor a new dim with smaller size + // momery block is supposed to be unchanged + p2 = src_tensor.mutable_data(framework::make_ddim({2, 2}), + platform::CPUPlace()); + EXPECT_EQ(p1, p2); + + float* p3 = nullptr; + float* p4 = nullptr; + // set src_tensor a different type but smaller size. + // memory block is supposed to be unchanged. + auto* tmp = src_tensor.mutable_data(framework::make_ddim({2, 2}), + platform::CPUPlace()); + p3 = reinterpret_cast(tmp); + EXPECT_EQ(p1, p3); + + // set src_tensor a different type but bigger size. + // memory block is supposed to be changed. + auto* tmp2 = src_tensor.mutable_data( + framework::make_ddim({2, 2, 3}), platform::CPUPlace()); + p4 = reinterpret_cast(tmp2); + EXPECT_NE(p1, p4); + } + // Not sure if it's desired, but currently, Tensor type can be changed. + { + framework::Tensor src_tensor; + int8_t* p1 = src_tensor.mutable_data(framework::make_ddim({1}), + platform::CPUPlace()); + EXPECT_NE(p1, nullptr); + *p1 = 1; + + uint8_t* p2 = src_tensor.mutable_data(framework::make_ddim({1}), + platform::CPUPlace()); + EXPECT_NE(p2, nullptr); + EXPECT_EQ(static_cast(p2[0]), 1); + } + +#ifdef PADDLE_WITH_CUDA + { + framework::Tensor src_tensor; + float* p1 = nullptr; + float* p2 = nullptr; + // initialization + p1 = src_tensor.mutable_data(framework::make_ddim({1, 2, 3}), + platform::CUDAPlace()); + EXPECT_NE(p1, nullptr); + // set src_tensor a new dim with large size + // momery is supposed to be re-allocated + p2 = src_tensor.mutable_data(framework::make_ddim({3, 1024}), + platform::CUDAPlace()); + EXPECT_NE(p2, nullptr); + EXPECT_NE(p1, p2); + // set src_tensor a new dim with same size + // momery block is supposed to be unchanged + p1 = src_tensor.mutable_data(framework::make_ddim({2, 2, 3}), + platform::CUDAPlace()); + EXPECT_EQ(p1, p2); + // set src_tensor a new dim with smaller size + // momery block is supposed to be unchanged + p2 = src_tensor.mutable_data(framework::make_ddim({2, 2}), + platform::CUDAPlace()); + EXPECT_EQ(p1, p2); + } +#endif +} + +TEST(Tensor, ShareDataWith) { + { + framework::Tensor src_tensor; + framework::Tensor dst_tensor; + // Try to share data form uninitialized tensor + bool caught = false; + try { + dst_tensor.ShareDataWith(src_tensor); + } catch (paddle::platform::EnforceNotMet err) { + caught = true; + std::string msg = + "holder_ should not be null\nTensor holds no memory. Call " + "Tensor::mutable_data first."; + const char* what = err.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(what[i], msg[i]); + } + } + ASSERT_TRUE(caught); + + src_tensor.mutable_data(framework::make_ddim({2, 3, 4}), + platform::CPUPlace()); + dst_tensor.ShareDataWith(src_tensor); + ASSERT_EQ(src_tensor.data(), dst_tensor.data()); + } + +#ifdef PADDLE_WITH_CUDA + { + framework::Tensor src_tensor; + framework::Tensor dst_tensor; + src_tensor.mutable_data(framework::make_ddim({2, 3, 4}), + platform::CUDAPlace()); + dst_tensor.ShareDataWith(src_tensor); + ASSERT_EQ(src_tensor.data(), dst_tensor.data()); + } +#endif +} + +TEST(Tensor, Slice) { + { + framework::Tensor src_tensor; + src_tensor.mutable_data(framework::make_ddim({5, 3, 4}), + platform::CPUPlace()); + framework::Tensor slice_tensor = src_tensor.Slice(1, 3); + framework::DDim slice_dims = slice_tensor.dims(); + ASSERT_EQ(arity(slice_dims), 3); + EXPECT_EQ(slice_dims[0], 2); + EXPECT_EQ(slice_dims[1], 3); + EXPECT_EQ(slice_dims[2], 4); + + uintptr_t src_data_address = + reinterpret_cast(src_tensor.data()); + uintptr_t src_mutable_data_address = reinterpret_cast( + src_tensor.mutable_data(src_tensor.dims(), platform::CPUPlace())); + uintptr_t slice_data_address = + reinterpret_cast(slice_tensor.data()); + uintptr_t slice_mutable_data_address = + reinterpret_cast(slice_tensor.mutable_data( + slice_tensor.dims(), platform::CPUPlace())); + EXPECT_EQ(src_data_address, src_mutable_data_address); + EXPECT_EQ(slice_data_address, slice_mutable_data_address); + EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address); + } + +#ifdef PADDLE_WITH_CUDA + { + framework::Tensor src_tensor; + src_tensor.mutable_data(framework::make_ddim({6, 9}), + platform::CUDAPlace()); + framework::Tensor slice_tensor = src_tensor.Slice(2, 6); + framework::DDim slice_dims = slice_tensor.dims(); + ASSERT_EQ(arity(slice_dims), 2); + EXPECT_EQ(slice_dims[0], 4); + EXPECT_EQ(slice_dims[1], 9); + + uintptr_t src_data_address = + reinterpret_cast(src_tensor.data()); + uintptr_t src_mutable_data_address = + reinterpret_cast(src_tensor.mutable_data( + src_tensor.dims(), platform::CUDAPlace())); + uintptr_t slice_data_address = + reinterpret_cast(slice_tensor.data()); + uintptr_t slice_mutable_data_address = + reinterpret_cast(slice_tensor.mutable_data( + slice_tensor.dims(), platform::CUDAPlace())); + EXPECT_EQ(src_data_address, src_mutable_data_address); + EXPECT_EQ(slice_data_address, slice_mutable_data_address); + EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address); + } +#endif +} + +TEST(Tensor, ReshapeToMatrix) { + framework::Tensor src; + int* src_ptr = src.mutable_data({2, 3, 4, 9}, platform::CPUPlace()); + for (int i = 0; i < 2 * 3 * 4 * 9; ++i) { + src_ptr[i] = i; + } + framework::Tensor res = framework::ReshapeToMatrix(src, 2); + ASSERT_EQ(res.dims()[0], 2 * 3); + ASSERT_EQ(res.dims()[1], 4 * 9); +} + +TEST(Tensor, Layout) { + framework::Tensor src; + ASSERT_EQ(src.layout(), framework::DataLayout::kNCHW); + src.set_layout(framework::DataLayout::kAnyLayout); + ASSERT_EQ(src.layout(), framework::DataLayout::kAnyLayout); +} + +TEST(Tensor, FP16) { + using platform::float16; + framework::Tensor src; + float16* src_ptr = src.mutable_data({2, 3}, platform::CPUPlace()); + for (int i = 0; i < 2 * 3; ++i) { + src_ptr[i] = static_cast(i); + } + EXPECT_EQ(src.memory_size(), 2 * 3 * sizeof(float16)); + // EXPECT a human readable error message + // src.data(); + // Tensor holds the wrong type, it holds N6paddle8platform7float16E at + // [/paddle/Paddle/paddle/fluid/framework/tensor_impl.h:43] +} diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc new file mode 100644 index 00000000..33ef3b91 --- /dev/null +++ b/paddle/fluid/framework/tensor_util.cc @@ -0,0 +1,541 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ +#include "paddle/fluid/framework/tensor_util.h" +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace framework { + +void TensorCopy(const Tensor& src, const platform::Place& dst_place, + const platform::DeviceContext& ctx, Tensor* dst) { + VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to " + << dst_place; + src.check_memory_size(); + + dst->Resize(src.dims()); + dst->set_layout(src.layout()); + auto src_place = src.place(); + auto src_ptr = src.data(); + + auto dst_ptr = dst->mutable_data(dst_place, src.type()); + + auto size = src.numel() * SizeOfType(src.type()); + + if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data async from " << src_place << " to " + << dst_place; + return; + } + memory::Copy(boost::get(dst_place), dst_ptr, + boost::get(src_place), src_ptr, size); + } +#ifdef PADDLE_WITH_CUDA + else if (platform::is_gpu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + auto src_gpu_place = boost::get(src_place); + auto dst_cpu_place = boost::get(dst_place); + auto ctx_place = ctx.GetPlace(); + PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); + auto ctx_gpu_place = boost::get(ctx_place); + PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place); + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); + } else if (platform::is_cpu_place(src_place) && + platform::is_gpu_place(dst_place)) { + auto src_cpu_place = boost::get(src_place); + auto dst_gpu_place = boost::get(dst_place); + auto ctx_place = ctx.GetPlace(); + PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); + auto ctx_gpu_place = boost::get(ctx_place); + PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place); + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream); + } else if (platform::is_gpu_place(src_place) && + platform::is_gpu_place(dst_place)) { + auto src_gpu_place = boost::get(src_place); + auto dst_gpu_place = boost::get(dst_place); + auto ctx_place = ctx.GetPlace(); + PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); + auto stream = + reinterpret_cast(ctx).stream(); + if (platform::is_same_place(src_place, dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data async from " << src_place << " to " + << dst_place; + return; + } + memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, + stream); + } else { + if (platform::is_same_place(ctx_place, src_place)) { + memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, + stream); + platform::DeviceContextPool::Instance().Get(src.place())->Wait(); + } else if (platform::is_same_place(ctx_place, dst_place)) { + platform::DeviceContextPool::Instance().Get(src.place())->Wait(); + memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, + stream); + } else { + PADDLE_THROW("ctx is not belong to dst_gpu_place or src_gpu_place."); + } + } + } +#endif +} + +void TensorCopy(const Tensor& src, const platform::Place& dst_place, + Tensor* dst) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + const platform::DeviceContext* dev_ctx; + if (platform::is_gpu_place(dst_place)) { + dev_ctx = pool.Get(dst_place); + } else { + dev_ctx = pool.Get(src.place()); + } + TensorCopy(src, dst_place, *dev_ctx, dst); +} + +void TensorCopySync(const Tensor& src, const platform::Place& dst_place, + Tensor* dst) { + VLOG(3) << "TensorCopySync " << src.dims() << " from " << src.place() + << " to " << dst_place; + src.check_memory_size(); + dst->Resize(src.dims()); + dst->set_layout(src.layout()); + auto src_place = src.place(); + auto src_ptr = src.data(); + auto dst_ptr = dst->mutable_data(dst_place, src.type()); + auto size = src.numel() * SizeOfType(src.type()); + if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data from " << src_place << " to " + << dst_place; + return; + } + memory::Copy(boost::get(dst_place), dst_ptr, + boost::get(src_place), src_ptr, size); + } +#ifdef PADDLE_WITH_CUDA + else if (platform::is_gpu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + platform::RecordEvent record_event("TensorCopy:GPU->CPU"); + auto src_gpu_place = boost::get(src_place); + auto dst_cpu_place = boost::get(dst_place); + memory::Copy(dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); + } else if (platform::is_cpu_place(src_place) && + platform::is_gpu_place(dst_place)) { + platform::RecordEvent record_event("TensorCopy:CPU->GPU"); + auto src_cpu_place = boost::get(src_place); + auto dst_gpu_place = boost::get(dst_place); + memory::Copy(dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, nullptr); + } else if (platform::is_gpu_place(src_place) && + platform::is_gpu_place(dst_place)) { + platform::RecordEvent record_event("TensorCopy:GPU->GPU"); + if (src_ptr == dst_ptr && platform::is_same_place(src_place, dst_place)) { + VLOG(3) << "Skip copy the same data from " << src_place << " to " + << dst_place; + return; + } + auto src_gpu_place = boost::get(src_place); + auto dst_gpu_place = boost::get(dst_place); + memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); + } else if (platform::is_cuda_pinned_place(src_place) && + platform::is_gpu_place(dst_place)) { + platform::RecordEvent record_event("TensorCopy:CUDAPinned->GPU"); + auto src_pinned_place = boost::get(src_place); + auto dst_gpu_place = boost::get(dst_place); + memory::Copy(dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size, + nullptr); + } +#endif +} + +template +struct AnyDTypeVisitor { + Predicate predicate_; + const Tensor& tensor_; + const DevCtx& ctx_; + Tensor* out_; + + AnyDTypeVisitor(Predicate predicate, const Tensor& tensor, const DevCtx& ctx, + Tensor* out) + : predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {} + + template + void apply() const { + auto t = EigenVector::Flatten(tensor_); + auto o = EigenScalar::From(*out_); + // return any of predicate_(t) is true. + o.device(*ctx_.eigen_device()) = predicate_(t).any(); + } +}; + +template +inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor, + const DevCtx& ctx, framework::Tensor* out) { + VisitDataType(tensor.type(), AnyDTypeVisitor( + predicate, tensor, ctx, out)); +} + +template +class AnyVisitor : public boost::static_visitor { + private: + const framework::Tensor& tensor_; + Predicate predicate_; + + public: + AnyVisitor(const framework::Tensor& tensor, Predicate predicate) + : tensor_(tensor), predicate_(std::move(predicate)) {} + + template + bool operator()(const Place& place) const { + framework::Tensor out; + out.Resize({1}); + out.mutable_data(place); + auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place); + AnyImpl(predicate_, tensor_, *ctx, &out); + return this->GetResult(out, place); + } + + bool GetResult(const framework::Tensor& out, + const platform::CUDAPlace& gpu) const { + platform::CPUPlace cpu; + framework::Tensor tmp; + tmp.Resize({1}); + tmp.mutable_data(cpu); + auto gpuctx = platform::DeviceContextPool::Instance().Get(gpu); + gpuctx->Wait(); + TensorCopy(out, cpu, *gpuctx, &tmp); + gpuctx->Wait(); + return GetResult(tmp, cpu); + } + + bool GetResult(const framework::Tensor& out, + const platform::CPUPlace& cpu) const { + return *out.data(); + } + + bool GetResult(const framework::Tensor& out, + const platform::CUDAPinnedPlace& cpu) const { + return *out.data(); + } +}; + +template +class AnyOutVisitor : public boost::static_visitor<> { + private: + const framework::Tensor& tensor_; + mutable framework::Tensor* out_; + Predicate predicate_; + + public: + AnyOutVisitor(const framework::Tensor& tensor, Predicate predicate, + framework::Tensor* out) + : tensor_(tensor), out_(out), predicate_(std::move(predicate)) {} + + template + void operator()(const Place& place) const { + auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place); + out_->Resize({1}); + out_->mutable_data(place); + AnyImpl(predicate_, tensor_, *ctx, out_); + } +}; + +template +inline bool Any(const framework::Tensor& tensor, Predicate predicate) { + AnyVisitor visitor(tensor, predicate); + auto place = tensor.place(); + return platform::VisitPlace(place, visitor); +} + +template +inline void Any(const framework::Tensor& tensor, Predicate predicate, + framework::Tensor* out) { + AnyOutVisitor visitor(tensor, predicate, out); + auto place = tensor.place(); + platform::VisitPlace(place, visitor); +} + +struct ContainsNANPredicate { + template + auto operator()(const T& eigen_vec) const + -> decltype(std::declval().isnan()) { + // Cast eigen_vector to vector of bool. true if is inf. + return eigen_vec.isnan(); + } +}; + +bool TensorContainsNAN(const framework::Tensor& tensor) { + ContainsNANPredicate predicate; + return Any(tensor, predicate); +} + +void TensorContainsNAN(const framework::Tensor& tensor, + framework::Tensor* out) { + ContainsNANPredicate predicate; + Any(tensor, predicate, out); +} + +struct ContainsInfPredicate { + template + auto operator()(const T& eigen_vec) const + -> decltype(std::declval().isinf()) { + // Cast eigen_vector to vector of bool. true if is inf. + return eigen_vec.isinf(); + } +}; + +bool TensorContainsInf(const framework::Tensor& tensor) { + ContainsInfPredicate predicate; + return Any(tensor, predicate); +} + +void TensorContainsInf(const framework::Tensor& tensor, + framework::Tensor* out) { + ContainsInfPredicate predicate; + Any(tensor, predicate, out); +} + +// NOTE(dzhwinter): +// Isfinite need a AllVisitor to loop through all the elements. +// We choose two cuda call instead of one allvisitor. The AllVisitor +// should be implemented if the performance hurts. +bool TensorIsfinite(const framework::Tensor& tensor) { + ContainsInfPredicate pred_inf; + ContainsNANPredicate pred_nan; + return !Any(tensor, pred_inf) && !Any(tensor, pred_nan); +} + +#ifdef PADDLE_WITH_CUDA +template +static inline void __global__ BothFalse(const T* cmp, T* out) { + out[0] = (!cmp[0]) && (!out[0]); +} +#endif + +struct BothFalseVisitor : public boost::static_visitor<> { + const framework::Tensor& in_; + mutable framework::Tensor* out_; + BothFalseVisitor(const framework::Tensor& in, framework::Tensor* out) + : in_(in), out_(out) {} + + template + void operator()(const Place& place) const { + VisitorImpl(place); + } + + void VisitorImpl(const platform::CUDAPlace& gpu) const { +#ifdef PADDLE_WITH_CUDA + auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(gpu); + BothFalse<<<1, 1, 0, ctx->stream()>>>(in_.data(), + out_->mutable_data(gpu)); +#endif + } + + void VisitorImpl(const platform::CPUPlace& cpu) const { + bool lhs = !in_.data()[0]; + bool rhs = !out_->mutable_data(cpu)[0]; + out_->mutable_data(cpu)[0] = lhs && rhs; + } + + void VisitorImpl( + const platform::CUDAPinnedPlace& cpu /* equals to cpu*/) const { + bool lhs = !in_.data()[0]; + bool rhs = !out_->mutable_data(cpu)[0]; + out_->mutable_data(cpu)[0] = lhs && rhs; + } +}; + +void TensorIsfinite(const framework::Tensor& tensor, framework::Tensor* out) { + framework::Tensor tmp; + TensorContainsInf(tensor, &tmp); + TensorContainsNAN(tensor, out); + BothFalseVisitor visitor(tmp, out); + auto place = tensor.place(); + platform::VisitPlace(place, visitor); +} + +void TensorToStream(std::ostream& os, const Tensor& tensor, + const platform::DeviceContext& dev_ctx) { + { // the 1st field, uint32_t version + constexpr uint32_t version = 0; + os.write(reinterpret_cast(&version), sizeof(version)); + } + { // the 2nd field, tensor description + // int32_t size + // void* protobuf message + proto::VarType::TensorDesc desc; + desc.set_data_type(tensor.type()); + auto dims = framework::vectorize(tensor.dims()); + auto* pb_dims = desc.mutable_dims(); + pb_dims->Resize(static_cast(dims.size()), 0); + std::copy(dims.begin(), dims.end(), pb_dims->begin()); + int32_t size = desc.ByteSize(); + os.write(reinterpret_cast(&size), sizeof(size)); + auto out = desc.SerializeAsString(); + os.write(out.data(), size); + } + { // the 3rd field, tensor data + uint64_t size = tensor.numel() * framework::SizeOfType(tensor.type()); + + auto* data_ptr = tensor.data(); + PADDLE_ENFORCE(size < std::numeric_limits::max(), + "Index overflow when writing tensor"); + if (platform::is_gpu_place(tensor.place())) { +#ifdef PADDLE_WITH_CUDA + constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB + std::unique_ptr buf(new char[kBufSize]); + auto& gpu_dev_ctx = + static_cast(dev_ctx); + platform::CPUPlace cpu; + uintptr_t data = reinterpret_cast(data_ptr); + while (size != 0) { + size_t size_to_write = std::min(kBufSize, static_cast(size)); + memory::Copy(cpu, buf.get(), + boost::get(tensor.place()), + reinterpret_cast(data), size_to_write, + gpu_dev_ctx.stream()); + gpu_dev_ctx.Wait(); + os.write(buf.get(), size_to_write); + data += size_to_write; + size -= size_to_write; + } +#else + PADDLE_THROW("Unexpected branch"); +#endif + } else { + os.write(static_cast(data_ptr), + static_cast(size)); + } + } +} + +struct DeserializedDataFunctor { + DeserializedDataFunctor(void** buf, Tensor* tensor, + const platform::Place& place) + : buf_(buf), tensor_(tensor), place_(place) {} + + template + void apply() { + *buf_ = tensor_->mutable_data(place_); + } + + void** buf_; + Tensor* tensor_; + platform::Place place_; +}; + +void TensorFromStream(std::istream& is, Tensor* tensor, + const platform::DeviceContext& dev_ctx) { + uint32_t version; + is.read(reinterpret_cast(&version), sizeof(version)); + PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); + proto::VarType::TensorDesc desc; + { // int32_t size + // proto buffer + int32_t size; + is.read(reinterpret_cast(&size), sizeof(size)); + std::unique_ptr buf(new char[size]); + is.read(reinterpret_cast(buf.get()), size); + PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size), + "Cannot parse tensor desc"); + } + { // read tensor + std::vector dims; + dims.reserve(static_cast(desc.dims().size())); + std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims)); + tensor->Resize(framework::make_ddim(dims)); + void* buf; + auto ctx = platform::CPUDeviceContext(); + size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); + if (platform::is_gpu_place(dev_ctx.GetPlace())) { +#ifdef PADDLE_WITH_CUDA + Tensor cpu_tensor; + cpu_tensor.Resize(framework::make_ddim(dims)); + framework::VisitDataType( + desc.data_type(), + DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace())); + is.read(static_cast(buf), size); + auto dst_place = dev_ctx.GetPlace(); + framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor); +#else + PADDLE_THROW("Unexpected branch"); +#endif + } else { + framework::VisitDataType( + desc.data_type(), + DeserializedDataFunctor(&buf, tensor, ctx.GetPlace())); + is.read(static_cast(buf), size); + } + } +} + +template +std::ostream& print_tensor(std::ostream& os, const framework::Tensor& tensor) { + auto inspect = tensor.data(); + auto element_num = tensor.numel(); + + os << "\tdata: ["; + if (element_num > 0) { + os << inspect[0]; + for (int j = 1; j < element_num; ++j) { + os << " " << inspect[j]; + } + } + os << "]"; + return os; +} + +std::ostream& operator<<(std::ostream& os, const Tensor& t) { + os << "\tdim: " << t.dims() << "\n"; + os << "\tlayout: " << DataLayoutToString(t.layout()) << "\n"; + + Tensor tensor; + tensor.Resize(t.dims()); + if (platform::is_cpu_place(t.place())) { + tensor.ShareDataWith(t); + } else { + platform::CPUPlace place; + framework::TensorCopy(t, place, &tensor); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& dev_ctx = *pool.Get(t.place()); + dev_ctx.Wait(); + } + +#define PrintTensorCallback(cpp_type, proto_type) \ + do { \ + if (tensor.type() == proto_type) { \ + os << "\tdtype: " << proto_type << "\n"; \ + print_tensor(os, tensor); \ + return os; \ + } \ + } while (0) + + _ForEachDataType_(PrintTensorCallback); + VLOG(1) << "PrintVar: unrecognized data type:" << t.type(); + return os; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/tensor_util.cu b/paddle/fluid/framework/tensor_util.cu new file mode 120000 index 00000000..edd88c4e --- /dev/null +++ b/paddle/fluid/framework/tensor_util.cu @@ -0,0 +1 @@ +tensor_util.cc \ No newline at end of file diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h new file mode 100644 index 00000000..e382f920 --- /dev/null +++ b/paddle/fluid/framework/tensor_util.h @@ -0,0 +1,157 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/temporary_allocator.h" + +namespace paddle { +namespace framework { + +// NOTE(zcd): Because TensorCopy is an async operation, when the src_place +// and dst_place are two different GPU, to ensure that the operation can +// be carried out correctly, there is a src_ctx wait operation in TensorCopy. +// If ctx_place and src_place are the same, src_ctx.Wait() is added +// after memory::Copy; if ctx_place and dst_place are the same, +// src_ctx.Wait() is added before memory::Copy. +void TensorCopy(const Tensor& src, const platform::Place& dst_place, + const platform::DeviceContext& ctx, Tensor* dst); + +// NOTE(zcd): If the src.place() and dst_place are two different GPU, +// the copy operation is carried out on the dst_place's stream. This is +// very important, because TensorCopy is an async operator, and in most +// case, once this copy operator returns, dst is to be used in dst_place's +// stream, if this copy operation is carried out on the src_place's stream, +// when dst is used in dst_place's stream the copy operation may be +// not completed. +void TensorCopy(const Tensor& src, const platform::Place& dst_place, + Tensor* dst); + +void TensorCopySync(const Tensor& src, const platform::Place& dst_place, + Tensor* dst); + +template +void TensorFromVector(const std::vector& src, + const platform::DeviceContext& ctx, Tensor* dst); +template +void TensorFromVector(const std::vector& src, Tensor* dst); + +template +void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx, + std::vector* dst); +template +void TesnorToVector(const Tensor& src, std::vector* dst); + +// copy the result bool to cpu +bool TensorContainsNAN(const framework::Tensor& tensor); +bool TensorContainsInf(const framework::Tensor& tensor); +bool TensorIsfinite(const framework::Tensor& tensor); + +// store the result bool in gpu tensor, async operation. Faster than above ones. +void TensorContainsNAN(const framework::Tensor& tensor, framework::Tensor* out); +void TensorContainsInf(const framework::Tensor& tensor, framework::Tensor* out); +void TensorIsfinite(const framework::Tensor& tensor, framework::Tensor* out); + +void TensorToStream(std::ostream& os, const Tensor& tensor, + const platform::DeviceContext& dev_ctx); +void TensorFromStream(std::istream& is, Tensor* tensor, + const platform::DeviceContext& dev_ctx); + +// +// The implementation of template functions. +// + +template +void TensorFromVector(const std::vector& src, + const platform::DeviceContext& ctx, Tensor* dst) { + auto dst_place = ctx.GetPlace(); + auto src_ptr = static_cast(src.data()); + platform::CPUPlace src_place; + dst->Resize({static_cast(src.size())}); + auto dst_ptr = static_cast(dst->mutable_data(dst_place)); + auto size = src.size() * sizeof(T); + + if (platform::is_cpu_place(dst_place)) { + memory::Copy(boost::get(dst_place), dst_ptr, src_place, + src_ptr, size); + } +#ifdef PADDLE_WITH_CUDA + else if (platform::is_gpu_place(dst_place)) { // NOLINT + memory::Copy( + boost::get(dst_place), dst_ptr, src_place, src_ptr, + size, + reinterpret_cast(ctx).stream()); + } +#endif +} + +template +void TensorFromVector(const std::vector& src, Tensor* dst) { + platform::CPUPlace dst_place = platform::CPUPlace(); + auto src_ptr = static_cast(src.data()); + platform::CPUPlace src_place; + dst->Resize({static_cast(src.size())}); + auto dst_ptr = static_cast(dst->mutable_data(dst_place)); + auto size = src.size() * sizeof(T); + + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); +} + +template +void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx, + std::vector* dst) { + auto src_ptr = static_cast(src.data()); + auto size = src.numel() * sizeof(T); + + platform::CPUPlace dst_place; + dst->resize(src.numel()); + auto dst_ptr = static_cast(dst->data()); + + if (platform::is_cpu_place(src.place())) { + memory::Copy(dst_place, dst_ptr, + boost::get(src.place()), src_ptr, size); + } +#ifdef PADDLE_WITH_CUDA + else if (platform::is_gpu_place(src.place())) { // NOLINT + memory::Copy( + dst_place, dst_ptr, boost::get(src.place()), + src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif +} + +template +void TensorToVector(const Tensor& src, std::vector* dst) { + auto src_ptr = static_cast(src.data()); + auto size = src.numel() * sizeof(T); + + platform::CPUPlace dst_place; + dst->resize(src.numel()); + auto dst_ptr = static_cast(dst->data()); + + PADDLE_ENFORCE(platform::is_cpu_place(src.place())); + + memory::Copy(dst_place, dst_ptr, boost::get(src.place()), + src_ptr, size); +} + +std::ostream& operator<<(std::ostream& os, const Tensor& t); +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc new file mode 100644 index 00000000..17c55378 --- /dev/null +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -0,0 +1,396 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/tensor_util.h" +#include +#include +#include + +namespace paddle { +namespace framework { + +TEST(TensorCopy, Tensor) { + Tensor src_tensor; + Tensor dst_tensor; + platform::CPUDeviceContext cpu_ctx((platform::CPUPlace())); + + int* src_ptr = + src_tensor.mutable_data(make_ddim({3, 3}), platform::CPUPlace()); + + int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + memcpy(src_ptr, arr, 9 * sizeof(int)); + src_tensor.set_layout(DataLayout::kAnyLayout); + + auto cpu_place = new platform::CPUPlace(); + TensorCopy(src_tensor, *cpu_place, &dst_tensor); + + const int* dst_ptr = dst_tensor.data(); + EXPECT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + TensorCopy(dst_tensor, *cpu_place, &dst_tensor); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); + + Tensor slice_tensor = src_tensor.Slice(1, 2); + TensorCopy(slice_tensor, *cpu_place, &dst_tensor); + const int* slice_ptr = slice_tensor.data(); + dst_ptr = dst_tensor.data(); + EXPECT_NE(dst_ptr, slice_ptr); + for (size_t i = 0; i < 3; ++i) { + EXPECT_EQ(dst_ptr[i], slice_ptr[i]); + } + EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); + +#ifdef PADDLE_WITH_CUDA + { + Tensor src_tensor; + Tensor gpu_tensor; + Tensor dst_tensor; + + int* src_ptr = + src_tensor.mutable_data(make_ddim({3, 3}), platform::CPUPlace()); + + int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + memcpy(src_ptr, arr, 9 * sizeof(int)); + + // CPU Tensor to GPU Tensor + auto gpu_place = new platform::CUDAPlace(0); + platform::CUDADeviceContext gpu_ctx(*gpu_place); + TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor); + + // GPU Tensor to CPU Tensor + auto cpu_place = new platform::CPUPlace(); + TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); + + // Sync before Compare Tensors + gpu_ctx.Wait(); + const int* dst_ptr = dst_tensor.data(); + EXPECT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + // Copy the same tensor + TensorCopy(gpu_tensor, *gpu_place, gpu_ctx, &gpu_tensor); + gpu_ctx.Wait(); + const int* dst_ptr_tmp = dst_tensor.data(); + EXPECT_NE(src_ptr, dst_ptr_tmp); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr_tmp[i]); + } + + Tensor slice_tensor = src_tensor.Slice(1, 2); + + // CPU Slice Tensor to GPU Tensor + TensorCopy(slice_tensor, *gpu_place, gpu_ctx, &gpu_tensor); + + // GPU Tensor to CPU Tensor + TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); + + // Sync before Compare Slice Tensors + gpu_ctx.Wait(); + const int* slice_ptr = slice_tensor.data(); + dst_ptr = dst_tensor.data(); + EXPECT_NE(dst_ptr, slice_ptr); + for (size_t i = 0; i < 3; ++i) { + EXPECT_EQ(dst_ptr[i], slice_ptr[i]); + } + + EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); + } +#endif +} + +TEST(TensorFromVector, Tensor) { + { + std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + paddle::framework::Tensor cpu_tensor; + + // Copy to CPU Tensor + cpu_tensor.Resize(paddle::framework::make_ddim({3, 3})); + auto cpu_place = new paddle::platform::CPUPlace(); + paddle::framework::TensorFromVector(src_vec, &cpu_tensor); + + // Compare Tensors + const int* cpu_ptr = cpu_tensor.data(); + const int* src_ptr = src_vec.data(); + EXPECT_NE(src_ptr, cpu_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], cpu_ptr[i]); + } + + src_vec.erase(src_vec.begin(), src_vec.begin() + 5); + cpu_tensor.Resize(paddle::framework::make_ddim({2, 2})); + paddle::framework::TensorFromVector(src_vec, &cpu_tensor); + cpu_ptr = cpu_tensor.data(); + src_ptr = src_vec.data(); + EXPECT_NE(src_ptr, cpu_ptr); + for (size_t i = 0; i < 5; ++i) { + EXPECT_EQ(src_ptr[i], cpu_ptr[i]); + } + + delete cpu_place; + } + +#ifdef PADDLE_WITH_CUDA + { + std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + paddle::framework::Tensor cpu_tensor; + paddle::framework::Tensor gpu_tensor; + paddle::framework::Tensor dst_tensor; + + // Copy to CPU Tensor + cpu_tensor.Resize(make_ddim({3, 3})); + auto cpu_place = new paddle::platform::CPUPlace(); + paddle::platform::CPUDeviceContext cpu_ctx(*cpu_place); + paddle::framework::TensorFromVector(src_vec, cpu_ctx, &cpu_tensor); + + // Copy to GPUTensor + gpu_tensor.Resize(paddle::framework::make_ddim({3, 3})); + auto gpu_place = new paddle::platform::CUDAPlace(); + paddle::platform::CUDADeviceContext gpu_ctx(*gpu_place); + paddle::framework::TensorFromVector(src_vec, gpu_ctx, &gpu_tensor); + // Copy from GPU to CPU tensor for comparison + paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); + + // Sync before Compare Tensors + gpu_ctx.Wait(); + const int* src_ptr = src_vec.data(); + const int* cpu_ptr = cpu_tensor.data(); + const int* dst_ptr = dst_tensor.data(); + EXPECT_NE(src_ptr, cpu_ptr); + EXPECT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], cpu_ptr[i]); + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + src_vec.erase(src_vec.begin(), src_vec.begin() + 5); + + cpu_tensor.Resize(paddle::framework::make_ddim({2, 2})); + paddle::framework::TensorFromVector(src_vec, cpu_ctx, &cpu_tensor); + gpu_tensor.Resize(paddle::framework::make_ddim({2, 2})); + paddle::framework::TensorFromVector(src_vec, gpu_ctx, &gpu_tensor); + paddle::framework::TensorCopy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor); + + // Sync before Compare Tensors + gpu_ctx.Wait(); + src_ptr = src_vec.data(); + cpu_ptr = cpu_tensor.data(); + dst_ptr = dst_tensor.data(); + EXPECT_NE(src_ptr, cpu_ptr); + EXPECT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 5; ++i) { + EXPECT_EQ(src_ptr[i], cpu_ptr[i]); + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + delete cpu_place; + delete gpu_place; + } +#endif +} + +TEST(TensorToVector, Tensor) { + { + paddle::framework::Tensor src; + int* src_ptr = src.mutable_data({3, 3}, paddle::platform::CPUPlace()); + for (int i = 0; i < 3 * 3; ++i) { + src_ptr[i] = i; + } + + paddle::platform::CPUPlace place; + std::vector dst; + paddle::framework::TensorToVector(src, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_ptr[i], dst[i]); + } + } +#ifdef PADDLE_WITH_CUDA + { + std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + paddle::framework::Tensor gpu_tensor; + paddle::platform::CUDAPlace place; + paddle::platform::CUDADeviceContext gpu_ctx(place); + paddle::framework::TensorFromVector(src_vec, gpu_ctx, &gpu_tensor); + + std::vector dst; + paddle::framework::TensorToVector(gpu_tensor, gpu_ctx, &dst); + + for (int i = 0; i < 3 * 3; ++i) { + EXPECT_EQ(src_vec[i], dst[i]); + } + } +#endif +} + +TEST(TensorContainsNAN, CPU) { + { + paddle::framework::Tensor src; + float* buf = src.mutable_data({3}, paddle::platform::CPUPlace()); + buf[0] = 0.0; + buf[1] = NAN; + buf[2] = 0.0; + EXPECT_TRUE(paddle::framework::TensorContainsNAN(src)); + buf[1] = 0.0; + EXPECT_FALSE(paddle::framework::TensorContainsNAN(src)); + } + + { + paddle::framework::Tensor src; + paddle::platform::float16* buf = + src.mutable_data( + {3}, paddle::platform::CPUPlace()); + buf[0] = 0.0; + buf[1].x = 0x7fff; + buf[2] = 0.0; + EXPECT_TRUE(paddle::framework::TensorContainsNAN(src)); + buf[1] = 0.0; + EXPECT_FALSE(paddle::framework::TensorContainsNAN(src)); + } +} + +TEST(TensorContainsInf, CPU) { + { + paddle::framework::Tensor src; + double* buf = src.mutable_data({3}, paddle::platform::CPUPlace()); + buf[0] = 1.0; + buf[1] = INFINITY; + buf[2] = 0.0; + EXPECT_TRUE(paddle::framework::TensorContainsInf(src)); + buf[1] = 1.0; + EXPECT_FALSE(paddle::framework::TensorContainsInf(src)); + } + + { + paddle::framework::Tensor src; + paddle::platform::float16* buf = + src.mutable_data( + {3}, paddle::platform::CPUPlace()); + buf[0] = 1.0; + buf[1].x = 0x7c00; + buf[2] = 0.0; + EXPECT_TRUE(paddle::framework::TensorContainsInf(src)); + buf[1] = 1.0; + EXPECT_FALSE(paddle::framework::TensorContainsInf(src)); + } +} + +TEST(TensorIsfinite, CPU) { + { + paddle::framework::Tensor src, out; + double* buf = src.mutable_data({3}, paddle::platform::CPUPlace()); + buf[0] = 1.0; + buf[1] = INFINITY; + buf[2] = 0.0; + paddle::framework::TensorIsfinite(src, &out); + EXPECT_EQ(out.data()[0], false); + buf[1] = 1.0; + paddle::framework::TensorIsfinite(src, &out); + EXPECT_EQ(out.data()[0], true); + } + + { + paddle::framework::Tensor src, out; + double* buf = src.mutable_data({3}, paddle::platform::CPUPlace()); + buf[0] = 1.0; + buf[1] = NAN; + buf[2] = 0.0; + paddle::framework::TensorIsfinite(src, &out); + EXPECT_EQ(out.data()[0], false); + buf[1] = 1.0; + paddle::framework::TensorIsfinite(src, &out); + EXPECT_EQ(out.data()[0], true); + } + + { + paddle::framework::Tensor src, out; + paddle::platform::float16* buf = + src.mutable_data( + {3}, paddle::platform::CPUPlace()); + buf[0] = 1.0; + buf[1].x = 0x7c00; + buf[2] = 0.0; + paddle::framework::TensorIsfinite(src, &out); + EXPECT_EQ(out.data()[0], false); + buf[1] = 1.0; + paddle::framework::TensorIsfinite(src, &out); + EXPECT_EQ(out.data()[0], true); + buf[1].x = 0x7fff; + paddle::framework::TensorIsfinite(src, &out); + EXPECT_EQ(out.data()[0], false); + } +} + +TEST(Tensor, FromAndToStream) { + framework::Tensor src_tensor; + int array[6] = {1, 2, 3, 4, 5, 6}; + src_tensor.Resize({2, 3}); + int* src_ptr = src_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < 6; ++i) { + src_ptr[i] = array[i]; + } + { + framework::Tensor dst_tensor; + auto place = new platform::CPUPlace(); + platform::CPUDeviceContext cpu_ctx(*place); + std::ostringstream oss; + TensorToStream(oss, src_tensor, cpu_ctx); + + std::istringstream iss(oss.str()); + TensorFromStream(iss, &dst_tensor, cpu_ctx); + int* dst_ptr = dst_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < 5; ++i) { + EXPECT_EQ(dst_ptr[i], array[i]); + } + EXPECT_EQ(dst_tensor.dims(), src_tensor.dims()); + delete place; + } +#ifdef PADDLE_WITH_CUDA + { + Tensor gpu_tensor; + gpu_tensor.Resize({2, 3}); + Tensor dst_tensor; + + auto gpu_place = new platform::CUDAPlace(); + platform::CUDADeviceContext gpu_ctx(*gpu_place); + + TensorCopy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor); + + std::ostringstream oss; + TensorToStream(oss, gpu_tensor, gpu_ctx); + + std::istringstream iss(oss.str()); + TensorFromStream( + iss, &dst_tensor, + *platform::DeviceContextPool::Instance().Get(platform::CPUPlace())); + + int* dst_ptr = dst_tensor.mutable_data(platform::CPUPlace()); + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(dst_ptr[i], array[i]); + } + delete gpu_place; + } +#endif +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/tensor_util_test.cu b/paddle/fluid/framework/tensor_util_test.cu new file mode 100644 index 00000000..a51f7419 --- /dev/null +++ b/paddle/fluid/framework/tensor_util_test.cu @@ -0,0 +1,260 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace framework { + +static __global__ void FillNAN(float* buf) { + buf[0] = 0.0; + buf[1] = 0.1; + buf[2] = NAN; +} + +static __global__ void FillInf(float* buf) { + buf[0] = INFINITY; + buf[1] = 0.1; + buf[2] = 0.2; +} + +static __global__ void FillNAN(platform::float16* buf) { + buf[0] = 0.0; + buf[1] = 0.1; + buf[2].x = 0x7fff; +} + +static __global__ void FillInf(platform::float16* buf) { + buf[0] = 0.0; + buf[1].x = 0x7c00; + buf[2] = 0.5; +} + +static __global__ void FillFinite(float* buf) { + buf[0] = 0.0; + buf[1] = 0.1; + buf[2] = 0.2; +} + +static __global__ void FillFinite(platform::float16* buf) { + buf[0] = 0.0; + buf[1] = 0.1; + buf[2] = 0.2; +} + +TEST(TensorContainsNAN, GPU) { + paddle::platform::CUDAPlace gpu(0); + auto& pool = paddle::platform::DeviceContextPool::Instance(); + auto* cuda_ctx = pool.GetByPlace(gpu); + { + Tensor tensor; + float* buf = tensor.mutable_data({3}, gpu); + FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + ASSERT_TRUE(TensorContainsNAN(tensor)); + } + { + Tensor tensor; + paddle::platform::float16* buf = + tensor.mutable_data({3}, gpu); + FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + ASSERT_TRUE(TensorContainsNAN(tensor)); + } +} + +TEST(TensorContainsInf, GPU) { + paddle::platform::CUDAPlace gpu(0); + auto& pool = paddle::platform::DeviceContextPool::Instance(); + auto* cuda_ctx = pool.GetByPlace(gpu); + { + Tensor tensor; + float* buf = tensor.mutable_data({3}, gpu); + FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + ASSERT_TRUE(TensorContainsInf(tensor)); + } + { + Tensor tensor; + paddle::platform::float16* buf = + tensor.mutable_data({3}, gpu); + FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + ASSERT_TRUE(TensorContainsInf(tensor)); + } +} + +TEST(TensorIsfinite, GPU) { + paddle::platform::CUDAPlace gpu(0); + using paddle::platform::float16; + auto& pool = paddle::platform::DeviceContextPool::Instance(); + auto* cuda_ctx = pool.GetByPlace(gpu); + // contains inf + { + Tensor tensor; + float* buf = tensor.mutable_data({3}, gpu); + FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + EXPECT_TRUE(!TensorIsfinite(tensor)); + } + { + Tensor tensor; + float16* buf = tensor.mutable_data({3}, gpu); + FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + EXPECT_TRUE(!TensorIsfinite(tensor)); + } + + // contains nan + { + Tensor tensor; + float* buf = tensor.mutable_data({3}, gpu); + FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + EXPECT_TRUE(!TensorIsfinite(tensor)); + } + { + Tensor tensor; + float16* buf = tensor.mutable_data({3}, gpu); + FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + EXPECT_TRUE(!TensorIsfinite(tensor)); + } + + // all element are finite + { + Tensor tensor; + float* buf = tensor.mutable_data({3}, gpu); + FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + EXPECT_TRUE(TensorIsfinite(tensor)); + } + { + Tensor tensor; + float16* buf = tensor.mutable_data({3}, gpu); + FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + EXPECT_TRUE(TensorIsfinite(tensor)); + } +} + +TEST(TensorContainsInf, GPUWithoutWait) { + paddle::platform::CUDAPlace gpu(0); + auto& pool = paddle::platform::DeviceContextPool::Instance(); + auto* cuda_ctx = pool.GetByPlace(gpu); + { + Tensor tensor, out; + float* buf = tensor.mutable_data({3}, gpu); + FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + TensorContainsInf(tensor, &out); + platform::CPUPlace cpu; + Tensor tmp; + TensorCopy(out, cpu, *cuda_ctx, &tmp); + cuda_ctx->Wait(); + ASSERT_EQ(tmp.data()[0], true); + } + { + Tensor tensor, out; + paddle::platform::float16* buf = + tensor.mutable_data({3}, gpu); + FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + TensorContainsInf(tensor, &out); + platform::CPUPlace cpu; + Tensor tmp; + TensorCopy(out, cpu, *cuda_ctx, &tmp); + cuda_ctx->Wait(); + ASSERT_EQ(tmp.data()[0], true); + } +} + +TEST(TensorContainsNAN, GPUWithoutWait) { + paddle::platform::CUDAPlace gpu(0); + auto& pool = paddle::platform::DeviceContextPool::Instance(); + auto* cuda_ctx = pool.GetByPlace(gpu); + { + Tensor tensor, out; + float* buf = tensor.mutable_data({3}, gpu); + FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + TensorContainsNAN(tensor, &out); + platform::CPUPlace cpu; + Tensor tmp; + TensorCopy(out, cpu, *cuda_ctx, &tmp); + cuda_ctx->Wait(); + ASSERT_EQ(tmp.data()[0], true); + } + { + Tensor tensor, out; + paddle::platform::float16* buf = + tensor.mutable_data({3}, gpu); + FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + TensorContainsNAN(tensor, &out); + platform::CPUPlace cpu; + Tensor tmp; + TensorCopy(out, cpu, *cuda_ctx, &tmp); + cuda_ctx->Wait(); + ASSERT_EQ(tmp.data()[0], true); + } +} + +TEST(TensorIsfinite, GPUWithoutWait) { + paddle::platform::CUDAPlace gpu(0); + auto& pool = paddle::platform::DeviceContextPool::Instance(); + auto* cuda_ctx = pool.GetByPlace(gpu); + { + Tensor tensor, out; + float* buf = tensor.mutable_data({3}, gpu); + FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + TensorIsfinite(tensor, &out); + platform::CPUPlace cpu; + Tensor tmp; + TensorCopy(out, cpu, *cuda_ctx, &tmp); + cuda_ctx->Wait(); + EXPECT_EQ(tmp.data()[0], false); + } + { + Tensor tensor, out; + float* buf = tensor.mutable_data({3}, gpu); + FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + TensorIsfinite(tensor, &out); + platform::CPUPlace cpu; + Tensor tmp; + TensorCopy(out, cpu, *cuda_ctx, &tmp); + cuda_ctx->Wait(); + EXPECT_EQ(tmp.data()[0], false); + } + { + Tensor tensor, out; + float* buf = tensor.mutable_data({3}, gpu); + FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + TensorIsfinite(tensor, &out); + platform::CPUPlace cpu; + Tensor tmp; + TensorCopy(out, cpu, *cuda_ctx, &tmp); + cuda_ctx->Wait(); + EXPECT_EQ(tmp.data()[0], true); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc new file mode 100644 index 00000000..d34f826c --- /dev/null +++ b/paddle/fluid/framework/threadpool.cc @@ -0,0 +1,113 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/fluid/framework/threadpool.h" + +#include "gflags/gflags.h" +#include "paddle/fluid/platform/enforce.h" + +DEFINE_int32(io_threadpool_size, 100, + "number of threads used for doing IO, default 100"); + +DEFINE_int32(dist_threadpool_size, 0, + "number of threads used for distributed executed."); + +namespace paddle { +namespace framework { +std::unique_ptr ThreadPool::threadpool_(nullptr); +std::once_flag ThreadPool::init_flag_; + +ThreadPool* ThreadPool::GetInstance() { + std::call_once(init_flag_, &ThreadPool::Init); + return threadpool_.get(); +} + +void ThreadPool::Init() { + if (threadpool_.get() == nullptr) { + // TODO(Yancey1989): specify the max threads number + int num_threads = std::thread::hardware_concurrency(); + if (FLAGS_dist_threadpool_size > 0) { + num_threads = FLAGS_dist_threadpool_size; + VLOG(1) << "set dist_threadpool_size to " << num_threads; + } + PADDLE_ENFORCE_GT(num_threads, 0); + threadpool_.reset(new ThreadPool(num_threads)); + } +} + +ThreadPool::ThreadPool(int num_threads) : running_(true) { + threads_.resize(num_threads); + for (auto& thread : threads_) { + // TODO(Yancey1989): binding the thread on the specify CPU number + thread.reset(new std::thread(std::bind(&ThreadPool::TaskLoop, this))); + } +} + +ThreadPool::~ThreadPool() { + { + // notify all threads to stop running + std::unique_lock l(mutex_); + running_ = false; + } + scheduled_.notify_all(); + + for (auto& t : threads_) { + t->join(); + t.reset(nullptr); + } +} + +void ThreadPool::TaskLoop() { + while (true) { + Task task; + + { + std::unique_lock lock(mutex_); + scheduled_.wait( + lock, [this] { return !this->tasks_.empty() || !this->running_; }); + + if (!running_ && tasks_.empty()) { + return; + } + + if (tasks_.empty()) { + PADDLE_THROW("This thread has no task to Run"); + } + + // pop a task from the task queue + task = std::move(tasks_.front()); + tasks_.pop(); + } + // run the task + task(); + } +} + +std::unique_ptr ThreadPoolIO::io_threadpool_(nullptr); +std::once_flag ThreadPoolIO::io_init_flag_; + +ThreadPool* ThreadPoolIO::GetInstanceIO() { + std::call_once(io_init_flag_, &ThreadPoolIO::InitIO); + return io_threadpool_.get(); +} + +void ThreadPoolIO::InitIO() { + if (io_threadpool_.get() == nullptr) { + // TODO(typhoonzero1986): make this configurable + io_threadpool_.reset(new ThreadPool(FLAGS_io_threadpool_size)); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h new file mode 100644 index 00000000..7a51d18f --- /dev/null +++ b/paddle/fluid/framework/threadpool.h @@ -0,0 +1,144 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include // NOLINT +#include +#include // NOLINT +#include // NOLINT +#include +#include // NOLINT +#include +#include "glog/logging.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN + +namespace paddle { +namespace framework { + +struct ExceptionHandler { + mutable std::future> future_; + explicit ExceptionHandler( + std::future>&& f) + : future_(std::move(f)) {} + void operator()() const { + auto ex = this->future_.get(); + if (ex != nullptr) { + LOG(FATAL) << "The exception is thrown inside the thread pool. You " + "should use RunAndGetException to handle the exception.\n" + "The default exception handler is LOG(FATAL)." + << ex->what(); + } + } +}; + +// ThreadPool maintains a queue of tasks, and runs them using a fixed +// number of threads. +class ThreadPool { + public: + explicit ThreadPool(int num_threads); + + using Task = std::packaged_task()>; + + // Returns the singleton of ThreadPool. + static ThreadPool* GetInstance(); + + ~ThreadPool(); + + // Run pushes a function to the task queue and returns a std::future + // object. To wait for the completion of the task, call + // std::future::wait(). + template + std::future Run(Callback fn) { + auto f = this->RunAndGetException(fn); + return std::async(std::launch::deferred, ExceptionHandler(std::move(f))); + } + + template + std::future> RunAndGetException( + Callback fn) { + Task task([fn]() -> std::unique_ptr { + try { + fn(); + } catch (platform::EnforceNotMet ex) { + return std::unique_ptr( + new platform::EnforceNotMet(ex)); + } catch (const std::exception& e) { + LOG(FATAL) << "Unexpected exception is catched in thread pool. All " + "throwable exception in Fluid should be an EnforceNotMet." + << e.what(); + } + return nullptr; + }); + std::future> f = task.get_future(); + { + std::unique_lock lock(mutex_); + if (!running_) { + PADDLE_THROW("enqueue on stopped ThreadPool"); + } + tasks_.push(std::move(task)); + } + scheduled_.notify_one(); + return f; + } + + private: + DISABLE_COPY_AND_ASSIGN(ThreadPool); + + // The constructor starts threads to run TaskLoop, which retrieves + // and runs tasks from the queue. + void TaskLoop(); + + // Init is called by GetInstance. + static void Init(); + + private: + static std::unique_ptr threadpool_; + static std::once_flag init_flag_; + + std::vector> threads_; + + std::queue tasks_; + std::mutex mutex_; + bool running_; + std::condition_variable scheduled_; +}; + +class ThreadPoolIO : ThreadPool { + public: + static ThreadPool* GetInstanceIO(); + static void InitIO(); + + private: + // NOTE: threadpool in base will be inhereted here. + static std::unique_ptr io_threadpool_; + static std::once_flag io_init_flag_; +}; + +// Run a function asynchronously. +// NOTE: The function must return void. If the function need to return a value, +// you can use lambda to capture a value pointer. +template +std::future Async(Callback callback) { + return ThreadPool::GetInstance()->Run(callback); +} + +template +std::future AsyncIO(Callback callback) { + return ThreadPoolIO::GetInstanceIO()->Run(callback); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/threadpool_test.cc b/paddle/fluid/framework/threadpool_test.cc new file mode 100644 index 00000000..884d61e2 --- /dev/null +++ b/paddle/fluid/framework/threadpool_test.cc @@ -0,0 +1,61 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/fluid/framework/threadpool.h" + +namespace framework = paddle::framework; + +void do_sum(std::vector>* fs, std::mutex* mu, + std::atomic* sum, int cnt) { + for (int i = 0; i < cnt; ++i) { + std::lock_guard l(*mu); + fs->push_back(framework::Async([sum]() { sum->fetch_add(1); })); + } +} + +TEST(ThreadPool, ConcurrentInit) { + framework::ThreadPool* pool; + int n = 50; + std::vector threads; + for (int i = 0; i < n; ++i) { + std::thread t([&pool]() { pool = framework::ThreadPool::GetInstance(); }); + threads.push_back(std::move(t)); + } + for (auto& t : threads) { + t.join(); + } +} + +TEST(ThreadPool, ConcurrentRun) { + std::atomic sum(0); + std::vector threads; + std::vector> fs; + std::mutex fs_mu; + int n = 50; + // sum = (n * (n + 1)) / 2 + for (int i = 1; i <= n; ++i) { + std::thread t(do_sum, &fs, &fs_mu, &sum, i); + threads.push_back(std::move(t)); + } + for (auto& t : threads) { + t.join(); + } + for (auto& t : fs) { + t.wait(); + } + EXPECT_EQ(sum, ((n + 1) * n) / 2); +} diff --git a/paddle/fluid/framework/trainer.cc b/paddle/fluid/framework/trainer.cc new file mode 100644 index 00000000..644bd33a --- /dev/null +++ b/paddle/fluid/framework/trainer.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/trainer.h" + +namespace paddle { +namespace framework { + +void TrainerBase::SetScope(Scope* root_scope) { root_scope_ = root_scope; } + +} // end namespace framework +} // end namespace paddle diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h new file mode 100644 index 00000000..5fe296ff --- /dev/null +++ b/paddle/fluid/framework/trainer.h @@ -0,0 +1,148 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include // NOLINT +#include +#include // NOLINT +#include + +#include "paddle/fluid/framework/data_feed.h" +#include "paddle/fluid/framework/data_set.h" +#include "paddle/fluid/framework/device_worker.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/trainer_desc.pb.h" +#include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/operators/reader/blocking_queue.h" +#include "paddle/fluid/platform/port.h" + +namespace paddle { +namespace framework { + +class TrainerBase { + public: + TrainerBase() {} + virtual ~TrainerBase() {} + // model memory are hosted in root_scope + void SetScope(Scope* root_scope); + void SetDebug(const bool debug) { debug_ = debug; } + void SetDataset(Dataset* dataset_ptr) { dataset_ptr_ = dataset_ptr; } + virtual void Initialize(const TrainerDesc& trainer_desc, + Dataset* data_set) = 0; + virtual void InitTrainerEnv(const ProgramDesc& main_program, + const platform::Place& place) = 0; + virtual void InitOtherEnv(const ProgramDesc& main_program) = 0; + virtual void Run() = 0; + virtual void Finalize() = 0; + + protected: + Scope* root_scope_; + bool debug_; + Dataset* dataset_ptr_; +}; + +// general trainer for async execution +// local trainer and distributed trainer are supported +// depends on the assigned device_worker +class MultiTrainer : public TrainerBase { + public: + MultiTrainer() {} + virtual ~MultiTrainer() {} + virtual void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set); + virtual void InitTrainerEnv(const ProgramDesc& main_program, + const platform::Place& place); + virtual void InitOtherEnv(const ProgramDesc& main_program) {} + virtual void Run(); + virtual void Finalize(); + + protected: + int thread_num_; + std::vector threads_; + std::vector readers_; + std::vector> workers_; +}; + +class DistMultiTrainer : public MultiTrainer { + public: + DistMultiTrainer() {} + virtual ~DistMultiTrainer() {} + virtual void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set); + virtual void InitOtherEnv(const ProgramDesc& main_program); + virtual void Run(); + virtual void Finalize(); + + protected: + std::shared_ptr pull_dense_worker_; +}; + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +class PipelineTrainer : public TrainerBase { + public: + PipelineTrainer() {} + ~PipelineTrainer() override {} + void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set) override; + void InitTrainerEnv(const ProgramDesc& main_program, + const platform::Place& place) override; + void InitOtherEnv(const ProgramDesc& main_program) override {} + void Run() override; + void Finalize() override; + + protected: + int section_num_; + int pipeline_num_; + int scope_queue_size_; + int sync_steps_; + + SectionWorkerParameter pipeline_config_; + + // The in/output var names for each section + std::vector>> in_var_names_; + std::vector>> out_var_names_; + + // Counter for the running thread + std::vector> worker_count_; + std::vector>> worker_count_mutex_; + + // worker: [section_id][pipeline_id][thread_id] + std::vector>>> + workers_; + std::vector section_threads_; + + // We use scope to maintain context info, and scopes + // will be deliverd between different sections. + std::vector>> scope_queues_; + std::vector pipeline_scopes_; + + // The parameters that should be syncronized between different cards using + // nccl all-reduce + std::shared_ptr> param_need_sync_; + std::vector> sync_functors_; + std::shared_ptr nccl_ctx_map_; + + std::vector readers_; + + void InitFirstScopeQueue(ScopeQueue* scope_queue, int pipeline_id, + const ProgramDesc& main_program); + void CopyParameters(const Scope& root_scope, int pipeline_id); + void construct_sync_functor(); +}; +#endif +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/trainer_desc.proto b/paddle/fluid/framework/trainer_desc.proto new file mode 100644 index 00000000..4910fb74 --- /dev/null +++ b/paddle/fluid/framework/trainer_desc.proto @@ -0,0 +1,120 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +syntax = "proto2"; +option optimize_for = LITE_RUNTIME; +import "data_feed.proto"; +import "framework.proto"; +package paddle.framework; + +message TrainerDesc { + // class name for create trainer desc + // the matchness of trainer name and device worker name + // will be checked in python API + optional string class_name = 1; + // class name for creating device worker + optional string device_worker_name = 2; + // thread number + optional int32 thread_num = 3; + // if we need to binding cpu + optional bool binding_cpu = 4 [ default = false ]; + repeated string filelist = 5; + optional bool debug = 6 [ default = false ]; + optional FetchConfig fetch_config = 7; + optional bool use_cvm = 8 [ default = false ]; + + // device worker parameters + optional HogwildWorkerParameter hogwild_param = 101; + optional DownpourWorkerParameter downpour_param = 103; + optional PullDenseWorkerParameter pull_dense_param = 102; + optional SectionWorkerParameter section_param = 104; + // datafeed desc + optional DataFeedDesc data_desc = 201; +} + +message HogwildWorkerParameter { repeated string skip_ops = 1; } + +message DownpourWorkerParameter { + repeated TableParameter sparse_table = 1; + repeated TableParameter dense_table = 2; + repeated string skip_ops = 3; + repeated ProgramConfig program_config = 4; + optional bool push_sparse = 5 [ default = true ]; + optional bool push_dense = 6 [ default = true ]; +} + +message SectionWorkerParameter { + repeated SectionConfig section_config = 1; + optional int32 queue_size = 2 [ default = 1 ]; + optional int64 sync_steps = 3 [ default = 1 ]; + optional int32 start_cpu_core_id = 4 [ default = 1 ]; + repeated string param_need_sync = 5; +} + +message SectionConfig { + enum Place { + CPUPlace = 0; + CUDAPlace = 1; + CUDAPinnedPlace = 2; + } + + // FIXME: How to use proto::ProgramDesc + // required string program_desc_str = 1; + optional proto.ProgramDesc program_desc = 1; + optional Place place = 2; + optional int32 concurrency = 3 [ default = 1 ]; + repeated string section_in_var_names = 4; + repeated string section_out_var_names = 5; +} + +message FetchConfig { + enum Method { PRINT = 0; } + repeated string fetch_var_names = 1; + repeated string fetch_var_str_format = 2; + optional int32 print_period = 3 [ default = 100 ]; + optional Method method = 4 [ default = PRINT ]; +} + +message ProgramConfig { + required string program_id = 1; + repeated int32 push_sparse_table_id = 2; + repeated int32 push_dense_table_id = 3; + repeated int32 pull_sparse_table_id = 4; + repeated int32 pull_dense_table_id = 5; +} + +message PullDenseWorkerParameter { + // dense table only and specialized usage + optional int32 threshold = 1 [ default = 1 ]; + optional int32 device_num = 2; + optional int32 sleep_time_ms = 3 [ default = 2 ]; + repeated TableParameter dense_table = 4; +} + +message TableParameter { + // dense table only + optional uint64 table_id = 1; + repeated string dense_value_name = 2; + repeated string dense_grad_name = 3; + repeated int32 push_dense_wait_times = 5; + // sparse table only + repeated string sparse_key_name = 6; + repeated string sparse_value_name = 7; + repeated string sparse_grad_name = 8; + repeated int32 push_sparse_wait_times = 9; + // sparse table only and specialized usage + optional int32 emb_dim = 10; + optional int32 fea_dim = 11; + optional string label_var_name = 12; +} diff --git a/paddle/fluid/framework/trainer_factory.cc b/paddle/fluid/framework/trainer_factory.cc new file mode 100644 index 00000000..ce0eb5ec --- /dev/null +++ b/paddle/fluid/framework/trainer_factory.cc @@ -0,0 +1,70 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/trainer_factory.h" +#include +#include +#include + +#include "paddle/fluid/framework/trainer.h" + +namespace paddle { +namespace framework { + +typedef std::shared_ptr (*CreatetrainerFunction)(); +typedef std::unordered_map trainerMap; +trainerMap g_trainer_map; + +#define REGISTER_TRAINER_CLASS(trainer_class) \ + namespace { \ + std::shared_ptr Creator_##trainer_class() { \ + return std::shared_ptr(new trainer_class); \ + } \ + class __Registerer_##trainer_class { \ + public: \ + __Registerer_##trainer_class() { \ + g_trainer_map[#trainer_class] = &Creator_##trainer_class; \ + } \ + }; \ + __Registerer_##trainer_class g_registerer_##trainer_class; \ + } // namespace + +std::string TrainerFactory::TrainerTypeList() { + std::string trainer_types; + for (auto iter = g_trainer_map.begin(); iter != g_trainer_map.end(); ++iter) { + if (iter != g_trainer_map.begin()) { + trainer_types += ", "; + } + trainer_types += iter->first; + } + return trainer_types; +} + +std::shared_ptr TrainerFactory::CreateTrainer( + std::string trainer_class) { + if (g_trainer_map.count(trainer_class) < 1) { + LOG(WARNING) << "Trainer class: " << trainer_class << " not defined"; + LOG(WARNING) << TrainerTypeList(); + exit(-1); + } + return g_trainer_map[trainer_class](); +} + +REGISTER_TRAINER_CLASS(MultiTrainer); +REGISTER_TRAINER_CLASS(DistMultiTrainer); +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +REGISTER_TRAINER_CLASS(PipelineTrainer); +#endif +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/trainer_factory.h b/paddle/fluid/framework/trainer_factory.h new file mode 100644 index 00000000..9c772a4f --- /dev/null +++ b/paddle/fluid/framework/trainer_factory.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/trainer.h" + +namespace paddle { +namespace framework { + +class TrainerFactory { + public: + static std::string TrainerTypeList(); + static std::shared_ptr CreateTrainer(std::string trainer_class); +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/trainer_test.cc b/paddle/fluid/framework/trainer_test.cc new file mode 100644 index 00000000..f689679d --- /dev/null +++ b/paddle/fluid/framework/trainer_test.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/trainer.h" +#include + +namespace paddle { +namespace framework { +TEST() { + // create multi trainer + // create hogwild device worker + // create dataset + // train for a while +} +} +} diff --git a/paddle/fluid/framework/transfer_scope_cache.cc b/paddle/fluid/framework/transfer_scope_cache.cc new file mode 100644 index 00000000..e1326f88 --- /dev/null +++ b/paddle/fluid/framework/transfer_scope_cache.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/transfer_scope_cache.h" + +namespace paddle { +namespace framework { + +#ifdef PADDLE_WITH_MKLDNN +using transfer_data_cache_map = std::unordered_map; +using transfer_scope_cache_map = std::unordered_set; +static std::unordered_map + static_transfer_data_caches; +static std::unordered_map + static_transfer_scope_caches; +#endif + +std::unordered_map& global_transfer_data_cache() { +#ifdef PADDLE_WITH_MKLDNN + size_t sid = platform::get_cur_mkldnn_session_id(); + + // if there is specific mkldnn tid setting from user. + if (sid != platform::kMKLDNNSessionID_Default) { + sid = std::hash()(std::this_thread::get_id()); + + static std::mutex acquire_barrier; + std::lock_guard block_until_finish_this_job(acquire_barrier); + + auto map_it = static_transfer_data_caches.find(sid); + if (map_it == static_transfer_data_caches.end()) { + auto* x = new transfer_data_cache_map; + static_transfer_data_caches[sid] = x; + return *x; + } else { + return *static_transfer_data_caches[sid]; + } + } +#endif + thread_local auto* x = new std::unordered_map; + return *x; +} + +std::unordered_set& global_transfer_scope_cache() { +#ifdef PADDLE_WITH_MKLDNN + size_t sid = platform::get_cur_mkldnn_session_id(); + + // if there is specific mkldnn session id setting from user. + if (sid != platform::kMKLDNNSessionID_Default) { + sid = std::hash()(std::this_thread::get_id()); + + static std::mutex acquire_barrier; + std::lock_guard block_until_finish_this_job(acquire_barrier); + + auto map_it = static_transfer_scope_caches.find(sid); + if (map_it == static_transfer_scope_caches.end()) { + auto* x = new transfer_scope_cache_map; + static_transfer_scope_caches[sid] = x; + return *x; + } else { + return *static_transfer_scope_caches[sid]; + } + } +#endif + thread_local auto* x = new std::unordered_set; + return *x; +} + +Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1, + const Scope* scope) { + Scope* new_scope{nullptr}; + size_t infer_cache_key = + CombineHash(OpKernelType::Hash()(type0), OpKernelType::Hash()(type1)); + infer_cache_key = + CombineHash(infer_cache_key, std::hash()(scope)); + + auto it = global_transfer_data_cache().find(infer_cache_key); + if (it != global_transfer_data_cache().end()) { + new_scope = global_transfer_data_cache()[infer_cache_key]; + } else { + new_scope = &scope->NewScope(); + global_transfer_data_cache()[infer_cache_key] = new_scope; + } + global_transfer_scope_cache().insert(new_scope); + return new_scope; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/transfer_scope_cache.h b/paddle/fluid/framework/transfer_scope_cache.h new file mode 100644 index 00000000..9a5d4526 --- /dev/null +++ b/paddle/fluid/framework/transfer_scope_cache.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include // NOLINT +#include +#include +#include "paddle/fluid/framework/op_kernel_type.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { + +std::unordered_map& global_transfer_data_cache(); + +std::unordered_set& global_transfer_scope_cache(); + +// Combine two hash values to a single hash. +static size_t CombineHash(size_t seed, size_t a) { + return (seed ^ a) + 0x9e3779b9 + (seed << 6) + (seed >> 2); +} + +Scope* TryCreateTransferScope(OpKernelType type0, OpKernelType type1, + const Scope* scope); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/tuple.h b/paddle/fluid/framework/tuple.h new file mode 100644 index 00000000..508ee931 --- /dev/null +++ b/paddle/fluid/framework/tuple.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/variant.h" + +namespace paddle { +namespace framework { + +typedef boost::variant + ElementVar; + +class Tuple { + public: + using ElementVars = std::vector; + + Tuple(const std::vector& var, + const std::vector& var_desc) + : var_(var), var_desc_(var_desc) {} + explicit Tuple(std::vector& var) : var_(var) {} + + ElementVar get(int idx) const { return var_[idx]; } + + ElementVar& get(int idx) { return var_[idx]; } + + bool isSameType(const Tuple& t) const; + + size_t getSize() const { return var_.size(); } + + private: + ElementVars var_; + std::vector var_desc_; +}; + +bool Tuple::isSameType(const Tuple& t) const { + size_t tuple_size = getSize(); + if (tuple_size != t.getSize()) { + return false; + } + for (size_t j = 0; j < tuple_size; ++j) { + auto type1 = get(j).which(); + auto type2 = t.get(j).which(); + if (type1 != type2) return false; + } + return true; +} + +Tuple* make_tuple(std::vector tuple) { return new Tuple(tuple); } + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/tuple_test.cc b/paddle/fluid/framework/tuple_test.cc new file mode 100644 index 00000000..810900f1 --- /dev/null +++ b/paddle/fluid/framework/tuple_test.cc @@ -0,0 +1,65 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/tuple.h" + +TEST(Tuple, Make) { + std::vector element_type; + element_type.push_back(12); + element_type.push_back(12.0f); + element_type.push_back("ElementVar"); + + paddle::framework::Tuple* tuple = paddle::framework::make_tuple(element_type); + + EXPECT_EQ(boost::get(tuple->get(0)), 12); + EXPECT_EQ(boost::get(tuple->get(1)), 12.0f); + EXPECT_EQ(boost::get(tuple->get(2)), "ElementVar"); + + delete tuple; +} + +TEST(Tuple, IsTheSameType) { + std::vector element_type1; + std::vector element_type2; + std::vector element_type3; + + element_type1.push_back(12); + element_type1.push_back(12.0f); + element_type1.push_back("Tuple1"); + + element_type2.push_back(13); + element_type2.push_back(13.0f); + element_type2.push_back("Tuple2"); + + element_type3.push_back(14.0f); + element_type3.push_back(14); + element_type3.push_back("Tuple3"); + + paddle::framework::Tuple* tuple1 = + paddle::framework::make_tuple(element_type1); + paddle::framework::Tuple* tuple2 = + paddle::framework::make_tuple(element_type2); + paddle::framework::Tuple* tuple3 = + paddle::framework::make_tuple(element_type3); + + EXPECT_TRUE(tuple1->isSameType(*tuple2)); + EXPECT_FALSE(tuple1->isSameType(*tuple3)); + + delete tuple1; + delete tuple2; + delete tuple3; +} diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h new file mode 100644 index 00000000..7f1bfb5d --- /dev/null +++ b/paddle/fluid/framework/type_defs.h @@ -0,0 +1,70 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/platform/variant.h" + +namespace paddle { +namespace framework { +class OperatorBase; +class OpDesc; +class InferShapeContext; +class InferVarTypeContext; +class BlockDesc; +class Variable; +class NoNeedBufferVarsInference; + +using VariableNameMap = std::map>; +// TODO(panyx0718): Replace vector with something like gtl::Vector. +using VariableValueMap = std::map>; + +// The order should be as same as framework.proto +using Attribute = + boost::variant, + std::vector, std::vector, bool, + std::vector, BlockDesc*, int64_t, + std::vector, std::vector>; + +using AttributeMap = std::unordered_map; + +using OpCreator = std::function; + +using GradOpMakerFN = std::function>( + const OpDesc&, const std::unordered_set& /*no_grad_set*/, + std::unordered_map* /*grad_to_var*/, + const std::vector& grad_block)>; + +using InferVarTypeFN = + std::function; + +using InferShapeFN = std::function; + +using InplacePair = std::unordered_map; +using InferInplaceOpFN = std::function; + +using InferNoNeedBufferVarsFN = std::function( + const VariableNameMap& /*inputs*/, const VariableNameMap& /*outputs*/, + const AttributeMap& /*attrs*/)>; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/unroll_array_ops.h b/paddle/fluid/framework/unroll_array_ops.h new file mode 100644 index 00000000..ab176410 --- /dev/null +++ b/paddle/fluid/framework/unroll_array_ops.h @@ -0,0 +1,132 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace framework { + +namespace detail { + +template +struct UnrollFillConstant { + template + HOSTDEVICE inline static void Run(T *data, T val) { + data[kStart] = val; + UnrollFillConstant::Run(data, val); + } +}; + +template +struct UnrollFillConstant { + template + HOSTDEVICE inline static void Run(T *data, T val) {} +}; + +template +struct UnrollAssign { + template + HOSTDEVICE inline static void Run(const Tin *d1, Tout *d2) { + d2[kStart] = static_cast(d1[kStart]); + UnrollAssign::Run(d1, d2); + } +}; + +template +struct UnrollAssign { + template + HOSTDEVICE inline static void Run(const Tin *d1, Tout *d2) {} +}; + +template +struct UnrollVarArgsAssignImpl { + template + HOSTDEVICE inline static void Run(T *d, T val, Args... args) { + static_assert(sizeof...(args) + 1 == kEnd - kStart, "Wrong argument"); + d[kStart] = val; + UnrollVarArgsAssignImpl::Run( + d, args...); + } +}; + +template +struct UnrollVarArgsAssignImpl { + HOSTDEVICE inline static void Run(T *d) {} +}; + +template +struct UnrollVarArgsAssign { + template + HOSTDEVICE inline static void Run(T *d, Args... args) { + UnrollVarArgsAssignImpl::Run( + d, args...); + } +}; + +template +struct UnrollCompare { + template + HOSTDEVICE inline static bool Run(const T *d1, const T *d2) { + return d1[kStart] == d2[kStart] && + UnrollCompare::Run(d1, d2); + } +}; + +template +struct UnrollCompare { + template + HOSTDEVICE inline constexpr static bool Run(const T *d1, const T *d2) { + return true; + } +}; + +template +struct UnrollProduct { + template + HOSTDEVICE inline static T Run(const T *d) { + return d[kStart] * + UnrollProduct::Run(d); + } +}; + +template +struct UnrollProduct { + template + HOSTDEVICE inline constexpr static T Run(const T *d) { + return 1; + } +}; + +} // namespace detail + +template +using UnrollFillConstant = detail::UnrollFillConstant<0, N, N == 0>; + +template +using UnrollAssign = detail::UnrollAssign<0, N, N == 0>; + +template +using UnrollVarArgsAssign = detail::UnrollVarArgsAssign; + +template +using UnrollCompare = detail::UnrollCompare<0, N, N == 0>; + +template +using UnrollProduct = detail::UnrollProduct<0, N, N == 0>; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/unroll_array_ops_test.cc b/paddle/fluid/framework/unroll_array_ops_test.cc new file mode 100644 index 00000000..be811478 --- /dev/null +++ b/paddle/fluid/framework/unroll_array_ops_test.cc @@ -0,0 +1,83 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/unroll_array_ops.h" +#include +#include +#include +#include + +namespace paddle { +namespace framework { + +template +bool CheckEquality(const T* p, size_t n, T val) { + return std::all_of(p, p + n, [val](const T& v) { return v == val; }); +} + +template +bool FillConstantTestMain() { + static_assert(D1 >= D2, ""); + std::array arr; + arr.fill(0); + + UnrollFillConstant::Run(arr.data(), 1); + return CheckEquality(arr.data(), D2, 1) && + CheckEquality(arr.data() + D2, arr.size() - D2, 0); +} + +TEST(unroll_ops, fill_constant) { + EXPECT_TRUE((FillConstantTestMain<9, 0>())); + EXPECT_TRUE((FillConstantTestMain<9, 1>())); + EXPECT_TRUE((FillConstantTestMain<9, 4>())); + EXPECT_TRUE((FillConstantTestMain<9, 9>())); +} + +TEST(unroll_ops, assign) { + const int a[] = {1, 2, 3, 4, 5}; + int b[] = {0, 0, 0, 0, 0}; + UnrollAssign<3>::Run(a, b); + EXPECT_EQ(b[0], 1); + EXPECT_EQ(b[1], 2); + EXPECT_EQ(b[2], 3); + EXPECT_EQ(b[3], 0); + EXPECT_EQ(b[4], 0); +} + +TEST(unroll_ops, var_args_assign) { + int a[] = {0, 0, 0}; + UnrollVarArgsAssign::Run(a, 1, 2); + EXPECT_EQ(a[0], 1); + EXPECT_EQ(a[1], 2); + EXPECT_EQ(a[2], 0); +} + +TEST(unroll_ops, compare) { + int a[] = {1, 2, 3}; + int b[] = {1, 2, 4}; + EXPECT_TRUE(UnrollCompare<2>::Run(a, b)); + EXPECT_FALSE(UnrollCompare<3>::Run(a, b)); + + b[0] = -1; + EXPECT_TRUE(UnrollCompare<0>::Run(a, b)); + EXPECT_FALSE(UnrollCompare<1>::Run(a, b)); +} + +TEST(unroll_ops, product) { + int a[] = {2, 3, 4}; + EXPECT_EQ(UnrollProduct<3>::Run(a), a[0] * a[1] * a[2]); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_desc.cc b/paddle/fluid/framework/var_desc.cc new file mode 100644 index 00000000..f3ea1f62 --- /dev/null +++ b/paddle/fluid/framework/var_desc.cc @@ -0,0 +1,275 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { + +proto::VarType::Type VarDesc::GetType() const { return desc_.type().type(); } + +void VarDesc::SetType(proto::VarType::Type type) { + desc_.mutable_type()->set_type(type); +} + +void VarDesc::SetShape(const std::vector &dims) { + VectorToRepeated(dims, mutable_tensor_desc()->mutable_dims()); +} + +void VarDesc::SetTensorDescNum(size_t num) { + switch (desc_.type().type()) { + case proto::VarType::READER: { + auto *lod_tensors_ptr = + desc_.mutable_type()->mutable_reader()->mutable_lod_tensor(); + lod_tensors_ptr->Clear(); + for (size_t i = 0; i < num; ++i) { + lod_tensors_ptr->Add(); + } + return; + } break; + default: + PADDLE_THROW( + "Setting 'sub_tensor_number' is not supported by the type of var %s.", + this->Name()); + } +} + +size_t VarDesc::GetTensorDescNum() const { + switch (desc_.type().type()) { + case proto::VarType::READER: + return desc_.type().reader().lod_tensor_size(); + break; + default: + PADDLE_THROW( + "Getting 'sub_tensor_number' is not supported by the type of var %s.", + this->Name()); + } +} + +void VarDesc::SetShapes( + const std::vector> &multiple_dims) { + if (multiple_dims.size() != GetTensorDescNum()) { + VLOG(3) << "WARNING: The number of given shapes(" << multiple_dims.size() + << ") doesn't match the existing tensor number(" + << GetTensorDescNum() + << "). The Reader is going to be reinitialized."; + SetTensorDescNum(multiple_dims.size()); + } + std::vector tensors = mutable_tensor_descs(); + for (size_t i = 0; i < multiple_dims.size(); ++i) { + VectorToRepeated(multiple_dims[i], tensors[i]->mutable_dims()); + } +} + +std::vector VarDesc::GetShape() const { + return RepeatedToVector(tensor_desc().dims()); +} + +std::vector> VarDesc::GetShapes() const { + std::vector descs = tensor_descs(); + std::vector> res; + res.reserve(descs.size()); + for (const auto &tensor_desc : descs) { + res.push_back(RepeatedToVector(tensor_desc.dims())); + } + return res; +} + +void VarDesc::SetDataType(proto::VarType::Type data_type) { + mutable_tensor_desc()->set_data_type(data_type); +} + +void VarDesc::SetDataTypes( + const std::vector &multiple_data_type) { + if (multiple_data_type.size() != GetTensorDescNum()) { + VLOG(3) << "WARNING: The number of given data types(" + << multiple_data_type.size() + << ") doesn't match the existing tensor number(" + << GetTensorDescNum() + << "). The Reader is going to be reinitialized."; + SetTensorDescNum(multiple_data_type.size()); + } + std::vector tensor_descs = + mutable_tensor_descs(); + for (size_t i = 0; i < multiple_data_type.size(); ++i) { + tensor_descs[i]->set_data_type(multiple_data_type[i]); + } +} + +proto::VarType::Type VarDesc::GetDataType() const { + return tensor_desc().data_type(); +} + +std::vector VarDesc::GetDataTypes() const { + std::vector descs = tensor_descs(); + std::vector res; + res.reserve(descs.size()); + for (const auto &tensor_desc : descs) { + res.push_back(tensor_desc.data_type()); + } + return res; +} + +void VarDesc::SetLoDLevel(int32_t lod_level) { + switch (desc_.type().type()) { + case proto::VarType::LOD_TENSOR: + desc_.mutable_type()->mutable_lod_tensor()->set_lod_level(lod_level); + break; + case proto::VarType::LOD_TENSOR_ARRAY: + desc_.mutable_type()->mutable_tensor_array()->set_lod_level(lod_level); + break; + default: + PADDLE_THROW( + "Setting 'lod_level' is not supported by the type of var %s.", + this->Name()); + } +} + +void VarDesc::SetLoDLevels(const std::vector &multiple_lod_level) { + if (multiple_lod_level.size() != GetTensorDescNum()) { + VLOG(3) << "WARNING: The number of given lod_levels(" + << multiple_lod_level.size() + << ") doesn't match the existing tensor number(" + << GetTensorDescNum() + << "). The Reader is going to be reinitialized."; + SetTensorDescNum(multiple_lod_level.size()); + } + switch (desc_.type().type()) { + case proto::VarType::READER: { + size_t i = 0; + for (auto &lod_tensor : + *desc_.mutable_type()->mutable_reader()->mutable_lod_tensor()) { + lod_tensor.set_lod_level(multiple_lod_level[i++]); + } + } break; + default: + PADDLE_THROW( + "Setting 'lod_levels' is not supported by the type of var %s.", + this->Name()); + } +} + +int32_t VarDesc::GetLoDLevel() const { + switch (desc_.type().type()) { + case proto::VarType::LOD_TENSOR: + return desc_.type().lod_tensor().lod_level(); + case proto::VarType::LOD_TENSOR_ARRAY: + return desc_.type().tensor_array().lod_level(); + default: + PADDLE_THROW( + "Getting 'lod_level' is not supported by the type of var %s.", + this->Name()); + } +} + +std::vector VarDesc::GetLoDLevels() const { + std::vector res; + switch (desc_.type().type()) { + case proto::VarType::READER: + res.reserve(desc_.type().reader().lod_tensor_size()); + for (auto &lod_tensor : desc_.type().reader().lod_tensor()) { + res.push_back(lod_tensor.lod_level()); + } + return res; + break; + default: + PADDLE_THROW( + "Getting 'lod_levels' is not supported by the type of var %s.", + this->Name()); + } +} + +const proto::VarType::TensorDesc &VarDesc::tensor_desc() const { + PADDLE_ENFORCE(desc_.has_type(), "The var's type hasn't been set."); + PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set."); + switch (desc_.type().type()) { + case proto::VarType::SELECTED_ROWS: + return desc_.type().selected_rows(); + case proto::VarType::LOD_TENSOR: + return desc_.type().lod_tensor().tensor(); + case proto::VarType::LOD_TENSOR_ARRAY: + return desc_.type().tensor_array().tensor(); + default: + PADDLE_THROW( + "Getting 'tensor_desc' is not supported by the type of var %s.", + this->Name()); + } +} + +std::vector VarDesc::tensor_descs() const { + PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set."); + std::vector res; + res.reserve(GetTensorDescNum()); + switch (desc_.type().type()) { + case proto::VarType::READER: + for (const auto &lod_tensor : desc_.type().reader().lod_tensor()) { + res.push_back(lod_tensor.tensor()); + } + return res; + default: + PADDLE_THROW( + "Getting 'tensor_descs' is not supported by the type of var " + "%s.", + this->Name()); + } +} + +proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() { + PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set."); + PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set."); + switch (desc_.type().type()) { + case proto::VarType::SELECTED_ROWS: + return desc_.mutable_type()->mutable_selected_rows(); + case proto::VarType::LOD_TENSOR: + return desc_.mutable_type()->mutable_lod_tensor()->mutable_tensor(); + case proto::VarType::LOD_TENSOR_ARRAY: + return desc_.mutable_type()->mutable_tensor_array()->mutable_tensor(); + default: + PADDLE_THROW( + "Getting 'mutable_tensor_desc' is not supported by the type of var " + "%s.", + this->Name()); + } +} + +std::vector VarDesc::mutable_tensor_descs() { + PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set."); + PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set."); + std::vector res; + res.reserve(GetTensorDescNum()); + switch (desc_.type().type()) { + case proto::VarType::READER: + for (auto &lod_tensor : + *desc_.mutable_type()->mutable_reader()->mutable_lod_tensor()) { + res.push_back(lod_tensor.mutable_tensor()); + } + return res; + default: + PADDLE_THROW( + "Getting 'tensor_descs' is not supported by the type of var " + "%s.", + this->Name()); + } +} + +bool operator==(const VarDesc &left, const VarDesc &right) { + return left.Proto()->SerializeAsString() == + right.Proto()->SerializeAsString(); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_desc.h b/paddle/fluid/framework/var_desc.h new file mode 100644 index 00000000..7c82e1d6 --- /dev/null +++ b/paddle/fluid/framework/var_desc.h @@ -0,0 +1,124 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "glog/logging.h" +#include "paddle/fluid/framework/framework.pb.h" + +namespace paddle { +namespace framework { + +// convert between std::vector and protobuf repeated. +template +inline std::vector RepeatedToVector( + const google::protobuf::RepeatedField &repeated_field) { + std::vector ret; + ret.reserve(repeated_field.size()); + std::copy(repeated_field.begin(), repeated_field.end(), + std::back_inserter(ret)); + return ret; +} + +template +inline void VectorToRepeated(const std::vector &vec, + RepeatedField *repeated_field) { + repeated_field->Clear(); + repeated_field->Reserve(vec.size()); + for (const auto &elem : vec) { + *repeated_field->Add() = elem; + } +} + +// Specialize vector. +template +inline void VectorToRepeated(const std::vector &vec, + RepeatedField *repeated_field) { + repeated_field->Clear(); + repeated_field->Reserve(vec.size()); + for (auto elem : vec) { + *repeated_field->Add() = elem; + } +} + +class VarDesc { + public: + explicit VarDesc(const std::string &name) { + desc_.set_name(name); + // TODO(paddle-dev): Why default to lodtensor. + desc_.mutable_type()->set_type(proto::VarType::LOD_TENSOR); + } + + explicit VarDesc(const proto::VarDesc &desc) : desc_(desc) {} + + proto::VarDesc *Proto() { return &desc_; } + + const proto::VarDesc *Proto() const { return &desc_; } + + std::string Name() const { return desc_.name(); } + + void SetName(std::string name) { desc_.set_name(name); } + + void SetTensorDescNum(size_t num); + + size_t GetTensorDescNum() const; + + void SetShape(const std::vector &dims); + + void SetShapes(const std::vector> &multiple_dims); + + std::vector GetShape() const; + + std::vector> GetShapes() const; + + void SetDataType(proto::VarType::Type data_type); + + void SetDataTypes( + const std::vector &multiple_data_type); + + proto::VarType::Type GetDataType() const; + + std::vector GetDataTypes() const; + + void SetLoDLevel(int32_t lod_level); + + void SetLoDLevels(const std::vector &multiple_lod_level); + + int32_t GetLoDLevel() const; + + std::vector GetLoDLevels() const; + + proto::VarType::Type GetType() const; + + void SetType(proto::VarType::Type type); + + bool Persistable() const { return desc_.persistable(); } + + void SetPersistable(bool persistable) { desc_.set_persistable(persistable); } + + private: + const proto::VarType::TensorDesc &tensor_desc() const; + std::vector tensor_descs() const; + proto::VarType::TensorDesc *mutable_tensor_desc(); + std::vector mutable_tensor_descs(); + + proto::VarDesc desc_; +}; + +bool operator==(const VarDesc &left, const VarDesc &right); +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h new file mode 100644 index 00000000..73be446f --- /dev/null +++ b/paddle/fluid/framework/var_type.h @@ -0,0 +1,70 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/framework/variable.h" + +namespace paddle { +namespace framework { + +template +inline bool IsType(const std::type_index& type) { + return type == typeid(T); +} + +inline proto::VarType::Type ToVarType(int type) { + switch (type) { + case proto::VarType::LOD_TENSOR: + case proto::VarType::SELECTED_ROWS: + case proto::VarType::LOD_RANK_TABLE: + case proto::VarType::LOD_TENSOR_ARRAY: + case proto::VarType::READER: + return static_cast(type); + default: + PADDLE_THROW("ToVarType:Unsupported type %d", type); + } +} + +template +inline void VisitVarType(const framework::Variable& var, Visitor visitor) { + switch (var.Type()) { + case proto::VarType::LOD_TENSOR: + visitor(var.Get()); + return; + case proto::VarType::LOD_RANK_TABLE: + visitor(var.Get()); + return; + case proto::VarType::LOD_TENSOR_ARRAY: + visitor(var.Get()); + return; + case proto::VarType::SELECTED_ROWS: + visitor(var.Get()); + return; + case proto::VarType::READER: + visitor(var.Get()); + return; + default: + PADDLE_THROW("Not supported visit type, %s", ToTypeName(var.Type())); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_type_inference.h b/paddle/fluid/framework/var_type_inference.h new file mode 100644 index 00000000..66e6ac81 --- /dev/null +++ b/paddle/fluid/framework/var_type_inference.h @@ -0,0 +1,156 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/type_defs.h" + +namespace paddle { +namespace framework { + +class OpDesc; +class BlockDesc; +// default infer var type context +class InferVarTypeContext { + public: + InferVarTypeContext(const OpDesc* op, BlockDesc* block) + : op_(op), block_(block) {} + + virtual ~InferVarTypeContext() {} + + virtual Attribute GetAttr(const std::string& name) const { + PADDLE_ENFORCE_NOT_NULL(op_); + return op_->GetAttr(name); + } + + virtual bool HasVar(const std::string& name) const { + PADDLE_ENFORCE_NOT_NULL(block_); + return block_->FindVarRecursive(name) != nullptr; + } + + virtual bool HasInput(const std::string& name) const { + PADDLE_ENFORCE_NOT_NULL(op_); + auto& inputs = op_->Inputs(); + auto input = inputs.find(name); + return input != inputs.end() && !input->second.empty(); + } + + virtual bool HasOutput(const std::string& name) const { + PADDLE_ENFORCE_NOT_NULL(op_); + auto& outputs = op_->Outputs(); + auto output = outputs.find(name); + return output != outputs.end() && !output->second.empty(); + } + + virtual const std::vector& Input(const std::string& name) const { + PADDLE_ENFORCE_NOT_NULL(op_); + return op_->Input(name); + } + + virtual const std::vector& Output( + const std::string& name) const { + PADDLE_ENFORCE_NOT_NULL(op_); + return op_->Output(name); + } + + virtual proto::VarType::Type GetType(const std::string& name) const { + PADDLE_ENFORCE_NOT_NULL(block_); + return block_->FindRecursiveOrCreateVar(name).GetType(); + } + + virtual void SetType(const std::string& name, proto::VarType::Type type) { + PADDLE_ENFORCE_NOT_NULL(block_); + block_->FindRecursiveOrCreateVar(name).SetType(type); + } + + virtual proto::VarType::Type GetDataType(const std::string& name) const { + PADDLE_ENFORCE_NOT_NULL(block_); + return block_->FindRecursiveOrCreateVar(name).GetDataType(); + } + + virtual void SetDataType(const std::string& name, proto::VarType::Type type) { + PADDLE_ENFORCE_NOT_NULL(block_); + block_->FindRecursiveOrCreateVar(name).SetDataType(type); + } + + virtual std::vector GetDataTypes( + const std::string& name) const { + PADDLE_ENFORCE_NOT_NULL(block_); + return block_->FindRecursiveOrCreateVar(name).GetDataTypes(); + } + + virtual void SetDataTypes( + const std::string& name, + const std::vector& multiple_data_type) { + PADDLE_ENFORCE_NOT_NULL(block_); + block_->FindRecursiveOrCreateVar(name).SetDataTypes(multiple_data_type); + } + + virtual std::vector GetShape(const std::string& name) const { + PADDLE_ENFORCE_NOT_NULL(block_); + return block_->FindRecursiveOrCreateVar(name).GetShape(); + } + + virtual void SetShape(const std::string& name, + const std::vector& dims) { + PADDLE_ENFORCE_NOT_NULL(block_); + block_->FindRecursiveOrCreateVar(name).SetShape(dims); + } + + virtual int32_t GetLoDLevel(const std::string& name) const { + PADDLE_ENFORCE_NOT_NULL(block_); + return block_->FindRecursiveOrCreateVar(name).GetLoDLevel(); + } + + virtual void SetLoDLevel(const std::string& name, int32_t lod_level) { + PADDLE_ENFORCE_NOT_NULL(block_); + block_->FindRecursiveOrCreateVar(name).SetLoDLevel(lod_level); + } + + protected: + const OpDesc* op_; + BlockDesc* block_; +}; + +class VarTypeInference { + public: + virtual ~VarTypeInference() {} + virtual void operator()(InferVarTypeContext* context) const = 0; // NOLINT +}; + +class PassInDtypeAndVarTypeToOutput : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext* ctx) const final { // NOLINT + auto in_out_var_names = this->GetInputOutputWithSameType(); + + for (auto& i_o_n : in_out_var_names) { + auto& x_name = ctx->Input(i_o_n.first).at(0); + auto& out_name = ctx->Output(i_o_n.second).at(0); + + ctx->SetType(out_name, ctx->GetType(x_name)); + ctx->SetDataType(out_name, ctx->GetDataType(x_name)); + } + } + + protected: + virtual std::unordered_map + GetInputOutputWithSameType() const = 0; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_type_inference_test.cc b/paddle/fluid/framework/var_type_inference_test.cc new file mode 100644 index 00000000..6bbb25a5 --- /dev/null +++ b/paddle/fluid/framework/var_type_inference_test.cc @@ -0,0 +1,116 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/var_type_inference.h" +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { + +class NOP : public OperatorBase { + public: + NOP(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const Scope &scope, + const platform::Place &place) const override {} +}; + +class SumOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "").AsDuplicable(); + AddOutput("Out", ""); + AddComment(""); + } +}; + +class SumOpVarTypeInference : public VarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto &inputs = ctx->Input("X"); + auto default_var_type = proto::VarType::SELECTED_ROWS; + + bool any_input_is_lod_tensor = std::any_of( + inputs.begin(), inputs.end(), [&ctx](const std::string &name) { + return ctx->GetType(name) == proto::VarType::LOD_TENSOR; + }); + if (any_input_is_lod_tensor) { + default_var_type = proto::VarType::LOD_TENSOR; + } + + auto out_var_name = ctx->Output("Out").front(); + ctx->SetType(out_var_name, default_var_type); + } +}; +} // namespace framework +} // namespace paddle + +REGISTER_OPERATOR(sum, paddle::framework::NOP, paddle::framework::SumOpMaker, + paddle::framework::SumOpVarTypeInference); +REGISTER_OPERATOR(sum_without_infer_var_type, paddle::framework::NOP, + paddle::framework::SumOpMaker); + +namespace paddle { +namespace framework { + +TEST(InferVarType, sum_op) { + ProgramDesc prog; + auto *op = prog.MutableBlock(0)->AppendOp(); + op->SetType("sum"); + op->SetInput("X", {"test_a", "test_b", "test_c"}); + op->SetOutput("Out", {"test_out"}); + + prog.MutableBlock(0)->Var("test_a")->SetType(proto::VarType::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarType::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test_c")->SetType(proto::VarType::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test_out"); + + op->InferVarType(prog.MutableBlock(0)); + + ASSERT_EQ(proto::VarType::SELECTED_ROWS, + prog.MutableBlock(0)->Var("test_out")->GetType()); + + prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarType::LOD_TENSOR); + op->InferVarType(prog.MutableBlock(0)); + ASSERT_EQ(proto::VarType::LOD_TENSOR, + prog.MutableBlock(0)->Var("test_out")->GetType()); +} + +TEST(InferVarType, sum_op_without_infer_var_type) { + ProgramDesc prog; + auto *op = prog.MutableBlock(0)->AppendOp(); + op->SetType("sum_without_infer_var_type"); + op->SetInput("X", {"test2_a", "test2_b", "test2_c"}); + op->SetOutput("Out", {"test2_out"}); + + prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test2_out"); + + op->InferVarType(prog.MutableBlock(0)); + + ASSERT_EQ(proto::VarType::LOD_TENSOR, + prog.MutableBlock(0)->Var("test2_out")->GetType()); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc new file mode 100644 index 00000000..7cc2b3b4 --- /dev/null +++ b/paddle/fluid/framework/var_type_traits.cc @@ -0,0 +1,123 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/var_type_traits.h" +#include +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" +#include "paddle/fluid/platform/macros.h" +#ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#include "paddle/fluid/platform/nccl_helper.h" +#endif +#include +#include "paddle/fluid/operators/conv_cudnn_op_cache.h" +#include "paddle/fluid/operators/cudnn_rnn_cache.h" +#endif + +namespace paddle { +namespace framework { + +// Besides registering variable type id, it is helpful to register a +// var_id -> std::type_index map (for example, get type names according to id) +namespace detail { + +template +struct VarIdToTypeIndexMapInitializerImpl { + template + static void Init(MapType1 *id_to_type, MapType2 *type_to_id) { + using Type = + typename std::tuple_element::type; + static_assert(!std::is_same::value, "Type cannot be void"); + constexpr int kId = VarTypeTrait::kId; + auto type = std::type_index(typeid(Type)); + PADDLE_ENFORCE(id_to_type->count(kId) == 0, + "Registered duplicate type id %d for type %s", kId, + type.name()); + PADDLE_ENFORCE(type_to_id->count(type) == 0, + "Registered duplicate type_index %s for id %d", type.name(), + kId); + id_to_type->emplace(kId, type); + type_to_id->emplace(type, kId); + VarIdToTypeIndexMapInitializerImpl::Init(id_to_type, + type_to_id); + } +}; + +template +struct VarIdToTypeIndexMapInitializerImpl { + template + static void Init(MapType1 *, MapType2 *) {} +}; + +// VarIdToTypeIndexMapInitializer is designed to initialize var_id -> +// std::type_index map and std::type_index -> var_id map +using VarIdToTypeIndexMapInitializer = + VarIdToTypeIndexMapInitializerImpl<0, VarTypeRegistry::kRegisteredTypeNum, + VarTypeRegistry::kRegisteredTypeNum == + 0>; + +struct VarIdToTypeIndexMapHolder { + DISABLE_COPY_AND_ASSIGN(VarIdToTypeIndexMapHolder); + + public: + static const std::type_index &ToTypeIndex(int var_id) { + auto it = Instance().id_to_type_map_.find(var_id); + PADDLE_ENFORCE(it != Instance().id_to_type_map_.end(), + "VarId %d is not registered.", var_id); + return it->second; + } + + static int ToTypeId(const std::type_index &type) { + auto it = Instance().type_to_id_map_.find(type); + PADDLE_ENFORCE(it != Instance().type_to_id_map_.end(), + "VarType %s is not registered.", type.name()); + return it->second; + } + + private: + VarIdToTypeIndexMapHolder() { + VarIdToTypeIndexMapInitializer::Init(&id_to_type_map_, &type_to_id_map_); + } + + static const VarIdToTypeIndexMapHolder &Instance() { + static const VarIdToTypeIndexMapHolder instance; + return instance; + } + + std::unordered_map id_to_type_map_; + std::unordered_map type_to_id_map_; +}; + +} // namespace detail + +const std::type_index &VarTraitIdToTypeIndex(int var_id) { + return detail::VarIdToTypeIndexMapHolder::ToTypeIndex(var_id); +} + +const char *ToTypeName(int var_id) { + return VarTraitIdToTypeIndex(var_id).name(); +} + +int TypeIndexToVarTraitId(const std::type_index &type) { + return detail::VarIdToTypeIndexMapHolder::ToTypeId(type); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h new file mode 100644 index 00000000..7147f062 --- /dev/null +++ b/paddle/fluid/framework/var_type_traits.h @@ -0,0 +1,191 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/platform/place.h" +#ifdef PADDLE_WITH_CUDA +#include +#ifndef _WIN32 +#include +#endif +#endif + +// Users should add forward declarations here +namespace paddle { + +namespace platform { +#ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 +class Communicator; +class NCCLCommunicator; +#endif +#endif +} // namespace platform + +namespace framework { +class Tensor; +class LoDTensor; +class SelectedRows; +class LoDRankTable; +class ReaderHolder; +class Scope; +} // namespace framework + +namespace operators { + +class CudnnRNNCache; + +namespace reader { +class LoDTensorBlockingQueueHolder; +} // namespace reader +} // namespace operators + +} // namespace paddle + +namespace paddle { +namespace framework { + +const char *ToTypeName(int var_id); +const std::type_index &VarTraitIdToTypeIndex(int var_id); +int TypeIndexToVarTraitId(const std::type_index &type); + +namespace detail { + +template +struct TypePosFinderImpl { + static constexpr int kPos = + std::is_same::value + ? kStart + : TypePosFinderImpl::kPos; +}; + +template +struct TypePosFinderImpl { + static constexpr int kPos = std::is_same::value ? kStart : -1; +}; + +// TypePosFinder helps to find the position in which T is inside Args... +// If T is not inside Args..., kPos would be -1 +template +struct TypePosFinder { + static constexpr int kPos = + TypePosFinderImpl::kPos; +}; + +template +struct VarTypeRegistryImpl { + static constexpr size_t kRegisteredTypeNum = sizeof...(Args); + using ArgTuple = std::tuple; + + // TypePos() returns the position in which T is inside Args... + // If T is not inside Args..., return -1 + template + static constexpr int TypePos() { + return TypePosFinder::kPos; + } + + // IsRegistered() returns whether T is registered inside RegistryImpl + template + static constexpr bool IsRegistered() { + return TypePos() >= 0; + } +}; + +} // namespace detail + +#define REG_PROTO_VAR_TYPE_TRAIT(type, proto_id) \ + template <> \ + struct VarTypeTrait { \ + static_assert(VarTypeRegistry::IsRegistered(), \ + "Must be registered type"); \ + using Type = type; \ + static constexpr int kId = static_cast(proto_id); \ + } + +/** + * The following codes are designed to register variable types. + * Only registered types can be stored in Variable. + * This registry mechanism is designed to speed up Variable. + * + * Caution: If you want to add more var types, please consider carefully + * whether you really need to add it. + */ + +// Users should add other variable types below. +// Paddle would generate unique Ids for each registered variable types. +using VarTypeRegistry = detail::VarTypeRegistryImpl< + Tensor, LoDTensor, SelectedRows, std::vector, LoDRankTable, + LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope *, + std::map, operators::reader::LoDTensorBlockingQueueHolder, +#ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 + ncclUniqueId, platform::Communicator, platform::NCCLCommunicator, +#endif + operators::CudnnRNNCache, +#endif + int, float>; + +template +struct VarTypeTrait { + static_assert(VarTypeRegistry::IsRegistered(), "Must be registered type"); + using Type = T; + /** + * Unique VarType Id generation. + * + * The auto-generated id should not be the same as any protobuf id defined in + * framework.proto. Therefore, we generate id by adding the type pos and + * maximum protobuf id (i.e., proto::VarType::TUPLE). + * + * However, we may need more protobuf id in the future. + * To avoid changing this auto id generation algorithm frequently, we + * generate id by adding the type pos and twice of maximum protobuf id (i.e., + * proto::VarType::TUPLE). + */ + static constexpr int kId = VarTypeRegistry::TypePos() + + static_cast(proto::VarType::TUPLE) * 2; +}; + +// Users should set some of variable type ids to be what is defined in +// framework.proto below +REG_PROTO_VAR_TYPE_TRAIT(LoDTensor, proto::VarType::LOD_TENSOR); +REG_PROTO_VAR_TYPE_TRAIT(SelectedRows, proto::VarType::SELECTED_ROWS); +REG_PROTO_VAR_TYPE_TRAIT(std::vector, proto::VarType::STEP_SCOPES); +REG_PROTO_VAR_TYPE_TRAIT(LoDRankTable, proto::VarType::LOD_RANK_TABLE); +REG_PROTO_VAR_TYPE_TRAIT(LoDTensorArray, proto::VarType::LOD_TENSOR_ARRAY); +REG_PROTO_VAR_TYPE_TRAIT(platform::PlaceList, proto::VarType::PLACE_LIST); +REG_PROTO_VAR_TYPE_TRAIT(ReaderHolder, proto::VarType::READER); +REG_PROTO_VAR_TYPE_TRAIT(int, proto::VarType::INT32); +REG_PROTO_VAR_TYPE_TRAIT(float, proto::VarType::FP32); + +/** End of variable type registration */ + +template +inline constexpr bool IsRegisteredVarType() { + return VarTypeRegistry::IsRegistered(); +} + +#undef REG_PROTO_VAR_TYPE_TRAIT +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/var_type_traits_test.cc b/paddle/fluid/framework/var_type_traits_test.cc new file mode 100644 index 00000000..67dbfd74 --- /dev/null +++ b/paddle/fluid/framework/var_type_traits_test.cc @@ -0,0 +1,122 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" +#ifdef PADDLE_WITH_CUDA +#ifndef _WIN32 +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#include "paddle/fluid/platform/nccl_helper.h" +#endif +#include "paddle/fluid/operators/conv_cudnn_op_cache.h" +#include "paddle/fluid/operators/cudnn_rnn_cache.h" +#endif + +namespace paddle { +namespace framework { + +template +struct TypeIndexChecker { + template + static void Check(SetType1 *var_id_set, SetType2 *type_index_set) { + using Type = + typename std::tuple_element::type; + static_assert(std::is_same::Type, Type>::value, + "Type must be the same"); + constexpr auto kId = VarTypeTrait::kId; + std::type_index actual_type(typeid(Type)); + EXPECT_EQ(std::string(ToTypeName(kId)), std::string(actual_type.name())); + EXPECT_EQ(VarTraitIdToTypeIndex(kId), actual_type); + EXPECT_EQ(TypeIndexToVarTraitId(actual_type), kId); + EXPECT_EQ(VarTraitIdToTypeIndex(TypeIndexToVarTraitId(actual_type)), + actual_type); + EXPECT_EQ(TypeIndexToVarTraitId(VarTraitIdToTypeIndex(kId)), kId); + + EXPECT_TRUE(var_id_set->count(kId) == 0); // NOLINT + EXPECT_TRUE(type_index_set->count(actual_type) == 0); // NOLINT + var_id_set->insert(kId); + type_index_set->insert(std::type_index(typeid(Type))); + TypeIndexChecker::Check(var_id_set, + type_index_set); + } +}; + +template +struct TypeIndexChecker { + template + static void Check(SetType1 *, SetType2 *) {} +}; + +TEST(var_type_traits, check_no_duplicate_registry) { + constexpr size_t kRegisteredNum = VarTypeRegistry::kRegisteredTypeNum; + std::unordered_set var_id_set; + std::unordered_set type_index_set; + TypeIndexChecker<0, kRegisteredNum, kRegisteredNum == 0>::Check( + &var_id_set, &type_index_set); +} + +template +bool CheckVarId(int proto_id) { + static_assert(std::is_same::Type, T>::value, + "Type must be the same"); + return VarTypeTrait::kId == proto_id; +} + +TEST(var_type_traits, check_proto_type_id) { + ASSERT_TRUE(CheckVarId(proto::VarType::LOD_TENSOR)); + ASSERT_TRUE(CheckVarId(proto::VarType::SELECTED_ROWS)); + ASSERT_TRUE(CheckVarId>(proto::VarType::STEP_SCOPES)); + ASSERT_TRUE(CheckVarId(proto::VarType::LOD_RANK_TABLE)); + ASSERT_TRUE(CheckVarId(proto::VarType::LOD_TENSOR_ARRAY)); + ASSERT_TRUE(CheckVarId(proto::VarType::PLACE_LIST)); + ASSERT_TRUE(CheckVarId(proto::VarType::READER)); + ASSERT_TRUE(CheckVarId(proto::VarType::INT32)); + ASSERT_TRUE(CheckVarId(proto::VarType::FP32)); + + ASSERT_EQ(proto::VarType_Type_LOD_TENSOR, proto::VarType::LOD_TENSOR); + ASSERT_EQ(proto::VarType_Type_SELECTED_ROWS, proto::VarType::SELECTED_ROWS); + ASSERT_EQ(proto::VarType_Type_STEP_SCOPES, proto::VarType::STEP_SCOPES); + ASSERT_EQ(proto::VarType_Type_LOD_RANK_TABLE, proto::VarType::LOD_RANK_TABLE); + ASSERT_EQ(proto::VarType_Type_LOD_TENSOR_ARRAY, + proto::VarType::LOD_TENSOR_ARRAY); + ASSERT_EQ(proto::VarType_Type_PLACE_LIST, proto::VarType::PLACE_LIST); + ASSERT_EQ(proto::VarType_Type_READER, proto::VarType::READER); + ASSERT_EQ(proto::VarType_Type_FEED_MINIBATCH, proto::VarType::FEED_MINIBATCH); + ASSERT_EQ(proto::VarType_Type_FETCH_LIST, proto::VarType::FETCH_LIST); + ASSERT_EQ(proto::VarType_Type_RAW, proto::VarType::RAW); + ASSERT_EQ(proto::VarType_Type_TUPLE, proto::VarType::TUPLE); + ASSERT_EQ(proto::VarType_Type_INT32, proto::VarType::INT32); + ASSERT_EQ(proto::VarType_Type_FP32, proto::VarType::FP32); +} + +TEST(var_type_traits, test_registry) { + using Registry = detail::VarTypeRegistryImpl; + ASSERT_TRUE(Registry::TypePos() == 0); + ASSERT_TRUE(Registry::TypePos() == 1); + ASSERT_TRUE(Registry::TypePos() == 2); + ASSERT_TRUE(Registry::TypePos() == 3); + ASSERT_TRUE(Registry::TypePos() == -1); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h new file mode 100644 index 00000000..b9d07da8 --- /dev/null +++ b/paddle/fluid/framework/variable.h @@ -0,0 +1,104 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include +#include + +#include "paddle/fluid/framework/var_type_traits.h" + +namespace paddle { +namespace framework { + +class Variable { + public: + template + const T& Get() const { + static_assert( + IsRegisteredVarType(), + "Not registered type. Please register T inside var_type_traits.h"); + PADDLE_ENFORCE(holder_ != nullptr, "Variable must hold some thing"); + PADDLE_ENFORCE(holder_->Type() == VarTypeTrait::kId, + "Variable must be type %s, the holding type is %s", + ToTypeName(VarTypeTrait::kId), + ToTypeName(holder_->Type())); + return *static_cast(holder_->Ptr()); + } + + bool IsInitialized() const { return holder_ != nullptr; } + + template + T* GetMutable() { + if (!holder_) { + holder_.reset(new PlaceholderImpl()); + } else { + PADDLE_ENFORCE(holder_->Type() == VarTypeTrait::kId, + "Variable must be type %s, the holding type is %s", + ToTypeName(VarTypeTrait::kId), + ToTypeName(holder_->Type())); + } + return static_cast(holder_->Ptr()); + } + + template + bool IsType() const { + return holder_ && holder_->Type() == VarTypeTrait::kId; + } + + void Clear() { holder_.reset(); } + + int Type() const { + PADDLE_ENFORCE(holder_ != nullptr, "Must hold memory"); + return holder_->Type(); + } + + private: + struct Placeholder { + virtual ~Placeholder() = default; + + inline int Type() const { return type_; } + inline const void* Ptr() const { return ptr_; } + inline void* Ptr() { return ptr_; } + + protected: + inline void Init(void* p, int type) { + ptr_ = p; + type_ = type; + } + + void* ptr_; + int type_; + }; + + // Placeholder hides type T, so it doesn't appear as a template + // parameter of Variable. + template + struct PlaceholderImpl : public Placeholder { + static_assert( + IsRegisteredVarType(), + "Not registered type. Please register T inside var_type_traits.h"); + PlaceholderImpl() { this->Init(&obj_, VarTypeTrait::kId); } + + private: + T obj_; + }; + + // pointers to a PlaceholderImpl object indeed. + std::unique_ptr holder_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/variable_helper.cc b/paddle/fluid/framework/variable_helper.cc new file mode 100644 index 00000000..65c939af --- /dev/null +++ b/paddle/fluid/framework/variable_helper.cc @@ -0,0 +1,83 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/variable_helper.h" + +#include + +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace framework { + +void InitializeVariable(Variable *var, proto::VarType::Type var_type) { + if (var_type == proto::VarType::LOD_TENSOR) { + var->GetMutable(); + } else if (var_type == proto::VarType::SELECTED_ROWS) { + var->GetMutable(); + } else if (var_type == proto::VarType::FEED_MINIBATCH) { + var->GetMutable(); + } else if (var_type == proto::VarType::FETCH_LIST) { + var->GetMutable(); + } else if (var_type == proto::VarType::STEP_SCOPES) { + var->GetMutable>(); + } else if (var_type == proto::VarType::LOD_RANK_TABLE) { + var->GetMutable(); + } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { + var->GetMutable(); + } else if (var_type == proto::VarType::PLACE_LIST) { + var->GetMutable(); + } else if (var_type == proto::VarType::READER) { + var->GetMutable(); + } else if (var_type == proto::VarType::RAW) { + // GetMutable will be called in operator + } else { + PADDLE_THROW( + "Variable type %d is not in " + "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, " + "LOD_RANK_TABLE, PLACE_LIST, READER, RAW]", + var_type); + } +} + +void CopyVariable(const Variable &src_var, Variable *dst_var) { + // only support cpu now + auto cpu_place = platform::CPUPlace(); + + if (src_var.IsType()) { + auto *tmp_grad_tensor = dst_var->GetMutable(); + auto &src_tensor = src_var.Get(); + tmp_grad_tensor->set_lod(src_tensor.lod()); + framework::TensorCopy(src_tensor, cpu_place, tmp_grad_tensor); + } else if (src_var.IsType()) { + auto &src_slr = src_var.Get(); + auto *tmp_grad_slr = dst_var->GetMutable(); + tmp_grad_slr->set_rows(src_slr.rows()); + tmp_grad_slr->set_height(src_slr.height()); + auto &src_t = src_slr.value(); + auto *dst_t = tmp_grad_slr->mutable_value(); + framework::TensorCopy(src_t, cpu_place, dst_t); + } else { + PADDLE_THROW("unknown var type to copy"); + } +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/variable_helper.h b/paddle/fluid/framework/variable_helper.h new file mode 100644 index 00000000..5a2c267b --- /dev/null +++ b/paddle/fluid/framework/variable_helper.h @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/variable.h" +namespace paddle { +namespace framework { + +void InitializeVariable(Variable* var, proto::VarType::Type var_type); +void CopyVariable(const Variable& src_var, Variable* dst_var); + +} // end namespace framework +} // end namespace paddle diff --git a/paddle/fluid/framework/variable_test.cc b/paddle/fluid/framework/variable_test.cc new file mode 100644 index 00000000..511c9c52 --- /dev/null +++ b/paddle/fluid/framework/variable_test.cc @@ -0,0 +1,43 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/variable.h" + +namespace paddle { +namespace framework { + +TEST(Variable, GetMutable) { + std::unique_ptr v(new Variable()); + + auto* t = v->GetMutable(); + *t = "1234"; + + const auto& tt = v->Get(); + EXPECT_EQ("1234", tt); + + try { + v->GetMutable(); + } catch (std::exception& e) { + return; + } + EXPECT_TRUE(false); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/version.cc b/paddle/fluid/framework/version.cc new file mode 100644 index 00000000..81c0392b --- /dev/null +++ b/paddle/fluid/framework/version.cc @@ -0,0 +1,36 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/version.h" +#include + +namespace paddle { +namespace framework { +bool IsProgramVersionSupported(int64_t version) { + static int num_supported = + sizeof(kSupportedProgramVersion) / sizeof(kSupportedProgramVersion[0]); + return std::find(kSupportedProgramVersion, + kSupportedProgramVersion + num_supported, + version) != kSupportedProgramVersion + num_supported; +} + +bool IsTensorVersionSupported(uint32_t version) { + static int num_supported = + sizeof(kSupportedTensorVersion) / sizeof(kSupportedTensorVersion[0]); + return std::find(kSupportedTensorVersion, + kSupportedTensorVersion + num_supported, + version) != kSupportedTensorVersion + num_supported; +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/version.h b/paddle/fluid/framework/version.h new file mode 100644 index 00000000..9945bc58 --- /dev/null +++ b/paddle/fluid/framework/version.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#pragma once + +namespace paddle { +namespace framework { + +// Note: +// Program and Tensor that pass the IsXXXVersionSupported should +// be supported by the current codes. Otherwise, it's a compatibility +// bug. + +// The program version the current codes generate. +constexpr int64_t kCurProgramVersion = 0; + +// The program version that was generated by previous or current codes +// and supported by current codes. +constexpr int64_t kSupportedProgramVersion[] = {0}; + +// Due to historical reasons, tensor version use uint32_t. +// The tensor version the current codes generate. +constexpr uint32_t kCurTensorVersion = 0; + +// The tensor version that was generated by previous or current codes +// and supported by current codes. +constexpr uint32_t kSupportedTensorVersion[] = {0}; + +bool IsProgramVersionSupported(int64_t version); + +bool IsTensorVersionSupported(uint32_t version); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/version_test.cc b/paddle/fluid/framework/version_test.cc new file mode 100644 index 00000000..e8c5f256 --- /dev/null +++ b/paddle/fluid/framework/version_test.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/version.h" +#include "gtest/gtest.h" + +namespace paddle { +namespace framework { +TEST(Version, Basic) { + EXPECT_TRUE(IsProgramVersionSupported(0)); + EXPECT_FALSE(IsProgramVersionSupported(1)); + EXPECT_FALSE(IsProgramVersionSupported(-1)); + + EXPECT_TRUE(IsTensorVersionSupported(0)); + EXPECT_FALSE(IsTensorVersionSupported(1)); + EXPECT_FALSE(IsTensorVersionSupported(-1)); +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt new file mode 100644 index 00000000..73c629fd --- /dev/null +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -0,0 +1,10 @@ +cc_library(imperative_flag SRCS flags.cc DEPS gflags) + +if(WITH_PYTHON) +cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind profiler imperative_flag) +cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind profiler) +cc_library(engine SRCS engine.cc) +cc_library(imperative_profiler SRCS profiler.cc) +cc_library(nccl_context SRCS nccl_context.cc DEPS device_context) +cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context) +endif() diff --git a/paddle/fluid/imperative/README.md b/paddle/fluid/imperative/README.md new file mode 100644 index 00000000..4c4d619b --- /dev/null +++ b/paddle/fluid/imperative/README.md @@ -0,0 +1,212 @@ +# Overview + +Imperative Programming is easier to learn, debug and try new ideas. + +# Related Works + +## Pytorch +https://pytorch.org/ + +## TensorFlow Eager +https://www.tensorflow.org/guide/eager + +# Design + +## API +```python +class Layer(object): + + def __call__(inputs): + # build some parameter once. + # ... + return self.apply(inputs): + + def forward(inputs): + # forward logic with paddle operators. backward auto-generated. + + +class PyLayer(core.PyLayer): + + def __call__(cls, inputs): + # trace the logic. + + @staticmethod + def forward(inputs): + # any forward logic implemented with numpy io. + + @staticmethod + def backward(inputs): + # any backward logic implemented with numpy io. + +``` + + +## Tracer + +Current: Python Variable -> C++ VarBase -> C++ Variable -> C++ Tensor + +Longer term. +```python + +# Parent class. +class PyVarBase(object): + pass + +# Current python variable. +class Variable(PyVarBase): + pass + +class IVariable(PyVarBase): + def __init__(self): + self._ivar = core.VarBase() + + # Move var to a device. + def to(device): pass + # Get var value. + def value(): pass + # Trigger backward. + def backward(): pass + # Get var's gradient value. + def gradient_value(): pass + # operators to override. +``` + + + +```cpp +class Tracer { + public: + explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {} + + virtual ~Tracer() {} + + void Trace(OpBase* op, + const std::map>& inputs, + const std::map>& outputs, + framework::BlockDesc* block, const bool stop_gradient = false); + + std::vector PyTrace(OpBase* op, const std::vector& inputs, + bool stop_gradient = false); +}; +``` + +* Trace forward operations +* Perform quick shape/type infer, push kernel execution engine and return to user. +* Perform autograd to generate gradients. +* Clear trace. +* Apply gradients with optimizers + +## Autodiff + +Lots of research already. +https://autodiff-workshop.github.io/ +https://en.wikipedia.org/wiki/Automatic_differentiation + +Basically, trace the forward execution, and perform autodiff +when needed. + +* Can be triggered by `backward()`. +* Can select a block of code to trace and autodiff. +* Use `require_grad` to drop some forward subgraph that doesn't need autodiff. + +## Execution Engine + +Lazy execution of pushed C++ operations. + +## Device Placement + +* Operator executes on the inputs' device. +* All inputs should live on the same device. +* use `Var.to()` to explicitly move var to a device. + +## Save/Load Models + +TODO + +## I/O + +TODO + +## Refactor + +* All function layers with parameters converted to class Layers. +* Existing models converted to imperative mode. +* All op tests run once in static graph, once in imperative mode. + +# Examples + +```python +class MyLayer(fluid.imperative.Layer): + def __init__(self): + super(MyLayer, self).__init__() + + def forward(self, inputs): + x = fluid.layers.relu(inputs) + x = fluid.layers.elementwise_mul(x, x) + x = fluid.layers.reduce_sum(x) + return [x] + + +class MyPyLayer(fluid.imperative.PyLayer): + def __init__(self): + super(MyPyLayer, self).__init__() + + @staticmethod + def forward(inputs): + return np.tanh(inputs[0]) + + @staticmethod + def backward(inputs): + return np.array(dout) * (1 - np.square(np.array(out))) + + +np_inp = np.ones([2, 2], np.float32) +with fluid.imperative.guard(): + my_py_layer = MyPyLayer() + outs = my_py_layer(np_inp) + dy_out = np.sum(outs[0]._numpy()) + outs[0]._backward() + dy_grad = var_inp._gradient() + + +class MLP(fluid.imperative.Layer): + def __init__(self): + super(MLP, self).__init__() + self._fc1 = FC(3, + fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.1))) + self._fc2 = FC(4, + fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.1))) + + def forward(self, inputs): + x = self._fc1(inputs) + x = self._fc2(x) + x = fluid.layers.reduce_sum(x) + return x + + + np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) + with fluid.imperative.guard(): + var_inp = fluid.imperative.base.to_variable(np_inp) + mlp = MLP() + out = mlp(var_inp) + dy_out = out._numpy() + out._backward() +``` + +# Plan + +2.1,3 fulltime, Can run a few simple models. (Currently, 2 20% engs) + +4.1, 4 fulltime, Can run 6 models, Performance 70% Pytorch. Release alpha. + +6.1, 5 fulltime, Performance close to Pytorch, can run multi-devices. Release Beta. + +8.1, 5 fulltime, Works in general. Update existing models. Can compile to static graph, support more optimizations. + +12.1 Done. + +# Discussion + +TODO. diff --git a/paddle/fluid/imperative/backward_strategy.h b/paddle/fluid/imperative/backward_strategy.h new file mode 100644 index 00000000..9ff07d6d --- /dev/null +++ b/paddle/fluid/imperative/backward_strategy.h @@ -0,0 +1,38 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +// Created by Jiabin on 2019-04-25. +// +#pragma once +#ifndef PADDLE_BACKWARDSTRATEGY_H +#define PADDLE_BACKWARDSTRATEGY_H + +#endif // PADDLE_BACKWARDSTRATEGY_H + +namespace paddle { +namespace imperative { +namespace detail { + +class BackwardStrategy { + public: + /* DyGraph now support two kinds of backward strategy, one is sorted sum + * gradient, another is sum gradient once they are created */ + // TODO(jiabin): add more Strategy when we support + bool sorted_sum_gradient_{false}; +}; + +} // namespace detail +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/engine.cc b/paddle/fluid/imperative/engine.cc new file mode 100644 index 00000000..de7ab0e5 --- /dev/null +++ b/paddle/fluid/imperative/engine.cc @@ -0,0 +1,53 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/imperative/engine.h" + +#include // NOLINT +#include + +#include "glog/logging.h" + +namespace paddle { +namespace imperative { + +static std::once_flag init_engine; +static Engine* engine; + +class DummyEngine : public Engine { + public: + void Enqueue(Runnable* runnable) override { + queued_runnables_.push_back(runnable); + } + + size_t Size() const override { return queued_runnables_.size(); } + + void Sync() override { + for (Runnable* l : queued_runnables_) { + LOG(INFO) << "running " << reinterpret_cast(l); + } + queued_runnables_.clear(); + } + + private: + std::vector queued_runnables_; +}; + +Engine* GetEngine() { + std::call_once(init_engine, []() { engine = new DummyEngine(); }); + return engine; +} + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/engine.h b/paddle/fluid/imperative/engine.h new file mode 100644 index 00000000..a1dfa5bd --- /dev/null +++ b/paddle/fluid/imperative/engine.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +namespace paddle { +namespace imperative { + +struct Runnable {}; + +class Engine { + public: + virtual ~Engine() {} + + virtual void Enqueue(Runnable* runnable) = 0; + + virtual size_t Size() const = 0; + + virtual void Sync() = 0; +}; + +Engine* GetEngine(); + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/flags.cc b/paddle/fluid/imperative/flags.cc new file mode 100644 index 00000000..57656d64 --- /dev/null +++ b/paddle/fluid/imperative/flags.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/imperative/flags.h" +#include "gflags/gflags.h" + +DEFINE_uint64(dygraph_debug, 0, + "Debug level of dygraph. This flag is not " + "open to users"); + +namespace paddle { +namespace imperative { + +bool IsDebugEnabled() { return FLAGS_dygraph_debug != 0; } + +uint64_t GetDebugLevel() { return FLAGS_dygraph_debug; } + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/flags.h b/paddle/fluid/imperative/flags.h new file mode 100644 index 00000000..094bce83 --- /dev/null +++ b/paddle/fluid/imperative/flags.h @@ -0,0 +1,26 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace paddle { +namespace imperative { + +extern bool IsDebugEnabled(); +extern uint64_t GetDebugLevel(); + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc new file mode 100644 index 00000000..fb22d334 --- /dev/null +++ b/paddle/fluid/imperative/layer.cc @@ -0,0 +1,469 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/imperative/layer.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/string/printf.h" + +namespace paddle { +namespace imperative { + +void ThreadSafeNameSet::Insert(const std::string& name) { + std::lock_guard guard(mtx_); + set_.insert(name); +} + +void ThreadSafeNameSet::Remove(const std::string& name) { + std::lock_guard guard(mtx_); + auto iter = set_.find(name); + PADDLE_ENFORCE(iter != set_.end(), "%s does not exist", name); + set_.erase(iter); +} + +std::vector ThreadSafeNameSet::Names() const { + std::lock_guard guard(mtx_); + return std::vector(set_.begin(), set_.end()); +} + +ThreadSafeNameSet VarBase::name_set_; + +std::vector VarBase::AliveVarNames() { return name_set_.Names(); } + +using framework::Variable; + +namespace detail { + +template +class TensorAddToFunctor : public boost::static_visitor<> { + public: + TensorAddToFunctor(int64_t numel, const T* x, T* y) + : numel_(numel), x_(x), y_(y) {} + + void operator()(const platform::CPUPlace& place) { + platform::CPUDeviceContext* ctx = dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)); + auto blas = operators::math::GetBlas(*ctx); + blas.AXPY(numel_, 1., x_, y_); + } + +#ifdef PADDLE_WITH_CUDA + void operator()(const platform::CUDAPlace& place) { + platform::CUDADeviceContext* ctx = + dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)); + auto blas = operators::math::GetBlas(*ctx); + blas.AXPY(numel_, 1., x_, y_); + } +#else + void operator()(const platform::CUDAPlace& place) { + PADDLE_THROW("Do NOT support gradient merge in place %s", place); + } +#endif + + // there is NO blas in CUDAPinnedPlace + void operator()(const platform::CUDAPinnedPlace& place) { + PADDLE_THROW("Do NOT support gradient merge in place %s", place); + } + + private: + int64_t numel_; + const T* x_; + T* y_; +}; + +} // namespace detail + +void AddTo(std::shared_ptr src, std::shared_ptr dst, + platform::Place place, GradientRef* grad_ref) { + PADDLE_ENFORCE(grad_ref->find(dst.get()) != grad_ref->end(), + "gradient %s are not found in grad_ref", dst->Name()); + if ((*grad_ref)[dst.get()].second) { + PADDLE_ENFORCE(src->IsInitialize(), "Using uninitialized VarBase"); + dst->var_ = std::move(src->var_); + (*grad_ref)[dst.get()].second = false; + if (!dst->IsInitialize()) { + dst->SetInitialize(true); + } + return; + } else { + framework::Tensor* dst_tensor = + dst->var_->GetMutable(); + framework::Tensor* src_tensor = + src->var_->GetMutable(); + + // FIXME(minqiyang): loss_grad op will pass a zero grad of label + // ugly fix for it + if (src_tensor->numel() == 0) { + return; + } + + PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), + "dst_numel %lld vs. src_numel %lld", dst_tensor->numel(), + src_tensor->numel()); + + detail::TensorAddToFunctor func( + src_tensor->numel(), src_tensor->data(), + dst_tensor->mutable_data(place)); + boost::apply_visitor(func, place); + } +} + +void ZeroGrads(const std::shared_ptr vb, + const platform::Place& place) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(place); + auto grad_t = vb->var_->GetMutable(); + operators::math::set_constant(*dev_ctx, grad_t, 0.0); +} + +void AddGradBySort(BackwardSumMap* bck_map, + std::shared_ptr target, + GradientRef* grad_ref) { + PADDLE_ENFORCE(bck_map->find(target.get()) != bck_map->end(), + "Can't find %s in backward grad map", target->Name()); + std::pair>>>& + current = bck_map->at(target.get()); + std::sort(current.second.begin(), current.second.end(), + [](const std::pair>& a, + const std::pair>& b) { + return a.first > b.first; + }); + for (auto& var_pair : current.second) { + VLOG(10) << "add origin_grad: " << target->Name(); + VLOG(10) << "added grad: " << var_pair.second->Name() + << " trace id is: " << var_pair.first; + AddTo(var_pair.second, target, current.first, grad_ref); + var_pair.second.reset(); + } +} + +class Autograd { + public: + Autograd() {} + + void RunBackward(VarBase* var, const detail::BackwardStrategy& bck_stratedy) { + if (var->IsStopGradient()) { + return; + } + VLOG(2) << "start autograd"; + BackwardSumMap bck_map; + std::deque ready; + ready.push_back(var->PreOp()); + + std::map dep_counts = + ComputeDepCounts(var->PreOp(), bck_stratedy, &grad_ref); + + while (!ready.empty()) { + OpBase* ready_op = ready.front(); + ready.pop_front(); + std::vector grads_outputs = + ready_op->ApplyGrad(&bck_map, &grad_ref, bck_stratedy); + + for (const auto& map : grads_outputs) { + for (auto it = map.rbegin(); it != map.rend(); ++it) { + const std::vector>& grad_outs = it->second; + for (size_t i = 0; i < grad_outs.size(); ++i) { + if (!grad_outs[i] || grad_outs[i]->IsStopGradient()) continue; + OpBase* pre_op = grad_outs[i]->PreOp(); + if (!pre_op) continue; + dep_counts[pre_op] -= 1; + PADDLE_ENFORCE(dep_counts[pre_op] >= 0); + bool pre_op_ready = dep_counts[pre_op] == 0; + if (pre_op_ready) { + ready.push_back(pre_op); + } + } + } + } + + ready_op->InvokeBackwardHooks(); + } + } + + private: + std::map ComputeDepCounts( + OpBase* op, const detail::BackwardStrategy& bck_stratedy, + GradientRef* grad_ref) { + if (bck_stratedy.sorted_sum_gradient_) { + PADDLE_ENFORCE_NOT_NULL(grad_ref, + "grad_ref should not be null when " + "using sorted grad backward strategy"); + } + std::map ret; + + std::deque queue; + queue.push_back(op); + std::unordered_set visited; + visited.insert(op); + while (!queue.empty()) { + OpBase* candidate = queue.front(); + queue.pop_front(); + for (const auto& map : candidate->grad_output_vars_) { + for (const auto& it : map) { + for (const auto& vb : it.second) { + if (bck_stratedy.sorted_sum_gradient_) { + ++(*grad_ref)[vb.get()].first; + } + // init the state of the grad_ + (*grad_ref)[vb.get()].second = true; + } + } + } + for (auto it : candidate->pre_ops_) { + for (OpBase* pre_op : it.second) { + if (!pre_op) continue; + VLOG(2) << "op dep " << candidate->Type() << " trace id " + << candidate->trace_id_ << " <---- " << it.first << " <---- " + << pre_op->Type() << " trace id " << pre_op->trace_id_; + if (visited.find(pre_op) == visited.end()) { + visited.insert(pre_op); + queue.push_back(pre_op); + } + ret[pre_op] += 1; + } + } + } + return ret; + } + + GradientRef grad_ref; +}; + +std::unique_ptr VarBase::NewVarBase(const platform::Place& dst_place, + const bool blocking) const { + PADDLE_ENFORCE(var_->IsInitialized(), + "Variable must be initialized when getting numpy tensor"); + + // TODO(minqiyang): change this after move unique_name generator to CXX + const framework::LoDTensor& self_tensor = var_->Get(); + std::unique_ptr new_var(new VarBase( + "Itmp", self_tensor.type(), self_tensor.dims(), dst_place, true, false)); + framework::LoDTensor* tensor = + new_var->var_->GetMutable(); + tensor->set_lod(var_->Get().lod()); + + const auto& src_tensor = var_->Get(); + framework::TensorCopy(src_tensor, dst_place, tensor); + if (blocking) { + platform::DeviceContextPool::Instance().Get(dst_place)->Wait(); + auto src_place = src_tensor.place(); + if (!(src_place == dst_place)) { + platform::DeviceContextPool::Instance().Get(src_place)->Wait(); + } + } + + if (platform::is_gpu_place(dst_place)) { + VLOG(3) << "copy tensor " << Name() << " from gpu"; + } + + return new_var; +} + +framework::LoDTensor& VarBase::GradValue() { + VLOG(3) << "get var grad " << Name(); + PADDLE_ENFORCE_NOT_NULL(grads_, + "Could not get grad value from no grad variable"); + return *(grads_->var_->GetMutable()); +} + +std::vector OpBase::ApplyGrad( + BackwardSumMap* bck_map, GradientRef* grad_ref, + const detail::BackwardStrategy& bck_stratedy) { + PADDLE_ENFORCE(!grad_op_descs_.empty(), "%s has no backward implementation", + Type()); + VLOG(3) << "apply op grad: " << Type(); + std::vector tmp_grad_outputs; + const size_t grad_op_count = grad_op_descs_.size(); + + tmp_grad_outputs.resize(grad_op_count); + for (size_t k = 0; k < grad_op_count; ++k) { + framework::OpDesc* grad_op_desc = grad_op_descs_[k]; + platform::RecordEvent record_event(grad_op_desc->Type()); + auto& grad_output_variable_map = grad_output_vars_[k]; + VLOG(3) << "apply grad op " << grad_op_desc->Type(); + + // Allocate tmp grad output variable + for (const auto& it : grad_output_variable_map) { + auto& outputs = tmp_grad_outputs[k][it.first]; + outputs.reserve(it.second.size()); + for (const std::shared_ptr& origin_grad_var_base : + it.second) { + // Allocate a new variable + std::shared_ptr tmp_grad_var_base(new VarBase( + string::Sprintf("%s@IGrad", origin_grad_var_base->Name()), + origin_grad_var_base->DataType(), origin_grad_var_base->Dims(), + place_, true, false)); + outputs.emplace_back(std::move(tmp_grad_var_base)); + } + } + + // No need to do compile time infer shape here. + // grad_op_desc_->InferShape(*block_); + // grad_op_desc->InferVarType(block_); + + std::unique_ptr opbase = + framework::OpRegistry::CreateOp(*grad_op_desc); + + auto& info = framework::OpInfoMap::Instance().Get(grad_op_desc->Type()); + if (info.infer_var_type_) { + RuntimeInferVarTypeContext infer_var_type_ctx( + &grad_input_vars_[k], &tmp_grad_outputs[k], &(opbase->Attrs())); + info.infer_var_type_(&infer_var_type_ctx); + } + + framework::OperatorWithKernel* op_kernel = + dynamic_cast(opbase.get()); + PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); + + // Run grad op + framework::VariableValueMap grad_invars_map; + framework::VariableValueMap grad_outvars_map; + + for (const auto& it : grad_input_vars_[k]) { + auto& grad_invars = grad_invars_map[it.first]; + grad_invars.reserve(it.second.size()); + for (const std::shared_ptr& grad_inp : it.second) { + PADDLE_ENFORCE_NOT_NULL(grad_inp->var_, "op %s input %s nullptr", + grad_op_desc->Type(), grad_inp->Name()); + if (!grad_inp->IsInitialize()) { + grad_inp->InitBuffer(); + ZeroGrads(grad_inp, place_); + } + const std::shared_ptr& const_grad_inp = grad_inp; + grad_invars.emplace_back(const_grad_inp->var_.get()); + } + } + + for (const auto& it : tmp_grad_outputs[k]) { + auto& grad_outvars = grad_outvars_map[it.first]; + grad_outvars.reserve(it.second.size()); + for (const std::shared_ptr& grad_out : it.second) { + PADDLE_ENFORCE_NOT_NULL(grad_out->var_, "op %s output %s nullptr", + grad_op_desc->Type(), grad_out->Name()); + + grad_outvars.emplace_back(grad_out->var_.get()); + } + } + + framework::RuntimeContext ctx(grad_invars_map, grad_outvars_map); + framework::Scope scope; + PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place_); + p.op.RuntimeInferShape(scope, place_, ctx); + p.func( + framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx, nullptr)); + } + + platform::RecordEvent record_event("merge_grads"); + // Add tmp grad outputs to original grad vars + for (size_t k = 0; k < grad_output_vars_.size(); ++k) { + for (const auto& it : grad_output_vars_[k]) { + auto& outputs = tmp_grad_outputs[k][it.first]; + const auto& origin_outputs = it.second; + PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size()); + + for (size_t i = 0; i < outputs.size(); ++i) { + // track outputs used by sum + if (bck_stratedy.sorted_sum_gradient_) { + if (bck_map->find(origin_outputs[i].get()) != bck_map->end()) { + VLOG(10) << "add sub grad to " << origin_outputs[i]->Name(); + bck_map->at(origin_outputs[i].get()) + .second.emplace_back( + std::pair>( + this->trace_id_, std::move(outputs[i]))); + } else { + VLOG(10) << "insert new map for " << origin_outputs[i]->Name(); + std::pair>>> + tmp(place_, + {std::make_pair(this->trace_id_, std::move(outputs[i]))}); + bck_map->insert(std::make_pair(origin_outputs[i].get(), tmp)); + } + + PADDLE_ENFORCE( + grad_ref->find(origin_outputs[i].get()) != grad_ref->end(), + "Can't find %s in grad_reference count map", + origin_outputs[i]->Name()); + PADDLE_ENFORCE(grad_ref->at(origin_outputs[i].get()).first >= 1, + "Backward error when calculate grad reference"); + if (grad_ref->at(origin_outputs[i].get()).first > 1) { + VLOG(10) << "remove ref for " << origin_outputs[i]->Name(); + grad_ref->at(origin_outputs[i].get()).first--; + } else { + VLOG(10) << "Add grad for: " << origin_outputs[i]->Name(); + AddGradBySort(bck_map, origin_outputs[i], grad_ref); + grad_ref->at(origin_outputs[i].get()).first--; + } + } else { + VLOG(10) << "AddTo Called with orig_grad is: " + << origin_outputs[i]->name_ << " Grad to be added is " + << outputs[i]->name_; + AddTo(outputs[i], origin_outputs[i], place_, grad_ref); + outputs[i].reset(); + } + } + } + } + + return grad_output_vars_; +} + +void OpBase::InvokeBackwardHooks() { + VLOG(3) << "call backward hooks, hooks num: " << backward_hooks_.size(); + + // call backward hooks + for (py::object& callable : backward_hooks_) { + callable(this); + } +} + +void OpBase::RegisterBackwardHooks(const py::object& callable) { + VLOG(3) << "Register backward hooks " << trace_id_; + + // TODO(minqiyang): check the callable format + backward_hooks_.push_back(callable); +} + +void VarBase::RunBackward(const detail::BackwardStrategy& bck_stratedy) { + if (!pre_op_) return; + platform::RecordEvent record_event("Imperative Backward"); + VLOG(3) << "start backward"; + grads_->InitBuffer(); + auto grads_t = grads_->var_->GetMutable(); + operators::math::set_constant( + *(platform::DeviceContextPool::Instance().Get( + var_->GetMutable()->place())), + grads_t, 1.0); + + Autograd().RunBackward(this, bck_stratedy); +} + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h new file mode 100644 index 00000000..2fbedd82 --- /dev/null +++ b/paddle/fluid/imperative/layer.h @@ -0,0 +1,550 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include +#include // NOLINT +#include // NOLINT +#include +#include // NOLINT + +// clang-format off +#include "paddle/fluid/framework/python_headers.h" +// clang-format on + +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/framework/var_type_inference.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/imperative/backward_strategy.h" +#include "paddle/fluid/imperative/type_defs.h" +#include "paddle/fluid/imperative/flags.h" + +namespace paddle { +namespace imperative { + +class VarBase; + +namespace py = ::pybind11; + +class PreparedOp { + public: + PreparedOp(const framework::OperatorBase& op, + const framework::RuntimeContext& ctx, + framework::OperatorWithKernel::OpKernelFunc func, + platform::DeviceContext* dev_ctx, + std::vector* kernel_configs) + : op(op), + ctx(ctx), + func(func), + dev_ctx(dev_ctx), + kernel_configs(kernel_configs) {} + + static PreparedOp Prepare(const framework::RuntimeContext& ctx, + const framework::OperatorWithKernel& op, + const platform::Place& place) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(place); + + // check if op[type] has kernel registered. + auto& all_op_kernels = op.AllOpKernels(); + auto kernels_iter = all_op_kernels.find(op.Type()); + if (kernels_iter == all_op_kernels.end()) { + PADDLE_THROW( + "There are no kernels which are registered in the %s operator.", + op.Type()); + } + + framework::OperatorWithKernel::OpKernelMap& kernels = kernels_iter->second; + + auto expected_kernel_key = + op.GetExpectedKernelType(framework::ExecutionContext( + op, framework::Scope(), *dev_ctx, ctx, nullptr)); + VLOG(3) << "expected_kernel_key:" << expected_kernel_key; + + auto kernel_iter = kernels.find(expected_kernel_key); +#ifdef PADDLE_WITH_MKLDNN + // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set + if (kernel_iter == kernels.end() && + expected_kernel_key.library_type_ == framework::LibraryType::kMKLDNN) { + VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one"; + expected_kernel_key.library_type_ = framework::LibraryType::kPlain; + expected_kernel_key.data_layout_ = framework::DataLayout::kAnyLayout; + kernel_iter = kernels.find(expected_kernel_key); + } +#endif + if (kernel_iter == kernels.end()) { + PADDLE_THROW("op %s does not have kernel for %s", op.Type(), + KernelTypeToString(expected_kernel_key)); + } + std::vector* kernel_configs = + op.GetKernelConfig(expected_kernel_key); + return PreparedOp(op, ctx, kernel_iter->second, dev_ctx, kernel_configs); + } + + inline platform::DeviceContext* GetDeviceContext() const { return dev_ctx; } + + const framework::OperatorBase& op; + const framework::RuntimeContext& ctx; + framework::OperatorWithKernel::OpKernelFunc func; + platform::DeviceContext* dev_ctx; + std::vector* kernel_configs; +}; + +class OpBase; + +class ThreadSafeNameSet { + public: + void Insert(const std::string& name); + + void Remove(const std::string& name); + + std::vector Names() const; + + private: + std::multiset set_; + mutable std::mutex mtx_; +}; + +/* The wrapper for Variable which holds a Variable and a VarBase of its + * gradient. This object should be managed totally by Python intepreter. + * + * Nearly all interface should be implemented in C++. + */ +class VarBase { + public: + static std::vector AliveVarNames(); + + // Internal interface, create VarBase from exist variable + VarBase(const std::string& name, std::unique_ptr var, + VarBase* grad, bool stop_gradient) + : VarBase(name, var->Get().type(), + var->Get().dims(), + var->Get().place(), nullptr, grad, + stop_gradient, false, true) { + var_ = std::move(var); + } + + // Python interface + VarBase(const std::string& name, const framework::proto::VarType::Type dtype, + const std::vector& shape, const platform::Place& place, + bool stop_gradient, bool persistable) + : VarBase(name, dtype, framework::make_ddim(shape), place, stop_gradient, + persistable) {} + + // Internal interface, create VarBase from with ddim + VarBase(const std::string& name, const framework::proto::VarType::Type dtype, + const framework::DDim& shape, const platform::Place& place, + bool stop_gradient, bool persistable) + : VarBase(name, dtype, shape, place, nullptr, nullptr, stop_gradient, + persistable, true) {} + + // Grad used constructor + VarBase(const std::string& name, const framework::proto::VarType::Type dtype, + const std::vector& shape, const platform::Place& place, + bool stop_gradient, bool persistable, bool need_initialize) + : VarBase(name, dtype, framework::make_ddim(shape), place, nullptr, + nullptr, stop_gradient, persistable, need_initialize) {} + + private: + // TODO(minqiyang): need support SelectedRows + VarBase(const std::string& name, framework::proto::VarType::Type dtype, + const framework::DDim& shape, const platform::Place& place, + std::unique_ptr var, VarBase* grad, + bool stop_gradient, bool persistable, bool need_initialize) + : name_(name), + type_(framework::proto::VarType::LOD_TENSOR), + place_(place), + var_(std::move(var)), + grads_(grad), + dtype_(dtype), + stop_gradient_(stop_gradient), + persistable_(persistable), + pre_op_(nullptr), + pre_op_out_name_(), + pre_op_out_idx_(-1) { + if (!var_) { + var_.reset(new framework::Variable()); + } + + auto tensor = var_->GetMutable(); + tensor->Resize(shape); + if (need_initialize) { + tensor->mutable_data(place, dtype); + is_initialized_ = true; + VLOG(8) << "initialized varbase: " << name_ << " type: " << dtype + << " place: " << place; + } else { + is_initialized_ = false; + VLOG(8) << "not initialized varbase: " << name_; + } + VLOG(8) << "create varbase: " << name_ << " type: " << dtype + << " place: " << place << "Stop gradient: " << stop_gradient_; + + if (IsDebugEnabled()) { + name_set_.Insert(name_); + } + } + + public: + virtual ~VarBase() { + pre_op_ = nullptr; + pre_op_out_idx_ = -1; + VLOG(8) << "destruct varbase: " << name_; + if (IsDebugEnabled()) { + name_set_.Remove(name_); + } + } + + inline void SetName(const std::string& name) { name_ = name; } + inline std::string Name() const { return name_; } + inline bool IsInitialize() const { return is_initialized_; } + inline void SetInitialize(bool inited) { is_initialized_ = inited; } + inline std::vector Shape() const { + if (var_->IsInitialized()) { + return framework::vectorize(var_->Get().dims()); + } else { + return {}; + } + } + + inline framework::DDim Dims() const { + return var_->Get().dims(); + } + + // data type. e.g.. FP32 + inline void SetDataType(framework::proto::VarType::Type type) { + auto tensor = var_->GetMutable(); + tensor->mutable_data(tensor->place(), type); + } + inline framework::proto::VarType::Type DataType() const { return dtype_; } + + // tensor type. e.g.. LoDTensor + inline void SetType(framework::proto::VarType::Type type) { type_ = type; } + inline framework::proto::VarType::Type Type() const { return type_; } + + inline void SetStopGradient(bool stop_gradient) { + stop_gradient_ = stop_gradient; + if (grads_) { + grads_->stop_gradient_ = stop_gradient; + } + } + inline bool IsStopGradient() const { return stop_gradient_; } + + inline void SetPersistable(bool persistable) { persistable_ = persistable; } + inline bool IsPersistable() const { return persistable_; } + inline void SetPreOp(OpBase* op) { pre_op_ = op; } + inline platform::Place GetPlace() { return place_; } + inline OpBase* PreOp() const { return pre_op_; } + inline int PreOpOutIdx() const { return pre_op_out_idx_; } + + void RunBackward(const detail::BackwardStrategy& bck_stratedy); + + inline void ResetPreOp(OpBase* op) { + if (op == pre_op_) { + // clear pre_op info when op equals to var's pre_op + pre_op_ = nullptr; + pre_op_out_idx_ = -1; + } + } + + void InitBuffer() { + if (!is_initialized_) { + var_->GetMutable()->mutable_data(place_, dtype_); + is_initialized_ = true; + VLOG(8) << "initialized varbase: " << name_ << " type: " << dtype_ + << " place: " << place_; + } else { + VLOG(8) << "var: " << name_ << " has already been initialized "; + } + } + + void TrackPreOp(OpBase* pre_op, const std::string& pre_op_out_name, + int pre_op_out_idx, bool pre_op_stop_gradient) { + pre_op_ = pre_op; + pre_op_out_name_ = pre_op_out_name; + pre_op_out_idx_ = pre_op_out_idx; + if (pre_op_stop_gradient) { + stop_gradient_ = pre_op_stop_gradient; + } + } + + void ClearGradient() { + VLOG(1) << "clear gradient of " << Name(); + if (grads_ && grads_->var_ && grads_->var_->IsInitialized()) { + auto grads_t = grads_->var_->GetMutable(); + operators::math::set_constant( + *(platform::DeviceContextPool::Instance().Get( + grads_->var_->Get().place())), + grads_t, 0.0); + } + } + + framework::LoDTensor& GradValue(); + + std::unique_ptr NewVarBase(const platform::Place& dst_place, + const bool blocking) const; + + inline std::string GradName() const { + return string::Sprintf("%s@IGrad", Name()); + } + + std::string name_; + framework::proto::VarType::Type type_; + platform::Place place_; + + std::unique_ptr var_; + std::shared_ptr grads_; + + private: + framework::proto::VarType::Type dtype_; + bool stop_gradient_; + bool persistable_; + bool is_initialized_; + OpBase* pre_op_; + std::string pre_op_out_name_; + int pre_op_out_idx_; + + // A private flag to check memory leak + static ThreadSafeNameSet name_set_; +}; + +/* The wrapper for OpDesc which holds a OpDesc and a OpDesc of its + * gradient. This object should be managed totally by Python intepreter. + */ +class PYBIND11_HIDDEN OpBase { + public: + OpBase(const std::string& type) + : type_(type), + trace_id_(-1), + place_(platform::CPUPlace()), + backward_hooks_() {} + + virtual ~OpBase() { + for (const auto& it : outputs_ref) { + auto vb = it.lock(); + if (vb) { + VLOG(3) << "Op reset by" << vb->name_; + vb->ResetPreOp(this); + } + } + // TODO(minqiyang): remove op_desc from block_desc in tracer + // release resource + for (framework::OpDesc* desc : grad_op_descs_) { + delete desc; + } + } + + std::vector ApplyGrad( + BackwardSumMap* bck_map, GradientRef* grad_ref, + const detail::BackwardStrategy& bck_stratedy); + + inline std::string Type() const { return type_; } + inline std::string GradOpType(size_t index) const { + PADDLE_ENFORCE_NOT_NULL(grad_op_descs_[index]); + return grad_op_descs_[index]->Type(); + } + + void RegisterBackwardHooks(const py::object& callable); + + void InvokeBackwardHooks(); + + void TrackPreOp( + const std::string& inp_name, + const std::vector>& inputs) { + auto& pre_ops_list = pre_ops_[inp_name]; + pre_ops_list.reserve(inputs.size()); + auto& pre_ops_out_idx_list = pre_ops_out_idx_[inp_name]; + for (std::shared_ptr inp_var : inputs) { + if (inp_var->PreOp() && !inp_var->IsStopGradient()) { + VLOG(3) << "add pre op " << inp_var->PreOp()->Type() << " in slot " + << inp_name; + pre_ops_list.emplace_back(inp_var->PreOp()); + pre_ops_out_idx_list.push_back(inp_var->PreOpOutIdx()); + } else { + VLOG(3) << "no pre op in slot " << inp_name + << " input var stop_gradient: " << inp_var->IsStopGradient(); + pre_ops_list.emplace_back(nullptr); + // pre_ops_out_idx_list.push_back(-1); + } + } + } + + std::string type_; + int trace_id_; + + // Note: each fwd op corresponds to a vector of bwd ops. + std::vector grad_op_descs_; + + platform::Place place_; + + OpBasePtrMap pre_ops_; + std::map> pre_ops_out_idx_; + + VarBaseWeakPtrList outputs_ref; + // Inputs to a vector of bwd ops. + std::vector grad_input_vars_; + // Outputs to a vector of bwd ops. + std::vector grad_output_vars_; + + std::vector backward_hooks_; + + framework::AttributeMap attrs_; +}; + +class Layer { + public: + virtual ~Layer() {} + + virtual std::vector> Forward( + const std::vector>& inputs) { + std::vector> vars; + return vars; + } +}; + +// infer var type context for imperative mode +class PYBIND11_HIDDEN RuntimeInferVarTypeContext + : public framework::InferVarTypeContext { + public: + RuntimeInferVarTypeContext(const imperative::VarBasePtrMap* inputs, + imperative::VarBasePtrMap* outputs, + const framework::AttributeMap* attrs_map) + : InferVarTypeContext(nullptr, nullptr), + inputs_(inputs), + outputs_(outputs), + attrs_(attrs_map), + input_names_(), + output_names_(), + var_set_() { + input_names_.reserve(inputs_->size()); + for (auto& it : *inputs_) { + for (std::shared_ptr var : it.second) { + input_names_[it.first].emplace_back(var->Name()); + var_set_[var->Name()] = var; + } + } + + output_names_.reserve(outputs_->size()); + for (auto& it : *outputs_) { + for (std::shared_ptr var : it.second) { + output_names_[it.first].emplace_back(var->Name()); + var_set_[var->Name()] = var; + } + } + } + + virtual ~RuntimeInferVarTypeContext() {} + + framework::Attribute GetAttr(const std::string& name) const override { + PADDLE_ENFORCE_NOT_NULL(attrs_); + return attrs_->at(name); + } + + bool HasVar(const std::string& name) const override { + return var_set_.count(name) > 0; + } + + bool HasInput(const std::string& name) const override { + PADDLE_ENFORCE_NOT_NULL(inputs_); + return inputs_->count(name) > 0; + } + + bool HasOutput(const std::string& name) const override { + PADDLE_ENFORCE_NOT_NULL(outputs_); + return outputs_->count(name) > 0; + } + + const std::vector& Input( + const std::string& name) const override { + return input_names_.at(name); + } + + const std::vector& Output( + const std::string& name) const override { + return output_names_.at(name); + } + + framework::proto::VarType::Type GetType( + const std::string& name) const override { + return var_set_.at(name)->Type(); + } + + void SetType(const std::string& name, + framework::proto::VarType::Type type) override { + if (name == "kLookupTablePath") { + VLOG(2) << "SUPER UGLY FIX, remove this when move imperative mode in C++"; + } else { + var_set_[name]->SetType(type); + } + } + + framework::proto::VarType::Type GetDataType( + const std::string& name) const override { + return var_set_.at(name)->DataType(); + } + + void SetDataType(const std::string& name, + framework::proto::VarType::Type type) override { + var_set_[name]->SetDataType(type); + } + + std::vector GetDataTypes( + const std::string& name) const override { + PADDLE_THROW("GetDataTypes is not supported in runtime InferVarType"); + } + + void SetDataTypes(const std::string& name, + const std::vector& + multiple_data_type) override { + PADDLE_THROW("SetDataTypes is not supported in runtime InferVarType"); + } + + std::vector GetShape(const std::string& name) const override { + PADDLE_THROW("Do not handle Shape in runtime InferVarType"); + } + + void SetShape(const std::string& name, + const std::vector& dims) override { + PADDLE_THROW("Do not handle Shape in runtime InferVarType"); + } + + int32_t GetLoDLevel(const std::string& name) const override { + PADDLE_THROW("Do not handle LoDLevel in runtime InferVarType"); + } + + void SetLoDLevel(const std::string& name, int32_t lod_level) override { + PADDLE_THROW("Do not handle LoDLevel in runtime InferVarType"); + } + + private: + const imperative::VarBasePtrMap* inputs_; + imperative::VarBasePtrMap* outputs_; + const framework::AttributeMap* attrs_; + std::unordered_map> input_names_; + std::unordered_map> output_names_; + std::unordered_map> + var_set_; +}; + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc new file mode 100644 index 00000000..d9630bd6 --- /dev/null +++ b/paddle/fluid/imperative/nccl_context.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/imperative/nccl_context.h" + +namespace paddle { +namespace imperative { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +void NCCLParallelContext::RecvNCCLID(const std::string &ep, + ncclUniqueId *nccl_id) { + auto addr = paddle::string::Split(ep, ':'); + PADDLE_ENFORCE_EQ(addr.size(), 2UL, + "The endpoint should contain host and port: %s", ep); + std::string host = addr[0]; + int port = std::stoi(addr[1]); + + int server_fd, new_socket; + struct sockaddr_in address; + int addrlen = sizeof(address); + char buffer[1024] = {0}; + int opt = 0; + // creating socket fd + if ((server_fd = socket(AF_INET, SOCK_STREAM, 0)) == 0) + PADDLE_THROW("create server fd failed"); + if (setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt))) + PADDLE_THROW("set socket opt failed"); + + address.sin_family = AF_INET; + address.sin_addr.s_addr = INADDR_ANY; + address.sin_port = htons(port); + + if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)) < 0) + PADDLE_THROW("binding failed on ep: %s", ep); + VLOG(3) << "listening on: " << ep; + if (listen(server_fd, 3) < 0) PADDLE_THROW("listen on server fd failed"); + + if ((new_socket = + accept(server_fd, reinterpret_cast(&address), + reinterpret_cast(&addrlen))) < 0) + PADDLE_THROW("accept the new socket fd failed"); + + if (read(new_socket, buffer, 1024) < 0) + PADDLE_THROW("reading the ncclUniqueId from socket failed"); + VLOG(3) << "recevived the ncclUniqueId"; + memcpy(nccl_id, buffer, NCCL_UNIQUE_ID_BYTES); + + VLOG(3) << "closing the socket server: " << ep; + close(server_fd); +} + +void NCCLParallelContext::SendNCCLID(const std::string &ep, + ncclUniqueId *nccl_id) { + auto addr = paddle::string::Split(ep, ':'); + PADDLE_ENFORCE_EQ(addr.size(), 2UL, + "The endpoint should contain host and port: %s", ep); + std::string host = addr[0]; + int port = std::stoi(addr[1]); + // struct sockaddr_in address; + int sock = 0; + struct sockaddr_in serv_addr; + char buffer[1024] = {0}; + + memcpy(buffer, nccl_id, NCCL_UNIQUE_ID_BYTES); + if ((sock = socket(AF_INET, SOCK_STREAM, 0)) < 0) + PADDLE_THROW("create socket failed"); + + memset(&serv_addr, '0', sizeof(serv_addr)); + serv_addr.sin_family = AF_INET; + serv_addr.sin_port = htons(port); + + if (inet_pton(AF_INET, host.c_str(), &serv_addr.sin_addr) <= 0) + PADDLE_THROW("invalied address: %s", ep); + + while (true) { + if (connect(sock, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) { + VLOG(0) << "worker: " << ep + << " is not ready, will retry after 3 seconds..."; + std::this_thread::sleep_for(std::chrono::seconds(3)); + continue; + } + VLOG(3) << "sending the ncclUniqueId to " << ep; + send(sock, buffer, NCCL_UNIQUE_ID_BYTES, 0); + break; + } + close(sock); +} + +void NCCLParallelContext::BcastNCCLId(ncclUniqueId *nccl_id, int root) { + if (strategy_.local_rank_ == root) { + for (auto ep : strategy_.trainer_endpoints_) { + if (ep != strategy_.current_endpoint_) SendNCCLID(ep, nccl_id); + } + } else { + RecvNCCLID(strategy_.current_endpoint_, nccl_id); + } +} + +void NCCLParallelContext::Init() { + ncclUniqueId nccl_id; + ncclComm_t comm; + if (strategy_.local_rank_ == 0) { + // generate the unique ncclid on the root worker + platform::dynload::ncclGetUniqueId(&nccl_id); + BcastNCCLId(&nccl_id, 0); + } else { + BcastNCCLId(&nccl_id, 0); + } + int gpu_id = boost::get(place_).device; + VLOG(0) << "init nccl context nranks: " << strategy_.nranks_ + << " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id; + + PADDLE_ENFORCE(cudaSetDevice(gpu_id)); + PADDLE_ENFORCE(platform::dynload::ncclCommInitRank( + &comm, strategy_.nranks_, nccl_id, strategy_.local_rank_)); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto *dev_ctx = static_cast(pool.Get(place_)); + dev_ctx->set_nccl_comm(comm); +} +#endif + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/nccl_context.h b/paddle/fluid/imperative/nccl_context.h new file mode 100644 index 00000000..b4f44e56 --- /dev/null +++ b/paddle/fluid/imperative/nccl_context.h @@ -0,0 +1,81 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +// network header files +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#include +#include +#include +#include +#endif + +#include +#include + +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/device_context.h" +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#include "paddle/fluid/platform/dynload/nccl.h" +#endif +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/split.h" + +namespace paddle { +namespace imperative { + +struct ParallelStrategy { + int nranks_{1}; + int local_rank_{0}; + std::vector trainer_endpoints_{}; + std::string current_endpoint_{""}; +}; + +class ParallelContext { + public: + explicit ParallelContext(const ParallelStrategy& strategy, + const platform::Place& place) + : strategy_(strategy), place_(place) {} + + virtual ~ParallelContext() {} + + virtual void Init() = 0; + + protected: + ParallelStrategy strategy_; + platform::Place place_; +}; + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +class NCCLParallelContext : ParallelContext { + public: + explicit NCCLParallelContext(const ParallelStrategy& strategy, + const platform::Place& place) + : ParallelContext(strategy, place) {} + + ~NCCLParallelContext() {} + + void BcastNCCLId(ncclUniqueId* nccl_id, int root); + + void Init() override; + + protected: + void RecvNCCLID(const std::string& endpoint, ncclUniqueId* nccl_id); + + void SendNCCLID(const std::string& endpoint, ncclUniqueId* nccl_id); +}; +#endif + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/nccl_context_test.cc b/paddle/fluid/imperative/nccl_context_test.cc new file mode 100644 index 00000000..74a74ebe --- /dev/null +++ b/paddle/fluid/imperative/nccl_context_test.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/imperative/nccl_context.h" +#include "gtest/gtest.h" +#include "paddle/fluid/platform/device_context.h" + +namespace imperative = paddle::imperative; +namespace platform = paddle::platform; + +imperative::ParallelStrategy GetStrategy(int local_rank) { + std::vector eps = {"127.0.0.1:9866", "127.0.0.1:9867"}; + imperative::ParallelStrategy strategy; + strategy.trainer_endpoints_ = eps; + strategy.current_endpoint_ = eps[local_rank]; + strategy.nranks_ = 2; + strategy.local_rank_ = local_rank; + return strategy; +} + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +void BcastNCCLId(int local_rank, ncclUniqueId *nccl_id) { + auto strategy = GetStrategy(local_rank); + platform::CUDAPlace gpu(local_rank); + imperative::NCCLParallelContext ctx(strategy, gpu); + ctx.BcastNCCLId(nccl_id, 0); +} + +TEST(BcastNCCLId, Run) { + ncclUniqueId nccl_id; + platform::dynload::ncclGetUniqueId(&nccl_id); + std::thread t(BcastNCCLId, 0, &nccl_id); + + ncclUniqueId recv_nccl_id; + BcastNCCLId(1, &recv_nccl_id); + + t.join(); + EXPECT_EQ(0, std::memcmp(nccl_id.internal, recv_nccl_id.internal, + NCCL_UNIQUE_ID_BYTES)); +} +#endif diff --git a/paddle/fluid/imperative/profiler.cc b/paddle/fluid/imperative/profiler.cc new file mode 100644 index 00000000..34570b3a --- /dev/null +++ b/paddle/fluid/imperative/profiler.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/imperative/profiler.h" + +#ifdef WITH_GPERFTOOLS +#include "gperftools/profiler.h" +#endif +#include +#include +#include // NOLINT +#include // NOLINT + +DEFINE_string( + tracer_profile_fname, "xxgperf", + "Profiler filename for imperative tracer, which generated by gperftools." + "Only valid when compiled `WITH_PROFILER=ON`. Empty if disable."); + +namespace paddle { +namespace imperative { + +static std::once_flag gTracerProfileOnce; +#ifdef WITH_GPERFTOOLS +static bool gTracerProfilerStarted = false; +#endif + +void StartProfile() { + if (!FLAGS_tracer_profile_fname.empty()) { + std::call_once(gTracerProfileOnce, [] { +#ifdef WITH_GPERFTOOLS + ProfilerStart(FLAGS_tracer_profile_fname.c_str()); + gTracerProfilerStarted = true; +#else + LOG(WARNING) << "Paddle is not compiled with gperftools. " + "FLAGS_tracer_profile_fname will be ignored"; +#endif + }); + } +} + +void StopProfile() { +#ifdef WITH_GPERFTOOLS + ProfilerFlush(); +#else + LOG(WARNING) << "Paddle is not compiled with gperftools. " + "FLAGS_tracer_profile_fname will be ignored"; +#endif +} + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/profiler.h b/paddle/fluid/imperative/profiler.h new file mode 100644 index 00000000..d52aeed4 --- /dev/null +++ b/paddle/fluid/imperative/profiler.h @@ -0,0 +1,25 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace paddle { +namespace imperative { + +extern void StartProfile(); + +extern void StopProfile(); + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc new file mode 100644 index 00000000..682bea7d --- /dev/null +++ b/paddle/fluid/imperative/tracer.cc @@ -0,0 +1,292 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/imperative/tracer.h" + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/var_type_inference.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace imperative { + +void CreateGradOp(const framework::OpDesc& op_desc, + const std::unordered_set& no_grad_set, + const std::vector& grad_sub_block, + std::vector* grad_op_descs, + std::unordered_map* grad_to_var) { + PADDLE_ENFORCE(grad_op_descs->empty()); + const framework::OpInfo& op_info = + framework::OpInfoMap::Instance().Get(op_desc.Type()); + if (!op_info.grad_op_maker_) return; + + std::vector> descs = + op_info.GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block); + for (auto& desc : descs) { + grad_op_descs->emplace_back(desc.release()); + } +} + +void CreateNoBuffuerGrad(std::shared_ptr var, + platform::DeviceContext* dev_ctx) { + PADDLE_ENFORCE_NOT_NULL(var, "Could not get valid var base"); + PADDLE_ENFORCE_NOT_NULL(dev_ctx, + "Could not get valid device from forward op"); + + if (var->grads_ == nullptr) { + auto& var_t = var->var_->Get(); + var->grads_ = std::shared_ptr( + new VarBase(var->GradName(), framework::proto::VarType::FP32, + framework::vectorize(var_t.dims()), dev_ctx->GetPlace(), + var->IsStopGradient(), false, false)); + } +} + +platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs) { + platform::Place result = place; + for (const auto& it : inputs) { + for (const std::shared_ptr& var : it.second) { + platform::Place tmp_place = + var->var_->Get().place(); + if (!platform::is_same_place(tmp_place, result)) { + PADDLE_THROW( + "Input variable should keep in the same place: %s, but get place: " + "%s of input %s instead", + result, tmp_place, it.first); + } + } + } + + return result; +} + +framework::VariableNameMap CreateInputVarNameMap( + const OpBase* op, const VarBasePtrMap& varbase_map) { + framework::VariableNameMap result; + + auto& info_map = framework::OpInfoMap::Instance(); + auto* op_info = info_map.GetNullable(op->Type()); + if (op_info == nullptr || op_info->proto_ == nullptr) { + return result; + } + + for (auto& in : op_info->Proto().inputs()) { + auto it = varbase_map.find(in.name()); + if (it == varbase_map.end()) { + PADDLE_ENFORCE(in.dispensable()); + result[in.name()] = {}; + } else { + auto var_vector = it->second; + std::vector args; + args.reserve(var_vector.size()); + for (std::shared_ptr var_base : var_vector) { + args.emplace_back(var_base->Name()); + } + result[in.name()] = args; + } + } + return result; +} + +framework::VariableNameMap CreateOutputVarNameMap( + const OpBase* op, const VarBasePtrMap& varbase_map) { + framework::VariableNameMap result; + + auto& info_map = framework::OpInfoMap::Instance(); + auto* op_info = info_map.GetNullable(op->Type()); + if (op_info == nullptr || op_info->proto_ == nullptr) { + return result; + } + + for (auto& out : op_info->Proto().outputs()) { + auto it = varbase_map.find(out.name()); + if (it == varbase_map.end()) { + PADDLE_ENFORCE(out.dispensable()); + result[out.name()] = {}; + } else { + auto var_vector = it->second; + std::vector args; + args.reserve(var_vector.size()); + for (const std::shared_ptr& var_base : var_vector) { + args.emplace_back(var_base->Name()); + } + result[out.name()] = args; + } + } + return result; +} + +Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {} + +void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, + VarBasePtrMap* outputs, framework::AttributeMap attrs_map, + const platform::Place expected_place, + const bool stop_gradient) { + platform::RecordEvent record_event(op->type_); + framework::VariableValueMap invars_map; + framework::VariableValueMap outvars_map; + + // Construct input_vars_map and output_vars_map + std::map> current_vars_map; + for (auto it : inputs) { + auto& invars = invars_map[it.first]; + invars.reserve(it.second.size()); + for (std::shared_ptr inp : it.second) { + PADDLE_ENFORCE_NOT_NULL(inp->var_, "op %s input %s nullptr", op->Type(), + inp->Name()); + + invars.emplace_back(inp->var_.get()); + if (!stop_gradient) { + current_vars_map[inp->Name()] = inp; + } + VLOG(3) << "input var name: " << inp->Name() + << " inited: " << inp->var_->IsInitialized() + << " stop_grad: " << inp->IsStopGradient(); + } + op->TrackPreOp(it.first, it.second); + } + + for (const auto& it : *outputs) { + auto& outvars = outvars_map[it.first]; + const std::vector>& outputs_tmp = + it.second; + outvars.reserve(outputs_tmp.size()); + for (size_t i = 0U; i < outputs_tmp.size(); ++i) { + // Add weak_ptr to track outputs + op->outputs_ref.emplace_back(outputs_tmp[i]); + std::shared_ptr out = outputs_tmp[i]; + outvars.emplace_back(out->var_.get()); + out->TrackPreOp(op, it.first, i, stop_gradient); + if (!stop_gradient) { + current_vars_map[out->Name()] = out; + } + + VLOG(3) << "output var name: " << out->Name() + << " inited: " << out->var_->IsInitialized() + << " stop_grad: " << out->IsStopGradient(); + } + } + + // Check attrs and create op + framework::VariableNameMap invars_name_map = + CreateInputVarNameMap(op, inputs); + framework::VariableNameMap outvars_name_map = + CreateOutputVarNameMap(op, *outputs); + + auto& info = framework::OpInfoMap::Instance().Get(op->Type()); + if (info.Checker() != nullptr) { + info.Checker()->Check(&attrs_map); + } + + std::unique_ptr op_base = + framework::OpRegistry::CreateOp(op->Type(), invars_name_map, + outvars_name_map, attrs_map); + + if (info.infer_var_type_) { + RuntimeInferVarTypeContext infer_var_type_ctx(&inputs, outputs, &attrs_map); + info.infer_var_type_(&infer_var_type_ctx); + } + + // TODO(minqiyang): Support infer var type in imperative mode + // Run forward op + VLOG(3) << "tracer running " << op->Type(); + framework::RuntimeContext ctx(invars_map, outvars_map); + + // TODO(panyx0718): Cache p. + framework::OperatorWithKernel* op_kernel = + dynamic_cast(op_base.get()); + PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); + + framework::Scope scope; + op->place_ = GetExpectedPlace(expected_place, inputs); + + PreparedOp prepared_op = PreparedOp::Prepare(ctx, *op_kernel, op->place_); + prepared_op.op.RuntimeInferShape(scope, op->place_, ctx); + prepared_op.func( + framework::ExecutionContext(prepared_op.op, scope, *prepared_op.dev_ctx, + prepared_op.ctx, prepared_op.kernel_configs)); + + if (!stop_gradient) { + VLOG(5) << "start construct backward op"; + + // construct grad op descs + op->attrs_ = attrs_map; + std::unique_ptr fwd_op_desc(new framework::OpDesc( + op->Type(), invars_name_map, outvars_name_map, attrs_map)); + std::unique_ptr> grad_to_var( + new std::unordered_map()); + // NOTE(minqiyang): We don't support control flow op in imperative now + // Add grad_block_ when we want to support it + CreateGradOp(*fwd_op_desc, {}, {}, &op->grad_op_descs_, grad_to_var.get()); + + VLOG(5) << "create grad op desc: " << op->grad_op_descs_[0]->Type(); + + const size_t grad_op_count = op->grad_op_descs_.size(); + + op->grad_input_vars_.resize(grad_op_count); + op->grad_output_vars_.resize(grad_op_count); + + for (size_t i = 0; i < grad_op_count; ++i) { + framework::OpDesc* grad_op_desc = op->grad_op_descs_[i]; + for (auto it : grad_op_desc->Inputs()) { + auto& grad_in_vars = op->grad_input_vars_[i][it.first]; + grad_in_vars.reserve(it.second.size()); + for (const std::string& grad_invar : it.second) { + auto var_it = grad_to_var->find(grad_invar); + if (var_it == grad_to_var->end()) { + auto fwd_var_it = current_vars_map.find(grad_invar); + PADDLE_ENFORCE(fwd_var_it != current_vars_map.end()); + // Forward inputs or outputs. + grad_in_vars.emplace_back(fwd_var_it->second); + } else { + std::shared_ptr var = + current_vars_map[var_it->second]; + CreateNoBuffuerGrad(var, prepared_op.GetDeviceContext()); + // Douts. + var->grads_->SetPreOp(var->PreOp()); + grad_in_vars.emplace_back(var->grads_); + } + } + } + + for (auto it : grad_op_desc->Outputs()) { + auto& grad_out_vars = op->grad_output_vars_[i][it.first]; + for (const std::string& grad_outvar : it.second) { + auto var_it = grad_to_var->find(grad_outvar); + PADDLE_ENFORCE(var_it != grad_to_var->end(), + "Could not found the grad op output var, should this " + "operator %s's stop gradient be True", + op->Type()); + + std::shared_ptr var = + current_vars_map[var_it->second]; + CreateNoBuffuerGrad(var, prepared_op.GetDeviceContext()); + var->grads_->SetPreOp(var->PreOp()); + grad_out_vars.push_back(var->grads_); + VLOG(3) << "grads output var name: " << var->name_; + } + } + } + } +} +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h new file mode 100644 index 00000000..02d90227 --- /dev/null +++ b/paddle/fluid/imperative/tracer.h @@ -0,0 +1,60 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/imperative/engine.h" +#include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace imperative { + +void CreateGradOp(const framework::OpDesc& op_desc, + const std::unordered_set& no_grad_set, + const std::vector& grad_sub_block, + framework::OpDesc** grad_op_desc, + std::unordered_map* grad_to_var); + +platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs); + +class Tracer { + public: + explicit Tracer(framework::BlockDesc* root_block); + + virtual ~Tracer() {} + + void Trace(OpBase* op, const VarBasePtrMap& inputs, + VarBasePtrMap* outputs, // NOLINT + framework::AttributeMap attrs_map, + const platform::Place expected_place, + const bool stop_gradient = false); + + private: + platform::Place GetPlace(const VarBasePtrMap& inputs); + + framework::BlockDesc* root_block_; +}; + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/type_defs.h b/paddle/fluid/imperative/type_defs.h new file mode 100644 index 00000000..fab8c2e6 --- /dev/null +++ b/paddle/fluid/imperative/type_defs.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace paddle { +namespace imperative { + +class VarBase; +class OpBase; + +typedef std::map>> + VarBasePtrMap; +typedef std::vector> VarBaseWeakPtrList; +typedef std::map> OpBasePtrMap; +typedef std::unordered_map< + const VarBase*, + std::pair>>>> + BackwardSumMap; // var_grad -> {place, {id -> var_grad@rename}} +typedef std::unordered_map> GradientRef; +// var_grad -> {ref_times, is_first_to_be_accumulate} + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt new file mode 100644 index 00000000..83d91afa --- /dev/null +++ b/paddle/fluid/inference/CMakeLists.txt @@ -0,0 +1,108 @@ +if(WITH_TESTING) + include(tests/test.cmake) # some generic cmake funtion for inference +endif() + +set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor data_feed_proto) + +# TODO(panyx0718): Should this be called paddle_fluid_inference_api_internal? +cc_library(paddle_fluid_api + SRCS io.cc + DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) + +# analysis and tensorrt must be added before creating static library, +# otherwise, there would be undefined reference to them in static library. +add_subdirectory(analysis) +add_subdirectory(utils) +if (TENSORRT_FOUND) + add_subdirectory(tensorrt) +endif() + +if (ANAKIN_SUBGRAPH) + add_subdirectory(anakin) +endif() + +get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) +get_property(cuda_modules GLOBAL PROPERTY CUDA_MODULES) +get_property(fluid_third_partys GLOBAL PROPERTY FLUID_THRID_PARTYS) +if (WIN32) +list(APPEND fluid_third_partys gflags glog protobuf cblas) +endif(WIN32) + +# paddle_fluid_origin exclude inference api interface +if(WIN32) + sep_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) +else(WIN32) + cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) +endif(WIN32) + +add_subdirectory(api) + +if(WITH_MKLDNN) + set(mkldnn_quantizer_src ${CMAKE_CURRENT_SOURCE_DIR}/api/mkldnn_quantizer.cc) + set(mkldnn_quantizer_cfg mkldnn_quantizer_config) +endif() + +set(STATIC_INFERENCE_APIS paddle_fluid_api paddle_inference_api analysis_predictor) +if (ANAKIN_FOUND) + set(ANAKIN_SHARED_INFERENCE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/api/api_anakin_engine.cc) +endif() +set(SHARED_INFERENCE_SRCS + io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc + ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc + ${mkldnn_quantizer_src} + ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc + ${ANAKIN_SHARED_INFERENCE_SRCS}) + +if(WIN32) + sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array + analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder) +else(WIN32) + cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} + zero_copy_tensor reset_tensor_array analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder) +endif(WIN32) + +if(NOT APPLE) + # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac. + set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym") + set_target_properties(paddle_fluid PROPERTIES LINK_FLAGS "${LINK_FLAGS}") +endif() + +# Create shared library +if(WIN32) + sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} + DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array + analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder) +else(WIN32) + cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} + DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array + analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder) +endif() +get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) +target_link_libraries(paddle_fluid_shared ${os_dependency_modules}) + +set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) +if(NOT APPLE AND NOT WIN32) + # TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac. + set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.map") + set_target_properties(paddle_fluid_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}") + # check symbol hidden + FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake + "execute_process(COMMAND bash -c \"${CMAKE_CURRENT_SOURCE_DIR}/check_symbol.sh" + " ${CMAKE_CURRENT_BINARY_DIR}/libpaddle_fluid.so\" RESULT_VARIABLE symbol_res)\n" + "if(NOT \"\${symbol_res}\" STREQUAL \"0\")\n" + " message(FATAL_ERROR \"Check symbol failed.\")\n" + "endif()\n") + add_custom_command( + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol" + COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake" + DEPENDS paddle_fluid_shared) + add_custom_target(check_symbol ALL DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol") +endif() + +if(WITH_TESTING) + # tests/book depends the models that generated by python/paddle/fluid/tests/book + add_subdirectory(tests/book) + if(WITH_INFERENCE_API_TEST) + add_subdirectory(tests/api) + endif() +endif() diff --git a/paddle/fluid/inference/anakin/CMakeLists.txt b/paddle/fluid/inference/anakin/CMakeLists.txt new file mode 100644 index 00000000..9ffe7047 --- /dev/null +++ b/paddle/fluid/inference/anakin/CMakeLists.txt @@ -0,0 +1,5 @@ +cc_library(anakin_engine SRCS engine.cc DEPS framework_proto boost) +cc_library(anakin_op_teller SRCS op_teller.cc DEPS framework_proto boost) +target_link_libraries(anakin_engine anakin anakin_saber_common) +cc_test(test_anakin_engine SRCS test_anakin_engine.cc DEPS anakin_engine) +add_subdirectory(convert) diff --git a/paddle/fluid/inference/anakin/convert/CMakeLists.txt b/paddle/fluid/inference/anakin/convert/CMakeLists.txt new file mode 100644 index 00000000..67194c9f --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/CMakeLists.txt @@ -0,0 +1,23 @@ +cc_library(anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc +elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc softmax.cc +batch_norm.cc reshape.cc flatten.cc transpose.cc density_prior_box.cc +detection_out.cc scale.cc dropout.cc im2sequence.cc sum.cc affine_channel.cc +roi_align.cc shuffle_channel.cc helper.cc DEPS anakin_engine framework_proto +scope op_registry gtest) + +cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op) +cc_test(test_anakin_conv2d SRCS test_conv2d_op.cc DEPS anakin_op_converter conv_op im2col vol2col depthwise_conv) +cc_test(test_anakin_activation SRCS test_activation_op.cc DEPS activation_op anakin_op_converter) +cc_test(test_anakin_pool2d SRCS test_pool2d_op.cc DEPS anakin_op_converter pool_op pooling) +cc_test(test_anakin_concat SRCS test_concat_op.cc DEPS anakin_op_converter concat_op concat_and_split) +cc_test(test_anakin_split SRCS test_split_op.cc DEPS anakin_op_converter split_op concat_and_split) +cc_test(test_anakin_elementwise SRCS test_elementwise_op.cc DEPS anakin_op_converter elementwise_add_op elementwise_mul_op) +cc_test(test_anakin_relu SRCS test_relu_op.cc DEPS activation_op anakin_op_converter) +cc_test(test_anakin_softmax SRCS test_softmax_op.cc DEPS anakin_op_converter softmax_op softmax) +cc_test(test_anakin_reshape SRCS test_reshape_op.cc DEPS anakin_op_converter reshape_op) +cc_test(test_anakin_flatten SRCS test_flatten_op.cc DEPS anakin_op_converter flatten_op reshape_op) +cc_test(test_anakin_transpose SRCS test_transpose_op.cc DEPS anakin_op_converter transpose_op) +cc_test(test_anakin_batch_norm SRCS test_batch_norm_op.cc DEPS anakin_op_converter batch_norm_op) +cc_test(test_anakin_dropout SRCS test_dropout_op.cc DEPS anakin_op_converter dropout_op) +cc_test(test_anakin_sum SRCS test_sum_op.cc DEPS anakin_op_converter sum_op selected_rows_functor) +cc_test(test_anakin_affine_channel SRCS test_affine_channel_op.cc DEPS anakin_op_converter affine_channel_op) diff --git a/paddle/fluid/inference/anakin/convert/activation.cc b/paddle/fluid/inference/anakin/convert/activation.cc new file mode 100644 index 00000000..523571f1 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/activation.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/activation.h" +#include +#include + +namespace paddle { +namespace inference { +namespace anakin { + +template +ActivationOpConverter::ActivationOpConverter( + const std::string &op_type) + : op_type_(op_type) { + auto it = anakin_op_types_.find(op_type_); + PADDLE_ENFORCE(it != anakin_op_types_.end(), + "activation op type is not support"); + anakin_op_type_ = it->second; +} + +template +void ActivationOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + auto input_name = op_desc.Input("X").front(); + auto output_name = op_desc.Output("Out").front(); + this->engine_->AddOp(op_name, "Activation", {input_name}, {output_name}); + this->engine_->AddOpAttr(op_name, "type", anakin_op_type_); + + if (op_type_ == "swish") { + float beta = boost::get(op_desc.GetAttr("beta")); + this->engine_->AddOpAttr(op_name, "clip_relu_num", beta); + } + if (op_type_ == "relu6") { + float threshold = boost::get(op_desc.GetAttr("threshold")); + this->engine_->AddOpAttr(op_name, "clip_relu_num", threshold); + } +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(sigmoid, SigmoidOpConverter); +REGISTER_ANAKIN_OP_CONVERTER(tanh, TanhOpConverter); +REGISTER_ANAKIN_OP_CONVERTER(swish, SwishOpConverter); +REGISTER_ANAKIN_OP_CONVERTER(relu6, Relu6OpConverter); diff --git a/paddle/fluid/inference/anakin/convert/activation.h b/paddle/fluid/inference/anakin/convert/activation.h new file mode 100644 index 00000000..a2475e49 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/activation.h @@ -0,0 +1,72 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +class ActivationOpConverter : public AnakinOpConverter { + public: + explicit ActivationOpConverter(const std::string &op_type); + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, + const framework::Scope &scope, + bool test_mode) override; + virtual ~ActivationOpConverter() {} + + private: + std::string op_type_; + std::string anakin_op_type_; + std::map anakin_op_types_{{"tanh", "TanH"}, + {"sigmoid", "Sigmoid"}, + {"relu6", "ClippedRelu"}, + {"swish", "Swish"}}; +}; + +template +class TanhOpConverter : public ActivationOpConverter { + public: + TanhOpConverter() : ActivationOpConverter("tanh") {} +}; + +template +class SigmoidOpConverter : public ActivationOpConverter { + public: + SigmoidOpConverter() + : ActivationOpConverter("sigmoid") {} +}; + +template +class Relu6OpConverter : public ActivationOpConverter { + public: + Relu6OpConverter() : ActivationOpConverter("relu6") {} +}; + +template +class SwishOpConverter : public ActivationOpConverter { + public: + SwishOpConverter() : ActivationOpConverter("swish") {} +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/affine_channel.cc b/paddle/fluid/inference/anakin/convert/affine_channel.cc new file mode 100644 index 00000000..534e7dca --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/affine_channel.cc @@ -0,0 +1,55 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/affine_channel.h" +#include +#include +#include +#include "paddle/fluid/inference/anakin/convert/helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +void AffineChannelOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + auto input_name = op_desc.Input("X").front(); + auto output_name = op_desc.Output("Out").front(); + this->engine_->AddOp(op_name, "AffineChannel", {input_name}, {output_name}); + + // Copy the Scale to CPUPlace and get the pointer. + auto *scale_v = scope.FindVar(op_desc.Input("Scale").front()); + PADDLE_ENFORCE_NOT_NULL(scale_v); + auto weight1 = pblock_from_var(*scale_v, this->engine_); + this->engine_->AddOpAttr(op_name, "weight_1", *weight1); + + // Copy the Bias to CPUPlace and get the pointer. + auto *bias_v = scope.FindVar(op_desc.Input("Bias").front()); + PADDLE_ENFORCE_NOT_NULL(bias_v); + auto weight2 = pblock_from_var(*bias_v, this->engine_); + this->engine_->AddOpAttr(op_name, "weight_2", *weight2); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(affine_channel, AffineChannelOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/affine_channel.h b/paddle/fluid/inference/anakin/convert/affine_channel.h new file mode 100644 index 00000000..443f6101 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/affine_channel.h @@ -0,0 +1,40 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +class AffineChannelOpConverter : public AnakinOpConverter { + public: + AffineChannelOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, + const framework::Scope &scope, + bool test_mode) override; + virtual ~AffineChannelOpConverter() {} + + private: +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/batch_norm.cc b/paddle/fluid/inference/anakin/convert/batch_norm.cc new file mode 100644 index 00000000..b41f5dc9 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/batch_norm.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/batch_norm.h" +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/anakin/convert/helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +void BatchNormOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Output("Y").size(), 1); + std::map inputs; + for (auto k : {"X", "Scale", "Bias", "Mean", "Variance"}) { + PADDLE_ENFORCE_EQ(op_desc.Input(k).size(), 1UL); + } + + auto input = op_desc.Input("X").front(); + auto output = op_desc.Output("Y").front(); + auto op_name = op_desc.Type() + ":" + op_desc.Output("Y").front(); + auto epsilon = boost::get(op_desc.GetAttr("epsilon")); + + auto bn_op_name = op_name + ":bn"; + auto bn_output = bn_op_name + "_output"; + this->engine_->AddOp(bn_op_name, "BatchNorm", {input}, {bn_output}); + this->engine_->AddOpAttr(bn_op_name, "epsilon", epsilon); + this->engine_->AddOpAttr(bn_op_name, "momentum", static_cast(1.0)); + + auto scale_op_name = op_name + ":scale"; + this->engine_->AddOp(scale_op_name, "Scale", {bn_output}, {output}); + this->engine_->AddOpAttr(scale_op_name, "axis", 1); + this->engine_->AddOpAttr(scale_op_name, "num_axes", 1); + this->engine_->AddOpAttr(scale_op_name, "bias_term", true); + + auto *mean_v = scope.FindVar(op_desc.Input("Mean").front()); + PADDLE_ENFORCE_NOT_NULL(mean_v); + auto weight1 = pblock_from_var(*mean_v, this->engine_); + this->engine_->AddOpAttr(bn_op_name, "weight_1", *weight1); + + auto *variance_v = scope.FindVar(op_desc.Input("Variance").front()); + PADDLE_ENFORCE_NOT_NULL(variance_v); + auto weight2 = + pblock_from_var(*variance_v, this->engine_); + this->engine_->AddOpAttr(bn_op_name, "weight_2", *weight2); + + auto *weight3 = pblock_from_vector( + std::vector({1}), this->engine_); + this->engine_->AddOpAttr(bn_op_name, "weight_3", *weight3); + + auto *scale_v = scope.FindVar(op_desc.Input("Scale").front()); + PADDLE_ENFORCE_NOT_NULL(scale_v); + auto scale = pblock_from_var(*scale_v, this->engine_); + this->engine_->AddOpAttr(scale_op_name, "weight_1", *scale); + + auto *bias_v = scope.FindVar(op_desc.Input("Bias").front()); + PADDLE_ENFORCE_NOT_NULL(bias_v); + auto bias = pblock_from_var(*bias_v, this->engine_); + this->engine_->AddOpAttr(scale_op_name, "weight_2", *bias); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(batch_norm, BatchNormOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/batch_norm.h b/paddle/fluid/inference/anakin/convert/batch_norm.h new file mode 100644 index 00000000..52156aeb --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/batch_norm.h @@ -0,0 +1,37 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +class BatchNormOpConverter : public AnakinOpConverter { + public: + BatchNormOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, + const framework::Scope &scope, + bool test_mode) override; + virtual ~BatchNormOpConverter() {} +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/concat.cc b/paddle/fluid/inference/anakin/convert/concat.cc new file mode 100644 index 00000000..584a82ea --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/concat.cc @@ -0,0 +1,41 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/concat.h" +#include + +namespace paddle { +namespace inference { +namespace anakin { + +template +void ConcatOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + int axis = boost::get(op_desc.GetAttr("axis")); + auto input_names = op_desc.Input("X"); + + auto y_name = op_desc.Output("Out").front(); + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + + this->engine_->AddOp(op_name, "Concat", input_names, {y_name}); + this->engine_->AddOpAttr(op_name, "axis", axis); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(concat, ConcatOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/concat.h b/paddle/fluid/inference/anakin/convert/concat.h new file mode 100644 index 00000000..fb5514af --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/concat.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +class ConcatOpConverter : public AnakinOpConverter { + public: + ConcatOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, + const framework::Scope &scope, + bool test_mode) override; + virtual ~ConcatOpConverter() {} + + private: +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/conv2d.cc b/paddle/fluid/inference/anakin/convert/conv2d.cc new file mode 100644 index 00000000..26f78efa --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/conv2d.cc @@ -0,0 +1,109 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/conv2d.h" +#include +#include +#include +#include "paddle/fluid/inference/anakin/convert/helper.h" + +using anakin::PTuple; + +namespace paddle { +namespace inference { +namespace anakin { + +template +void Conv2dOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1UL); + PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1UL); + PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1UL); + + auto input_name = op_desc.Input("Input").front(); + auto output_name = op_desc.Output("Output").front(); + auto op_name = op_desc.Type() + ":" + op_desc.Output("Output").front(); + this->engine_->AddOp(op_name, "Convolution", {input_name}, {output_name}); + + auto *filter_v = scope.FindVar(op_desc.Input("Filter").front()); + PADDLE_ENFORCE_NOT_NULL(filter_v); + auto weight_tensor = tensor_from_var(*filter_v, platform::CPUPlace()); + auto weight_shape = framework::vectorize2int(weight_tensor->dims()); + + PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL); + + const int filter_h = weight_tensor->dims()[2]; + const int filter_w = weight_tensor->dims()[3]; + + auto filter_num = weight_tensor->dims()[0]; + this->engine_->template AddOpAttr(op_name, "filter_num", filter_num); + this->engine_->template AddOpAttr>(op_name, "kernel_size", + {filter_h, filter_w}); + auto strides = boost::get>(op_desc.GetAttr("strides")); + this->engine_->template AddOpAttr>(op_name, "strides", strides); + auto paddings = boost::get>(op_desc.GetAttr("paddings")); + this->engine_->template AddOpAttr>(op_name, "padding", paddings); + auto dilations = boost::get>(op_desc.GetAttr("dilations")); + this->engine_->template AddOpAttr>(op_name, "dilation_rate", + dilations); + const int groups = boost::get(op_desc.GetAttr("groups")); + this->engine_->AddOpAttr(op_name, "group", groups); + this->engine_->AddOpAttr(op_name, "axis", 1); + this->engine_->AddOpAttr(op_name, "bias_term", false); + + ::anakin::saber::Shape anakin_shape(weight_shape); + bool enable_int8 = boost::get(op_desc.HasAttr("enable_int8")); + + if (enable_int8) { + const float int8_range = 127.; + float in_scale = boost::get(op_desc.GetAttr("input_scale")); + auto weight_scale = + boost::get>(op_desc.GetAttr("weight_scale")); + PBlock *weight1 = + new PBlock(anakin_shape, ::anakin::AK_INT8); + this->engine_->RegistBlock(weight1); + float *weight_data = weight_tensor->data(); + std::vector weight_int8; + int weight_num = weight_tensor->numel(); + for (int i = 0; i < weight_tensor->numel(); i++) { + bool is_valid_int8 = + ((weight_data[i] >= -128) && (weight_data[i] <= 127)); + PADDLE_ENFORCE(is_valid_int8, + "We are in anakin subgraph int8 mode, the weight of conv " + "should be in range [-128, 127]"); + weight_int8.push_back(static_cast(weight_data[i])); + } + memcpy(static_cast(weight1->h_tensor().mutable_data()), + static_cast(weight_int8.data()), sizeof(char) * weight_num); + weight1->d_tensor().set_shape(anakin_shape); + weight1->d_tensor().copy_from(weight1->h_tensor()); + this->engine_->AddOpAttr(op_name, "weight_1", *weight1); + this->engine_->Graph()->SetOpPrec(op_name, ::anakin::AK_INT8); + this->engine_->Graph()->SetWeightsScale( + op_name, {weight_scale[0] / int8_range}, false); + this->engine_->AddTensorScale(input_name, in_scale / int8_range); + } else { + auto *weight1 = pblock_from_tensor( + *weight_tensor, weight_shape, this->engine_); + this->engine_->AddOpAttr(op_name, "weight_1", *weight1); + } +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(conv2d, Conv2dOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/conv2d.h b/paddle/fluid/inference/anakin/convert/conv2d.h new file mode 100644 index 00000000..b22cb8ea --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/conv2d.h @@ -0,0 +1,37 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +class Conv2dOpConverter : public AnakinOpConverter { + public: + Conv2dOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, + const framework::Scope &scope, + bool test_mode) override; + virtual ~Conv2dOpConverter() {} +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc b/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc new file mode 100644 index 00000000..f2e6003a --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc @@ -0,0 +1,115 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/conv2d_fusion.h" +#include +#include +#include +#include "paddle/fluid/inference/anakin/convert/helper.h" + +using anakin::PTuple; + +namespace paddle { +namespace inference { +namespace anakin { + +template +void Conv2dFusionOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1UL); + PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1UL); + PADDLE_ENFORCE_EQ(op_desc.Input("Bias").size(), 1UL); + PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1UL); + + auto input_name = op_desc.Input("Input").front(); + auto output_name = op_desc.Output("Output").front(); + auto op_name = op_desc.Type() + ":" + op_desc.Output("Output").front(); + this->engine_->AddOp(op_name, "Convolution", {input_name}, {output_name}); + + auto *filter_v = scope.FindVar(op_desc.Input("Filter").front()); + PADDLE_ENFORCE_NOT_NULL(filter_v); + + auto weight_tensor = tensor_from_var(*filter_v, platform::CPUPlace()); + auto weight_shape = framework::vectorize2int(weight_tensor->dims()); + + auto *b_v = scope.FindVar(op_desc.Input("Bias").front()); + PADDLE_ENFORCE_NOT_NULL(b_v); + + PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL); + const int filter_h = weight_tensor->dims()[2]; + const int filter_w = weight_tensor->dims()[3]; + auto filter_num = weight_tensor->dims()[0]; + this->engine_->template AddOpAttr(op_name, "filter_num", filter_num); + this->engine_->template AddOpAttr>(op_name, "kernel_size", + {filter_h, filter_w}); + auto strides = boost::get>(op_desc.GetAttr("strides")); + this->engine_->template AddOpAttr>(op_name, "strides", strides); + auto paddings = boost::get>(op_desc.GetAttr("paddings")); + this->engine_->template AddOpAttr>(op_name, "padding", paddings); + auto dilations = boost::get>(op_desc.GetAttr("dilations")); + this->engine_->template AddOpAttr>(op_name, "dilation_rate", + dilations); + const int groups = boost::get(op_desc.GetAttr("groups")); + this->engine_->AddOpAttr(op_name, "group", groups); + this->engine_->AddOpAttr(op_name, "axis", 1); + this->engine_->AddOpAttr(op_name, "bias_term", true); + + ::anakin::saber::Shape anakin_shape(weight_shape); + bool enable_int8 = boost::get(op_desc.HasAttr("enable_int8")); + if (enable_int8) { + const float int8_range = 127.; + float in_scale = boost::get(op_desc.GetAttr("input_scale")); + auto weight_scale = + boost::get>(op_desc.GetAttr("weight_scale")); + PBlock *weight1 = + new PBlock(anakin_shape, ::anakin::AK_INT8); + this->engine_->RegistBlock(weight1); + float *weight_data = weight_tensor->data(); + std::vector weight_int8; + int weight_num = weight_tensor->numel(); + for (int i = 0; i < weight_tensor->numel(); i++) { + bool is_valid_int8 = + ((weight_data[i] >= -128) && (weight_data[i] <= 127)); + PADDLE_ENFORCE(is_valid_int8, + "We are in anakin subgraph int8 mode, the weight of conv " + "should be in range [-128, 127]"); + weight_int8.push_back(static_cast(weight_data[i])); + } + memcpy(static_cast(weight1->h_tensor().mutable_data()), + static_cast(weight_int8.data()), sizeof(char) * weight_num); + weight1->d_tensor().set_shape(anakin_shape); + weight1->d_tensor().copy_from(weight1->h_tensor()); + this->engine_->AddOpAttr(op_name, "weight_1", *weight1); + this->engine_->Graph()->SetOpPrec(op_name, ::anakin::AK_INT8); + this->engine_->Graph()->SetWeightsScale( + op_name, {weight_scale[0] / int8_range}, false); + this->engine_->AddTensorScale(input_name, in_scale / int8_range); + } else { + auto weight_tensor = tensor_from_var(*filter_v, platform::CPUPlace()); + auto weight_shape = framework::vectorize2int(weight_tensor->dims()); + auto *weight1 = pblock_from_tensor( + *weight_tensor, weight_shape, this->engine_); + this->engine_->AddOpAttr(op_name, "weight_1", *weight1); + auto weight2 = pblock_from_var(*b_v, this->engine_); + this->engine_->AddOpAttr(op_name, "weight_2", *weight2); + } +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(conv2d_fusion, Conv2dFusionOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/conv2d_fusion.h b/paddle/fluid/inference/anakin/convert/conv2d_fusion.h new file mode 100644 index 00000000..768814d3 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/conv2d_fusion.h @@ -0,0 +1,37 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +class Conv2dFusionOpConverter : public AnakinOpConverter { + public: + Conv2dFusionOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, + const framework::Scope &scope, + bool test_mode) override; + virtual ~Conv2dFusionOpConverter() {} +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/density_prior_box.cc b/paddle/fluid/inference/anakin/convert/density_prior_box.cc new file mode 100644 index 00000000..5bbaeb57 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/density_prior_box.cc @@ -0,0 +1,112 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/density_prior_box.h" +#include +#include +#include + +using anakin::PTuple; + +namespace paddle { +namespace inference { +namespace anakin { + +template +void DensityPriorBoxOpConverter::operator()( + const framework::proto::OpDesc& op, const framework::BlockDesc& block_desc, + const framework::Scope& scope, bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + auto input_name = op_desc.Input("Input").front(); + auto image_name = op_desc.Input("Image").front(); + auto output_name = op_desc.Output("Boxes").front(); + auto op_type = op_desc.Type(); + auto op_name = op_type + ":" + op_desc.Output("Boxes").front(); + + // only for density_prior_box + std::vector fixed_sizes = {}; + std::vector fixed_ratios = {}; + std::vector densities = {}; + + std::vector min_sizes = {}; + std::vector max_sizes = {}; + std::vector aspect_ratios = {}; + bool is_clip = false; + bool is_flip = false; + + if (op_type == "density_prior_box") { + fixed_sizes = + boost::get>(op_desc.GetAttr("fixed_sizes")); + fixed_ratios = + boost::get>(op_desc.GetAttr("fixed_ratios")); + densities = boost::get>(op_desc.GetAttr("densities")); + is_clip = boost::get(op_desc.GetAttr("clip")); + } else if (op_type == "prior_box") { + min_sizes = boost::get>(op_desc.GetAttr("min_sizes")); + max_sizes = boost::get>(op_desc.GetAttr("max_sizes")); + aspect_ratios = + boost::get>(op_desc.GetAttr("aspect_ratios")); + is_clip = boost::get(op_desc.GetAttr("clip")); + is_flip = boost::get(op_desc.GetAttr("flip")); + } + std::vector dens; + for (auto& ele : densities) { + dens.push_back(static_cast(ele)); + } + + auto variances = boost::get>(op_desc.GetAttr("variances")); + + // lack img_h, img_w + auto step_h = boost::get(op_desc.GetAttr("step_h")); + auto step_w = boost::get(op_desc.GetAttr("step_w")); + auto offset = boost::get(op_desc.GetAttr("offset")); + PTuple t_order; + t_order.push_back("MIN"); + t_order.push_back("COM"); + t_order.push_back("MAX"); + + std::vector temp_v = {}; + + this->engine_->AddOp(op_name, "PriorBox", {input_name, image_name}, + {output_name}); + this->engine_->template AddOpAttr>(op_name, "min_size", + min_sizes); + this->engine_->template AddOpAttr>(op_name, "max_size", + max_sizes); + this->engine_->template AddOpAttr>(op_name, "aspect_ratio", + aspect_ratios); + this->engine_->template AddOpAttr>(op_name, "fixed_size", + fixed_sizes); + this->engine_->template AddOpAttr>(op_name, "fixed_ratio", + fixed_ratios); + this->engine_->template AddOpAttr>(op_name, "density", dens); + this->engine_->AddOpAttr(op_name, "is_flip", is_flip); + this->engine_->AddOpAttr(op_name, "is_clip", is_clip); + this->engine_->template AddOpAttr>(op_name, "variance", + variances); + this->engine_->AddOpAttr(op_name, "img_h", static_cast(0)); + this->engine_->AddOpAttr(op_name, "img_w", static_cast(0)); + this->engine_->AddOpAttr(op_name, "step_h", step_h); + this->engine_->AddOpAttr(op_name, "step_w", step_w); + this->engine_->AddOpAttr(op_name, "offset", offset); + this->engine_->template AddOpAttr>(op_name, "order", + t_order); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(density_prior_box, DensityPriorBoxOpConverter); +REGISTER_ANAKIN_OP_CONVERTER(prior_box, DensityPriorBoxOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/density_prior_box.h b/paddle/fluid/inference/anakin/convert/density_prior_box.h new file mode 100644 index 00000000..5714f57a --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/density_prior_box.h @@ -0,0 +1,40 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +class DensityPriorBoxOpConverter + : public AnakinOpConverter { + public: + DensityPriorBoxOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, + const framework::Scope &scope, + bool test_mode) override; + virtual ~DensityPriorBoxOpConverter() {} +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/detection_out.cc b/paddle/fluid/inference/anakin/convert/detection_out.cc new file mode 100644 index 00000000..73dd6f28 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/detection_out.cc @@ -0,0 +1,69 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/detection_out.h" +#include +#include + +namespace paddle { +namespace inference { +namespace anakin { + +template +void DetectionOutOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + auto target_name = op_desc.Input("TargetBox").front(); + auto prior_box_name = op_desc.Input("PriorBox").front(); + auto scores_name = op_desc.Input("Scores").front(); + auto output_name = op_desc.Output("Out").front(); + + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + + auto code_type = boost::get(op_desc.GetAttr("code_type")); + auto background_label = boost::get(op_desc.GetAttr("background_label")); + auto score_threshold = boost::get(op_desc.GetAttr("score_threshold")); + auto nms_top_k = boost::get(op_desc.GetAttr("nms_top_k")); + auto nms_threshold = boost::get(op_desc.GetAttr("nms_threshold")); + auto nms_eta = boost::get(op_desc.GetAttr("nms_eta")); + auto keep_top_k = boost::get(op_desc.GetAttr("keep_top_k")); + std::string anakin_code_type; + if (code_type == "decode_center_size") { + anakin_code_type = "CENTER_SIZE"; + } else if (code_type == "encode_center_size") { + PADDLE_THROW( + "Not support encode_center_size code_type in DetectionOut of anakin"); + } + + this->engine_->AddOp(op_name, "DetectionOutput", + {target_name, scores_name, prior_box_name}, + {output_name}); + this->engine_->AddOpAttr(op_name, "share_location", true); + this->engine_->AddOpAttr(op_name, "variance_encode_in_target", false); + this->engine_->AddOpAttr(op_name, "class_num", static_cast(0)); + this->engine_->AddOpAttr(op_name, "background_id", background_label); + this->engine_->AddOpAttr(op_name, "keep_top_k", keep_top_k); + this->engine_->AddOpAttr(op_name, "code_type", anakin_code_type); + this->engine_->AddOpAttr(op_name, "conf_thresh", score_threshold); + this->engine_->AddOpAttr(op_name, "nms_top_k", nms_top_k); + this->engine_->AddOpAttr(op_name, "nms_thresh", nms_threshold); + this->engine_->AddOpAttr(op_name, "nms_eta", nms_eta); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(detection_out, DetectionOutOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/detection_out.h b/paddle/fluid/inference/anakin/convert/detection_out.h new file mode 100644 index 00000000..c34342a6 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/detection_out.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +class DetectionOutOpConverter : public AnakinOpConverter { + public: + DetectionOutOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, + const framework::Scope &scope, + bool test_mode) override; + virtual ~DetectionOutOpConverter() {} +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/dropout.cc b/paddle/fluid/inference/anakin/convert/dropout.cc new file mode 100644 index 00000000..6c5f80b5 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/dropout.cc @@ -0,0 +1,55 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/dropout.h" +#include +#include +#include +#include "paddle/fluid/inference/anakin/convert/helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +void DropoutOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Output("Mask").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto x_name = op_desc.Input("X").front(); + auto out_name = op_desc.Output("Out").front(); + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + + this->engine_->AddOp(op_name, "Scale", {x_name}, {out_name}); + + auto dropout_prob = boost::get(op_desc.GetAttr("dropout_prob")); + auto factor = 1 - dropout_prob; + auto *weight1 = pblock_from_vector( + std::vector({factor}), this->engine_); + + this->engine_->AddOpAttr(op_name, "weight_1", *weight1); + this->engine_->AddOpAttr(op_name, "axis", 0); + this->engine_->AddOpAttr(op_name, "num_axes", 0); + this->engine_->AddOpAttr(op_name, "bias_term", false); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(dropout, DropoutOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/dropout.h b/paddle/fluid/inference/anakin/convert/dropout.h new file mode 100644 index 00000000..801aa3dd --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/dropout.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +class DropoutOpConverter : public AnakinOpConverter { + public: + DropoutOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, + const framework::Scope &scope, + bool test_mode) override; + virtual ~DropoutOpConverter() {} + + private: +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/elementwise.cc b/paddle/fluid/inference/anakin/convert/elementwise.cc new file mode 100644 index 00000000..d221f26e --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/elementwise.cc @@ -0,0 +1,75 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/elementwise.h" +#include +#include +#include + +using anakin::PTuple; + +namespace paddle { +namespace inference { +namespace anakin { + +template +void ElementwiseAddOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto x_name = op_desc.Input("X").front(); + auto y_name = op_desc.Input("Y").front(); + auto out_name = op_desc.Output("Out").front(); + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + + this->engine_->AddOp(op_name, "Eltwise", {x_name, y_name}, {out_name}); + std::string elementwise_type = "Add"; + this->engine_->template AddOpAttr(op_name, "type", + elementwise_type); + std::vector coeff = {1.0, 1.0}; + this->engine_->template AddOpAttr>(op_name, "coeff", coeff); +} + +template +void ElementwiseMulOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto x_name = op_desc.Input("X").front(); + auto y_name = op_desc.Input("Y").front(); + auto out_name = op_desc.Output("Out").front(); + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + + this->engine_->AddOp(op_name, "Eltwise", {x_name, y_name}, {out_name}); + std::string elementwise_type = "Mul"; + this->engine_->template AddOpAttr(op_name, "type", + elementwise_type); + std::vector coeff = {1.0, 1.0}; + this->engine_->template AddOpAttr>(op_name, "coeff", coeff); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(elementwise_add, ElementwiseAddOpConverter); +REGISTER_ANAKIN_OP_CONVERTER(elementwise_mul, ElementwiseMulOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/elementwise.h b/paddle/fluid/inference/anakin/convert/elementwise.h new file mode 100644 index 00000000..190a8b55 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/elementwise.h @@ -0,0 +1,55 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +class ElementwiseAddOpConverter + : public AnakinOpConverter { + public: + ElementwiseAddOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, + const framework::Scope &scope, + bool test_mode) override; + virtual ~ElementwiseAddOpConverter() {} + + private: +}; + +template +class ElementwiseMulOpConverter + : public AnakinOpConverter { + public: + ElementwiseMulOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, + const framework::Scope &scope, + bool test_mode) override; + virtual ~ElementwiseMulOpConverter() {} + + private: +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/fc.cc b/paddle/fluid/inference/anakin/convert/fc.cc new file mode 100644 index 00000000..b64d0b84 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/fc.cc @@ -0,0 +1,122 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/fc.h" +#include +#include +#include +#include "paddle/fluid/inference/anakin/convert/helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +void FcBaseOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + auto input_names = op_desc.InputNames(); + bool with_bias = input_names.size() >= 3; + + std::string w_name = "Y"; + std::string i_name = "X"; + if (with_bias) { + w_name = "W"; + i_name = "Input"; + } + + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + + // get weights + auto *y_v = scope.FindVar(op_desc.Input(w_name).front()); + PADDLE_ENFORCE_NOT_NULL(y_v); + auto weight_tensor = tensor_from_var(*y_v, platform::CPUPlace()); + auto weight_shape = framework::vectorize2int(weight_tensor->dims()); + + int out_dim = weight_shape[1]; + const int w_m = weight_shape[0]; + const int w_k = weight_shape[1]; + + auto input_name = op_desc.Input(i_name).front(); + auto output_name = op_desc.Output("Out").front(); + + this->engine_->AddOp(op_name, "Dense", {input_name}, {output_name}); + this->engine_->AddOpAttr(op_name, "bias_term", with_bias); + this->engine_->AddOpAttr(op_name, "axis", 1); + this->engine_->AddOpAttr(op_name, "out_dim", out_dim); + + auto *weight_data = weight_tensor->data(); + PADDLE_ENFORCE(w_m * w_k == weight_tensor->numel()); + + std::vector trans_weight_data(weight_tensor->numel()); + for (int i = 0; i < w_m; i++) { + for (int j = 0; j < w_k; j++) { + trans_weight_data[i + j * w_m] = weight_data[i * w_k + j]; + } + } + + int weight_num = weight_tensor->numel(); + bool enable_int8 = boost::get(op_desc.HasAttr("enable_int8")); + if (enable_int8) { + if (weight_shape.size() < 4UL) { + weight_shape.insert(weight_shape.begin(), 4UL - weight_shape.size(), 1); + } + ::anakin::saber::Shape anakin_shape(weight_shape); + const float int8_range = 127.; + float in_scale = boost::get(op_desc.GetAttr("input_scale")); + auto weight_scale = + boost::get>(op_desc.GetAttr("weight_scale")); + PBlock *weight1 = + new PBlock(anakin_shape, ::anakin::AK_INT8); + this->engine_->RegistBlock(weight1); + std::vector weight_int8; + for (int i = 0; i < weight_num; i++) { + bool is_valid_int8 = + ((trans_weight_data[i] >= -128) && (trans_weight_data[i] <= 127)); + PADDLE_ENFORCE(is_valid_int8, + "We are in anakin subgraph int8 mode, the weight of fc " + "should be in range [-128, 127]"); + weight_int8.push_back(static_cast(trans_weight_data[i])); + } + memcpy(static_cast(weight1->h_tensor().mutable_data()), + static_cast(weight_int8.data()), sizeof(char) * weight_num); + weight1->d_tensor().set_shape(anakin_shape); + weight1->d_tensor().copy_from(weight1->h_tensor()); + this->engine_->AddOpAttr(op_name, "weight_1", *weight1); + this->engine_->Graph()->SetOpPrec(op_name, ::anakin::AK_INT8); + this->engine_->Graph()->SetWeightsScale( + op_name, {weight_scale[0] / int8_range}, false); + this->engine_->AddTensorScale(input_name, in_scale / int8_range); + } else { + auto *weight1 = pblock_from_vector(trans_weight_data, + this->engine_); + this->engine_->AddOpAttr(op_name, "weight_1", *weight1); + } + + // get bias + if (with_bias) { + auto *b_v = scope.FindVar(op_desc.Input("Bias").front()); + PADDLE_ENFORCE_NOT_NULL(b_v); + auto weight2 = pblock_from_var(*b_v, this->engine_); + this->engine_->AddOpAttr(op_name, "weight_2", *weight2); + } +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(mul, MulOpConverter); +REGISTER_ANAKIN_OP_CONVERTER(fc, FcOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/fc.h b/paddle/fluid/inference/anakin/convert/fc.h new file mode 100644 index 00000000..6fe65e3e --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/fc.h @@ -0,0 +1,51 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +class FcBaseOpConverter : public AnakinOpConverter { + public: + FcBaseOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, + const framework::Scope &scope, + bool test_mode) override; + virtual ~FcBaseOpConverter() {} +}; + +// with bias +template +class FcOpConverter : public FcBaseOpConverter { + public: + FcOpConverter() = default; +}; + +// without bias +template +class MulOpConverter : public FcBaseOpConverter { + public: + MulOpConverter() = default; +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/flatten.cc b/paddle/fluid/inference/anakin/convert/flatten.cc new file mode 100644 index 00000000..7ce519a4 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/flatten.cc @@ -0,0 +1,48 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/flatten.h" +#include + +using anakin::PTuple; + +namespace paddle { +namespace inference { +namespace anakin { + +template +void FlattenOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1UL); + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1UL); + + auto input = op_desc.Input("X").front(); + auto output = op_desc.Output("Out").front(); + int axis = boost::get(op_desc.GetAttr("axis")); + PADDLE_ENFORCE(axis == 1, + "the anakin flatten op converter now only support aixs == 1."); + + std::vector out_dims = {0, -1, 1, 1}; + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + this->engine_->AddOp(op_name, "Reshape", {input}, {output}); + this->engine_->template AddOpAttr>(op_name, "dims", out_dims); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(flatten, FlattenOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/flatten.h b/paddle/fluid/inference/anakin/convert/flatten.h new file mode 100644 index 00000000..6e5e0599 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/flatten.h @@ -0,0 +1,37 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +class FlattenOpConverter : public AnakinOpConverter { + public: + FlattenOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, + const framework::Scope &scope, + bool test_mode) override; + virtual ~FlattenOpConverter() {} +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/helper.cc b/paddle/fluid/inference/anakin/convert/helper.cc new file mode 100644 index 00000000..7804619b --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/helper.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +std::unique_ptr tensor_from_var( + const framework::Variable& var, const platform::Place& place) { + auto& src = var.Get(); + std::unique_ptr dst(new framework::LoDTensor()); + dst->Resize(src.dims()); + TensorCopySync((src), place, dst.get()); + return dst; +} + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/helper.h b/paddle/fluid/inference/anakin/convert/helper.h new file mode 100644 index 00000000..7b0fb211 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/helper.h @@ -0,0 +1,95 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/inference/anakin/engine.h" + +#include "framework/core/net/net.h" +#include "framework/core/types.h" +#include "framework/graph/graph.h" +#include "framework/graph/graph_global_mem.h" +#include "saber/saber_types.h" + +using anakin::saber::Shape; +using anakin::AK_FLOAT; +using anakin::AK_INT8; +using anakin::PBlock; + +namespace paddle { +namespace inference { +namespace anakin { + +std::unique_ptr tensor_from_var( + const framework::Variable& var, const platform::Place& place); + +template +PBlock* pblock_from_tensor(const framework::LoDTensor& tensor, + std::vector shape_vec, + AnakinEngine* engine) { + while (shape_vec.size() < 4) { + shape_vec.insert(shape_vec.begin(), 1); + } + Shape shape(shape_vec); + PBlock* weight = new PBlock(shape, AK_FLOAT); + engine->RegistBlock(weight); + float* cpu_data = static_cast(weight->h_tensor().mutable_data()); + std::copy_n(tensor.data(), tensor.numel(), cpu_data); + weight->d_tensor().set_shape(shape); + weight->d_tensor().copy_from(weight->h_tensor()); + return weight; +} + +template +PBlock* pblock_from_vector(const std::vector& vec, + std::vector shape_vec, + AnakinEngine* engine) { + while (shape_vec.size() < 4) { + shape_vec.insert(shape_vec.begin(), 1); + } + Shape shape(shape_vec); + PBlock* weight = new PBlock(shape, AK_FLOAT); + engine->RegistBlock(weight); + auto* weight_data = static_cast(weight->h_tensor().mutable_data()); + std::copy(std::begin(vec), std::end(vec), weight_data); + weight->d_tensor().set_shape(shape); + weight->d_tensor().copy_from(weight->h_tensor()); + return weight; +} + +template +PBlock* pblock_from_vector(const std::vector& vec, + AnakinEngine* engine) { + int size = vec.size(); + return pblock_from_vector( + vec, std::vector({1, 1, 1, size}), engine); +} + +template +PBlock* pblock_from_var(const framework::Variable& var, + AnakinEngine* engine) { + auto tensor = tensor_from_var(var, platform::CPUPlace()); + auto shape = framework::vectorize2int(tensor->dims()); + return pblock_from_tensor(*tensor, shape, engine); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/im2sequence.cc b/paddle/fluid/inference/anakin/convert/im2sequence.cc new file mode 100644 index 00000000..5a4e3e61 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/im2sequence.cc @@ -0,0 +1,58 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/im2sequence.h" +#include +#include +#include + +using anakin::PTuple; + +namespace paddle { +namespace inference { +namespace anakin { + +template +void Im2SequenceConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Output("Y").size(), 0); + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto x_name = op_desc.Input("X").front(); + auto out_name = op_desc.Output("Out").front(); + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + + this->engine_->AddOp(op_name, "Im2Sequence", {x_name}, {out_name}); + + std::vector dilations = {1, 1}; + auto paddings = boost::get>(op_desc.GetAttr("paddings")); + auto strides = boost::get>(op_desc.GetAttr("strides")); + auto kernels = boost::get>(op_desc.GetAttr("kernels")); + + this->engine_->template AddOpAttr>(op_name, "paddings", paddings); + this->engine_->template AddOpAttr>(op_name, "strides", strides); + this->engine_->template AddOpAttr>(op_name, "window_size", + kernels); + this->engine_->template AddOpAttr>(op_name, "dilations", + dilations); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(im2sequence, Im2SequenceConverter); diff --git a/paddle/fluid/inference/anakin/convert/im2sequence.h b/paddle/fluid/inference/anakin/convert/im2sequence.h new file mode 100644 index 00000000..8241d4d6 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/im2sequence.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +class Im2SequenceConverter : public AnakinOpConverter { + public: + Im2SequenceConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, + const framework::Scope &scope, + bool test_mode) override; + virtual ~Im2SequenceConverter() {} + + private: +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/op_converter.h b/paddle/fluid/inference/anakin/convert/op_converter.h new file mode 100644 index 00000000..1058e744 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/op_converter.h @@ -0,0 +1,237 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "framework/core/types.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/anakin/engine.h" +#include "paddle/fluid/inference/utils/singleton.h" +#include "saber/saber_types.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +class AnakinOpConverter { + using AnakinEngineT = AnakinEngine; + + public: + AnakinOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) {} + void ConvertOp(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, + const std::unordered_set ¶meters, + const framework::Scope &scope, AnakinEngineT *engine, + bool test_mode = false) { + framework::OpDesc op_desc(op, nullptr); + std::string op_type = op_desc.Type(); + AnakinOpConverter *it = nullptr; + if (op_type == "depthwise_conv2d") op_type = "conv2d"; + if (op_type == "reshape2") op_type = "reshape"; + if (op_type == "transpose2") op_type = "transpose"; + if (op_type == "flatten2") op_type = "flatten"; + + if (!it) { + it = Registry::Global().Lookup(op_type); + } + PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", op_type); + it->SetEngine(engine); + (*it)(op, block_desc, scope, test_mode); + } + + void ConvertBlock(framework::BlockDesc *block_desc, + const std::unordered_set ¶meters, + const framework::Scope &scope, AnakinEngineT *engine) { + std::unique_lock lock(mutex_); + framework::proto::BlockDesc *block = block_desc->Proto(); + for (auto i = 0; i < block->ops_size(); i++) { + auto &op = block->ops(i); + ConvertOp(op, *block_desc, parameters, scope, engine); + } + } + + // The scope here should be inited with the parameter vars. + void ConvertBlockToAnakinEngine( + framework::BlockDesc *block_desc, framework::Scope *scope, + const std::vector &inputs, + const std::unordered_set ¶meters, + const std::vector &outputs, AnakinEngineT *engine) { + ConvertBlock(block_desc, parameters, *scope, engine); + // if the max_batch size + int max_batch_size = engine->GetMaxBatchSize(); + PADDLE_ENFORCE(max_batch_size > 0, + "the max_batch_size setted from config->EnableAnakinEngine " + "must largger than 0"); + // If the user does not specify this variable, we use the input shape from + // the block_desc. + auto max_input_shape = engine->GetMaxInputShape(); + std::map> temp_max_input_shape; + // Register outputs with anakin using the RegistVar interface before Freeze. + // Note that RegistVar's parameters can only be outputs, not inputs. + for (auto &output : outputs) { + engine->Graph()->RegistVar(output); + } + engine->Freeze(); + // Add scale for tensor in int8 mode. + auto tensor_scales = engine->GetTensorScales(); + + for (auto &item : tensor_scales) { + engine->Graph()->SetVarScale(item.first, item.second); + } + + for (auto &input : inputs) { + if (parameters.count(input)) continue; + std::vector input_shape; + input_shape.resize(4); + input_shape[0] = max_batch_size; + if (max_input_shape.count(input)) { + PADDLE_ENFORCE(max_input_shape[input].size() == 4, + "the dimensions of max_input_shape setted from " + "config->EnableAnakinEngine must be 4"); + for (int i = 1; i < 4; i++) { + input_shape[i] = max_input_shape[input][i]; + } + } else { + auto *var = block_desc->FindVar(input); + PADDLE_ENFORCE(var, "no variable called %s", input); + + auto var_shape = var->GetShape(); + std::cout << "input :" << input << std::endl; + PADDLE_ENFORCE(var_shape.size() == 4); + + for (size_t i = 1; i < var_shape.size(); i++) { + input_shape[i] = var_shape[i]; + } + } + temp_max_input_shape[input] = input_shape; + engine->SetInputShape(input, input_shape); + } + engine->SetMaxInputShape(temp_max_input_shape); + engine->Optimize(); + engine->InitNet(); + } + + void SetEngine(AnakinEngineT *engine) { engine_ = engine; } + virtual ~AnakinOpConverter() {} + + protected: + bool test_mode_; + AnakinEngineT *engine_{nullptr}; + + private: + std::unordered_map *> + converters_; + framework::Scope *scope_{nullptr}; + std::mutex mutex_; +}; + +template class AnakinOpConverter<::anakin::saber::NV, + ::anakin::Precision::FP32>; +template class AnakinOpConverter<::anakin::saber::NV, + ::anakin::Precision::INT8>; +#ifdef ANAKIN_X86_PLACE +template class AnakinOpConverter<::anakin::saber::X86, + ::anakin::Precision::FP32>; +template class AnakinOpConverter<::anakin::saber::X86, + ::anakin::Precision::INT8>; +#endif +} // namespace anakin +} // namespace inference +} // namespace paddle + +#define REGISTER_ANAKIN_OP_CONVERTER_BASE(op_type__, Converter__, \ + place_type__, place_class__, \ + precision_type__, precision_class__) \ + struct anakin_##op_type__##_##place_type__##_##precision_type__##_converter \ + : public ::paddle::framework::Registrar { \ + anakin_##op_type__##_##place_type__##_##precision_type__##_converter() { \ + LOG(INFO) << "register convert " << #op_type__ << " "; \ + ::paddle::inference::Registry< \ + ::paddle::inference::anakin::AnakinOpConverter< \ + place_class__, precision_class__>>::Global() \ + .Register(#op_type__); \ + } \ + }; \ + anakin_##op_type__##_##place_type__##_##precision_type__##_converter \ + anakin_##op_type__##_##place_type__##_##precision_type__##_converter__; \ + int Touch_anakin_##op_type__##_##place_type__##_##precision_type__() { \ + anakin_##op_type__##_##place_type__##_##precision_type__##_converter__ \ + .Touch(); \ + return 0; \ + } + +#define WRAP(...) __VA_ARGS__ + +#define REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__, \ + precision_type__) \ + REGISTER_ANAKIN_OP_CONVERTER_BASE( \ + op_type__, \ + ::paddle::inference::anakin::Converter__, \ + CUDA, ::anakin::saber::NV, precision_type__, \ + ::anakin::Precision::precision_type__) + +#define REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__, \ + precision_type__) \ + REGISTER_ANAKIN_OP_CONVERTER_BASE( \ + op_type__, \ + ::paddle::inference::anakin::Converter__, \ + CPU, ::anakin::saber::X86, precision_type__, \ + ::anakin::Precision::precision_type__) + +#if defined(PADDLE_WITH_CUDA) && defined(ANAKIN_X86_PLACE) +#define REGISTER_ANAKIN_OP_CONVERTER(op_type__, Converter__) \ + REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__, FP32); \ + REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__, INT8); \ + REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__, FP32); \ + REGISTER_CPU_ANAKIN_OP_CONVERTER(op_type__, Converter__, INT8) +#elif defined(PADDLE_WITH_CUDA) +#define REGISTER_ANAKIN_OP_CONVERTER(op_type__, Converter__) \ + REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__, FP32); \ + REGISTER_CUDA_ANAKIN_OP_CONVERTER(op_type__, Converter__, INT8) +#endif + +#define USE_ANAKIN_CONVERTER_BASE(op_type__, place_type__, precision_type__) \ + extern int Touch_anakin_##op_type__##_##place_type__##_##precision_type__(); \ + int use_converter_anakin_##op_type__##_##place_type__##_##precision_type__ \ + __attribute__((unused)) = \ + Touch_anakin_##op_type__##_##place_type__##_##precision_type__(); + +#if defined(PADDLE_WITH_CUDA) && defined(ANAKIN_X86_PLACE) +#define USE_ANAKIN_CONVERTER(op_type__) \ + USE_ANAKIN_CONVERTER_BASE(op_type__, CUDA, FP32) \ + USE_ANAKIN_CONVERTER_BASE(op_type__, CPU, FP32) +#define USE_INT8_ANAKIN_CONVERTER(op_type__) \ + USE_ANAKIN_CONVERTER_BASE(op_type__, CUDA, INT8) \ + USE_ANAKIN_CONVERTER_BASE(op_type__, CPU, INT8) +#elif defined(PADDLE_WITH_CUDA) +#define USE_ANAKIN_CONVERTER(op_type__) \ + USE_ANAKIN_CONVERTER_BASE(op_type__, CUDA, FP32) +#define USE_INT8_ANAKIN_CONVERTER(op_type__) \ + USE_ANAKIN_CONVERTER_BASE(op_type__, CUDA, INT8) +#endif diff --git a/paddle/fluid/inference/anakin/convert/pool2d.cc b/paddle/fluid/inference/anakin/convert/pool2d.cc new file mode 100644 index 00000000..11e7c717 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/pool2d.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/pool2d.h" +#include +#include +#include + +using anakin::PTuple; + +namespace paddle { +namespace inference { +namespace anakin { + +template +void Pool2dOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto x_name = op_desc.Input("X").front(); + auto y_name = op_desc.Output("Out").front(); + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + + bool global_pooling = boost::get(op_desc.GetAttr("global_pooling")); + std::string pool_type = + boost::get(op_desc.GetAttr("pooling_type")); + std::vector ksize = + boost::get>(op_desc.GetAttr("ksize")); + std::vector strides = + boost::get>(op_desc.GetAttr("strides")); + std::vector paddings = + boost::get>(op_desc.GetAttr("paddings")); + bool ceil_mode = boost::get(op_desc.GetAttr("ceil_mode")); + std::string anakin_pool_type; + if (pool_type == "max") { + anakin_pool_type = "MAX"; + } else if (pool_type == "avg") { + if (paddings[0] || paddings[1]) { + anakin_pool_type = "AVGEXC"; + } else { + anakin_pool_type = "AVG"; + } + } else { + PADDLE_THROW("TensorRT unsupported pooling type!"); + } + + this->engine_->AddOp(op_name, "Pooling", {x_name}, {y_name}); + this->engine_->template AddOpAttr>(op_name, "pool_size", ksize); + this->engine_->template AddOpAttr>(op_name, "strides", strides); + this->engine_->template AddOpAttr>(op_name, "padding", paddings); + this->engine_->AddOpAttr(op_name, "method", anakin_pool_type); + this->engine_->AddOpAttr(op_name, "global_pooling", global_pooling); + this->engine_->AddOpAttr(op_name, "cmp_out_shape_floor_as_conv", !ceil_mode); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(pool2d, Pool2dOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/pool2d.h b/paddle/fluid/inference/anakin/convert/pool2d.h new file mode 100644 index 00000000..7a06ff1b --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/pool2d.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +class Pool2dOpConverter : public AnakinOpConverter { + public: + Pool2dOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, + const framework::Scope &scope, + bool test_mode) override; + virtual ~Pool2dOpConverter() {} + + private: +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/relu.cc b/paddle/fluid/inference/anakin/convert/relu.cc new file mode 100644 index 00000000..00853406 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/relu.cc @@ -0,0 +1,61 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/relu.h" +#include +#include + +namespace paddle { +namespace inference { +namespace anakin { + +template +void ReluOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + auto input_name = op_desc.Input("X").front(); + auto output_name = op_desc.Output("Out").front(); + + this->engine_->AddOp(op_name, "ReLU", {input_name}, {output_name}); + this->engine_->AddOpAttr(op_name, "alpha", 0); +} + +template +void LeakyReluOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + auto input_name = op_desc.Input("X").front(); + auto output_name = op_desc.Output("Out").front(); + + float alpha = boost::get(op_desc.GetAttr("alpha")); + this->engine_->AddOp(op_name, "ReLU", {input_name}, {output_name}); + this->engine_->AddOpAttr(op_name, "alpha", alpha); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(relu, ReluOpConverter); +REGISTER_ANAKIN_OP_CONVERTER(leaky_relu, LeakyReluOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/relu.h b/paddle/fluid/inference/anakin/convert/relu.h new file mode 100644 index 00000000..f366f05a --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/relu.h @@ -0,0 +1,51 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +class ReluOpConverter : public AnakinOpConverter { + public: + ReluOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, + const framework::Scope &scope, + bool test_mode) override; + virtual ~ReluOpConverter() {} +}; + +template +class LeakyReluOpConverter : public AnakinOpConverter { + public: + LeakyReluOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, + const framework::Scope &scope, + bool test_mode) override; + virtual ~LeakyReluOpConverter() {} +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/reshape.cc b/paddle/fluid/inference/anakin/convert/reshape.cc new file mode 100644 index 00000000..d73736b7 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/reshape.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/reshape.h" +#include + +using anakin::PTuple; + +namespace paddle { +namespace inference { +namespace anakin { + +template +void ReshapeOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1UL); + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1UL); + + auto input = op_desc.Input("X").front(); + auto output = op_desc.Output("Out").front(); + + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + this->engine_->AddOp(op_name, "Reshape", {input}, {output}); + + auto shape = boost::get>(op_desc.GetAttr("shape")); + if (shape.size() < 4) { + shape.insert(shape.end(), 4 - shape.size(), 1); + } + this->engine_->template AddOpAttr>(op_name, "dims", shape); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(reshape, ReshapeOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/reshape.h b/paddle/fluid/inference/anakin/convert/reshape.h new file mode 100644 index 00000000..88de2641 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/reshape.h @@ -0,0 +1,37 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +class ReshapeOpConverter : public AnakinOpConverter { + public: + ReshapeOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, + const framework::Scope &scope, + bool test_mode) override; + virtual ~ReshapeOpConverter() {} +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/roi_align.cc b/paddle/fluid/inference/anakin/convert/roi_align.cc new file mode 100644 index 00000000..8702f638 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/roi_align.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/roi_align.h" +#include +#include + +namespace paddle { +namespace inference { +namespace anakin { + +template +void RoiAlignOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Input("ROIs").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + auto input_x_name = op_desc.Input("X").front(); + auto input_rois_name = op_desc.Input("ROIs").front(); + auto output_name = op_desc.Output("Out").front(); + + auto spatial_scale = boost::get(op_desc.GetAttr("spatial_scale")); + auto pooled_height = boost::get(op_desc.GetAttr("pooled_height")); + auto pooled_width = boost::get(op_desc.GetAttr("pooled_width")); + auto sampling_ratio = boost::get(op_desc.GetAttr("sampling_ratio")); + + this->engine_->AddOp(op_name, "RoiAlign", {input_x_name, input_rois_name}, + {output_name}); + this->engine_->AddOpAttr(op_name, "spatial_scale", spatial_scale); + this->engine_->AddOpAttr(op_name, "pooled_height", pooled_height); + this->engine_->AddOpAttr(op_name, "pooled_width", pooled_width); + this->engine_->AddOpAttr(op_name, "sampling_ratio", sampling_ratio); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(roi_align, RoiAlignOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/roi_align.h b/paddle/fluid/inference/anakin/convert/roi_align.h new file mode 100644 index 00000000..8b5d23a0 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/roi_align.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +class RoiAlignOpConverter : public AnakinOpConverter { + public: + RoiAlignOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, + const framework::Scope &scope, + bool test_mode) override; + virtual ~RoiAlignOpConverter() {} +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/scale.cc b/paddle/fluid/inference/anakin/convert/scale.cc new file mode 100644 index 00000000..2559ec49 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/scale.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/scale.h" +#include +#include + +namespace paddle { +namespace inference { +namespace anakin { + +template +void ScaleOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + + auto input_name = op_desc.Input("X").front(); + auto output_name = op_desc.Output("Out").front(); + float scale = boost::get(op_desc.GetAttr("scale")); + float bias = boost::get(op_desc.GetAttr("bias")); + float bias_after_scale = + boost::get(op_desc.GetAttr("bias_after_scale")); + PADDLE_ENFORCE(bias_after_scale, + "The anakin scale layer only support bias after scale now."); + + this->engine_->AddOp(op_name, "Power", {input_name}, {output_name}); + this->engine_->AddOpAttr(op_name, "shift", bias); + this->engine_->AddOpAttr(op_name, "scale", scale); + this->engine_->AddOpAttr(op_name, "power", static_cast(1.0)); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(scale, ScaleOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/scale.h b/paddle/fluid/inference/anakin/convert/scale.h new file mode 100644 index 00000000..f19a9201 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/scale.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +class ScaleOpConverter : public AnakinOpConverter { + public: + ScaleOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, + const framework::Scope &scope, + bool test_mode) override; + virtual ~ScaleOpConverter() {} +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/shuffle_channel.cc b/paddle/fluid/inference/anakin/convert/shuffle_channel.cc new file mode 100644 index 00000000..fdd2e318 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/shuffle_channel.cc @@ -0,0 +1,47 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/shuffle_channel.h" +#include +#include +#include + +using anakin::PTuple; + +namespace paddle { +namespace inference { +namespace anakin { + +template +void ShuffleChannelOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto input = op_desc.Input("X").front(); + auto output = op_desc.Output("Out").front(); + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + this->engine_->AddOp(op_name, "ShuffleChannel", {input}, {output}); + + auto group = boost::get(op_desc.GetAttr("group")); + this->engine_->AddOpAttr(op_name, "group", group); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(shuffle_channel, ShuffleChannelOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/shuffle_channel.h b/paddle/fluid/inference/anakin/convert/shuffle_channel.h new file mode 100644 index 00000000..457a1486 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/shuffle_channel.h @@ -0,0 +1,38 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +class ShuffleChannelOpConverter + : public AnakinOpConverter { + public: + ShuffleChannelOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, + const framework::Scope &scope, + bool test_mode) override; + virtual ~ShuffleChannelOpConverter() {} +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/softmax.cc b/paddle/fluid/inference/anakin/convert/softmax.cc new file mode 100644 index 00000000..a4dc5a91 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/softmax.cc @@ -0,0 +1,47 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/softmax.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +void SoftMaxOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1UL); + + auto input = op_desc.Input("X").front(); + auto output = op_desc.Output("Out").front(); + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + + auto input_var_desc = block_desc.FindVar(input); + PADDLE_ENFORCE(input_var_desc, + "Cant find %s variable When runing Anakin Softmax converter.", + input); + auto input_shape_in_fluid = input_var_desc->GetShape(); + size_t input_dims = input_shape_in_fluid.size(); + + this->engine_->AddOp(op_name, "Softmax", {input}, {output}); + this->engine_->AddOpAttr(op_name, "axis", static_cast(input_dims - 1)); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(softmax, SoftMaxOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/softmax.h b/paddle/fluid/inference/anakin/convert/softmax.h new file mode 100644 index 00000000..dc431b5b --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/softmax.h @@ -0,0 +1,37 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +class SoftMaxOpConverter : public AnakinOpConverter { + public: + SoftMaxOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, + const framework::Scope &scope, + bool test_mode) override; + virtual ~SoftMaxOpConverter() {} +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/split.cc b/paddle/fluid/inference/anakin/convert/split.cc new file mode 100644 index 00000000..e63edea9 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/split.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/split.h" +#include +#include + +using anakin::PTuple; + +namespace paddle { +namespace inference { +namespace anakin { + +template +void SplitOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + auto input_name = op_desc.Input("X").front(); + auto y_names = op_desc.Output("Out"); + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + int axis = boost::get(op_desc.GetAttr("axis")); + + std::vector output_lengths = + boost::get>(op_desc.GetAttr("sections")); + + int split_num = output_lengths.size(); + PADDLE_ENFORCE(split_num > 1, + "anakin split op converter: the split num should > 1"); + int num_sum = 0; + std::vector slice_point; + for (int i = 0; i < split_num - 1; i++) { + num_sum += output_lengths[i]; + slice_point.push_back(num_sum); + } + this->engine_->AddOp(op_name, "Slice", {input_name}, y_names); + this->engine_->AddOpAttr(op_name, "axis", axis); + this->engine_->template AddOpAttr>(op_name, "slice_point", + slice_point); + // slice_dim is useless in anakin + this->engine_->AddOpAttr(op_name, "slice_dim", 4); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(split, SplitOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/split.h b/paddle/fluid/inference/anakin/convert/split.h new file mode 100644 index 00000000..81991531 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/split.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +class SplitOpConverter : public AnakinOpConverter { + public: + SplitOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, + const framework::Scope &scope, + bool test_mode) override; + virtual ~SplitOpConverter() {} + + private: +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/sum.cc b/paddle/fluid/inference/anakin/convert/sum.cc new file mode 100644 index 00000000..870c0793 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/sum.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/sum.h" +#include +#include +#include + +using anakin::PTuple; + +namespace paddle { +namespace inference { +namespace anakin { + +template +void SumOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 2); + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto input_names = op_desc.Input("X"); + auto out_name = op_desc.Output("Out").front(); + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + + std::vector coeff = {1, 1}; + std::string elementwise_type = "Add"; + this->engine_->AddOp(op_name, "Eltwise", input_names, {out_name}); + this->engine_->template AddOpAttr>(op_name, "coeff", coeff); + this->engine_->template AddOpAttr(op_name, "type", + elementwise_type); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(sum, SumOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/sum.h b/paddle/fluid/inference/anakin/convert/sum.h new file mode 100644 index 00000000..aefc64c6 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/sum.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +class SumOpConverter : public AnakinOpConverter { + public: + SumOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, + const framework::Scope &scope, + bool test_mode) override; + virtual ~SumOpConverter() {} + + private: +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/test_activation_op.cc b/paddle/fluid/inference/anakin/convert/test_activation_op.cc new file mode 100644 index 00000000..5ac8b458 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/test_activation_op.cc @@ -0,0 +1,92 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/anakin/convert/activation.h" +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#include "paddle/fluid/inference/anakin/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +static void test_activation_op(const std::string& op_type, + const platform::DeviceContext& context, + bool use_gpu) { + std::unordered_set parameters; + framework::Scope scope; + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); + validator.DeclInputVar("act-X", {10, 6, 1, 1}); + validator.DeclOutputVar("act-Out", {10, 6, 1, 1}); + framework::OpDesc desc; + desc.SetType(op_type); + desc.SetInput("X", {"act-X"}); + desc.SetOutput("Out", {"act-Out"}); + + if (op_type == "swish") { + desc.SetAttr("beta", 1.0f); + } + + if (op_type == "relu6") { + desc.SetAttr("threshold", 6.0f); + } + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + + validator.Execute(5); +} + +#ifdef PADDLE_WITH_CUDA +TEST(sigm_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_activation_op<::anakin::saber::NV>("sigmoid", ctx, true); +} + +TEST(tanh_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_activation_op<::anakin::saber::NV>("tanh", ctx, true); +} + +TEST(relu6_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_activation_op<::anakin::saber::NV>("relu6", ctx, true); +} + +TEST(swish_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_activation_op<::anakin::saber::NV>("swish", ctx, true); +} +#endif + +} // namespace anakin +} // namespace inference +} // namespace paddle + +USE_OP(sigmoid); +USE_OP(tanh); +USE_OP(relu6); +USE_OP(swish); + +USE_ANAKIN_CONVERTER(sigmoid); +USE_ANAKIN_CONVERTER(tanh); +USE_ANAKIN_CONVERTER(relu6); +USE_ANAKIN_CONVERTER(swish); diff --git a/paddle/fluid/inference/anakin/convert/test_affine_channel_op.cc b/paddle/fluid/inference/anakin/convert/test_affine_channel_op.cc new file mode 100644 index 00000000..008537dc --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/test_affine_channel_op.cc @@ -0,0 +1,72 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/anakin/convert/affine_channel.h" +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#include "paddle/fluid/inference/anakin/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +void test_affine_channel_op(const platform::DeviceContext& context, + bool use_gpu) { + // Declare the difference between the inputs. + std::unordered_set parameters({"scale", "bias"}); + + framework::Scope scope; + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); + validator.DeclInputVar("x", {1, 3, 5, 2}); + validator.DeclOutputVar("out", {1, 3, 5, 2}); + validator.DeclParamVar("scale", {3}); + validator.DeclParamVar("bias", {3}); + + // Prepare Op descriptions. + framework::OpDesc desc; + desc.SetType("affine_channel"); + desc.SetInput("X", {"x"}); + desc.SetInput("Bias", {"bias"}); + desc.SetInput("Scale", {"scale"}); + desc.SetOutput("Out", {"out"}); + + // Layout must be explicitly specified here as NCHW. + desc.SetAttr("data_layout", std::string("NCHW")); + + validator.SetOp(*desc.Proto()); + validator.Execute(1); +} + +#ifdef PADDLE_WITH_CUDA +TEST(affine_channel_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_affine_channel_op<::anakin::saber::NV>(ctx, true); +} +#endif +#ifdef ANAKIN_X86_PLACE +TEST(affine_channel_op, cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_affine_channel_op<::anakin::saber::X86>(ctx, false); +} +#endif +} // namespace anakin +} // namespace inference +} // namespace paddle + +USE_OP(affine_channel); +USE_ANAKIN_CONVERTER(affine_channel); diff --git a/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc b/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc new file mode 100644 index 00000000..edba9023 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc @@ -0,0 +1,87 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/anakin/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +void test_batchnorm_op(const platform::DeviceContext& context, bool use_gpu) { + std::unordered_set parameters( + {"batch_norm_scale", "batch_norm_bias", "batch_norm_mean", + "batch_norm_variance"}); + framework::Scope scope; + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); + std::vector param_shape{2}; + + validator.DeclInputVar("batch_norm_X", {1, 2, 5, 5}); + validator.DeclParamVar("batch_norm_scale", param_shape); + validator.DeclParamVar("batch_norm_bias", param_shape); + validator.DeclParamVar("batch_norm_mean", param_shape); + validator.DeclParamVar("batch_norm_variance", param_shape); + validator.DeclOutputVar("batch_norm_Y", {1, 2, 5, 5}); + validator.DeclOutputVar("batch_norm_save_mean", param_shape); + validator.DeclOutputVar("batch_norm_save_variance", param_shape); + + // Prepare Op description + framework::OpDesc desc; + + desc.SetType("batch_norm"); + desc.SetInput("X", {"batch_norm_X"}); + desc.SetInput("Scale", {"batch_norm_scale"}); + desc.SetInput("Bias", {"batch_norm_bias"}); + desc.SetInput("Mean", {"batch_norm_mean"}); + desc.SetInput("Variance", {"batch_norm_variance"}); + desc.SetOutput("Y", {"batch_norm_Y"}); + desc.SetOutput("MeanOut", {"batch_norm_mean"}); + desc.SetOutput("VarianceOut", {"batch_norm_variance"}); + desc.SetOutput("SavedMean", {"batch_norm_save_mean"}); + desc.SetOutput("SavedVariance", {"batch_norm_save_variance"}); + + float eps = 1e-5f; + bool is_test = true; + desc.SetAttr("epsilon", eps); + desc.SetAttr("is_test", is_test); + + validator.SetOp(*desc.Proto()); + + std::unordered_set neglected_output = { + "batch_norm_save_mean", "batch_norm_save_variance", "batch_norm_mean", + "batch_norm_variance"}; + validator.Execute(1, neglected_output); +} + +#ifdef PADDLE_WITH_CUDA +TEST(batch_norm_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_batchnorm_op<::anakin::saber::NV>(ctx, true); +} +#endif +#ifdef ANAKIN_X86_PLACE +TEST(batch_norm_op, cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_batchnorm_op<::anakin::saber::X86>(ctx, false); +} +#endif +} // namespace anakin +} // namespace inference +} // namespace paddle +USE_OP(batch_norm); +USE_ANAKIN_CONVERTER(batch_norm); diff --git a/paddle/fluid/inference/anakin/convert/test_concat_op.cc b/paddle/fluid/inference/anakin/convert/test_concat_op.cc new file mode 100644 index 00000000..6870260c --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/test_concat_op.cc @@ -0,0 +1,67 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/anakin/convert/concat.h" +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#include "paddle/fluid/inference/anakin/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +void test_concat_op(const platform::DeviceContext& context, bool use_gpu) { + std::unordered_set parameters({""}); + framework::Scope scope; + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); + validator.DeclInputVar("concat_x1", {1, 2, 1, 1}); + validator.DeclInputVar("concat_x2", {1, 3, 1, 1}); + validator.DeclInputVar("concat_x3", {1, 1, 1, 1}); + validator.DeclOutputVar("concat_out", {1, 6, 1, 1}); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("concat"); + desc.SetInput("X", {"concat_x1", "concat_x2", "concat_x3"}); + desc.SetOutput("Out", {"concat_out"}); + + int axis = 1; + desc.SetAttr("axis", axis); + + validator.SetOp(*desc.Proto()); + + validator.Execute(1); +} + +#ifdef PADDLE_WITH_CUDA +TEST(concat_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_concat_op<::anakin::saber::NV>(ctx, true); +} +#endif +#ifdef ANAKIN_X86_PLACE +TEST(concat_op, cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_concat_op<::anakin::saber::X86>(ctx, false); +} +#endif +} // namespace anakin +} // namespace inference +} // namespace paddle +USE_OP(concat); +USE_ANAKIN_CONVERTER(concat); diff --git a/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc b/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc new file mode 100644 index 00000000..723a348b --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/test_conv2d_op.cc @@ -0,0 +1,75 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/anakin/convert/conv2d.h" +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#include "paddle/fluid/inference/anakin/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +void test_conv2d_op(const platform::DeviceContext& context, bool use_gpu) { + std::unordered_set parameters({"conv2d-Y"}); + framework::Scope scope; + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); + validator.DeclInputVar("conv2d-X", {1, 3, 3, 3}); + validator.DeclParamVar("conv2d-Y", {4, 3, 1, 1}); + validator.DeclOutputVar("conv2d-Out", {1, 4, 3, 3}); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("conv2d"); + desc.SetInput("Input", {"conv2d-X"}); + desc.SetInput("Filter", {"conv2d-Y"}); + desc.SetOutput("Output", {"conv2d-Out"}); + + const std::vector strides({1, 1}); + const std::vector paddings({0, 0}); + const std::vector dilations({1, 1}); + const int groups = 1; + + desc.SetAttr("strides", strides); + desc.SetAttr("paddings", paddings); + desc.SetAttr("dilations", dilations); + desc.SetAttr("groups", groups); + + validator.SetOp(*desc.Proto()); + + validator.Execute(3); +} + +#ifdef PADDLE_WITH_CUDA +TEST(conv2d_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_conv2d_op<::anakin::saber::NV>(ctx, true); +} +#endif +#ifdef ANAKIN_X86_PLACE +TEST(conv2d_op, cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_conv2d_op<::anakin::saber::X86>(ctx, false); +} +#endif +} // namespace anakin +} // namespace inference +} // namespace paddle + +USE_OP(conv2d); +USE_ANAKIN_CONVERTER(conv2d); diff --git a/paddle/fluid/inference/anakin/convert/test_dropout_op.cc b/paddle/fluid/inference/anakin/convert/test_dropout_op.cc new file mode 100644 index 00000000..83792676 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/test_dropout_op.cc @@ -0,0 +1,69 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/anakin/convert/dropout.h" +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#include "paddle/fluid/inference/anakin/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +void test_dropout_op(const platform::DeviceContext& context, bool use_gpu) { + std::unordered_set parameters; + framework::Scope scope; + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); + validator.DeclInputVar("x", {1, 1, 2, 2}); + validator.DeclOutputVar("out", {1, 1, 2, 2}); + validator.DeclOutputVar("mask", {1, 1, 2, 2}); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("dropout"); + desc.SetInput("X", {"x"}); + desc.SetOutput("Out", {"out"}); + desc.SetOutput("Mask", {"mask"}); + + float dropout_prob = 0.5; + desc.SetAttr("dropout_prob", dropout_prob); + desc.SetAttr("is_test", true); + + validator.SetOp(*desc.Proto()); + std::unordered_set neglected_output = {"mask"}; + validator.Execute(1, neglected_output); +} + +#ifdef PADDLE_WITH_CUDA +TEST(dropout_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_dropout_op<::anakin::saber::NV>(ctx, true); +} +#endif +#ifdef ANAKIN_X86_PLACE +TEST(dropout_op, cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_dropout_op<::anakin::saber::X86>(ctx, false); +} +#endif +} // namespace anakin +} // namespace inference +} // namespace paddle + +USE_OP(dropout); +USE_ANAKIN_CONVERTER(dropout); diff --git a/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc b/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc new file mode 100644 index 00000000..ee128c1e --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/test_elementwise_op.cc @@ -0,0 +1,81 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/anakin/convert/elementwise.h" +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#include "paddle/fluid/inference/anakin/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +static void test_elementwise_op(const std::string& op_type, + const platform::DeviceContext& context, + bool use_gpu) { + std::unordered_set parameters; + framework::Scope scope; + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); + validator.DeclInputVar("x", {1, 1, 2, 2}); + validator.DeclInputVar("y", {1, 1, 2, 2}); + validator.DeclOutputVar("out", {1, 1, 2, 2}); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType(op_type); + desc.SetInput("X", {"x"}); + desc.SetInput("Y", {"y"}); + desc.SetOutput("Out", {"out"}); + + int axis = -1; + desc.SetAttr("axis", axis); + + validator.SetOp(*desc.Proto()); + validator.Execute(1); +} + +#ifdef PADDLE_WITH_CUDA +TEST(elementwise_op, native_add_gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_elementwise_op<::anakin::saber::NV>("elementwise_add", ctx, true); +} +TEST(elementwise_op, native_mul_gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_elementwise_op<::anakin::saber::NV>("elementwise_mul", ctx, true); +} +#endif +#ifdef ANAKIN_X86_PLACE +TEST(elementwise_op, native_add_cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_elementwise_op<::anakin::saber::X86>("elementwise_add", ctx, false); +} +TEST(elementwise_op, native_mul_cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_elementwise_op<::anakin::saber::X86>("elementwise_mul", ctx, false); +} +#endif +} // namespace anakin +} // namespace inference +} // namespace paddle + +USE_OP(elementwise_add); +USE_OP(elementwise_mul); +USE_ANAKIN_CONVERTER(elementwise_add); +USE_ANAKIN_CONVERTER(elementwise_mul); diff --git a/paddle/fluid/inference/anakin/convert/test_fc_op.cc b/paddle/fluid/inference/anakin/convert/test_fc_op.cc new file mode 100644 index 00000000..3e68d8fe --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/test_fc_op.cc @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#include "paddle/fluid/inference/anakin/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +void test_mul_op(const platform::DeviceContext& context, bool use_gpu) { + std::unordered_set parameters({"mul_y"}); + framework::Scope scope; + + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); + validator.DeclInputVar("mul_x", {1, 1, 2, 2}); + validator.DeclParamVar("mul_y", {4, 2}); + validator.DeclOutputVar("mul_out", {1, 2}); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("mul"); + desc.SetInput("X", {"mul_x"}); + desc.SetInput("Y", {"mul_y"}); + desc.SetOutput("Out", {"mul_out"}); + validator.SetOp(*desc.Proto()); + + validator.Execute(10); +} + +#ifdef PADDLE_WITH_CUDA +TEST(mul_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_mul_op<::anakin::saber::NV>(ctx, true); +} +#endif +#ifdef ANAKIN_X86_PLACE +TEST(mul_op, cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_mul_op<::anakin::saber::X86>(ctx, false); +} +#endif +} // namespace anakin +} // namespace inference +} // namespace paddle + +USE_OP(mul); +USE_ANAKIN_CONVERTER(fc); diff --git a/paddle/fluid/inference/anakin/convert/test_flatten_op.cc b/paddle/fluid/inference/anakin/convert/test_flatten_op.cc new file mode 100644 index 00000000..5e4cfdab --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/test_flatten_op.cc @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#include "paddle/fluid/inference/anakin/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +void test_flatten_op(const platform::DeviceContext& context, bool use_gpu) { + std::unordered_set parameters; + framework::Scope scope; + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); + validator.DeclInputVar("flatten-X", {3, 10, 10, 4}); + validator.DeclOutputVar("flatten-Out", {3, 400, 1, 1}); + framework::OpDesc desc; + desc.SetType("flatten"); + desc.SetInput("X", {"flatten-X"}); + desc.SetOutput("Out", {"flatten-Out"}); + desc.SetAttr("axis", 1); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + + validator.Execute(5); +} + +#ifdef PADDLE_WITH_CUDA +TEST(flatten_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_flatten_op<::anakin::saber::NV>(ctx, true); +} +#endif +#ifdef ANAKIN_X86_PLACE +TEST(flatten_op, cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_flatten_op<::anakin::saber::X86>(ctx, false); +} +#endif +} // namespace anakin +} // namespace inference +} // namespace paddle + +USE_OP(reshape); +USE_OP_ITSELF(flatten); +USE_ANAKIN_CONVERTER(flatten); diff --git a/paddle/fluid/inference/anakin/convert/test_im2sequence_op.cc b/paddle/fluid/inference/anakin/convert/test_im2sequence_op.cc new file mode 100644 index 00000000..5e576463 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/test_im2sequence_op.cc @@ -0,0 +1,55 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/anakin/convert/im2sequence.h" +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#include "paddle/fluid/inference/anakin/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +TEST(im2sequence_op, native) { + std::unordered_set parameters; + framework::Scope scope; + AnakinConvertValidation validator(parameters, &scope); + + std::vector kernels = {6, 1}; + std::vector strides = {1, 1}; + std::vector paddings = {0, 0, 0, 0}; + + validator.DeclInputVar("x", {1, 1, 2, 2}); + validator.DeclOutputVar("out", {1, 1 * kernels[0] * kernels[1]}); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("im2sequence"); + desc.SetInput("X", {"x"}); + desc.SetOutput("Out", {"out"}); + + desc.SetAttr("kernels", kernels); + desc.SetAttr("strides", strides); + desc.SetAttr("paddings", paddings); + + validator.SetOp(*desc.Proto()); + validator.Execute(1); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +USE_OP(im2sequence); +USE_ANAKIN_CONVERTER(im2sequence); diff --git a/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc b/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc new file mode 100644 index 00000000..9b23b5b9 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc @@ -0,0 +1,119 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ +#include +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#include "paddle/fluid/inference/anakin/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +void test_pool2d(const platform::DeviceContext& context, bool use_gpu, + bool global_pooling, bool ceil_mode, + std::string pool_type = "max") { + framework::Scope scope; + std::unordered_set parameters; + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); + + // The ITensor's Dims should not contain the batch size. + // So, the ITensor's Dims of input and output should be C * H * W. + validator.DeclInputVar("pool2d_x", {1, 3, 6, 7}); + if (global_pooling) + validator.DeclOutputVar("pool2d_out", {1, 3, 1, 1}); + else if (ceil_mode) + validator.DeclOutputVar("pool2d_out", {1, 3, 3, 4}); + else + validator.DeclOutputVar("pool2d_out", {1, 3, 3, 3}); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("pool2d"); + desc.SetInput("X", {"pool2d_x"}); + desc.SetOutput("Out", {"pool2d_out"}); + + std::vector ksize({2, 2}); + std::vector strides({2, 2}); + std::vector paddings({0, 0}); + std::string pooling_t = pool_type; + + desc.SetAttr("pooling_type", pooling_t); + desc.SetAttr("ksize", ksize); + desc.SetAttr("strides", strides); + desc.SetAttr("paddings", paddings); + desc.SetAttr("global_pooling", global_pooling); + desc.SetAttr("ceil_mode", ceil_mode); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + + validator.Execute(1); +} + +#ifdef PADDLE_WITH_CUDA +TEST(Pool2dOpConverter, normal) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_pool2d<::anakin::saber::NV>(ctx, true, false, false); +} +TEST(Pool2dOpConverter, test_global_pooling) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_pool2d<::anakin::saber::NV>(ctx, true, true, false); +} + +TEST(Pool2dOpConverter, max_ceil_test) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_pool2d<::anakin::saber::NV>(ctx, true, false, true); +} + +TEST(Pool2dOpConverter, avg_ceil_test) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_pool2d<::anakin::saber::NV>(ctx, true, false, true, "avg"); +} +#endif +#ifdef ANAKIN_X86_PLACE +TEST(Pool2dOpConverter, normal_cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_pool2d<::anakin::saber::X86>(ctx, false, false, false); +} +TEST(Pool2dOpConverter, test_global_pooling_cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_pool2d<::anakin::saber::X86>(ctx, false, true, false); +} + +TEST(Pool2dOpConverter, max_ceil_test_cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_pool2d<::anakin::saber::X86>(ctx, false, false, true); +} + +TEST(Pool2dOpConverter, avg_ceil_test_cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_pool2d<::anakin::saber::X86>(ctx, false, false, true, "avg"); +} +#endif +} // namespace anakin +} // namespace inference +} // namespace paddle + +USE_OP(pool2d); +USE_ANAKIN_CONVERTER(pool2d); diff --git a/paddle/fluid/inference/anakin/convert/test_relu_op.cc b/paddle/fluid/inference/anakin/convert/test_relu_op.cc new file mode 100644 index 00000000..eb6429f3 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/test_relu_op.cc @@ -0,0 +1,70 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#include "paddle/fluid/inference/anakin/convert/relu.h" +#include "paddle/fluid/inference/anakin/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +static void test_activation_op(const std::string& op_type, + const platform::DeviceContext& context, + bool use_gpu) { + std::unordered_set parameters; + framework::Scope scope; + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); + validator.DeclInputVar("act-X", {10, 6, 1, 1}); + validator.DeclOutputVar("act-Out", {10, 6, 1, 1}); + framework::OpDesc desc; + desc.SetType(op_type); + desc.SetInput("X", {"act-X"}); + desc.SetOutput("Out", {"act-Out"}); + if (op_type == "leaky_relu") { + desc.SetAttr("alpha", 0.1f); + } + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + + validator.Execute(5); +} + +#ifdef PADDLE_WITH_CUDA +TEST(relu_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_activation_op<::anakin::saber::NV>("relu", ctx, true); +} + +TEST(leaky_relu_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_activation_op<::anakin::saber::NV>("leaky_relu", ctx, true); +} +#endif + +} // namespace anakin +} // namespace inference +} // namespace paddle + +USE_OP(relu); +USE_OP(leaky_relu); +USE_ANAKIN_CONVERTER(relu); +USE_ANAKIN_CONVERTER(leaky_relu); diff --git a/paddle/fluid/inference/anakin/convert/test_reshape_op.cc b/paddle/fluid/inference/anakin/convert/test_reshape_op.cc new file mode 100644 index 00000000..b1be42e5 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/test_reshape_op.cc @@ -0,0 +1,102 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#include "paddle/fluid/inference/anakin/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +void test_reshape1_op(const platform::DeviceContext& context, bool use_gpu) { + framework::Scope scope; + std::unordered_set parameters; + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); + + // validator.DeclInputVar("reshape-X", {2, 3, 3, 1}); + // validator.DeclOutputVar("reshape-Out", {3, 2, 1, 3}); + validator.DeclInputVar("reshape-X", {1, 2, 4, 1}); + validator.DeclOutputVar("reshape-Out", {1, 8, 1, 1}); + + framework::OpDesc desc; + desc.SetType("reshape"); + desc.SetInput("X", {"reshape-X"}); + desc.SetOutput("Out", {"reshape-Out"}); + // desc.SetAttr("shape", std::vector({3, 2, 1, 3})); + desc.SetAttr("shape", std::vector({1, 8, 1, 1})); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + validator.Execute(1); +} + +template +void test_reshape2_op(const platform::DeviceContext& context, bool use_gpu) { + framework::Scope scope; + std::unordered_set parameters; + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); + + validator.DeclInputVar("reshape-X", {1, 2, 4}); + validator.DeclOutputVar("reshape-Out", {1, 4, 2}); + + framework::OpDesc desc; + desc.SetType("reshape"); + desc.SetInput("X", {"reshape-X"}); + desc.SetOutput("Out", {"reshape-Out"}); + // desc.SetAttr("shape", std::vector({3, 2, 1, 3})); + desc.SetAttr("shape", std::vector({0, -1, 2})); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + validator.Execute(1); +} + +#ifdef PADDLE_WITH_CUDA +TEST(reshape1_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_reshape1_op<::anakin::saber::NV>(ctx, true); +} + +TEST(reshape2_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_reshape2_op<::anakin::saber::NV>(ctx, true); +} +#endif +#ifdef ANAKIN_X86_PLACE +TEST(reshape1_op, cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_reshape2_op<::anakin::saber::X86>(ctx, false); +} + +TEST(reshape2_op, cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_reshape2_op<::anakin::saber::X86>(ctx, false); +} +#endif +} // namespace anakin +} // namespace inference +} // namespace paddle + +USE_OP(reshape); +USE_ANAKIN_CONVERTER(reshape); diff --git a/paddle/fluid/inference/anakin/convert/test_softmax_op.cc b/paddle/fluid/inference/anakin/convert/test_softmax_op.cc new file mode 100644 index 00000000..1a324739 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/test_softmax_op.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#include "paddle/fluid/inference/anakin/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +void test_softmax_op(const platform::DeviceContext& context, bool use_gpu) { + framework::Scope scope; + std::unordered_set parameters; + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); + + validator.DeclInputVar("softmax-X", {1, 10, 2}); + validator.DeclOutputVar("softmax-Out", {1, 10, 2}); + + framework::OpDesc desc; + desc.SetType("softmax"); + desc.SetInput("X", {"softmax-X"}); + desc.SetOutput("Out", {"softmax-Out"}); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + validator.Execute(1); +} + +#ifdef PADDLE_WITH_CUDA +TEST(softmax_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_softmax_op<::anakin::saber::NV>(ctx, true); +} +#endif +#ifdef ANAKIN_X86_PLACE +TEST(relu_op, cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_softmax_op<::anakin::saber::X86>(ctx, false); +} +#endif +} // namespace anakin +} // namespace inference +} // namespace paddle + +USE_OP(softmax); +USE_ANAKIN_CONVERTER(softmax); diff --git a/paddle/fluid/inference/anakin/convert/test_split_op.cc b/paddle/fluid/inference/anakin/convert/test_split_op.cc new file mode 100644 index 00000000..f9ef54fd --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/test_split_op.cc @@ -0,0 +1,119 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#include "paddle/fluid/inference/anakin/convert/split.h" +#include "paddle/fluid/inference/anakin/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +void AnakinSliceTest(const platform::DeviceContext &context, bool use_gpu, + const std::vector &in_shape, + const std::vector §ions) { + std::unordered_set parameters({""}); + framework::Scope scope; + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); + + validator.DeclInputVar("split_input", in_shape); + std::vector output_vars; + for (size_t i = 0; i < sections.size(); ++i) { + auto out_shape = in_shape; + out_shape[Axis] = sections[i]; + std::string output_name = "split_out" + std::to_string(i); + validator.DeclOutputVar(output_name, out_shape); + output_vars.push_back(output_name); + } + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("split"); + desc.SetInput("X", {"split_input"}); + desc.SetOutput("Out", output_vars); + + desc.SetAttr("axis", Axis); + desc.SetAttr("num", 0); + desc.SetAttr("sections", sections); + + validator.SetOp(*desc.Proto()); + + validator.Execute(1); +} + +// batch = 0, axis = 1, same shape +TEST(split_op, test_same_shape_axis1_batch1) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + AnakinSliceTest<::anakin::saber::NV, 1>(ctx, true, {1, 4, 2, 2}, {2, 2}); +} +// batch = 0, axis = 1, different shape +TEST(split_op, test_different_shape_axis1_batch1) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + AnakinSliceTest<::anakin::saber::NV, 1>(ctx, true, {1, 3, 2, 2}, {2, 1}); +} +// batch = 0, axis = 2, same shape +TEST(split_op, test_same_shape_axis2_batch1) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + AnakinSliceTest<::anakin::saber::NV, 2>(ctx, true, {1, 3, 4, 2}, {2, 2}); +} +// batch = 0, axis = 2, different shape +TEST(split_op, test_different_shape_axis2_batch1) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + AnakinSliceTest<::anakin::saber::NV, 2>(ctx, true, {1, 3, 3, 2}, {2, 1}); +} + +// batch = 0, axis = 3, same shape +TEST(split_op, test_same_shape_axis3_batch1) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + AnakinSliceTest<::anakin::saber::NV, 3>(ctx, true, {1, 3, 2, 4}, {2, 2}); +} +// batch = 0, axis = 3, different shape +TEST(split_op, test_different_shape_axis3_batch1) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + AnakinSliceTest<::anakin::saber::NV, 3>(ctx, true, {1, 3, 2, 3}, {2, 1}); +} +#ifdef ANAKIN_X86_PLACE +TEST(split_op, test_different_shape_axis1_batch1_cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + AnakinSliceTest<::anakin::saber::X86, 1>(ctx, false, {1, 3, 2, 3}, {2, 1}); +} + +TEST(split_op, test_different_shape_axis2_batch1_cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + AnakinSliceTest<::anakin::saber::X86, 2>(ctx, false, {1, 3, 4, 2}, {2, 2}); +} + +TEST(split_op, test_different_shape_axis3_batch1_cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + AnakinSliceTest<::anakin::saber::X86, 3>(ctx, false, {1, 3, 2, 4}, {2, 2}); +} +#endif +} // namespace anakin +} // namespace inference +} // namespace paddle + +USE_OP(split); +USE_ANAKIN_CONVERTER(split); diff --git a/paddle/fluid/inference/anakin/convert/test_sum_op.cc b/paddle/fluid/inference/anakin/convert/test_sum_op.cc new file mode 100644 index 00000000..9d26430e --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/test_sum_op.cc @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#include "paddle/fluid/inference/anakin/convert/sum.h" +#include "paddle/fluid/inference/anakin/convert/ut_helper.h" +#include "paddle/fluid/operators/sum_op.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +static void test_sum_op(const platform::DeviceContext& context, bool use_gpu) { + std::unordered_set parameters; + framework::Scope scope; + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); + validator.DeclInputVar("sum_x1", {1, 2, 1, 2}); + validator.DeclInputVar("sum_x2", {1, 2, 1, 2}); + validator.DeclOutputVar("sum_out", {1, 2, 1, 2}); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("sum"); + desc.SetInput("X", {"sum_x1", "sum_x2"}); + desc.SetOutput("Out", {"sum_out"}); + + validator.SetOp(*desc.Proto()); + validator.Execute(1); +} + +#ifdef PADDLE_WITH_CUDA +TEST(sum_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_sum_op<::anakin::saber::NV>(ctx, true); +} +#endif +#ifdef ANAKIN_X86_PLACE +TEST(sum_op, cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_sum_op<::anakin::saber::X86>(ctx, false); +} +#endif +} // namespace anakin +} // namespace inference +} // namespace paddle + +USE_OP(sum); +USE_ANAKIN_CONVERTER(sum); diff --git a/paddle/fluid/inference/anakin/convert/test_transpose_op.cc b/paddle/fluid/inference/anakin/convert/test_transpose_op.cc new file mode 100644 index 00000000..466e2f1a --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/test_transpose_op.cc @@ -0,0 +1,100 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#include "paddle/fluid/inference/anakin/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +void test_transpose1_op(const platform::DeviceContext& context, bool use_gpu) { + std::unordered_set parameters; + framework::Scope scope; + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); + validator.DeclInputVar("transpose-X", {2, 3, 4, 5}); + validator.DeclOutputVar("transpose-Out", {4, 2, 5, 3}); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("transpose"); + desc.SetInput("X", {"transpose-X"}); + desc.SetOutput("Out", {"transpose-Out"}); + desc.SetAttr("axis", std::vector({2, 0, 3, 1})); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + + validator.Execute(3); +} + +template +void test_transpose2_op(const platform::DeviceContext& context, bool use_gpu) { + std::unordered_set parameters; + framework::Scope scope; + AnakinConvertValidation validator( + parameters, &scope, context, use_gpu); + validator.DeclInputVar("transpose-X", {3, 4, 5}); + validator.DeclOutputVar("transpose-Out", {3, 5, 4}); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("transpose"); + desc.SetInput("X", {"transpose-X"}); + desc.SetOutput("Out", {"transpose-Out"}); + desc.SetAttr("axis", std::vector({0, 2, 1})); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + + validator.Execute(1); +} + +#ifdef PADDLE_WITH_CUDA +TEST(transpose1_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_transpose1_op<::anakin::saber::NV>(ctx, true); +} + +TEST(transpose2_op, gpu) { + platform::CUDAPlace gpu_place(0); + platform::CUDADeviceContext ctx(gpu_place); + test_transpose2_op<::anakin::saber::NV>(ctx, true); +} +#endif +#ifdef ANAKIN_X86_PLACE +TEST(transpose1_op, cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_transpose2_op<::anakin::saber::X86>(ctx, false); +} + +TEST(transpose2_op, cpu) { + platform::CPUPlace cpu_place; + platform::CPUDeviceContext ctx(cpu_place); + test_transpose2_op<::anakin::saber::X86>(ctx, false); +} +#endif +} // namespace anakin +} // namespace inference +} // namespace paddle + +USE_OP(transpose); +USE_ANAKIN_CONVERTER(transpose); diff --git a/paddle/fluid/inference/anakin/convert/transpose.cc b/paddle/fluid/inference/anakin/convert/transpose.cc new file mode 100644 index 00000000..28071ca8 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/transpose.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/transpose.h" +#include +#include +#include + +using anakin::PTuple; + +namespace paddle { +namespace inference { +namespace anakin { + +template +void TransposeOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto input = op_desc.Input("X").front(); + auto output = op_desc.Output("Out").front(); + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + this->engine_->AddOp(op_name, "Permute", {input}, {output}); + + auto axis = boost::get>(op_desc.GetAttr("axis")); + size_t axis_size = axis.size(); + while (axis.size() < 4) { + axis.push_back(axis_size); + axis_size += 1; + } + this->engine_->template AddOpAttr>(op_name, "dims", axis); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(transpose, TransposeOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/transpose.h b/paddle/fluid/inference/anakin/convert/transpose.h new file mode 100644 index 00000000..b7b0a0f2 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/transpose.h @@ -0,0 +1,37 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +template +class TransposeOpConverter : public AnakinOpConverter { + public: + TransposeOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, + const framework::Scope &scope, + bool test_mode) override; + virtual ~TransposeOpConverter() {} +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/ut_helper.h b/paddle/fluid/inference/anakin/convert/ut_helper.h new file mode 100644 index 00000000..92441f25 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/ut_helper.h @@ -0,0 +1,227 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#include "paddle/fluid/inference/anakin/engine.h" +#include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/inference/utils/singleton.h" +#include "paddle/fluid/platform/enforce.h" + +using anakin::Precision; + +namespace paddle { +namespace inference { +namespace anakin { + +/* + * Get a random float value between [low, high] + */ +float random(float low, float high) { + static std::random_device rd; + static std::mt19937 mt(rd()); + std::uniform_real_distribution dist(low, high); + return dist(mt); +} + +void RandomizeTensor(framework::LoDTensor* tensor, + const platform::Place& place) { + auto dims = tensor->dims(); + size_t num_elements = analysis::AccuDims(dims, dims.size()); + PADDLE_ENFORCE_GT(num_elements, 0); + + platform::CPUPlace cpu_place; + framework::LoDTensor temp_tensor; + temp_tensor.Resize(dims); + auto* temp_data = temp_tensor.mutable_data(cpu_place); + + for (size_t i = 0; i < num_elements; i++) { + *(temp_data + i) = random(0., 1.); + } + + TensorCopySync(temp_tensor, place, tensor); +} + +/* + * Help to validate the correctness between Fluid Op and the corresponding + * anakin + * layer. + */ +template +class AnakinConvertValidation { + using AnakinNvEngineT = AnakinEngine; + + public: + AnakinConvertValidation() = delete; + + AnakinConvertValidation(const std::unordered_set& parameters, + framework::Scope* scope, + const platform::DeviceContext& ctx, + bool use_gpu = true) + : parameters_(parameters), scope_(scope), ctx_(ctx), use_gpu_(use_gpu) { + engine_.reset(new AnakinEngine(true)); + } + + // Declare a Variable as input with random initialization. + void DeclInputVar(const std::string& name, + const std::vector tensor_dims) { + DeclVar(name, tensor_dims); + // should decalre anakin input here. + } + + void DeclParamVar(const std::string& name, const std::vector dim_vec) { + DeclVar(name, dim_vec); + } + + void DeclOutputVar(const std::string& name, const std::vector dim_vec) { + DeclVar(name, dim_vec); + // should declare anakin output here. + } + + void DeclVar(const std::string& name, const std::vector dim_vec) { + auto* x = scope_->Var(name); + auto* x_tensor = x->GetMutable(); + x_tensor->Resize(framework::make_ddim(dim_vec)); + RandomizeTensor(x_tensor, ctx_.GetPlace()); + + std::vector dim_vec_int64; + for (auto& ele : dim_vec) { + dim_vec_int64.push_back(static_cast(ele)); + } + + // Add var_desc to block_desc + auto* block_desc = program_desc_.MutableBlock(framework::kRootBlockIndex); + + auto* var_desc = block_desc->Var(name); + var_desc->SetShape(dim_vec_int64); + } + + void SetOp(const framework::proto::OpDesc& desc) { + op_ = framework::OpRegistry::CreateOp(desc); + op_desc_.reset(new framework::OpDesc(desc, nullptr)); + // should init anakin engine here. + + auto& block_desc = program_desc_.Block(framework::kRootBlockIndex); + Singleton>::Global().ConvertOp( + desc, block_desc, parameters_, *scope_, engine_.get(), + true /*test_mode*/); + engine_->Freeze(); + + std::map> temp_max_input_shape; + for (const auto& input : op_desc_->InputArgumentNames()) { + if (parameters_.count(input)) continue; + auto& t = inference::analysis::GetFromScope(*scope_, + input); + auto t_shape = framework::vectorize2int(t.dims()); + while (t_shape.size() < 4) { + t_shape.push_back(1); + } + engine_->SetInputShape(input, t_shape); + temp_max_input_shape[input] = t_shape; + } + engine_->SetMaxInputShape(temp_max_input_shape); + engine_->Optimize(); + engine_->InitNet(); + } + + // We use the set 'neglected_output' here, because some Ops like batch norm, + // the outputs specified in the op des are only used during training, + // so we should neglect those output during inference. + void Execute(int batch_size, + std::unordered_set neglected_output = {}) { + // Execute Fluid Op + op_->Run(*scope_, ctx_.GetPlace()); + + std::map inputs; + for (const auto& input : op_desc_->InputArgumentNames()) { + if (parameters_.count(input)) continue; + auto* var = scope_->FindVar(input); + auto tensor = var->GetMutable(); + inputs.insert({input, tensor}); + } + + std::map outputs; + std::vector> fluid_outputs; + for (const auto& output : op_desc_->OutputArgumentNames()) { + if (neglected_output.count(output)) continue; + std::vector fluid_out; + auto* var = scope_->FindVar(output); + auto tensor = var->GetMutable(); + framework::TensorToVector(*tensor, ctx_, &fluid_out); + fluid_outputs.push_back(fluid_out); + + outputs.insert({output, tensor}); + } + + if (!use_gpu_) { + engine_->Execute(inputs, outputs); + } else { + cudaStream_t stream; + PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream), 0); + engine_->Execute(inputs, outputs, stream); + } + + int i_output = 0; + for (const auto& output : op_desc_->OutputArgumentNames()) { + if (neglected_output.count(output)) continue; + std::vector anakin_out; + auto* var = scope_->FindVar(output); + auto tensor = var->GetMutable(); + framework::TensorToVector(*tensor, ctx_, &anakin_out); + + size_t anakin_out_size = anakin_out.size(); + auto fluid_out = fluid_outputs[i_output++]; + for (size_t i = 0; i < anakin_out_size; i++) { + EXPECT_LT(std::abs(fluid_out[i] - anakin_out[i]), 1e-3); + } + } + } + + private: + std::unique_ptr engine_{nullptr}; + std::unique_ptr op_; + std::unique_ptr op_desc_; + framework::ProgramDesc program_desc_; + const std::unordered_set& parameters_; + framework::Scope* scope_; + const platform::DeviceContext& ctx_; + bool use_gpu_{true}; +}; + +template class AnakinConvertValidation<::anakin::saber::NV, + ::anakin::Precision::FP32>; +template class AnakinConvertValidation<::anakin::saber::NV, + ::anakin::Precision::INT8>; +#ifdef ANAKIN_X86_PLACE +template class AnakinConvertValidation<::anakin::saber::X86, + ::anakin::Precision::FP32>; +template class AnakinConvertValidation<::anakin::saber::X86, + ::anakin::Precision::INT8>; +#endif +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/engine.cc b/paddle/fluid/inference/anakin/engine.cc new file mode 100644 index 00000000..13f16c4c --- /dev/null +++ b/paddle/fluid/inference/anakin/engine.cc @@ -0,0 +1,207 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/engine.h" +#include +#include +#include +#include +#include "paddle/fluid/framework/ddim.h" + +using anakin::Precision; +using anakin::OpRunType; +using paddle::framework::LoDTensor; +template +using AnakinNetT = anakin::Net; + +template +using AnakinGraphT = anakin::graph::Graph; + +namespace paddle { +namespace inference { +namespace anakin { + +template +extern std::once_flag + AnakinEngine::init_anakin_; + +template +AnakinEngine::AnakinEngine( + bool need_summary, int device, int max_batch_size, + std::map> max_input_shape, + std::vector program_inputs, bool auto_config_layout) + : device_(device), + max_batch_size_(max_batch_size), + max_input_shape_(max_input_shape), + program_inputs_(program_inputs), + auto_config_layout_(auto_config_layout) { + ::anakin::TargetWrapper::set_device(device_); + std::call_once(init_anakin_, + [this]() { ::anakin::Env::env_init(); }); + graph_.reset(new AnakinGraphT()); + net_.reset(new AnakinNetT(need_summary)); +} + +template +AnakinEngine::~AnakinEngine() {} + +template +void AnakinEngine::SetInputShape( + const std::string &name, std::vector shape) { + graph_->AddOpAttr<::anakin::PTuple>(name, "input_shape", + std::move(shape)); +} + +template +void AnakinEngine::InitNet() { + net_->init(*graph_, auto_config_layout_); +} + +template +void AnakinEngine::AddOp( + const std::string &name, const std::string &type, + const std::vector &inputs, + const std::vector &outputs) { + PADDLE_ENFORCE(graph_->AddOp(name, type, inputs, outputs), "Add operation."); +} + +template +void AnakinEngine::BindInput( + const std::map &inputs) { +#ifdef PADDLE_WITH_CUDA + cudaDeviceSynchronize(); +#endif + for (const auto &input : inputs) { + auto *tensor = input.second; + auto *data = tensor->data(); + + auto fluid_input_shape = framework::vectorize2int(tensor->dims()); + while (fluid_input_shape.size() < 4) { + fluid_input_shape.push_back(1); + } + auto *anakin_input = net_->get_in(input.first); + std::vector max_input_shape = max_input_shape_[input.first]; + int max_shape_sum = + std::accumulate(max_input_shape.begin(), max_input_shape.end(), 1, + std::multiplies()); + if (tensor->numel() > max_shape_sum) { + PADDLE_ENFORCE(std::find(program_inputs_.begin(), program_inputs_.end(), + input.first) == program_inputs_.end(), + "The anakin input max shape should be greater than" + " or equal to the real input shape, Please set the max " + "input shape using EnableAnakinEngine"); + VLOG(3) << "Anakin Net will be reset because of the inputs out of range: " + << input.first; + graph_->Reshape(input.first, fluid_input_shape); + net_.reset(new AnakinNetT(true)); + net_->init(*graph_); + anakin_input = net_->get_in(input.first); + } + anakin_input->reshape(fluid_input_shape); + ::anakin::saber::Tensor tmp_anakin_tensor(data, TargetT(), device_, + fluid_input_shape); + anakin_input->copy_from(tmp_anakin_tensor); + } +} + +template +void AnakinEngine::Execute( + const std::map &inputs, + const std::map &outputs) { + BindInput(inputs); + net_->prediction(); + for (const auto &output : outputs) { + platform::CPUPlace cpu_place; + auto *tensor = output.second; + auto *anakin_output = net_->get_out(output.first); + auto *anakin_data = anakin_output->data(); + auto anakin_output_shape = anakin_output->valid_shape(); + tensor->Resize(framework::make_ddim(anakin_output_shape)); + auto *fluid_data = tensor->mutable_data(cpu_place); + memory::Copy(cpu_place, static_cast(fluid_data), cpu_place, + static_cast(anakin_data), + tensor->numel() * sizeof(float)); + } +} + +#ifdef PADDLE_WITH_CUDA +template +void AnakinEngine::Execute( + const std::map &inputs, + const std::map &outputs, + cudaStream_t stream) { + BindInput(inputs); + net_->prediction(); + cudaDeviceSynchronize(); + for (const auto &output : outputs) { + platform::CUDAPlace gpu_place(device_); + auto *tensor = output.second; + auto *anakin_output = net_->get_out(output.first); + auto *anakin_data = anakin_output->data(); + auto anakin_output_shape = anakin_output->valid_shape(); + tensor->Resize(framework::make_ddim(anakin_output_shape)); + auto *fluid_data = tensor->mutable_data(gpu_place); + memory::Copy(gpu_place, static_cast(fluid_data), gpu_place, + static_cast(anakin_data), + tensor->numel() * sizeof(float), stream); + } + cudaDeviceSynchronize(); +} +#endif + +template +void AnakinEngine::Freeze() { + PADDLE_ENFORCE(graph_->Freeze(), "Freeze anakin subgraph."); +} + +template +void AnakinEngine::Optimize() { + PADDLE_ENFORCE(graph_->Optimize(), "Graph optimization."); +} + +template +void AnakinEngine::RegistBlock( + ::anakin::PBlock *block_p) { + PADDLE_ENFORCE(graph_->RegistBlock(block_p), "Block register."); +} + +template +std::unique_ptr> +AnakinEngine::Clone() { + auto *engine = new AnakinEngine(); + engine->net_ = std::move(net_->Clone()); + return std::unique_ptr(engine); +} + +#ifdef PADDLE_WITH_CUDA +template class AnakinEngine<::anakin::saber::NV, ::anakin::Precision::FP32>; +template class AnakinEngineManager<::anakin::saber::NV, + ::anakin::Precision::FP32>; + +template class AnakinEngine<::anakin::saber::NV, ::anakin::Precision::INT8>; +template class AnakinEngineManager<::anakin::saber::NV, + ::anakin::Precision::INT8>; +#endif +#ifdef ANAKIN_X86_PLACE +template class AnakinEngine<::anakin::saber::X86, ::anakin::Precision::FP32>; +template class AnakinEngineManager<::anakin::saber::X86, + ::anakin::Precision::FP32>; +template class AnakinEngine<::anakin::saber::X86, ::anakin::Precision::INT8>; +template class AnakinEngineManager<::anakin::saber::X86, + ::anakin::Precision::INT8>; +#endif +// template class AnakinEngine<::anakin::saber::X86, ::anakin::Precision::FP32>; +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/engine.h b/paddle/fluid/inference/anakin/engine.h new file mode 100644 index 00000000..e62bb82f --- /dev/null +++ b/paddle/fluid/inference/anakin/engine.h @@ -0,0 +1,168 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/inference/engine.h" +#include "paddle/fluid/inference/utils/singleton.h" +#ifdef EXIT // NOLINT +#undef EXIT // NOLINT +#endif // NOLINT +#include "framework/core/net/net.h" +#include "framework/core/types.h" +#include "framework/graph/graph.h" +#include "framework/graph/graph_global_mem.h" +#include "saber/saber_types.h" + +using anakin::Precision; + +namespace anakin { + +template +class Net; + +namespace graph { +template +class Graph; +} // namespace graph +} // namespace anakin + +namespace paddle { +namespace inference { +namespace anakin { + +template +class AnakinEngine { + using NetT = ::anakin::Net; + using GraphT = ::anakin::graph::Graph; + + public: + explicit AnakinEngine( + bool need_summary = false, int device = 0, int max_batch_size = 1, + std::map> max_input_shape = {}, + std::vector program_inputs = {}, + bool auto_config_layout = false); + ~AnakinEngine(); + void InitNet(); + void SetInputShape(const std::string &name, std::vector shape); + void AddOp(const std::string &name, const std::string &type, + const std::vector &inputs, + const std::vector &outputs); + + template + void AddOpAttr(const std::string &op_name, const std::string &attr_name, + const T &attr_value) { + PADDLE_ENFORCE(graph_->AddOpAttr(op_name, attr_name, attr_value), + "Add operation's attribution."); + } + NetT *Net() { return net_.get(); } + GraphT *Graph() { return graph_.get(); } + std::unique_ptr Clone(); + const std::map> &GetMaxInputShape() { + return max_input_shape_; + } + void SetMaxInputShape(std::map> shape) { + max_input_shape_ = shape; + } + const std::vector &GetScalableInputs() { + return program_inputs_; + } + void SetScalableInputs(std::vector program_inputs) { + program_inputs_ = program_inputs; + } + int GetMaxBatchSize() { return max_batch_size_; } + void Freeze(); + void Optimize(); + void RegistBlock(::anakin::PBlock *block_p); + void Save(std::string path) { graph_->save(path); } + bool IsInit() { return initialized_; } + int GetDevice() { return device_; } + void AddTensorScale(const std::string &tensor_name, float scale) { + tensor_scales_[tensor_name] = scale; + } + std::unordered_map GetTensorScales() { + return tensor_scales_; + } + void Execute(const std::map &inputs, + const std::map &outputs); +#ifdef PADDLE_WITH_CUDA + void Execute(const std::map &inputs, + const std::map &outputs, + cudaStream_t stream); +#endif + + private: + void BindInput(const std::map &inputs); + + private: + bool initialized_{false}; + int device_; + int max_batch_size_; + std::map> max_input_shape_; + std::vector program_inputs_; + std::unique_ptr graph_; + std::unique_ptr net_; + static std::once_flag init_anakin_; + std::unordered_map tensor_scales_; + // Always be false in gpu mode but true in most cpu cases. + bool auto_config_layout_; +}; + +template +class AnakinEngineManager { + using AnakinEngineT = AnakinEngine; + + public: + bool HasEngine(const std::string &name) const { + if (engines_.count(name) == 0) return false; + return engines_.at(name).get() != nullptr; + } + AnakinEngineT *Get(const std::string &name) const { + return engines_.at(name).get(); + } + + AnakinEngineT *Create(bool need_summary, int device, int max_batch_size, + std::map> max_input_shape, + std::vector program_inputs, + bool auto_config_layout, std::string engine_name) { + std::unique_lock lk(mut_); + auto *p = new AnakinEngine( + need_summary, device, max_batch_size, max_input_shape, program_inputs, + auto_config_layout); + engines_[engine_name].reset(p); + return p; + } + + void DeleteALL() { + for (auto &item : engines_) { + item.second.reset(nullptr); + } + } + + private: + std::unordered_map> engines_; + std::mutex mut_; +}; +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/op_teller.cc b/paddle/fluid/inference/anakin/op_teller.cc new file mode 100644 index 00000000..67b77122 --- /dev/null +++ b/paddle/fluid/inference/anakin/op_teller.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/op_teller.h" + +namespace paddle { +namespace inference { +namespace anakin { + +// Just tell by the op_types. +struct SimpleOpTypeSetTeller : public Teller { + SimpleOpTypeSetTeller() { + teller_set.insert("mul"); + teller_set.insert("fc"); + teller_set.insert("conv2d_fusion"); + teller_set.insert("split"); + teller_set.insert("relu"); + teller_set.insert("pool2d"); + teller_set.insert("elementwise_add"); + teller_set.insert("elementwise_mul"); + teller_set.insert("concat"); + teller_set.insert("tanh"); + teller_set.insert("conv2d"); + teller_set.insert("batch_norm"); + teller_set.insert("softmax"); + teller_set.insert("flatten2"); + teller_set.insert("reshape2"); + teller_set.insert("transpose2"); + teller_set.insert("density_prior_box"); + teller_set.insert("detection_out"); + teller_set.insert("dropout"); + teller_set.insert("sigmoid"); + teller_set.insert("sum"); + teller_set.insert("depthwise_conv2d"); + teller_set.insert("prior_box"); + teller_set.insert("leaky_relu"); + teller_set.insert("affine_channel"); + teller_set.insert("relu6"); + teller_set.insert("swish"); + teller_set.insert("shuffle_channel"); + } + + bool operator()(const std::string& op_type, + const framework::OpDesc& desc) override { + return teller_set.count(op_type); + } + + private: + std::unordered_set teller_set; +}; + +bool OpTeller::Tell(const std::string& op_type, const framework::OpDesc& desc) { + for (auto& teller : tellers_) { + if ((*teller)(op_type, desc)) return true; + } + return false; +} + +OpTeller::OpTeller() { tellers_.emplace_back(new SimpleOpTypeSetTeller); } + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/op_teller.h b/paddle/fluid/inference/anakin/op_teller.h new file mode 100644 index 00000000..15a42067 --- /dev/null +++ b/paddle/fluid/inference/anakin/op_teller.h @@ -0,0 +1,70 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include "paddle/fluid/framework/op_desc.h" + +namespace paddle { +namespace inference { +namespace anakin { + +/* + * Single Op teller definition. + * One can override this and define a more complex tell logic, considerring more + * issues such as op_desc. + */ +struct Teller { + virtual bool operator()(const std::string& op_type, + const framework::OpDesc& desc) = 0; + + virtual ~Teller() = default; +}; +/* + * A real example: + * + * struct SomeTeller : public Teller { + * bool operator()(const std::string& op_type, + * const framework::OpDesc& desc) override { + * return op_type == "fc" && desc.Inputs().size() == 2; + * } + *}; + */ + +/* + * class OpTeller helps to tell whether a fluid + * operator can be transformed to a TensorRT layer. + */ +class OpTeller { + public: + static OpTeller& Global() { + static std::unique_ptr x(new OpTeller); + return *x; + } + + bool Tell(const std::string& op_type, const framework::OpDesc& desc); + + private: + OpTeller(); + + private: + std::vector> tellers_; +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/test_anakin_engine.cc b/paddle/fluid/inference/anakin/test_anakin_engine.cc new file mode 100644 index 00000000..3c8a33ec --- /dev/null +++ b/paddle/fluid/inference/anakin/test_anakin_engine.cc @@ -0,0 +1,91 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include + +#include "paddle/fluid/inference/anakin/engine.h" + +using anakin::AK_FLOAT; +using anakin::Precision; +using anakin::saber::NV; +using anakin::saber::Shape; +using anakin::PBlock; +using anakin::PTuple; +namespace paddle { +namespace inference { +namespace anakin { + +class TestAnakinEngine : public ::testing::Test { + protected: + void SetUp() override; + void TearDown() override {} + + protected: + using AnakinNvEngineT = AnakinEngine; + std::unique_ptr engine_{nullptr}; +}; + +void TestAnakinEngine::SetUp() { + engine_.reset(new AnakinEngine(true)); +} + +TEST_F(TestAnakinEngine, Execute) { + engine_->AddOp("op1", "Dense", {"x"}, {"y"}); + engine_->AddOpAttr("op1", "out_dim", 2); + engine_->AddOpAttr("op1", "bias_term", false); + engine_->AddOpAttr("op1", "axis", 1); + std::vector shape = {1, 1, 1, 2}; + Shape tmp_shape(shape); + + PBlock *weight1 = new PBlock(tmp_shape, AK_FLOAT); + engine_->RegistBlock(weight1); + float *cpu_data = static_cast(weight1->h_tensor().mutable_data()); + cpu_data[0] = 2.; + weight1->d_tensor().set_shape(tmp_shape); + weight1->d_tensor().copy_from(weight1->h_tensor()); + engine_->AddOpAttr("op1", "weight_1", *weight1); + + engine_->Freeze(); + // PTuple input_shape = {1}; + // engine_->AddOpAttr("x", "input_shape", input_shape); + engine_->SetInputShape("x", {1, 1, 1, 1}); + engine_->Optimize(); + engine_->InitNet(); + framework::LoDTensor x; + framework::LoDTensor y; + x.Resize({1, 1, 1, 1}); + y.Resize({1, 1, 1, 2}); + auto *x_data = x.mutable_data(platform::CUDAPlace()); + float x_data_cpu[] = {1.}; + cudaMemcpy(x_data, x_data_cpu, sizeof(float), cudaMemcpyHostToDevice); + + std::map inputs = {{"x", &x}}; + auto *y_data = y.mutable_data(platform::CUDAPlace()); + std::map outputs = {{"y", &y}}; + + cudaStream_t stream; + + engine_->Execute(inputs, outputs, stream); + auto *y_data_gpu = y_data; + float y_data_cpu[2]; + cudaMemcpy(y_data_cpu, y_data_gpu, sizeof(float) * 2, cudaMemcpyDeviceToHost); + LOG(INFO) << "output value: " << y_data_cpu[0] << ", " << y_data_cpu[1]; +} + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt new file mode 100644 index 00000000..d79fb529 --- /dev/null +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -0,0 +1,68 @@ +unset(analysis_deps CACHE) +set(analysis_deps # analysis_deps can be extended accross the project + framework_proto proto_desc graph pass paddle_fluid_api executor pretty_log + ir_pass_manager + CACHE INTERNAL "") + +add_subdirectory(ir_passes) +add_subdirectory(passes) + +cc_library(analysis_helper SRCS helper.cc DEPS framework_proto proto_desc graph paddle_fluid_api) + +cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass ${INFER_IR_PASSES} analysis_helper) + +cc_library(argument SRCS argument.cc DEPS scope proto_desc) +cc_library(analysis_pass SRCS analysis_pass.cc DEPS proto_desc) + +cc_library(analysis SRCS + analyzer.cc + analysis_pass + DEPS ${analysis_deps} analysis_helper + ${INFER_IR_PASSES} + ) + +cc_test(test_dot SRCS dot_tester.cc DEPS analysis) + +function(inference_analysis_test_build TARGET) + if(WITH_TESTING) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS EXTRA_DEPS) + cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + inference_base_test_build(${TARGET} + SRCS ${analysis_test_SRCS} + DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS}) + endif() +endfunction() + +function(inference_analysis_test_run TARGET) + if(WITH_TESTING) + set(options "") + set(oneValueArgs "") + set(multiValueArgs COMMAND ARGS) + cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + inference_base_test_run(${TARGET} + COMMAND ${analysis_test_COMMAND} + ARGS ${analysis_test_ARGS}) + endif() +endfunction() + +function(inference_analysis_test TARGET) + if(WITH_TESTING) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS ARGS EXTRA_DEPS) + cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + inference_base_test_build(${TARGET} + SRCS ${analysis_test_SRCS} + DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS}) + inference_base_test_run(${TARGET} + COMMAND ${TARGET} + ARGS ${analysis_test_ARGS}) + endif() +endfunction(inference_analysis_test) + +inference_analysis_test(test_analyzer + SRCS analyzer_tester.cc + EXTRA_DEPS reset_tensor_array paddle_inference_api + ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR}) diff --git a/paddle/fluid/inference/analysis/README.md b/paddle/fluid/inference/analysis/README.md new file mode 100644 index 00000000..70adb4a9 --- /dev/null +++ b/paddle/fluid/inference/analysis/README.md @@ -0,0 +1,58 @@ +# Inference Analysis + +The `inference/analysis` module is used to analyze and optimize the inference program, +it references some philosophy from `LLVM/analysis`, +and make the various optimization features be pluggable and co-exist in a pipeline. + +We borrowed some concepts from LLVM, such as + +- [Pass](./pass.h)es to implement optimization that traverse the inference program, +- [DataFlowGraph](./data_flow_graph.h) to represent the data flow graph built from a program, +- [PassManager](./pass_manager.h) to manage a sequence of `Pass`es over a graph. + +There are some other basic concepts here + +- [Node](./node.h), the node in a `DataFlowGraph`, + - `Function`, the Operator in Fluid, + - `Value`, the Variable in Fluid; +- [Argument](./argument.h), the argument that treat as the input and output of all `Pass`es in the pipeline, + +## How it works + +The `inference/analysis` module make all the passes in a pipeline, and works in such way: + +1. Build a `DataFlowGraph` from a Fluid inference ProgramDesc, +2. Call the middle passes one by one, the same `DataFlowGraph` is passed across all the passes, +3. Transform a new ProgramDesc from the modified `DataFlowGraph`. + +The new optimization features can be added as an independent `Pass` and controlled by gflags, +each pass will generate unified debug information or visualization for better debugging. + +## Supported Passes + +### `FluidToDataFlowGraphPass` +Transform the fluid `ProgramDesc` to a `DataFlowGraph` to give an abstract representation for all the middle passes, +this should be the first pass of the pipeline. + +### `DataFlowGraphToFluidPass` +Generate a final `ProgramDesc` from a data flow graph, this should be the last pass of the pipeline. + +### `TensorRTSubgraphNodeMarkPass` +Mark the `Node` that are supported by TensorRT, +this pass will generate a visualization file which can be used for debugging. + +### `TensorRTSubGraphPass` +Split the sub-graph that are can be accelerated by TensorRT. + +### `DFG_GraphvizDrawPass` +This pass is just for debug, it will visualize the `DataFlowGraph` using the [graphviz](http://www.graphviz.org) tool. + +It can be used as a helper class that draws the modified graph after each pass. + +## Utilities + +There is some helper legacy/function/class for analysis. + +- [dot.h](./dot.h) give a easy to use interface for generating `DOT` codes, +- [graph_traits.h](./graph_traits.h) contains the interfaces of the graph traversal algorithms, it uses `iterator`to make the algorithms easy to share across different passes, +there are some implementations in [data_flow_graph.cc](./data_flow_graph.cc) , such as BFS and DFS.. diff --git a/paddle/fluid/inference/analysis/analysis_pass.cc b/paddle/fluid/inference/analysis/analysis_pass.cc new file mode 100644 index 00000000..9be9f755 --- /dev/null +++ b/paddle/fluid/inference/analysis/analysis_pass.cc @@ -0,0 +1,15 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/analysis_pass.h" diff --git a/paddle/fluid/inference/analysis/analysis_pass.h b/paddle/fluid/inference/analysis/analysis_pass.h new file mode 100644 index 00000000..d5a972fa --- /dev/null +++ b/paddle/fluid/inference/analysis/analysis_pass.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/inference/analysis/argument.h" +#include "paddle/fluid/inference/analysis/helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +/* + * AnalysisPass is a pass used to control the IR passes. + */ +class AnalysisPass { + public: + AnalysisPass() = default; + virtual ~AnalysisPass() = default; + + // Run on a single Graph. + void Run(Argument* argument) { RunImpl(argument); } + + // Human-readable short representation. + virtual std::string repr() const = 0; + // Human-readable long description. + virtual std::string description() const { return "No DOC"; } + + protected: + // User should implement these. + virtual void RunImpl(Argument* argument) = 0; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc new file mode 100644 index 00000000..d82a063d --- /dev/null +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -0,0 +1,42 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/analyzer.h" +#include +#include +#include "paddle/fluid/inference/analysis/passes/passes.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace inference { +namespace analysis { + +Analyzer::Analyzer() {} + +void Analyzer::Run(Argument *argument) { RunAnalysis(argument); } + +void Analyzer::RunAnalysis(Argument *argument) { + PADDLE_ENFORCE(argument->analysis_passes_valid(), + "analsis_passes is not valid in the argument."); + for (auto &pass : argument->analysis_passes()) { + string::PrettyLogH1("--- Running analysis [%s]", pass); + auto *ptr = PassRegistry::Global().Retreive(pass); + PADDLE_ENFORCE_NOT_NULL(ptr, "no analysis pass called %s", pass); + ptr->Run(argument); + } +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h new file mode 100644 index 00000000..a6de18db --- /dev/null +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -0,0 +1,62 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +/* + * This file contains Analyzer, an class that exposed as a library that analyze + * and optimize Fluid ProgramDesc for inference. Similar to LLVM, it has + * multiple flags to + * control whether an process is applied on the program. + * + * The processes are called Passes in analysis, the Passes are placed in a + * pipeline, the first Pass is the FluidToDataFlowGraphPass which transforms a + * Fluid ProgramDesc to + * a data flow graph, the last Pass is DataFlowGraphToFluidPass which transforms + * a data flow graph to a Fluid ProgramDesc. The passes in the middle of the + * pipeline can be any Passes + * which take a node or data flow graph as input. + * + * The Analyzer can be used in two methods, the first is a executable file which + * can be used to pre-process the inference model and can be controlled by + * passing difference command flags; + * the other way is to compose inside the inference API as a runtime pre-process + * phase in the inference service. + */ + +#include +#include +#include +#include "paddle/fluid/inference/analysis/analysis_pass.h" +#include "paddle/fluid/inference/analysis/flags.h" + +namespace paddle { +namespace inference { +namespace analysis { + +class Analyzer final { + public: + Analyzer(); + + void Run(Argument* argument); + + DISABLE_COPY_AND_ASSIGN(Analyzer); + + protected: + void RunAnalysis(Argument* argument); +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc new file mode 100644 index 00000000..c814ce45 --- /dev/null +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -0,0 +1,98 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/analyzer.h" + +#include +#include +#include "paddle/fluid/inference/analysis/ut_helper.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_inference_pass.h" +#include "paddle/fluid/platform/port.h" + +namespace paddle { +namespace inference { +namespace analysis { + +using namespace framework; // NOLINT + +TEST(Analyzer, analysis_without_tensorrt) { + Argument argument; + argument.SetModelDir(FLAGS_inference_model_dir); + argument.SetIrAnalysisPasses({"infer_clean_graph_pass"}); + argument.SetUseGPU(false); + argument.SetAnalysisPasses({"ir_graph_build_pass", "ir_analysis_pass", + "ir_params_sync_among_devices_pass"}); + + Analyzer analyser; + analyser.Run(&argument); +} + +TEST(Analyzer, analysis_with_tensorrt) { + Argument argument; + argument.SetTensorRtMaxBatchSize(3); + argument.SetTensorRtWorkspaceSize(1 << 20); + argument.SetModelDir(FLAGS_inference_model_dir); + argument.SetIrAnalysisPasses({"infer_clean_graph_pass"}); + argument.SetUseGPU(false); + argument.SetAnalysisPasses({"ir_graph_build_pass", "ir_analysis_pass", + "ir_params_sync_among_devices_pass"}); + + Analyzer analyser; + analyser.Run(&argument); +} + +void TestWord2vecPrediction(const std::string& model_path) { + NativeConfig config; + config.model_dir = model_path; + config.use_gpu = false; + config.device = 0; + auto predictor = ::paddle::CreatePaddlePredictor(config); + + // One single batch + + int64_t data[4] = {1, 2, 3, 4}; + PaddleTensor tensor; + tensor.shape = std::vector({4, 1}); + tensor.data = PaddleBuf(data, sizeof(data)); + tensor.dtype = PaddleDType::INT64; + + // For simplicity, we set all the slots with the same data. + std::vector slots(4, tensor); + std::vector outputs; + CHECK(predictor->Run(slots, &outputs)); + + PADDLE_ENFORCE_EQ(outputs.size(), 1UL); + // Check the output buffer size and result of each tid. + PADDLE_ENFORCE_EQ(outputs.front().data.length(), 33168UL); + float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815, + 0.000932706}; + const size_t num_elements = outputs.front().data.length() / sizeof(float); + // The outputs' buffers are in CPU memory. + for (size_t i = 0; i < std::min(static_cast(5UL), num_elements); + i++) { + LOG(INFO) << "data: " << static_cast(outputs.front().data.data())[i] + << " result: " << result[i]; + EXPECT_NEAR(static_cast(outputs.front().data.data())[i], result[i], + 1e-3); + } +} + +TEST(Analyzer, word2vec_without_analysis) { + TestWord2vecPrediction(FLAGS_inference_model_dir); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/argument.cc b/paddle/fluid/inference/analysis/argument.cc new file mode 100644 index 00000000..cb0263d5 --- /dev/null +++ b/paddle/fluid/inference/analysis/argument.cc @@ -0,0 +1,15 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/argument.h" diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h new file mode 100644 index 00000000..3fcf579c --- /dev/null +++ b/paddle/fluid/inference/analysis/argument.h @@ -0,0 +1,218 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * This file defines the class Argument, which is the input and output of the + * analysis module. All the fields that needed either by Passes or PassManagers + * are contained in Argument. + * + * TODO(Superjomn) Find some way better to contain the fields when it grow too + * big. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/api/paddle_analysis_config.h" +#include "paddle/fluid/platform/variant.h" + +namespace paddle { +namespace inference { +namespace analysis { + +using framework::ir::Graph; + +#ifdef PADDLE_WITH_MKLDNN +using VarQuantScale = + std::unordered_map>; +#endif + +/* + * The argument definition of both Pass and PassManagers. + * + * All the fields should be registered here for clearness. + */ +struct Argument { + Argument() = default; + explicit Argument(const std::string& model_dir) { SetModelDir(model_dir); } + + using unique_ptr_t = std::unique_ptr>; + using fusion_statis_t = std::unordered_map; + using anakin_max_shape_t = std::map>; + + bool Has(const std::string& key) const { return valid_fields_.count(key); } + void PartiallyRelease() { + if (Has("model_program_path")) { + if (Has("model_from_memory") && model_from_memory()) { + model_program_path().clear(); + model_program_path().shrink_to_fit(); + model_params_path().clear(); + model_params_path().shrink_to_fit(); + } + } + } + +#define DECL_ARGUMENT_FIELD(field__, Field, type__) \ + public: \ + type__& field__() { \ + PADDLE_ENFORCE(Has(#field__), "There is no such field"); \ + return field__##_; \ + } \ + void Set##Field(const type__& x) { \ + field__##_ = x; \ + valid_fields_.insert(#field__); \ + } \ + DECL_ARGUMENT_FIELD_VALID(field__); \ + type__* field__##_ptr() { return &field__##_; } \ + \ + private: \ + type__ field__##_; + +#define DECL_ARGUMENT_FIELD_VALID(field__) \ + bool field__##_valid() { return Has(#field__); } + +#define DECL_ARGUMENT_UNIQUE_FIELD(field__, Field, type__) \ + public: \ + type__& field__() { \ + PADDLE_ENFORCE_NOT_NULL(field__##_); \ + PADDLE_ENFORCE(Has(#field__)); \ + return *static_cast(field__##_.get()); \ + } \ + void Set##Field(type__* x) { \ + field__##_ = \ + unique_ptr_t(x, [](void* x) { delete static_cast(x); }); \ + valid_fields_.insert(#field__); \ + } \ + void Set##Field##NotOwned(type__* x) { \ + valid_fields_.insert(#field__); \ + field__##_ = unique_ptr_t(x, [](void* x) {}); \ + } \ + DECL_ARGUMENT_FIELD_VALID(field__); \ + type__* field__##_ptr() { \ + PADDLE_ENFORCE(Has(#field__)); \ + return static_cast(field__##_.get()); \ + } \ + type__* Release##Field() { \ + PADDLE_ENFORCE(Has(#field__)); \ + valid_fields_.erase(#field__); \ + return static_cast(field__##_.release()); \ + } \ + \ + private: \ + unique_ptr_t field__##_; + + DECL_ARGUMENT_FIELD(predictor_id, PredictorID, int); + // Model path + DECL_ARGUMENT_FIELD(model_dir, ModelDir, std::string); + // Model specified with program and parameters files. + DECL_ARGUMENT_FIELD(model_program_path, ModelProgramPath, std::string); + DECL_ARGUMENT_FIELD(model_params_path, ModelParamsPath, std::string); + DECL_ARGUMENT_FIELD(model_from_memory, ModelFromMemory, bool); + DECL_ARGUMENT_FIELD(optim_cache_dir, OptimCacheDir, std::string); + + // The overall graph to work on. + DECL_ARGUMENT_UNIQUE_FIELD(main_graph, MainGraph, framework::ir::Graph); + // The overall Scope to work on. + DECL_ARGUMENT_UNIQUE_FIELD(scope, Scope, framework::Scope); + + // The default program, loaded from disk. + DECL_ARGUMENT_UNIQUE_FIELD(main_program, MainProgram, framework::ProgramDesc); + + // The ir passes to perform in analysis phase. + DECL_ARGUMENT_FIELD(ir_analysis_passes, IrAnalysisPasses, + std::vector); + DECL_ARGUMENT_FIELD(analysis_passes, AnalysisPasses, + std::vector); + + // Pass a set of op types to enable its mkldnn kernel + DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes, + std::unordered_set); + // The cache capacity of different input shapes for mkldnn. + DECL_ARGUMENT_FIELD(mkldnn_cache_capacity, MkldnnCacheCapacity, int); + +#ifdef PADDLE_WITH_MKLDNN + // A set of op types to enable their quantized kernels + DECL_ARGUMENT_FIELD(quantize_enabled_op_types, QuantizeEnabledOpTypes, + std::unordered_set); + + // A set of op IDs to exclude from enabling their quantized kernels + DECL_ARGUMENT_FIELD(quantize_excluded_op_ids, QuantizeExcludedOpIds, + std::unordered_set); + + // Scales for variables to be quantized + DECL_ARGUMENT_FIELD(quant_var_scales, QuantVarScales, VarQuantScale); +#endif + + // Passed from config. + DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool); + DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int); + DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool); + DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int); + DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int); + DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int); + DECL_ARGUMENT_FIELD(tensorrt_precision_mode, TensorRtPrecisionMode, + AnalysisConfig::Precision); + DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine, + bool); + DECL_ARGUMENT_FIELD(tensorrt_use_calib_mode, TensorRtUseCalibMode, bool); + + DECL_ARGUMENT_FIELD(anakin_max_input_shape, AnakinMaxInputShape, + anakin_max_shape_t); + DECL_ARGUMENT_FIELD(anakin_max_batch_size, AnakinMaxBatchSize, int); + DECL_ARGUMENT_FIELD(anakin_min_subgraph_size, AnakinMinSubgraphSize, int); + DECL_ARGUMENT_FIELD(anakin_precision_mode, AnakinPrecisionMode, + AnalysisConfig::Precision); + DECL_ARGUMENT_FIELD(anakin_auto_config_layout, AnakinAutoConfigLayout, bool); + DECL_ARGUMENT_FIELD(use_anakin, UseAnakin, bool); + DECL_ARGUMENT_FIELD(anakin_passes_filter, AnakinPassesFilter, + std::vector); + DECL_ARGUMENT_FIELD(anakin_ops_filter, AnakinOpsFilter, + std::vector); + + // Memory optimized related. + DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool); + DECL_ARGUMENT_FIELD(static_memory_optim, StaticMemoryOptim, bool); + DECL_ARGUMENT_FIELD(static_memory_optim_force_update, + StaticMemoryOptimForceUpdate, bool); + // Indicate which kind of sort algorithm is used for operators, the memory + // optimization relays on the sort algorithm. + DECL_ARGUMENT_FIELD(memory_optim_sort_kind, MemoryOptimSortKind, int); + + // The program transformed by IR analysis phase. + DECL_ARGUMENT_UNIQUE_FIELD(ir_analyzed_program, IrAnalyzedProgram, + framework::proto::ProgramDesc); + + DECL_ARGUMENT_FIELD(fusion_statis, FusionStatis, fusion_statis_t); + + private: + std::unordered_set valid_fields_; +}; + +#define ARGUMENT_CHECK_FIELD(argument__, fieldname__) \ + PADDLE_ENFORCE(argument__->Has(#fieldname__), \ + "the argument field [%s] should be set", #fieldname__); + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/device.h b/paddle/fluid/inference/analysis/device.h new file mode 100644 index 00000000..585c9923 --- /dev/null +++ b/paddle/fluid/inference/analysis/device.h @@ -0,0 +1,24 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +namespace paddle { +namespace inference { +namespace analysis { + +enum class Device { CPU, GPU }; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/dot.h b/paddle/fluid/inference/analysis/dot.h new file mode 100644 index 00000000..4693729c --- /dev/null +++ b/paddle/fluid/inference/analysis/dot.h @@ -0,0 +1,161 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * This file implements some helper classes and methods for DOT programming + * support. It will give a visualization of the graph and that helps to debug + * the logics of each Pass. + */ +#pragma once + +#include +#include +#include +#include +#include + +namespace paddle { +namespace inference { +namespace analysis { + +static size_t dot_node_counter{0}; + +/* + * A Dot template that helps to build a DOT graph definition. + */ +class Dot { + public: + struct Attr { + std::string key; + std::string value; + + Attr(const std::string& key, const std::string& value) + : key(key), value(value) {} + + std::string repr() const { + std::stringstream ss; + ss << key << "=" << '"' << value << '"'; + return ss.str(); + } + }; + + struct Node { + std::string name; + std::vector attrs; + + Node(const std::string& name, const std::vector& attrs) + : name(name), + attrs(attrs), + id_("node_" + std::to_string(dot_node_counter++)) {} + + std::string id() const { return id_; } + + std::string repr() const { + std::stringstream ss; + CHECK(!name.empty()); + ss << id_; + if (attrs.empty()) { + ss << "[label=" << '"' << name << '"' << "]"; + return ss.str(); + } + for (size_t i = 0; i < attrs.size(); i++) { + if (i == 0) { + ss << "[label=" << '"' << name << '"' << " "; + } + ss << attrs[i].repr(); + ss << ((i < attrs.size() - 1) ? " " : "]"); + } + return ss.str(); + } + + private: + std::string id_; + }; + + struct Edge { + std::string source; + std::string target; + std::vector attrs; + + Edge(const std::string& source, const std::string& target, + const std::vector& attrs) + : source(source), target(target), attrs(attrs) {} + + std::string repr() const { + std::stringstream ss; + CHECK(!source.empty()); + CHECK(!target.empty()); + ss << source << "->" << target; + for (size_t i = 0; i < attrs.size(); i++) { + if (i == 0) { + ss << "["; + } + ss << attrs[i].repr(); + ss << ((i < attrs.size() - 1) ? " " : "]"); + } + return ss.str(); + } + }; + + Dot() = default; + + explicit Dot(const std::vector& attrs) : attrs_(attrs) {} + + void AddNode(const std::string& id, const std::vector& attrs, + std::string label = "") { + CHECK(!nodes_.count(id)) << "duplicate Node '" << id << "'"; + if (label.empty()) label = id; + nodes_.emplace(id, Node{label, attrs}); + } + + void AddEdge(const std::string& source, const std::string& target, + const std::vector& attrs) { + CHECK(!source.empty()); + CHECK(!target.empty()); + auto sid = nodes_.at(source).id(); + auto tid = nodes_.at(target).id(); + edges_.emplace_back(sid, tid, attrs); + } + + // Compile to DOT language codes. + std::string Build() const { + std::stringstream ss; + const std::string indent = " "; + ss << "digraph G {" << '\n'; + + // Add graph attrs + for (const auto& attr : attrs_) { + ss << indent << attr.repr() << '\n'; + } + // add nodes + for (auto& item : nodes_) { + ss << indent << item.second.repr() << '\n'; + } + // add edges + for (auto& edge : edges_) { + ss << indent << edge.repr() << '\n'; + } + ss << "} // end G"; + return ss.str(); + } + + private: + std::unordered_map nodes_; + std::vector edges_; + std::vector attrs_; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/dot_tester.cc b/paddle/fluid/inference/analysis/dot_tester.cc new file mode 100644 index 00000000..c785a312 --- /dev/null +++ b/paddle/fluid/inference/analysis/dot_tester.cc @@ -0,0 +1,61 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/dot.h" + +#include +#include + +namespace paddle { +namespace inference { +namespace analysis { + +class DotTester : public ::testing::Test { + protected: + void SetUp() override { + std::vector attrs({{"title", "hello"}}); + dot.reset(new Dot(attrs)); + dot->AddNode("a", {Dot::Attr{"shape", "box"}, Dot::Attr("color", "blue")}); + dot->AddNode("b", {}); + dot->AddNode("c", {}); + dot->AddEdge("a", "b", {}); + dot->AddEdge("b", "c", {}); + dot->AddEdge("a", "c", {}); + } + + std::unique_ptr dot; +}; + +TEST_F(DotTester, Build) { + auto codes = dot->Build(); + // Output the DOT language code, the generated codes are too long to compare + // the string. + // + // The output is + // + // digraph G { + // title="hello" + // node_1 + // node_2 + // node_0[label="a" shape="box" color="blue"] + // node_0->node_1 + // node_1->node_2 + // node_0->node_2 + // } // end G + LOG(INFO) << '\n' << codes; +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/flags.h b/paddle/fluid/inference/analysis/flags.h new file mode 100644 index 00000000..717e543f --- /dev/null +++ b/paddle/fluid/inference/analysis/flags.h @@ -0,0 +1,22 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +// TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this +// flag if not available. +DECLARE_bool(IA_enable_tensorrt_subgraph_engine); +DECLARE_string(IA_graphviz_log_root); +DECLARE_string(IA_output_storage_path); +DECLARE_bool(IA_enable_ir); diff --git a/paddle/fluid/inference/analysis/helper.cc b/paddle/fluid/inference/analysis/helper.cc new file mode 100644 index 00000000..008608c1 --- /dev/null +++ b/paddle/fluid/inference/analysis/helper.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/framework/framework.pb.h" + +namespace paddle { +namespace inference { +namespace analysis { + +template <> +void SetAttr(framework::proto::OpDesc *op, const std::string &name, + const std::string &data) { + auto *attr = op->add_attrs(); + attr->set_name(name); + attr->set_type(paddle::framework::proto::AttrType::STRING); + attr->set_s(data); +} +template <> +void SetAttr(framework::proto::OpDesc *op, const std::string &name, + const int &data) { + auto *attr = op->add_attrs(); + attr->set_name(name); + attr->set_type(paddle::framework::proto::AttrType::INT); + attr->set_i(data); +} +template <> +void SetAttr(framework::proto::OpDesc *op, const std::string &name, + const bool &data) { + auto *attr = op->add_attrs(); + attr->set_name(name); + attr->set_type(paddle::framework::proto::AttrType::BOOLEAN); + attr->set_b(data); +} +template <> +void SetAttr(framework::proto::OpDesc *op, const std::string &name, + const int64_t &data) { + auto *attr = op->add_attrs(); + attr->set_name(name); + attr->set_type(paddle::framework::proto::AttrType::LONG); + attr->set_l(data); +} +template <> +void SetAttr>(framework::proto::OpDesc *op, + const std::string &name, + const std::vector &data) { + auto *attr = op->add_attrs(); + attr->set_name(name); + attr->set_type(paddle::framework::proto::AttrType::STRINGS); + for (const auto &s : data) { + attr->add_strings(s.c_str()); + } +} + +template <> +void SetAttr>(framework::proto::OpDesc *op, + const std::string &name, + const std::vector &data) { + auto *attr = op->add_attrs(); + attr->set_name(name); + attr->set_type(paddle::framework::proto::AttrType::INTS); + for (const auto i : data) { + attr->add_ints(i); + } +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h new file mode 100644 index 00000000..a4805840 --- /dev/null +++ b/paddle/fluid/inference/analysis/helper.h @@ -0,0 +1,257 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/port.h" + +#ifdef _WIN32 +#include +#include +#define GCC_ATTRIBUTE(attr__) +#define MKDIR(path) _mkdir(path) +#else +#include +#define GCC_ATTRIBUTE(attr__) __attribute__((attr__)); +#define MKDIR(path) mkdir(path, S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH) +#endif +#define __SHOULD_USE_RESULT__ GCC_ATTRIBUTE(warn_unused_result) + +namespace paddle { +namespace inference { +namespace analysis { + +template +void SetAttr(framework::proto::OpDesc *op, const std::string &name, + const T &data); + +template +int AccuDims(Vec &&vec, int size) { + int res = 1; + for (int i = 0; i < size; i++) { + res *= std::forward(vec)[i]; + } + return res; +} + +#define SET_TYPE(type__) dic_[std::type_index(typeid(type__))] = #type__; +/* + * Map typeid to representation. + */ +struct DataTypeNamer { + static const DataTypeNamer &Global() { + static auto *x = new DataTypeNamer(); + return *x; + } + + template + const std::string &repr() const { + auto x = std::type_index(typeid(T)); + PADDLE_ENFORCE(dic_.count(x), "unknown type for representation"); + return dic_.at(x); + } + + const std::string &repr(const std::type_index &type) const { // NOLINT + PADDLE_ENFORCE(dic_.count(type), "unknown type for representation"); + return dic_.at(type); + } + + private: + DataTypeNamer() { + SET_TYPE(int); + SET_TYPE(bool); + SET_TYPE(float); + SET_TYPE(void *); + } + + std::unordered_map dic_; +}; +#undef SET_TYPE + +template +class iterator_range { + IteratorT begin_, end_; + + public: + template + explicit iterator_range(Container &&c) : begin_(c.begin()), end_(c.end()) {} + + iterator_range(const IteratorT &begin, const IteratorT &end) + : begin_(begin), end_(end) {} + + const IteratorT &begin() const { return begin_; } + const IteratorT &end() const { return end_; } +}; + +/* + * An registry helper class, with its records keeps the order they registers. + */ +template +class OrderedRegistry { + public: + T *Register(const std::string &name, T *x) { + PADDLE_ENFORCE(!dic_.count(name), "duplicate key [%s]", name); + dic_[name] = elements_.size(); + elements_.emplace_back(std::unique_ptr(x)); + return elements_.back().get(); + } + + T *Lookup(const std::string &name) { + auto it = dic_.find(name); + if (it == dic_.end()) return nullptr; + return elements_[it->second].get(); + } + + protected: + std::unordered_map dic_; + std::vector> elements_; +}; + +template +T &GetFromScope(const framework::Scope &scope, const std::string &name) { + framework::Variable *var = scope.FindVar(name); + PADDLE_ENFORCE(var != nullptr); + return *var->GetMutable(); +} + +static framework::proto::ProgramDesc LoadProgramDesc( + const std::string &model_path) { + std::ifstream fin(model_path, std::ios::in | std::ios::binary); + PADDLE_ENFORCE(fin.is_open(), "Cannot open file %s", model_path); + fin.seekg(0, std::ios::end); + std::string buffer(fin.tellg(), ' '); + fin.seekg(0, std::ios::beg); + fin.read(&buffer[0], buffer.size()); + fin.close(); + framework::proto::ProgramDesc program_desc; + program_desc.ParseFromString(buffer); + return program_desc; +} + +static bool FileExists(const std::string &filepath) { + std::ifstream file(filepath); + bool exists = file.is_open(); + file.close(); + return exists; +} + +static bool PathExists(const std::string &path) { + struct stat statbuf; + if (stat(path.c_str(), &statbuf) != -1) { + if (S_ISDIR(statbuf.st_mode)) { + return true; + } + } + return false; +} + +static std::string GetDirRoot(const std::string &path) { + char sep = '/'; + +#ifdef _WIN32 + sep = '\\'; +#endif + + size_t i = path.rfind(sep, path.length()); + if (i != std::string::npos) { + return (path.substr(0, i)); + } + return path; +} + +static std::string GetOrCreateModelOptCacheDir(const std::string &model_root) { + std::string opt_cache_dir = model_root + "/_opt_cache/"; + if (!PathExists(opt_cache_dir)) { + PADDLE_ENFORCE(MKDIR(opt_cache_dir.c_str()) != -1, + "Can not create optimize cache directory: %s, Make sure you " + "have permission to write", + opt_cache_dir); + } + return opt_cache_dir; +} + +static std::string GetTrtCalibPath(const std::string &model_root, + const std::string &engine_key) { + return model_root + "/trt_calib_" + engine_key; +} + +// If there is no calib table data file in model_opt_cache_dir, return "". +static std::string GetTrtCalibTableData(const std::string &model_opt_cache_dir, + const std::string &engine_key, + bool enable_int8) { + std::string trt_calib_table_path = + GetTrtCalibPath(model_opt_cache_dir, engine_key); + if (enable_int8 && FileExists(trt_calib_table_path)) { + VLOG(3) << "Calibration table file: " << trt_calib_table_path + << "is found here"; + std::ifstream infile(trt_calib_table_path, std::ios::in); + std::stringstream buffer; + buffer << infile.rdbuf(); + std::string calibration_data(buffer.str()); + return calibration_data; + } + return ""; +} + +static std::string GetTrtEngineSerializedPath(const std::string &model_root, + const std::string &engine_key) { + return model_root + "/trt_serialized_" + engine_key; +} + +static std::string GetTrtEngineSerializedData( + const std::string &model_opt_cache_dir, const std::string &engine_key) { + std::string trt_serialized_path = + GetTrtEngineSerializedPath(model_opt_cache_dir, engine_key); + if (FileExists(trt_serialized_path)) { + VLOG(3) << "Trt serialized file: " << trt_serialized_path + << "is found here"; + std::ifstream infile(trt_serialized_path, std::ios::in); + std::stringstream buffer; + buffer << infile.rdbuf(); + std::string trt_engine_serialized_data(buffer.str()); + return trt_engine_serialized_data; + } + return ""; +} + +static void SaveTrtEngineSerializedDataToFile( + const std::string &trt_serialized_path, + const std::string &engine_serialized_data) { + std::ofstream outfile(trt_serialized_path); + outfile << engine_serialized_data; + outfile.close(); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle + +#define PADDLE_DISALLOW_COPY_AND_ASSIGN(type__) \ + type__(const type__ &) = delete; \ + void operator=(const type__ &) = delete; diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc new file mode 100644 index 00000000..2dae5137 --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -0,0 +1,184 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/ir_pass_manager.h" +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/analysis/argument.h" +#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace inference { +namespace analysis { +using string::PrettyLogEndl; +using string::PrettyLog; +using string::Style; + +IRPassManager::IRPassManager(Argument *argument) { + ARGUMENT_CHECK_FIELD(argument, main_program); + graph_ = std::unique_ptr(new Graph(argument->main_program())); + if (argument->Has("scope")) { + auto *scope_ptr = argument->scope_ptr(); + PADDLE_ENFORCE(scope_ptr); + graph_->SetNotOwned(framework::ir::kParamScopeAttr, scope_ptr); + } + + ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes); + CreatePasses(argument, argument->ir_analysis_passes()); +} + +void IRPassManager::CreatePasses(Argument *argument, + const std::vector &passes) { + std::string pre_pass; + int pass_num = 0; + for (const std::string &pass_name : passes) { + auto pass = framework::ir::PassRegistry::Instance().Get(pass_name); + + if (pass_name == "graph_viz_pass") { + std::string dot_file_path = std::to_string(pass_num) + "_ir_" + + (pre_pass.empty() ? "origin" : pre_pass) + + ".dot"; + pass->Set("graph_viz_path", new std::string(std::move(dot_file_path))); + pass_num++; + } else if (pass_name == "mkldnn_placement_pass") { + pass->Set("mkldnn_enabled_op_types", + new std::unordered_set( + argument->mkldnn_enabled_op_types())); +#ifdef PADDLE_WITH_MKLDNN + } else if (pass_name == "cpu_quantize_placement_pass") { + pass->Set("quantize_enabled_op_types", + new std::unordered_set( + argument->quantize_enabled_op_types())); + pass->Set( + "quantize_excluded_op_ids", + new std::unordered_set(argument->quantize_excluded_op_ids())); + } else if (pass_name == "cpu_quantize_pass") { + pass->Set("quant_var_scales", + new VarQuantScale(argument->quant_var_scales())); +#endif + } else if (pass_name == "tensorrt_subgraph_pass") { + pass->Set("workspace_size", new int(argument->tensorrt_workspace_size())); + pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size())); + pass->Set("min_subgraph_size", + new int(argument->tensorrt_min_subgraph_size())); + pass->Set("program", + new framework::ProgramDesc *(&argument->main_program())); + + bool enable_int8 = argument->tensorrt_precision_mode() == + AnalysisConfig::Precision::kInt8; + + pass->Set("predictor_id", new int(argument->predictor_id())); + bool use_calib_mode = argument->tensorrt_use_calib_mode(); + pass->Set("enable_int8", new bool(enable_int8)); + pass->Set("use_calib_mode", new bool(use_calib_mode)); + + bool use_static_engine = argument->tensorrt_use_static_engine(); + bool model_from_memory = argument->model_from_memory(); + std::string optim_cache_dir = argument->optim_cache_dir(); + bool int8_valid = + !(model_from_memory && optim_cache_dir.empty() && enable_int8); + PADDLE_ENFORCE(int8_valid, + "When you are in TRT INT8 mode, and load model from " + "memory, you should set optim_cache_dir using " + "config.SetOptimCacheDir()"); + PADDLE_ENFORCE(!(model_from_memory && use_static_engine), + "When you are using Paddle-TRT, and also using load model " + "from memory, you should set the use_static to false."); + + if (!optim_cache_dir.empty()) { + pass->Set("model_opt_cache_dir", new std::string(optim_cache_dir)); + } else if (use_static_engine || enable_int8) { + std::string model_opt_cache_dir = + argument->Has("model_dir") + ? argument->model_dir() + : GetDirRoot(argument->model_program_path()); + pass->Set( + "model_opt_cache_dir", + new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir))); + } + pass->Set("gpu_device_id", new int(argument->gpu_device_id())); + pass->Set("use_static_engine", new bool(use_static_engine)); + pass->Set("model_from_memory", new bool(argument->model_from_memory())); + } + if (pass_name == "ngraph_subgraph_pass") { + pass->Set("program", + new framework::ProgramDesc *(&argument->main_program())); + } + if (pass_name == "anakin_subgraph_pass") { + pass->Set("program", + new framework::ProgramDesc *(&argument->main_program())); + pass->Set("use_gpu", new bool(argument->use_gpu())); + pass->Set("gpu_device_id", new int(argument->gpu_device_id())); + pass->Set("model_from_memory", new bool(argument->model_from_memory())); + pass->Set("predictor_id", new int(argument->predictor_id())); + pass->Set("max_input_shape", new std::map>( + argument->anakin_max_input_shape())); + pass->Set("max_batch_size", new int(argument->anakin_max_batch_size())); + bool enable_int8 = + argument->anakin_precision_mode() == AnalysisConfig::Precision::kInt8; + pass->Set("enable_int8", new bool(enable_int8)); + pass->Set("anakin_ops_filter", + new std::vector(argument->anakin_ops_filter())); + pass->Set("auto_config_layout", + new bool(argument->anakin_auto_config_layout())); + } + + pre_pass = pass_name; + + passes_.emplace_back(std::move(pass)); + } +} + +std::unique_ptr IRPassManager::Apply(std::unique_ptr graph) { + if (passes_.empty()) { + return graph; + } + PADDLE_ENFORCE(graph.get()); + // Apply all the passes + for (const auto &pass : passes_) { + if (pass->Type() != "graph_viz_pass") { + PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass->Type()); + } + graph.reset(pass->Apply(graph.release())); + } + return graph; +} + +framework::proto::ProgramDesc IRPassManager::AcquireProgram( + std::unique_ptr *graph, ProgramDesc *program) const { + auto pass = + framework::ir::PassRegistry::Instance().Get("graph_to_program_pass"); + + // Direct using ProgramDesc desc(argument->main_program()) may cause + // incomplete copies of information. + ProgramDesc desc; + desc.CopyFrom(*program->Proto()); + pass->SetNotOwned("program", &desc); + auto *the_graph = graph->release(); + graph->reset(pass->Apply(the_graph)); + return *desc.Proto(); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.h b/paddle/fluid/inference/analysis/ir_pass_manager.h new file mode 100644 index 00000000..2d120679 --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_pass_manager.h @@ -0,0 +1,63 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * This file defines IRPassManager, it helps control the passes in IR. Inference + * phrase will load the model program and parameters from disk, that is quite + * different from the training phase. + * This manager will control the Passes and make the passes in IR work smoothly + * for inference. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/analysis/argument.h" +#include "paddle/fluid/inference/analysis/helper.h" + +namespace paddle { +namespace inference { +namespace analysis { +using framework::ProgramDesc; +using framework::ir::Graph; + +class IRPassManager final { + public: + explicit IRPassManager(Argument *argument); + + std::unique_ptr Apply(std::unique_ptr graph); + + framework::proto::ProgramDesc AcquireProgram(std::unique_ptr *graph, + ProgramDesc *program) const; + + framework::ir::Graph &graph() const { return *graph_; } + + private: + void CreatePasses(Argument *argument, const std::vector &passes); + + std::unique_ptr graph_; + std::vector> passes_; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt new file mode 100644 index 00000000..ddadbc6d --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt @@ -0,0 +1,28 @@ +cc_library(subgraph_detector SRCS subgraph_detector.cc subgraph_util.cc DEPS proto_desc) +if(WITH_TESTING) + add_dependencies(subgraph_detector gtest) +endif() + +if (WITH_GPU AND TENSORRT_FOUND) + cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller) + + set(analysis_deps ${analysis_deps} + subgraph_detector tensorrt_subgraph_pass + CACHE INTERNAL "") + + set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h) + file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n") + set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "") +endif() + +if (ANAKIN_SUBGRAPH) + cc_library(anakin_subgraph_pass SRCS anakin_subgraph_pass.cc DEPS subgraph_detector anakin_op_teller) + + set(analysis_deps ${analysis_deps} + subgraph_detector anakin_subgraph_pass + CACHE INTERNAL "") + + set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h) + file(APPEND ${pass_file} "USE_PASS(anakin_subgraph_pass);\n") + set(INFER_IR_PASSES ${INFER_IR_PASSES} anakin_subgraph_pass CACHE INTERNAL "") +endif() diff --git a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc new file mode 100644 index 00000000..a6c6f33c --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc @@ -0,0 +1,283 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#include "paddle/fluid/inference/anakin/op_teller.h" +#include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h" +#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace inference { +namespace analysis { + +using framework::ir::Node; + +void analysis::AnakinSubgraphPass::ApplyImpl( + framework::ir::Graph *graph) const { + framework::ir::FusePassBase::Init("anakin_subgraph_pass", graph); + + auto &anakin_ops_filter = Get>("anakin_ops_filter"); + + auto teller = [&anakin_ops_filter](const framework::ir::Node *node) { + if (!node->IsOp() || !node->Op()) + return false; + else if (std::find(anakin_ops_filter.begin(), anakin_ops_filter.end(), + node->Op()->Type()) != anakin_ops_filter.end()) + return false; + return anakin::OpTeller::Global().Tell(node->Op()->Type(), *node->Op()); + }; + + SubGraphFuser fuser(graph, teller, 6 /* min_subgraph_size */); + fuser(); + + std::vector graph_param_names = + ExtractParameters(graph->Nodes()); + + // those parameter already exist in anakin, and should not have another copy + // in fluid. + std::vector repetitive_params; + + for (auto *node : graph->Nodes()) { + if (node->IsOp() && !Agent(node).subgraph()->empty()) { + CreateAnakinOp(node, graph, graph_param_names, &repetitive_params); + std::unordered_set nodes2remove( + Agent(node).subgraph()->begin(), Agent(node).subgraph()->end()); + framework::ir::GraphSafeRemoveNodes(graph, nodes2remove); + } + } + + std::unordered_set nodes2remove; + for (auto *node : graph->Nodes()) { + if (node->IsOp() && Agent(node).deleted()) { + nodes2remove.insert(node); + } + } + framework::ir::GraphSafeRemoveNodes(graph, nodes2remove); + graph->Set(framework::ir::kRepetitiveParamAttr, + new std::vector(repetitive_params)); +} + +std::string GenerateAnakinEngineKey(const std::set &engine_inputs, + const std::set &engine_outputs, + std::string id) { + std::string engine_hash_key = ""; + for (auto name : engine_inputs) { + engine_hash_key += name; + } + for (auto name : engine_outputs) { + engine_hash_key += name; + } + engine_hash_key += id; + auto engine_key = std::to_string(std::hash()(engine_hash_key)); + return engine_key; +} + +void AnakinSubgraphPass::CreateAnakinOp( + framework::ir::Node *node, Graph *graph, + const std::vector &graph_params, + std::vector *repetitive_params) const { + auto *op_desc = node->Op(); + auto &subgraph = *Agent(node).subgraph(); + PADDLE_ENFORCE(!subgraph.empty()); + + framework::ProgramDesc *program_desc = + Get("program"); + // Add new block for TensorRTEngineOP + const framework::BlockDesc &main_block = + program_desc->Block(framework::kRootBlockIndex); + // const framework::BlockDesc& main_block = program_desc->Block(0); + framework::BlockDesc *new_block = program_desc->AppendBlock(main_block); + + // An fake block desc. + framework::proto::BlockDesc block_proto; + framework::BlockDesc block_desc(nullptr, &block_proto); + block_desc.Proto()->set_parent_idx(-1); + block_desc.Proto()->set_idx(0); + string::PrettyLogDetail("--- detect a sub-graph with %d nodes", + subgraph.size()); + + for (auto *node : subgraph) { + auto *new_block_op = new_block->AppendOp(); + auto *op = block_desc.AppendOp(); + *new_block_op->Proto() = *node->Op()->Proto(); + *op->Proto() = *node->Op()->Proto(); + } + + // Then, we will use the input_names_with_id and output_names_with_id to + // generate the eigine key. + // So, We use set instead of unordered_set here to ensure that the engine key + // is unique. + std::set input_names; + std::set input_names_with_id; + std::vector params; + for (auto *x : node->inputs) { + input_names.insert(x->Name()); + input_names_with_id.insert(x->Name() + std::to_string(x->id())); + if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) { + params.push_back(x->Name()); + } + } + std::copy(params.begin(), params.end(), + std::back_inserter(*repetitive_params)); + op_desc->SetInput( + "Xs", std::vector(input_names.begin(), input_names.end())); + + std::set output_names; + std::set output_names_with_id; + for (auto *x : node->outputs) { + output_names.insert(x->Name()); + output_names_with_id.insert(x->Name() + std::to_string(x->id())); + } + + op_desc->SetOutput( + "Ys", std::vector(output_names.begin(), output_names.end())); + op_desc->SetType("anakin_engine"); + + std::unordered_map output_name_map; + std::unordered_map graph_var_map; + + for (framework::ir::Node *node : graph->Nodes()) { + if (node->IsVar() && node->Var()) { + graph_var_map[node->Name()] = node; + } + } + auto &subgraph_nodes = *Agent(node).subgraph(); + + // The following procedure is used to rename all the intermediate + // variables and the output variables of the subgraph. + RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id, + &output_names_with_id, &output_names, &output_name_map, + graph_var_map, false); + + // When anakin engine runs at the end of the operation, + // output_mapping help us copy the data from the renamed ITensor + // to Tensor. + std::vector output_mapping; + for (auto name : output_names) { + PADDLE_ENFORCE(output_name_map.count(name) != 0); + output_mapping.push_back(output_name_map[name]); + } + + PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(), + "the block has no var-desc"); + PADDLE_ENFORCE(!output_mapping.empty()); + op_desc->SetBlockAttr("sub_block", new_block); + SetAttr(op_desc->Proto(), "subgraph", + block_desc.Proto()->SerializeAsString()); + // Set attrs + SetAttr(op_desc->Proto(), "parameters", params); + SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping); + int predictor_id = Get("predictor_id"); + auto engine_key = GenerateAnakinEngineKey( + input_names_with_id, output_names_with_id, std::to_string(predictor_id)); + + SetAttr(op_desc->Proto(), "engine_key", engine_key); + auto max_input_shape = + Get>>("max_input_shape"); + auto program_inputs = program_desc->GetFeedTargetNames(); + + bool use_gpu = Get("use_gpu"); + SetAttr(op_desc->Proto(), "use_gpu", use_gpu); + bool enable_int8 = Get("enable_int8"); + SetAttr(op_desc->Proto(), "enable_int8", enable_int8); + if (enable_int8) { + CreateAnakinEngine<::anakin::Precision::INT8>(&block_desc, params, + input_names, output_mapping, + program_inputs, engine_key); + } else { + CreateAnakinEngine<::anakin::Precision::FP32>(&block_desc, params, + input_names, output_mapping, + program_inputs, engine_key); + } +} + +template <::anakin::Precision PrecisionT> +void AnakinSubgraphPass::CreateAnakinEngine( + framework::BlockDesc *block_desc, const std::vector ¶ms, + const std::set &input_names, + const std::vector &output_mapping, + const std::vector &program_inputs, + const std::string &engine_key) const { + framework::BlockDesc block_desc_temp(nullptr, block_desc->Proto()); + bool use_gpu = Get("use_gpu"); + auto max_batch_size = Get("max_batch_size"); + auto max_input_shape = + Get>>("max_input_shape"); + if (use_gpu) { +#ifdef PADDLE_WITH_CUDA + inference::Singleton< + anakin::AnakinEngineManager<::anakin::saber::NV, PrecisionT>>::Global() + .Create(true, Get("gpu_device_id"), max_batch_size, + max_input_shape, program_inputs, false, engine_key); +#endif + } else { +#ifdef ANAKIN_X86_PLACE + bool auto_config_layout = Get("auto_config_layout"); + inference::Singleton< + anakin::AnakinEngineManager<::anakin::saber::X86, PrecisionT>>::Global() + .Create(true, Get("gpu_device_id"), max_batch_size, + max_input_shape, program_inputs, auto_config_layout, + engine_key); +#endif + } + + auto *scope = param_scope(); + std::unordered_set param_set(params.begin(), params.end()); + if (use_gpu) { +#ifdef PADDLE_WITH_CUDA + auto *anakin_engine = + inference::Singleton>::Global() + .Get(engine_key); + inference::Singleton>::Global() + .ConvertBlockToAnakinEngine( + &block_desc_temp, scope, + std::vector(input_names.begin(), input_names.end()), + param_set, output_mapping, anakin_engine); +#endif + } else { +#ifdef ANAKIN_X86_PLACE + auto *anakin_engine = + inference::Singleton>::Global() + .Get(engine_key); + inference::Singleton>::Global() + .ConvertBlockToAnakinEngine( + &block_desc_temp, scope, + std::vector(input_names.begin(), input_names.end()), + param_set, output_mapping, anakin_engine); +#endif + } +} + +} // namespace analysis +} // namespace inference +} // namespace paddle + +REGISTER_PASS(anakin_subgraph_pass, + paddle::inference::analysis::AnakinSubgraphPass); diff --git a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h new file mode 100644 index 00000000..4ab2297b --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h @@ -0,0 +1,51 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/inference/anakin/engine.h" +#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h" + +using anakin::Precision; +using anakin::saber::NV; +namespace paddle { +namespace inference { +namespace analysis { + +class AnakinSubgraphPass : public framework::ir::FusePassBase { + public: + void ApplyImpl(framework::ir::Graph *graph) const override; + + private: + void CreateAnakinOp(framework::ir::Node *x, framework::ir::Graph *graph, + const std::vector &graph_params, + std::vector *repetitive_params) const; + void CleanIntermediateOutputs(framework::ir::Node *node); + template <::anakin::Precision PrecisionT> + void CreateAnakinEngine(framework::BlockDesc *block_desc, + const std::vector ¶ms, + const std::set &input_names, + const std::vector &output_mapping, + const std::vector &program_inputs, + const std::string &engine_key) const; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc new file mode 100644 index 00000000..67033582 --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc @@ -0,0 +1,467 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" +#include +#include +#include +#include +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/node.h" + +namespace paddle { +namespace inference { +namespace analysis { + +using framework::ir::Node; + +std::pair, std::vector> +ExtractInputAndOutputOfSubGraph(std::vector &graph) { // NOLINT + std::unordered_set nodes(graph.begin(), graph.end()); + std::unordered_set inputs; + std::unordered_set outputs; + // Input a Value, check whether its inlink is in the subgraph. + auto inlink_in_subgraph = [&](Node *n) { + for (auto *in : n->inputs) { + if (nodes.count(in)) return true; + } + return false; + }; + + for (auto &node : graph) { + for (auto *in : node->inputs) { + // The Value that is written by nodes inside a sub-graph shouldn't be the + // input of the sub-graph. + if (!nodes.count(in) && in->IsVar() && !inlink_in_subgraph(in)) { + inputs.insert(in); + } + } + for (auto *out : node->outputs) { + if (!nodes.count(out) && out->IsVar()) { + outputs.insert(out); + } + } + } + return std::make_pair(std::vector(inputs.begin(), inputs.end()), + std::vector(outputs.begin(), outputs.end())); +} + +// Filter the Intermediate results of the subgraph node. +void FilterRedundantOutputOfSubGraph(Graph *graph) { + std::vector op_nodes; + for (auto &node : TopologicalSort(*graph)) { + if (node.IsVar() || Agent(&node).deleted()) { + continue; + } + op_nodes.push_back(&node); + } + size_t op_num = op_nodes.size(); + for (size_t i = 0; i < op_num; i++) { + if (op_nodes[i]->IsOp()) continue; + std::unordered_set follow_up_input_names; + for (size_t j = i + 1; j < op_num; j++) { + for (auto *in : op_nodes[j]->inputs) { + follow_up_input_names.insert(in->Name()); + } + } + std::vector filtered_subgraph_outlinks; + for (auto *out : op_nodes[i]->outputs) { + if (follow_up_input_names.count(out->Name())) { + filtered_subgraph_outlinks.push_back(out); + } else { + Agent(out).set_deleted(true); + } + } + // The filtered_subgraph_outlinks may be empty. + op_nodes[i]->outputs = filtered_subgraph_outlinks; + } +} + +std::vector> SubgraphDetector::operator()() { + MarkNodesInsideSubGraph(); + return ExtractSubGraphs(); +} + +// Mark the output variables inside a subgraph with the func. +inline void MarkOutLinksInSubGraph(const Node *func) { + for (auto *var : func->outputs) { + Agent(var).set_marked(true); + } +} + +void SubgraphDetector::MarkNodesInsideSubGraph() { + for (auto &node : framework::ir::GraphTraits::DFS(*graph_)) { + if (node_inside_subgraph_teller_(&node)) { + Agent(&node).set_marked(true); + if (node.IsOp()) { + // If a function is inside the sub-graph, mark all the output variables + // to be inside too, so that two marked functions will be inside a same + // sub-graph, lets take a example: A_function->var->B_function, if + // A_function is marked, var should also be marked, so that B_function + // will be in the same sub-graph with A_function if B_function is + // marked. + MarkOutLinksInSubGraph(&node); + } + } + } +} + +// Use the Union Find(UF) algorithm to find fully connected sub-graphs, if node +// a's output is node b, that is a and b is in the same sub-graph. The UF +// algorithm will group them to the same cluster. +using node_map_t = std::unordered_map; +// Find the ancestor id of a node. +int UnionFindGetAncestor(const node_map_t &node_map, size_t id) { + int tmp = id; + do { + tmp = Agent(node_map.at(tmp)).union_find_parent(); + } while (Agent(node_map.at(tmp)).union_find_parent() != tmp); + return tmp; +} +// Make this two node share the same ancestor. +// TODO(Superjom) bad performance, make a balanced tree latter. +void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) { + int a_ancestor = UnionFindGetAncestor(node_map, a); + int b_ancestor = UnionFindGetAncestor(node_map, b); + Agent(node_map.at(b_ancestor)).set_union_find_parent(a_ancestor); + Agent(node_map.at(a)).set_union_find_parent(a_ancestor); + Agent(node_map.at(b)).set_union_find_parent(a_ancestor); +} + +// This is a simple representation of a graph. +// The BriefNode hold the pointer of the Node. +// This is to avoid changing the original graph +// in the process of trt graph analysis. +struct BriefNode { + explicit BriefNode(Node *n) { node = n; } + Node *node; + std::vector inlinks; + std::vector outlinks; +}; + +// Union two adjacent BriefNode. +// Suppose we have two adjacent nodes src and dst. +// We will perform the following operations: +// 1. add all inputs(except src) of dst to src inlinks. +// 2. add all outputs of dst to src outlinks. +// 3. change all the dst's inputs and outputs +// corresponding inlinks and outlinks to src node. +// 4. delete all dst's inlinks and outlinks. +void UnionContractedNodes(const std::unordered_map &node_map, + int src_id, int dst_id) { + // merge the two adjacent nodes into one node. + BriefNode *src_node = node_map.at(src_id); + BriefNode *dst_node = node_map.at(dst_id); + + std::unordered_set inputs(src_node->inlinks.begin(), + src_node->inlinks.end()); + std::unordered_set outputs; + + for (auto *n : src_node->outlinks) { + if (n != dst_node) outputs.insert(n); + } + + // Add the inlinks and outlinks of dst node to src node. + std::vector dst_in_nodes = dst_node->inlinks; + for (BriefNode *node : dst_in_nodes) { + if (node != src_node) { + inputs.insert(node); + } + } + + std::vector dst_out_nodes = dst_node->outlinks; + for (BriefNode *node : dst_out_nodes) { + outputs.insert(node); + } + +// update the dst and src node's inlinks and outlinks. +#ifdef __clang__ + src_node->inlinks = std::vector(inputs.begin(), inputs.end()); + src_node->outlinks = std::vector(outputs.begin(), outputs.end()); + dst_node->inlinks.clear(); + dst_node->outlinks.clear(); +#else + src_node->inlinks = + std::move(std::vector(inputs.begin(), inputs.end())); + src_node->outlinks = + std::move(std::vector(outputs.begin(), outputs.end())); + dst_node->inlinks.clear(); + dst_node->outlinks.clear(); +#endif + + auto inlink_or_outlink_cleaner = [&](std::vector &nodes) { + for (auto *&n : nodes) { + if (n == src_node || n == dst_node) { + n = src_node; + } + } + }; + // Change all the dst inputs and outputs corresponding inlink and + // outlink to the src node. + for (auto *node : src_node->inlinks) { + inlink_or_outlink_cleaner(node->outlinks); + } + + for (auto *node : src_node->outlinks) { + inlink_or_outlink_cleaner(node->inlinks); + } +} + +// FlexibleDFS +// If reverse is true, do reverse dfs. +// If enter func is not nullptr, calls enter(node) before visiting any children +// of node. +// If leave func not nullptr, calls leave(node) after visiting all parents of +// node. +void FlexibleDFS(const std::vector &source, bool reverse, + const std::function &enter, + const std::function &leave) { + typedef struct { + const BriefNode *node; + bool leave; + } FNode; + + std::vector stack; + for (auto &node : source) { + stack.push_back(FNode{node, false}); + } + std::unordered_set visited; + while (!stack.empty()) { + auto fnode = stack.back(); + stack.pop_back(); + + if (fnode.leave) { + if (leave && !leave(fnode.node)) return; + } + if (visited.count(fnode.node)) continue; + visited.insert(fnode.node); + + if (enter && !enter(fnode.node)) return; + + if (leave) stack.push_back(FNode{fnode.node, true}); + const std::vector iter_nodes = + reverse == true ? fnode.node->inlinks : fnode.node->outlinks; + for (const BriefNode *node : iter_nodes) { + if (!visited.count(node)) { + stack.push_back(FNode{node, false}); + } + } + } +} + +std::vector> SubgraphDetector::ExtractSubGraphs() { + // Run the Extract algorithm to find all subgraphs. + std::vector marked_nodes; + // We use brief_node_map to represent the original graph in order to avoid + // changing the original graph. + std::unordered_map brief_node_map; + + std::unordered_set valid_node_ids; + for (auto *node : graph_->Nodes()) { + valid_node_ids.insert(node->id()); + } + + for (auto &node : framework::ir::GraphTraits::TS(*graph_)) { + brief_node_map[node.id()] = new BriefNode(&node); + if (Agent(&node).marked()) { + marked_nodes.push_back(&node); + } + } + + // extract sub-graphs in the marked node set, use Union Find algorithm. + node_map_t node_map; // id to ptr + for (auto *n : marked_nodes) { + // n's parent == n.id means it is the ancestor + Agent(n).set_union_find_parent(n->id()); + node_map[n->id()] = n; + } + + // create breif node map + for (auto &itr : brief_node_map) { + for (Node *node : itr.second->node->inputs) { + if (!valid_node_ids.count(node->id())) { + LOG(INFO) << "invalid node id " << node->id(); + continue; + } + itr.second->inlinks.push_back(brief_node_map.at(node->id())); + } + + for (Node *node : itr.second->node->outputs) { + if (!valid_node_ids.count(node->id())) { + LOG(INFO) << "invalid node id " << node->id(); + continue; + } + itr.second->outlinks.push_back(brief_node_map.at(node->id())); + } + } + + for (auto &itr : brief_node_map) { + BriefNode *brief_node = itr.second; + + if (!Agent(brief_node->node).marked()) { + VLOG(4) << brief_node->node->id() << " node not a trt candidate."; + continue; + } + + // Our algorithm must guarantee that: + // 1. The graph is always directed acyclic graph(DAG). + // 2. If there is a path in the subgraph from X to Y (X and Y are both + // nodes in the subgraph), then all paths from X to Y are in the + // subgraph. + // + // In order to achieve the above guarantee. + // For adjacent nodes src -> dst. + // 1. Get all dst input nodes except src. + // 2. Reverse DFS from those input nodes + // 3. If there is a path from input nodes to src, + // then the src and dst nodes can not be fused into one node, + // otherwise it can be done. + + while (true) { + std::unordered_set contract_nodes; + for (auto *out : brief_node->outlinks) { + // must be an trt candidate + if (!Agent(out->node).marked()) continue; + // get all dst input nodes except src. + std::vector source_nodes; + for (auto *n : out->inlinks) { + if (n != brief_node) { + source_nodes.push_back(n); + } + } + + // Reverse DFS from the source_nodes. + bool have_excess_path = false; + FlexibleDFS(source_nodes, true, nullptr, + [&have_excess_path, brief_node](const BriefNode *n) { + if (n == brief_node) { + have_excess_path = true; + return false; + } + return true; + }); + if (have_excess_path) continue; + contract_nodes.insert(out); + } + if (contract_nodes.empty()) break; + + for (auto dst_node : contract_nodes) { + UnionFindCombine(node_map, brief_node->node->id(), + dst_node->node->id()); + UnionContractedNodes(brief_node_map, brief_node->node->id(), + dst_node->node->id()); + } + } + } + + std::unordered_map> clusters; + for (auto *n : marked_nodes) { + if (n->IsOp()) { + clusters[UnionFindGetAncestor(node_map, Agent(n).union_find_parent())] + .push_back(n); + } + } + std::vector> result; + std::for_each(clusters.begin(), clusters.end(), + [&](const decltype(clusters)::value_type &it) { + result.push_back(it.second); + }); + + return result; +} + +void SubGraphFuser::operator()() { ReplaceNodesWithSubGraphs(); } + +void RemoveIntermediateOutputInSubgraph(const std::vector &subgraph, + Graph *graph, + std::vector *outputs) { + std::unordered_set subgraph_set(subgraph.begin(), subgraph.end()); + std::unordered_set valid_output; + + for (auto *output : *outputs) { + int num_used = 0; + for (auto *node : output->outputs) { + if (!subgraph_set.count(node)) ++num_used; + if (num_used > 0) valid_output.insert(output); + } + } + + outputs->assign(valid_output.begin(), valid_output.end()); +} + +void DetachDeletedNodes(framework::ir::Graph *graph) { + std::unordered_set nodes; + for (auto *node : graph->Nodes()) { + if (Agent(node).deleted()) { + node->inputs.clear(); + node->outputs.clear(); + } + } +} + +void SubGraphFuser::ReplaceNodesWithSubGraphs() { + auto subgraphs = SubgraphDetector(graph_, node_inside_subgraph_teller_)(); + for (auto &subgraph : subgraphs) { + if (subgraph.size() <= (size_t)min_subgraph_size_) continue; + std::unordered_set subgraph_uniq(subgraph.begin(), subgraph.end()); + // replace this sub-graph with the first node. Two steps: 1. Create a Block + // Node that contains this subgraph 2. Mark the nodes inside the sub-graph + // as deleted. 3. Replace the deleted node with the new Block Node. + framework::OpDesc empty_desc; + empty_desc.SetType(name_); + auto *block_node = graph_->CreateOpNode(&empty_desc); + Agent(block_node).set_subgraph({}); + auto io = ExtractInputAndOutputOfSubGraph(subgraph); + block_node->inputs = std::move(io.first); + block_node->outputs = std::move(io.second); + + RemoveIntermediateOutputInSubgraph(subgraph, graph_, &block_node->outputs); + + for (auto *node : subgraph) { + // TODO(Superjomn) need a unified mechanism to treat deleted node in each + // pass. + Agent(node).set_deleted(true); + Agent(block_node).subgraph()->push_back(node); + } + + // Change all the sub-graph's inputs and outputs corresponding inlink and + // outlink to this sub-graph node. + auto inlink_or_outlink_cleaner = [&](std::vector &nodes) { + for (auto *&n : nodes) { + if (subgraph_uniq.count(n)) { + n = block_node; + } + } + std::unordered_set uniq(nodes.begin(), nodes.end()); + nodes.assign(uniq.begin(), uniq.end()); + }; + for (auto *i : block_node->inputs) { + inlink_or_outlink_cleaner(i->outputs); + } + for (auto *&o : block_node->outputs) { + inlink_or_outlink_cleaner(o->inputs); + } + } + // DetachDeletedNodes(graph_); + FilterRedundantOutputOfSubGraph(graph_); +} + +inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) { + return node.inputs.size() == n; +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h new file mode 100644 index 00000000..26201541 --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h @@ -0,0 +1,160 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file defines the the class to partition a graph. + */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_traits.h" +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/inference/analysis/argument.h" +#include "paddle/fluid/inference/analysis/helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +using framework::ir::Graph; +using framework::ir::NodesTSIterator; + +const char kIsFunctionNode[] = "__is_function_node__"; +const char kFunctionNodeSubGraph[] = "__function_node_sub_graph__"; +const char kSubgraphSplitterMarkerAttrName[] = + "_sub_graph_splitter_inside_sub_graph"; + +/* + * Detect the nodes in a sub-graph that meet some conditions. This class doesn't + * modify the graph. + */ +class SubgraphDetector { + public: + // Tell whether a node is inside a sub-graph. + using NodeInsideSubgraphTeller = + std::function; + + SubgraphDetector(Graph *graph, const NodeInsideSubgraphTeller &teller) + : graph_(graph), node_inside_subgraph_teller_(teller) {} + + std::vector> operator()(); + + protected: + // Mark the nodes inside the accepted sub-graph using + // node_inside_subgraph_teller. + void MarkNodesInsideSubGraph(); + + // Merge the marked nodes into sub-graphs and return the sub-graphs. + std::vector> ExtractSubGraphs(); + + private: + Graph *graph_; + NodeInsideSubgraphTeller node_inside_subgraph_teller_; +}; + +/* + * SubGraphFuser - Replace some nodes with the sub-graph node they are inside. + * To some extent, the TensorRT engine is just a fusion op for a model. + */ +class SubGraphFuser { + public: + using NodeInsideSubgraphTeller = SubgraphDetector::NodeInsideSubgraphTeller; + + SubGraphFuser(Graph *graph, const NodeInsideSubgraphTeller &teller, + int min_subgraph_size, std::string name = "anakin_engine") + : graph_(graph), + node_inside_subgraph_teller_(teller), + min_subgraph_size_{min_subgraph_size}, + name_{name} {} + + // The main method which run all the logic. + void operator()(); + + protected: + // Remove the nodes inside sub-graphs and replace with the SubGraphNode. + void ReplaceNodesWithSubGraphs(); + + private: + Graph *graph_; + NodeInsideSubgraphTeller node_inside_subgraph_teller_; + int min_subgraph_size_; + const std::string name_; +}; + +struct NodeWrapper { + bool deleted{false}; + bool marked{false}; + int union_find_parent{-1}; + std::vector subgraph; +}; + +/* + * ir::Node agent for subgraph detector. + */ +struct Agent { + explicit Agent(framework::ir::Node *x) : x_(x) {} + + NodeWrapper &wrapper() { + if (!x_->IsWrappedBy()) { + x_->WrappedBy(new NodeWrapper); + } + return x_->template Wrapper(); + } + + bool deleted() { return wrapper().deleted; } + void set_deleted(bool x) { wrapper().deleted = x; } + + bool marked() { return wrapper().marked; } + void set_marked(bool x) { wrapper().marked = x; } + + void set_subgraph(const std::vector &x) { + wrapper().subgraph = x; + } + + int union_find_parent() { return wrapper().union_find_parent; } + void set_union_find_parent(int v) { wrapper().union_find_parent = v; } + + std::vector *subgraph() { return &wrapper().subgraph; } + std::vector &inputs() { return x_->inputs; } + std::vector &outputs() { return x_->outputs; } + + private: + framework::ir::Node *x_; +}; + +// The nodes those have no input will be treated as start points. +static std::vector ExtractStartPoints(const Graph &g) { + std::vector result; + for (auto *node : g.Nodes()) { + if (node->inputs.empty()) { + result.push_back(node); + } + } + return result; +} + +static iterator_range TopologicalSort(const Graph &g) { + auto start_points = ExtractStartPoints(g); + PADDLE_ENFORCE(!start_points.empty()); + NodesTSIterator x(start_points); + return iterator_range(NodesTSIterator(start_points), + NodesTSIterator()); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc new file mode 100644 index 00000000..e16cce54 --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc @@ -0,0 +1,169 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file defines the the class to partition a graph. + */ + +#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h" +#include +#include + +namespace paddle { +namespace inference { +namespace analysis { +using framework::ir::Node; + +std::vector ExtractParameters( + const std::unordered_set &nodes) { + // We can judge whether a variable is a parameter by + // its presistable property, but sometimes the presistable + // of the feed op output is true, so we have to identify it. + std::vector feed_outputs; + for (const auto &node : nodes) { + if (!node->IsOp()) continue; + std::string op_type = node->Op()->Type(); + if (op_type == "feed" || op_type == "fetch") { + std::vector output_names = node->Op()->OutputArgumentNames(); + std::copy(output_names.begin(), output_names.end(), + std::back_inserter(feed_outputs)); + } + } + + std::vector parameters; + for (const auto &node : nodes) { + if (!node->IsVar()) continue; + if (node->Var()->Persistable() && + std::find(feed_outputs.begin(), feed_outputs.end(), node->Name()) == + feed_outputs.end()) { + parameters.push_back(node->Name()); + } + } + return parameters; +} + +void RenameAndGetOutputs( + const std::vector &subgraph_nodes, + framework::BlockDesc *block_desc, + const std::set &input_names_with_id, + std::set *output_names_with_id, + std::set *output_names, + std::unordered_map *output_name_map, + const std::unordered_map &graph_var_map, + bool trt_and_not_int8) { + //// In the normal case, the paddle-trt exists bug when runing the googlenet. + // When there are more than two convolutions of 1 * 1 with the same input, the + // paddle-tensorrt will do the merging optimization, which fuse those conv + // into one conv, and then trigger bug. So, We should use strategy to avoid + // this optimization for the time being. This bug will be fixed in the future. + std::unordered_map + same_hierarchy_conv2d_num_map; + + auto add_block_var = [&](const std::string &graph_arg, + const std::string &block_arg) { + auto arg_var_node = graph_var_map.find(graph_arg); + PADDLE_ENFORCE(arg_var_node != graph_var_map.end()); + auto *var_t = block_desc->Var(block_arg); + var_t->SetShape(arg_var_node->second->Var()->GetShape()); + var_t->SetDataType(arg_var_node->second->Var()->GetDataType()); + }; + + for (size_t index = 0; index < block_desc->OpSize(); ++index) { + framework::proto::OpDesc *op = block_desc->Op(index)->Proto(); + framework::OpDesc op_desc(*op, nullptr); + auto correspond_node = subgraph_nodes[index]; + PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type()); + + std::unordered_map var2id; + std::unordered_map in_vars; + for (auto *in_var : correspond_node->inputs) { + var2id[in_var->Name()] = in_var->id(); + in_vars[in_var->Name()] = in_var; + } + // rename for the input variables of op inside subgraph + for (int i = 0; i < op->inputs_size(); i++) { + // one input + auto *in_var = op->mutable_inputs(i); + std::vector replaced_names; + for (int k = 0; k < in_var->arguments_size(); k++) { // all the arguments + const std::string arg_value = in_var->arguments(k); + const std::string arg_value_with_id = + arg_value + std::to_string(var2id[arg_value]); + if (input_names_with_id.count(arg_value_with_id)) { + replaced_names.push_back(arg_value); + if (graph_var_map.count(arg_value)) { + add_block_var(arg_value, arg_value); + } + } else { + replaced_names.push_back(arg_value_with_id); + if (graph_var_map.count(arg_value)) { + add_block_var(arg_value, arg_value_with_id); + } + } + } + in_var->clear_arguments(); + for (size_t k = 0; k < replaced_names.size(); k++) { + in_var->add_arguments(replaced_names[k]); + } + } + var2id.clear(); + for (auto out_var : correspond_node->outputs) { + var2id[out_var->Name()] = out_var->id(); + } + if (op_desc.Type() == "conv2d" && trt_and_not_int8) { + auto input_var_name = op_desc.Input("Input").front(); + auto filter_var_name = op_desc.Input("Filter").front(); + auto out_var_name = op_desc.Output("Output").front(); + auto filter_shape = in_vars[filter_var_name]->Var()->GetShape(); + const std::vector strides = + boost::get>(op_desc.GetAttr("strides")); + const std::vector paddings = + boost::get>(op_desc.GetAttr("paddings")); + if (same_hierarchy_conv2d_num_map[input_var_name] > 0) { + (*output_names_with_id) + .insert(out_var_name + std::to_string(var2id[out_var_name])); + (*output_names).insert(out_var_name); + } else if (filter_shape[2] == 1 && filter_shape[3] == 1 && + strides[0] == 1 && strides[1] == 1 && paddings[0] == 0 && + paddings[1] == 0) { + same_hierarchy_conv2d_num_map[input_var_name] += 1; + } + } + // rename for the output variables of op inside subgraph + for (int i = 0; i < op->outputs_size(); i++) { + framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i); + std::vector replaced_names; + for (int k = 0; k < out_var->arguments_size(); k++) { + const std::string arg_value = out_var->arguments(k); + const std::string arg_value_with_id = + arg_value + std::to_string(var2id[arg_value]); + if (graph_var_map.count(arg_value)) { + add_block_var(arg_value, arg_value_with_id); + } + if (output_names_with_id->count(arg_value_with_id)) { + (*output_name_map)[arg_value] = arg_value_with_id; + } + replaced_names.push_back(arg_value_with_id); + } + out_var->clear_arguments(); + for (size_t k = 0; k < replaced_names.size(); k++) { + out_var->add_arguments(replaced_names[k]); + } + } + } +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h new file mode 100644 index 00000000..444e1984 --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h @@ -0,0 +1,50 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file defines the the class to partition a graph. + */ + +#pragma once +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_traits.h" +#include "paddle/fluid/framework/ir/node.h" + +namespace paddle { +namespace inference { +namespace analysis { +using framework::ir::Node; + +std::vector ExtractParameters( + const std::unordered_set &nodes); + +void RenameAndGetOutputs( + const std::vector &subgraph_nodes, + framework::BlockDesc *block_desc, + const std::set &input_names_with_id, + std::set *output_names_with_id, + std::set *output_names, + std::unordered_map *output_name_map, + const std::unordered_map &graph_var_map, + bool trt_and_not_int8 = false); + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc new file mode 100644 index 00000000..ce8f57c0 --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -0,0 +1,298 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" +#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/op_teller.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace inference { +namespace analysis { + +using framework::ir::Node; + +void analysis::TensorRtSubgraphPass::ApplyImpl( + framework::ir::Graph *graph) const { + framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph); + + auto teller = [](const framework::ir::Node *node) { + if (!node->IsOp() || !node->Op()) return false; + return tensorrt::OpTeller::Global().Tell(node->Op()->Type(), *node->Op()); + }; + + SubGraphFuser fuser(graph, teller, + Get("min_subgraph_size") /*min subgraph size*/); + fuser(); + + std::vector graph_param_names = + ExtractParameters(graph->Nodes()); + // those parameter already exist in trt, and should not have another copy in + // fluid. + std::vector repetitive_params; + + for (auto *node : graph->Nodes()) { + if (node->IsOp() && !Agent(node).subgraph()->empty()) { + CreateTensorRTOp(node, graph, graph_param_names, &repetitive_params); + + std::unordered_set nodes2remove( + Agent(node).subgraph()->begin(), Agent(node).subgraph()->end()); + framework::ir::GraphSafeRemoveNodes(graph, nodes2remove); + } + } + + std::unordered_set nodes2remove; + for (auto *node : graph->Nodes()) { + if (node->IsOp() && Agent(node).deleted()) { + nodes2remove.insert(node); + } + } + framework::ir::GraphSafeRemoveNodes(graph, nodes2remove); + graph->Set(framework::ir::kRepetitiveParamAttr, + new std::vector(repetitive_params)); +} + +std::string GenerateEngineKey(const std::set &engine_inputs, + const std::set &engine_outputs, + const std::string &predictor_id) { + std::string engine_hash_key = ""; + for (auto name : engine_inputs) { + engine_hash_key += name; + } + for (auto name : engine_outputs) { + engine_hash_key += name; + } + engine_hash_key += predictor_id; + auto engine_key = std::to_string(std::hash()(engine_hash_key)); + return engine_key; +} + +void TensorRtSubgraphPass::CreateTensorRTOp( + framework::ir::Node *node, Graph *graph, + const std::vector &graph_params, + std::vector *repetitive_params) const { + auto *op_desc = node->Op(); + auto &subgraph = *Agent(node).subgraph(); + PADDLE_ENFORCE(!subgraph.empty()); + + framework::ProgramDesc *program_desc = + Get("program"); + // Add new block for TensorRTEngineOP + const framework::BlockDesc &main_block = + program_desc->Block(framework::kRootBlockIndex); + // const framework::BlockDesc& main_block = program_desc->Block(0); + framework::BlockDesc *new_block = program_desc->AppendBlock(main_block); + + // An fake block desc. + framework::proto::BlockDesc block_proto; + framework::BlockDesc block_desc(nullptr, &block_proto); + block_desc.Proto()->set_parent_idx(-1); + block_desc.Proto()->set_idx(0); + string::PrettyLogDetail("--- detect a sub-graph with %d nodes", + subgraph.size()); + + for (auto *node : subgraph) { + auto *new_block_op = new_block->AppendOp(); + auto *op = block_desc.AppendOp(); + *new_block_op->Proto() = *node->Op()->Proto(); + *op->Proto() = *node->Op()->Proto(); + } + + // Then, we will use the input_names_with_id and output_names_with_id to + // generate the eigine key. + // So, We use set instead of unordered_set here to ensure that the engine key + // is unique. + std::set input_names; + std::set input_names_with_id; + std::vector params; + + // The node->inputs containes input tensors and parameters. + for (auto *x : node->inputs) { + input_names.insert(x->Name()); + input_names_with_id.insert(x->Name() + std::to_string(x->id())); + if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) { + params.push_back(x->Name()); + } + } + + std::set output_names; + std::set output_names_with_id; + for (auto *x : node->outputs) { + output_names.insert(x->Name()); + output_names_with_id.insert(x->Name() + std::to_string(x->id())); + } + + std::unordered_map output_name_map; + std::unordered_map graph_var_map; + + for (framework::ir::Node *node : graph->Nodes()) { + if (node->IsVar() && node->Var()) { + graph_var_map[node->Name()] = node; + } + } + auto enable_int8 = Get("enable_int8"); + auto use_calib_mode = Get("use_calib_mode"); + auto &subgraph_nodes = *Agent(node).subgraph(); + + // The following procedure is used to rename all the intermediate + // variables and the output variables of the subgraph. + // Why we do this? + // During the transition from fluid OP to tensorrt OP, we map + // the input and output Tensor(fluid data structure) of fluid OP + // to the corresponding ITensor (trt data structure) through the + // Tensor name. When we set up ITensor for an variable, we must + // ensure that it has not been set before. + // If there is variable in the fluid graph, which is not only the + // input of a OP, but also the output of a Op, there will be problems. + // So we have to rename the variable in the subgraph to make sure + // it is either an OP's input or an OP's output. + RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id, + &output_names_with_id, &output_names, &output_name_map, + graph_var_map, !enable_int8); + + // When tensorrt engine runs at the end of the operation, + // output_mapping help us copy the data from the renamed ITensor + // to Tensor. + std::vector output_mapping; + for (auto name : output_names) { + PADDLE_ENFORCE(output_name_map.count(name) != 0); + output_mapping.push_back(output_name_map[name]); + } + PADDLE_ENFORCE(!output_mapping.empty()); + PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(), + "the block has no var-desc"); + + // Set attrs + op_desc->SetType("tensorrt_engine"); + op_desc->SetInput( + "Xs", std::vector(input_names.begin(), input_names.end())); + + op_desc->SetOutput( + "Ys", std::vector(output_names.begin(), output_names.end())); + + op_desc->SetBlockAttr("sub_block", new_block); + SetAttr(op_desc->Proto(), "subgraph", + block_desc.Proto()->SerializeAsString()); + SetAttr(op_desc->Proto(), "max_batch_size", Get("max_batch_size")); + SetAttr(op_desc->Proto(), "workspace_size", Get("workspace_size")); + SetAttr(op_desc->Proto(), "gpu_id", Get("gpu_device_id")); + SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping); + SetAttr(op_desc->Proto(), "parameters", params); + + auto use_static_engine = Get("use_static_engine"); + // TODO(NHZlX) + // There are models with the same structure but the different parameters, + // when runing in the 'use_serialize' mode, there is a bug. + auto engine_key = GenerateEngineKey(input_names_with_id, output_names_with_id, + std::to_string(0)); + auto predictor_id = Get("predictor_id"); + + // Get "" when there is no cached calibration table data. + bool load_from_memory = Get("model_from_memory"); + std::string calibration_data = ""; + if (enable_int8 && use_calib_mode) { + calibration_data = GetTrtCalibTableData( + Get("model_opt_cache_dir"), engine_key, enable_int8); + } + SetAttr(op_desc->Proto(), "calibration_data", calibration_data); + + SetAttr(op_desc->Proto(), "enable_int8", enable_int8); + SetAttr(op_desc->Proto(), "use_calib_mode", use_calib_mode); + SetAttr(op_desc->Proto(), "engine_key", engine_key); + SetAttr(op_desc->Proto(), "predictor_id", predictor_id); + std::string trt_engine_serialized_data = ""; + SetAttr(op_desc->Proto(), "engine_serialized_data", + trt_engine_serialized_data); + + std::unique_ptr calibrator; + if (enable_int8 && calibration_data.size() != 0) { + calibrator.reset(new tensorrt::TRTInt8Calibrator(calibration_data)); + LOG(INFO) << "RUN Paddle TRT int8 calibration mode..."; + } + // When in int8 mode and calibration_mode, the program just produce the + // calibration table data. + bool calibration_mode = + (enable_int8 && calibration_data.size() == 0 && use_calib_mode); + if (calibration_mode) { + // calibraion mode means generate int8 calibration table data process. + return; + } + + std::copy(params.begin(), params.end(), + std::back_inserter(*repetitive_params)); + + tensorrt::TensorRTEngine *trt_engine = + inference::Singleton::Global() + .Create(engine_key + std::to_string(predictor_id), + Get("max_batch_size"), Get("workspace_size"), + enable_int8, calibrator.get(), Get("gpu_device_id")); + + bool need_serialize = (use_static_engine && !load_from_memory); + if (need_serialize) { + trt_engine_serialized_data = GetTrtEngineSerializedData( + Get("model_opt_cache_dir"), engine_key); + // we can load the engine info serialized before from the disk. + if (!trt_engine_serialized_data.empty()) { + trt_engine->Deserialize(trt_engine_serialized_data); + LOG(INFO) << "Load TRT Optimized Info from " + << GetTrtEngineSerializedPath( + Get("model_opt_cache_dir"), engine_key); + return; + } + } + + // the following code will NOT run in following situation: + // 1. calibraion mode (generate trt int8 calibraiton table data) + // 2. already load serialized trt engine info. + LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP " + "kernel etc). This process may cost a lot of time."; + + auto *scope = param_scope(); + framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto()); + std::unordered_set param_set(params.begin(), params.end()); + inference::Singleton::Global() + .ConvertBlockToTRTEngine( + &block_desc_temp, *scope, + std::vector(input_names.begin(), input_names.end()), + param_set, output_mapping, trt_engine); + + if (need_serialize) { + nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize(); + trt_engine_serialized_data = + std::string((const char *)serialized_engine_data->data(), + serialized_engine_data->size()); + SaveTrtEngineSerializedDataToFile( + GetTrtEngineSerializedPath(Get("model_opt_cache_dir"), + engine_key), + trt_engine_serialized_data); + } +} + +} // namespace analysis +} // namespace inference +} // namespace paddle + +REGISTER_PASS(tensorrt_subgraph_pass, + paddle::inference::analysis::TensorRtSubgraphPass) + .RequirePassAttr("max_batch_size") + .RequirePassAttr("workspace_size") + .RequirePassAttr("min_subgraph_size"); diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h new file mode 100644 index 00000000..f530a5a0 --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h @@ -0,0 +1,42 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h" + +namespace paddle { +namespace inference { +namespace analysis { + +class TensorRtSubgraphPass : public framework::ir::FusePassBase { + public: + void ApplyImpl(framework::ir::Graph *graph) const override; + + private: + void CreateTensorRTOp(framework::ir::Node *x, framework::ir::Graph *graph, + const std::vector &graph_params, + std::vector *repetitive_params) const; + void CleanIntermediateOutputs(framework::ir::Node *node); +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt new file mode 100644 index 00000000..860dc309 --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt @@ -0,0 +1,22 @@ +cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager) +cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager) +cc_library(memory_optim_pass SRCS memory_optimize_pass.cc DEPS analysis_pass zero_copy_tensor) +cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager) +cc_library(ir_graph_to_program_pass SRCS ir_graph_to_program_pass.cc DEPS analysis_pass graph_to_program_pass) +cc_library(adjust_cudnn_workspace_size_pass SRCS adjust_cudnn_workspace_size_pass.cc DEPS analysis_pass graph_to_program_pass) +cc_library(inference_op_replace_pass SRCS inference_op_replace_pass.cc DEPS analysis_pass graph_to_program_pass) + +cc_library(analysis_passes SRCS passes.cc DEPS + ir_graph_build_pass + ir_analysis_pass + ir_params_sync_among_devices_pass + adjust_cudnn_workspace_size_pass + memory_optim_pass + inference_op_replace_pass + ir_graph_to_program_pass +) + +set(analysis_deps ${analysis_deps} + analysis_passes + subgraph_detector + CACHE INTERNAL "") diff --git a/paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.cc b/paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.cc new file mode 100644 index 00000000..0470e0d5 --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.cc @@ -0,0 +1,43 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void AdjustCudnnWorkSpacePass::RunImpl(Argument* argument) { + if (!argument->use_gpu()) return; + auto& graph = argument->main_graph(); + auto nodes = graph.Nodes(); + const int cudnn_workspace_size_MB = 64; + const std::string attr_name = "workspace_size_MB"; + + for (auto& node : nodes) { + if (!node->IsOp()) continue; + auto* op_desc = node->Op(); + if (!op_desc->HasAttr(attr_name)) continue; + op_desc->SetAttr(attr_name, cudnn_workspace_size_MB); + op_desc->Flush(); + } +} + +std::string AdjustCudnnWorkSpacePass::repr() const { + return "adjust-cudnn-work-space-pass"; +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.h b/paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.h new file mode 100644 index 00000000..65d1c545 --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.h @@ -0,0 +1,41 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/analysis/analysis_pass.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace inference { +namespace analysis { + +/* + * The default cudnn workspace is 4G, we set it to 64M in this pass, which + * is applicable for most inference tasks. + */ +class AdjustCudnnWorkSpacePass : public AnalysisPass { + public: + void RunImpl(Argument *argument) override; + std::string repr() const override; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.cc b/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.cc new file mode 100644 index 00000000..ef7d13da --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.cc @@ -0,0 +1,47 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/passes/inference_op_replace_pass.h" +#include + +namespace paddle { +namespace inference { +namespace analysis { + +void InferenceOpReplacePass::RunImpl(Argument* argument) { + if (!argument->use_gpu()) return; + std::unordered_map replaced_map{ + {"conditional_block", "conditional_block_infer"}, + }; + + auto& graph = argument->main_graph(); + auto nodes = graph.Nodes(); + + for (auto& node : nodes) { + if (!node->IsOp()) continue; + auto* op_desc = node->Op(); + std::string op_type = op_desc->Type(); + if (!replaced_map.count(op_type)) continue; + op_desc->SetType(replaced_map[op_type]); + op_desc->Flush(); + } +} + +std::string InferenceOpReplacePass::repr() const { + return "inference-op-replace-pass"; +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.h b/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.h new file mode 100644 index 00000000..7fbdd88e --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.h @@ -0,0 +1,43 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/analysis/analysis_pass.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace inference { +namespace analysis { + +/* + * There are some ops (while, conditional_block_op etc) which have different + * optimization points under predicion and training conditions. + * So, We added the corresponding inference impl to these ops separately. + * This pass replaces these ops with corresponding inference ops. + */ +class InferenceOpReplacePass : public AnalysisPass { + public: + void RunImpl(Argument *argument) override; + std::string repr() const override; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc new file mode 100644 index 00000000..d986811a --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc @@ -0,0 +1,53 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h" +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/inference/analysis/ir_pass_manager.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void IrAnalysisPass::RunImpl(Argument* argument) { + ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes); + ARGUMENT_CHECK_FIELD(argument, main_program); + ARGUMENT_CHECK_FIELD(argument, scope); + + auto* the_graph = argument->ReleaseMainGraph(); + auto graph = std::unique_ptr(the_graph); + + // Apply passes. + IRPassManager the_ir_manager(argument); + graph = the_ir_manager.Apply(std::move(graph)); + PADDLE_ENFORCE_GT(graph->Nodes().size(), 0); + argument->SetMainGraph(graph.release()); + CollectFusionStatis(argument); +} + +void IrAnalysisPass::CollectFusionStatis(Argument* argument) { + if (!argument->main_graph().Has(framework::ir::kFuseStatisAttr)) { + LOG(INFO) << "argument has no fuse statis"; + return; + } + argument->SetFusionStatis( + argument->main_graph().Get( + framework::ir::kFuseStatisAttr)); +} + +std::string IrAnalysisPass::repr() const { return "ir-analysis-pass"; } + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h new file mode 100644 index 00000000..2c2113c0 --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h @@ -0,0 +1,40 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/inference/analysis/analysis_pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +/* + * Perform IR analysis passes. + * + * It is used to fuse some + */ +class IrAnalysisPass : public AnalysisPass { + public: + void RunImpl(Argument* argument) override; + + void CollectFusionStatis(Argument* argument); + + std::string repr() const override; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc new file mode 100644 index 00000000..970ecdbb --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc @@ -0,0 +1,88 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h" +#include +#include +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { + +extern void ReadBinaryFile(const std::string &filename, std::string *contents); + +namespace analysis { + +void IrGraphBuildPass::RunImpl(Argument *argument) { + if (!argument->scope_valid()) { + argument->SetScope(new framework::Scope); + } + PADDLE_ENFORCE(argument->use_gpu_valid()); + + // The load program should run on the same device with the inference program, + // so that the parameters will on the same device, or they will keep copying + // between difference devices. + platform::Place place; + place = platform::CPUPlace(); + + if (argument->model_dir_valid()) { + auto program = + LoadModel(argument->model_dir(), argument->scope_ptr(), place); + argument->SetMainProgram(program.release()); + } else if (argument->model_program_path_valid() && + argument->model_params_path_valid()) { + auto program = LoadModel( + argument->model_program_path(), argument->model_params_path(), + argument->scope_ptr(), place, + argument->model_from_memory_valid() && argument->model_from_memory()); + argument->SetMainProgram(program.release()); + } else { + PADDLE_THROW( + "either model_dir or (program path and parameter path) should be set."); + } + + auto graph = std::unique_ptr(new Graph(argument->main_program())); + argument->SetMainGraph(graph.release()); + auto *scope_ptr = argument->scope_ptr(); + PADDLE_ENFORCE(scope_ptr); + argument->main_graph().SetNotOwned(framework::ir::kParamScopeAttr, scope_ptr); +} + +std::unique_ptr IrGraphBuildPass::LoadModel( + const std::string &path, framework::Scope *scope, + const platform::Place &place) { + framework::Executor exe(place); + return Load(&exe, scope, path); +} + +std::unique_ptr IrGraphBuildPass::LoadModel( + const std::string &program_path, const std::string ¶ms_path, + framework::Scope *scope, const platform::Place &place, + bool model_from_memory) { + framework::Executor exe(place); + if (!model_from_memory) { + return Load(&exe, scope, program_path, params_path); + } else { + return LoadFromMemory(&exe, scope, program_path, params_path); + } +} + +std::string IrGraphBuildPass::repr() const { return "ir-graph-build-pass"; } + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h new file mode 100644 index 00000000..adbde043 --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h @@ -0,0 +1,49 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/analysis/analysis_pass.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace inference { +namespace analysis { + +/* + * Load program and parameter to memory from the disk or directly from memory. + */ +class IrGraphBuildPass : public AnalysisPass { + public: + void RunImpl(Argument *argument) override; + + std::string repr() const override; + + private: + std::unique_ptr LoadModel( + const std::string &path, framework::Scope *scope, + const platform::Place &place); + std::unique_ptr LoadModel( + const std::string &program_path, const std::string ¶ms_path, + framework::Scope *scope, const platform::Place &place, + bool model_from_memory); + + std::string model_binary_str_; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc new file mode 100644 index 00000000..35df396f --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h" +#include +#include "paddle/fluid/framework/ir/graph_to_program_pass.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void IrGraphToProgramPass::RunImpl(Argument *argument) { + auto pass = + framework::ir::PassRegistry::Instance().Get("graph_to_program_pass"); + + if (argument->memory_optim_sort_kind_valid()) { + pass->Set(framework::ir::kGraphToProgramSortKind, + new int(argument->memory_optim_sort_kind())); + } + + std::unique_ptr graph(argument->main_graph_ptr()); + + // Direct using ProgramDesc desc(argument->main_program()) may cause + // incomplete copies of information. + framework::ProgramDesc desc; + desc.CopyFrom(*argument->main_program().Proto()); + pass->SetNotOwned("program", &desc); + pass->Apply(graph.release()); // the argument still own the graph. + + argument->SetIrAnalyzedProgram( + new framework::proto::ProgramDesc(*desc.Proto())); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h new file mode 100644 index 00000000..838ebdbc --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h @@ -0,0 +1,32 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/inference/analysis/analysis_pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +class IrGraphToProgramPass : public AnalysisPass { + public: + void RunImpl(Argument *argument) override; + + std::string repr() const override { return "ir-graph-to-param-pass"; } +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc new file mode 100644 index 00000000..fedee3ff --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -0,0 +1,86 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h" +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { + PADDLE_ENFORCE(argument->scope_valid()); + PADDLE_ENFORCE(argument->use_gpu_valid()); + + platform::Place place; + + // The parameters are on the cpu, therefore, synchronization is not necessary. + if (!argument->use_gpu()) return; + + auto &graph = argument->main_graph(); + std::vector repetitive_params; + + if (graph.Has(framework::ir::kRepetitiveParamAttr)) + repetitive_params = graph.Get>( + framework::ir::kRepetitiveParamAttr); + + LOG(INFO) << "Sync params from CPU to GPU"; + + PADDLE_ENFORCE(argument->gpu_device_id_valid()); + place = platform::CUDAPlace(argument->gpu_device_id()); + + auto *scope = argument->scope_ptr(); + std::vector all_vars = scope->LocalVarNames(); + + // We get all the vars from local_scope instead of the ProgramDesc. + // Because there exists the case that new parameter variables are not added to + // the program in the analysis pass. + for (auto &var_name : all_vars) { + if (std::count(repetitive_params.begin(), repetitive_params.end(), + var_name)) { + scope->EraseVars({var_name}); + continue; + } + auto *var = scope->FindLocalVar(var_name); + PADDLE_ENFORCE(var != nullptr); + if (var->IsType() || + var->IsType()) { + auto *t = var->GetMutable(); + + platform::CPUPlace cpu_place; + framework::LoDTensor temp_tensor; + temp_tensor.Resize(t->dims()); + temp_tensor.mutable_data(cpu_place); + + // Copy the parameter data to a tmp tensor. + TensorCopySync(*t, cpu_place, &temp_tensor); + // Reallocation the space on GPU + t->clear(); + + // Copy parameter data to newly allocated GPU space. + TensorCopySync(temp_tensor, place, t); + } + } +} + +std::string IrParamsSyncAmongDevicesPass::repr() const { + return "ir-params-sync-among-devices-pass"; +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h new file mode 100644 index 00000000..61990150 --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h @@ -0,0 +1,40 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/analysis/analysis_pass.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace inference { +namespace analysis { + +/* + * Sync parameter from CPU to GPU. + */ +class IrParamsSyncAmongDevicesPass : public AnalysisPass { + public: + void RunImpl(Argument *argument) override; + std::string repr() const override; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc new file mode 100644 index 00000000..c894acfd --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc @@ -0,0 +1,845 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/graph_to_program_pass.h" +#include "paddle/fluid/framework/ir/graph_traits.h" +#include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace inference { +namespace analysis { + +using framework::ir::Graph; +using framework::ir::Node; +using framework::ir::TopologyVarientSort; +using space_table_t = MemoryOptimizePass::space_table_t; + +typedef struct { + std::string name; + size_t size; + int cluster; + std::pair lifetime; + std::unordered_set adj; +} MemNode; + +// Collect the lifecycles of the tensors. +// Traverse the graph in topological order. +// The traversal order also affect the lifecycles, so different sort_kind is +// used. +void MemoryOptimizePass::CollectLifeCycle( + std::unordered_map* lifecycles, + int sort_kind) const { + max_lifecycle_ = 0; + for (auto* op_node : framework::ir::TopologyVarientSort( + *graph_, static_cast(sort_kind))) { + if (!op_node->IsOp()) continue; + auto reads = op_node->inputs; + auto writes = op_node->outputs; + + std::vector requires(reads.begin(), reads.end()); + requires.insert(requires.end(), writes.begin(), writes.end()); + + // Disable reuse of feed variables. + if (op_node->Name() == "feed") { + for (auto* node : op_node->outputs) { + auto var = node->Name(); + lifecycles->emplace(var, + std::make_pair(0, std::numeric_limits::max())); + } + } else { + // Normal operators. + for (const Node* node : requires) { + if (node->Var()->Persistable()) continue; + std::string var = node->Name(); + if (!lifecycles->count(var)) { + (*lifecycles)[var] = std::make_pair(max_lifecycle_, max_lifecycle_); + } else { + (*lifecycles)[var].second = + std::max(max_lifecycle_, lifecycles->at(var).second); // max() + } + } + } + + ++max_lifecycle_; + } +} + +// TODO(Superjomn) Make this a general help method. +int DataTypeToSpace(framework::proto::VarType_Type type) { + switch (type) { + case framework::proto::VarType_Type_BOOL: + return sizeof(bool); + case framework::proto::VarType_Type_FP32: + return sizeof(float); + case framework::proto::VarType_Type_INT32: + return sizeof(int32_t); + case framework::proto::VarType_Type_INT64: + return sizeof(int64_t); + default: + PADDLE_THROW("Unknown data type"); + } +} + +void MemoryOptimizePass::CollectVarMemorySize( + space_table_t* space_table) const { + const int fake_batch_size = 1; + auto valid_var = [&](framework::ir::Node* node) -> bool { + std::set invalid_op = {"while", "conditional_block", + "tensorrt_engine", + "conditional_block_infer"}; + for (auto* tmp : node->inputs) { + CHECK(tmp->IsOp()); + std::string op_type = tmp->Op()->Type(); + if (std::find(invalid_op.begin(), invalid_op.end(), op_type) != + invalid_op.end()) { + return false; + } + } + for (auto* tmp : node->outputs) { + CHECK(tmp->IsOp()); + std::string op_type = tmp->Op()->Type(); + if (std::find(invalid_op.begin(), invalid_op.end(), op_type) != + invalid_op.end()) { + return false; + } + } + return true; + }; + // Collect tensors from graph. + for (auto* node : graph_->Nodes()) { + if (node->IsVar() && + node->Var()->GetType() == + framework::proto::VarType::Type::VarType_Type_LOD_TENSOR && + valid_var(node)) { + // Parameters will not be reused. + if (node->Var()->Persistable()) continue; + auto shape = node->Var()->GetShape(); + for (auto& v : shape) { + if (v < 0) v = fake_batch_size; + } + + int size = std::accumulate(shape.begin(), shape.end(), 1, + std::multiplies()); + (*space_table)[node->Var()->Name()] = + size * DataTypeToSpace(node->Var()->GetDataType()); + } + } +} + +void MakeSimpleReusePlan( + const std::unordered_map>& lifecycles, + const std::unordered_map& space_table, + std::unordered_map* node2cluster, + std::unordered_map* cluster_size) { + std::vector mem_nodes; + for (auto& data : lifecycles) { + if (!space_table.count(data.first)) continue; + MemNode temp_node; + temp_node.name = data.first; + temp_node.size = space_table.at(data.first); + temp_node.cluster = -1; + temp_node.lifetime = data.second; + mem_nodes.push_back(temp_node); + } + auto overlap = [](std::pair a, std::pair b) -> bool { + return b.second >= a.first && a.second >= b.first; + }; + // If the lifetime of two nodes is overwritten, we set them as adjacent nodes. + for (size_t i = 0; i < mem_nodes.size(); i++) { + for (size_t j = i + 1; j < mem_nodes.size(); j++) { + if (overlap(mem_nodes[i].lifetime, mem_nodes[j].lifetime)) { + mem_nodes[i].adj.insert(mem_nodes[j].name); + mem_nodes[j].adj.insert(mem_nodes[i].name); + } + } + } + + // Sort the nodes according to the node memory size. + auto sort_func = [](MemNode a, MemNode b) { return a.size > b.size; }; + std::sort(mem_nodes.begin(), mem_nodes.end(), sort_func); + + // Generating Memory Reuse Strategy Based on Greedy Way + for (size_t i = 0; i < mem_nodes.size(); i++) { + if (mem_nodes[i].cluster >= 0) continue; + int cluster_index = cluster_size->size(); + mem_nodes[i].cluster = cluster_index; + (*cluster_size)[mem_nodes[i].name] = mem_nodes[i].size; + (*node2cluster)[mem_nodes[i].name] = mem_nodes[i].name; + std::unordered_set cluster_adj = mem_nodes[i].adj; + for (size_t j = i + 1; j < mem_nodes.size(); j++) { + if (mem_nodes[j].cluster < 0 && + (cluster_adj.find(mem_nodes[j].name) == cluster_adj.end())) { + (*node2cluster)[mem_nodes[j].name] = mem_nodes[i].name; + mem_nodes[j].cluster = cluster_index; + for (auto& n : mem_nodes[j].adj) { + cluster_adj.insert(n); + } + } + } + } + for (auto& cluster : *cluster_size) { + LOG(INFO) << "Cluster name : " << cluster.first + << " size: " << cluster.second; + } +} + +// Collect the memory size of the tensors. +void MemoryOptimizePass::CollectVarMemorySize( + const std::unordered_map& batch_var_ave_dim, + std::unordered_map* tensor_nodes, + space_table_t* space_table) const { + // Collect tensors from graph. + for (auto* node : graph_->Nodes()) { + if (node->IsVar() && + node->Var()->GetType() == + framework::proto::VarType::Type::VarType_Type_LOD_TENSOR) { + // Parameters will not be reused. + if (node->Var()->Persistable()) continue; + (*tensor_nodes)[node->Name()] = node; + (*space_table)[node->Name()] = + DataTypeToSpace(node->Var()->GetDataType()) * + batch_var_ave_dim.at(node->Name()); + } + } +} + +// Find a sutable (big enough but smallest to avoid memory waste). +// +// Args: +// @tensor_nodes: the tensor nodes in the ir::Graph. +// @free_existing_tensors: the allocated tensor and are free. +// @space_table: the memory space of tensors. +// @tensor2use: the tensor that requires memory. +// +// Returns: +// true if found some existing tensor to reuse. +// false if no sutable tensor to reuse, one need to allocate a new tensor for +// this requirement. +// The suitable tensor for reuse is one that is approximately equal to the +// memory demand. +bool FindSuitableTensorToReuse( + const std::string& tensor, int space_required, + const std::unordered_map& tensor_nodes, + std::unordered_set* free_existing_tensors, + const space_table_t& space_table, + const std::vector>& var_clusters, + std::string* tensor2use) __SHOULD_USE_RESULT__; + +bool FindSuitableTensorToReuse( + const std::string& tensor, int space_required, + const std::unordered_map& tensor_nodes, + std::unordered_set* free_existing_tensors, + const space_table_t& space_table, + const std::vector>& var_clusters, + std::string* tensor2use) { + std::pair best_fit; + best_fit.second = std::numeric_limits::max(); + VLOG(5) << "Split Tensors to " << var_clusters.size() << " clusters"; + + // find the cluster this var belongs to. + const std::unordered_set* cluster = nullptr; + for (const auto& c : var_clusters) { + if (c.count(tensor)) { + cluster = &c; + break; + } + } + PADDLE_ENFORCE_NOT_NULL(cluster, + "something wrong in memory optimization, the " + "variable %s not in the clusters.", + tensor); + + for (auto& candidate : *free_existing_tensors) { + // This is not a temporary tensor. + if (!space_table.count(candidate)) continue; + // Not in the same cluster. + if (!cluster->count(candidate)) continue; + + size_t space = space_table.at(candidate); + PADDLE_ENFORCE( + space <= std::numeric_limits::type>::max(), + "space overload"); + size_t space_diff = + std::abs((std::make_signed::type)space - space_required); + if (space_diff < best_fit.second) { + best_fit.first = candidate; + best_fit.second = space_diff; + } + } + + if (best_fit.second < std::numeric_limits::max()) { + *tensor2use = best_fit.first; + return true; + } + return false; +} + +// Allocate new tensor instead of reusing the existing one. +void AllocateNewTensor( + const std::string& name, size_t space_required, + const std::unordered_map& tensor_nodes, + std::unordered_set* free_existing_tensors, + space_table_t* space_table, + std::unordered_map* reuse_table) { + // The newly born tensor is free to be used. + free_existing_tensors->insert(name); + // Register the space it has. + PADDLE_ENFORCE(space_table->count(name)); + space_table->at(name) = std::max(space_table->at(name), space_required); + // The allocated new tensor use the memory of itself. + (*reuse_table)[name] = name; +} + +// Free a tensor and make it resuable. +// @tensor: the tensor to free. +// @free_existing_tensors: the free and allocated tensors. +// @reuse_table: a map from a fake tensor to the existing allocated tensor. +void FreeATensor(const std::string& tensor, + std::unordered_set* free_existing_tensors, + std::unordered_map* reuse_table) { + if (tensor == "feed" || tensor == "fetch") return; + // the really allocated tensor. + const auto& free_tensor = reuse_table->at(tensor); + + free_existing_tensors->insert(free_tensor); +} + +// Reuse a free existing tensor. +void ReuseATensor(const std::string& tensor, const std::string& tensor2reuse, + size_t memory_size, + std::unordered_set* free_existing_tensors, + std::unordered_map* reuse_table, + space_table_t* reused_space_table) { + auto it = free_existing_tensors->find(tensor2reuse); + PADDLE_ENFORCE(it != free_existing_tensors->end()); + free_existing_tensors->erase(it); + (*reuse_table)[tensor] = tensor2reuse; + // Update the memory size of a reused tensor, the memory will grow if the + // required memory is larger. + (*reused_space_table)[tensor2reuse] = + std::max(reused_space_table->at(tensor2reuse), memory_size); +} + +// Calculate the memory usage. +void EvaluateMemoryUsage( + const std::unordered_map& reuse_table, + const space_table_t& space_table, + const std::unordered_map& var_batch_ave_size, + size_t* allocated, size_t* saved) { + *allocated = 0; + *saved = 0; + + for (auto elem : reuse_table) { + if (elem.first == elem.second) { + *allocated += space_table.at(elem.first); + VLOG(4) << elem.first << " <-> " << elem.second << " " + << space_table.at(elem.first) << " " + << space_table.at(elem.second); + } else { + *saved += space_table.at(elem.first); + VLOG(4) << "reuse " << elem.first << " -> " << elem.second; + } + } + VLOG(4) << "allocated " << *allocated; + VLOG(4) << "saved " << *saved; +} + +// Return saved ratio. +void MemoryOptimizePass::MakeReusePlan( + const std::vector>& var_clusters, + const std::unordered_map& var_batch_ave_size, + const space_table_t& space_table, + std::unordered_map* reuse_table, int sort_kind, + MemoryAllocation* memory_allocation) const { + // Clear the existing plan. + reuse_table->clear(); + + // The `space_table` stores the real memory size for each tensor. + // The `reused_space_table` stores the maximum memory size required by a + // tensor during the memory reusing, the small tensor might be reused by a + // larger tensor, and the memory size of the small one will grow. + auto reused_space_table = space_table; + + std::unordered_map life_cycles; + std::unordered_map tensor_nodes; + // The allocated tensors whose memory can be reused, they will live across the + // program execution. + std::unordered_set existing_tensors; + // The existing tensor that has been allocated, and is also free to reuse. + std::unordered_set free_existing_tensors; + + CollectLifeCycle(&life_cycles, sort_kind); + + for (int age = 0; age < max_lifecycle_; ++age) { + std::unordered_set born_tensors; + std::unordered_set dead_tensors; + // Gather the dead and born tensors. + for (auto elem_it = life_cycles.begin(); elem_it != life_cycles.end(); + elem_it++) { + if (elem_it->second.first == -1) { + continue; + } + const auto& tensor = elem_it->first; + const auto& lifecycle = elem_it->second; + VLOG(4) << "process " << tensor << " reuse " << lifecycle.first << "->" + << lifecycle.second; + + // Collect newly born tensors. + if (lifecycle.first == age) { + born_tensors.insert(tensor); + } + // Collect dead tensors whose memory can be reused. + else if (lifecycle.second < age) { // NOLINT + dead_tensors.insert(tensor); + // remove to avoid duplicate process. + elem_it->second.first = -1; // avoid duplicate search + } + } + + // Reuse the dead tensors for born tensors + for (const auto& tensor : born_tensors) { + // Skip the feed and fetch tensor for that they share data with others. + std::string tensor2reuse; + if (!space_table.count(tensor)) continue; + size_t space_required = space_table.at(tensor); + if (FindSuitableTensorToReuse(tensor, space_required, tensor_nodes, + &free_existing_tensors, reused_space_table, + var_clusters, &tensor2reuse)) { + if (tensor != tensor2reuse) { + VLOG(4) << tensor << " -> " << tensor2reuse; + } + ReuseATensor(tensor, tensor2reuse, space_required, + &free_existing_tensors, reuse_table, &reused_space_table); + } else { + VLOG(4) << "allocate " << tensor; + AllocateNewTensor(tensor, space_required, tensor_nodes, + &free_existing_tensors, &reused_space_table, + reuse_table); + ReuseATensor(tensor, tensor, space_required, &free_existing_tensors, + reuse_table, &reused_space_table); + } + } + + for (const auto& tensor : dead_tensors) { + // free its memory. + FreeATensor(tensor, &free_existing_tensors, reuse_table); + } + } + + EvaluateMemoryUsage(*reuse_table, reused_space_table, var_batch_ave_size, + &(memory_allocation->allocated), + &(memory_allocation->saved)); + memory_allocation->sort_kind = sort_kind; +} + +void BuildVarNodeTable(Graph* graph, + std::unordered_map* var_node_table) { + for (auto* node : graph->Nodes()) { + if (node->IsVar()) { + (*var_node_table)[node->Name()] = node; + } + } +} + +// NOTE The optimized opdesc doesn't match ir::Graph. +void UpdateOpDescsByReuse( + Graph* graph, + const std::unordered_map& reuse_table, + int sort_kind) { + // TODO(Superjomn) change here to be compatible with the runtime order. + for (auto* node : TopologyVarientSort( + *graph, static_cast(sort_kind))) { + if (node->IsOp()) { + // Replace the original inputs/outputs with the reused tensors. + std::unordered_map> in_args, + out_args; + for (auto argument : node->Op()->Inputs()) { + for (const auto& x : argument.second) { + auto name = x; + if (reuse_table.count(x) && reuse_table.at(x) != x) { + name = reuse_table.at(x); + } + in_args[argument.first].push_back(name); + VLOG(4) << node->Name() << " input " << x << " -> " << name; + } + } + + // modify the graph + for (auto input_node : node->inputs) { + PADDLE_ENFORCE(input_node->IsVar()); + std::string input_node_name = input_node->Name(); + if (reuse_table.count(input_node_name) && + reuse_table.at(input_node_name) != input_node_name) { + auto name = reuse_table.at(input_node_name); + input_node->RenameVar(name); + } + } + + for (auto argument : node->Op()->Outputs()) { + for (const auto& x : argument.second) { + auto name = x; + if (reuse_table.count(x) && reuse_table.at(x) != x) { + name = reuse_table.at(x); + } + out_args[argument.first].push_back(name); + VLOG(4) << node->Name() << " output " << x << " -> " << name; + } + } + + // modify the graph + for (auto out_node : node->outputs) { + PADDLE_ENFORCE(out_node->IsVar()); + std::string out_node_name = out_node->Name(); + if (reuse_table.count(out_node_name) && + reuse_table.at(out_node_name) != out_node_name) { + auto name = reuse_table.at(out_node_name); + out_node->RenameVar(name); + } + } + + // Update arguments. + for (auto& arg : in_args) { + node->Op()->SetInput(arg.first, arg.second); + } + for (auto& arg : out_args) { + node->Op()->SetOutput(arg.first, arg.second); + } + node->Op()->Flush(); + } + } +} + +void MemoryOptimizePass::PerformReusePlan( + const std::unordered_map& reuse_table, + int sort_kind, std::unordered_set* vars2remove) const { + std::unordered_map var_node_table; + BuildVarNodeTable(graph_, &var_node_table); + UpdateOpDescsByReuse(graph_, reuse_table, sort_kind); + + for (auto& item : reuse_table) { + if (item.first != item.second) { + vars2remove->insert(item.first); + } + } + VLOG(2) << "to remove vars " << vars2remove->size(); +} + +std::vector split(const std::string& line, char delim) { + std::vector res; + std::string field; + std::stringstream line_stream(line); + while (std::getline(line_stream, field, delim)) { + res.emplace_back(field); + } + return res; +} + +// Deserialize the batch var shapes from the cache file. +std::vector>> DeseralizeBatchVarShapes( + const std::string& path) { + std::ifstream file(path); + PADDLE_ENFORCE(file.is_open(), "failed to open %s to read cache", path); + std::string line; + std::vector>> batch_shapes; + + while (std::getline(file, line)) { + std::map> batch; + for (const auto& var_info : split(line, ';')) { + auto fields = split(var_info, ':'); + PADDLE_ENFORCE_EQ(fields.size(), 2UL); + auto var_name = fields.front(); + auto shape_str = split(fields[1], ','); + std::vector shape; + for (const auto& v : shape_str) shape.push_back(std::stoi(v)); + batch[var_name] = shape; + } + batch_shapes.push_back(batch); + } + return batch_shapes; +} + +// Replace the -1 in shape to a real number to fake the shape. +std::vector>> FakeBatchVarShapes( + const framework::ProgramDesc& program) { + std::vector>> res; + res.emplace_back(); + auto& record = res.front(); + const int fake_batch_size = 3; + for (auto* var : program.Block(0).AllVars()) { + if (var->GetType() == + framework::proto::VarType::Type::VarType_Type_LOD_TENSOR) { + auto shape = var->GetShape(); + for (auto& v : shape) { + if (v < 0) v = fake_batch_size; + } + record[var->Name()].assign(shape.begin(), shape.end()); + } + } + return res; +} + +// Calculate the average dim of each tensor from the batch shape cache. +std::unordered_map GetBatchAverageSize( + const std::vector>>& batches) { + std::unordered_map var2size; + // The average size of the batches for each variable. + int num_batch = 0; + for (const auto& batch : batches) { + num_batch++; + for (const auto& item : batch) { + int dim = std::accumulate(item.second.begin(), item.second.end(), 1, + [](int a, int b) { return a * b; }); + var2size[item.first] += dim; + } + } + + for (auto& item : var2size) { + item.second /= num_batch; + } + + return var2size; +} + +// Analysis the batch shapes loading from the cache file. +// By splitting the variables to different clusters by analyzing their batch +// size, we can pre-schedule the changes of difference LoDTensor when different +// length of input sequences is entered. +// This should works fine for the models operating on sentences. +std::vector> AnalysisBatchShapesByBatchSize( + const std::vector>>& batches) { + // collect the batch size of each shape and combine to a stringstream in + // converient to generate a hash. + std::unordered_map var_batchsize_hashes; + for (auto& batch : batches) { + for (auto& ele : batch) { + PADDLE_ENFORCE(!ele.second.empty()); + int batch_size = ele.second.front(); + // TODO(Superjomn) might consume large memory here, use combine hash. + var_batchsize_hashes[ele.first] << batch_size; + } + } + + // Split to sets by batch size sequences. + std::unordered_map> + shape_sets; + for (auto& ele : var_batchsize_hashes) { + auto hash = std::hash()(ele.second.str()); + shape_sets[hash].insert(ele.first); + } + std::vector> res; + for (auto& ele : shape_sets) { + res.emplace_back(std::move(ele.second)); + } + + VLOG(3) << "Cluster by batch_size and get " << res.size() << " clusters"; + return res; +} + +// Analysis the batch shapes loading from the cache file, and split them to +// different clusters by their size. +// This should works fine for the overall models. +std::vector> AnalysisBatchShapesBySimilarSize( + const space_table_t& space_table, + const std::vector>>& batches, + int interval = 200000) { + PADDLE_ENFORCE_GT(interval, 0); + // cluster to different clusters. + size_t max_size = 0; + for (auto& item : space_table) { + max_size = std::max(item.second, max_size); + } + VLOG(4) << "tensor max size " << max_size; + + std::vector> res; + + // cluster by intervals. + for (size_t interval_size = 0; interval_size <= max_size; + interval_size += interval) { + std::unordered_set cluster; + for (auto& item : space_table) { + if (interval_size <= item.second && + interval_size + interval > item.second) { + cluster.insert(item.first); + } + } + if (!cluster.empty()) { + res.push_back(cluster); + } + } + + VLOG(3) << "Cluster by interval and get " << res.size() << " cluster"; + return res; +} + +std::string MemoryOptimizePass::repr() const { return "memory optimize pass"; } + +std::pair GetRange( + const std::unordered_map& ave_size) { + auto res = std::make_pair(std::numeric_limits::max(), + std::numeric_limits::min()); + for (auto& item : ave_size) { + res.first = std::min(item.second, res.first); + res.second = std::max(item.second, res.second); + } + return res; +} + +void MemoryOptimizePass::RunImpl(Argument* argument) { + // When force update, should not optimize memory. + if (!argument->enable_memory_optim() || + argument->static_memory_optim_force_update()) + return; + graph_ = argument->main_graph_ptr(); + + auto path = GetMemoryCachePath( + argument->model_dir_valid() ? argument->model_dir() : "", + argument->model_program_path_valid() ? argument->model_program_path() + : ""); + VLOG(3) << "Load memory cache from " << path; + std::vector>> batches; + + if (!(argument->static_memory_optim() && inference::IsFileExists(path))) { + string::PrettyLogInfo("--- Performing dynamic memory optimize"); + // batches = FakeBatchVarShapes(argument->main_program()); + int sort_kind = 0; + std::unordered_map lifecycles; + space_table_t space_table; + std::unordered_map node2cluster; + std::unordered_map cluster_size; + + CollectLifeCycle(&lifecycles, sort_kind); + CollectVarMemorySize(&space_table); + MakeSimpleReusePlan(lifecycles, space_table, &node2cluster, &cluster_size); + UpdateOpDescsByReuse(graph_, node2cluster, sort_kind); + return; + + } else { + string::PrettyLogInfo("--- Performing static memory optimize"); + batches = DeseralizeBatchVarShapes(path); + } + auto var_batch_ave_size = GetBatchAverageSize(batches); + + // Get min and max memory size. + const auto range = GetRange(var_batch_ave_size); + const int cluster_size = std::max( + static_cast((range.second - range.first) / 100 /*cluster num*/), + 1024); + const int cluster_size1 = std::max( + static_cast((range.second - range.first) / 1000 /*cluster num*/), + 1024); + + std::unordered_map tensor_nodes; + space_table_t space_table; + CollectVarMemorySize(var_batch_ave_size, &tensor_nodes, &space_table); + + std::unordered_map reuse_table; + double max_saving_ratio = 0.; + + std::vector> strategies; + + for (int sort_kind = 0; sort_kind < 2; sort_kind++) { + if (argument->static_memory_optim()) { + // This strategy only make scene in static memory optimize. + strategies.emplace_back([&, sort_kind] { + auto clustered_vars_by_batch_size = + AnalysisBatchShapesByBatchSize(batches); + MemoryAllocation allocation; + MakeReusePlan(clustered_vars_by_batch_size, var_batch_ave_size, + space_table, &reuse_table, sort_kind, &allocation); + return allocation; + }); + } + + strategies.emplace_back([&, sort_kind] { + auto clustered_vars_by_ave_size = + AnalysisBatchShapesBySimilarSize(space_table, batches, cluster_size); + MemoryAllocation allocation; + MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, space_table, + &reuse_table, sort_kind, &allocation); + return allocation; + }); + + strategies.emplace_back([&, sort_kind] { + auto clustered_vars_by_ave_size = + AnalysisBatchShapesBySimilarSize(space_table, batches, cluster_size1); + MemoryAllocation allocation; + MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, space_table, + &reuse_table, sort_kind, &allocation); + return allocation; + }); + + strategies.emplace_back([&, sort_kind] { + auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize( + space_table, batches, + std::numeric_limits::max()); // no intervals + MemoryAllocation allocation; + MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, space_table, + &reuse_table, sort_kind, &allocation); + return allocation; + }); + } + + std::function* best_strategy{nullptr}; + + // Try all strategies to get the best result. + for (auto& strategy : strategies) { + auto allocation = strategy(); + string::PrettyLogDetail("--- get strategy saving %f memory for workspace", + allocation.GetSavingRatio()); + if (allocation.GetSavingRatio() > max_saving_ratio) { + max_saving_ratio = allocation.GetSavingRatio(); + best_strategy = &strategy; + } + } + if (!best_strategy) { + LOG(ERROR) << "This model makes poor memory optimize, skip memory optimize"; + return; + } + auto memory_allocation = (*best_strategy)(); + + string::PrettyLogInfo( + "--- Saved %.2f%s memory for workspace(temporary variables)", + memory_allocation.GetSavingRatio() * 100, "%"); + + argument->main_graph().Set(framework::ir::kGraphToProgramVarsToRemove, + new std::unordered_set); + auto& vars2remove = + argument->main_graph().Get>( + framework::ir::kGraphToProgramVarsToRemove); + + PerformReusePlan(reuse_table, memory_allocation.sort_kind, &vars2remove); + argument->SetMemoryOptimSortKind(memory_allocation.sort_kind); +} + +float MemoryOptimizePass::MemoryAllocation::GetSavingRatio() const { + return (saved / 1024.) / (allocated / 1024. + saved / 1024.); +} +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h new file mode 100644 index 00000000..5a907303 --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h @@ -0,0 +1,112 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/analysis/analysis_pass.h" +#include "paddle/fluid/platform/port.h" + +namespace paddle { +namespace inference { +namespace analysis { + +/* + * Memory optimization pass for inference with pre-analysis of memory usage + * without GC. + * Different from training, the inference memory reuse strategies doesn't + * include GC for that overhead is too much when batch size equals one. + * + * The inference memory reuse tries to pre-determine the tensor reusing strategy + * without runtime overhead. + * + * To improve the strategy's performance, a warm-up running is introduced: + * - Before officially deploy the inference program, one should warm it up and + * generate some runtime cache, + * - Run the inference program with several batches of data, it will persist + * some runtime information about memory of tensors to disk, we call the + * information the memory reusing cache, + * - With the memory reusing cache, user can deploy the inference to a + * service, before running the model, the inference program will load the + * memory cache, analysis it and generate the best memory reusing strategy, + * and adjust the execution of the network. + * + * With the warm-up and memory reusing cache design, the memory reusing + * algorithm can analysis the real memory consume of the tensors, even with the + * flexible LoDTensor and special shape changing operators such as + * sequence-pooling. + */ +class MemoryOptimizePass : public AnalysisPass { + public: + using space_table_t = std::unordered_map; + using lifecycle_t = std::pair; + + struct MemoryAllocation { + size_t allocated; // allocated memory in byte. + size_t saved; // saved memory in byte. + int sort_kind; // the kind of the corresponding sorting algorithm. + + // Get the memory saving ratio of temporary variables. + float GetSavingRatio() const; + }; + + virtual ~MemoryOptimizePass() = default; + + protected: + void RunImpl(Argument *argument) override; + + private: + void CollectLifeCycle( + std::unordered_map *lifecycles, + int sort_kind) const; + + void CollectVarMemorySize(space_table_t *space_table) const; + + void CollectVarMemorySize( + const std::unordered_map &batch_var_ave_dim, + std::unordered_map *tensor_nodes, + space_table_t *space_table) const; + + // Returns percentage of saved memory. + void MakeReusePlan( + const std::vector> &var_clusters, + const std::unordered_map &var_batch_ave_size, + const space_table_t &space_table, + std::unordered_map *reuse_table, int sort_kind, + MemoryAllocation *memory_allocation) const; + + void PerformReusePlan( + const std::unordered_map &reuse_table, + int sort_kind, std::unordered_set *vars2remove) const; + + public: + std::string repr() const override; + + private: + mutable framework::ir::Graph *graph_{nullptr}; + mutable int max_lifecycle_{-1}; +}; + +static std::string GetMemoryCachePath(const std::string &model_path, + const std::string &prog_path) { + auto path = model_path.empty() ? prog_path : model_path; + return path + ".memory_cache"; +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/passes.cc b/paddle/fluid/inference/analysis/passes/passes.cc new file mode 100644 index 00000000..97debcec --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/passes.cc @@ -0,0 +1,51 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/passes/passes.h" +#include "paddle/fluid/inference/analysis/passes/adjust_cudnn_workspace_size_pass.h" +#include "paddle/fluid/inference/analysis/passes/inference_op_replace_pass.h" +#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h" +#include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h" +#include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h" +#include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h" +#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +PassRegistry::PassRegistry() { + // Register manually to avoid the trivial `USE_OP` like macro for easier use + // and link. + passes_.emplace("ir_analysis_pass", + std::unique_ptr(new IrAnalysisPass)); + passes_.emplace("ir_graph_build_pass", + std::unique_ptr(new IrGraphBuildPass)); + passes_.emplace("memory_optimize_pass", + std::unique_ptr(new MemoryOptimizePass)); + passes_.emplace( + "ir_params_sync_among_devices_pass", + std::unique_ptr(new IrParamsSyncAmongDevicesPass)); + passes_.emplace("adjust_cudnn_workspace_size_pass", + std::unique_ptr(new AdjustCudnnWorkSpacePass)); + passes_.emplace("inference_op_replace_pass", + std::unique_ptr(new InferenceOpReplacePass)); + passes_.emplace( + "ir_graph_to_program_pass", + std::unique_ptr(new IrGraphToProgramPass)); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/passes.h b/paddle/fluid/inference/analysis/passes/passes.h new file mode 100644 index 00000000..8a13091d --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/passes.h @@ -0,0 +1,44 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "paddle/fluid/inference/analysis/analysis_pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +struct PassRegistry { + PassRegistry(); + + AnalysisPass* Retreive(const std::string& pass_type) { + return passes_[pass_type].get(); + } + + static PassRegistry& Global() { + static auto* x = new PassRegistry; + return *x; + } + + private: + std::unordered_map> passes_; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/ut_helper.h b/paddle/fluid/inference/analysis/ut_helper.h new file mode 100644 index 00000000..d599099a --- /dev/null +++ b/paddle/fluid/inference/analysis/ut_helper.h @@ -0,0 +1,35 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/inference/analysis/helper.h" + +namespace paddle { +namespace inference { + +// Read ProgramDesc from a __model__ file, defined in io.cc +extern void ReadBinaryFile(const std::string& filename, std::string* contents); + +namespace analysis { + +DEFINE_string(inference_model_dir, "", "inference test model dir"); + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt new file mode 100755 index 00000000..7c697c81 --- /dev/null +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -0,0 +1,82 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +if(APPLE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move") +endif(APPLE) + + +set(inference_deps ${analysis_deps} + paddle_inference_api paddle_fluid_api + analysis pass naive_executor + ${GLOB_PASS_LIB}) + +if(WITH_GPU AND TENSORRT_FOUND) + set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter) +endif() + +if (ANAKIN_SUBGRAPH) + set(inference_deps ${inference_deps} anakin_op_converter anakin_engine) +endif() + +if(WITH_NGRAPH) + set(inference_deps ${inference_deps} ngraph) +endif() + +add_subdirectory(details) + +if(WITH_MKLDNN) + set(mkldnn_quantizer_src mkldnn_quantizer.cc) + set(mkldnn_quantizer_cfg mkldnn_quantizer_config) + cc_library(${mkldnn_quantizer_cfg} SRCS mkldnn_quantizer_config.cc DEPS lod_tensor paddle_pass_builder) +endif() + +cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg} lod_tensor paddle_pass_builder) +if(WITH_NGRAPH) + cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc DEPS ngraph) +else(WITH_NGRAPH) + cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc) +endif(WITH_NGRAPH) +cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS paddle_inference_api zero_copy_tensor + reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager ${inference_deps}) +cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS + lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config + paddle_pass_builder zero_copy_tensor + reset_tensor_array) + +cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api) + +if(WITH_TESTING) + inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps} + ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book) + set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification) + set_tests_properties(test_api_impl PROPERTIES LABELS "RUN_TYPE=DIST") +endif() +cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps} + ARGS --dirname=${WORD2VEC_MODEL_DIR}) + +if(ANAKIN_FOUND) + # Do not turn warnings into errors. + set_source_files_properties(api.cc api_anakin_engine.cc PROPERTIES COMPILE_FLAGS "-Wno-error") + cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS boost xxhash) + target_link_libraries(inference_anakin_api anakin anakin_saber_common) + cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS boost xxhash) + target_link_libraries(inference_anakin_api_shared anakin anakin_saber_common) + function(anakin_target target_name) + target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) + endfunction() + anakin_target(inference_anakin_api) + anakin_target(inference_anakin_api_shared) +endif() diff --git a/paddle/fluid/inference/api/README.md b/paddle/fluid/inference/api/README.md new file mode 100644 index 00000000..a2d685d7 --- /dev/null +++ b/paddle/fluid/inference/api/README.md @@ -0,0 +1,17 @@ +# Embed Paddle Inference in Your Application + +Paddle inference offers the APIs in `C` and `C++` languages. + +You can easily deploy a model trained by Paddle following the steps as below: + +1. Optimize the native model; +2. Write some codes for deployment. + +## The APIs + +All the released APIs are located in the `paddle_inference_api.h` header file. +The stable APIs are wrapped by `namespace paddle`, the unstable APIs are protected by `namespace paddle::contrib`. + +## Write some codes + +Read `paddle_inference_api.h` for more information. diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc new file mode 100644 index 00000000..0ea26000 --- /dev/null +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -0,0 +1,460 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/api/paddle_analysis_config.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_pass_builder.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +extern const std::vector kTRTSubgraphPasses; +extern const std::vector kAnakinSubgraphPasses; + +PassStrategy *AnalysisConfig::pass_builder() const { + if (!pass_builder_.get()) { + if (use_gpu_) { + LOG(INFO) << "Create GPU IR passes"; + pass_builder_.reset(new GpuPassStrategy); + } else { + LOG(INFO) << "Create CPU IR passes"; + pass_builder_.reset(new CpuPassStrategy); + } + } else if (pass_builder_->use_gpu() ^ use_gpu()) { + LOG(WARNING) << "The use_gpu flag is not compatible between Config and " + "PassBuilder, the flags are " + << use_gpu() << " " << pass_builder_->use_gpu(); + LOG(WARNING) << "Please make them compatible, still use the existing " + "PassBuilder."; + } + + return pass_builder_.get(); +} + +AnalysisConfig::AnalysisConfig(const std::string &model_dir) { + model_dir_ = model_dir; + + Update(); +} +AnalysisConfig::AnalysisConfig(const std::string &prog_file, + const std::string ¶ms_file) { + prog_file_ = prog_file; + params_file_ = params_file; + + Update(); +} +void AnalysisConfig::SetModel(const std::string &prog_file_path, + const std::string ¶ms_file_path) { + prog_file_ = prog_file_path; + params_file_ = params_file_path; + + Update(); +} +void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, + int device_id) { +#ifdef PADDLE_WITH_CUDA + use_gpu_ = true; + memory_pool_init_size_mb_ = memory_pool_init_size_mb; + device_id_ = device_id; +#else + LOG(ERROR) << "Please compile with gpu to EnableGpu()"; + use_gpu_ = false; +#endif + + Update(); +} +void AnalysisConfig::DisableGpu() { + use_gpu_ = false; + + Update(); +} + +AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { +#define CP_MEMBER(member__) member__ = other.member__; + + // Model related. + CP_MEMBER(model_dir_); + CP_MEMBER(model_from_memory_); // the memory model reuses prog_file_ and + // params_file_ fields. + + CP_MEMBER(opt_cache_dir_); + prog_file_ = std::move(other.prog_file_); + params_file_ = std::move(other.params_file_); + + // Gpu related. + CP_MEMBER(use_gpu_); + CP_MEMBER(device_id_); + CP_MEMBER(memory_pool_init_size_mb_); + + CP_MEMBER(enable_memory_optim_); + CP_MEMBER(static_memory_optim_); + CP_MEMBER(static_memory_optim_force_update_); + // TensorRT related. + CP_MEMBER(use_tensorrt_); + CP_MEMBER(tensorrt_workspace_size_); + CP_MEMBER(tensorrt_max_batchsize_); + CP_MEMBER(tensorrt_min_subgraph_size_); + CP_MEMBER(tensorrt_precision_mode_); + CP_MEMBER(trt_use_static_engine_); + CP_MEMBER(trt_use_calib_mode_); + // NGRAPH related. + CP_MEMBER(use_ngraph_); + // MKLDNN related. + CP_MEMBER(use_mkldnn_); + CP_MEMBER(mkldnn_enabled_op_types_); + CP_MEMBER(mkldnn_cache_capacity_); + // Quantization related. + CP_MEMBER(use_mkldnn_quantizer_); + CP_MEMBER(mkldnn_quantizer_config_); + + CP_MEMBER(use_anakin_); + CP_MEMBER(anakin_max_batchsize_); + CP_MEMBER(anakin_max_input_shape_); + CP_MEMBER(anakin_min_subgraph_size_); + CP_MEMBER(anakin_precision_mode_); + CP_MEMBER(anakin_auto_config_layout_); + CP_MEMBER(anakin_passes_filter_); + CP_MEMBER(anakin_ops_filter_); + + // Ir related. + CP_MEMBER(enable_ir_optim_); + CP_MEMBER(use_feed_fetch_ops_); + CP_MEMBER(ir_debug_); + CP_MEMBER(specify_input_name_); + + CP_MEMBER(cpu_math_library_num_threads_); + + CP_MEMBER(serialized_info_cache_); + + if (use_gpu_) { + pass_builder_.reset(new GpuPassStrategy( + *static_cast(other.pass_builder()))); + } else { + pass_builder_.reset(new CpuPassStrategy( + *static_cast(other.pass_builder()))); + } + +#undef CP_MEMBER + + Update(); +} + +void AnalysisConfig::EnableMKLDNN() { +#ifdef PADDLE_WITH_MKLDNN + use_mkldnn_ = true; +#else + LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN"; + use_mkldnn_ = false; +#endif + + Update(); +} + +void AnalysisConfig::SetMkldnnCacheCapacity(int capacity) { +#ifdef PADDLE_WITH_MKLDNN + mkldnn_cache_capacity_ = capacity; +#else + LOG(ERROR) << "Please compile with MKLDNN first to set MKLDNN Thread Id"; + mkldnn_cache_capacity_ = 0; +#endif +} + +void AnalysisConfig::EnableMkldnnQuantizer() { +#ifdef PADDLE_WITH_MKLDNN + if (!mkldnn_quantizer_config_) + mkldnn_quantizer_config_.reset(new MkldnnQuantizerConfig()); + use_mkldnn_quantizer_ = true; +#else + LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnQuantizer"; + use_mkldnn_quantizer_ = false; +#endif + + Update(); +} + +void AnalysisConfig::EnableNgraph() { +#ifdef PADDLE_WITH_NGRAPH + pass_builder()->EnableNgraph(); + use_ngraph_ = true; +#else + LOG(ERROR) << "Please compile with NGRAPH first to use NGRAPH"; + use_ngraph_ = false; +#endif +} + +MkldnnQuantizerConfig *AnalysisConfig::mkldnn_quantizer_config() const { + PADDLE_ENFORCE_NOT_NULL(mkldnn_quantizer_config_, + "MkldnnQuantizer was not enabled yet."); + return mkldnn_quantizer_config_.get(); +} + +void AnalysisConfig::EnableTensorRtEngine( + int workspace_size, int max_batch_size, int min_subgraph_size, + AnalysisConfig::Precision precision_mode, bool use_static, + bool use_calib_mode) { +#ifdef PADDLE_WITH_CUDA + if (!use_gpu()) { + LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first"; + return; + } + + use_tensorrt_ = true; + tensorrt_workspace_size_ = workspace_size; + tensorrt_max_batchsize_ = max_batch_size; + tensorrt_min_subgraph_size_ = min_subgraph_size; + tensorrt_precision_mode_ = precision_mode; + trt_use_static_engine_ = use_static; + trt_use_calib_mode_ = use_calib_mode; + + Update(); +#else + LOG(ERROR) + << "To use TensorRT engine, please compile inference lib with GPU first."; +#endif +} + +// TODO(Superjomn) refactor this, buggy. +void AnalysisConfig::Update() { + auto info = SerializeInfoCache(); + if (info == serialized_info_cache_) return; + + // Transfer pass_builder and copy the existing compatible passes. + if (!pass_builder_ || ((use_gpu() ^ pass_builder_->use_gpu()))) { + if (use_gpu()) { + pass_builder_.reset(new GpuPassStrategy); + + if (use_tensorrt_) { + // Append after the Affine_channel_conv_fuse pass. + pass_builder()->InsertPass(3, "tensorrt_subgraph_pass"); + } + } else { + pass_builder_.reset(new CpuPassStrategy); + } + + } else { + if (use_gpu()) { + pass_builder_.reset(new GpuPassStrategy( + *static_cast(pass_builder_.get()))); + + } else { + pass_builder_.reset(new CpuPassStrategy( + *static_cast(pass_builder_.get()))); + } + } + + if (use_tensorrt_) { + pass_builder()->ClearPasses(); + for (const auto &pass : kTRTSubgraphPasses) { + pass_builder()->AppendPass(pass); + } + } + + if (use_ngraph_) { + if (!enable_ir_optim_) { + LOG(ERROR) + << "EnableNgraph() only works when IR optimization is enabled."; + } +#ifdef PADDLE_WITH_NGRAPH + pass_builder()->EnableNgraph(); + use_ngraph_ = true; +#else + LOG(ERROR) << "Please compile with NGRAPH first to use NGRAPH"; + use_ngraph_ = false; +#endif + } + + if (use_mkldnn_) { +#ifdef PADDLE_WITH_MKLDNN + if (!enable_ir_optim_) { + LOG(ERROR) + << "EnableMKLDNN() only works when IR optimization is enabled."; + } else { + pass_builder()->EnableMKLDNN(); + } +#endif + } + + // Quantization passes must come after all other optimization passes + if (use_mkldnn_quantizer_) { + if (!enable_ir_optim_) { + LOG(ERROR) << "EnableMkldnnQuantizer() only works when IR optimization " + "is enabled."; + } +#ifdef PADDLE_WITH_MKLDNN + pass_builder()->EnableMkldnnQuantizer(); +#endif + } + +#ifdef PADDLE_WITH_MKLDNN + // Do not optimize before quantization + if (enable_memory_optim_ && !use_mkldnn_quantizer_) { +#else + if (enable_memory_optim_) { +#endif + pass_builder()->AppendAnalysisPass("memory_optimize_pass"); + } + + if (use_anakin_) { + PADDLE_ENFORCE(!use_tensorrt_, + "Anakin sub-graph and TensorRT sub-graph are not allowed to " + "run at the same time!"); + if (use_gpu_) { + LOG(INFO) << "Run Anakin GPU mode"; + } else { + LOG(INFO) << "Run Anakin CPU mode"; + } + + pass_builder()->ClearPasses(); + for (const auto &pass : kAnakinSubgraphPasses) { + if (std::find(anakin_passes_filter_.begin(), anakin_passes_filter_.end(), + pass) == anakin_passes_filter_.end()) { + pass_builder()->AppendPass(pass); + } + } + } + + if (ir_debug_) { + pass_builder()->TurnOnDebug(); + } +} + +std::string AnalysisConfig::SerializeInfoCache() { + std::stringstream ss; + ss << model_dir_; + ss << prog_file_; + ss << params_file_; + + ss << use_gpu_; + ss << device_id_; + ss << memory_pool_init_size_mb_; + + ss << use_tensorrt_; + ss << tensorrt_workspace_size_; + ss << tensorrt_max_batchsize_; + ss << tensorrt_min_subgraph_size_; + + ss << enable_memory_optim_; + ss << static_memory_optim_; + ss << static_memory_optim_force_update_; + + ss << use_ngraph_; + + ss << use_mkldnn_; + ss << mkldnn_cache_capacity_; + for (auto &item : mkldnn_enabled_op_types_) ss << item; + ss << ";"; + + ss << use_mkldnn_quantizer_; + ss << model_from_memory_; + + ss << enable_ir_optim_; + ss << use_feed_fetch_ops_; + ss << ir_debug_; + + ss << specify_input_name_; + ss << cpu_math_library_num_threads_; + ss << use_anakin_; + ss << anakin_min_subgraph_size_; + return ss.str(); +} + +void AnalysisConfig::SetCpuMathLibraryNumThreads( + int cpu_math_library_num_threads) { + cpu_math_library_num_threads_ = cpu_math_library_num_threads; + + Update(); +} + +float AnalysisConfig::fraction_of_gpu_memory_for_pool() const { +#ifdef PADDLE_WITH_CUDA + // Get the GPU memory details and calculate the fraction of memory for the + // GPU memory pool. + size_t gpu_used, gpu_available; + platform::SetDeviceId(device_id_); + platform::GpuMemoryUsage(&gpu_used, &gpu_available); + double total_gpu_memory = (gpu_used + gpu_available) / 1024. / 1024.; + float fraction_of_gpu_memory = + static_cast(memory_pool_init_size_mb()) / total_gpu_memory; + return fraction_of_gpu_memory; +#else + return 0.; +#endif +} + +void AnalysisConfig::EnableMemoryOptim(bool static_optim, + bool force_update_static_cache) { + enable_memory_optim_ = true; + static_memory_optim_ = static_optim; + static_memory_optim_force_update_ = force_update_static_cache; + + Update(); +} + +bool AnalysisConfig::enable_memory_optim() const { + return enable_memory_optim_; +} + +void AnalysisConfig::SetModelBuffer(const char *prog_buffer, + size_t prog_buffer_size, + const char *param_buffer, + size_t param_buffer_size) { + prog_file_ = std::string(prog_buffer, prog_buffer + prog_buffer_size); + params_file_ = std::string(param_buffer, param_buffer + param_buffer_size); + model_from_memory_ = true; + + Update(); +} + +NativeConfig AnalysisConfig::ToNativeConfig() const { + NativeConfig config; + config.model_dir = model_dir_; + config.prog_file = prog_file_; + config.param_file = params_file_; + config.use_gpu = use_gpu_; + config.device = device_id_; + config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool(); + config.specify_input_name = specify_input_name_; + return config; +} + +void AnalysisConfig::SwitchIrDebug(int x) { + ir_debug_ = x; + Update(); +} +void AnalysisConfig::EnableAnakinEngine( + int max_batch_size, std::map> max_input_shape, + int min_subgraph_size, AnalysisConfig::Precision precision_mode, + bool auto_config_layout, std::vector passes_filter, + std::vector ops_filter) { + anakin_max_batchsize_ = max_batch_size; + anakin_max_input_shape_ = max_input_shape; + anakin_min_subgraph_size_ = min_subgraph_size; + anakin_passes_filter_ = passes_filter; + anakin_ops_filter_ = ops_filter; + use_anakin_ = true; + anakin_precision_mode_ = precision_mode; + anakin_auto_config_layout_ = auto_config_layout; + Update(); +} + +void AnalysisConfig::PartiallyRelease() { + prog_file_.clear(); + prog_file_.shrink_to_fit(); + params_file_.clear(); + params_file_.shrink_to_fit(); +} + +} // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc new file mode 100644 index 00000000..7650b2e9 --- /dev/null +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -0,0 +1,1001 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/api/analysis_predictor.h" +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_inference_pass.h" +#include "paddle/fluid/inference/utils/singleton.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/profiler.h" + +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/inference/api/mkldnn_quantizer.h" +#endif + +#if PADDLE_WITH_TENSORRT +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" +#endif + +#if PADDLE_WITH_ANAKIN +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#endif + +DECLARE_bool(profile); + +namespace paddle { + +using inference::Singleton; +#if PADDLE_WITH_TENSORRT +using inference::tensorrt::TRTInt8Calibrator; +using inference::tensorrt::TRTCalibratorEngine; +using inference::tensorrt::TRTCalibratorEngineManager; +#endif + +namespace { +bool IsPersistable(const framework::VarDesc *var) { + if (var->Persistable() && + var->GetType() != framework::proto::VarType::FEED_MINIBATCH && + var->GetType() != framework::proto::VarType::FETCH_LIST && + var->GetType() != framework::proto::VarType::RAW) { + return true; + } + return false; +} +} // namespace + +bool AnalysisPredictor::Init( + const std::shared_ptr &parent_scope, + const std::shared_ptr &program) { + VLOG(3) << "Predictor::init()"; + if (FLAGS_profile) { + LOG(WARNING) << "Profiler is actived, might affect the performance"; + LOG(INFO) << "You can turn off by set gflags '-profile false'"; + auto tracking_device = config_.use_gpu() ? platform::ProfilerState::kAll + : platform::ProfilerState::kCPU; + platform::EnableProfiler(tracking_device); + } + + // no matter with or without MKLDNN + paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); + + if (!PrepareScope(parent_scope)) { + return false; + } + if (!CreateExecutor()) { + return false; + } + if (!PrepareProgram(program)) { + return false; + } + + // Prepare executor, create local variables. + if (!PrepareExecutor()) { + return true; + } + + // Get the feed_target_names and fetch_target_names + PrepareFeedFetch(); + + return true; +} + +bool AnalysisPredictor::PrepareScope( + const std::shared_ptr &parent_scope) { + if (parent_scope) { + PADDLE_ENFORCE_NOT_NULL( + parent_scope, + "Both program and parent_scope should be set in Clone mode."); + scope_ = parent_scope; + status_is_cloned_ = true; + } else { + if (config_.use_gpu_) { + paddle::framework::InitDevices(false, {config_.device_id_}); + } else { + paddle::framework::InitDevices(false, {}); + } + scope_.reset(new paddle::framework::Scope()); + status_is_cloned_ = false; + } + sub_scope_ = &scope_->NewScope(); + return true; +} +bool AnalysisPredictor::PrepareProgram( + const std::shared_ptr &program) { + if (!program) { + if (!LoadProgramDesc()) return false; + + // If not cloned, the parameters should be loaded. + // If config_.ir_optim() is True, parameters is loaded in + // OptimizeInferenceProgram(), but other persistable variables + // (like RAW type var) are not created in scope. + // If config_.ir_optim() is False, parameters is loaded in LoadParameters(), + // still need to create other persistable variables. + // So in both case, create persistable variables at first. + executor_->CreateVariables(*inference_program_, 0, true, sub_scope_); + + // Optimize the program, and load parameters and modify them in the + // scope_. + // This will change the scope_ address. + if (config_.ir_optim()) { + status_ir_optim_enabled_ = true; + OptimizeInferenceProgram(); + } else { + // Load parameters + LOG(INFO) << "load parameters "; + LoadParameters(); + } + } else { + // If the program is passed from external, no need to optimize it, this + // logic is used in the clone scenario. + inference_program_ = program; + } + + executor_->CreateVariables(*inference_program_, 0, false, sub_scope_); + + return true; +} +bool AnalysisPredictor::CreateExecutor() { + if (config_.use_gpu_) { + status_use_gpu_ = true; + place_ = paddle::platform::CUDAPlace(config_.device_id_); + } else { + place_ = paddle::platform::CPUPlace(); + } + executor_.reset(new paddle::framework::NaiveExecutor(place_)); + return true; +} +bool AnalysisPredictor::PrepareExecutor() { + executor_->Prepare(sub_scope_, *inference_program_, 0, + config_.use_feed_fetch_ops_); + + PADDLE_ENFORCE_NOT_NULL(sub_scope_); + + return true; +} + +void AnalysisPredictor::MkldnnPreSet(const std::vector &inputs) { +#ifdef PADDLE_WITH_MKLDNN + VLOG(2) << "AnalysisPredictor::Run get_cur_mkldnn_session_id=" + << platform::get_cur_mkldnn_session_id(); + // In cache clearing mode. + if (config_.mkldnn_cache_capacity_ > 0) { + VLOG(2) << "In mkldnn cache clear mode."; + platform::set_cur_mkldnn_session_id( + platform::kMKLDNNSessionID_CacheClearing); + platform::set_cur_input_shape_cache_capacity( + config_.mkldnn_cache_capacity_); + // Set current_input_shape for caching dynamic shape. + std::stringstream ss; + for (size_t i = 0; i < inputs.size(); ++i) { + for (size_t j = 0; j < inputs[i].shape.size(); ++j) { + ss << inputs[i].shape[j] << "-"; + } + } + VLOG(2) << "Set input shape=" << ss.str(); + platform::set_cur_input_shape_str(ss.str()); + } +#endif +} + +void AnalysisPredictor::MkldnnPostReset() { +#ifdef PADDLE_WITH_MKLDNN + // In cache clearing mode. + if (config_.mkldnn_cache_capacity_ > 0) { + paddle::platform::set_cur_mkldnn_session_id( + platform::kMKLDNNSessionID_Default); + platform::set_cur_input_shape_cache_capacity(0); + platform::set_cur_input_shape_str(""); + } +#endif +} + +bool AnalysisPredictor::Run(const std::vector &inputs, + std::vector *output_data, + int batch_size) { + paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); +#ifdef PADDLE_WITH_MKLDNN + if (config_.use_mkldnn_) MkldnnPreSet(inputs); +#endif + VLOG(3) << "Predictor::predict"; + inference::Timer timer; + timer.tic(); + // set feed variable + framework::Scope *scope = sub_scope_ ? sub_scope_ : scope_.get(); + PADDLE_ENFORCE_NOT_NULL(scope, "The scope should not be nullptr."); + if (!SetFeed(inputs, scope)) { + LOG(ERROR) << "fail to set feed"; + return false; + } + + // Run the inference program + // if share variables, we need not create variables + executor_->Run(); + + // get fetch variable + if (!GetFetch(output_data, scope)) { + LOG(ERROR) << "fail to get fetches"; + return false; + } + + // Collect variable shapes for memory optimization. + if (need_collect_var_shapes_for_memory_optim()) { + CollectVarShapes(); + } + + VLOG(3) << "predict cost: " << timer.toc() << "ms"; + + // All the containers in the scope will be hold in inference, but the + // operators assume that the container will be reset after each batch. + // Here is a bugfix, collect all the container variables, and reset then to a + // bool; the next time, the operator will call MutableData and construct a new + // container again, so that the container will be empty for each batch. + if (sub_scope_) { + tensor_array_batch_cleaner_.CollectNoTensorVars(sub_scope_); + } + tensor_array_batch_cleaner_.ResetNoTensorVars(); + + // recover the cpu_math_library_num_threads to 1, in order to avoid thread + // conflict when integrating it into deployment service. + paddle::platform::SetNumThreads(1); +#ifdef PADDLE_WITH_MKLDNN + if (config_.use_mkldnn_) MkldnnPostReset(); +#endif + return true; +} + +bool AnalysisPredictor::SetFeed(const std::vector &inputs, + framework::Scope *scope) { + VLOG(3) << "Predictor::set_feed"; + if (inputs.size() != feeds_.size()) { + LOG(ERROR) << "wrong feed input size, need " << feeds_.size() << " but get " + << inputs.size(); + return false; + } + + // Cache the inputs memory for better concurrency performance. + feed_tensors_.resize(inputs.size()); + + for (size_t i = 0; i < inputs.size(); ++i) { + auto &input = feed_tensors_[i]; + framework::DDim ddim = framework::make_ddim(inputs[i].shape); + void *input_ptr; + if (inputs[i].dtype == PaddleDType::INT64) { + input_ptr = input.mutable_data(ddim, place_); + } else if (inputs[i].dtype == PaddleDType::FLOAT32) { + input_ptr = input.mutable_data(ddim, place_); + } else if (inputs[i].dtype == PaddleDType::INT32) { + input_ptr = input.mutable_data(ddim, place_); + } else { + LOG(ERROR) << "unsupported feed type " << inputs[i].dtype; + return false; + } + + PADDLE_ENFORCE_NOT_NULL(input_ptr); + PADDLE_ENFORCE_NOT_NULL(inputs[i].data.data()); + + if (platform::is_cpu_place(place_)) { + // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. + std::memcpy(static_cast(input_ptr), inputs[i].data.data(), + inputs[i].data.length()); + } else { +#ifdef PADDLE_WITH_CUDA + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto *dev_ctx = + static_cast(pool.Get(place_)); + auto dst_gpu_place = boost::get(place_); + memory::Copy(dst_gpu_place, static_cast(input_ptr), + platform::CPUPlace(), inputs[i].data.data(), + inputs[i].data.length(), dev_ctx->stream()); +#else + PADDLE_THROW("Not compile with CUDA, should not reach here."); +#endif + } + // TODO(Superjomn) Low performance, need optimization for heavy LoD copy. + framework::LoD lod; + for (auto &level : inputs[i].lod) { + lod.emplace_back(level); + } + input.set_lod(lod); + int idx = -1; + if (config_.specify_input_name_) { + auto name = inputs[i].name; + if (feed_names_.find(name) == feed_names_.end()) { + LOG(ERROR) << "feed names from program do not have name: [" << name + << "] from specified input"; + } + idx = feed_names_[name]; + } else { + idx = boost::get(feeds_[i]->GetAttr("col")); + } + framework::SetFeedVariable(scope, input, "feed", idx); + } + return true; +} + +template +void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch, + PaddleTensor *output) { + // set shape. + auto shape = framework::vectorize(fetch.dims()); + output->shape.assign(shape.begin(), shape.end()); + // set data. + const T *data = fetch.data(); + int num_elems = inference::VecReduceToInt(shape); + output->data.Resize(num_elems * sizeof(T)); + // The fetched tensor output by fetch op, should always in CPU memory, so just + // copy. + memcpy(output->data.data(), data, num_elems * sizeof(T)); + // set lod + output->lod.clear(); + for (auto &level : fetch.lod()) { + output->lod.emplace_back(level.begin(), level.end()); + } +} + +bool AnalysisPredictor::GetFetch(std::vector *outputs, + framework::Scope *scope) { + VLOG(3) << "Predictor::get_fetch"; + outputs->resize(fetches_.size()); + for (size_t i = 0; i < fetches_.size(); ++i) { + int idx = boost::get(fetches_[i]->GetAttr("col")); + PADDLE_ENFORCE((size_t)idx == i); + framework::LoDTensor &fetch = + framework::GetFetchVariable(*scope, "fetch", idx); + auto type = fetch.type(); + auto output = &(outputs->at(i)); + output->name = fetches_[idx]->Input("X")[0]; + if (type == framework::proto::VarType::FP32) { + GetFetchOne(fetch, output); + output->dtype = PaddleDType::FLOAT32; + } else if (type == framework::proto::VarType::INT64) { + GetFetchOne(fetch, output); + output->dtype = PaddleDType::INT64; + } else if (type == framework::proto::VarType::INT32) { + GetFetchOne(fetch, output); + output->dtype = PaddleDType::INT32; + } else { + LOG(ERROR) << "unknown type, only support float32, int64 and int32 now."; + } + } + return true; +} + +void AnalysisPredictor::PrepareArgument() { + argument_.SetUseGPU(config_.use_gpu()); + argument_.SetGPUDeviceId(config_.gpu_device_id()); + argument_.SetEnableMemoryOptim(config_.enable_memory_optim()); + argument_.SetStaticMemoryOptim(config_.static_memory_optim_); + argument_.SetStaticMemoryOptimForceUpdate( + config_.static_memory_optim_force_update_); + argument_.SetModelFromMemory(config_.model_from_memory_); + // Analyze inference_program + argument_.SetUseAnakin(config_.anakin_engine_enabled()); + argument_.SetPredictorID(predictor_id_); + argument_.SetOptimCacheDir(config_.opt_cache_dir_); + if (!config_.model_dir().empty()) { + argument_.SetModelDir(config_.model_dir()); + } else { + PADDLE_ENFORCE( + !config_.params_file().empty(), + "Either model_dir or (param_file, prog_file) should be set."); + PADDLE_ENFORCE(!config_.prog_file().empty()); + std::string dir = inference::analysis::GetDirRoot(config_.prog_file()); + + argument_.SetModelProgramPath(config_.prog_file()); + argument_.SetModelParamsPath(config_.params_file()); + } + + if (config_.use_gpu() && config_.tensorrt_engine_enabled()) { + LOG(INFO) << "TensorRT subgraph engine is enabled"; + argument_.SetUseTensorRT(true); + argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_); + argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_); + argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_); + argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_); + argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_); + argument_.SetTensorRtUseCalibMode(config_.trt_use_calib_mode_); + } + + if (config_.anakin_engine_enabled()) { + argument_.SetAnakinMaxBatchSize(config_.anakin_max_batchsize_); + argument_.SetAnakinMaxInputShape(config_.anakin_max_input_shape_); + argument_.SetAnakinMinSubgraphSize(config_.anakin_min_subgraph_size_); + argument_.SetAnakinPrecisionMode(config_.anakin_precision_mode_); + argument_.SetAnakinAutoConfigLayout(config_.anakin_auto_config_layout_); + argument_.SetAnakinPassesFilter(config_.anakin_passes_filter_); + argument_.SetAnakinOpsFilter(config_.anakin_ops_filter_); + LOG(INFO) << "Anakin subgraph engine is enabled"; + } + + if (config_.use_mkldnn_) { + LOG(INFO) << "MKLDNN is enabled"; + argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_); + } + +#ifdef PADDLE_WITH_MKLDNN + if (config_.mkldnn_quantizer_enabled()) { + LOG(INFO) << "Quantization is enabled"; + argument_.SetQuantizeEnabledOpTypes( + config_.mkldnn_quantizer_config()->enabled_op_types()); + argument_.SetQuantizeExcludedOpIds( + config_.mkldnn_quantizer_config()->excluded_op_ids()); + } +#endif + + auto passes = config_.pass_builder()->AllPasses(); + if (!config_.ir_optim()) { + passes.clear(); + LOG(INFO) << "ir_optim is turned off, no IR pass will be executed"; + } + argument_.SetIrAnalysisPasses(passes); + argument_.SetAnalysisPasses(config_.pass_builder()->AnalysisPasses()); + argument_.SetScopeNotOwned(scope_.get()); +} + +// NOTE All the members in AnalysisConfig should be copied to Argument. +void AnalysisPredictor::OptimizeInferenceProgram() { + status_program_optimized_ = true; + + PrepareArgument(); + Analyzer().Run(&argument_); + + PADDLE_ENFORCE(argument_.scope_valid()); + VLOG(5) << "to prepare executor"; + ARGUMENT_CHECK_FIELD((&argument_), ir_analyzed_program); + inference_program_.reset( + new framework::ProgramDesc(argument_.ir_analyzed_program())); + // The config and argument take a lot of storage, + // when the predictor settings are complete, we release these stores. + argument_.PartiallyRelease(); + config_.PartiallyRelease(); + LOG(INFO) << "== optimize end =="; +} + +template <> +std::unique_ptr CreatePaddlePredictor< + AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) { + VLOG(3) << "create AnalysisConfig"; + PADDLE_ENFORCE(config.is_valid(), + "Note: Each config can only be used for one predictor."); + if (config.use_gpu()) { + // 1. GPU memory + PADDLE_ENFORCE_GE(config.memory_pool_init_size_mb(), 0.f); + PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d", + config.gpu_device_id()); + std::vector flags; + + float fraction_of_gpu_memory = config.fraction_of_gpu_memory_for_pool(); + if (fraction_of_gpu_memory > 0.95f) { + LOG(ERROR) + << "Allocate too much memory for the GPU memory pool, assigned " + << config.memory_pool_init_size_mb() << " MB"; + LOG(ERROR) + << "Try to shink the value by setting AnalysisConfig::EnableGpu(...)"; + } + + if (fraction_of_gpu_memory >= 0.0f || fraction_of_gpu_memory <= 0.95f) { + flags.push_back("dummpy"); + std::string flag = "--fraction_of_gpu_memory_to_use=" + + std::to_string(fraction_of_gpu_memory); + flags.push_back(flag); + flags.push_back("--selected_gpus=" + + std::to_string(config.gpu_device_id())); + VLOG(3) << "set flag: " << flag; + framework::InitGflags(flags); + } + } + + std::unique_ptr predictor(new AnalysisPredictor(config)); + // Each config can only be used for one predictor. + config.SetInValid(); + auto predictor_p = dynamic_cast(predictor.get()); + + if (!predictor_p->Init(nullptr)) { + return nullptr; + } + + if (config.mkldnn_quantizer_enabled() && !predictor_p->MkldnnQuantize()) { + return nullptr; + } + + return predictor; +} + +bool AnalysisPredictor::MkldnnQuantize() { +#if PADDLE_WITH_MKLDNN + if (!mkldnn_quantizer_) + mkldnn_quantizer_ = new AnalysisPredictor::MkldnnQuantizer( + *this, config_.mkldnn_quantizer_config()); + return mkldnn_quantizer_->Quantize(); +#else + LOG(ERROR) << "Please compile with MKLDNN first to use MkldnnQuantizer"; + return false; +#endif +} + +void AnalysisPredictor::PrepareFeedFetch() { + PADDLE_ENFORCE_NOT_NULL(sub_scope_); + CreateFeedFetchVar(sub_scope_); + for (auto *op : inference_program_->Block(0).AllOps()) { + if (op->Type() == "feed") { + int idx = boost::get(op->GetAttr("col")); + if (feeds_.size() <= static_cast(idx)) { + feeds_.resize(idx + 1); + } + feeds_[idx] = op; + feed_names_[op->Output("Out")[0]] = idx; + idx2feeds_[idx] = op->Output("Out")[0]; + } else if (op->Type() == "fetch") { + int idx = boost::get(op->GetAttr("col")); + if (fetches_.size() <= static_cast(idx)) { + fetches_.resize(idx + 1); + } + fetches_[idx] = op; + idx2fetches_[idx] = op->Input("X")[0]; + } + } +} + +void AnalysisPredictor::CreateFeedFetchVar(framework::Scope *scope) { + PADDLE_ENFORCE_NOT_NULL(scope); + auto *var = scope->Var("feed"); + var->GetMutable(); + var = scope->Var("fetch"); + var->GetMutable(); +} + +std::vector AnalysisPredictor::GetInputNames() { + std::vector input_names; + for (auto &item : idx2feeds_) { + input_names.push_back(item.second); + } + return input_names; +} + +std::vector AnalysisPredictor::GetOutputNames() { + std::vector output_names; + for (auto &item : idx2fetches_) { + output_names.push_back(item.second); + } + return output_names; +} + +std::unique_ptr AnalysisPredictor::GetInputTensor( + const std::string &name) { + PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name); + std::unique_ptr res( + new ZeroCopyTensor(static_cast(executor_->scope()))); + res->input_or_output_ = true; + res->SetName(name); + if (platform::is_cpu_place(place_)) { + res->SetPlace(PaddlePlace::kCPU); + } else { + auto gpu_place = boost::get(place_); + res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); + } + + return res; +} + +std::unique_ptr AnalysisPredictor::GetOutputTensor( + const std::string &name) { + PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name); + std::unique_ptr res( + new ZeroCopyTensor(static_cast(executor_->scope()))); + res->input_or_output_ = false; + res->SetName(name); + if (platform::is_cpu_place(place_)) { + res->SetPlace(PaddlePlace::kCPU); + } else { + auto gpu_place = boost::get(place_); + res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); + } + return res; +} + +bool AnalysisPredictor::ZeroCopyRun() { + paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); + executor_->Run(); + // Fix TensorArray reuse not cleaned bug. + tensor_array_batch_cleaner_.CollectTensorArrays(sub_scope_); + tensor_array_batch_cleaner_.ResetTensorArray(); + + // recover the cpu_math_library_num_threads to 1, in order to avoid thread + // conflict when integrating it into deployment service. + paddle::platform::SetNumThreads(1); + return true; +} + +bool AnalysisPredictor::LoadProgramDesc() { + // Initialize the inference program + std::string filename; + if (!config_.model_dir().empty()) { + filename = config_.model_dir() + "/__model__"; + } else if (!config_.prog_file().empty() && !config_.params_file().empty()) { + // All parameters are saved in a single file. + // The file names should be consistent with that used + // in Python API `fluid.io.save_inference_model`. + filename = config_.prog_file(); + } else { + if (config_.model_dir().empty() && config_.prog_file().empty()) { + LOG(ERROR) + << "Either model_dir or (prog_file, param_file) should be set."; + return false; + } + LOG(ERROR) << string::Sprintf( + "not valid model path '%s' or program path '%s'.", config_.model_dir(), + config_.params_file()); + return false; + } + + // Create ProgramDesc + framework::proto::ProgramDesc proto; + if (!config_.model_from_memory()) { + std::string pb_content; + // Read binary + std::ifstream fin(filename, std::ios::in | std::ios::binary); + PADDLE_ENFORCE(static_cast(fin.is_open()), "Cannot open file %s", + filename); + fin.seekg(0, std::ios::end); + pb_content.resize(fin.tellg()); + fin.seekg(0, std::ios::beg); + fin.read(&(pb_content.at(0)), pb_content.size()); + fin.close(); + + proto.ParseFromString(pb_content); + } else { + proto.ParseFromString(config_.prog_file()); + } + inference_program_.reset(new framework::ProgramDesc(proto)); + return true; +} + +bool AnalysisPredictor::LoadParameters() { + PADDLE_ENFORCE_NOT_NULL(inference_program_.get(), + "The inference program should be loaded first."); + + const auto &global_block = inference_program_->MutableBlock(0); + + // create a temporary program to load parameters. + + std::unique_ptr load_program( + new framework::ProgramDesc()); + framework::BlockDesc *load_block = load_program->MutableBlock(0); + std::vector params; + + for (auto *var : global_block->AllVars()) { + if (IsPersistable(var)) { + VLOG(3) << "persistable variable's name: " << var->Name(); + + framework::VarDesc *new_var = load_block->Var(var->Name()); + new_var->SetShape(var->GetShape()); + new_var->SetDataType(var->GetDataType()); + new_var->SetType(var->GetType()); + new_var->SetLoDLevel(var->GetLoDLevel()); + new_var->SetPersistable(true); + + if (!config_.params_file().empty()) { + params.push_back(new_var->Name()); + } else { + // append_op + framework::OpDesc *op = load_block->AppendOp(); + op->SetType("load"); + op->SetOutput("Out", {new_var->Name()}); + op->SetAttr("file_path", {config_.model_dir() + "/" + new_var->Name()}); + op->CheckAttrs(); + } + } + } + + if (!config_.params_file().empty()) { + // sort paramlist to have consistent ordering + std::sort(params.begin(), params.end()); + // append just the load_combine op + framework::OpDesc *op = load_block->AppendOp(); + op->SetType("load_combine"); + op->SetOutput("Out", params); + op->SetAttr("file_path", {config_.params_file()}); + op->CheckAttrs(); + } + + // Use NaiveExecutor to Load parameters. + framework::NaiveExecutor e(place_); + e.Prepare(scope_.get(), *load_program, 0, false); + e.Run(); + VLOG(3) << "get " << scope_->LocalVarNames().size() << " vars after load"; + + return true; +} + +#if PADDLE_WITH_TENSORRT +bool AnalysisPredictor::SaveTrtCalibToDisk() { + PADDLE_ENFORCE(config_.tensorrt_engine_enabled(), + "This func can be invoked only in trt mode"); + auto &block = inference_program_->Block(0); + for (auto &op_desc : block.AllOps()) { + if (op_desc->Type() == "tensorrt_engine") { + std::string engine_name = + boost::get(op_desc->GetAttr("engine_key")); + if (!Singleton::Global().Has(engine_name)) { + LOG(ERROR) << "You should run the predictor(with trt) on the real data " + "to generate calibration info"; + return false; + } + TRTCalibratorEngine *calib_engine = + Singleton::Global().Get(engine_name); + LOG(INFO) << "Wait for calib threads done."; + calib_engine->calib_->waitAndSetDone(); + LOG(INFO) << "Generating TRT Calibration table data, this may cost a lot " + "of time..."; + calib_engine->thr_->join(); + std::string calibration_table_data = + calib_engine->calib_->getCalibrationTableAsString(); + + if (calibration_table_data.empty()) { + LOG(ERROR) << "the calibration table is empty."; + return false; + } + + std::string model_opt_cache_dir = + argument_.Has("model_dir") + ? argument_.model_dir() + : inference::analysis::GetDirRoot(argument_.model_program_path()); + + std::string calibration_table_data_path = + inference::analysis::GetTrtCalibPath( + inference::analysis::GetOrCreateModelOptCacheDir( + model_opt_cache_dir), + engine_name); + + std::ofstream ofile(calibration_table_data_path, std::ios::out); + LOG(INFO) << "Write Paddle-TRT INT8 calibration table data to file " + << calibration_table_data_path; + ofile << calibration_table_data; + ofile.close(); + } + } + // Free all calibrator resources. + Singleton::Global().DeleteALL(); + return true; +} +#endif + +AnalysisPredictor::~AnalysisPredictor() { +#if PADDLE_WITH_TENSORRT + if (config_.tensorrt_engine_enabled() && + config_.tensorrt_precision_mode_ == AnalysisConfig::Precision::kInt8 && + Singleton::Global().Has()) { + SaveTrtCalibToDisk(); + } +#endif + if (FLAGS_profile) { + platform::DisableProfiler(platform::EventSortingKey::kTotal, + "./profile.log"); + } + if (sub_scope_) { + scope_->DeleteScope(sub_scope_); + } + +#if PADDLE_WITH_MKLDNN + if (mkldnn_quantizer_) { + delete mkldnn_quantizer_; + mkldnn_quantizer_ = nullptr; + } +#endif + + // TODO(Superjomn) deduce the directory path. + std::string out_path = inference::analysis::GetMemoryCachePath( + config_.model_dir(), config_.prog_file()); + if (need_collect_var_shapes_for_memory_optim()) { + SerializeBatchVarShapes(out_path); + } +} + +std::unique_ptr AnalysisPredictor::Clone() { + std::lock_guard lk(clone_mutex_); + auto *x = new AnalysisPredictor(config_); + x->Init(scope_, inference_program_); + return std::unique_ptr(x); +} + +void AnalysisPredictor::CollectVarShapes() { + VLOG(4) << "Collecting var shapes"; + if (batch_var_shapes_.size() >= max_shape_collect_count_) return; + std::map> var_shapes; + for (auto var_name : inference_program_->Block(0).LocalVarNames()) { + auto *var = sub_scope_->FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL(var); + if (var->Type() == framework::VarTypeTrait::kId || + var->Type() == framework::VarTypeTrait::kId) { + auto &tensor = var->Get(); + auto shape = framework::vectorize(tensor.dims()); + var_shapes[var_name].assign(shape.begin(), shape.end()); + } + } + batch_var_shapes_.push_back(var_shapes); + LOG_FIRST_N(INFO, 1) << "Collected " << batch_var_shapes_.size() + << " batch of var shapes for analysis"; +} + +void AnalysisPredictor::SerializeBatchVarShapes(const std::string &path) { + LOG(INFO) << "serialize batch var shapes to " << path; + std::ofstream file(path); + if (!file.is_open()) { + LOG(ERROR) << "failed to serialize the var shapes to " << path; + return; + } + + // The sirialized data format: + // :dim0,dim1,dim2,; + for (auto &batch : batch_var_shapes_) { + for (auto &ele : batch) { + file << ele.first << ":"; + for (size_t i = 0; i < ele.second.size() - 1; i++) { + file << ele.second[i] << ","; + } + file << ele.second.back() << ";"; + } + file << "\n"; + } +} + +bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() { + if (need_collect_var_shapes_ >= 0) return need_collect_var_shapes_; + bool need = false; + // check if the cache exists + if (!config_.enable_memory_optim()) { + need = false; + } else if (config_.static_memory_optim_ && + !inference::IsFileExists(inference::analysis::GetMemoryCachePath( + config_.model_dir(), config_.prog_file()))) { + need = true; + } else if (config_.static_memory_optim_ && + config_.static_memory_optim_force_update_) { + need = true; + } + + need_collect_var_shapes_ = need ? 1 : 0; + return need; +} + +std::string AnalysisPredictor::GetSerializedProgram() const { + return inference_program_->Proto()->SerializeAsString(); +} + +// Add SaveOptimModel +void AnalysisPredictor::SaveOptimModel(const std::string &dir) { + // save model + std::string model_name = dir + "/model"; + std::ofstream outfile; + outfile.open(model_name, std::ios::out | std::ios::binary); + std::string inference_prog_desc = GetSerializedProgram(); + outfile << inference_prog_desc; + // save params + framework::ProgramDesc save_program; + auto *save_block = save_program.MutableBlock(0); + + const framework::ProgramDesc &main_program = program(); + const framework::BlockDesc &global_block = main_program.Block(0); + std::vector save_var_list; + for (framework::VarDesc *var : global_block.AllVars()) { + if (IsPersistable(var)) { + framework::VarDesc *new_var = save_block->Var(var->Name()); + new_var->SetShape(var->GetShape()); + new_var->SetDataType(var->GetDataType()); + new_var->SetType(var->GetType()); + new_var->SetLoDLevel(var->GetLoDLevel()); + new_var->SetPersistable(true); + + save_var_list.push_back(new_var->Name()); + } + } + std::sort(save_var_list.begin(), save_var_list.end()); + auto *op = save_block->AppendOp(); + op->SetType("save_combine"); + op->SetInput("X", save_var_list); + op->SetAttr("file_path", dir + "/params"); + op->CheckAttrs(); + + platform::CPUPlace place; + framework::Executor exe(place); + exe.Run(save_program, scope(), 0, true, true); +} + +template <> +std::unique_ptr CreatePaddlePredictor( + const AnalysisConfig &config) { + return CreatePaddlePredictor( + config); +} + +} // namespace paddle + +#if PADDLE_WITH_TENSORRT +USE_TRT_CONVERTER(elementwise_add_weight); +USE_TRT_CONVERTER(elementwise_add_tensor); +USE_TRT_CONVERTER(elementwise_sub_tensor); +USE_TRT_CONVERTER(elementwise_div_tensor); +USE_TRT_CONVERTER(elementwise_mul_tensor); +USE_TRT_CONVERTER(elementwise_max_tensor); +USE_TRT_CONVERTER(elementwise_min_tensor); +USE_TRT_CONVERTER(elementwise_pow_tensor); +USE_TRT_CONVERTER(mul); +USE_TRT_CONVERTER(conv2d); +USE_TRT_CONVERTER(relu); +USE_TRT_CONVERTER(sigmoid); +USE_TRT_CONVERTER(tanh); +USE_TRT_CONVERTER(fc); +USE_TRT_CONVERTER(pool2d); +USE_TRT_CONVERTER(softmax); +USE_TRT_CONVERTER(batch_norm); +USE_TRT_CONVERTER(concat); +USE_TRT_CONVERTER(dropout); +USE_TRT_CONVERTER(pad); +USE_TRT_CONVERTER(split); +USE_TRT_CONVERTER(prelu); +USE_TRT_CONVERTER(conv2d_transpose); +USE_TRT_CONVERTER(leaky_relu); +#endif + +#if PADDLE_WITH_ANAKIN +USE_ANAKIN_CONVERTER(mul); +USE_ANAKIN_CONVERTER(fc); +USE_ANAKIN_CONVERTER(conv2d); +USE_ANAKIN_CONVERTER(conv2d_fusion); +USE_ANAKIN_CONVERTER(concat); +USE_ANAKIN_CONVERTER(split); +USE_ANAKIN_CONVERTER(relu); +USE_ANAKIN_CONVERTER(sigmoid); +USE_ANAKIN_CONVERTER(tanh); +USE_ANAKIN_CONVERTER(pool2d); +USE_ANAKIN_CONVERTER(elementwise_add); +USE_ANAKIN_CONVERTER(elementwise_mul); +USE_ANAKIN_CONVERTER(batch_norm); +USE_ANAKIN_CONVERTER(flatten); +USE_ANAKIN_CONVERTER(reshape); +USE_ANAKIN_CONVERTER(transpose); +USE_ANAKIN_CONVERTER(softmax); +USE_ANAKIN_CONVERTER(detection_out); +USE_ANAKIN_CONVERTER(density_prior_box); +USE_ANAKIN_CONVERTER(dropout); +USE_ANAKIN_CONVERTER(sum); +USE_ANAKIN_CONVERTER(prior_box); +USE_ANAKIN_CONVERTER(leaky_relu); +USE_ANAKIN_CONVERTER(affine_channel); +USE_ANAKIN_CONVERTER(relu6); +USE_ANAKIN_CONVERTER(swish); +USE_ANAKIN_CONVERTER(shuffle_channel); +#endif diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h new file mode 100644 index 00000000..7a366b10 --- /dev/null +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -0,0 +1,187 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/api/api_impl.h" +#include "paddle/fluid/inference/api/details/reset_tensor_array.h" +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/string/printf.h" +#ifdef PADDLE_WITH_TESTING +#include +#include +#endif +namespace paddle { + +using inference::analysis::Argument; +using inference::analysis::Analyzer; +using framework::proto::ProgramDesc; +using framework::NaiveExecutor; + +/** \brief This predictor is based on the original native predictor with IR and + * Analysis support. + * + * It will optimize IR and Parameters in the runtime. + * + * TODO(Superjomn) Replace the Navive predictor? + */ +class AnalysisPredictor : public PaddlePredictor { + public: + explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) { + predictor_id_ = inference::GetUniqueId(); + } + ~AnalysisPredictor(); + + bool Init(const std::shared_ptr &parent_scope, + const std::shared_ptr &program = nullptr); + + bool Run(const std::vector &inputs, + std::vector *output_data, + int batch_size = -1) override; + + std::vector GetInputNames(); + std::vector GetOutputNames(); + + std::unique_ptr GetInputTensor( + const std::string &name) override; + std::unique_ptr GetOutputTensor( + const std::string &name) override; + + bool ZeroCopyRun() override; + + void CreateFeedFetchVar(framework::Scope *scope); + void PrepareFeedFetch(); + + void PrepareArgument(); + void OptimizeInferenceProgram(); + + Argument &analysis_argument() { return argument_; } + + std::unique_ptr Clone() override; + + framework::Scope *scope() { return scope_.get(); } + framework::ProgramDesc &program() { return *inference_program_; } + + std::string GetSerializedProgram() const override; + + bool MkldnnQuantize(); + + // save program to model + // save parameters to params + void SaveOptimModel(const std::string &dir); + + protected: + // For memory optimization. + bool need_collect_var_shapes_for_memory_optim(); + void CollectVarShapes(); + void SerializeBatchVarShapes(const std::string &path); + + bool PrepareProgram(const std::shared_ptr &program); + bool PrepareScope(const std::shared_ptr &parent_scope); + bool CreateExecutor(); + bool PrepareExecutor(); + + bool LoadProgramDesc(); + bool LoadParameters(); + + bool SetFeed(const std::vector &input_datas, + framework::Scope *scope); + bool GetFetch(std::vector *output_data, + framework::Scope *scope); + template + void GetFetchOne(const framework::LoDTensor &fetchs, + PaddleTensor *output_data); + // PreSet and PostReset for Mkldnn multi-thread and dynamic shape input. + // Used in AnalysisPredictor::Run(), do not support + // AnalysisPredictor::ZeroRun() now. + void MkldnnPreSet(const std::vector &inputs); + void MkldnnPostReset(); + +#if PADDLE_WITH_TENSORRT + // When we use Paddle-TRT INT8 engine, we need to generate calibration table + // data first, + // the calibration table contains the range for each op's input and output, + // this whole process can be divided into several steps: + // + // 1. Builds a 32-bit engine, runs it on the calibration set, and records a + // histogram for each + // tensor of the distribution of activation values. + // 2. Builds a calibration table from the histograms. + // + // After step 2, we need to store the calibration table on disk + bool SaveTrtCalibToDisk(); +#endif + +// Some more detailed tests, they are made the friends of the predictor, so that +// the all the details can be tested. +#if PADDLE_WITH_TESTING + FRIEND_TEST(AnalysisPredictor, analysis_off); + FRIEND_TEST(AnalysisPredictor, analysis_on); + FRIEND_TEST(AnalysisPredictor, with_gpu); +#endif + + private: + AnalysisConfig config_; + Argument argument_; + std::unique_ptr executor_; + platform::Place place_; + std::shared_ptr scope_; + framework::Scope *sub_scope_{nullptr}; + std::shared_ptr inference_program_; + std::vector feeds_; + std::map feed_names_; + // Sorted according to the idx. + std::map idx2feeds_; + std::vector fetches_; + std::map idx2fetches_; + +#if PADDLE_WITH_MKLDNN + // Helper class to perform quantization + class MkldnnQuantizer; + MkldnnQuantizer *mkldnn_quantizer_{nullptr}; + +#if PADDLE_WITH_TESTING + friend class MkldnnQuantizerTest; +#endif +#endif + + // Memory buffer for feed inputs. The temporary LoDTensor will cause serious + // concurrency problems, wrong results and memory leak, so cache them. + std::vector feed_tensors_; + details::TensorArrayBatchCleaner tensor_array_batch_cleaner_; + // A mutex help to make Clone thread safe. + std::mutex clone_mutex_; + + // For memory optimization. + const size_t max_shape_collect_count_{1000}; + int need_collect_var_shapes_{-1}; // -1 for default, 0 for false, 1 for true. + std::vector>> batch_var_shapes_; + int predictor_id_; + + private: + // Some status here that help to determine the status inside the predictor. + bool status_program_optimized_{false}; + bool status_is_cloned_{false}; + bool status_use_gpu_{false}; + bool status_ir_optim_enabled_{false}; +}; + +} // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc new file mode 100644 index 00000000..44b1b807 --- /dev/null +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -0,0 +1,491 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/api/analysis_predictor.h" +#include +#include +#include // NOLINT +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/inference/api/mkldnn_quantizer.h" +#endif + +DEFINE_string(dirname, "", "dirname to tests."); + +namespace paddle { + +TEST(AnalysisPredictor, analysis_off) { + AnalysisConfig config; + config.SetModel(FLAGS_dirname); + config.SwitchIrOptim(false); + + auto _predictor = CreatePaddlePredictor(config); + auto* predictor = static_cast(_predictor.get()); + + // Without analysis, the scope_ and sub_scope_ are created by predictor + // itself. + ASSERT_TRUE(predictor->scope_); + ASSERT_TRUE(predictor->sub_scope_); + ASSERT_EQ(predictor->scope_->parent(), nullptr); + ASSERT_EQ(predictor->sub_scope_->parent(), predictor->scope_.get()); + // ir is turned off, so program shouldn't be optimized. + ASSERT_FALSE(predictor->status_program_optimized_); + LOG(INFO) << "scope parameters " << predictor->scope_->LocalVarNames().size(); + + // 2. Dummy Input Data + int64_t data[4] = {1, 2, 3, 4}; + PaddleTensor tensor; + tensor.shape = std::vector({4, 1}); + tensor.data.Reset(data, sizeof(data)); + tensor.dtype = PaddleDType::INT64; + + std::vector inputs(4, tensor); + std::vector outputs; + ASSERT_TRUE(predictor->Run(inputs, &outputs)); +} + +TEST(AnalysisPredictor, analysis_on) { + AnalysisConfig config; + config.SetModel(FLAGS_dirname); + config.SwitchIrOptim(true); +#ifdef PADDLE_WITH_CUDA + config.EnableUseGpu(100, 0); +#else + config.DisableGpu(); +#endif + + auto _predictor = CreatePaddlePredictor(config); + auto* predictor = static_cast(_predictor.get()); + + ASSERT_TRUE(predictor->scope_); + ASSERT_TRUE(predictor->sub_scope_); + ASSERT_EQ(predictor->scope_->parent(), nullptr); + ASSERT_EQ(predictor->sub_scope_->parent(), predictor->scope_.get()); + // ir is turned on, so program should be optimized. + ASSERT_TRUE(predictor->status_program_optimized_); + // 2. Dummy Input Data + int64_t data[4] = {1, 2, 3, 4}; + PaddleTensor tensor; + tensor.shape = std::vector({4, 1}); + tensor.data.Reset(data, sizeof(data)); + tensor.dtype = PaddleDType::INT64; + + std::vector inputs(4, tensor); + std::vector outputs; + ASSERT_TRUE(predictor->Run(inputs, &outputs)); + + for (auto& output : outputs) { + LOG(INFO) << inference::DescribeTensor(output); + } + + // compare with NativePredictor + auto naive_predictor = + CreatePaddlePredictor(config.ToNativeConfig()); + std::vector naive_outputs; + ASSERT_TRUE(naive_predictor->Run(inputs, &naive_outputs)); + ASSERT_EQ(naive_outputs.size(), 1UL); + inference::CompareTensor(outputs.front(), naive_outputs.front()); +} + +TEST(AnalysisPredictor, ZeroCopy) { + AnalysisConfig config; + config.SetModel(FLAGS_dirname); + config.SwitchUseFeedFetchOps(false); + auto predictor = CreatePaddlePredictor(config); + + auto w0 = predictor->GetInputTensor("firstw"); + auto w1 = predictor->GetInputTensor("secondw"); + auto w2 = predictor->GetInputTensor("thirdw"); + auto w3 = predictor->GetInputTensor("forthw"); + + w0->Reshape({4, 1}); + w1->Reshape({4, 1}); + w2->Reshape({4, 1}); + w3->Reshape({4, 1}); + + auto* w0_data = w0->mutable_data(PaddlePlace::kCPU); + auto* w1_data = w1->mutable_data(PaddlePlace::kCPU); + auto* w2_data = w2->mutable_data(PaddlePlace::kCPU); + auto* w3_data = w3->mutable_data(PaddlePlace::kCPU); + + for (int i = 0; i < 4; i++) { + w0_data[i] = i; + w1_data[i] = i; + w2_data[i] = i; + w3_data[i] = i; + } + + predictor->ZeroCopyRun(); + + auto out = predictor->GetOutputTensor("fc_1.tmp_2"); + PaddlePlace place; + int size = 0; + auto* out_data = out->data(&place, &size); + LOG(INFO) << "output size: " << size / sizeof(float); + LOG(INFO) << "output_data: " << out_data; +} + +TEST(AnalysisPredictor, Clone) { + AnalysisConfig config; + config.SetModel(FLAGS_dirname); + config.SwitchUseFeedFetchOps(true); + config.SwitchIrOptim(true); + + std::vector> predictors; + predictors.emplace_back(CreatePaddlePredictor(config)); + + LOG(INFO) << "************** to clone ************************"; + const int num_threads = 3; + for (int i = 1; i < num_threads; i++) { + predictors.emplace_back(predictors.front()->Clone()); + } + + auto* root_scope = + static_cast(predictors[0].get())->scope(); + ASSERT_FALSE(root_scope->kids().empty()); + LOG(INFO) << "***** scope ******\n" + << framework::GenScopeTreeDebugInfo(root_scope); + + // 2. Dummy Input Data + int64_t data[4] = {1, 2, 3, 4}; + PaddleTensor tensor; + tensor.shape = std::vector({4, 1}); + tensor.data.Reset(data, sizeof(data)); + tensor.dtype = PaddleDType::INT64; + + std::vector inputs(4, tensor); + std::vector outputs; + predictors[0]->Run(inputs, &outputs); + + LOG(INFO) << "Run with single thread"; + for (int i = 0; i < num_threads; i++) { + LOG(INFO) << "run predictor " << i; + ASSERT_TRUE(predictors[i]->Run(inputs, &outputs)); + } + + LOG(INFO) << "Run with multiple threads"; + std::vector threads; + for (int i = 0; i < num_threads; i++) { + threads.emplace_back([&predictors, &inputs, i] { + LOG(INFO) << "thread #" << i << " running"; + std::vector outputs; + auto predictor = predictors.front()->Clone(); + for (int j = 0; j < 10; j++) { + ASSERT_TRUE(predictor->Run(inputs, &outputs)); + } + }); + } + + for (auto& t : threads) { + t.join(); + } +} + +// This function is not released yet, will fail on some machine. +// TODO(Superjomn) Turn on it latter. +/* +TEST(AnalysisPredictor, memory_optim) { + AnalysisConfig config(FLAGS_dirname); + config.DisableGpu(); + config.EnableMemoryOptim(true); + config.SwitchIrDebug(); + + auto native_predictor = + CreatePaddlePredictor(config.ToNativeConfig()); + + // 2. Dummy Input Data + int64_t data[4] = {1, 2, 3, 4}; + PaddleTensor tensor; + tensor.shape = std::vector({4, 1}); + tensor.data.Reset(data, sizeof(data)); + tensor.dtype = PaddleDType::INT64; + + std::vector inputs(4, tensor); + std::vector output, output1; + + { + // The first predictor help to cache the memory optimize strategy. + auto predictor = CreatePaddlePredictor(config); + LOG(INFO) << "serialized program: " << predictor->GetSerializedProgram(); + ASSERT_FALSE(predictor->GetSerializedProgram().empty()); + + // Run several times to check the parameters are not reused by mistake. + for (int i = 0; i < 5; i++) { + ASSERT_TRUE(predictor->Run(inputs, &output)); + } + } + + { + output.clear(); + // The second predictor to perform memory optimization. + config.EnableMemoryOptim(false); + auto predictor = CreatePaddlePredictor(config); + + // Run with memory optimization + ASSERT_TRUE(predictor->Run(inputs, &output)); + } + + // Run native + ASSERT_TRUE(native_predictor->Run(inputs, &output1)); + + LOG(INFO) << "the output " << inference::DescribeTensor(output.front()); + LOG(INFO) << "the native output " + << inference::DescribeTensor(output1.front()); + + inference::CompareResult(output, output1); +} +*/ + +#ifdef PADDLE_WITH_MKLDNN +class MkldnnQuantizerTest : public testing::Test { + public: + MkldnnQuantizerTest() { + AnalysisConfig config(FLAGS_dirname); + + predictor.reset(new AnalysisPredictor(config)); + auto* predictor_p = static_cast(predictor.get()); + + auto qconfig = new MkldnnQuantizerConfig(); + + mkldnn_quantizer.reset( + new AnalysisPredictor::MkldnnQuantizer(*predictor_p, qconfig)); + } + + std::pair, float> Histogram( + const framework::LoDTensor& var_tensor, float min_val, float max_val, + int num_bins) const { + return mkldnn_quantizer->Histogram(var_tensor, min_val, max_val, num_bins); + } + + std::pair GetMaxScalingFactor( + const framework::LoDTensor& var_tensor, bool is_unsigned) const { + return mkldnn_quantizer->GetMaxScalingFactor(var_tensor, is_unsigned); + } + + std::pair GetMaxChScalingFactor( + const framework::LoDTensor& var_tensor, bool is_unsigned) const { + return mkldnn_quantizer->GetMaxChScalingFactor(var_tensor, is_unsigned); + } + + std::pair GetKLScalingFactor( + const framework::LoDTensor& var_tensor, bool is_unsigned) const { + return mkldnn_quantizer->GetKLScalingFactor(var_tensor, is_unsigned); + } + + protected: + std::unique_ptr predictor; + std::unique_ptr mkldnn_quantizer; + float abs_error = 1e-6; + static const std::array non_negative_values; + static const std::array positive_and_negative_values; +}; + +const std::array MkldnnQuantizerTest::non_negative_values = { + 0.0158671, 0.026459, 0.0280772, 0.00962479, 0.0131628, + 0.016704, 0.00118407, 0.00765726, 0.0123213, 0.00944741}; +const std::array MkldnnQuantizerTest::positive_and_negative_values = + {-0.0482659, -0.0102493, -0.00794221, -0.00387115, -0.00674586, + -0.0495346, 0.0629528, -0.00531285, -0.0230353, 0.0269089}; + +TEST_F(MkldnnQuantizerTest, histogram_inverted_min_max) { + const auto& values = non_negative_values; + auto min_val = *std::min_element(values.begin(), values.end()); + auto max_val = *std::max_element(values.begin(), values.end()); + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace())); + + ASSERT_THROW(Histogram(var_tensor, max_val, min_val, 3), + platform::EnforceNotMet); +} + +TEST_F(MkldnnQuantizerTest, histogram_non_negative_to_3) { + // all non-negative values + const auto& values = non_negative_values; + auto min_val = *std::min_element(values.begin(), values.end()); + auto max_val = *std::max_element(values.begin(), values.end()); + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace())); + + std::vector histogram; + float bin_width; + + std::tie(histogram, bin_width) = Histogram(var_tensor, min_val, max_val, 3); + + ASSERT_NEAR(bin_width, std::abs(max_val - min_val) / 3.f, abs_error) + << "Improperly calculated bin_width."; + + ASSERT_EQ(histogram[0], 4); + ASSERT_EQ(histogram[1], 4); + ASSERT_EQ(histogram[2], 2); +} + +TEST_F(MkldnnQuantizerTest, histogram_positive_and_negative_to_3) { + const auto& values = positive_and_negative_values; + auto min_val = *std::min_element(values.begin(), values.end()); + auto max_val = *std::max_element(values.begin(), values.end()); + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace())); + + std::vector histogram; + float bin_width; + + std::tie(histogram, bin_width) = Histogram(var_tensor, min_val, max_val, 3); + + ASSERT_NEAR(bin_width, std::abs(max_val - min_val) / 3.0f, abs_error) + << "Improperly calculated bin_width."; + + ASSERT_EQ(histogram[0], 3); + ASSERT_EQ(histogram[1], 5); + ASSERT_EQ(histogram[2], 2); +} + +TEST_F(MkldnnQuantizerTest, histogram_zero_bins) { + const auto& values = non_negative_values; + auto min_val = *std::min_element(values.begin(), values.end()); + auto max_val = *std::max_element(values.begin(), values.end()); + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace())); + + ASSERT_THROW(Histogram(var_tensor, min_val, max_val, 0), + platform::EnforceNotMet); +} + +TEST_F(MkldnnQuantizerTest, histogram_empty) { + // empty tensor + ASSERT_THROW(Histogram({}, -1, 1, 1), platform::EnforceNotMet); + + // zero tensor + framework::LoDTensor var_tensor; + var_tensor.Resize({0}); + var_tensor.mutable_data(platform::CPUPlace()); + + ASSERT_THROW(Histogram(var_tensor, -1, 1, 1), platform::EnforceNotMet); +} + +TEST_F(MkldnnQuantizerTest, kl_scaling_factor_signed) { + const auto& values = positive_and_negative_values; + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace())); + + bool is_unsigned; + framework::LoDTensor lod_tensor; + + std::tie(is_unsigned, lod_tensor) = GetKLScalingFactor(var_tensor, false); + + ASSERT_EQ(is_unsigned, false); + ASSERT_EQ(lod_tensor.numel(), 1); + ASSERT_NEAR(lod_tensor.data()[0], 1.0 / 0.0899106152344, abs_error); +} + +TEST_F(MkldnnQuantizerTest, max_scaling_factor_signed) { + const auto& values = positive_and_negative_values; + auto max_val = *std::max_element(values.begin(), values.end()); + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace())); + + bool is_unsigned; + framework::LoDTensor lod_tensor; + + std::tie(is_unsigned, lod_tensor) = GetMaxScalingFactor(var_tensor, false); + + ASSERT_EQ(is_unsigned, false); + ASSERT_EQ(lod_tensor.numel(), 1); + ASSERT_NEAR(lod_tensor.data()[0], 1.0 / max_val, abs_error); +} + +TEST_F(MkldnnQuantizerTest, max_scaling_factor_unsigned) { + const auto& values = non_negative_values; + auto max_val = *std::max_element(values.begin(), values.end()); + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace())); + + bool is_unsigned; + framework::LoDTensor lod_tensor; + + std::tie(is_unsigned, lod_tensor) = GetMaxScalingFactor(var_tensor, true); + + ASSERT_EQ(is_unsigned, true); + ASSERT_EQ(lod_tensor.numel(), 1); + ASSERT_NEAR(lod_tensor.data()[0], 1.0 / max_val, abs_error); +} + +TEST_F(MkldnnQuantizerTest, max_scaling_factor_chwise_unsigned) { + const auto& values = non_negative_values; + auto max_val = *std::max_element(values.begin(), values.end()); + int channels = 3; + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(channels, 1, 1, values.size())); + for (int i = 0; i < channels; i++) + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace()) + + i * values.size()); + + bool is_unsigned; + framework::LoDTensor lod_tensor; + + std::tie(is_unsigned, lod_tensor) = GetMaxChScalingFactor(var_tensor, true); + + ASSERT_EQ(is_unsigned, true); + ASSERT_EQ(lod_tensor.numel(), channels); + for (int i = 0; i < channels; i++) { + ASSERT_NEAR(lod_tensor.data()[i], 1.0 / max_val, abs_error); + } +} + +TEST_F(MkldnnQuantizerTest, kl_scaling_factor_unsigned) { + const auto& values = non_negative_values; + + framework::LoDTensor var_tensor; + var_tensor.Resize(framework::make_dim(values.size())); + std::copy(begin(values), end(values), + var_tensor.mutable_data(platform::CPUPlace())); + + bool is_unsigned; + framework::LoDTensor lod_tensor; + + std::tie(is_unsigned, lod_tensor) = GetKLScalingFactor(var_tensor, true); + + ASSERT_EQ(is_unsigned, true); + ASSERT_EQ(lod_tensor.numel(), 1); + ASSERT_NEAR(lod_tensor.data()[0], 1.0 / 0.0252845321362, abs_error); +} +#endif + +} // namespace paddle diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc new file mode 100644 index 00000000..fc2d7b48 --- /dev/null +++ b/paddle/fluid/inference/api/api.cc @@ -0,0 +1,113 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "paddle/fluid/framework/commit.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_pass_builder.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { + +int PaddleDtypeSize(PaddleDType dtype) { + switch (dtype) { + case PaddleDType::FLOAT32: + return sizeof(float); + case PaddleDType::INT64: + return sizeof(int64_t); + case PaddleDType::INT32: + return sizeof(int32_t); + default: + assert(false); + return -1; + } +} + +PaddleBuf::PaddleBuf(PaddleBuf &&other) + : data_(other.data_), + length_(other.length_), + memory_owned_(other.memory_owned_) { + other.memory_owned_ = false; + other.data_ = nullptr; + other.length_ = 0; +} + +PaddleBuf::PaddleBuf(const PaddleBuf &other) { *this = other; } + +PaddleBuf &PaddleBuf::operator=(const PaddleBuf &other) { + if (!other.memory_owned_) { + data_ = other.data_; + length_ = other.length_; + memory_owned_ = other.memory_owned_; + } else { + Resize(other.length()); + PADDLE_ENFORCE(!(other.length() > 0 && other.data() == nullptr)); + memcpy(data_, other.data(), other.length()); + length_ = other.length(); + memory_owned_ = true; + } + return *this; +} + +PaddleBuf &PaddleBuf::operator=(PaddleBuf &&other) { + // only the buffer with external memory can be copied + data_ = other.data_; + length_ = other.length_; + memory_owned_ = other.memory_owned_; + other.data_ = nullptr; + other.length_ = 0; + other.memory_owned_ = false; + return *this; +} + +void PaddleBuf::Resize(size_t length) { + // Only the owned memory can be reset, the external memory can't be changed. + if (length_ >= length) return; + if (memory_owned_) { + Free(); + data_ = malloc(length); + length_ = length; + memory_owned_ = true; + } else { + PADDLE_THROW("The memory is allocated externally, can not Resized"); + } +} + +void PaddleBuf::Reset(void *data, size_t length) { + Free(); + memory_owned_ = false; + data_ = data; + length_ = length; +} + +void PaddleBuf::Free() { + if (memory_owned_ && data_) { + PADDLE_ENFORCE_GT(length_, 0UL); + free(static_cast(data_)); + data_ = nullptr; + length_ = 0; + } +} + +std::string get_version() { + std::stringstream ss; + ss << "version: " << framework::paddle_version() << "\n"; + ss << "commit: " << framework::paddle_commit() << "\n"; + ss << "branch: " << framework::paddle_compile_branch() << "\n"; + return ss.str(); +} + +} // namespace paddle diff --git a/paddle/fluid/inference/api/api_anakin_engine.cc b/paddle/fluid/inference/api/api_anakin_engine.cc new file mode 100644 index 00000000..e38531a4 --- /dev/null +++ b/paddle/fluid/inference/api/api_anakin_engine.cc @@ -0,0 +1,454 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "paddle/fluid/inference/api/api_anakin_engine.h" +#include "paddle/fluid/inference/api/paddle_api.h" + +#include "framework/core/net/net.h" +#include "framework/operators/ops.h" +#include "saber/funcs/timer.h" + +namespace paddle { + +using paddle::contrib::AnakinConfig; +template +extern std::mutex PaddleInferenceAnakinPredictor::mutex_; +template +extern std::once_flag PaddleInferenceAnakinPredictor::init_anakin_; + +template +void PaddleInferenceAnakinPredictor::InitEnv() { + std::call_once(this->init_anakin_, [this]() { + anakin::Env::env_init(this->config_.max_stream); + }); + anakin::TargetWrapper::set_device(this->config_.device_id); +} +template +void PaddleInferenceAnakinPredictor::InitNet() { + std::unique_lock lock(this->mutex_); + this->executor_p_ = new anakin::Net(*this->graph_p_, true); +} +template +void PaddleInferenceAnakinPredictor::SetContext() { + this->ctx_p_ = std::make_shared>( + this->config_.device_id, this->config_.data_stream_id, + this->config_.compute_stream_id); +} +template +void PaddleInferenceAnakinPredictor::InitGraph() { + this->graph_p_ = + std::make_shared>(); + if (!this->config_.model_file.empty()) { + this->graph_p_->load(this->config_.model_file); + } else if (this->config_.model_buf_p) { + this->graph_p_->load(this->config_.model_buf_p, + this->config_.model_buf_len); + } else { + LOG(FATAL) << "Model load error."; + } + this->input_names_ = this->graph_p_->get_ins(); + this->output_names_ = this->graph_p_->get_outs(); + for (auto &input_str : this->input_names_) { + if (this->config_.init_inputs_shape.find(input_str) == + this->config_.init_inputs_shape.end()) { + LOG(FATAL) << input_str << " should be set in init_inputs_shape."; + } + std::vector shape = + this->config_.init_inputs_shape.find(input_str)->second; + this->graph_p_->Reshape(input_str, shape); + } +} +template +void PaddleInferenceAnakinPredictor::OptimizeGraph() { + if (!this->graph_p_->Optimize()) { + LOG(FATAL) << "Graph optimization error."; + } +} +template +void PaddleInferenceAnakinPredictor::InitPredictor() { + this->InitEnv(); + this->SetContext(); + this->InitGraph(); + this->OptimizeGraph(); + this->InitNet(); +} +template +void PaddleInferenceAnakinPredictor::Predict() { + anakin::TargetWrapper::device_sync(); + this->executor_p_->prediction(); + anakin::TargetWrapper::device_sync(); +} +template +bool PaddleInferenceAnakinPredictor::Run( + const std::vector &inputs, + std::vector *output_data, int batch_size) { + if (this->config_.re_allocable) { + return this->RunImpl(inputs, output_data); + } else { + // Run inputs data that exceeds batch size in batches. + // 1. Reassign the batch size. + if (batch_size == -1) { + if (!inputs[0].lod.empty()) { + batch_size = inputs[0].lod[0].size() - 1; + } else { + batch_size = inputs[0].shape[0]; + } + } + // 2. If the data don't need to be batched, run it directly. + if (batch_size <= this->config_.init_batch_size) { + return this->RunImpl(inputs, output_data); + } + // 3. Check the batch size and define temporary variables. + std::vector cur_inputs; + std::vector outputs_master; + std::vector> outputs_vec; + for (const auto &input : inputs) { + if (!input.lod.empty()) { + if (input.lod.size() != 1) { + return false; + } + if (input.lod[0].size() - 1 != batch_size) { + return false; + } + } else { + LOG(INFO) << "Non-lod mode to be implemented."; + return false; + } + PaddleTensor tensor; + tensor.name = input.name; + tensor.dtype = PaddleDType::FLOAT32; + cur_inputs.push_back(tensor); + } + for (auto output : *output_data) { + PaddleTensor tensor; + tensor.name = output.name; + outputs_master.push_back(tensor); + } + // 4. Batch execution. + for (size_t start_batch = 0; start_batch < batch_size;) { + auto end_batch = start_batch + this->config_.init_batch_size; + if (end_batch > batch_size) { + end_batch = batch_size; + } + auto cur_outputs = outputs_master; + for (size_t i = 0; i < inputs.size(); i++) { + auto start = inputs[i].lod[0][start_batch]; + auto end = inputs[i].lod[0][end_batch]; + std::vector offsets; + for (size_t j = start_batch; j <= end_batch; j++) { + offsets.push_back(inputs[i].lod[0][j] - + inputs[i].lod[0][start_batch]); + } + auto mem_start = static_cast(inputs[i].data.data()) + start; + cur_inputs[i].data = + PaddleBuf(mem_start, (end - start) * sizeof(float)); + cur_inputs[i].lod = std::vector>({offsets}); + cur_inputs[i].shape = + std::vector({static_cast(end - start), 1, 1, 1}); + } + if (!this->RunImpl(cur_inputs, &cur_outputs)) { + return false; + } + outputs_vec.push_back(cur_outputs); + start_batch = end_batch; + } + // 5. Copy the results to contiguous memory. + // Assume that each batch has the same final outputs size. + auto count = [](const std::vector &v) { + int cnt = 1; + for_each(v.begin(), v.end(), [&cnt](int n) { cnt *= n; }); + return cnt; + }; + for (size_t i = 0; i < output_data->size(); i++) { + std::vector shape = outputs_vec[i][0].shape; + shape[0] = batch_size; + int total_cnt = count(shape); + (*output_data)[i].shape = shape; + (*output_data)[i].data.Resize(total_cnt * sizeof(float)); + float *addr = static_cast((*output_data)[i].data.data()); + for (const auto &single_out : outputs_vec) { + int cnt = count(single_out[i].shape); + memcpy(addr, single_out[i].data.data(), cnt * sizeof(float)); + addr += cnt; + } + } + } + return true; +} +template +bool PaddleInferenceAnakinPredictor::RunImpl( + const std::vector &inputs, + std::vector *output_data) { + anakin::TargetWrapper::set_device(this->config_.device_id); + for (const auto &input : inputs) { + if (input.dtype != PaddleDType::FLOAT32) { + LOG(FATAL) << "Only support float type inputs. " << input.name + << "'s type is not float"; + } + auto d_tensor_p = this->executor_p_->get_in(input.name); + auto net_shape = d_tensor_p->valid_shape(); + if (net_shape.size() != input.shape.size()) { + LOG(FATAL) << " input " << input.name + << "'s shape size should be equal to that of net"; + } + int sum = 1; + for_each(input.shape.begin(), input.shape.end(), [&](int n) { sum *= n; }); + if (sum > net_shape.count()) { + if (this->config_.re_allocable) { + this->graph_p_->Reshape(input.name, input.shape); + delete this->executor_p_; + this->InitNet(); + d_tensor_p = this->executor_p_->get_in(input.name); + } else { + LOG(FATAL) + << "Run failed because Anakin was expected not to reallocate " + "memory."; + } + } + std::vector tmp_shape; + for (auto s : input.shape) { + tmp_shape.push_back(s); + } + auto *data = static_cast(input.data.data()); + anakin::saber::Tensor::Host_type> + h_tensor(data, typename anakin::DefaultHostType::Host_type(), 0, + tmp_shape); + d_tensor_p->reshape(tmp_shape); + + if (input.lod.size() > 0) { + if (input.lod.size() > 1) { + LOG(FATAL) << " input lod first dim should <=1, but you set " + << input.lod.size(); + } + std::vector lod(input.lod[0].begin(), input.lod[0].end()); + std::vector> offset({lod}); + d_tensor_p->set_seq_offset(offset); + VLOG(3) << "offset.size(): " << offset[0].size(); + for (int i = 0; i < offset[0].size(); i++) { + VLOG(3) << offset[0][i]; + } + } + d_tensor_p->copy_from(h_tensor); + } + this->Predict(); + if (output_data->empty()) { + LOG(FATAL) << "At least one output should be set with tensors' names."; + } + for (auto &output : *output_data) { + if (std::find(this->output_names_.begin(), this->output_names_.end(), + output.name) == this->output_names_.end()) { + LOG(FATAL) << output.name << " is not in the outputs of the graph."; + } + auto *d_tensor_p = this->executor_p_->get_out(output.name); + output.shape = d_tensor_p->valid_shape(); + if (output.data.length() < d_tensor_p->valid_size() * sizeof(float)) { + output.data.Resize(d_tensor_p->valid_size() * sizeof(float)); + } + auto *data = static_cast(output.data.data()); + anakin::saber::Tensor::Host_type> + h_tensor(data, typename anakin::DefaultHostType::Host_type(), 0, + d_tensor_p->valid_shape()); + h_tensor.copy_from(*d_tensor_p); + } + return true; +} +template +bool PaddleInferenceAnakinPredictor::Reset( + PaddleInferenceAnakinPredictor *predictor) { + this->config_ = predictor->GetConfig(); + this->graph_p_ = predictor->GetGraph(); + this->input_names_ = predictor->GetInputNames(); + this->output_names_ = predictor->GetOutputNames(); + this->ctx_p_ = std::make_shared>( + this->config_.device_id, this->config_.data_stream_id, + this->config_.compute_stream_id); + this->InitNet(); + return true; +} +template +std::unique_ptr +PaddleInferenceAnakinPredictor::New() { + return std::unique_ptr( + new PaddleInferenceAnakinPredictor()); +} +// the cloned new Predictor of anakin share the same net weights from original +// Predictor +template +std::unique_ptr +PaddleInferenceAnakinPredictor::Clone() { + VLOG(3) << "Anakin Predictor::clone"; + std::unique_ptr cls = std::move(this->New()); + auto anakin_predictor_p = + dynamic_cast *>(cls.get()); + if (!anakin_predictor_p) { + LOG(FATAL) << "fail to call Init"; + } + anakin_predictor_p->Reset(this); + return cls; +} + +#ifdef ANAKIN_MLU_PLACE +template +std::unique_ptr +PaddleInferenceAnakinMLUPredictor::New() { + return std::unique_ptr( + new PaddleInferenceAnakinMLUPredictor()); +} +template +void PaddleInferenceAnakinMLUPredictor::SetContext() { + this->ctx_p_ = std::make_shared>( + this->config_.device_id, this->config_.data_stream_id, + this->config_.compute_stream_id); + this->ctx_p_->set_model_parallel(this->config_.model_parallel); + this->ctx_p_->set_fusion(this->config_.op_fuse); +} +template +void PaddleInferenceAnakinMLUPredictor::OptimizeGraph() { + if (!this->graph_p_->fusion_optimize(this->config_.op_fuse)) { + LOG(FATAL) << "Graph optimization error."; + } +} +template +void PaddleInferenceAnakinMLUPredictor::InitNet() { + std::unique_lock lock(this->mutex_); + this->executor_p_ = new anakin::Net(); + this->executor_p_->fusion_init(*this->graph_p_, this->ctx_p_, true); +} +template +void PaddleInferenceAnakinMLUPredictor::Predict() { + anakin::TargetWrapper::device_sync(); + this->executor_p_->fusion_prediction(); + anakin::TargetWrapper::device_sync(); +} +#endif + +#ifdef ANAKIN_BM_PLACE +template +std::unique_ptr PaddleInferenceAnakinBMPredictor::New() { + return std::unique_ptr( + new PaddleInferenceAnakinBMPredictor()); +} +template +void PaddleInferenceAnakinBMPredictor::OptimizeGraph() { + if (!this->graph_p_->fusion_optimize()) { + LOG(FATAL) << "Graph optimization error."; + } +} +template +void PaddleInferenceAnakinBMPredictor::InitNet() { + std::unique_lock lock(this->mutex_); + this->executor_p_ = new anakin::Net(); + this->executor_p_->fusion_init(*this->graph_p_, this->ctx_p_, true); +} +template +void PaddleInferenceAnakinBMPredictor::Predict() { + anakin::TargetWrapper::device_sync(); + this->executor_p_->fusion_prediction(); + anakin::TargetWrapper::device_sync(); +} +#endif + +#ifdef PADDLE_WITH_CUDA +template class PaddleInferenceAnakinPredictor< + anakin::NV, anakin::Precision::FP32, ::anakin::OpRunType::ASYNC>; +#endif +#ifdef ANAKIN_X86_PLACE +template class PaddleInferenceAnakinPredictor< + anakin::X86, anakin::Precision::FP32, ::anakin::OpRunType::ASYNC>; +#endif +#ifdef ANAKIN_MLU_PLACE +template class PaddleInferenceAnakinMLUPredictor; +#endif +#ifdef ANAKIN_BM_PLACE +template class PaddleInferenceAnakinBMPredictor; +#endif + +// A factory to help create difference predictor. +template <> +std::unique_ptr +CreatePaddlePredictor( + const contrib::AnakinConfig &config) { +#ifdef PADDLE_WITH_CUDA + if (config.target_type == contrib::AnakinConfig::NVGPU) { + return std::unique_ptr( + new PaddleInferenceAnakinPredictor(config)); + } +#endif +#ifdef ANAKIN_X86_PLACE + if (config.target_type == contrib::AnakinConfig::X86) { + return std::unique_ptr( + new PaddleInferenceAnakinPredictor(config)); + } +#endif +#ifdef ANAKIN_MLU_PLACE + if (config.target_type == contrib::AnakinConfig::MLU) { + return std::unique_ptr( + new PaddleInferenceAnakinMLUPredictor( + config)); + } +#endif +#ifdef ANAKIN_BM_PLACE + if (config.target_type == contrib::AnakinConfig::BM) { + return std::unique_ptr( + new PaddleInferenceAnakinBMPredictor( + config)); + } +#endif + LOG(FATAL) << "Anakin Predictor create on unknown platform: " + << config.target_type; + return nullptr; +} +template +void DisplayOpTimer(anakin::Net *net_executor, int epoch) { +#ifdef PADDLE_ANAKIN_ENABLE_OP_TIMER + std::vector op_time = net_executor->get_op_time(); + auto exec_funcs = net_executor->get_exec_funcs(); + auto op_param = net_executor->get_op_param(); + for (int i = 0; i < op_time.size(); i++) { + LOG(INFO) << "name: " << exec_funcs[i].name + << " op_type: " << exec_funcs[i].op_name + << " op_param: " << op_param[i] << " time " << op_time[i] / epoch; + } + std::map op_map; + for (int i = 0; i < op_time.size(); i++) { + auto it = op_map.find(op_param[i]); + if (it != op_map.end()) + op_map[op_param[i]] += op_time[i]; + else + op_map.insert(std::pair(op_param[i], op_time[i])); + } + for (auto it = op_map.begin(); it != op_map.end(); ++it) { + LOG(INFO) << it->first << " " << (it->second) / epoch << " ms"; + } +#endif +} +template +PaddleInferenceAnakinPredictor::~PaddleInferenceAnakinPredictor() { + DisplayOpTimer(this->executor_p_, this->config_.init_batch_size); + delete this->executor_p_; + this->executor_p_ = nullptr; +} + +} // namespace paddle diff --git a/paddle/fluid/inference/api/api_anakin_engine.h b/paddle/fluid/inference/api/api_anakin_engine.h new file mode 100644 index 00000000..88d3325b --- /dev/null +++ b/paddle/fluid/inference/api/api_anakin_engine.h @@ -0,0 +1,126 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file contains the implementation of inference API with Anakin engine + * embeded, this API can only support Anakin models. + */ + +#pragma once + +#include +#include +#include + +#include "framework/core/net/net.h" +#include "framework/graph/graph.h" +#include "paddle/fluid/inference/api/paddle_anakin_config.h" +#include "saber/core/shape.h" +#include "saber/saber_types.h" + +namespace paddle { + +using contrib::AnakinConfig; +using anakin::Precision; +using anakin::OpRunType; + +template +class PaddleInferenceAnakinPredictor : public PaddlePredictor { + public: + PaddleInferenceAnakinPredictor() = default; + + explicit PaddleInferenceAnakinPredictor(const AnakinConfig& config) + : config_(config) { + this->InitPredictor(); + } + + // NOTE Unlike the native engine, the buffers of anakin engine's output_data + // should be allocated first. + bool Run(const std::vector& inputs, + std::vector* output_data, + int batch_size = -1) override; + + std::unique_ptr Clone() override; + bool Reset(PaddleInferenceAnakinPredictor* predictor); + void InitPredictor(); + std::shared_ptr> GetGraph() { + return this->graph_p_; + } + std::vector GetInputNames() override { + return this->input_names_; + } + std::vector GetOutputNames() override { + return this->output_names_; + } + const AnakinConfig& GetConfig() const { return this->config_; } + + ~PaddleInferenceAnakinPredictor() override; + + protected: + void InitEnv(); + void InitGraph(); + virtual void OptimizeGraph(); + virtual void InitNet(); + virtual void SetContext(); + virtual void Predict(); + virtual std::unique_ptr New(); + static std::mutex mutex_; + AnakinConfig config_; + std::shared_ptr> ctx_p_; + std::shared_ptr> graph_p_; + anakin::Net* executor_p_{nullptr}; + std::vector input_names_; + std::vector output_names_; + + private: + bool RunImpl(const std::vector& inputs, + std::vector* output_data); + static std::once_flag init_anakin_; +}; + +#ifdef ANAKIN_MLU_PLACE +template +class PaddleInferenceAnakinMLUPredictor final + : public PaddleInferenceAnakinPredictor { + public: + PaddleInferenceAnakinMLUPredictor() = default; + explicit PaddleInferenceAnakinMLUPredictor(const AnakinConfig& config) { + this->config_ = config; + this->InitPredictor(); + } + std::unique_ptr New() override; + void SetContext() override; + void OptimizeGraph() override; + void InitNet() override; + void Predict() override; +}; +#endif + +#ifdef ANAKIN_BM_PLACE +template +class PaddleInferenceAnakinBMPredictor final + : public PaddleInferenceAnakinPredictor { + public: + PaddleInferenceAnakinBMPredictor() = default; + explicit PaddleInferenceAnakinBMPredictor(const AnakinConfig& config) { + this->config_ = config; + this->InitPredictor(); + } + std::unique_ptr New() override; + void OptimizeGraph() override; + void InitNet() override; + void Predict() override; +}; +#endif +} // namespace paddle diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc new file mode 100644 index 00000000..628817c6 --- /dev/null +++ b/paddle/fluid/inference/api/api_impl.cc @@ -0,0 +1,341 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/feed_fetch_method.h" +#include "paddle/fluid/inference/api/api_impl.h" +#include "paddle/fluid/inference/api/details/reset_tensor_array.h" +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/platform/profiler.h" + +DEFINE_bool(profile, false, "Turn on profiler for fluid"); + +namespace paddle { +namespace { +using paddle::inference::Timer; + +template +std::string num2str(T a) { + std::stringstream istr; + istr << a; + return istr.str(); +} +} // namespace + +void NativePaddlePredictor::PrepareFeedFetch() { + for (auto *op : inference_program_->Block(0).AllOps()) { + if (op->Type() == "feed") { + int idx = boost::get(op->GetAttr("col")); + if (feeds_.size() <= static_cast(idx)) { + feeds_.resize(idx + 1); + } + feeds_[idx] = op; + feed_names_[op->Output("Out")[0]] = idx; + } else if (op->Type() == "fetch") { + int idx = boost::get(op->GetAttr("col")); + if (fetchs_.size() <= static_cast(idx)) { + fetchs_.resize(idx + 1); + } + fetchs_[idx] = op; + } + } +} + +bool NativePaddlePredictor::Init( + std::shared_ptr parent_scope) { + VLOG(3) << "Predictor::init()"; + if (FLAGS_profile) { + LOG(WARNING) << "Profiler is actived, might affect the performance"; + LOG(INFO) << "You can turn off by set gflags '-profile false'"; + + auto tracking_device = config_.use_gpu ? platform::ProfilerState::kAll + : platform::ProfilerState::kCPU; + platform::EnableProfiler(tracking_device); + } + + // no matter with or without MKLDNN + paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); + + if (config_.use_gpu) { + place_ = paddle::platform::CUDAPlace(config_.device); + } else { + place_ = paddle::platform::CPUPlace(); + } + if (parent_scope) { + scope_ = parent_scope; + sub_scope_ = &(parent_scope->NewScope()); + PADDLE_ENFORCE_NOT_NULL(sub_scope_, "create sub scope fail"); + } else { + paddle::framework::InitDevices(false); + scope_.reset(new paddle::framework::Scope()); + } + + executor_.reset(new paddle::framework::Executor(place_)); + + // Initialize the inference program + if (!config_.model_dir.empty()) { + // Parameters are saved in separate files sited in + // the specified `dirname`. + inference_program_ = paddle::inference::Load(executor_.get(), scope_.get(), + config_.model_dir); + } else if (!config_.prog_file.empty() && !config_.param_file.empty()) { + // All parameters are saved in a single file. + // The file names should be consistent with that used + // in Python API `fluid.io.save_inference_model`. + inference_program_ = paddle::inference::Load( + executor_.get(), scope_.get(), config_.prog_file, config_.param_file); + } else { + LOG(ERROR) << "fail to load inference model from " << config_.model_dir; + return false; + } + + ctx_ = executor_->Prepare(*inference_program_, 0); + executor_->CreateVariables(*inference_program_, + sub_scope_ ? sub_scope_ : scope_.get(), 0); + + // Get the feed_target_names and fetch_target_names + PrepareFeedFetch(); + return true; +} + +NativePaddlePredictor::~NativePaddlePredictor() { + if (FLAGS_profile) { + platform::DisableProfiler(platform::EventSortingKey::kTotal, + "./profile.log"); + } + if (sub_scope_) { + scope_->DeleteScope(sub_scope_); + } +} + +bool NativePaddlePredictor::Run(const std::vector &inputs, + std::vector *output_data, + int batch_size) { + if (UNLIKELY(config_.cpu_math_library_num_threads() > 1)) { + paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); + } + VLOG(3) << "Predictor::predict"; + Timer timer; + timer.tic(); + // set feed variable + framework::Scope *scope = sub_scope_ != nullptr ? sub_scope_ : scope_.get(); + if (!SetFeed(inputs, scope)) { + LOG(ERROR) << "fail to set feed"; + return false; + } + // Run the inference program + // if share variables, we need not create variables + VLOG(4) << "Run prepared context"; + executor_->RunPreparedContext(ctx_.get(), scope, + false, /* don't create local scope each time*/ + false /* don't create variable each time */); + VLOG(4) << "Finish prepared context"; + // get fetch variable + if (!GetFetch(output_data, scope)) { + LOG(ERROR) << "fail to get fetches"; + return false; + } + VLOG(3) << "predict cost: " << timer.toc() << "ms"; + + // For some other vector like containers not cleaned after each batch. + tensor_array_batch_cleaner_.CollectNoTensorVars(scope_.get()); + tensor_array_batch_cleaner_.ResetNoTensorVars(); + return true; +} + +std::unique_ptr NativePaddlePredictor::Clone() { + std::lock_guard lk(clone_mutex_); + VLOG(3) << "Predictor::clone"; + std::unique_ptr cls(new NativePaddlePredictor(config_)); + // Hot fix the bug that result diff in multi-thread. + // TODO(Superjomn) re-implement a real clone here. + PADDLE_ENFORCE_NOT_NULL(dynamic_cast(cls.get())); + if (!dynamic_cast(cls.get())->Init(nullptr)) { + LOG(ERROR) << "fail to call Init"; + return nullptr; + } + +#ifdef __clang__ + // fix clang compile error + return cls; +#else + // fix manylinux compile error. + return std::move(cls); +#endif +} + +bool NativePaddlePredictor::SetFeed(const std::vector &inputs, + framework::Scope *scope) { + VLOG(3) << "Predictor::set_feed"; + if (inputs.size() != feeds_.size()) { + LOG(ERROR) << "wrong feed input size, need " << feeds_.size() << " but get " + << inputs.size(); + return false; + } + + // Cache the inputs memory for better concurrency performance. + feed_tensors_.resize(inputs.size()); + + for (size_t i = 0; i < inputs.size(); ++i) { + auto &input = feed_tensors_[i]; + framework::DDim ddim = framework::make_ddim(inputs[i].shape); + void *input_ptr; + if (inputs[i].dtype == PaddleDType::INT64) { + input_ptr = input.mutable_data(ddim, place_); + } else if (inputs[i].dtype == PaddleDType::FLOAT32) { + input_ptr = input.mutable_data(ddim, place_); + } else if (inputs[i].dtype == PaddleDType::INT32) { + input_ptr = input.mutable_data(ddim, place_); + } else { + LOG(ERROR) << "unsupported feed type " << inputs[i].dtype; + return false; + } + + PADDLE_ENFORCE_NOT_NULL(input_ptr); + PADDLE_ENFORCE_NOT_NULL(inputs[i].data.data()); + if (platform::is_cpu_place(place_)) { + // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. + std::memcpy(static_cast(input_ptr), inputs[i].data.data(), + inputs[i].data.length()); + } else { +#ifdef PADDLE_WITH_CUDA + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto *dev_ctx = + static_cast(pool.Get(place_)); + auto dst_gpu_place = boost::get(place_); + memory::Copy(dst_gpu_place, static_cast(input_ptr), + platform::CPUPlace(), inputs[i].data.data(), + inputs[i].data.length(), dev_ctx->stream()); +#else + PADDLE_THROW("Not compile with CUDA, should not reach here."); +#endif + } + + // TODO(Superjomn) Low performance, need optimization for heavy LoD copy. + framework::LoD lod; + for (auto &level : inputs[i].lod) { + lod.emplace_back(level); + } + input.set_lod(lod); + int idx = -1; + if (config_.specify_input_name) { + idx = feed_names_[inputs[i].name]; + } else { + idx = boost::get(feeds_[i]->GetAttr("col")); + } + framework::SetFeedVariable(scope, input, "feed", idx); + } + return true; +} +template +void NativePaddlePredictor::GetFetchOne(const framework::LoDTensor &fetch, + PaddleTensor *output) { + // set shape. + auto shape = framework::vectorize(fetch.dims()); + output->shape.assign(shape.begin(), shape.end()); + // set data. + const T *data = fetch.data(); + int num_elems = inference::VecReduceToInt(shape); + output->data.Resize(num_elems * sizeof(T)); + // The fetched tensor output by fetch op, should always in CPU memory, so just + // copy. + memcpy(output->data.data(), data, num_elems * sizeof(T)); + // set lod + output->lod.clear(); + for (auto &level : fetch.lod()) { + output->lod.emplace_back(level.begin(), level.end()); + } +} + +bool NativePaddlePredictor::GetFetch(std::vector *outputs, + framework::Scope *scope) { + VLOG(3) << "Predictor::get_fetch"; + outputs->resize(fetchs_.size()); + for (size_t i = 0; i < fetchs_.size(); ++i) { + int idx = boost::get(fetchs_[i]->GetAttr("col")); + PADDLE_ENFORCE((size_t)idx == i); + framework::LoDTensor &fetch = + framework::GetFetchVariable(*scope, "fetch", idx); + auto type = fetch.type(); + auto output = &(outputs->at(i)); + output->name = fetchs_[idx]->Input("X")[0]; + if (type == framework::DataTypeTrait::DataType()) { + GetFetchOne(fetch, output); + output->dtype = PaddleDType::FLOAT32; + } else if (type == framework::DataTypeTrait::DataType()) { + GetFetchOne(fetch, output); + output->dtype = PaddleDType::INT64; + } else if (type == framework::DataTypeTrait::DataType()) { + GetFetchOne(fetch, output); + output->dtype = PaddleDType::INT32; + } else { + LOG(ERROR) << "unknown type, only support float32, int64 and int32 now."; + } + } + return true; +} + +template <> +std::unique_ptr CreatePaddlePredictor< + NativeConfig, PaddleEngineKind::kNative>(const NativeConfig &config) { + VLOG(3) << "create NativePaddlePredictor"; + if (config.use_gpu) { + // 1. GPU memory + PADDLE_ENFORCE_GE( + config.fraction_of_gpu_memory, 0.f, + "fraction_of_gpu_memory in the config should be set to range (0., 1.]"); + PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device); + std::vector flags; + if (config.fraction_of_gpu_memory >= 0.0f || + config.fraction_of_gpu_memory <= 0.95f) { + flags.push_back("dummpy"); + std::string flag = "--fraction_of_gpu_memory_to_use=" + + num2str(config.fraction_of_gpu_memory); + flags.push_back(flag); + VLOG(3) << "set flag: " << flag; + framework::InitGflags(flags); + } + } + + std::unique_ptr predictor(new NativePaddlePredictor(config)); + PADDLE_ENFORCE_NOT_NULL( + dynamic_cast(predictor.get())); + if (!dynamic_cast(predictor.get())->Init(nullptr)) { + return nullptr; + } +#ifdef __clang__ + // fix clang compile error + return predictor; +#else + return std::move(predictor); +#endif +} + +template <> +std::unique_ptr CreatePaddlePredictor( + const NativeConfig &config) { + return CreatePaddlePredictor(config); +} + +} // namespace paddle diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h new file mode 100644 index 00000000..96b94777 --- /dev/null +++ b/paddle/fluid/inference/api/api_impl.h @@ -0,0 +1,81 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/inference/api/details/reset_tensor_array.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/init.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { + +class NativePaddlePredictor : public PaddlePredictor { + public: + explicit NativePaddlePredictor(const NativeConfig &config) + : config_(config) {} + + // will only create sub scope if have global scope + bool Init(std::shared_ptr parent_scope); + + bool Run(const std::vector &inputs, + std::vector *output_data, + int batch_size = -1) override; + + std::unique_ptr Clone() override; + + ~NativePaddlePredictor() override; + + framework::Scope *scope() { return sub_scope_ ? sub_scope_ : scope_.get(); } + + protected: + bool SetFeed(const std::vector &input_datas, + framework::Scope *scope); + bool GetFetch(std::vector *output_data, + framework::Scope *scope); + template + void GetFetchOne(const framework::LoDTensor &fetchs, + PaddleTensor *output_data); + void PrepareFeedFetch(); + + NativeConfig config_; + platform::Place place_; + std::unique_ptr executor_; + std::shared_ptr scope_; + std::unique_ptr ctx_; + std::unique_ptr inference_program_; + std::vector feeds_; + std::map feed_names_; + std::vector fetchs_; + // Memory buffer for feed inputs. The temporary LoDTensor will cause serious + // concurrency problems, wrong results and memory leak, so cache them. + std::vector feed_tensors_; + // Do not use unique_ptr, use parent scope to delete + framework::Scope *sub_scope_{nullptr}; + details::TensorArrayBatchCleaner tensor_array_batch_cleaner_; + // A mutex to make Clone thread safe. + std::mutex clone_mutex_; +}; + +} // namespace paddle diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc new file mode 100644 index 00000000..2dc5dda3 --- /dev/null +++ b/paddle/fluid/inference/api/api_impl_tester.cc @@ -0,0 +1,309 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include // NOLINT + +#include "gflags/gflags.h" +#include "paddle/fluid/inference/api/api_impl.h" +#include "paddle/fluid/inference/tests/test_helper.h" + +#ifdef __clang__ +#define ACC_DIFF 4e-3 +#else +#define ACC_DIFF 1e-3 +#endif + +DEFINE_string(word2vec_dirname, "", + "Directory of the word2vec inference model."); +DEFINE_string(book_dirname, "", "Directory of the book inference model."); + +namespace paddle { + +PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) { + PaddleTensor pt; + + if (t->type() == framework::proto::VarType::INT64) { + pt.data.Reset(t->data(), t->numel() * sizeof(int64_t)); + pt.dtype = PaddleDType::INT64; + } else if (t->type() == framework::proto::VarType::FP32) { + pt.data.Reset(t->data(), t->numel() * sizeof(float)); + pt.dtype = PaddleDType::FLOAT32; + } else if (t->type() == framework::proto::VarType::INT32) { + pt.data.Reset(t->data(), t->numel() * sizeof(int32_t)); + pt.dtype = PaddleDType::INT32; + } else { + LOG(FATAL) << "unsupported type."; + } + pt.shape = framework::vectorize2int(t->dims()); + return pt; +} + +NativeConfig GetConfig() { + NativeConfig config; + config.model_dir = FLAGS_word2vec_dirname; + LOG(INFO) << "dirname " << config.model_dir; + config.fraction_of_gpu_memory = 0.15; +#ifdef PADDLE_WITH_CUDA + config.use_gpu = true; +#else + config.use_gpu = false; +#endif + config.device = 0; + return config; +} + +void MainWord2Vec(bool use_gpu) { + NativeConfig config = GetConfig(); + auto predictor = CreatePaddlePredictor(config); + config.use_gpu = use_gpu; + + framework::LoDTensor first_word, second_word, third_word, fourth_word; + framework::LoD lod{{0, 1}}; + int64_t dict_size = 2073; // The size of dictionary + + SetupLoDTensor(&first_word, lod, static_cast(0), dict_size - 1); + SetupLoDTensor(&second_word, lod, static_cast(0), dict_size - 1); + SetupLoDTensor(&third_word, lod, static_cast(0), dict_size - 1); + SetupLoDTensor(&fourth_word, lod, static_cast(0), dict_size - 1); + + std::vector paddle_tensor_feeds; + paddle_tensor_feeds.push_back(LodTensorToPaddleTensor(&first_word)); + paddle_tensor_feeds.push_back(LodTensorToPaddleTensor(&second_word)); + paddle_tensor_feeds.push_back(LodTensorToPaddleTensor(&third_word)); + paddle_tensor_feeds.push_back(LodTensorToPaddleTensor(&fourth_word)); + + std::vector outputs; + ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs)); + ASSERT_EQ(outputs.size(), 1UL); + size_t len = outputs[0].data.length(); + float* data = static_cast(outputs[0].data.data()); + for (size_t j = 0; j < len / sizeof(float); ++j) { + ASSERT_LT(data[j], 1.0); + ASSERT_GT(data[j], -1.0); + } + + std::vector cpu_feeds; + cpu_feeds.push_back(&first_word); + cpu_feeds.push_back(&second_word); + cpu_feeds.push_back(&third_word); + cpu_feeds.push_back(&fourth_word); + + framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + TestInference(config.model_dir, cpu_feeds, cpu_fetchs1); + + float* lod_data = output1.data(); + for (int i = 0; i < output1.numel(); ++i) { + EXPECT_LT(lod_data[i] - data[i], ACC_DIFF); + EXPECT_GT(lod_data[i] - data[i], -ACC_DIFF); + } +} + +void MainImageClassification(bool use_gpu) { + int batch_size = 2; + bool repeat = false; + NativeConfig config = GetConfig(); + config.use_gpu = use_gpu; + config.model_dir = + FLAGS_book_dirname + "/image_classification_resnet.inference.model"; + + const bool is_combined = false; + std::vector> feed_target_shapes = + GetFeedTargetShapes(config.model_dir, is_combined); + + framework::LoDTensor input; + // Use normilized image pixels as input data, + // which should be in the range [0.0, 1.0]. + feed_target_shapes[0][0] = batch_size; + framework::DDim input_dims = framework::make_ddim(feed_target_shapes[0]); + SetupTensor(&input, input_dims, static_cast(0), + static_cast(1)); + std::vector cpu_feeds; + cpu_feeds.push_back(&input); + + framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + TestInference( + config.model_dir, cpu_feeds, cpu_fetchs1, repeat, is_combined); + + auto predictor = CreatePaddlePredictor(config); + std::vector paddle_tensor_feeds; + paddle_tensor_feeds.push_back(LodTensorToPaddleTensor(&input)); + + std::vector outputs; + ASSERT_TRUE(predictor->Run(paddle_tensor_feeds, &outputs)); + ASSERT_EQ(outputs.size(), 1UL); + size_t len = outputs[0].data.length(); + float* data = static_cast(outputs[0].data.data()); + float* lod_data = output1.data(); + for (size_t j = 0; j < len / sizeof(float); ++j) { + EXPECT_NEAR(lod_data[j], data[j], ACC_DIFF); + } +} + +void MainThreadsWord2Vec(bool use_gpu) { + NativeConfig config = GetConfig(); + config.use_gpu = use_gpu; + auto main_predictor = CreatePaddlePredictor(config); + + // prepare inputs data and reference results + constexpr int num_jobs = 3; + std::vector> jobs(num_jobs); + std::vector> paddle_tensor_feeds(num_jobs); + std::vector refs(num_jobs); + for (size_t i = 0; i < jobs.size(); ++i) { + // each job has 4 words + jobs[i].resize(4); + for (size_t j = 0; j < 4; ++j) { + framework::LoD lod{{0, 1}}; + int64_t dict_size = 2073; // The size of dictionary + SetupLoDTensor(&jobs[i][j], lod, static_cast(0), dict_size - 1); + paddle_tensor_feeds[i].push_back(LodTensorToPaddleTensor(&jobs[i][j])); + } + + // get reference result of each job + std::vector ref_feeds; + std::vector ref_fetches(1, &refs[i]); + for (auto& word : jobs[i]) { + ref_feeds.push_back(&word); + } + TestInference(config.model_dir, ref_feeds, ref_fetches); + } + + // create threads and each thread run 1 job + std::vector threads; + for (int tid = 0; tid < num_jobs; ++tid) { + threads.emplace_back([&, tid]() { + auto predictor = CreatePaddlePredictor(config); + auto& local_inputs = paddle_tensor_feeds[tid]; + std::vector local_outputs; + ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs)); + + // check outputs range + ASSERT_EQ(local_outputs.size(), 1UL); + const size_t len = local_outputs[0].data.length(); + float* data = static_cast(local_outputs[0].data.data()); + for (size_t j = 0; j < len / sizeof(float); ++j) { + ASSERT_LT(data[j], 1.0); + ASSERT_GT(data[j], -1.0); + } + + // check outputs correctness + float* ref_data = refs[tid].data(); + EXPECT_EQ(refs[tid].numel(), static_cast(len / sizeof(float))); + for (int i = 0; i < refs[tid].numel(); ++i) { + EXPECT_NEAR(ref_data[i], data[i], 2e-3); + } + }); + } + for (int i = 0; i < num_jobs; ++i) { + threads[i].join(); + } +} + +void MainThreadsImageClassification(bool use_gpu) { + constexpr int num_jobs = 4; // each job run 1 batch + constexpr int batch_size = 1; + NativeConfig config = GetConfig(); + config.use_gpu = use_gpu; + config.model_dir = + FLAGS_book_dirname + "/image_classification_resnet.inference.model"; + + auto main_predictor = CreatePaddlePredictor(config); + std::vector jobs(num_jobs); + std::vector> paddle_tensor_feeds(num_jobs); + std::vector refs(num_jobs); + for (size_t i = 0; i < jobs.size(); ++i) { + // prepare inputs + std::vector> feed_target_shapes = + GetFeedTargetShapes(config.model_dir, /*is_combined*/ false); + feed_target_shapes[0][0] = batch_size; + framework::DDim input_dims = framework::make_ddim(feed_target_shapes[0]); + SetupTensor(&jobs[i], input_dims, 0.f, 1.f); + paddle_tensor_feeds[i].push_back(LodTensorToPaddleTensor(&jobs[i])); + + // get reference result of each job + std::vector ref_feeds(1, &jobs[i]); + std::vector ref_fetches(1, &refs[i]); + TestInference(config.model_dir, ref_feeds, ref_fetches); + } + + // create threads and each thread run 1 job + std::vector threads; + for (int tid = 0; tid < num_jobs; ++tid) { + threads.emplace_back([&, tid]() { + auto predictor = CreatePaddlePredictor(config); + auto& local_inputs = paddle_tensor_feeds[tid]; + std::vector local_outputs; + ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs)); + + // check outputs correctness + ASSERT_EQ(local_outputs.size(), 1UL); + const size_t len = local_outputs[0].data.length(); + float* data = static_cast(local_outputs[0].data.data()); + float* ref_data = refs[tid].data(); + EXPECT_EQ((size_t)refs[tid].numel(), len / sizeof(float)); + for (int i = 0; i < refs[tid].numel(); ++i) { + EXPECT_NEAR(ref_data[i], data[i], ACC_DIFF); + } + }); + } + for (int i = 0; i < num_jobs; ++i) { + threads[i].join(); + } +} + +TEST(inference_api_native, word2vec_cpu) { MainWord2Vec(false /*use_gpu*/); } +TEST(inference_api_native, word2vec_cpu_threads) { + MainThreadsWord2Vec(false /*use_gpu*/); +} +TEST(inference_api_native, image_classification_cpu) { + MainImageClassification(false /*use_gpu*/); +} +TEST(inference_api_native, image_classification_cpu_threads) { + MainThreadsImageClassification(false /*use_gpu*/); +} + +#ifdef PADDLE_WITH_CUDA +TEST(inference_api_native, word2vec_gpu) { MainWord2Vec(true /*use_gpu*/); } +// Turn off temporarily for the unstable result. +// TEST(inference_api_native, word2vec_gpu_threads) { +// MainThreadsWord2Vec(true /*use_gpu*/); +// } +TEST(inference_api_native, image_classification_gpu) { + MainImageClassification(true /*use_gpu*/); +} +// Turn off temporarily for the unstable result. +// TEST(inference_api_native, image_classification_gpu_threads) { +// MainThreadsImageClassification(true /*use_gpu*/); +// } +#endif + +TEST(PassBuilder, Delete) { + AnalysisConfig config; + config.DisableGpu(); + config.pass_builder()->DeletePass("attention_lstm_fuse_pass"); + const auto& passes = config.pass_builder()->AllPasses(); + auto it = std::find(passes.begin(), passes.end(), "attention_lstm_fuse_pass"); + ASSERT_EQ(it, passes.end()); +} + +} // namespace paddle diff --git a/paddle/fluid/inference/api/api_tester.cc b/paddle/fluid/inference/api/api_tester.cc new file mode 100644 index 00000000..2c450ef7 --- /dev/null +++ b/paddle/fluid/inference/api/api_tester.cc @@ -0,0 +1,70 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/inference/api/paddle_inference_api.h" + +namespace paddle { + +/* + * Do not use this, just a demo indicating how to customize a config for a + * specific predictor. + */ +struct DemoConfig : public PaddlePredictor::Config { + float other_config; +}; + +/* + * Do not use this, just a demo indicating how to customize a Predictor. + */ +class DemoPredictor : public PaddlePredictor { + public: + explicit DemoPredictor(const DemoConfig &config) { + LOG(INFO) << "I get other_config " << config.other_config; + } + bool Run(const std::vector &inputs, + std::vector *output_data, + int batch_size = 0) override { + LOG(INFO) << "Run"; + return false; + } + + std::unique_ptr Clone() override { return nullptr; } + + ~DemoPredictor() override {} +}; + +template <> +std::unique_ptr CreatePaddlePredictor( + const DemoConfig &config) { + std::unique_ptr x(new DemoPredictor(config)); + return x; +} + +TEST(paddle_inference_api, demo) { + DemoConfig config; + config.other_config = 1.7; + auto predictor = CreatePaddlePredictor(config); + std::vector outputs; + predictor->Run({}, &outputs); +} + +TEST(paddle_inference_api, get_version) { + LOG(INFO) << "paddle version:\n" << get_version(); + auto version = get_version(); + ASSERT_FALSE(version.empty()); +} + +} // namespace paddle diff --git a/paddle/fluid/inference/api/demo_ci/.gitignore b/paddle/fluid/inference/api/demo_ci/.gitignore new file mode 100644 index 00000000..1269488f --- /dev/null +++ b/paddle/fluid/inference/api/demo_ci/.gitignore @@ -0,0 +1 @@ +data diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt new file mode 100644 index 00000000..318658e0 --- /dev/null +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -0,0 +1,163 @@ +cmake_minimum_required(VERSION 3.0) +project(cpp_inference_demo CXX C) +option(WITH_MKL "Compile demo with MKL/OpenBlas support, default use MKL." ON) +option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." OFF) +option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." ON) +option(USE_TENSORRT "Compile demo with TensorRT." OFF) + +macro(safe_set_static_flag) + foreach(flag_var + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) + if(${flag_var} MATCHES "/MD") + string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") + endif(${flag_var} MATCHES "/MD") + endforeach(flag_var) +endmacro() + +if(NOT DEFINED PADDLE_LIB) + message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib") +endif() +if(NOT DEFINED DEMO_NAME) + message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name") +endif() + +include_directories("${PADDLE_LIB}/") +set(PADDLE_LIB_THIRD_PARTY_PATH "${PADDLE_LIB}/third_party/install/") +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/include") +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/include") +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/include") +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/include") +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}snappy/include") +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}snappystream/include") +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}zlib/include") +include_directories("${PADDLE_LIB}/third_party/boost") +include_directories("${PADDLE_LIB}/third_party/eigen3") + +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}snappy/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}snappystream/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}zlib/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") +link_directories("${PADDLE_LIB}/paddle/lib") + +if (WIN32) + add_definitions("/DGOOGLE_GLOG_DLL_DECL=") + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") + if (WITH_STATIC_LIB) + safe_set_static_flag() + add_definitions(-DSTATIC_LIB) + endif() +else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") +endif() +message("flags" ${CMAKE_CXX_FLAGS}) + +if(WITH_GPU) + if(NOT WIN32) + set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library") + else() + if(CUDA_LIB STREQUAL "") + set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\lib\\x64") + endif() + endif(NOT WIN32) +endif() + +if (NOT WIN32) + if (USE_TENSORRT AND WITH_GPU) + include_directories("${TENSORRT_INCLUDE_DIR}") + link_directories("${TENSORRT_LIB_DIR}") + endif() +endif(NOT WIN32) + +if (NOT WIN32) + set(NGRAPH_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}ngraph") + if(EXISTS ${NGRAPH_PATH}) + include(GNUInstallDirs) + include_directories("${NGRAPH_PATH}/include") + link_directories("${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}") + set(NGRAPH_LIB ${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}/libngraph${CMAKE_SHARED_LIBRARY_SUFFIX}) + endif() +endif() + +if(WITH_MKL) + set(MATH_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mklml") + include_directories("${MATH_LIB_PATH}/include") + if(WIN32) + set(MATH_DLL ${MATH_LIB_PATH}/lib/mklml${CMAKE_SHARED_LIBRARY_SUFFIX} + ${MATH_LIB_PATH}/lib/libiomp5md${CMAKE_SHARED_LIBRARY_SUFFIX} + ${MATH_LIB_PATH}/lib/msvcr120${CMAKE_SHARED_LIBRARY_SUFFIX} + ) + else() + set(MATH_LIB ${MATH_LIB_PATH}/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} + ${MATH_LIB_PATH}/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) + endif() + set(MKLDNN_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mkldnn") + if(EXISTS ${MKLDNN_PATH}) + include_directories("${MKLDNN_PATH}/include") + if(WIN32) + set(MKLDNN_LIB ${MKLDNN_PATH}/lib/mkldnn.lib) + else(WIN32) + set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) + endif(WIN32) + endif() +else() + set(MATH_LIB ${PADDLE_LIB_THIRD_PARTY_PATH}openblas/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX}) + if(WIN32) + set(MATH_DLL ${PADDLE_LIB_THIRD_PARTY_PATH}openblas/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX}) + endif() +endif() + +# Note: libpaddle_inference_api.so/a must put before libpaddle_fluid.so/a +if(WITH_STATIC_LIB) + set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX}) +else() + set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX}) +endif() + +if (NOT WIN32) + set(EXTERNAL_LIB "-lrt -ldl -lpthread") + set(DEPS ${DEPS} + ${MATH_LIB} ${MKLDNN_LIB} ${NGRAPH_LIB} + glog gflags protobuf snappystream snappy z xxhash + ${EXTERNAL_LIB}) +else() + set(DEPS ${DEPS} + ${MATH_LIB} ${MKLDNN_LIB} + glog gflags_static libprotobuf snappy zlibstatic xxhash snappystream ${EXTERNAL_LIB}) + set(DEPS ${DEPS} libcmt shlwapi.lib) +endif(NOT WIN32) + +if(WITH_GPU) + if(NOT WIN32) + if (USE_TENSORRT) + set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX}) + endif() + set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX}) + else() + set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX} ) + endif() +endif() + +add_executable(${DEMO_NAME} ${DEMO_NAME}.cc) +target_link_libraries(${DEMO_NAME} ${DEPS}) +if(WIN32) + if(WITH_MKL) + add_custom_command(TARGET ${DEMO_NAME} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_PATH}/lib/mkldnn.dll ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} + COMMAND ${CMAKE_COMMAND} -E copy ${MATH_DLL} ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} + ) + else() + add_custom_command(TARGET ${DEMO_NAME} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${MATH_DLL}/openblas.dll ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} + ) + endif() +endif() diff --git a/paddle/fluid/inference/api/demo_ci/README.md b/paddle/fluid/inference/api/demo_ci/README.md new file mode 100644 index 00000000..7f013da7 --- /dev/null +++ b/paddle/fluid/inference/api/demo_ci/README.md @@ -0,0 +1,26 @@ +# Inference Demos + +There are several demos: + +- simple_on_word2vec: + - Follow the C++ codes is in `simple_on_word2vec.cc`. + - It is suitable for word2vec model. +- vis_demo: + - Follow the C++ codes is in `vis_demo.cc`. + - It is suitable for mobilenet, se_resnext50 and ocr three models. + - Input data format: + - Each line contains a single record + - Each record's format is + ``` + \t + ``` + +To build and execute the demos, simply run +``` +./run.sh $PADDLE_ROOT $TURN_ON_MKL $TEST_GPU_CPU +``` +- It will build and execute the demos in both static and shared library. +- `$PADDLE_ROOT`: paddle library path +- `$TURN_ON_MKL`: use MKL or Openblas +- `$TEST_GPU_CPU`: test both GPU/CPU mode or only CPU mode +- NOTE: for simple_on_word2vec, must run `ctest -R test_word2vec -R` to obtain word2vec model at first. diff --git a/paddle/fluid/inference/api/demo_ci/clean.sh b/paddle/fluid/inference/api/demo_ci/clean.sh new file mode 100755 index 00000000..0d9f3d2a --- /dev/null +++ b/paddle/fluid/inference/api/demo_ci/clean.sh @@ -0,0 +1,4 @@ +set -x +cd `dirname $0` +rm -rf build/ data/ +set +x diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh new file mode 100755 index 00000000..054f9de3 --- /dev/null +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -0,0 +1,126 @@ +#!/bin/bash +set -x +PADDLE_ROOT=$1 +TURN_ON_MKL=$2 # use MKL or Openblas +TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode +DATA_DIR=$4 # dataset +TENSORRT_INCLUDE_DIR=$5 # TensorRT header file dir, default to /usr/local/TensorRT/include +TENSORRT_LIB_DIR=$6 # TensorRT lib file dir, default to /usr/local/TensorRT/lib +inference_install_dir=${PADDLE_ROOT}/build/fluid_inference_install_dir + +cd `dirname $0` +current_dir=`pwd` +if [ $2 == ON ]; then + # You can export yourself if move the install path + MKL_LIB=${inference_install_dir}/third_party/install/mklml/lib + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${MKL_LIB} +fi +if [ $3 == ON ]; then + use_gpu_list='true false' +else + use_gpu_list='false' +fi + +USE_TENSORRT=OFF +if [ -d "$TENSORRT_INCLUDE_DIR" -a -d "$TENSORRT_LIB_DIR" ]; then + USE_TENSORRT=ON +fi + +PREFIX=inference-vis-demos%2F +URL_ROOT=http://paddlemodels.bj.bcebos.com/${PREFIX} + +# download vis_demo data +function download() { + dir_name=$1 + mkdir -p $dir_name + cd $dir_name + if [[ -e "${PREFIX}${dir_name}.tar.gz" ]]; then + echo "${PREFIX}{dir_name}.tar.gz has been downloaded." + else + wget -q ${URL_ROOT}$dir_name.tar.gz + tar xzf *.tar.gz + fi + cd .. +} +mkdir -p $DATA_DIR +cd $DATA_DIR +vis_demo_list='se_resnext50 ocr mobilenet' +for vis_demo_name in $vis_demo_list; do + download $vis_demo_name +done + +# compile and test the demo +cd $current_dir +mkdir -p build +cd build + +for WITH_STATIC_LIB in ON OFF; do +# TODO(Superjomn) reopen this +# something wrong with the TensorArray reset. +:< +#include + +#include +#include +#include //NOLINT + +#include "utils.h" // NOLINT + +DEFINE_string(dirname, "", "Directory of the inference model."); +DEFINE_bool(use_gpu, false, "Whether use gpu."); + +namespace paddle { +namespace demo { + +void Main(bool use_gpu) { + //# 1. Create PaddlePredictor with a config. + NativeConfig config; + if (FLAGS_dirname.empty()) { + LOG(INFO) << "Usage: ./simple_on_word2vec --dirname=path/to/your/model"; + exit(1); + } + config.model_dir = FLAGS_dirname; + config.use_gpu = use_gpu; + config.fraction_of_gpu_memory = 0.15; + config.device = 0; + auto predictor = CreatePaddlePredictor(config); + + for (int batch_id = 0; batch_id < 3; batch_id++) { + //# 2. Prepare input. + int64_t data[4] = {1, 2, 3, 4}; + + PaddleTensor tensor; + tensor.shape = std::vector({4, 1}); + tensor.data = PaddleBuf(data, sizeof(data)); + tensor.dtype = PaddleDType::INT64; + + // For simplicity, we set all the slots with the same data. + std::vector slots(4, tensor); + + //# 3. Run + std::vector outputs; + CHECK(predictor->Run(slots, &outputs)); + + //# 4. Get output. + CHECK_EQ(outputs.size(), 1UL); + // Check the output buffer size and result of each tid. + CHECK_EQ(outputs.front().data.length(), 33168UL); + float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815, + 0.000932706}; + const size_t num_elements = outputs.front().data.length() / sizeof(float); + // The outputs' buffers are in CPU memory. + for (size_t i = 0; i < std::min(static_cast(5), num_elements); + i++) { + CHECK_NEAR(static_cast(outputs.front().data.data())[i], result[i], + 0.001); + } + } +} + +void MainThreads(int num_threads, bool use_gpu) { + // Multi-threads only support on CPU + // 0. Create PaddlePredictor with a config. + NativeConfig config; + config.model_dir = FLAGS_dirname; + config.use_gpu = use_gpu; + config.fraction_of_gpu_memory = 0.15; + config.device = 0; + auto main_predictor = CreatePaddlePredictor(config); + + std::vector threads; + for (int tid = 0; tid < num_threads; ++tid) { + threads.emplace_back([&, tid]() { + // 1. clone a predictor which shares the same parameters + auto predictor = main_predictor->Clone(); + constexpr int num_batches = 3; + for (int batch_id = 0; batch_id < num_batches; ++batch_id) { + // 2. Dummy Input Data + int64_t data[4] = {1, 2, 3, 4}; + PaddleTensor tensor; + tensor.shape = std::vector({4, 1}); + tensor.data = PaddleBuf(data, sizeof(data)); + tensor.dtype = PaddleDType::INT64; + + std::vector inputs(4, tensor); + std::vector outputs; + // 3. Run + CHECK(predictor->Run(inputs, &outputs)); + + // 4. Get output. + CHECK_EQ(outputs.size(), 1UL); + // Check the output buffer size and result of each tid. + CHECK_EQ(outputs.front().data.length(), 33168UL); + float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815, + 0.000932706}; + const size_t num_elements = + outputs.front().data.length() / sizeof(float); + // The outputs' buffers are in CPU memory. + for (size_t i = 0; i < std::min(static_cast(5), num_elements); + i++) { + CHECK_NEAR(static_cast(outputs.front().data.data())[i], + result[i], 0.001); + } + } + }); + } + for (int i = 0; i < num_threads; ++i) { + threads[i].join(); + } +} + +} // namespace demo +} // namespace paddle + +int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + paddle::demo::Main(false /* use_gpu*/); + paddle::demo::MainThreads(1, false /* use_gpu*/); + paddle::demo::MainThreads(4, false /* use_gpu*/); + if (FLAGS_use_gpu) { + paddle::demo::Main(true /*use_gpu*/); + paddle::demo::MainThreads(1, true /*use_gpu*/); + paddle::demo::MainThreads(4, true /*use_gpu*/); + } + return 0; +} diff --git a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc new file mode 100644 index 00000000..f7da55c9 --- /dev/null +++ b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc @@ -0,0 +1,80 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file contains demo of mobilenet for tensorrt. + */ + +#include +#include // use glog instead of CHECK to avoid importing other paddle header files. +#include "utils.h" // NOLINT + +DECLARE_double(fraction_of_gpu_memory_to_use); +DEFINE_string(modeldir, "", "Directory of the inference model."); +DEFINE_string(refer, "", "path to reference result for comparison."); +DEFINE_string( + data, "", + "path of data; each line is a record, format is " + "'\t predictor; + paddle::AnalysisConfig config; + config.EnableUseGpu(100, 0); + config.SetModel(FLAGS_modeldir + "/__model__", + FLAGS_modeldir + "/__params__"); + config.EnableTensorRtEngine(); + predictor = CreatePaddlePredictor(config); + + VLOG(3) << "begin to process data"; + // Just a single batch of data. + std::string line; + std::ifstream file(FLAGS_data); + std::getline(file, line); + auto record = ProcessALine(line); + file.close(); + + // Inference. + PaddleTensor input; + input.shape = record.shape; + input.data = + PaddleBuf(record.data.data(), record.data.size() * sizeof(float)); + input.dtype = PaddleDType::FLOAT32; + + VLOG(3) << "run executor"; + std::vector output; + predictor->Run({input}, &output, 1); + + VLOG(3) << "output.size " << output.size(); + auto& tensor = output.front(); + VLOG(3) << "output: " << SummaryTensor(tensor); + + // compare with reference result + CheckOutput(FLAGS_refer, tensor); +} + +} // namespace demo +} // namespace paddle + +int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + paddle::demo::Main(); + return 0; +} diff --git a/paddle/fluid/inference/api/demo_ci/utils.h b/paddle/fluid/inference/api/demo_ci/utils.h new file mode 100644 index 00000000..1505a898 --- /dev/null +++ b/paddle/fluid/inference/api/demo_ci/utils.h @@ -0,0 +1,140 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include "paddle/include/paddle_inference_api.h" + +namespace paddle { +namespace demo { + +struct Record { + std::vector data; + std::vector shape; +}; + +static void split(const std::string& str, char sep, + std::vector* pieces) { + pieces->clear(); + if (str.empty()) { + return; + } + size_t pos = 0; + size_t next = str.find(sep, pos); + while (next != std::string::npos) { + pieces->push_back(str.substr(pos, next - pos)); + pos = next + 1; + next = str.find(sep, pos); + } + if (!str.substr(pos).empty()) { + pieces->push_back(str.substr(pos)); + } +} + +Record ProcessALine(const std::string& line) { + VLOG(3) << "process a line"; + std::vector columns; + split(line, '\t', &columns); + CHECK_EQ(columns.size(), 2UL) + << "data format error, should be \t"; + + Record record; + std::vector data_strs; + split(columns[0], ' ', &data_strs); + for (auto& d : data_strs) { + record.data.push_back(std::stof(d)); + } + + std::vector shape_strs; + split(columns[1], ' ', &shape_strs); + for (auto& s : shape_strs) { + record.shape.push_back(std::stoi(s)); + } + VLOG(3) << "data size " << record.data.size(); + VLOG(3) << "data shape size " << record.shape.size(); + return record; +} + +void CheckOutput(const std::string& referfile, const PaddleTensor& output) { + std::string line; + std::ifstream file(referfile); + std::getline(file, line); + auto refer = ProcessALine(line); + file.close(); + + size_t numel = output.data.length() / PaddleDtypeSize(output.dtype); + VLOG(3) << "predictor output numel " << numel; + VLOG(3) << "reference output numel " << refer.data.size(); + CHECK_EQ(numel, refer.data.size()); + switch (output.dtype) { + case PaddleDType::INT64: { + for (size_t i = 0; i < numel; ++i) { + CHECK_EQ(static_cast(output.data.data())[i], refer.data[i]); + } + break; + } + case PaddleDType::FLOAT32: { + for (size_t i = 0; i < numel; ++i) { + CHECK_LT( + fabs(static_cast(output.data.data())[i] - refer.data[i]), + 1e-5); + } + break; + } + case PaddleDType::INT32: { + for (size_t i = 0; i < numel; ++i) { + CHECK_EQ(static_cast(output.data.data())[i], refer.data[i]); + } + break; + } + } +} + +/* + * Get a summary of a PaddleTensor content. + */ +static std::string SummaryTensor(const PaddleTensor& tensor) { + std::stringstream ss; + int num_elems = tensor.data.length() / PaddleDtypeSize(tensor.dtype); + + ss << "data[:10]\t"; + switch (tensor.dtype) { + case PaddleDType::INT64: { + for (int i = 0; i < std::min(num_elems, 10); i++) { + ss << static_cast(tensor.data.data())[i] << " "; + } + break; + } + case PaddleDType::FLOAT32: { + for (int i = 0; i < std::min(num_elems, 10); i++) { + ss << static_cast(tensor.data.data())[i] << " "; + } + break; + } + case PaddleDType::INT32: { + for (int i = 0; i < std::min(num_elems, 10); i++) { + ss << static_cast(tensor.data.data())[i] << " "; + } + break; + } + } + return ss.str(); +} + +} // namespace demo +} // namespace paddle diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc new file mode 100644 index 00000000..0d2c418c --- /dev/null +++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc @@ -0,0 +1,90 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file contains demo for mobilenet, se-resnext50 and ocr. + */ + +#include +#include +#include "utils.h" // NOLINT + +#ifdef PADDLE_WITH_CUDA +DECLARE_double(fraction_of_gpu_memory_to_use); +#endif +DEFINE_string(modeldir, "", "Directory of the inference model."); +DEFINE_string(refer, "", "path to reference result for comparison."); +DEFINE_string( + data, "", + "path of data; each line is a record, format is " + "'\t predictor, analysis_predictor; + AnalysisConfig config; + if (use_gpu) { + config.EnableUseGpu(100, 0); + } + config.SetModel(FLAGS_modeldir + "/__model__", + FLAGS_modeldir + "/__params__"); + + predictor = CreatePaddlePredictor(config.ToNativeConfig()); + analysis_predictor = CreatePaddlePredictor(config); + + // Just a single batch of data. + std::string line; + std::ifstream file(FLAGS_data); + std::getline(file, line); + auto record = ProcessALine(line); + file.close(); + + // Inference. + PaddleTensor input; + input.shape = record.shape; + input.data = + PaddleBuf(record.data.data(), record.data.size() * sizeof(float)); + input.dtype = PaddleDType::FLOAT32; + + std::vector output, analysis_output; + predictor->Run({input}, &output, 1); + + auto& tensor = output.front(); + + // compare with reference result + CheckOutput(FLAGS_refer, tensor); + + // the analysis_output has some diff with native_output, + // TODO(luotao): add CheckOutput for analysis_output later. + analysis_predictor->Run({input}, &analysis_output, 1); +} + +} // namespace demo +} // namespace paddle + +int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + if (FLAGS_use_gpu) { + paddle::demo::Main(true /*use_gpu*/); + } else { + paddle::demo::Main(false /*use_gpu*/); + } + return 0; +} diff --git a/paddle/fluid/inference/api/demo_ci/windows_inference.md b/paddle/fluid/inference/api/demo_ci/windows_inference.md new file mode 100644 index 00000000..44b2586a --- /dev/null +++ b/paddle/fluid/inference/api/demo_ci/windows_inference.md @@ -0,0 +1,19 @@ +# windows inference +本文介绍windows inference,目前只提供了静态编译,编译出paddle_fluid.lib,包含了除openblas.dll之外的所有第三方依赖库。 + +1. 下载最新的paddle_fluid.lib和openblas.dll,并把它们放在同一个目录下。 + +2. 准备预训练好的模型文件,例如models中的模型,可以将模型用safe_inference_model接口保存下来。将模型文件放到该目录下 + +3. 进入Paddle/paddle/fluid/inference/api/demo_ci目录,新建build目录,然后使用cmake生成vs2015的solution文件。 +其中PADDLE_LIB是前面的paddle_fluid.lib对应文件夹, CUDA_LIB指定为x64格式下的cuda系统库目录文件夹。 +```shell + cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_GPU=ON -DWITH_MKL=OFF -DWITH_STATIC_LIB=ON -DCMAKE_BUILD_TYPE=Release -DDEMO_NAME=inference_icnet -DPADDLE_LIB=D:\to_the_paddle_fluid.lib -DCUDA_LIB=D:\tools\v8.0\lib\x64 +``` +然后用vs2015打开对应的项目文件,注意使用静态链接 "/MT",生成对应的exe。将openblas.dll放到exe所在目录。 + +4. 该exe即为项目生成文件,可绑定运行。 + +## FAQ +1. cmake需要您手动下载,并添加到系统路径里 +2. 路径中的不要包含空格,例如发现CUDA_LIB路径是Program Files(x86)可能会出错。可以将CUDA拷贝到一个新位置。 diff --git a/paddle/fluid/inference/api/details/CMakeLists.txt b/paddle/fluid/inference/api/details/CMakeLists.txt new file mode 100644 index 00000000..80b53b32 --- /dev/null +++ b/paddle/fluid/inference/api/details/CMakeLists.txt @@ -0,0 +1,18 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +cc_library(reset_tensor_array SRCS reset_tensor_array.cc DEPS lod_tensor scope) +cc_library(zero_copy_tensor SRCS zero_copy_tensor.cc DEPS scope lod_tensor enforce) +cc_library(zero_copy_tensor_dummy SRCS zero_copy_tensor_dummy.cc) diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.cc b/paddle/fluid/inference/api/details/reset_tensor_array.cc new file mode 100644 index 00000000..03c2aa3f --- /dev/null +++ b/paddle/fluid/inference/api/details/reset_tensor_array.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/api/details/reset_tensor_array.h" + +namespace paddle { +namespace details { + +// Should be called after the parameters are loaded. +void TensorArrayBatchCleaner::CollectTensorArrays(framework::Scope *scope) { + if (flag_) { + for (auto &var_name : scope->LocalVarNames()) { + auto *var = scope->FindVar(var_name); + // TODO(Superjomn) should avoid the case when a TensorArray is a + // parameter. + if (var_name == "feed" || var_name == "fetch") continue; + if (var->IsType()) { + VLOG(4) << "collect " << var_name; + arrays_.push_back(var->GetMutable()); + } + } + for (auto *kid : scope->kids()) { + CollectTensorArrays(kid); + } + + VLOG(3) << "Collect " << arrays_.size() << " arrays"; + flag_ = false; + } +} + +// Should be called when `Run` finished. +void TensorArrayBatchCleaner::ResetTensorArray() { + for (auto *arr : arrays_) { + arr->clear(); + } +} + +void TensorArrayBatchCleaner::CollectNoTensorVars(framework::Scope *scope) { + if (no_tensor_flag_) { + for (auto &var_name : scope->LocalVarNames()) { + auto *var = scope->FindVar(var_name); + if (!var->IsInitialized()) continue; + if (!valid_types_.count(var->Type())) { + no_tensor_vars_.insert(var); + } + } + + for (auto *kid : scope->kids()) { + CollectTensorArrays(kid); + } + no_tensor_flag_ = false; // Only collect one time. + } +} + +void TensorArrayBatchCleaner::ResetNoTensorVars() { + for (auto *var : no_tensor_vars_) { + var->Clear(); + } +} + +} // namespace details +} // namespace paddle diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.h b/paddle/fluid/inference/api/details/reset_tensor_array.h new file mode 100644 index 00000000..213c6891 --- /dev/null +++ b/paddle/fluid/inference/api/details/reset_tensor_array.h @@ -0,0 +1,57 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/variable.h" + +namespace paddle { +namespace details { + +// Clean the TensorArray each batch to make the behavior the same with the +// training phase. +struct TensorArrayBatchCleaner { + TensorArrayBatchCleaner() { + constexpr auto kTensorId = framework::VarTypeTrait::kId; + constexpr auto kLoDTensorId = + framework::VarTypeTrait::kId; + valid_types_.insert(kTensorId); + valid_types_.insert(kLoDTensorId); + } + // Collect the variables that are not Tensor or LoDTensor, and reset them to a + // bool(trick), because some of them are containers, and some operators just + // keep inserting new items without clearing the containers first; So the + // memory grow larger and larger in inference service deployed online. + void CollectNoTensorVars(framework::Scope *scope); + void ResetNoTensorVars(); + + // Fix the tensor array not clear in the inference scenarios. + void CollectTensorArrays(framework::Scope *scope); + void ResetTensorArray(); + + private: + bool flag_{true}; + bool no_tensor_flag_{true}; + std::vector arrays_; + + std::unordered_set valid_types_; + std::unordered_set no_tensor_vars_; +}; + +} // namespace details +} // namespace paddle diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc new file mode 100644 index 00000000..937b6398 --- /dev/null +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -0,0 +1,196 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { + +void ZeroCopyTensor::Reshape(const std::vector &shape) { + PADDLE_ENFORCE(!name_.empty(), + "Need to SetName first, so that the corresponding tensor can " + "be retrieved."); + PADDLE_ENFORCE(input_or_output_, + "Can't reshape the output tensor, it is readonly"); + PADDLE_ENFORCE(scope_); + auto *scope = static_cast(scope_); + auto *var = scope->FindVar(name_); + PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_); + auto *tensor = var->GetMutable(); + tensor->Resize(framework::make_ddim(shape)); +} + +#define EAGER_GET_TENSOR \ + if (!tensor_) { \ + tensor_ = FindTensor(); \ + } \ + auto *tensor = static_cast(tensor_); + +template +T *ZeroCopyTensor::mutable_data(PaddlePlace place) { + EAGER_GET_TENSOR; + switch (static_cast(place)) { + case static_cast(PaddlePlace::kCPU): { + return tensor->mutable_data(platform::CPUPlace()); + } + case static_cast(PaddlePlace::kGPU): { + return tensor->mutable_data(platform::CUDAPlace()); + } + default: + PADDLE_THROW("Unsupported place: %d", static_cast(place)); + break; + } + return nullptr; +} + +template +T *ZeroCopyTensor::data(PaddlePlace *place, int *size) const { + EAGER_GET_TENSOR; + auto *res = tensor->data(); + + if (platform::is_cpu_place(tensor->place())) { + *place = PaddlePlace::kCPU; + } else if (platform::is_gpu_place(tensor->place())) { + *place = PaddlePlace::kGPU; + } else { + *place = PaddlePlace::kUNK; + } + + *size = tensor->numel(); + return res; +} + +PaddleDType ZeroCopyTensor::type() const { + EAGER_GET_TENSOR; + auto type = tensor->type(); + if (type == framework::proto::VarType::FP32) { + return PaddleDType::FLOAT32; + } else if (type == framework::proto::VarType::INT64) { + return PaddleDType::INT64; + } else if (type == framework::proto::VarType::INT32) { + return PaddleDType::INT32; + } else { + LOG(ERROR) << "unknown type, only support float32 and int64 now."; + } + return PaddleDType::FLOAT32; +} + +template +void ZeroCopyTensor::copy_from_cpu(const T *data) { + EAGER_GET_TENSOR; + PADDLE_ENFORCE_GE( + tensor->numel(), 0, + "You should call ZeroCopyTensor::Reshape(const std::vector &shape)" + "function before copy data from cpu."); + size_t ele_size = tensor->numel() * sizeof(T); + + if (place_ == PaddlePlace::kCPU) { + auto *t_data = tensor->mutable_data(platform::CPUPlace()); + std::memcpy(static_cast(t_data), data, ele_size); + } else { +#ifdef PADDLE_WITH_CUDA + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + platform::CUDAPlace gpu_place(device_); + auto *t_data = tensor->mutable_data(gpu_place); + auto *dev_ctx = + static_cast(pool.Get(gpu_place)); + + memory::Copy(gpu_place, static_cast(t_data), platform::CPUPlace(), + data, ele_size, dev_ctx->stream()); +#else + PADDLE_THROW("Not compile with CUDA, should not reach here."); +#endif + } +} + +template +void ZeroCopyTensor::copy_to_cpu(T *data) { + EAGER_GET_TENSOR; + auto ele_num = tensor->numel(); + auto *t_data = tensor->data(); + auto t_place = tensor->place(); + + if (platform::is_cpu_place(t_place)) { + std::memcpy(static_cast(data), t_data, ele_num * sizeof(T)); + } else { +#ifdef PADDLE_WITH_CUDA + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto gpu_place = boost::get(t_place); + auto *dev_ctx = + static_cast(pool.Get(gpu_place)); + memory::Copy(platform::CPUPlace(), static_cast(data), gpu_place, + t_data, ele_num * sizeof(T), dev_ctx->stream()); + cudaDeviceSynchronize(); +#else + PADDLE_THROW("Not compile with CUDA, should not reach here."); +#endif + } +} +template void ZeroCopyTensor::copy_from_cpu(const float *data); +template void ZeroCopyTensor::copy_from_cpu(const int64_t *data); +template void ZeroCopyTensor::copy_from_cpu(const int32_t *data); +template void ZeroCopyTensor::copy_to_cpu(float *data); +template void ZeroCopyTensor::copy_to_cpu(int64_t *data); +template void ZeroCopyTensor::copy_to_cpu(int32_t *data); + +template float *ZeroCopyTensor::data(PaddlePlace *place, + int *size) const; +template int64_t *ZeroCopyTensor::data(PaddlePlace *place, + int *size) const; +template int32_t *ZeroCopyTensor::data(PaddlePlace *place, + int *size) const; +template float *ZeroCopyTensor::mutable_data(PaddlePlace place); +template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place); +template int32_t *ZeroCopyTensor::mutable_data(PaddlePlace place); + +void *ZeroCopyTensor::FindTensor() const { + PADDLE_ENFORCE(!name_.empty(), + "Need to SetName first, so that the corresponding tensor can " + "be retrieved."); + PADDLE_ENFORCE(scope_); + auto *scope = static_cast(scope_); + auto *var = scope->FindVar(name_); + PADDLE_ENFORCE(var, "No tensor called [%s] in the runtime scope", name_); + auto *tensor = var->GetMutable(); + return tensor; +} + +std::vector ZeroCopyTensor::shape() const { + EAGER_GET_TENSOR; + PADDLE_ENFORCE(tensor_, "not found tensor called %s in the scope", name_); + return framework::vectorize2int(tensor->dims()); +} + +void ZeroCopyTensor::SetLoD(const std::vector> &x) { + EAGER_GET_TENSOR; + framework::LoD lod; + for (auto &level : x) { + lod.emplace_back(level); + } + tensor->set_lod(lod); +} + +std::vector> ZeroCopyTensor::lod() const { + EAGER_GET_TENSOR; + std::vector> res; + for (auto &level : tensor->lod()) { + res.emplace_back(level); + } + return res; +} + +} // namespace paddle diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc new file mode 100644 index 00000000..cbbb3ea2 --- /dev/null +++ b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc @@ -0,0 +1,48 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/api/paddle_inference_api.h" + +namespace paddle { + +void ZeroCopyTensor::Reshape(const std::vector &shape) {} + +template +T *ZeroCopyTensor::mutable_data(PaddlePlace place) { + return nullptr; +} + +template +T *ZeroCopyTensor::data(PaddlePlace *place, int *size) const { + return nullptr; +} + +template float *ZeroCopyTensor::data(PaddlePlace *place, + int *size) const; +template int64_t *ZeroCopyTensor::data(PaddlePlace *place, + int *size) const; +template float *ZeroCopyTensor::mutable_data(PaddlePlace place); +template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place); + +void *ZeroCopyTensor::FindTensor() const { return nullptr; } + +std::vector ZeroCopyTensor::shape() const { return {}; } + +void ZeroCopyTensor::SetLoD(const std::vector> &x) {} + +std::vector> ZeroCopyTensor::lod() const { + return std::vector>(); +} + +} // namespace paddle diff --git a/paddle/fluid/inference/api/helper.cc b/paddle/fluid/inference/api/helper.cc new file mode 100644 index 00000000..9cc491e1 --- /dev/null +++ b/paddle/fluid/inference/api/helper.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/api/helper.h" + +namespace paddle { +namespace inference { + +template <> +std::string to_string>( + const std::vector> &vec) { + std::stringstream ss; + for (const auto &piece : vec) { + ss << to_string(piece) << "\n"; + } + return ss.str(); +} + +template <> +std::string to_string>>( + const std::vector>> &vec) { + std::stringstream ss; + for (const auto &line : vec) { + for (const auto &rcd : line) { + ss << to_string(rcd) << ";\t"; + } + ss << '\n'; + } + return ss.str(); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h new file mode 100644 index 00000000..e5820c36 --- /dev/null +++ b/paddle/fluid/inference/api/helper.h @@ -0,0 +1,341 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#if !defined(_WIN32) +#include +#endif +#include +#include // NOLINT +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/port.h" +#include "paddle/fluid/string/printf.h" + +extern std::string paddle::framework::DataTypeToString( + const framework::proto::VarType::Type type); + +namespace paddle { +namespace inference { + +using paddle::framework::DataTypeToString; + +// Timer for timer +class Timer { + public: + std::chrono::high_resolution_clock::time_point start; + std::chrono::high_resolution_clock::time_point startu; + + void tic() { start = std::chrono::high_resolution_clock::now(); } + double toc() { + startu = std::chrono::high_resolution_clock::now(); + std::chrono::duration time_span = + std::chrono::duration_cast>(startu - + start); + double used_time_ms = static_cast(time_span.count()) * 1000.0; + return used_time_ms; + } +}; + +static int GetUniqueId() { + static int id = 0; + return id++; +} + +static void split(const std::string &str, char sep, + std::vector *pieces, bool ignore_null = true) { + pieces->clear(); + if (str.empty()) { + if (!ignore_null) { + pieces->push_back(str); + } + return; + } + size_t pos = 0; + size_t next = str.find(sep, pos); + while (next != std::string::npos) { + pieces->push_back(str.substr(pos, next - pos)); + pos = next + 1; + next = str.find(sep, pos); + } + if (!str.substr(pos).empty()) { + pieces->push_back(str.substr(pos)); + } +} + +template +static T convert(const std::string &item, + std::function func) { + T res; + try { + res = func(item); + } catch (std::invalid_argument &e) { + std::string message = + "invalid_argument exception when try to convert : " + item; + LOG(ERROR) << message; + PADDLE_THROW(message); + } catch (std::out_of_range &e) { + std::string message = + "out_of_range exception when try to convert : " + item; + LOG(ERROR) << message; + PADDLE_THROW(message); + } catch (...) { + std::string message = "unexpected exception when try to convert " + item; + LOG(ERROR) << message; + PADDLE_THROW(message); + } + return res; +} + +static void split_to_float(const std::string &str, char sep, + std::vector *fs) { + std::vector pieces; + split(str, sep, &pieces); + std::transform(pieces.begin(), pieces.end(), std::back_inserter(*fs), + [](const std::string &v) { + return convert(v, [](const std::string &item) { + return std::stof(item); + }); + }); +} +static void split_to_int64(const std::string &str, char sep, + std::vector *is) { + std::vector pieces; + split(str, sep, &pieces); + std::transform(pieces.begin(), pieces.end(), std::back_inserter(*is), + [](const std::string &v) { + return convert(v, [](const std::string &item) { + return std::stoll(item); + }); + }); +} +static void split_to_int(const std::string &str, char sep, + std::vector *is) { + std::vector pieces; + split(str, sep, &pieces); + std::transform(pieces.begin(), pieces.end(), std::back_inserter(*is), + [](const std::string &v) { + return convert(v, [](const std::string &item) { + return std::stoi(item); + }); + }); +} +template +std::string to_string(const std::vector &vec) { + std::stringstream ss; + for (const auto &c : vec) { + ss << c << " "; + } + return ss.str(); +} +template <> +std::string to_string>( + const std::vector> &vec); + +template <> +std::string to_string>>( + const std::vector>> &vec); + +template +int VecReduceToInt(const std::vector &v) { + return std::accumulate(v.begin(), v.end(), 1, [](T a, T b) { return a * b; }); +} + +template +static void TensorAssignData(PaddleTensor *tensor, + const std::vector> &data) { + // Assign buffer + int num_elems = VecReduceToInt(tensor->shape); + tensor->data.Resize(sizeof(T) * num_elems); + int c = 0; + for (const auto &f : data) { + for (T v : f) { + static_cast(tensor->data.data())[c++] = v; + } + } +} + +template +static void TensorAssignData(PaddleTensor *tensor, + const std::vector> &data, + const std::vector &lod) { + int size = lod[lod.size() - 1]; + tensor->shape.assign({size, 1}); + tensor->lod.assign({lod}); + TensorAssignData(tensor, data); +} + +template +static void ZeroCopyTensorAssignData(ZeroCopyTensor *tensor, + const std::vector> &data) { + auto *ptr = tensor->mutable_data(PaddlePlace::kCPU); + int c = 0; + for (const auto &f : data) { + for (T v : f) { + ptr[c++] = v; + } + } +} + +template +static void ZeroCopyTensorAssignData(ZeroCopyTensor *tensor, + const PaddleBuf &data) { + auto *ptr = tensor->mutable_data(PaddlePlace::kCPU); + for (size_t i = 0; i < data.length() / sizeof(T); i++) { + ptr[i] = *(reinterpret_cast(data.data()) + i); + } +} + +static bool CompareTensor(const PaddleTensor &a, const PaddleTensor &b) { + if (a.dtype != b.dtype) { + LOG(ERROR) << "dtype not match"; + return false; + } + + if (a.lod.size() != b.lod.size()) { + LOG(ERROR) << "lod not match"; + return false; + } + for (size_t i = 0; i < a.lod.size(); i++) { + if (a.lod[i].size() != b.lod[i].size()) { + LOG(ERROR) << "lod not match"; + return false; + } + for (size_t j = 0; j < a.lod[i].size(); j++) { + if (a.lod[i][j] != b.lod[i][j]) { + LOG(ERROR) << "lod not match"; + return false; + } + } + } + + if (a.shape.size() != b.shape.size()) { + LOG(INFO) << "shape not match"; + return false; + } + for (size_t i = 0; i < a.shape.size(); i++) { + if (a.shape[i] != b.shape[i]) { + LOG(ERROR) << "shape not match"; + return false; + } + } + + auto *adata = static_cast(a.data.data()); + auto *bdata = static_cast(b.data.data()); + for (int i = 0; i < VecReduceToInt(a.shape); i++) { + if (adata[i] != bdata[i]) { + LOG(ERROR) << "data not match"; + return false; + } + } + return true; +} + +static std::string DescribeTensor(const PaddleTensor &tensor, + int max_num_of_data = 15) { + std::stringstream os; + os << "Tensor [" << tensor.name << "]\n"; + os << " - type: "; + switch (tensor.dtype) { + case PaddleDType::FLOAT32: + os << "float32"; + break; + case PaddleDType::INT64: + os << "int64"; + break; + case PaddleDType::INT32: + os << "int32"; + break; + default: + os << "unset"; + } + os << '\n'; + + os << " - shape: " << to_string(tensor.shape) << '\n'; + os << " - lod: "; + for (auto &l : tensor.lod) { + os << to_string(l) << "; "; + } + os << "\n"; + os << " - memory length: " << tensor.data.length(); + os << "\n"; + + os << " - data: "; + int dim = VecReduceToInt(tensor.shape); + float *pdata = static_cast(tensor.data.data()); + for (int i = 0; i < dim; i++) { + os << pdata[i] << " "; + } + os << '\n'; + return os.str(); +} + +static std::string DescribeZeroCopyTensor(const ZeroCopyTensor &tensor) { + std::stringstream os; + os << "Tensor [" << tensor.name() << "]\n"; + + os << " - shape: " << to_string(tensor.shape()) << '\n'; + os << " - lod: "; + for (auto &l : tensor.lod()) { + os << to_string(l) << "; "; + } + os << "\n"; + PaddlePlace place; + int size; + const auto *data = tensor.data(&place, &size); + os << " - numel: " << size; + os << "\n"; + os << " - data: "; + for (int i = 0; i < size; i++) { + os << data[i] << " "; + } + return os.str(); +} + +static void PrintTime(int batch_size, int repeat, int num_threads, int tid, + double batch_latency, int epoch = 1, + const framework::proto::VarType::Type data_type = + framework::proto::VarType::FP32) { + PADDLE_ENFORCE(batch_size > 0, "Non-positive batch size."); + double sample_latency = batch_latency / batch_size; + LOG(INFO) << "====== threads: " << num_threads << ", thread id: " << tid + << " ======"; + LOG(INFO) << "====== batch size: " << batch_size << ", iterations: " << epoch + << ", repetitions: " << repeat << " ======"; + LOG(INFO) << "====== batch latency: " << batch_latency + << "ms, number of samples: " << batch_size * epoch + << ", sample latency: " << sample_latency + << "ms, fps: " << 1000.f / sample_latency + << ", data type: " << DataTypeToString(data_type) << " ======"; +} + +static bool IsFileExists(const std::string &path) { + std::ifstream file(path); + bool exists = file.is_open(); + file.close(); + return exists; +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/api/high_level_api.md b/paddle/fluid/inference/api/high_level_api.md new file mode 100644 index 00000000..8b8b6916 --- /dev/null +++ b/paddle/fluid/inference/api/high_level_api.md @@ -0,0 +1,60 @@ +# Inference High-level APIs +This document describes the high-level inference APIs, one can use them to deploy a Paddle model for an application quickly. + +The APIs are described in `paddle_inference_api.h`, just one header file, and two libaries `libpaddle_fluid.so` and `libpaddle_fluid_api.so` are needed for a deployment. + +## PaddleTensor +We provide the `PaddleTensor` data structure to give a general tensor interface. + +The definition is + +```c++ +struct PaddleTensor { + std::string name; // variable name. + std::vector shape; + PaddleBuf data; // blob of data. + PaddleDType dtype; +}; +``` + +The data is stored in a continuous memory `PaddleBuf,` and a `PaddleDType` specifies tensor's data type. +The `name` field is used to specify the name of an input variable, +that is important when there are multiple inputs and need to distinguish which variable to set. + +## engine +The inference APIs has two different underlying engines + +- the native engine, which is consists of the native operators and framework, +- the Anakin engine, which has an Anakin library embedded. + +The native engine takes a native Paddle model as input, and supports any model that trained by Paddle, +the Anakin engine is faster for some model, +but it can only take the Anakin model as input(user need to transform the format first manually) and currently not all Paddle models are supported. + +```c++ +enum class PaddleEngineKind { + kNative = 0, // Use the native Fluid facility. + kAnakin, // Use Anakin for inference. +}; +``` + +## PaddlePredictor and how to create one +The main interface is `PaddlePredictor,` there are following methods + +- `bool Run(const std::vector& inputs, std::vector* output_data)` + - take inputs and output `output_data.` +- `Clone` to clone a predictor from an existing one, with model parameter shared. + +There is a factory method to help create a predictor, and the user takes the ownership of this object. + +```c++ +template +std::unique_ptr CreatePaddlePredictor(const ConfigT& config); +``` + +By specifying the engine kind and config, one can get a specific implementation. + +## Reference + +- [paddle_inference_api.h](./paddle_inference_api.h) +- [some demos](./demo_ci) diff --git a/paddle/fluid/inference/api/high_level_api_cn.md b/paddle/fluid/inference/api/high_level_api_cn.md new file mode 100644 index 00000000..442c5989 --- /dev/null +++ b/paddle/fluid/inference/api/high_level_api_cn.md @@ -0,0 +1,87 @@ +# Paddle 预测 API + +为了更简单方便的预测部署,Fluid 提供了一套高层 API 用来隐藏底层不同的优化实现。 + +预测库包含: + +- 头文件 `paddle_inference_api.h` 定义了所有的接口 +- 库文件`libpaddle_fluid.so` 或 `libpaddle_fluid.a` +- 库文件 `libpaddle_inference_api.so` 或 `libpaddle_inference_api.a` + +下面是详细的一些 API 概念介绍 + +## PaddleTensor + +PaddleTensor 定义了预测最基本的输入输出的数据格式,其定义是 + +```c++ +struct PaddleTensor { + std::string name; // variable name. + std::vector shape; + PaddleBuf data; // blob of data. + PaddleDType dtype; +}; +``` + +- `name` 用于指定输入数据对应的 模型中variable 的名字 (暂时没有用,但会在后续支持任意 target 时启用) +- `shape` 表示一个 Tensor 的 shape +- `data` 数据以连续内存的方式存储在`PaddleBuf` 中,`PaddleBuf` 可以接收外面的数据或者独立`malloc`内存,详细可以参考头文件中相关定义。 +- `dtype` 表示 Tensor 的数据类型 + +## engine + +高层 API 底层有多种优化实现,我们称之为 engine,目前有三种 engine + +- 原生 engine,由 paddle 原生的 forward operator 组成,可以天然支持所有paddle 训练出的模型, +- Anakin engine,封装了 [Anakin](https://github.com/PaddlePaddle/Anakin) ,在某些模型上性能不错,但只能接受自带模型格式,无法支持所有 paddle 模型, +- TensorRT mixed engine,用子图的方式支持了 [TensorRT](https://developer.nvidia.com/tensorrt) ,支持所有paddle 模型,并自动切割部分计算子图到 TensorRT 上加速(WIP) + +其实现为 + +```c++ +enum class PaddleEngineKind { + kNative = 0, // Use the native Fluid facility. + kAnakin, // Use Anakin for inference. + kAutoMixedTensorRT // Automatically mixing TensorRT with the Fluid ops. +}; +``` + +## 预测部署过程 + +总体上分为以下步骤 + +1. 用合适的配置创建 `PaddlePredictor` +2. 创建输入用的 `PaddleTensor`,传入到 `PaddlePredictor` 中 +3. 获取输出的 `PaddleTensor` ,将结果取出 + +下面完整演示一个简单的模型,部分细节代码隐去 + +```c++ +#include "paddle_inference_api.h" + +// 创建一个 config,并修改相关设置 +paddle::NativeConfig config; +config.model_dir = "xxx"; +config.use_gpu = false; +// 创建一个原生的 PaddlePredictor +auto predictor = + paddle::CreatePaddlePredictor(config); +// 创建输入 tensor +int64_t data[4] = {1, 2, 3, 4}; +paddle::PaddleTensor tensor{.name = "", + .shape = std::vector({4, 1}), + .data = paddle::PaddleBuf(data, sizeof(data)), + .dtype = paddle::PaddleDType::INT64}; +// 创建输出 tensor,输出 tensor 的内存可以复用 +std::vector outputs; +// 执行预测 +CHECK(predictor->Run(slots, &outputs)); +// 获取 outputs ... +``` + +编译时,联编 `libpaddle_fluid.a/.so` 和 `libpaddle_inference_api.a/.so` 便可。 + +## 详细代码参考 + +- [inference demos](./demo_ci) +- [复杂单线程/多线程例子](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/api/test_api_impl.cc) diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc new file mode 100644 index 00000000..fea56f01 --- /dev/null +++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc @@ -0,0 +1,477 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/api/mkldnn_quantizer.h" +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/api/analysis_predictor.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { + +using platform::CPUPlace; +using framework::LoDTensor; +using framework::ir::Graph; +using ConstEigenVectorArrayMap = + Eigen::Map>; +using string::PrettyLogH1; +static LoDTensor CreateScaleTensor(int64_t channels_num = 1); + +bool AnalysisPredictor::MkldnnQuantizer::CalculateScales() { + PrettyLogH1("--- Calculating scales for quantization"); + using VariableNameMap = std::map>; + std::map> gathered_data; + for (const auto* op : predictor_.inference_program_->Block(0).AllOps()) { + if (op->HasAttr("use_quantizer") && + boost::get(op->GetAttr("use_quantizer"))) { + const VariableNameMap& connections_in = op->Inputs(); + const VariableNameMap& connections_out = op->Outputs(); + + auto glambda = [&](const VariableNameMap& connections, bool is_output) { + for (auto const& conn : connections) { + for (const auto& var_name : conn.second) { + // skip if scale already computed + if (scales_.find(var_name) != scales_.end()) continue; + + auto* var = predictor_.sub_scope_->FindVar(var_name); + PADDLE_ENFORCE(var, "%s is not in the scope", var_name); + PADDLE_ENFORCE(var->IsType(), + "Only support lod tensor now."); + LoDTensor* var_tensor = var->GetMutable(); + + // force unsigned type if already know it + bool is_unsigned = false; + bool compute_scale = true; + if (is_output) { + if (op->Type() == "conv2d") { + // output of conv2d with relu must be unsigned + is_unsigned = (op->HasAttr("fuse_relu") && + boost::get(op->GetAttr("fuse_relu"))) || + (op->HasAttr("fuse_brelu") && + boost::get(op->GetAttr("fuse_brelu"))); + } else if (op->Type() == "relu") { + is_unsigned = true; + } else if (op->Type() == "transpose2" || + op->Type() == "reshape2" || op->Type() == "pool2d") { + auto input_var_name = op->Input("X")[0]; + PADDLE_ENFORCE(scales_.find(input_var_name) != scales_.end(), + "Input scales must be calculated before the " + "output scales to infer if output is unsigned."); + if (scales_.find(input_var_name) != scales_.end()) { + scales_[var_name] = scales_[input_var_name]; + } + compute_scale = false; + } else if (op->Type() == "concat") { + // output of ops with unsigned input must be unsigned + is_unsigned = true; + double min_scale = std::numeric_limits::max(); + for (auto input_var_name : op->Input("X")) { + PADDLE_ENFORCE( + scales_.find(input_var_name) != scales_.end(), + "Input scales must be calculated before the " + "output scales to infer if output is unsigned."); + is_unsigned = is_unsigned && scales_[input_var_name].first; + min_scale = std::min( + min_scale, + scales_[input_var_name].second.data()[0]); + } + auto scale_tensor = CreateScaleTensor(); + scale_tensor.data()[0] = min_scale; + scales_[var_name] = {is_unsigned, scale_tensor}; + compute_scale = false; + } + } + if (compute_scale) + CalculateSingleScale(op->Type(), conn.first, var_name, + *var_tensor, is_unsigned); + } + } + }; + + // handle inputs first to let is_unsigned be inferred for the outputs + glambda(connections_in, false /* is_output */); + glambda(connections_out, true /* is_output */); + } + } + + return true; +} + +void AnalysisPredictor::MkldnnQuantizer::CalculateSingleScale( + const std::string& op_type_name, const std::string& conn_name, + const std::string& var_name, const LoDTensor& var_tensor, + bool is_unsigned) { + auto rule = qconfig_->scale_algo(op_type_name, conn_name); + if (rule == ScaleAlgo::NONE) return; + + PADDLE_ENFORCE( + var_tensor.numel() > 0, + "MkldnnQuantizer: LoDTensor of variable %s for quantization of op " + "%s of connection %s should not be empty.", + var_name, op_type_name, conn_name); + + switch (rule) { + case ScaleAlgo::MAX: + scales_[var_name] = GetMaxScalingFactor(var_tensor, is_unsigned); + break; + case ScaleAlgo::MAX_CH: + scales_[var_name] = GetMaxChScalingFactor(var_tensor, is_unsigned); + break; + case ScaleAlgo::KL: + scales_[var_name] = GetKLScalingFactor(var_tensor, is_unsigned); + break; + default: + throw std::runtime_error( + "MkldnnQuantizer: Unexpected ScaleAlgo specified."); + } +} + +static LoDTensor CreateScaleTensor(int64_t channels_num) { + LoDTensor scale_tensor; + scale_tensor.Resize({channels_num}); + scale_tensor.mutable_data(CPUPlace()); + return scale_tensor; +} + +std::vector AnalysisPredictor::MkldnnQuantizer::ExpandQuantizedBins( + std::vector quantized_bins, std::vector reference_bins) const { + std::vector expanded_quantized_bins(reference_bins.size(), 0); + int num_merged_bins = reference_bins.size() / quantized_bins.size(); + int j_start = 0; + int j_end = num_merged_bins; + for (size_t idx = 0; idx < quantized_bins.size(); idx++) { + int zero_count = + std::count(&reference_bins[j_start], &reference_bins[j_end], 0); + num_merged_bins = j_end - j_start; + int avg_bin_ele; + if (zero_count == num_merged_bins) { + avg_bin_ele = 0; + } else { + avg_bin_ele = quantized_bins[idx] / (num_merged_bins - zero_count + 0.0); + } + for (int idx1 = j_start; idx1 < j_end; idx1++) { + expanded_quantized_bins[idx1] = + (reference_bins[idx1] == 0) ? 0 : avg_bin_ele; + } + j_start += num_merged_bins; + j_end += num_merged_bins; + if ((idx + 1) == quantized_bins.size() - 1) { + j_end = reference_bins.size(); + } + } + return expanded_quantized_bins; +} + +std::pair +AnalysisPredictor::MkldnnQuantizer::GetKLScalingFactor( + const LoDTensor& var_tensor, bool is_unsigned) const { + ConstEigenVectorArrayMap eigen_tensor{var_tensor.data(), + var_tensor.numel(), 1}; + int precision_hist_num_bins = 2048; + float max_val = eigen_tensor.maxCoeff(); + float min_val = eigen_tensor.minCoeff(); + bool is_positive = min_val >= 0.0f; + if (is_unsigned) + PADDLE_ENFORCE( + is_positive, + "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0", + min_val); + + int num_quantized_bins = 255; + + std::vector hist; + float bin_width; + int starting_iter; + int ending_iter = precision_hist_num_bins - 1; + if (is_positive) { + std::tie(hist, bin_width) = + Histogram(var_tensor, min_val, max_val, precision_hist_num_bins); + starting_iter = static_cast(ending_iter * 0.7); + } else { + float th = std::max(std::abs(max_val), std::abs(min_val)); + std::tie(hist, bin_width) = + Histogram(var_tensor, -th, th, precision_hist_num_bins); + starting_iter = 0; + if (std::abs(max_val) > std::abs(min_val)) { + while (starting_iter < ending_iter) { + if (hist[starting_iter] == 0) { + ++starting_iter; + continue; + } else { + break; + } + } + starting_iter += static_cast((ending_iter - starting_iter) * 0.6); + } else { + while (ending_iter > 0) { + if (hist[ending_iter] == 0) { + --ending_iter; + continue; + } else { + break; + } + } + starting_iter = static_cast(0.6 * ending_iter); + } + } + auto P_sum = eigen_tensor.size(); + int min_kl_divergence = 0; + int min_kl_index = 0; + bool kl_inited = false; + for (int i = starting_iter; i <= ending_iter; i++) { + std::vector reference_distr_P(&hist[0], &hist[i]); + auto outliers_count = + std::accumulate(&hist[i], &hist[precision_hist_num_bins], 0); + if (reference_distr_P[i - 1] == 0) { + continue; + } + reference_distr_P[i - 1] += outliers_count; + auto reference_distr_bins = reference_distr_P; + std::vector candidate_distr_Q(&hist[0], &hist[i]); + int num_merged_bins = i / num_quantized_bins; + std::vector candidate_distr_Q_quantized(num_quantized_bins, 0); + int j_start = 0; + int j_end = num_merged_bins; + for (int idx = 0; idx < num_quantized_bins; idx++) { + candidate_distr_Q_quantized[idx] = std::accumulate( + &candidate_distr_Q[j_start], &candidate_distr_Q[j_end], 0); + j_start += num_merged_bins; + j_end += num_merged_bins; + if ((idx + 1) == num_quantized_bins - 1) { + j_end = i; + } + } + candidate_distr_Q = + ExpandQuantizedBins(candidate_distr_Q_quantized, reference_distr_bins); + int Q_sum = + std::accumulate(candidate_distr_Q.begin(), candidate_distr_Q.end(), 0); + auto kl_divergence = + SafeEntropy(reference_distr_P, P_sum, candidate_distr_Q, Q_sum); + if (!kl_inited) { + min_kl_divergence = kl_divergence; + min_kl_index = i; + kl_inited = true; + } else if (kl_divergence < min_kl_divergence) { + min_kl_divergence = kl_divergence; + min_kl_index = i; + } else { + } + } + if (min_kl_index == 0) { + while (starting_iter > 0) { + if (hist[starting_iter] == 0) { + starting_iter -= 1; + continue; + } else { + break; + } + } + min_kl_index = starting_iter; + } + + LoDTensor scale_tensor = CreateScaleTensor(); + scale_tensor.data()[0] = 1.0 / ((min_kl_index + 0.5) * bin_width); + + return std::make_pair(is_unsigned, scale_tensor); +} + +std::pair +AnalysisPredictor::MkldnnQuantizer::GetMaxScalingFactor( + const LoDTensor& var_tensor, bool is_unsigned) const { + ConstEigenVectorArrayMap eigen_tensor{var_tensor.data(), + var_tensor.numel(), 1}; + float max_abs = eigen_tensor.abs().maxCoeff(); + float min_val = eigen_tensor.minCoeff(); + if (is_unsigned) + PADDLE_ENFORCE( + min_val >= 0.0f, + "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0", + min_val); + + LoDTensor scale_tensor = CreateScaleTensor(); + scale_tensor.data()[0] = 1.0 / max_abs; + + return std::make_pair(is_unsigned, scale_tensor); +} + +std::pair +AnalysisPredictor::MkldnnQuantizer::GetMaxChScalingFactor( + const LoDTensor& var_tensor, bool is_unsigned) const { + PADDLE_ENFORCE(var_tensor.dims().size() > 0, "Tensor dimension is empty."); + + ConstEigenVectorArrayMap eigen_tensor{var_tensor.data(), + var_tensor.numel(), 1}; + float min_val = eigen_tensor.minCoeff(); + if (is_unsigned) + PADDLE_ENFORCE( + min_val >= 0.0f, + "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0", + min_val); + + int channels = var_tensor.dims()[0]; + LoDTensor scale_tensor = CreateScaleTensor(channels); + auto* scale_ptr = scale_tensor.mutable_data(CPUPlace()); + + for (int i = 0; i < channels; ++i) { + const auto tensor = var_tensor.Slice(i, i + 1); + + ConstEigenVectorArrayMap eigen_tensor{tensor.data(), tensor.numel(), + 1}; + float max_abs = eigen_tensor.abs().maxCoeff(); + scale_ptr[i] = 1.0 / max_abs; + } + + return std::make_pair(is_unsigned, scale_tensor); +} + +std::pair, float> +AnalysisPredictor::MkldnnQuantizer::Histogram( + const framework::LoDTensor& var_tensor, float min_val, float max_val, + size_t num_bins) const { + PADDLE_ENFORCE_GT(num_bins, 0, + "MkldnnQuantizer: To calculate Histogram, num_bins (" + + std::to_string(num_bins) + ") must be positive."); + PADDLE_ENFORCE_GT( + var_tensor.numel(), 0, + "MkldnnQuantizer: To calculate Histogram, the tensor must not be empty."); + PADDLE_ENFORCE(max_val >= min_val, + "MkldnnQuantizer: To calculate Histogram, max_val (" + + std::to_string(max_val) + + ") must be greater or equal" + "to min_val (" + + std::to_string(min_val) + ")."); + ConstEigenVectorArrayMap eigen_tensor{var_tensor.data(), + var_tensor.numel(), 1}; + auto bin_width = std::abs(max_val - min_val) / num_bins; + std::vector hist(num_bins); + + for (int i = 0; i < eigen_tensor.size(); i++) { + int bin = std::min( + num_bins - 1, + static_cast(floor((eigen_tensor[i] - min_val) / bin_width))); + ++hist[bin]; + } + + return std::make_pair(std::move(hist), std::move(bin_width)); +} + +void AnalysisPredictor::MkldnnQuantizer::ClearDeviceContext() const { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + platform::MKLDNNDeviceContext* dev_ctx = + (platform::MKLDNNDeviceContext*)pool.Get(predictor_.place_); + dev_ctx->ResetBlobMap(); +} + +void AnalysisPredictor::MkldnnQuantizer::PrepareArgument() const { + auto& arg = predictor_.argument_; + if (!arg.scope_valid()) arg.SetScope(new framework::Scope); + arg.SetMainProgramNotOwned(predictor_.inference_program_.get()); + auto graph = std::unique_ptr(new Graph(arg.main_program())); + arg.SetMainGraph(graph.release()); + auto* scope_ptr = arg.scope_ptr(); + PADDLE_ENFORCE(scope_ptr); + arg.main_graph().SetNotOwned(framework::ir::kParamScopeAttr, scope_ptr); + + auto* builder = predictor_.config_.pass_builder(); + builder->SetPasses({ + "infer_clean_graph_pass", "cpu_quantize_pass", "cpu_quantize_squash_pass", + }); + if (predictor_.config_.ir_debug_) builder->TurnOnDebug(); + auto passes = builder->AllPasses(); + predictor_.argument_.SetIrAnalysisPasses(passes); + predictor_.argument_.SetAnalysisPasses( + {"ir_analysis_pass", "memory_optimize_pass", "ir_graph_to_program_pass"}); + predictor_.argument_.SetQuantVarScales(scales_); +} + +bool AnalysisPredictor::MkldnnQuantizer::Quantize() { + if (!RunWarmup()) return false; + if (!CalculateScales()) return false; + ClearDeviceContext(); + predictor_.PrepareScope(predictor_.scope_); + predictor_.CreateExecutor(); + if (!RunQuantizePasses()) return false; + predictor_.PrepareExecutor(); + predictor_.PrepareFeedFetch(); + return true; +} + +bool AnalysisPredictor::MkldnnQuantizer::RunQuantizePasses() const { + predictor_.executor_->CreateVariables(*predictor_.inference_program_, 0, true, + predictor_.sub_scope_); + PrepareArgument(); + auto& arg = predictor_.argument_; + Analyzer().Run(&arg); + PADDLE_ENFORCE(arg.scope_valid()); + VLOG(5) << "to prepare executor"; + ARGUMENT_CHECK_FIELD((&arg), ir_analyzed_program); + predictor_.inference_program_.reset( + new framework::ProgramDesc(arg.ir_analyzed_program())); + LOG(INFO) << "== optimize 2 end =="; + predictor_.executor_->CreateVariables(*predictor_.inference_program_, 0, + false, predictor_.sub_scope_); + return true; +} + +bool AnalysisPredictor::MkldnnQuantizer::RunWarmup() const { + VLOG(3) << "Predictor: run a quantization warmup iteration"; + auto warmup_data = qconfig_->warmup_data(); + PADDLE_ENFORCE_NOT_NULL(warmup_data, + "Warmup data cannot be NULL in the config."); + PrettyLogH1("--- Running warmup iteration for quantization"); + + // Run the inference program + std::vector output_slots; + predictor_.Run(*warmup_data, &output_slots, qconfig_->warmup_batch_size()); + + return true; +} + +float AnalysisPredictor::MkldnnQuantizer::SafeEntropy( + std::vector reference_distr_P, int P_sum, + std::vector candidate_distr_Q, int Q_sum) const { + PADDLE_ENFORCE_EQ(reference_distr_P.size(), candidate_distr_Q.size()); + float tmp_sum1 = 0; + float tmp_sum2 = 0; + for (size_t idx = 0; idx < reference_distr_P.size(); idx++) { + int p_idx = reference_distr_P[idx]; + int q_idx = candidate_distr_Q[idx]; + if (p_idx == 0) { + tmp_sum1 += 0; + tmp_sum2 += 0; + } else { + PADDLE_ENFORCE(q_idx != 0, "MkldnnQuantizer: Fatal error!, idx = " + + std::to_string(idx) + + " qindex = 0! p_idx = " + + std::to_string(p_idx)); + } + tmp_sum1 += p_idx * (log(Q_sum * p_idx)); + tmp_sum2 += p_idx * (log(P_sum * q_idx)); + } + return (tmp_sum1 - tmp_sum2) / P_sum; +} + +} // namespace paddle diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.h b/paddle/fluid/inference/api/mkldnn_quantizer.h new file mode 100644 index 00000000..6c438265 --- /dev/null +++ b/paddle/fluid/inference/api/mkldnn_quantizer.h @@ -0,0 +1,104 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/api/analysis_predictor.h" +#include "paddle/fluid/inference/api/api_impl.h" +#include "paddle/fluid/inference/api/details/reset_tensor_array.h" +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/string/printf.h" +#ifdef PADDLE_WITH_TESTING +#include +#include +#endif + +namespace paddle { + +/* + * Map variable name to tensor of scaling factors scaling it to MAX=1.0. + * bool denotes whether quantization of the variable should be done to unsigned + * type. + */ +using VarQuantScale = + std::unordered_map>; + +class AnalysisPredictor::MkldnnQuantizer { + public: + explicit MkldnnQuantizer(AnalysisPredictor& predictor, // NOLINT + const MkldnnQuantizerConfig* qconfig) + : predictor_(predictor), qconfig_(qconfig) {} + + // Execute full quantization procedure. + bool Quantize(); + +#if PADDLE_WITH_TESTING + friend class MkldnnQuantizerTest; +#endif + + private: + // Run single warmup iteration + bool RunWarmup() const; + // Gather data from variables and calculate scales for them. + bool CalculateScales(); + // Calculate a scale for tensor based on ScaleAlgo rules. + void CalculateSingleScale(const std::string& op_name, + const std::string& conn_name, + const std::string& var_name, + const framework::LoDTensor& var_tensor, + bool is_unsigned); + void PrepareArgument() const; + void ClearDeviceContext() const; + bool RunQuantizePasses() const; + + std::vector ExpandQuantizedBins(std::vector quantized_bins, + std::vector reference_bins) const; + + // Using the KL-divergence method get the most precise scaling factor. + std::pair GetKLScalingFactor( + const framework::LoDTensor& var_tensor, bool is_unsigned) const; + + std::pair GetMaxChScalingFactor( + const framework::LoDTensor& var_tensor, bool is_unsigned) const; + + std::pair GetMaxScalingFactor( + const framework::LoDTensor& var_tensor, bool is_unsigned) const; + + // Returns histogram and bin width + std::pair, float> Histogram( + const framework::LoDTensor& var_tensor, float min_val, float max_val, + size_t num_bins = 2048) const; + + // Calculate the entropy. + float SafeEntropy(std::vector reference_distr_P, int P_sum, + std::vector candidate_distr_Q, int Q_sum) const; + + private: + AnalysisPredictor& predictor_; + const MkldnnQuantizerConfig* qconfig_; + + // A map: variable name -> scale + VarQuantScale scales_; +}; + +} // namespace paddle diff --git a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc new file mode 100644 index 00000000..c2b2ba0b --- /dev/null +++ b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc @@ -0,0 +1,48 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h" + +namespace paddle { + +MkldnnQuantizerConfig::MkldnnQuantizerConfig() { + // The default configuration of scale computing algorightms + rules_["conv2d"]["Input"] = ScaleAlgo::KL; + rules_["conv2d"]["Filter"] = ScaleAlgo::MAX_CH; + rules_["conv2d"]["Bias"] = ScaleAlgo::NONE; // do not compute scale + rules_["conv2d"]["ResidualData"] = ScaleAlgo::KL; + rules_["conv2d"]["Output"] = ScaleAlgo::KL; + + rules_["pool2d"]["X"] = ScaleAlgo::KL; + rules_["pool2d"]["Out"] = ScaleAlgo::KL; + + rules_["concat"]["X"] = ScaleAlgo::KL; + rules_["concat"]["Out"] = ScaleAlgo::KL; + + rules_["prior_box"]["Input"] = ScaleAlgo::KL; + rules_["prior_box"]["Image"] = ScaleAlgo::NONE; + rules_["prior_box"]["Boxes"] = ScaleAlgo::NONE; + rules_["prior_box"]["Variances"] = ScaleAlgo::NONE; +} + +ScaleAlgo MkldnnQuantizerConfig::scale_algo( + const std::string& op_type_name, const std::string& conn_name) const { + if (rules_.find(op_type_name) != rules_.end()) { + auto op_rule = rules_.at(op_type_name); + if (op_rule.find(conn_name) != op_rule.end()) return op_rule.at(conn_name); + } + return default_scale_algo_; +} + +} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_anakin_config.h b/paddle/fluid/inference/api/paddle_anakin_config.h new file mode 100644 index 00000000..e9af13f5 --- /dev/null +++ b/paddle/fluid/inference/api/paddle_anakin_config.h @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle_api.h" // NOLINT + +namespace paddle { +namespace contrib { +// Configurations for Anakin engine. +struct AnakinConfig : public PaddlePredictor::Config { + enum TargetType { NVGPU = 0, X86, MLU, BM }; + int device_id{0}; + std::string model_file; + std::map> init_inputs_shape; + int init_batch_size{-1}; + bool re_allocable{true}; + int max_stream{4}; + int data_stream_id{0}; + int compute_stream_id{0}; + char* model_buf_p{nullptr}; + size_t model_buf_len{0}; + TargetType target_type; +#ifdef ANAKIN_MLU_PLACE + int model_parallel{8}; + int data_parallel{1}; + bool op_fuse{false}; + bool sparse{false}; +#endif +}; + +} // namespace contrib +} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h new file mode 100644 index 00000000..83143be0 --- /dev/null +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -0,0 +1,336 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +/*! \file */ + +// Here we include some header files with relative paths, for that in deploy, +// the abstract path of this header file will be changed. +#include "paddle_api.h" // NOLINT +#include "paddle_pass_builder.h" // NOLINT +#ifdef PADDLE_WITH_MKLDNN +#include "paddle_mkldnn_quantizer_config.h" // NOLINT +#endif + +namespace paddle { + +class AnalysisPredictor; +struct MkldnnQuantizerConfig; + +// NOTE WIP, not stable yet. +struct AnalysisConfig { + AnalysisConfig() = default; + explicit AnalysisConfig(const AnalysisConfig& other); + explicit AnalysisConfig(const std::string& model_dir); + explicit AnalysisConfig(const std::string& prog_file, + const std::string& params_file); + enum class Precision { + kFloat32 = 0, + kInt8, + }; + + /** Set model with a directory. + */ + void SetModel(const std::string& model_dir) { model_dir_ = model_dir; } + /** Set model with two specific pathes for program and parameters. + */ + void SetModel(const std::string& prog_file_path, + const std::string& params_file_path); + /** Set program file path. + */ + void SetProgFile(const std::string& x) { prog_file_ = x; } + /** Set parameter composed file path. + */ + void SetParamsFile(const std::string& x) { params_file_ = x; } + /** Set opt cache dir. + */ + void SetOptimCacheDir(const std::string& opt_cache_dir) { + opt_cache_dir_ = opt_cache_dir; + } + /** Get the model directory path. + */ + const std::string& model_dir() const { return model_dir_; } + /** Get the program file path. + */ + const std::string& prog_file() const { return prog_file_; } + /** Get the composed parameters file. + */ + const std::string& params_file() const { return params_file_; } + + // GPU related. + + /** + * \brief Turn on GPU. + * @param memory_pool_init_size_mb initial size of the GPU memory pool in MB. + * @param device_id the GPU card to use (default is 0). + */ + void EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id = 0); + /** Turn off the GPU. + */ + void DisableGpu(); + /** A bool state telling whether the GPU is turned on. + */ + bool use_gpu() const { return use_gpu_; } + /** Get the GPU device id. + */ + int gpu_device_id() const { return device_id_; } + /** Get the initial size in MB of the GPU memory pool. + */ + int memory_pool_init_size_mb() const { return memory_pool_init_size_mb_; } + /** Get the proportion of the initial memory pool size compared to the device. + */ + float fraction_of_gpu_memory_for_pool() const; + + /** \brief Control whether to perform IR graph optimization. + * + * If turned off, the AnalysisConfig will act just like a NativeConfig. + */ + void SwitchIrOptim(int x = true) { enable_ir_optim_ = x; } + /** A boolean state tell whether the ir graph optimization is actived. + */ + bool ir_optim() const { return enable_ir_optim_; } + + /** \brief INTERNAL Determine whether to use the feed and fetch operators. + * Just for internal development, not stable yet. + * When ZeroCopyTensor is used, this should turned off. + */ + void SwitchUseFeedFetchOps(int x = true) { use_feed_fetch_ops_ = x; } + /** A boolean state telling whether to use the feed and fetch operators. + */ + bool use_feed_fetch_ops_enabled() const { return use_feed_fetch_ops_; } + + /** \brief Control whether to specify the inputs' names. + * + * The PaddleTensor type has a `name` member, assign it with the corresponding + * variable name. This is used only when the input PaddleTensors passed to the + * `PaddlePredictor.Run(...)` cannot follow the order in the training phase. + */ + void SwitchSpecifyInputNames(bool x = true) { specify_input_name_ = x; } + + /** A boolean state tell whether the input PaddleTensor names specified should + * be used to reorder the inputs in `PaddlePredictor.Run(...)`. + */ + bool specify_input_name() const { return specify_input_name_; } + + /** + * \brief Turn on the TensorRT engine. + * + * The TensorRT engine will accelerate some subgraphes in the original Fluid + * computation graph. In some models such as TensorRT50, GoogleNet and so on, + * it gains significant performance acceleration. + * + * @param workspace_size the memory size(in byte) used for TensorRT workspace. + * @param max_batch_size the maximum batch size of this prediction task, + * better set as small as possible, or performance loss. + * @param min_subgrpah_size the minimum TensorRT subgraph size needed, if a + * subgraph is less than this, it will not transfer to TensorRT engine. + */ + void EnableTensorRtEngine(int workspace_size = 1 << 20, + int max_batch_size = 1, int min_subgraph_size = 3, + Precision precision = Precision::kFloat32, + bool use_static = false, + bool use_calib_mode = true); + /** A boolean state telling whether the TensorRT engine is used. + */ + bool tensorrt_engine_enabled() const { return use_tensorrt_; } + /** + * \brief Turn on the usage of Anakin sub-graph engine. + */ + void EnableAnakinEngine( + int max_batch_size = 1, + std::map> max_input_shape = {}, + int min_subgraph_size = 6, Precision precision = Precision::kFloat32, + bool auto_config_layout = false, + std::vector passes_filter = {}, + std::vector ops_filter = {}); + + /** A boolean state indicating whether the Anakin sub-graph engine is used. + */ + bool anakin_engine_enabled() const { return use_anakin_; } + + /** \brief Control whether to debug IR graph analysis phase. + * + * This will generate DOT files for visualizing the computation graph after + * each analysis pass applied. + */ + void SwitchIrDebug(int x = true); + + /** Turn on NGRAPH. + */ + void EnableNgraph(); + /** A boolean state telling whether to use the NGRAPH. + */ + bool ngraph_enabled() const { return use_ngraph_; } + + /** Turn on MKLDNN. + */ + void EnableMKLDNN(); + /** set the cache capacity of different input shapes for MKLDNN. + * Default 0 means don't cache any shape. + */ + void SetMkldnnCacheCapacity(int capacity); + /** A boolean state telling whether to use the MKLDNN. + */ + bool mkldnn_enabled() const { return use_mkldnn_; } + + /** Set and get the number of cpu math library threads. + */ + void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads); + /** An int state telling how many threads are used in the CPU math library. + */ + int cpu_math_library_num_threads() const { + return cpu_math_library_num_threads_; + } + + /** Transform the AnalysisConfig to NativeConfig. + */ + NativeConfig ToNativeConfig() const; + /** Specify the operator type list to use MKLDNN acceleration. + * @param op_list the operator type list. + */ + void SetMKLDNNOp(std::unordered_set op_list) { + mkldnn_enabled_op_types_ = op_list; + } + + /** Turn on quantization. + */ + void EnableMkldnnQuantizer(); + + /** A boolean state telling whether the quantization is enabled. + */ + bool mkldnn_quantizer_enabled() const { return use_mkldnn_quantizer_; } + + MkldnnQuantizerConfig* mkldnn_quantizer_config() const; + + /** Specify the memory buffer of program and parameter + * @param prog_buffer the memory buffer of program. + * @param prog_buffer_size the size of the data. + * @param params_buffer the memory buffer of the composed parameters file. + * @param params_buffer_size the size of the commposed parameters data. + */ + void SetModelBuffer(const char* prog_buffer, size_t prog_buffer_size, + const char* params_buffer, size_t params_buffer_size); + /** A boolean state telling whether the model is set from the CPU memory. + */ + bool model_from_memory() const { return model_from_memory_; } + + /** Turn on memory optimize + * NOTE still in development, will release latter. + */ + void EnableMemoryOptim(bool static_optim = false, + bool force_update_static_cache = false); + /** Tell whether the memory optimization is activated. */ + bool enable_memory_optim() const; + void SetInValid() const { is_valid_ = false; } + bool is_valid() const { return is_valid_; } + + friend class ::paddle::AnalysisPredictor; + + /** NOTE just for developer, not an official API, easily to be broken. + * Get a pass builder for customize the passes in IR analysis phase. + */ + PassStrategy* pass_builder() const; + void PartiallyRelease(); + + protected: + // Update the config. + void Update(); + + std::string SerializeInfoCache(); + + protected: + // Model pathes. + std::string model_dir_; + mutable std::string prog_file_; + mutable std::string params_file_; + + // GPU related. + bool use_gpu_{false}; + int device_id_{0}; + uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB. + + // TensorRT related. + bool use_tensorrt_{false}; + // For workspace_size, refer it from here: + // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting + int tensorrt_workspace_size_; + // While TensorRT allows an engine optimized for a given max batch size + // to run at any smaller size, the performance for those smaller + // sizes may not be as well-optimized. Therefore, Max batch is best + // equivalent to the runtime batch size. + int tensorrt_max_batchsize_; + // We transform the Ops that can be converted into TRT layer in the model, + // and aggregate these Ops into subgraphs for TRT execution. + // We set this variable to control the minimum number of nodes in the + // subgraph, 3 as default value. + int tensorrt_min_subgraph_size_{3}; + Precision tensorrt_precision_mode_; + bool trt_use_static_engine_; + bool trt_use_calib_mode_; + + // memory reuse related. + bool enable_memory_optim_{false}; + bool static_memory_optim_{false}; + bool static_memory_optim_force_update_{false}; + + bool use_ngraph_{false}; + bool use_mkldnn_{false}; + std::unordered_set mkldnn_enabled_op_types_; + + bool model_from_memory_{false}; + + bool enable_ir_optim_{true}; + bool use_feed_fetch_ops_{true}; + bool ir_debug_{false}; + + bool specify_input_name_{false}; + + int cpu_math_library_num_threads_{1}; + + // A runtime cache, shouldn't be transferred to others. + std::string serialized_info_cache_; + + mutable std::unique_ptr pass_builder_; + + bool use_anakin_{false}; + int anakin_max_batchsize_; + int anakin_min_subgraph_size_{6}; + std::map> anakin_max_input_shape_; + Precision anakin_precision_mode_; + bool anakin_auto_config_layout_{false}; + std::vector anakin_passes_filter_; + std::vector anakin_ops_filter_; + + // mkldnn related. + int mkldnn_cache_capacity_{0}; + bool use_mkldnn_quantizer_{false}; + std::shared_ptr mkldnn_quantizer_config_; + + // If the config is already used on a predictor, it becomes invalid. + // Any config can only be used with one predictor. + // Variables held by config can take up a lot of memory in some cases. + // So we release the memory when the predictor is set up. + mutable bool is_valid_{true}; + std::string opt_cache_dir_; +}; + +} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h new file mode 100644 index 00000000..87f40f09 --- /dev/null +++ b/paddle/fluid/inference/api/paddle_api.h @@ -0,0 +1,357 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +/*! \file paddle_api.h + */ + +/*! \mainpage Paddle Inference APIs + * \section intro_sec Introduction + * The Paddle inference library aims to offer an high performance inference SDK + * for Paddle users. + */ + +#include +#include +#include +#include + +/*! \namespace paddle + */ +namespace paddle { + +/** paddle data type. + */ +enum PaddleDType { + FLOAT32, + INT64, + INT32, + // TODO(Superjomn) support more data types if needed. +}; + +/** + * \brief Memory manager for `PaddleTensor`. + * + * The PaddleBuf holds a buffer for data input or output. The memory can be + * allocated by user or by PaddleBuf itself, but in any case, the PaddleBuf + * should be reused for better performance. + * + * For user allocated memory, the following API can be used: + * - PaddleBuf(void* data, size_t length) to set an external memory by + * specifying the memory address and length. + * - Reset(void* data, size_t length) to reset the PaddleBuf with an external + *memory. + * ATTENTION, for user allocated memory, deallocation should be done by users + *externally after the program finished. The PaddleBuf won't do any allocation + *or deallocation. + * + * To have the PaddleBuf allocate and manage the memory: + * - PaddleBuf(size_t length) will allocate a memory of size `length`. + * - Resize(size_t length) resize the memory to no less than `length`, ATTENTION + * if the allocated memory is larger than `length`, nothing will done. + * + * Usage: + * + * Let PaddleBuf manage the memory internally. + * \code{cpp} + * const int num_elements = 128; + * PaddleBuf buf(num_elements * sizeof(float)); + * \endcode + * + * Or + * \code{cpp} + * PaddleBuf buf; + * buf.Resize(num_elements * sizeof(float)); + * \endcode + * Works the exactly the same. + * + * One can also make the `PaddleBuf` use the external memory. + * \code{cpp} + * PaddleBuf buf; + * void* external_memory = new float[num_elements]; + * buf.Reset(external_memory, num_elements*sizeof(float)); + * ... + * delete[] external_memory; // manage the memory lifetime outside. + * \endcode + */ +class PaddleBuf { + public: + /** PaddleBuf allocate memory internally, and manage it. + */ + explicit PaddleBuf(size_t length) + : data_(new char[length]), length_(length), memory_owned_(true) {} + /** Set external memory, the PaddleBuf won't manage it. + */ + PaddleBuf(void* data, size_t length) + : data_(data), length_(length), memory_owned_{false} {} + /** Copy only available when memory is managed externally. + */ + explicit PaddleBuf(const PaddleBuf&); + + /** Resize the memory. + */ + void Resize(size_t length); + /** Reset to external memory, with address and length set. + */ + void Reset(void* data, size_t length); + /** Tell whether the buffer is empty. + */ + bool empty() const { return length_ == 0; } + /** Get the data's memory address. + */ + void* data() const { return data_; } + /** Get the memory length. + */ + size_t length() const { return length_; } + + ~PaddleBuf() { Free(); } + PaddleBuf& operator=(const PaddleBuf&); + PaddleBuf& operator=(PaddleBuf&&); + PaddleBuf() = default; + PaddleBuf(PaddleBuf&& other); + + private: + void Free(); + void* data_{nullptr}; // pointer to the data memory. + size_t length_{0}; // number of memory bytes. + bool memory_owned_{true}; +}; + +/** Basic input and output data structure for PaddlePredictor. + */ +struct PaddleTensor { + PaddleTensor() = default; + std::string name; // variable name. + std::vector shape; + PaddleBuf data; // blob of data. + PaddleDType dtype; + std::vector> lod; // Tensor+LoD equals LoDTensor +}; + +enum class PaddlePlace { kUNK = -1, kCPU, kGPU }; + +/** Tensor without copy, currently only supports `AnalysisPredictor`. + */ +class ZeroCopyTensor { + public: + void Reshape(const std::vector& shape); + + /** Get the memory in CPU or GPU with specific data type, should Reshape first + * to tell the data size. + * Once can directly call this data to feed the data. + * This is for write the input tensor. + */ + template + T* mutable_data(PaddlePlace place); + /** Get the memory directly, will return the place and element size by + * pointer. + * This is for reading the output tensor. + */ + template + T* data(PaddlePlace* place, int* size) const; + + template + void copy_from_cpu(const T* data); + + template + void copy_to_cpu(T* data); + + std::vector shape() const; + + void SetLoD(const std::vector>& x); + std::vector> lod() const; + const std::string& name() const { return name_; } + void SetPlace(PaddlePlace place, int device = -1) { + place_ = place; + device_ = device; + } + + PaddleDType type() const; + + protected: + explicit ZeroCopyTensor(void* scope) : scope_{scope} {} + void SetName(const std::string& name) { name_ = name; } + void* FindTensor() const; + + private: + std::string name_; + bool input_or_output_; + friend class AnalysisPredictor; + void* scope_{nullptr}; + // The corresponding tensor pointer inside Paddle workspace is cached for + // performance. + mutable void* tensor_{nullptr}; + PaddlePlace place_; + PaddleDType dtype_; + int device_; +}; + +/** A simple Inference API for Paddle. + */ +class PaddlePredictor { + public: + struct Config; + PaddlePredictor() = default; + PaddlePredictor(const PaddlePredictor&) = delete; + PaddlePredictor& operator=(const PaddlePredictor&) = delete; + + /** Predict an record. + * The caller should be responsible for allocating and releasing the memory of + * `inputs`. `inputs` should be available until Run returns. Caller should be + * responsible for the output tensor's buffer, either allocated or passed from + * outside. + */ + virtual bool Run(const std::vector& inputs, + std::vector* output_data, + int batch_size = -1) = 0; + + /** \brief Get input names of the model + */ + virtual std::vector GetInputNames() { return {}; } + + /** \brief Get output names of the model + */ + virtual std::vector GetOutputNames() { return {}; } + + /** \brief Get a mutable tensor directly. + * + * NOTE Only works in AnalysisPredictor. + * + * One can also use this to modify any temporary variable related tensors in + * the predictor. + * + */ + virtual std::unique_ptr GetInputTensor( + const std::string& name) { + return nullptr; + } + /** + * \brief Get an immutable tensor without copy. + * + * NOTE Only works in AnalysisPredictor. + * One can use this API to get any temporary tensors in the predictor and + * read it. + */ + virtual std::unique_ptr GetOutputTensor( + const std::string& name) { + return nullptr; + } + /** + * \brief Run the predictor with zero-copied inputs and outputs. + * + * NOTE Only works in AnalysisPredictor. + * + * This will save the IO copy for transfering inputs and outputs to predictor + * workspace and get some performance improvement. + * To use it, one should call the `AnalysisConfig.SwitchUseFeedFetchOp(true)` + * and then use the `GetInputTensor` and `GetOutputTensor` to directly write + * or read the input/output tensors. + */ + virtual bool ZeroCopyRun() { return false; } + + /** Clone a predictor that share the model weights, the Cloned predictor + * should be thread-safe. + */ + virtual std::unique_ptr Clone() = 0; + + /** Destroy the Predictor. + */ + virtual ~PaddlePredictor() = default; + + /** \brief Get the serialized model program that executes in inference phase. + * Its data type is ProgramDesc, which is a protobuf message. + */ + virtual std::string GetSerializedProgram() const { + assert(false); // Force raise error. + return "NotImplemented"; + } + + /** The common configs for all the predictors. + */ + struct Config { + std::string model_dir; /*!< path to the model directory. */ + }; +}; + +struct NativeConfig : public PaddlePredictor::Config { + // GPU related fields. + bool use_gpu{false}; + int device{0}; + float fraction_of_gpu_memory{ + -1.f}; /*!< Change to a float in (0,1] if needed. */ + + // Specify the exact path of program and parameter files. + std::string prog_file; + std::string param_file; + + /** Specify the variable's name of each input if input tensors don't follow + * the + * `feeds` and `fetches` of the phase `save_inference_model`. + */ + bool specify_input_name{false}; + + /** Set and get the number of cpu math library threads. + */ + void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads) { + cpu_math_library_num_threads_ = cpu_math_library_num_threads; + } + int cpu_math_library_num_threads() const { + return cpu_math_library_num_threads_; + } + + protected: + // number of cpu math library (such as MKL, OpenBlas) threads for each + // instance. + int cpu_math_library_num_threads_{1}; +}; + +/*! \fn std::unique_ptr CreatePaddlePredictor(const ConfigT& + * config); + * + * \brief A factory to help create different predictors. + * + * Usage: + * + * \code{.cpp} + * NativeConfig config; + * ... // change the configs. + * auto native_predictor = CreatePaddlePredictor(config); + * \endcode + * + * FOR EXTENSION DEVELOPER: + * Different predictors are designated by config type. Similar configs can be + * merged, but there shouldn't be a huge config containing different fields for + * more than one kind of predictors. + */ +template +std::unique_ptr CreatePaddlePredictor(const ConfigT& config); + +/** NOTE The following APIs are too trivial, we will discard it in the following + * versions. + */ +enum class PaddleEngineKind { + kNative = 0, /*!< Use the native Fluid facility. */ + kAutoMixedTensorRT, /*!< Automatically mix Fluid with TensorRT. */ + kAnalysis, /*!< More optimization. */ + kAnakin /*!< Use Anakin for inference, not mature yet. */ +}; + +template +std::unique_ptr CreatePaddlePredictor(const ConfigT& config); + +int PaddleDtypeSize(PaddleDType dtype); + +std::string get_version(); + +} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h new file mode 100644 index 00000000..feb5373c --- /dev/null +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file contains the definition of a simple Inference API for Paddle. + * + * ATTENTION: It requires some C++11 features, for lower version C++ or C, we + * might release another API. + */ + +#pragma once + +#include +#include +#include +#include + +#include "paddle_analysis_config.h" // NOLINT +#include "paddle_api.h" // NOLINT +#if (defined PADDLE_WITH_ANAKIN) +#include "paddle_anakin_config.h" // NOLINT +#endif diff --git a/paddle/fluid/inference/api/paddle_inference_pass.h b/paddle/fluid/inference/api/paddle_inference_pass.h new file mode 100644 index 00000000..64628d8e --- /dev/null +++ b/paddle/fluid/inference/api/paddle_inference_pass.h @@ -0,0 +1,33 @@ +// Generated by the paddle/fluid/framework/ir/CMakeLists.txt. DO NOT EDIT! + +#pragma once +#include "paddle/fluid/framework/ir/pass.h" +USE_PASS(graph_to_program_pass); +USE_PASS(graph_viz_pass); +USE_PASS(lock_free_optimize_pass); +USE_PASS(fc_fuse_pass); +USE_PASS(attention_lstm_fuse_pass); +USE_PASS(infer_clean_graph_pass); +USE_PASS(fc_lstm_fuse_pass); +USE_PASS(embedding_fc_lstm_fuse_pass); +USE_PASS(fc_gru_fuse_pass); +USE_PASS(seq_concat_fc_fuse_pass); +USE_PASS(multi_batch_merge_pass); +USE_PASS(conv_bn_fuse_pass); +USE_PASS(seqconv_eltadd_relu_fuse_pass); +USE_PASS(seqpool_concat_fuse_pass); +USE_PASS(repeated_fc_relu_fuse_pass); +USE_PASS(squared_mat_sub_fuse_pass); +USE_PASS(is_test_pass); +USE_PASS(conv_elementwise_add_act_fuse_pass); +USE_PASS(conv_elementwise_add2_act_fuse_pass); +USE_PASS(conv_elementwise_add_fuse_pass); +USE_PASS(conv_affine_channel_fuse_pass); +USE_PASS(transpose_flatten_concat_fuse_pass); +USE_PASS(identity_scale_op_clean_pass); +USE_PASS(sync_batch_norm_pass); +USE_PASS(runtime_context_cache_pass); +USE_PASS(quant_conv2d_dequant_fuse_pass); +USE_PASS(fillconstant_elementwisemul_fuse); +USE_PASS(shuffle_channel_detect_pass); +USE_PASS(delete_quant_dequant_op_pass); diff --git a/paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h b/paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h new file mode 100644 index 00000000..d46f842d --- /dev/null +++ b/paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h @@ -0,0 +1,105 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "paddle_api.h" // NOLINT + +namespace paddle { + +// Algorithms for finding scale of quantized Tensors. +enum class ScaleAlgo { + NONE, // Do not compute scale + MAX, // Find scale based on the maximum absolute value + MAX_CH, // Find scale based on the maximum absolute value per channel + KL, // Find scale based on KL Divergence +}; + +struct MkldnnQuantizerConfig { + MkldnnQuantizerConfig(); + + /** Specify a quantization algorithm for a connection (input/output) of the + * operator type. + * @param op_type_name the operator's name. + * @param conn_name name of the connection (input/output) of the operator. + * @param algo the algorithm for computing scale. + */ + void SetScaleAlgo(std::string op_type_name, std::string conn_name, + ScaleAlgo algo) { + rules_[op_type_name][conn_name] = algo; + } + + /** Get the quantization algorithm for a connection (input/output) of the + * operator type. + * @param op_type_name the operator's name. + * @param conn_name name of the connection (input/output) of the operator. + * @return the algorithm for computing scale. + */ + ScaleAlgo scale_algo(const std::string& op_type_name, + const std::string& conn_name) const; + + /** Set the batch of data to be used for warm-up iteration. + * @param data batch of data. + */ + void SetWarmupData(std::shared_ptr> data) { + warmup_data_ = data; + } + + /** Get the batch of data used for warm-up iteration. + * @return batch of data. + */ + std::shared_ptr> warmup_data() const { + return warmup_data_; + } + + void SetWarmupBatchSize(int batch_size) { warmup_bs_ = batch_size; } + + int warmup_batch_size() const { return warmup_bs_; } + + void SetEnabledOpTypes(std::unordered_set op_list) { + enabled_op_types_ = op_list; + } + + const std::unordered_set& enabled_op_types() const { + return enabled_op_types_; + } + + void SetExcludedOpIds(std::unordered_set op_ids_list) { + excluded_op_ids_ = op_ids_list; + } + + const std::unordered_set& excluded_op_ids() const { + return excluded_op_ids_; + } + + void SetDefaultScaleAlgo(ScaleAlgo algo) { default_scale_algo_ = algo; } + + ScaleAlgo default_scale_algo() const { return default_scale_algo_; } + + protected: + std::map> rules_; + std::unordered_set enabled_op_types_; + std::unordered_set excluded_op_ids_; + std::shared_ptr> warmup_data_; + int warmup_bs_{1}; + ScaleAlgo default_scale_algo_{ScaleAlgo::MAX}; +}; + +} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc new file mode 100644 index 00000000..bc2c0914 --- /dev/null +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -0,0 +1,216 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/api/paddle_pass_builder.h" +#ifdef PADDLE_WITH_CUDA +#include +#endif +#include + +namespace paddle { + +void PaddlePassBuilder::AppendPass(const std::string &pass_type) { + passes_.push_back(pass_type); +} + +void PaddlePassBuilder::TurnOnDebug() { + std::vector passes; + auto it = std::begin(passes_); + while (it != std::end(passes_)) { + if (*it != "graph_viz_pass") { + it = passes_.insert(it + 1, "graph_viz_pass"); + } else { + ++it; + } + } +} + +std::string PaddlePassBuilder::DebugString() { + std::stringstream ss; + ss << "Passes to apply:\n"; + for (auto &pass : passes_) { + ss << " - " << pass << '\n'; + } + return ss.str(); +} + +void PaddlePassBuilder::DeletePass(const std::string &pass_type) { + auto it = std::begin(passes_); + while (it != std::end(passes_)) { + if (*it == pass_type) { + it = passes_.erase(it); + } else { + ++it; + } + } +} + +void PaddlePassBuilder::InsertPass(size_t idx, const std::string &pass_type) { + passes_.insert(std::begin(passes_) + idx, pass_type); +} + +void PaddlePassBuilder::DeletePass(size_t idx) { + passes_.erase(std::begin(passes_) + idx); +} + +void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) { + analysis_passes_.push_back(pass); +} + +void PaddlePassBuilder::ClearPasses() { passes_.clear(); } + +const std::vector kTRTSubgraphPasses({ + "infer_clean_graph_pass", // + "conv_affine_channel_fuse_pass", // + "conv_eltwiseadd_affine_channel_fuse_pass", // + "quant_conv2d_dequant_fuse_pass", // + "delete_quant_dequant_op_pass", // + // "fc_fuse_pass", // + "tensorrt_subgraph_pass", // + "conv_bn_fuse_pass", // +#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be + // guaranteed at least v7 + "conv_elementwise_add_act_fuse_pass", // + "conv_elementwise_add2_act_fuse_pass", // + "conv_elementwise_add_fuse_pass", // +#endif // + "transpose_flatten_concat_fuse_pass", +}); + +// The following passes works for Anakin sub-graph engine. +const std::vector kAnakinSubgraphPasses({ + "infer_clean_graph_pass", // + "quant_conv2d_dequant_fuse_pass", // + "simplify_anakin_priorbox_detection_out_pass", // + "fillconstant_elementwisemul_fuse", // + "fc_fuse_pass", // + "conv_elementwise_add_fuse_pass", // + "fc_gru_fuse_pass", // + "shuffle_channel_detect_pass", // + "anakin_subgraph_pass", // + "fc_gru_fuse_pass", // +}); + +GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { + passes_.assign({ + "infer_clean_graph_pass", // + // "identity_scale_op_clean_pass", // + "conv_affine_channel_fuse_pass", // + "conv_eltwiseadd_affine_channel_fuse_pass", // + "conv_bn_fuse_pass", // + "conv_eltwiseadd_bn_fuse_pass", // +#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be + // guaranteed at least v7 + "conv_elementwise_add_act_fuse_pass", // + "conv_elementwise_add2_act_fuse_pass", // + "conv_elementwise_add_fuse_pass", // +#endif // + "transpose_flatten_concat_fuse_pass", + // following pass should be located in the last, since it will + // work on all fused ops. + "runtime_context_cache_pass" + }); + + use_gpu_ = true; +} + +void GpuPassStrategy::EnableMKLDNN() { + LOG(ERROR) << "GPU not support MKLDNN yet"; +} + +void GpuPassStrategy::EnableMkldnnQuantizer() { + LOG(ERROR) << "GPU not support MKL-DNN quantization"; +} + +void GpuPassStrategy::EnableNgraph() { + LOG(ERROR) << "GPU not support Ngraph yet"; +} + +CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) { + // NOTE the large fusions should be located in the front, so that they will + // not be damaged by smaller ones. + passes_.assign({"infer_clean_graph_pass", // + "attention_lstm_fuse_pass", // + "seqconv_eltadd_relu_fuse_pass", // + // "seqpool_concat_fuse_pass", // + // "embedding_fc_lstm_fuse_pass", // + "fc_lstm_fuse_pass", // + "mul_lstm_fuse_pass", // + "fc_gru_fuse_pass", // + "mul_gru_fuse_pass", // + "seq_concat_fc_fuse_pass", // + "fc_fuse_pass", // + "repeated_fc_relu_fuse_pass", // + "squared_mat_sub_fuse_pass", // + "conv_bn_fuse_pass", // + "conv_eltwiseadd_bn_fuse_pass", // + "is_test_pass", // + // following pass should be located in the last, since + // it will work on all fused ops. + "runtime_context_cache_pass"}); + + use_gpu_ = false; +} + +void CpuPassStrategy::EnableMKLDNN() { +// TODO(Superjomn) Consider the way to mix CPU with GPU. +#ifdef PADDLE_WITH_MKLDNN + if (!use_mkldnn_) { + passes_.insert(passes_.begin(), "mkldnn_placement_pass"); + + for (auto &pass : std::vector({ + "depthwise_conv_mkldnn_pass", // + "conv_bn_fuse_pass", // Execute BN passes again to + "conv_eltwiseadd_bn_fuse_pass", // preserve correct pass order + "conv_bias_mkldnn_fuse_pass", // + "conv_transpose_bias_mkldnn_fuse_pass", + "conv3d_bias_mkldnn_fuse_pass", // + "conv_elementwise_add_mkldnn_fuse_pass", + "conv_concat_relu_mkldnn_fuse_pass", + "conv_relu_mkldnn_fuse_pass", // + "conv_brelu_mkldnn_fuse_pass", // + // Disabled due to topology-dependent speed-up + // "fc_mkldnn_pass" + })) { + passes_.push_back(pass); + } + } + use_mkldnn_ = true; +#else + use_mkldnn_ = false; +#endif +} + +void CpuPassStrategy::EnableMkldnnQuantizer() { +#ifdef PADDLE_WITH_MKLDNN + if (!use_mkldnn_quantizer_) { + passes_.push_back("cpu_quantize_placement_pass"); + } + use_mkldnn_quantizer_ = true; +#else + use_mkldnn_quantizer_ = false; +#endif +} + +void CpuPassStrategy::EnableNgraph() { +#ifdef PADDLE_WITH_NGRAPH + if (!use_ngraph_) { + passes_.insert(passes_.begin(), "ngraph_subgraph_pass"); + } + use_ngraph_ = true; +#else + use_ngraph_ = false; +#endif +} +} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h new file mode 100644 index 00000000..62b7ab30 --- /dev/null +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -0,0 +1,157 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +/*! \file */ + +/*! \namespace paddle */ +namespace paddle { + +/** This is a pass builder based on string. It is part of inference API. + */ +class PaddlePassBuilder { + public: + explicit PaddlePassBuilder(const std::vector &passes) + : passes_(passes) {} + + void SetPasses(std::initializer_list passes) { + passes_ = passes; + } + + /** Append a pass to the end of the passes. */ + void AppendPass(const std::string &pass_type); + + /** Insert a pass to a specific position. + * @param idx the position to insert. + * @param pass_type the pass key. + */ + void InsertPass(size_t idx, const std::string &pass_type); + + /** Delete the `idx`-th pass. */ + void DeletePass(size_t idx); + + /** Delete all the passes that has type `pass_type`. */ + void DeletePass(const std::string &pass_type); + + void ClearPasses(); + /** Append an analysis pass. */ + void AppendAnalysisPass(const std::string &pass); + + /** Visualize the computation graph after each pass by generating a DOT + * language file, one can draw them with the Graphviz toolkit. + */ + void TurnOnDebug(); + + /** Human-readible information. */ + std::string DebugString(); + + const std::vector &AllPasses() const { return passes_; } + std::vector AnalysisPasses() const { + auto passes = analysis_passes_; + // To make sure the ir_graph_to_program should be the last pass so any + // modication of IR will persist to the program. + passes.push_back("ir_graph_to_program_pass"); + return passes; + } + + protected: + std::vector analysis_passes_{ + {"ir_graph_build_pass", "ir_analysis_pass", + "ir_params_sync_among_devices_pass", "adjust_cudnn_workspace_size_pass", + "inference_op_replace_pass"}}; + std::vector passes_; +}; + +/**Pass strategy to help control the IR passes. + */ +class PassStrategy : public PaddlePassBuilder { + public: + explicit PassStrategy(const std::vector &passes) + : PaddlePassBuilder(passes) {} + + /** The MKLDNN control exists in both CPU and GPU mode, because there can be + * still some CPU kernels running in CPU mode. + */ + virtual void EnableMKLDNN() {} + + /** Enable NGRAPH optimization + */ + virtual void EnableNgraph() {} + + /** Enable MKLDNN quantize optimization + */ + virtual void EnableMkldnnQuantizer() {} + + bool use_gpu() const { return use_gpu_; } + + virtual ~PassStrategy() = default; + + protected: + bool use_ngraph_{false}; + bool use_gpu_{false}; + bool use_mkldnn_{false}; +}; + +/** The CPU passes controller, it is used in AnalysisPredictor with CPU mode. + */ +class CpuPassStrategy : public PassStrategy { + public: + CpuPassStrategy(); + + explicit CpuPassStrategy(const CpuPassStrategy &other) + : PassStrategy(other.AllPasses()) { + use_gpu_ = other.use_gpu_; + use_ngraph_ = other.use_ngraph_; + use_mkldnn_ = other.use_mkldnn_; + use_mkldnn_quantizer_ = other.use_mkldnn_quantizer_; + } + + virtual ~CpuPassStrategy() = default; + + void EnableNgraph() override; + void EnableMKLDNN() override; + void EnableMkldnnQuantizer() override; + + protected: + bool use_ngraph_{false}; + bool use_mkldnn_quantizer_{false}; +}; + +/** The GPU passes strategy, it is used in AnalysisPredictor with GPU mode. + */ +class GpuPassStrategy : public PassStrategy { + public: + GpuPassStrategy(); + + explicit GpuPassStrategy(const GpuPassStrategy &other) + : PassStrategy(other.AllPasses()) { + use_gpu_ = true; + } + + void EnableNgraph() override; + void EnableMKLDNN() override; + void EnableMkldnnQuantizer() override; + + virtual ~GpuPassStrategy() = default; +}; + +extern const std::vector kTRTSubgraphPasses; +extern const std::vector kAnakinSubgraphPasses; + +} // namespace paddle diff --git a/paddle/fluid/inference/check_symbol.sh b/paddle/fluid/inference/check_symbol.sh new file mode 100755 index 00000000..b6b7d1f2 --- /dev/null +++ b/paddle/fluid/inference/check_symbol.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +lib=$1 +if [ $# -ne 1 ]; then echo "No input library"; exit -1 ; fi + +num_paddle_syms=$(nm -D ${lib} | grep paddle | wc -l) +num_google_syms=$(nm -D ${lib} | grep google | grep -v paddle | grep "T " | wc -l) + +if [ $num_paddle_syms -le 0 ]; then echo "Have no paddle symbols"; exit -1 ; fi +if [ $num_google_syms -ge 1 ]; then echo "Have some google symbols"; exit -1 ; fi + +exit 0 diff --git a/paddle/fluid/inference/engine.h b/paddle/fluid/inference/engine.h new file mode 100644 index 00000000..1a13ba51 --- /dev/null +++ b/paddle/fluid/inference/engine.h @@ -0,0 +1,63 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/framework.pb.h" + +namespace paddle { +namespace inference { + +struct Buffer; +enum class DeviceType { UNK = -1, CPU, GPU }; + +/* + * EngineBase is the base class of all inference engines. An inference engine + * takes a paddle program as input, and outputs the result in fluid Tensor + * format. It can be used to optimize performance of computation sub-blocks, for + * example, break down the original block into sub-blocks and execute each + * sub-blocks in different engines. + * + * For example: + * When inference, the resnet50 model can put most of the model into subgraph + * and run it on a TensorRT engine. + * + * There are several engines such as TensorRT and other frameworks, so an + * EngineBase is put forward to give an unified interface for all the + * different engine implemention. + */ +class EngineBase { + public: + using DescType = ::paddle::framework::proto::BlockDesc; + + // Build the model and do some preparation, for example, in TensorRT, run + // createInferBuilder, buildCudaEngine. + virtual void Build(const DescType& paddle_model) = 0; + + // Execute the engine, that will run the inference network. + virtual void Execute(int batch_size) = 0; + + virtual ~EngineBase() {} +}; // class EngineBase + +struct Buffer { + void* buffer{nullptr}; // buffer should be allocated only once. + size_t max_size; // buffer allocated space. + size_t size; // data size. + DeviceType device{DeviceType::UNK}; // tells which device this buffer is on. +}; + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc new file mode 100644 index 00000000..8b379457 --- /dev/null +++ b/paddle/fluid/inference/io.cc @@ -0,0 +1,194 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/io.h" + +#include +#include +#include +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/version.h" +#include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/pybind/pybind.h" + +DEFINE_string(devices, "", "The devices to be used which is joined by comma."); +DEFINE_bool(init_p2p, false, "Whether to init p2p."); +DEFINE_int32(math_num_threads, 1, + "Number of threads used to run math functions."); + +namespace paddle { +namespace inference { + +void Init(const std::vector argv) { + framework::InitGflags(argv); + platform::SetNumThreads(FLAGS_math_num_threads); + // init devices + std::vector devices; + std::string token; + std::istringstream tokenStream(FLAGS_devices); + while (std::getline(tokenStream, token, ',')) { + devices.push_back(std::stoi(token)); + } + framework::InitDevices(FLAGS_init_p2p, devices); +} + +void ReadBinaryFile(const std::string& filename, std::string* contents) { + std::ifstream fin(filename, std::ios::in | std::ios::binary); + PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", filename); + fin.seekg(0, std::ios::end); + contents->clear(); + contents->resize(fin.tellg()); + fin.seekg(0, std::ios::beg); + fin.read(&(contents->at(0)), contents->size()); + fin.close(); +} + +bool IsPersistable(const framework::VarDesc* var) { + if (var->Persistable() && + var->GetType() != framework::proto::VarType::FEED_MINIBATCH && + var->GetType() != framework::proto::VarType::FETCH_LIST && + var->GetType() != framework::proto::VarType::RAW) { + return true; + } + return false; +} + +void LoadPersistables(framework::Executor* executor, framework::Scope* scope, + const framework::ProgramDesc& main_program, + const std::string& dirname, + const std::string& param_filename, + bool model_from_memory = false) { + const framework::BlockDesc& global_block = main_program.Block(0); + + framework::ProgramDesc* load_program = new framework::ProgramDesc(); + framework::BlockDesc* load_block = load_program->MutableBlock(0); + std::vector paramlist; + + for (auto* var : global_block.AllVars()) { + if (IsPersistable(var)) { + VLOG(4) << "persistable variable's name: " << var->Name(); + + framework::VarDesc* new_var = load_block->Var(var->Name()); + new_var->SetShape(var->GetShape()); + new_var->SetDataType(var->GetDataType()); + new_var->SetType(var->GetType()); + + if (var->GetType() != + framework::proto::VarType::Type::VarType_Type_SELECTED_ROWS) { + new_var->SetLoDLevel(var->GetLoDLevel()); + } + + new_var->SetPersistable(true); + + if (!param_filename.empty()) { + paramlist.push_back(new_var->Name()); + } else { + // append_op + framework::OpDesc* op = load_block->AppendOp(); + op->SetType("load"); + op->SetOutput("Out", {new_var->Name()}); + op->SetAttr("file_path", {dirname + "/" + new_var->Name()}); + op->CheckAttrs(); + } + } + } + + if (!param_filename.empty()) { + // sort paramlist to have consistent ordering + std::sort(paramlist.begin(), paramlist.end()); + // append just the load_combine op + framework::OpDesc* op = load_block->AppendOp(); + op->SetType("load_combine"); + op->SetOutput("Out", paramlist); + op->SetAttr("file_path", {param_filename}); + op->SetAttr("model_from_memory", {model_from_memory}); + op->CheckAttrs(); + } + + executor->Run(*load_program, scope, 0, true, true); + + delete load_program; +} + +std::unique_ptr Load(framework::Executor* executor, + framework::Scope* scope, + const std::string& dirname) { + std::string model_filename = dirname + "/__model__"; + std::string program_desc_str; + VLOG(3) << "loading model from " << model_filename; + ReadBinaryFile(model_filename, &program_desc_str); + + std::unique_ptr main_program( + new framework::ProgramDesc(program_desc_str)); + PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()), + "model version %ld is not supported.", + main_program->Version()); + + // model_from_memory is false in seperate parameters. + LoadPersistables(executor, scope, *main_program, dirname, "", + false /* model_from_memory */); + return main_program; +} + +std::unique_ptr Load( + framework::Executor* executor, framework::Scope* scope, + const std::string& prog_filename, const std::string& param_filename) { + std::string program_desc_str; + ReadBinaryFile(prog_filename, &program_desc_str); + + std::unique_ptr main_program( + new framework::ProgramDesc(program_desc_str)); + PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()), + "model version %ld is not supported.", + main_program->Version()); + + LoadPersistables(executor, scope, *main_program, "", param_filename, + false /* model_from_memory */); + return main_program; +} + +std::unique_ptr LoadFromMemory( + framework::Executor* executor, framework::Scope* scope, + const std::string& prog_buffer, const std::string& param_buffer) { + std::unique_ptr main_program( + new framework::ProgramDesc(prog_buffer)); + PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()), + "model version %ld is not supported.", + main_program->Version()); + + LoadPersistables(executor, scope, *main_program, "", param_buffer, + true /* model_filename */); + return main_program; +} + +void SaveVars(const framework::Scope& scope, + const std::vector& vars, const std::string& dirname, + bool predicate) { + framework::ProgramDesc prog; + auto* block = prog.MutableBlock(0); + auto* op = block->AppendOp(); + op->SetType("save_combine"); + op->SetInput("X", vars); + op->SetAttr("file_path", dirname + "/param"); + op->CheckAttrs(); + + platform::CPUPlace place; + framework::Executor exe(place); + exe.Run(prog, const_cast(&scope), 0, true, true); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/io.h b/paddle/fluid/inference/io.h new file mode 100644 index 00000000..317ef9d9 --- /dev/null +++ b/paddle/fluid/inference/io.h @@ -0,0 +1,55 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/init.h" + +namespace paddle { +namespace inference { + +void Init(const std::vector argv); + +void LoadPersistables(framework::Executor* executor, framework::Scope* scope, + const framework::ProgramDesc& main_program, + const std::string& dirname, + const std::string& param_filename, + bool model_from_memory); + +std::unique_ptr Load(framework::Executor* executor, + framework::Scope* scope, + const std::string& dirname); + +std::unique_ptr Load(framework::Executor* executor, + framework::Scope* scope, + const std::string& prog_filename, + const std::string& param_filename); + +std::unique_ptr LoadFromMemory( + framework::Executor* executor, framework::Scope* scope, + const std::string& prog_buffer, const std::string& param_buffer); + +// Save the variables from a scope to disk. +void SaveVars(const framework::Scope& scope, + const std::vector& vars, const std::string& dirname, + bool predicate = true); + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/paddle_fluid.map b/paddle/fluid/inference/paddle_fluid.map new file mode 100644 index 00000000..7e5cae04 --- /dev/null +++ b/paddle/fluid/inference/paddle_fluid.map @@ -0,0 +1,7 @@ +{ + global: + *paddle*; + *Pass*; + local: + *; +}; diff --git a/paddle/fluid/inference/paddle_fluid.sym b/paddle/fluid/inference/paddle_fluid.sym new file mode 100644 index 00000000..ef2a04d7 --- /dev/null +++ b/paddle/fluid/inference/paddle_fluid.sym @@ -0,0 +1 @@ +*paddle* diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt new file mode 100644 index 00000000..d82b88a7 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt @@ -0,0 +1,6 @@ +nv_library(tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context boost) +nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto boost) +nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader) +nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine) +add_subdirectory(plugin) +add_subdirectory(convert) diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt new file mode 100644 index 00000000..854007ce --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -0,0 +1,44 @@ +# Add TRT tests +nv_library(tensorrt_converter + SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc + batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc + pad_op.cc split_op.cc prelu_op.cc leaky_relu_op.cc + DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) + +nv_test(test_op_converter SRCS test_op_converter.cc DEPS + ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_converter) + +# TODO(xingzhaolong): fix the the following ci ut error. + +#nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor) +#nv_test(test_trt_mul_op SRCS test_mul_op.cc mul_op.cc +# DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine mul_op) +#nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc +# DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine mul_op) +#nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc +# DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine activation_op) +#nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc +# DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine conv_op conv_transpose_op) +#nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc +# DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pool_op tensorrt_plugin) +#nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc +# DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin +# elementwise_add_op elementwise_mul_op) +#nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc +# DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine softmax_op) +#nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc +# DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine batch_norm_op) +#nv_test(test_trt_concat_op SRCS test_concat_op.cc concat_op.cc +# DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine concat_op) +#nv_test(test_trt_dropout_op SRCS test_dropout_op.cc dropout_op.cc +# DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine dropout_op) +#nv_test(test_trt_pad_op SRCS test_pad_op.cc pad_op.cc +# DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pad_op) +#nv_test(test_trt_split_op SRCS test_split_op.cc split_op.cc +# DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin +# split_op concat_op) +#nv_test(test_trt_prelu_op SRCS test_prelu_op.cc prelu_op.cc +# DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin +# prelu_op) +#nv_test(test_trt_leaky_relu_op SRCS test_leaky_relu_op.cc leaky_relu_op.cc +# DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine activation_op) diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc new file mode 100644 index 00000000..5c2454fa --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc @@ -0,0 +1,89 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class ActivationOpConverter : public OpConverter { + public: + ActivationOpConverter() {} + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + // Here the two nullptr looks strange, that's because the + // framework::OpDesc's constructor is strange. + framework::OpDesc op_desc(op, nullptr); + VLOG(3) + << "convert a fluid Activation op to tensorrt activation layer whose " + "type is " + << op_type_; + const nvinfer1::ITensor* input_tensor = + engine_->GetITensor(op_desc.Input("X")[0]); + + auto op_pair = ops.find(op_type_); + if (op_pair == ops.end()) { + PADDLE_THROW("Wrong activation op type!"); + } + + nvinfer1::IActivationLayer* layer = TRT_ENGINE_ADD_LAYER( + engine_, Activation, *const_cast(input_tensor), + op_pair->second); + auto output_name = op_desc.Output("Out")[0]; + + RreplenishLayerAndOutput(layer, op_type_, {output_name}, test_mode); + if (op_desc.HasAttr("out_scale")) { +#if IS_TRT_VERSION_GE(5000) + float out_scale = boost::get(op_desc.GetAttr("out_scale")); + engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale); +#endif + } + } + + protected: + std::string op_type_; + static const std::unordered_map ops; +}; + +const std::unordered_map + ActivationOpConverter::ops = { + {"relu", nvinfer1::ActivationType::kRELU}, + {"sigmoid", nvinfer1::ActivationType::kSIGMOID}, + {"tanh", nvinfer1::ActivationType::kTANH}, +}; + +class ReluOpConverter : public ActivationOpConverter { + public: + ReluOpConverter() { op_type_ = "relu"; } +}; + +class SigmoidOpConverter : public ActivationOpConverter { + public: + SigmoidOpConverter() { op_type_ = "sigmoid"; } +}; + +class TanhOpConverter : public ActivationOpConverter { + public: + TanhOpConverter() { op_type_ = "tanh"; } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(relu, ReluOpConverter); +REGISTER_TRT_OP_CONVERTER(sigmoid, SigmoidOpConverter); +REGISTER_TRT_OP_CONVERTER(tanh, TanhOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc new file mode 100644 index 00000000..d9488684 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc @@ -0,0 +1,132 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class BatchNormOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid batch norm op to tensorrt batch_norm"; + + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Input("Bias").size(), 1); // Bias is a weight + PADDLE_ENFORCE_EQ(op_desc.Input("Mean").size(), 1); // Mean is a weight + PADDLE_ENFORCE_EQ(op_desc.Input("Scale").size(), 1); // Scale is a weight + PADDLE_ENFORCE_EQ(op_desc.Input("Variance").size(), + 1); // Variance is a weight + PADDLE_ENFORCE_EQ(op_desc.Output("Y").size(), 1); + + auto* X = engine_->GetITensor(op_desc.Input("X").front()); + // Declare weights + auto* Bias_v = scope.FindVar(op_desc.Input("Bias").front()); + auto* Mean_v = scope.FindVar(op_desc.Input("Mean").front()); + auto* Scale_v = scope.FindVar(op_desc.Input("Scale").front()); + auto* Variance_v = scope.FindVar(op_desc.Input("Variance").front()); + const float eps = boost::get(op_desc.GetAttr("epsilon")); + + PADDLE_ENFORCE_NOT_NULL(Bias_v); + PADDLE_ENFORCE_NOT_NULL(Mean_v); + PADDLE_ENFORCE_NOT_NULL(Scale_v); + PADDLE_ENFORCE_NOT_NULL(Variance_v); + + // get tensor + auto* Bias_t = Bias_v->GetMutable(); + auto* Mean_t = Mean_v->GetMutable(); + auto* Scale_t = Scale_v->GetMutable(); + auto* Variance_t = Variance_v->GetMutable(); + + // create temp tensor for weights + framework::LoDTensor bias_tensor; + framework::LoDTensor mean_tensor; + framework::LoDTensor scale_tensor; + framework::LoDTensor variance_tensor; + + bias_tensor.Resize(Bias_t->dims()); + mean_tensor.Resize(Mean_t->dims()); + scale_tensor.Resize(Scale_t->dims()); + variance_tensor.Resize(Variance_t->dims()); + + platform::CPUPlace cpu_place; + // copy data from gpu to cpu + TensorCopySync((*Bias_t), cpu_place, &bias_tensor); + TensorCopySync((*Mean_t), cpu_place, &mean_tensor); + TensorCopySync((*Scale_t), cpu_place, &scale_tensor); + TensorCopySync((*Variance_t), cpu_place, &variance_tensor); + + auto* bias_data = bias_tensor.mutable_data(platform::CPUPlace()); + auto* mean_data = mean_tensor.mutable_data(platform::CPUPlace()); + auto* scale_data = scale_tensor.mutable_data(platform::CPUPlace()); + auto* variance_data = + variance_tensor.mutable_data(platform::CPUPlace()); + + std::unique_ptr combile_scale_tensor( + new framework::LoDTensor()); + std::unique_ptr combile_bias_tensor( + new framework::LoDTensor()); + + combile_scale_tensor->Resize(scale_tensor.dims()); + combile_bias_tensor->Resize(bias_tensor.dims()); + + auto* combile_scale_data = + combile_scale_tensor->mutable_data(platform::CPUPlace()); + auto* combile_bias_data = + combile_bias_tensor->mutable_data(platform::CPUPlace()); + + size_t ele_num = combile_scale_tensor->memory_size() / sizeof(float); + + for (size_t i = 0; i < ele_num; i++) { + float scale = scale_data[i]; + float bias = bias_data[i]; + float mean = mean_data[i]; + float variance = variance_data[i]; + combile_scale_data[i] = scale / sqrtf(variance + eps); + combile_bias_data[i] = bias - mean * combile_scale_data[i]; + } + + TensorRTEngine::Weight scale_weights{ + nvinfer1::DataType::kFLOAT, static_cast(combile_scale_data), + combile_scale_tensor->memory_size() / sizeof(float)}; + TensorRTEngine::Weight shift_weights{ + nvinfer1::DataType::kFLOAT, static_cast(combile_bias_data), + combile_bias_tensor->memory_size() / sizeof(float)}; + TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, + 0}; + + nvinfer1::IScaleLayer* layer = + TRT_ENGINE_ADD_LAYER(engine_, Scale, *const_cast(X), + nvinfer1::ScaleMode::kCHANNEL, shift_weights.get(), + scale_weights.get(), power_weights.get()); + + auto output_name = op_desc.Output("Y").front(); + engine_->weight_map[op_desc.Input("Bias").front()] = + std::move(combile_bias_tensor); + engine_->weight_map[op_desc.Input("Scale").front()] = + std::move(combile_scale_tensor); + + RreplenishLayerAndOutput(layer, "pool2d", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(batch_norm, BatchNormOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/concat_op.cc b/paddle/fluid/inference/tensorrt/convert/concat_op.cc new file mode 100644 index 00000000..ec771850 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/concat_op.cc @@ -0,0 +1,53 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * ConcatOp + */ +class ConcatOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid mul op to tensorrt mul layer without bias"; + + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + std::vector itensors; + for (auto& input_name : op_desc.Input("X")) { + itensors.push_back(engine_->GetITensor(input_name)); + } + int axis = boost::get(op_desc.GetAttr("axis")); + PADDLE_ENFORCE(axis > 0, + "The axis attr of Concat op should be large than 0 for trt"); + + auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Concatenation, itensors.data(), + itensors.size()); + axis = axis - 1; // Remove batch dim + layer->setAxis(axis); + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "concat", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(concat, ConcatOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc new file mode 100644 index 00000000..73bfa800 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -0,0 +1,156 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +template +void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode, + RegistFunc fadd_layer, SetDilationFunc fset_dilation, + const std::string& name) { + VLOG(3) << "convert a fluid " << name << " op to tensorrt layer without bias"; + + framework::OpDesc op_desc(op, nullptr); + PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1); // Y is a weight + PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1); + + PADDLE_ENFORCE(engine != nullptr); + auto* X = engine->GetITensor(op_desc.Input("Input").front()); + auto* Y_v = scope.FindVar(op_desc.Input("Filter").front()); + PADDLE_ENFORCE_NOT_NULL(Y_v); + auto* Y_t = Y_v->GetMutable(); + float* weight_data = nullptr; + bool enable_int8 = boost::get(op_desc.HasAttr("enable_int8")); + + if (enable_int8) { +#if IS_TRT_VERSION_GE(5000) + float in_scale = boost::get(op_desc.GetAttr("input_scale")); + auto weight_scale = + boost::get>(op_desc.GetAttr("weight_scale")); + weight_data = engine->GetWeightCPUData(op_desc.Input("Filter").front(), Y_t, + true, weight_scale); + engine->SetTensorDynamicRange(X, in_scale); +#endif + } else { + weight_data = + engine->GetWeightCPUData(op_desc.Input("Filter").front(), Y_t, false); + } + + PADDLE_ENFORCE_EQ(Y_t->dims().size(), 4UL); + const int n_output = Y_t->dims()[0]; + const int n_input = Y_t->dims()[1]; + const int filter_h = Y_t->dims()[2]; + const int filter_w = Y_t->dims()[3]; + const int groups = boost::get(op_desc.GetAttr("groups")); + const std::vector dilations = + boost::get>(op_desc.GetAttr("dilations")); + const std::vector strides = + boost::get>(op_desc.GetAttr("strides")); + const std::vector paddings = + boost::get>(op_desc.GetAttr("paddings")); + + nvinfer1::DimsHW nv_ksize(filter_h, filter_w); + nvinfer1::DimsHW nv_dilations(dilations[0], dilations[1]); + nvinfer1::DimsHW nv_strides(strides[0], strides[1]); + nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]); + + TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, + static_cast(weight_data), + static_cast(Y_t->numel())}; + + TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; + auto* layer = fadd_layer(const_cast(X), n_output, n_input, + nv_ksize, weight, bias); + PADDLE_ENFORCE(layer != nullptr); + layer->setStride(nv_strides); + layer->setPadding(nv_paddings); + layer->setNbGroups(groups); + // set dilations + fset_dilation(layer, nv_dilations); + + auto output_name = op_desc.Output("Output").front(); + layer->setName((name + " (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); + engine->SetITensor(output_name, layer->getOutput(0)); + +#if IS_TRT_VERSION_GE(5000) + if (enable_int8) { + float output_scale = boost::get(op_desc.GetAttr("out_scale")); + engine->SetTensorDynamicRange(layer->getOutput(0), output_scale); + } +#endif + + if (test_mode) { + engine->DeclareOutput(output_name); + } +} + +class Conv2dOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + ConvertConv2d( + engine_, op, scope, test_mode, + [&](nvinfer1::ITensor* inputs, int n_output, /* Conv output maps */ + int n_input, /* Conv input maps */ + nvinfer1::DimsHW& ksize, TensorRTEngine::Weight& weight, + TensorRTEngine::Weight& bias) -> nvinfer1::IConvolutionLayer* { + auto* layer = + TRT_ENGINE_ADD_LAYER(engine_, Convolution, *inputs, n_output, + ksize, weight.get(), bias.get()); + return layer; + }, + [](nvinfer1::IConvolutionLayer* layer, nvinfer1::DimsHW& dilations) { + layer->setDilation(dilations); + }, + "conv2d"); + } +}; + +class Deconv2dOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + ConvertConv2d( + engine_, op, scope, test_mode, + [&](nvinfer1::ITensor* inputs, int n_output, /* Deconv input maps */ + int n_input, /* Deconv output maps */ + nvinfer1::DimsHW& ksize, TensorRTEngine::Weight& weight, + TensorRTEngine::Weight& bias) -> nvinfer1::IDeconvolutionLayer* { + auto* layer = + TRT_ENGINE_ADD_LAYER(engine_, Deconvolution, *inputs, n_input, + ksize, weight.get(), bias.get()); + return layer; + }, + [](nvinfer1::IDeconvolutionLayer* layer, nvinfer1::DimsHW& dilations) { + PADDLE_ENFORCE( + dilations.d[0] == 1 && dilations.d[1] == 1, + "Dilations must be (1, 1) for tensorRT, but given (%d, %d)", + dilations.d[0], dilations.d[1]); + }, + "conv2d_transpose"); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(conv2d, Conv2dOpConverter); +REGISTER_TRT_OP_CONVERTER(conv2d_transpose, Deconv2dOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc new file mode 100644 index 00000000..71177e5e --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc @@ -0,0 +1,68 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * DropoutOp. This Layer doesn't has weights. + */ +class DropoutOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid dropout op to tensorrt dropout layer"; + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); + float dropout_prob = boost::get(op_desc.GetAttr("dropout_prob")); + + platform::CPUPlace cpu_place; + std::unique_ptr weight_tensor( + new framework::LoDTensor()); + weight_tensor->Resize(framework::make_ddim({1})); + auto* weight_data = + weight_tensor->mutable_data(platform::CPUPlace()); + weight_data[0] = 1 - dropout_prob; + + TensorRTEngine::Weight scale_weights{ + nvinfer1::DataType::kFLOAT, static_cast(weight_data), + weight_tensor->memory_size() / sizeof(float)}; + TensorRTEngine::Weight shift_weights{nvinfer1::DataType::kFLOAT, nullptr, + 0}; + TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, + 0}; + + auto* layer = TRT_ENGINE_ADD_LAYER( + engine_, Scale, *const_cast(input1), + nvinfer1::ScaleMode::kUNIFORM, shift_weights.get(), scale_weights.get(), + power_weights.get()); + + engine_->weight_map[op_desc.Output("Out").front() + "_dropout"] = + std::move(weight_tensor); + auto output_name = op_desc.Output("Out")[0]; + + RreplenishLayerAndOutput(layer, "dropout", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(dropout); +REGISTER_TRT_OP_CONVERTER(dropout, DropoutOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc new file mode 100644 index 00000000..a888b080 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -0,0 +1,264 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +static bool CheckDims(const nvinfer1::Dims& dims_x, + const nvinfer1::Dims& dims_y) { + if (dims_x.nbDims != dims_y.nbDims) { + return false; + } + for (int i = 0; i < dims_x.nbDims; i++) { + if (dims_x.d[i] != dims_y.d[i]) { + return false; + } + } + return true; +} + +class ElementwiseWeightOpConverter : public OpConverter { + public: + ElementwiseWeightOpConverter() {} + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + // Here the two nullptr looks strange, that's because the + // framework::OpDesc's constructor is strange. + nvinfer1::ILayer* layer = nullptr; + framework::OpDesc op_desc(op, nullptr); + VLOG(3) << "Convert a fluid elementwise op to TensorRT IScaleLayer"; + + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto* X = engine_->GetITensor(op_desc.Input("X").front()); + nvinfer1::Dims dims_x = X->getDimensions(); + PADDLE_ENFORCE(dims_x.nbDims >= 3, "x dims experts 3, but %d is given.", + dims_x.nbDims); + + auto* Y_v = scope.FindVar(op_desc.Input("Y").front()); + PADDLE_ENFORCE_NOT_NULL(Y_v); + auto* Y_t = Y_v->GetMutable(); + float* weight_data = nullptr; + weight_data = + engine_->GetWeightCPUData(op_desc.Input("Y").front(), Y_t, false); + + auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE; + + std::vector dims_y = framework::vectorize2int(Y_t->dims()); + if (static_cast(dims_y.size()) == dims_x.nbDims + 1) { + if (dims_y[0] == 1) dims_y.erase(dims_y.begin()); + } + + if (static_cast(dims_y.size()) == 1 && dims_y[0] == dims_x.d[0]) { + scale_mode = nvinfer1::ScaleMode::kCHANNEL; + } else if (static_cast(dims_y.size()) == dims_x.nbDims && + dims_y[0] == dims_x.d[0]) { + scale_mode = nvinfer1::ScaleMode::kELEMENTWISE; + for (int i = 1; i < dims_x.nbDims; i++) { + if (dims_y[i] != dims_x.d[i]) { + scale_mode = nvinfer1::ScaleMode::kCHANNEL; + break; + } + } + if (scale_mode == nvinfer1::ScaleMode::kCHANNEL) { + for (int i = 1; i < dims_x.nbDims; i++) { + if (dims_y[i] != 1) + PADDLE_THROW( + "TensorRT unsupported weight shape for Elementwise op!"); + } + } + } else { + PADDLE_THROW("TensorRT unsupported weight Shape for Elementwise op!"); + } + + TensorRTEngine::Weight shift_weights{nvinfer1::DataType::kFLOAT, + static_cast(weight_data), + static_cast(Y_t->numel())}; + TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, nullptr, + 0}; + TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, + 0}; + if (op_type_ == "add") { + nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER( + engine_, Scale, *X, scale_mode, shift_weights.get(), + scale_weights.get(), power_weights.get()); + layer = scale_layer; + } else if (op_type_ == "mul") { + nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER( + engine_, Scale, *X, scale_mode, scale_weights.get(), + shift_weights.get(), power_weights.get()); + layer = scale_layer; + } + + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "elementwise_" + op_type_, {output_name}, + test_mode); + if (op_desc.HasAttr("out_scale")) { +#if IS_TRT_VERSION_GE(5000) + float out_scale = boost::get(op_desc.GetAttr("out_scale")); + engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale); +#endif + } + } + + protected: + std::string op_type_; +}; + +class ElementwiseTensorOpConverter : public OpConverter { + public: + ElementwiseTensorOpConverter() {} + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + auto op_pair = ops.find(op_type_); + PADDLE_ENFORCE(op_pair != ops.end(), "Wrong elementwise op type!"); + + // Here the two nullptr looks strange, that's because the + // framework::OpDesc's constructor is strange. + framework::OpDesc op_desc(op, nullptr); + nvinfer1::ILayer* layer = nullptr; + + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); // Y is a weight + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + + auto* X = engine_->GetITensor(op_desc.Input("X").front()); + auto* Y = engine_->GetITensor(op_desc.Input("Y").front()); + nvinfer1::Dims dims_x = X->getDimensions(); + nvinfer1::Dims dims_y = Y->getDimensions(); + + int axis = boost::get(op_desc.GetAttr("axis")); + auto output_name = op_desc.Output("Out")[0]; + if (CheckDims(dims_x, dims_y)) { + // The two input tensor should have the same dims + VLOG(3) << "Convert a fluid elementwise op to TensorRT IElementWiseLayer"; + nvinfer1::IElementWiseLayer* elet_layer = TRT_ENGINE_ADD_LAYER( + engine_, ElementWise, *const_cast(X), + *const_cast(Y), op_pair->second); + + layer = elet_layer; + } else { + VLOG(3) << "Convert a fluid elementwise op to TensorRT " + "ElementWisePluginLayer"; + + plugin::ElementWisePlugin* plugin = + new plugin::ElementWisePlugin(op_type_, dims_x, dims_y, axis); + plugin->AddInput(X); + plugin->AddInput(Y); + nvinfer1::IPluginLayer* plugin_layer = engine_->AddPlugin( + const_cast(plugin->GetInputs().data()), 2, + reinterpret_cast(plugin)); + + layer = plugin_layer; + } + RreplenishLayerAndOutput(layer, "elementwise", {output_name}, test_mode); + if (op_desc.HasAttr("out_scale")) { +#if IS_TRT_VERSION_GE(5000) + float out_scale = boost::get(op_desc.GetAttr("out_scale")); + engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale); +#endif + } + } + + protected: + static const std::unordered_map + ops; + std::string op_type_; +}; + +const std::unordered_map + ElementwiseTensorOpConverter::ops = { + {"add", nvinfer1::ElementWiseOperation::kSUM}, + {"mul", nvinfer1::ElementWiseOperation::kPROD}, + {"sub", nvinfer1::ElementWiseOperation::kSUB}, + {"div", nvinfer1::ElementWiseOperation::kDIV}, + {"min", nvinfer1::ElementWiseOperation::kMIN}, + {"pow", nvinfer1::ElementWiseOperation::kPOW}, + {"max", nvinfer1::ElementWiseOperation::kMAX}, +}; + +class ElementwiseWeightAddOpConverter : public ElementwiseWeightOpConverter { + public: + ElementwiseWeightAddOpConverter() { op_type_ = "add"; } +}; + +class ElementwiseWeightMulOpConverter : public ElementwiseWeightOpConverter { + public: + ElementwiseWeightMulOpConverter() { op_type_ = "mul"; } +}; + +class ElementwiseTensorAddOpConverter : public ElementwiseTensorOpConverter { + public: + ElementwiseTensorAddOpConverter() { op_type_ = "add"; } +}; + +class ElementwiseTensorMulOpConverter : public ElementwiseTensorOpConverter { + public: + ElementwiseTensorMulOpConverter() { op_type_ = "mul"; } +}; + +class ElementwiseTensorSubOpConverter : public ElementwiseTensorOpConverter { + public: + ElementwiseTensorSubOpConverter() { op_type_ = "sub"; } +}; + +class ElementwiseTensorDivOpConverter : public ElementwiseTensorOpConverter { + public: + ElementwiseTensorDivOpConverter() { op_type_ = "div"; } +}; + +class ElementwiseTensorMinOpConverter : public ElementwiseTensorOpConverter { + public: + ElementwiseTensorMinOpConverter() { op_type_ = "min"; } +}; + +class ElementwiseTensorMaxOpConverter : public ElementwiseTensorOpConverter { + public: + ElementwiseTensorMaxOpConverter() { op_type_ = "max"; } +}; + +class ElementwiseTensorPowOpConverter : public ElementwiseTensorOpConverter { + public: + ElementwiseTensorPowOpConverter() { op_type_ = "pow"; } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(elementwise_add_weight, + ElementwiseWeightAddOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_mul_weight, + ElementwiseWeightMulOpConverter); + +REGISTER_TRT_OP_CONVERTER(elementwise_add_tensor, + ElementwiseTensorAddOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_sub_tensor, + ElementwiseTensorSubOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_div_tensor, + ElementwiseTensorDivOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_mul_tensor, + ElementwiseTensorMulOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_max_tensor, + ElementwiseTensorMaxOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_min_tensor, + ElementwiseTensorMinOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_pow_tensor, + ElementwiseTensorPowOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc new file mode 100644 index 00000000..fb7b89b1 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -0,0 +1,151 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +// Reorder the elements from istrides to ostrides, borrowed from TRT convert in +// tensorflow. +// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/tensorrt/convert/convert_nodes.cc#L318 +template +void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides, + T* odata, nvinfer1::DimsHW ostrides) { + for (int h = 0; h < shape.h(); ++h) { + for (int w = 0; w < shape.w(); ++w) { + odata[h * ostrides.h() + w * ostrides.w()] = + idata[h * istrides.h() + w * istrides.w()]; + } + } +} +// indata c * k +// Reorder the data layout from CK to KC. +void ReorderCKtoKC(TensorRTEngine::Weight& iweights, // NOLINT + TensorRTEngine::Weight* oweights) { + int c = iweights.dims[0]; + int k = iweights.dims[1]; + oweights->dims.assign({k, c}); + nvinfer1::DimsHW istrides = {1, k}; + nvinfer1::DimsHW ostrides = {c, 1}; + Reorder2({k, c}, static_cast(iweights.get().values), istrides, + static_cast(const_cast(oweights->get().values)), + ostrides); +} + +/* + * FC converter convert a MUL op in Fluid to a FC layer in TRT. + */ +class FcOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid fc op to tensorrt fc layer without bias"; + framework::OpDesc op_desc(op, nullptr); + + auto input_names = op_desc.InputNames(); + bool with_bias = input_names.size() >= 3; + std::string w_name = "Y"; + std::string i_name = "X"; + if (with_bias) { + w_name = "W"; + i_name = "Input"; + } + + // Declare inputs + auto* X = engine_->GetITensor(op_desc.Input(i_name).front()); + + // Declare weights + auto* Y_v = scope.FindVar(op_desc.Input(w_name).front()); + PADDLE_ENFORCE_NOT_NULL(Y_v); + auto* Y_t = Y_v->GetMutable(); + // This may trigger a GPU->CPU copy, because TRT's weight can only be + // assigned from CPU memory, that can't be avoided. + float* weight_data = nullptr; + bool enable_int8 = boost::get(op_desc.HasAttr("enable_int8")); + if (enable_int8) { +#if IS_TRT_VERSION_GE(5000) + float in_scale = boost::get(op_desc.GetAttr("input_scale")); + auto weight_scale = + boost::get>(op_desc.GetAttr("weight_scale")); + weight_data = engine_->GetWeightCPUData(op_desc.Input(w_name).front(), + Y_t, true, weight_scale); + engine_->SetTensorDynamicRange(X, in_scale); +#endif + } else { + weight_data = + engine_->GetWeightCPUData(op_desc.Input(w_name).front(), Y_t, false); + } + + PADDLE_ENFORCE_EQ(Y_t->dims().size(), 2UL); // a matrix + size_t n_output = Y_t->dims()[1]; + + std::unique_ptr tmp(new framework::LoDTensor()); + tmp->Resize(Y_t->dims()); + + memcpy(tmp->mutable_data(platform::CPUPlace()), weight_data, + Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float)); + TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, + static_cast(weight_data), + static_cast(Y_t->numel())}; + TensorRTEngine::Weight tmp_weight(nvinfer1::DataType::kFLOAT, + static_cast(tmp->data()), + static_cast(Y_t->numel())); + weight.dims.assign({Y_t->dims()[0], Y_t->dims()[1]}); + tmp_weight.dims = weight.dims; + + // The data layout of TRT FC layer's weight is different from fluid's FC, + // need to reorder the elements. + ReorderCKtoKC(weight, &tmp_weight); + + // Currently, the framework can only handle one fluid op -> one TRT layer, + // but fc fuses `mul` and `bias` (2 fluid ops), so here is a trick, just + // handle `mul`, leave `add` as another layer. + // DEBUG + float* bias_data = nullptr; + int bias_num = 0; + if (with_bias) { + auto* b_v = scope.FindVar(op_desc.Input("Bias").front()); + auto* b_t = b_v->GetMutable(); + bias_data = + engine_->GetWeightCPUData(op_desc.Input("Bias").front(), b_t, false); + bias_num = b_t->numel(); + } + TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, + static_cast(bias_data), + static_cast(bias_num)}; + + auto* layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, + *const_cast(X), + n_output, tmp_weight.get(), bias.get()); + + engine_->weight_map[op_desc.Input(w_name).front()] = std::move(tmp); + auto output_name = op_desc.Output("Out").front(); + + RreplenishLayerAndOutput(layer, "fc", {output_name}, test_mode); + if (enable_int8) { +#if IS_TRT_VERSION_GE(5000) + float out_scale = boost::get(op_desc.GetAttr("out_scale")); + engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale); +#endif + } + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(fc, FcOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/io_converter.cc b/paddle/fluid/inference/tensorrt/convert/io_converter.cc new file mode 100644 index 00000000..854f434d --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/io_converter.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/io_converter.h" +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +using platform::is_gpu_place; +using platform::is_cpu_place; + +class DefaultIOConverter : public EngineIOConverter { + public: + DefaultIOConverter() {} + // NOTE out is GPU memory. + virtual void operator()(const LoDTensor& in, void* out, + size_t max_size) override { + PADDLE_ENFORCE(out != nullptr); + PADDLE_ENFORCE(stream_ != nullptr); + const auto& place = in.place(); + size_t size = in.memory_size(); + PADDLE_ENFORCE_LE(size, max_size); + if (is_cpu_place(place)) { + PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out, in.data(), size, + cudaMemcpyHostToDevice, *stream_)); + } else if (is_gpu_place(place)) { + PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out, in.data(), size, + cudaMemcpyDeviceToDevice, *stream_)); + } else { + PADDLE_THROW("Unknown device for converter"); + } + cudaStreamSynchronize(*stream_); + } + // NOTE in is GPU memory. + virtual void operator()(const void* in, LoDTensor* out, + size_t max_size) override { + PADDLE_ENFORCE(in != nullptr); + PADDLE_ENFORCE(stream_ != nullptr); + const auto& place = out->place(); + size_t size = out->memory_size(); + PADDLE_ENFORCE_LE(size, max_size); + if (is_cpu_place(place)) { + PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out->data(), in, size, + cudaMemcpyDeviceToHost, *stream_)); + } else if (is_gpu_place(place)) { + PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(out->data(), in, size, + cudaMemcpyDeviceToDevice, *stream_)); + } else { + PADDLE_THROW("Unknown device for converter"); + } + cudaStreamSynchronize(*stream_); + } +}; + +// fluid LodTensor <-> tensorrt ITensor +REGISTER_TENSORRT_IO_CONVERTER(default, DefaultIOConverter); + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/convert/io_converter.h b/paddle/fluid/inference/tensorrt/convert/io_converter.h new file mode 100644 index 00000000..5daa242f --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/io_converter.h @@ -0,0 +1,82 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/inference/utils/singleton.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +using framework::LoDTensor; + +/* + * Convert Input from Fluid to TensorRT Engine. + * Convert Output from TensorRT Engine to Fluid. + * + * Note that TensorRT's ITensor follows row major, NCHW. Fluid is also row + * major, + * so in the default case just need to copy the data. + */ +class EngineIOConverter { + public: + EngineIOConverter() {} + + virtual void operator()(const LoDTensor& in, void* out, size_t max_size) {} + virtual void operator()(const void* in, LoDTensor* out, size_t max_size) {} + + void SetStream(cudaStream_t* stream) { stream_ = stream; } + + static void ConvertInput(const std::string& op_type, const LoDTensor& in, + void* out, size_t max_size, cudaStream_t* stream) { + PADDLE_ENFORCE(stream != nullptr); + auto* converter = Registry::Global().Lookup( + op_type, "default" /* default_type */); + PADDLE_ENFORCE_NOT_NULL(converter); + converter->SetStream(stream); + (*converter)(in, out, max_size); + } + + static void ConvertOutput(const std::string& op_type, const void* in, + LoDTensor* out, size_t max_size, + cudaStream_t* stream) { + PADDLE_ENFORCE(stream != nullptr); + auto* converter = Registry::Global().Lookup( + op_type, "default" /* default_type */); + PADDLE_ENFORCE_NOT_NULL(converter); + converter->SetStream(stream); + (*converter)(in, out, max_size); + } + + virtual ~EngineIOConverter() {} + + protected: + cudaStream_t* stream_{nullptr}; +}; + +#define REGISTER_TENSORRT_IO_CONVERTER(op_type__, Converter__) \ + struct trt_io_##op_type__##_converter { \ + trt_io_##op_type__##_converter() { \ + Registry::Global().Register(#op_type__); \ + } \ + }; \ + trt_io_##op_type__##_converter trt_io_##op_type__##_converter__; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc new file mode 100644 index 00000000..7753fda0 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc @@ -0,0 +1,89 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +// LeakyRelu converter from fluid to tensorRT +class LeakyReluOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(4) << "convert fluid leaky_relu op to tensorrt layer"; + + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + int input_num = op_desc.Input("X").size(); + PADDLE_ENFORCE(input_num == 1); + auto* input = engine_->GetITensor(op_desc.Input("X")[0]); + // Get output + size_t output_num = op_desc.Output("Out").size(); + PADDLE_ENFORCE(output_num == 1); + // Get attrs + float alpha = boost::get(op_desc.GetAttr("alpha")); + + platform::CPUPlace place; + std::unique_ptr alpha_tensor( + new framework::LoDTensor()); + alpha_tensor->Resize(framework::make_ddim({2})); + float* alpha_data = alpha_tensor->mutable_data(place); + alpha_data[0] = alpha; + alpha_data[1] = 1.f - alpha; + // the leaky relu formula y = (x > 0) ? x : alpha * x is equal to + // y = alpha * x + (x > 0) ? (1 - alpha) * x : 0 + TensorRTEngine::Weight scale{nvinfer1::DataType::kFLOAT, &alpha_data[0], 1}; + TensorRTEngine::Weight shift{nvinfer1::DataType::kFLOAT, nullptr, 0}; + TensorRTEngine::Weight power{nvinfer1::DataType::kFLOAT, nullptr, 0}; + // y_scale = alpha * x + auto* scale_layer = TRT_ENGINE_ADD_LAYER( + engine_, Scale, *input, nvinfer1::ScaleMode::kUNIFORM, shift.get(), + scale.get(), power.get()); + PADDLE_ENFORCE(nullptr != scale_layer); + // y_relu = (x > 0) : x : 0 + auto* relu_layer = TRT_ENGINE_ADD_LAYER(engine_, Activation, *input, + nvinfer1::ActivationType::kRELU); + PADDLE_ENFORCE(nullptr != relu_layer); + // + TensorRTEngine::Weight sub_scale{nvinfer1::DataType::kFLOAT, &alpha_data[1], + 1}; + auto* scale_relu_layer = + TRT_ENGINE_ADD_LAYER(engine_, Scale, *(relu_layer->getOutput(0)), + nvinfer1::ScaleMode::kUNIFORM, shift.get(), + sub_scale.get(), power.get()); + PADDLE_ENFORCE(nullptr != scale_relu_layer); + auto* output_layer = + TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *(scale_layer->getOutput(0)), + *(scale_relu_layer->getOutput(0)), + nvinfer1::ElementWiseOperation::kSUM); + PADDLE_ENFORCE(nullptr != output_layer); + // keep alpha tensor to avoid release it's memory + std::string alpha_name = op_desc.Output("Out")[0] + "_alpha"; + PADDLE_ENFORCE(engine_->weight_map.find(alpha_name) == + engine_->weight_map.end()); + engine_->weight_map[alpha_name] = std::move(alpha_tensor); + + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(output_layer, "leaky_relu", {output_name}, + test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(leaky_relu, LeakyReluOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/mul_op.cc b/paddle/fluid/inference/tensorrt/convert/mul_op.cc new file mode 100644 index 00000000..5b6aaad4 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/mul_op.cc @@ -0,0 +1,52 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * MulOp, IMatrixMultiplyLayer in TRT. This Layer doesn't has weights. + */ +class MulOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid mul op to tensorrt mul layer without bias"; + + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); + auto* input2 = engine_->GetITensor(op_desc.Input("Y")[0]); + // Both the input1 and input2 do not need transpose. + auto* layer = TRT_ENGINE_ADD_LAYER( + engine_, MatrixMultiply, *const_cast(input1), false, + *const_cast(input2), false); + + auto output_name = op_desc.Output("Out")[0]; + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { // the test framework can not determine which is the + // output, so place the declaration inside. + engine_->DeclareOutput(output_name); + } + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(mul, MulOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h new file mode 100644 index 00000000..f89b0d7e --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -0,0 +1,231 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/utils/singleton.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +using FluidDT = framework::proto::VarType_Type; +using TRT_DT = nvinfer1::DataType; + +namespace { // NOLINT + +TRT_DT FluidDataType2TRT(FluidDT type) { + switch (type) { + case FluidDT::VarType_Type_FP32: + return TRT_DT::kFLOAT; + case FluidDT::VarType_Type_INT32: + return TRT_DT::kINT32; + default: + return TRT_DT::kINT32; + } + PADDLE_THROW("unkown type"); + return TRT_DT::kINT32; +} + +nvinfer1::Dims Vec2TRT_Dims(const std::vector& shape) { + PADDLE_ENFORCE_GT(shape.size(), 1UL, + "TensorRT' tensor input requires at least 2 dimensions"); + PADDLE_ENFORCE_LE(shape.size(), 4UL, + "TensorRT' tensor input requires at most 4 dimensions"); + PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL); + if (shape.size() == 4UL) + return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]); + return nvinfer1::DimsCHW(shape[1], 1, 1); +} + +} // namespace // NOLINT + +/* + * Convert Op from Fluid to TensorRT Engine. + */ +class OpConverter { + public: + OpConverter() {} + + // Converter logic for an op. + virtual void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, + bool test_mode = false) {} + + // Convert a single fluid operator and add the corresponding layer to TRT. + // test_mode: whether the instance executes in an unit test. + void ConvertOp(const framework::proto::OpDesc& op, + const std::unordered_set& parameters, + const framework::Scope& scope, TensorRTEngine* engine, + bool test_mode = false) { + framework::OpDesc op_desc(op, nullptr); + + OpConverter* it{nullptr}; + + if (op_desc.Type() == "mul") { + PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL); + std::string Y = op_desc.Input("Y")[0]; + if (parameters.count(Y)) { + it = Registry::Global().Lookup("fc"); + } + } + if (op_desc.Type().find("elementwise") != std::string::npos) { + static std::unordered_set add_tensor_op_set{ + "add", "mul", "sub", "div", "max", "min", "pow"}; + // TODO(xingzhaolong): all mul, sub, div + // static std::unordered_set add_weight_op_set {"add", "mul", + // "sub", "div"}; + static std::unordered_set add_weight_op_set{"add", "mul"}; + PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL); + int op_type_len = op_desc.Type().size(); + std::string op_type = op_desc.Type().substr(op_type_len - 3, op_type_len); + std::string Y = op_desc.Input("Y")[0]; + if (parameters.count(Y)) { + PADDLE_ENFORCE(add_weight_op_set.count(op_type) > 0, + "Unsupported elementwise type" + op_type); + it = Registry::Global().Lookup("elementwise_" + op_type + + "_weight"); + PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", + op_desc.Type()); + } else { + PADDLE_ENFORCE(add_tensor_op_set.count(op_type) > 0, + "Unsupported elementwise type" + op_type); + it = Registry::Global().Lookup("elementwise_" + op_type + + "_tensor"); + } + PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", + op_desc.Type()); + } + + if (op_desc.Type() == "depthwise_conv2d") { + it = Registry::Global().Lookup("conv2d"); + PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", + op_desc.Type()); + } + + if (!it) { + it = Registry::Global().Lookup(op_desc.Type()); + } + PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", + op_desc.Type()); + it->SetEngine(engine); + (*it)(op, scope, test_mode); + } + + // Convert a fluid block to tensorrt network, NOTE it just convert operators, + // the INetwork's inputs and outputs should specified in some other modules. + void ConvertBlock(const framework::proto::BlockDesc& block, + const std::unordered_set& parameters, + const framework::Scope& scope, TensorRTEngine* engine) { + std::unique_lock lk(mut_); + for (int i = 0; i < block.ops_size(); i++) { + const auto& op = block.ops(i); + ConvertOp(op, parameters, scope, engine); + } + } + + // The scope here should be inited with the parameter vars. + void ConvertBlockToTRTEngine( + framework::BlockDesc* block_desc, const framework::Scope& scope, + const std::vector& inputs, + const std::unordered_set& parameters, + const std::vector& outputs, TensorRTEngine* engine) { + engine->InitNetwork(); + for (auto& input : inputs) { + if (parameters.count(input)) continue; + auto* var = block_desc->FindVar(input); + PADDLE_ENFORCE(var, "no variable called %s", input); + PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR, + "TensorRT engine only takes LoDTensor as input"); + auto var_shape = var->GetShape(); + + engine->DeclareInput( + input, FluidDataType2TRT( + var->Proto()->type().lod_tensor().tensor().data_type()), + Vec2TRT_Dims(var_shape)); + } + framework::proto::BlockDesc* block_proto = block_desc->Proto(); + ConvertBlock(*block_proto, parameters, scope, engine); + for (auto& output : outputs) { + engine->DeclareOutput(output); + } + engine->FreezeNetwork(); + engine->ClearWeights(); + } + + void RreplenishLayerAndOutput( + nvinfer1::ILayer* layer, const std::string& layer_type, + const std::vector& output_tensor_names, + bool test_mode = false) { + size_t num_out = output_tensor_names.size(); + for (size_t i = 0; i < num_out; i++) { + layer->getOutput(i)->setName(output_tensor_names[i].c_str()); + engine_->SetITensor(output_tensor_names[i], layer->getOutput(i)); + if (test_mode) { + engine_->DeclareOutput(output_tensor_names[i]); + } + } + layer->setName( + (layer_type + " (Output: " + output_tensor_names[0] + ")").c_str()); + } + void SetEngine(TensorRTEngine* engine) { engine_ = engine; } + + virtual ~OpConverter() {} + + // TensorRT engine + TensorRTEngine* engine_{nullptr}; + + protected: + bool test_mode_; + + private: + // registered op converter map, whose key is the fluid op type, and value is + // the pointer position of corresponding OpConverter class. + std::unordered_map converters_; + // fluid inference scope + framework::Scope* scope_{nullptr}; + std::mutex mut_; +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +#define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__) \ + struct trt_##op_type__##_converter : public ::paddle::framework::Registrar { \ + trt_##op_type__##_converter() { \ + ::paddle::inference::Registry< \ + paddle::inference::tensorrt::OpConverter>::Global() \ + .Register<::paddle::inference::tensorrt::Converter__>(#op_type__); \ + } \ + }; \ + trt_##op_type__##_converter trt_##op_type__##_converter__; \ + int TouchConverterRegister_##op_type__() { \ + trt_##op_type__##_converter__.Touch(); \ + return 0; \ + } + +#define USE_TRT_CONVERTER(op_type__) \ + extern int TouchConverterRegister_##op_type__(); \ + static int use_op_converter_trt_##op_type__ __attribute__((unused)) = \ + TouchConverterRegister_##op_type__(); diff --git a/paddle/fluid/inference/tensorrt/convert/pad_op.cc b/paddle/fluid/inference/tensorrt/convert/pad_op.cc new file mode 100644 index 00000000..bcd21667 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/pad_op.cc @@ -0,0 +1,62 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * PadOp. + */ +class PadOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid transpose op to tensorrt tranpose layer"; + + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + auto* input = engine_->GetITensor(op_desc.Input("X")[0]); + + const std::vector paddings = + boost::get>(op_desc.GetAttr("paddings")); + const float pad_value = boost::get(op_desc.GetAttr("pad_value")); + + nvinfer1::Dims input_shape = input->getDimensions(); + int nbDims = input_shape.nbDims; + int pad_size = static_cast(paddings.size()); + PADDLE_ENFORCE_GE(nbDims, 2); + PADDLE_ENFORCE_EQ((nbDims + 1) * 2, pad_size); + PADDLE_ENFORCE(pad_value == 0.0, "The pad layer of TRT only support zero."); + + nvinfer1::DimsHW pre_pad(paddings[pad_size - 4], paddings[pad_size - 2]); + nvinfer1::DimsHW post_pad(paddings[pad_size - 3], paddings[pad_size - 1]); + + auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Padding, + *const_cast(input), + pre_pad, post_pad); + + PADDLE_ENFORCE(layer != nullptr); + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "pad", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(pad, PadOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc new file mode 100644 index 00000000..1752c52c --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -0,0 +1,167 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +void DealCeilMode(const nvinfer1::Dims &input_shape, std::vector ksize, + std::vector strides, std::vector paddings, + nvinfer1::DimsHW *pre_pad, nvinfer1::DimsHW *post_pad, + int input_dims) { + int input_height = input_shape.d[input_dims - 2]; + int input_width = input_shape.d[input_dims - 1]; + int floor_h_output_size = + (input_height - ksize[0] + 2 * paddings[0]) / strides[0] + 1; + int ceil_h_output_size = + (input_height - ksize[0] + 2 * paddings[0] + strides[0] - 1) / + strides[0] + + 1; + + int floor_w_output_size = + (input_width - ksize[1] + 2 * paddings[1]) / strides[1] + 1; + int ceil_w_output_size = + (input_width - ksize[1] + 2 * paddings[1] + strides[1] - 1) / strides[1] + + 1; + if (floor_h_output_size != ceil_h_output_size) { + post_pad->h() = strides[0] - 1; + } + + if (floor_w_output_size != ceil_w_output_size) { + post_pad->w() = strides[1] - 1; + } +} + +/* + * Pool2dOp, IPoolingLayer in TRT. This Layer doesn't has weights. + */ +class Pool2dOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, bool test_mode) override { + VLOG(4) + << "convert a fluid pool2d op to tensorrt pool2d layer without bias"; + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); + PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1); + auto *input1 = engine_->GetITensor(op_desc.Input("X")[0]); + nvinfer1::Dims input_shape = input1->getDimensions(); + int input_dims = input_shape.nbDims; + + PADDLE_ENFORCE_EQ(input_dims, 3UL); + + bool global_pooling = boost::get(op_desc.GetAttr("global_pooling")); + std::string pool_type = + boost::get(op_desc.GetAttr("pooling_type")); + std::vector ksize = + boost::get>(op_desc.GetAttr("ksize")); + std::vector strides = + boost::get>(op_desc.GetAttr("strides")); + std::vector paddings = + boost::get>(op_desc.GetAttr("paddings")); + bool ceil_mode = boost::get(op_desc.GetAttr("ceil_mode")); + + nvinfer1::PoolingType nv_pool_type = nvinfer1::PoolingType::kMAX; + if (pool_type == "max") { + nv_pool_type = nvinfer1::PoolingType::kMAX; + } else if (pool_type == "avg") { + nv_pool_type = nvinfer1::PoolingType::kAVERAGE; + } else { + PADDLE_THROW("TensorRT unsupported pooling type!"); + } + + nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]); + nvinfer1::DimsHW nv_strides(strides[0], strides[1]); + nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]); + + nvinfer1::ILayer *layer = nullptr; + + if (global_pooling == true) { + nv_ksize.d[0] = input_shape.d[input_dims - 2]; + nv_ksize.d[1] = input_shape.d[input_dims - 1]; + auto *layer = TRT_ENGINE_ADD_LAYER( + engine_, Pooling, *const_cast(input1), + nv_pool_type, nv_ksize); + PADDLE_ENFORCE_NOT_NULL(layer, "pool layer could not be created."); + auto output_name = op_desc.Output("Out")[0]; + layer->setName(("pool2d (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { + engine_->DeclareOutput(output_name); + } + return; + } + + if (pool_type == "max") { + // Under ceil mode, the pre_pad and post_pad are used to + // record the the padding size. In some ceil mode cases, + // we do not need padding, so we initialize the two vars to 0. + + nvinfer1::DimsHW pre_pad(0, 0); + nvinfer1::DimsHW post_pad(0, 0); + if (ceil_mode) { + // If ceil mode is true, we will pad the appropriate size to the input. + DealCeilMode(input_shape, ksize, strides, paddings, &pre_pad, &post_pad, + input_dims); + auto *pad_layer = TRT_ENGINE_ADD_LAYER( + engine_, Padding, *const_cast(input1), pre_pad, + post_pad); + PADDLE_ENFORCE_NOT_NULL( + pad_layer, "pad layer in poolOp converter could not be created."); + input1 = pad_layer->getOutput(0); + } + auto *pool_layer = TRT_ENGINE_ADD_LAYER( + engine_, Pooling, *const_cast(input1), + nv_pool_type, nv_ksize); + PADDLE_ENFORCE_NOT_NULL(pool_layer, "pool layer could not be created."); + pool_layer->setStride(nv_strides); + pool_layer->setPadding(nv_paddings); + layer = pool_layer; + } else { + // Average pooling needs to exclude the padding pixels from the average + // mean. + // It is not supported well by TRT, we use a plugin here. + std::vector input_shape_v; + for (int i = 0; i < input_dims; i++) { + input_shape_v.push_back(input_shape.d[i]); + } + plugin::AvgPoolPlugin *plugin = new plugin::AvgPoolPlugin( + ceil_mode, ksize, strides, paddings, input_shape_v); + auto *avg_pool_layer = engine_->AddPlugin(&input1, 1, plugin); + layer = avg_pool_layer; + } + + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "pool2d", {output_name}, test_mode); + + if (op_desc.HasAttr("out_scale")) { +#if IS_TRT_VERSION_GE(5000) + float out_scale = boost::get(op_desc.GetAttr("out_scale")); + engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale); +#endif + } + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(pool2d); +REGISTER_TRT_OP_CONVERTER(pool2d, Pool2dOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc new file mode 100644 index 00000000..01bcd03e --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc @@ -0,0 +1,70 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * PRelu converter from fluid to tensorRT. + */ +class PReluOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(4) << "convert fluid prelu op to tensorrt prelu layer"; + + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + int input_num = op_desc.Input("X").size(); + PADDLE_ENFORCE(input_num == 1); + auto* input = engine_->GetITensor(op_desc.Input("X")[0]); + // Get output + size_t output_num = op_desc.Output("Out").size(); + PADDLE_ENFORCE(output_num == 1); + // Get attrs + std::string mode = boost::get(op_desc.GetAttr("mode")); + // + auto* alpha_var = scope.FindVar(op_desc.Input("Alpha")[0]); + PADDLE_ENFORCE_NOT_NULL(alpha_var); + auto* alpha_tensor = alpha_var->GetMutable(); + + platform::CPUPlace cpu_place; + std::unique_ptr alpha_tensor_temp( + new framework::LoDTensor()); + alpha_tensor_temp->Resize(alpha_tensor->dims()); + TensorCopySync(*alpha_tensor, cpu_place, alpha_tensor_temp.get()); + float* alpha_data = alpha_tensor_temp->mutable_data(cpu_place); + + plugin::PReluPlugin* plugin = + new plugin::PReluPlugin(alpha_data, alpha_tensor_temp->numel(), mode); + nvinfer1::IPluginLayer* layer = + engine_->AddPlugin(&input, input_num, plugin); + // keep alpha tensor to avoid release it's memory + engine_->weight_map[op_desc.Input("Alpha")[0]] = + std::move(alpha_tensor_temp); + + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "prelu", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(prelu, PReluOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc new file mode 100644 index 00000000..b0ae1694 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc @@ -0,0 +1,53 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * SoftMaxOp, ISoftMaxLayer in TRT. This Layer doesn't has weights. + */ +class SoftMaxOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) + << "convert a fluid softmax op to tensorrt softmax layer without bias"; + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); + auto* layer = TRT_ENGINE_ADD_LAYER(engine_, SoftMax, + *const_cast(input1)); + + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "softmax", {output_name}, test_mode); + + if (op_desc.HasAttr("out_scale")) { +#if IS_TRT_VERSION_GE(5000) + float out_scale = boost::get(op_desc.GetAttr("out_scale")); + engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale); +#endif + } + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(softmax); +REGISTER_TRT_OP_CONVERTER(softmax, SoftMaxOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc new file mode 100644 index 00000000..ae5b1b98 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc @@ -0,0 +1,67 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class SplitOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(4) << "convert a fluid split op to tensorrt split layer"; + + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + auto* input = engine_->GetITensor(op_desc.Input("X")[0]); + auto input_dims = input->getDimensions(); + int input_num = op_desc.Input("X").size(); + size_t output_num = op_desc.Output("Out").size(); + + // Get Attrs + PADDLE_ENFORCE(input_num == 1); + int axis = boost::get(op_desc.GetAttr("axis")); + std::vector output_lengths = + boost::get>(op_desc.GetAttr("sections")); + // split on batch is not supported in TensorRT + PADDLE_ENFORCE(axis != 0); + axis += (axis < 0) ? input_dims.nbDims : -1; + + PADDLE_ENFORCE(output_lengths.size() == output_num); + plugin::SplitPlugin* plugin = new plugin::SplitPlugin(axis, output_lengths); + nvinfer1::IPluginLayer* layer = + engine_->AddPlugin(&input, input_num, plugin); + + std::string layer_name = "split (Output: "; + for (size_t i = 0; i < output_num; i++) { + auto output_name = op_desc.Output("Out")[i]; + layer->getOutput(i)->setName(output_name.c_str()); + engine_->SetITensor(output_name, layer->getOutput(i)); + layer_name += output_name; + if (test_mode) { + engine_->DeclareOutput(output_name); + } + } + layer->setName((layer_name + ")").c_str()); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(split, SplitOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc new file mode 100644 index 00000000..dd3dfb0b --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc @@ -0,0 +1,55 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +void test_activation(std::string act_type) { + framework::Scope scope; + std::unordered_set parameters; + TRTConvertValidation validator(10, parameters, scope, 1000); + validator.DeclInputVar("act-X", nvinfer1::Dims2(10, 6)); + validator.DeclOutputVar("act-Out", nvinfer1::Dims2(10, 6)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType(act_type); + desc.SetInput("X", {"act-X"}); + desc.SetOutput("Out", {"act-Out"}); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + + validator.Execute(5); +} + +TEST(ReluOpConverter, main) { test_activation("relu"); } + +TEST(SigmoidOpConverter, main) { test_activation("sigmoid"); } + +TEST(TanhOpConverter, main) { test_activation("tanh"); } + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(relu); +USE_OP(sigmoid); +USE_OP(tanh); diff --git a/paddle/fluid/inference/tensorrt/convert/test_batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/test_batch_norm_op.cc new file mode 100644 index 00000000..41412cb0 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_batch_norm_op.cc @@ -0,0 +1,71 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(batch_norm_op, test) { + std::unordered_set parameters( + {"batch_norm_scale", "batch_norm_bias", "batch_norm_mean", + "batch_norm_variance"}); + framework::Scope scope; + TRTConvertValidation validator(5, parameters, scope, 1 << 15); + std::vector param_shape{2}; + + validator.DeclInputVar("batch_norm_X", nvinfer1::DimsCHW(2, 5, 5)); + validator.DeclParamVar("batch_norm_scale", param_shape); + validator.DeclParamVar("batch_norm_bias", param_shape); + validator.DeclParamVar("batch_norm_mean", param_shape); + validator.DeclParamVar("batch_norm_variance", param_shape); + validator.DeclOutputVar("batch_norm_Y", nvinfer1::DimsCHW(2, 5, 5)); + validator.DeclOutputVar("batch_norm_save_mean", param_shape); + validator.DeclOutputVar("batch_norm_save_variance", param_shape); + + // Prepare Op description + framework::OpDesc desc; + + desc.SetType("batch_norm"); + desc.SetInput("X", {"batch_norm_X"}); + desc.SetInput("Scale", {"batch_norm_scale"}); + desc.SetInput("Bias", {"batch_norm_bias"}); + desc.SetInput("Mean", {"batch_norm_mean"}); + desc.SetInput("Variance", {"batch_norm_variance"}); + desc.SetOutput("Y", {"batch_norm_Y"}); + desc.SetOutput("MeanOut", {"batch_norm_mean"}); + desc.SetOutput("VarianceOut", {"batch_norm_variance"}); + desc.SetOutput("SavedMean", {"batch_norm_save_mean"}); + desc.SetOutput("SavedVariance", {"batch_norm_save_variance"}); + + float eps = 1e-5f; + bool is_test = true; + desc.SetAttr("epsilon", eps); + desc.SetAttr("is_test", is_test); + + validator.SetOp(*desc.Proto()); + + std::unordered_set neglected_output = { + "batch_norm_save_mean", "batch_norm_save_variance", "batch_norm_mean", + "batch_norm_variance"}; + validator.Execute(3, neglected_output); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle +USE_OP(batch_norm); diff --git a/paddle/fluid/inference/tensorrt/convert/test_concat_op.cc b/paddle/fluid/inference/tensorrt/convert/test_concat_op.cc new file mode 100644 index 00000000..4f284a4d --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_concat_op.cc @@ -0,0 +1,49 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(concat_op, test) { + std::unordered_set parameters({""}); + framework::Scope scope; + TRTConvertValidation validator(10, parameters, scope, 1000); + validator.DeclInputVar("concat_x1", nvinfer1::DimsCHW(10, 3, 1)); + validator.DeclInputVar("concat_x2", nvinfer1::DimsCHW(3, 3, 1)); + validator.DeclInputVar("concat_x3", nvinfer1::DimsCHW(7, 3, 1)); + validator.DeclOutputVar("concat_out", nvinfer1::DimsCHW(20, 3, 1)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("concat"); + desc.SetInput("X", {"concat_x1", "concat_x2", "concat_x3"}); + desc.SetOutput("Out", {"concat_out"}); + + int axis = 1; + desc.SetAttr("axis", axis); + + validator.SetOp(*desc.Proto()); + + validator.Execute(5); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle +USE_OP(concat); diff --git a/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc new file mode 100644 index 00000000..95916746 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_conv2d_op.cc @@ -0,0 +1,90 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +USE_OP(conv2d); +USE_OP(conv2d_transpose); + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(conv2d_op, test) { + std::unordered_set parameters({"conv2d-Y"}); + framework::Scope scope; + TRTConvertValidation validator(5, parameters, scope, 1 << 15); + + validator.DeclInputVar("conv2d-X", nvinfer1::Dims3(2, 5, 5)); + validator.DeclParamVar("conv2d-Y", nvinfer1::Dims4(3, 2, 3, 3)); + validator.DeclOutputVar("conv2d-Out", nvinfer1::Dims3(3, 5, 5)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("conv2d"); + desc.SetInput("Input", {"conv2d-X"}); + desc.SetInput("Filter", {"conv2d-Y"}); + desc.SetOutput("Output", {"conv2d-Out"}); + + const std::vector strides({1, 1}); + const std::vector paddings({1, 1}); + const std::vector dilations({1, 1}); + const int groups = 1; + + desc.SetAttr("strides", strides); + desc.SetAttr("paddings", paddings); + desc.SetAttr("dilations", dilations); + desc.SetAttr("groups", groups); + + validator.SetOp(*desc.Proto()); + + validator.Execute(3); +} + +TEST(conv2d_transpose_op, test) { + std::unordered_set parameters({"deconv2d-Y"}); + framework::Scope scope; + TRTConvertValidation validator(5, parameters, scope, 1 << 15); + + validator.DeclInputVar("deconv2d-X", nvinfer1::Dims3(3, 5, 5)); + validator.DeclParamVar("deconv2d-Y", nvinfer1::Dims4(3, 2, 3, 3)); + validator.DeclOutputVar("deconv2d-Out", nvinfer1::Dims3(2, 5, 5)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("conv2d_transpose"); + desc.SetInput("Input", {"deconv2d-X"}); + desc.SetInput("Filter", {"deconv2d-Y"}); + desc.SetOutput("Output", {"deconv2d-Out"}); + + const std::vector strides({1, 1}); + const std::vector paddings({1, 1}); + const std::vector dilations({1, 1}); + const int groups = 1; + + desc.SetAttr("strides", strides); + desc.SetAttr("paddings", paddings); + desc.SetAttr("dilations", dilations); + desc.SetAttr("groups", groups); + + validator.SetOp(*desc.Proto()); + + validator.Execute(3); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc new file mode 100644 index 00000000..6b8e621b --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc @@ -0,0 +1,58 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(DropoutOpConverter, main) { + framework::Scope scope; + std::unordered_set parameters; + TRTConvertValidation validator(8, parameters, scope, 1000); + + std::vector tensor_shape{8, 10}; + validator.DeclInputVar("dropout-X", tensor_shape, + nvinfer1::DimsCHW(10, 1, 1)); + validator.DeclOutputVar("dropout-Out", nvinfer1::DimsCHW(10, 1, 1)); + validator.DeclOutputVar("mask-Out", nvinfer1::DimsCHW(10, 1, 1)); + + // Prepare Op description + framework::OpDesc desc; + int is_test = 1; + float dropout_prob = 0.4; + + desc.SetType("dropout"); + desc.SetInput("X", {"dropout-X"}); + desc.SetOutput("Mask", {"mask-Out"}); + desc.SetOutput("Out", {"dropout-Out"}); + desc.SetAttr("is_test", is_test); + desc.SetAttr("dropout_prob", dropout_prob); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + + std::unordered_set neglected_output = {"mask-Out"}; + + validator.Execute(8, neglected_output); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(dropout); diff --git a/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc new file mode 100644 index 00000000..cc967464 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_elementwise_op.cc @@ -0,0 +1,107 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(elementwise_op, add_weight) { + std::unordered_set parameters({"elementwise_add-Y"}); + framework::Scope scope; + TRTConvertValidation validator(10, parameters, scope, 1 << 15); + validator.DeclInputVar("elementwise_add-X", nvinfer1::DimsCHW(10, 3, 3)); + validator.DeclParamVar("elementwise_add-Y", nvinfer1::Dims3(10, 1, 1)); + validator.DeclOutputVar("elementwise_add-Out", nvinfer1::DimsCHW(10, 3, 3)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("elementwise_add"); + desc.SetInput("X", {"elementwise_add-X"}); + desc.SetInput("Y", {"elementwise_add-Y"}); + desc.SetOutput("Out", {"elementwise_add-Out"}); + + int axis = 1; + desc.SetAttr("axis", axis); + + validator.SetOp(*desc.Proto()); + + validator.Execute(8); +} + +TEST(elementwise_op, native) { + for (std::string type : {"add", "mul"}) { + int batch_size = 8; + std::unordered_set parameters; + framework::Scope scope; + TRTConvertValidation validator(batch_size, parameters, scope, 1 << 15); + validator.DeclInputVar("elementwise_" + type + "-X", + nvinfer1::DimsCHW(10, 3, 3)); + validator.DeclInputVar("elementwise_" + type + "-Y", + nvinfer1::Dims3(10, 3, 3)); + validator.DeclOutputVar("elementwise_" + type + "-Out", + nvinfer1::DimsCHW(10, 3, 3)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("elementwise_" + type); + desc.SetInput("X", {"elementwise_" + type + "-X"}); + desc.SetInput("Y", {"elementwise_" + type + "-Y"}); + desc.SetOutput("Out", {"elementwise_" + type + "-Out"}); + + int axis = -1; + desc.SetAttr("axis", axis); + + validator.SetOp(*desc.Proto()); + validator.Execute(batch_size); + } +} + +TEST(elementwise_op, plugin) { + for (std::string type : {"add", "mul"}) { + int batch_size = 8; + std::unordered_set parameters; + framework::Scope scope; + TRTConvertValidation validator(batch_size, parameters, scope, 1 << 15); + validator.DeclInputVar("elementwise_" + type + "-X", + nvinfer1::DimsCHW(10, 3, 3)); + validator.DeclInputVar("elementwise_" + type + "-Y", + nvinfer1::Dims3(10, 1, 1)); + validator.DeclOutputVar("elementwise_" + type + "-Out", + nvinfer1::DimsCHW(10, 3, 3)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("elementwise_" + type); + desc.SetInput("X", {"elementwise_" + type + "-X"}); + desc.SetInput("Y", {"elementwise_" + type + "-Y"}); + desc.SetOutput("Out", {"elementwise_" + type + "-Out"}); + + int axis = -1; + desc.SetAttr("axis", axis); + + validator.SetOp(*desc.Proto()); + validator.Execute(batch_size); + } +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(elementwise_add); +USE_OP(elementwise_mul); diff --git a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc new file mode 100644 index 00000000..1ae2668e --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc @@ -0,0 +1,46 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(fc_op, test) { + std::unordered_set parameters({"mul-Y"}); + framework::Scope scope; + TRTConvertValidation validator(10, parameters, scope, 1000); + validator.DeclInputVar("mul-X", nvinfer1::Dims3(10, 1, 1)); + validator.DeclParamVar("mul-Y", nvinfer1::Dims2(10, 2)); + validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(1, 2)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("mul"); + desc.SetInput("X", {"mul-X"}); + desc.SetInput("Y", {"mul-Y"}); + desc.SetOutput("Out", {"mul-Out"}); + + validator.SetOp(*desc.Proto()); + + validator.Execute(10); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle +USE_OP(mul); diff --git a/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc new file mode 100644 index 00000000..8f91309a --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_io_converter.cc @@ -0,0 +1,76 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/inference/tensorrt/convert/io_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +void IOConverterTester(const platform::DeviceContext& ctx) { + cudaStream_t stream; + ASSERT_EQ(0, cudaStreamCreate(&stream)); + + // init fluid in_tensor + framework::LoDTensor in_tensor; + in_tensor.Resize({10, 10}); + auto place = ctx.GetPlace(); + in_tensor.mutable_data(place); + std::vector init; + for (int64_t i = 0; i < 10 * 10; ++i) { + init.push_back(i); + } + framework::TensorFromVector(init, ctx, &in_tensor); + + // init tensorrt buffer + void* buffer; + size_t size = in_tensor.memory_size(); + ASSERT_EQ(cudaMalloc(&buffer, size), 0); + + // convert fluid in_tensor to tensorrt buffer + EngineIOConverter::ConvertInput("test", in_tensor, buffer, size, &stream); + + // convert tensorrt buffer to fluid out_tensor + framework::LoDTensor out_tensor; + out_tensor.Resize({10, 10}); + out_tensor.mutable_data(place); + EngineIOConverter::ConvertOutput("test", buffer, &out_tensor, size, &stream); + + // compare in_tensor and out_tensor + std::vector result; + framework::TensorToVector(out_tensor, ctx, &result); + EXPECT_EQ(init.size(), result.size()); + for (size_t i = 0; i < init.size(); i++) { + EXPECT_EQ(init[i], result[i]); + } + cudaStreamDestroy(stream); +} + +TEST(EngineIOConverterTester, DefaultCPU) { + platform::CPUPlace place; + platform::CPUDeviceContext ctx(place); + IOConverterTester(ctx); +} + +TEST(EngineIOConverterTester, DefaultGPU) { + platform::CUDAPlace place; + platform::CUDADeviceContext ctx(place); + IOConverterTester(ctx); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc new file mode 100644 index 00000000..d00826af --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_leaky_relu_op.cc @@ -0,0 +1,48 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(leaky_relu_op, test_leaky_relu) { + std::unordered_set parameters; + framework::Scope scope; + TRTConvertValidation validator(10, parameters, scope, 1000); + validator.DeclInputVar("leaky_relu_input", nvinfer1::DimsCHW(3, 2, 2)); + validator.DeclOutputVar("leaky_relu_out", nvinfer1::DimsCHW(3, 2, 2)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("leaky_relu"); + desc.SetInput("X", {"leaky_relu_input"}); + desc.SetOutput("Out", {"leaky_relu_out"}); + + desc.SetAttr("alpha", 0.1f); + + validator.SetOp(*desc.Proto()); + + validator.Execute(1); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +// USE_OP(leaky_relu); +USE_OP(leaky_relu); diff --git a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc new file mode 100644 index 00000000..282f5355 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc @@ -0,0 +1,49 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(MulOpConverter, main) { + framework::Scope scope; + std::unordered_set parameters; + TRTConvertValidation validator(10, parameters, scope, 1000, false); + validator.DeclInputVar("mul-X", nvinfer1::Dims2(10, 6)); + validator.DeclInputVar("mul-Y", nvinfer1::Dims2(6, 10)); + validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(10, 10)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("mul"); + desc.SetInput("X", {"mul-X"}); + desc.SetInput("Y", {"mul-Y"}); + desc.SetOutput("Out", {"mul-Out"}); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + + validator.Execute(2); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(mul); diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc new file mode 100644 index 00000000..c5a41322 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc @@ -0,0 +1,71 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +#include +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(OpConverter, ConvertBlock) { + framework::ProgramDesc prog; + auto* block = prog.MutableBlock(0); + auto* conv2d_op = block->AppendOp(); + + // init trt engine + cudaStream_t stream_; + std::unique_ptr engine_; + PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); + engine_.reset(new TensorRTEngine(5, 1 << 15, stream_)); + engine_->InitNetwork(); + + engine_->DeclareInput("conv2d-X", nvinfer1::DataType::kFLOAT, + nvinfer1::Dims3(2, 5, 5)); + + conv2d_op->SetType("conv2d"); + conv2d_op->SetInput("Input", {"conv2d-X"}); + conv2d_op->SetInput("Filter", {"conv2d-Y"}); + conv2d_op->SetOutput("Output", {"conv2d-Out"}); + + const std::vector strides({1, 1}); + const std::vector paddings({1, 1}); + const std::vector dilations({1, 1}); + const int groups = 1; + + conv2d_op->SetAttr("strides", strides); + conv2d_op->SetAttr("paddings", paddings); + conv2d_op->SetAttr("dilations", dilations); + conv2d_op->SetAttr("groups", groups); + + // init scope + framework::Scope scope; + std::vector dim_vec = {3, 2, 3, 3}; + auto* x = scope.Var("conv2d-Y"); + auto* x_tensor = x->GetMutable(); + x_tensor->Resize(framework::make_ddim(dim_vec)); + x_tensor->mutable_data(platform::CUDAPlace(0)); + + OpConverter converter; + converter.ConvertBlock(*block->Proto(), {"conv2d-Y"}, scope, + engine_.get() /*TensorRTEngine*/); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_TRT_CONVERTER(conv2d) diff --git a/paddle/fluid/inference/tensorrt/convert/test_pad_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pad_op.cc new file mode 100644 index 00000000..ba35d7dd --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_pad_op.cc @@ -0,0 +1,52 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(PadConverter, main) { + framework::Scope scope; + std::unordered_set parameters; + TRTConvertValidation validator(10, parameters, scope, 1000); + validator.DeclInputVar("pad-X", nvinfer1::Dims3(3, 2, 2)); + validator.DeclOutputVar("pad-Out", nvinfer1::Dims3(3, 3, 5)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("pad"); + desc.SetInput("X", {"pad-X"}); + desc.SetOutput("Out", {"pad-Out"}); + + std::vector paddings = {0, 0, 0, 0, 0, 1, 1, 2}; + float pad_value = 0.0; + desc.SetAttr("paddings", paddings); + desc.SetAttr("pad_value", pad_value); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + + validator.Execute(2); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(pad); diff --git a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc new file mode 100644 index 00000000..bded8335 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +void test_pool2d(bool global_pooling, bool ceil_mode, + std::string pool_type = "max") { + framework::Scope scope; + std::unordered_set parameters; + TRTConvertValidation validator(5, parameters, scope, 1 << 15); + + // The ITensor's Dims should not contain the batch size. + // So, the ITensor's Dims of input and output should be C * H * W. + validator.DeclInputVar("pool2d-X", nvinfer1::Dims3(3, 6, 7)); + if (global_pooling) + validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 1, 1)); + else if (ceil_mode) + validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 3, 4)); + else + validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 3, 3)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("pool2d"); + desc.SetInput("X", {"pool2d-X"}); + desc.SetOutput("Out", {"pool2d-Out"}); + + std::vector ksize({2, 2}); + std::vector strides({2, 2}); + std::vector paddings({0, 0}); + std::string pooling_t = pool_type; + + desc.SetAttr("pooling_type", pooling_t); + desc.SetAttr("ksize", ksize); + desc.SetAttr("strides", strides); + desc.SetAttr("paddings", paddings); + desc.SetAttr("global_pooling", global_pooling); + desc.SetAttr("ceil_mode", ceil_mode); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + + validator.Execute(3); +} + +TEST(Pool2dOpConverter, normal) { test_pool2d(false, false); } +TEST(Pool2dOpConverter, test_global_pooling) { test_pool2d(true, false); } + +TEST(Pool2dOpConverter, max_ceil_test) { test_pool2d(false, true); } +TEST(Pool2dOpConverter, avg_ceil_test) { test_pool2d(false, true, "avg"); } + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(pool2d); diff --git a/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc new file mode 100644 index 00000000..b086c910 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc @@ -0,0 +1,93 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(prelu_op, test_channel_wise) { + std::unordered_set parameters({"prelu_alpha"}); + framework::Scope scope; + TRTConvertValidation validator(10, parameters, scope, 1000); + validator.DeclInputVar("prelu_input", nvinfer1::DimsCHW(3, 2, 2)); + validator.DeclParamVar("prelu_alpha", nvinfer1::Dims3(3, 1, 1)); + validator.DeclOutputVar("prelu_out", nvinfer1::DimsCHW(3, 2, 2)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("prelu"); + desc.SetInput("X", {"prelu_input"}); + desc.SetInput("Alpha", {"prelu_alpha"}); + desc.SetOutput("Out", {"prelu_out"}); + + desc.SetAttr("mode", std::string("channel")); + + validator.SetOp(*desc.Proto()); + + validator.Execute(1); +} + +TEST(prelu_op, test_element_wise) { + std::unordered_set parameters({"prelu_alpha"}); + framework::Scope scope; + TRTConvertValidation validator(10, parameters, scope, 1000); + validator.DeclInputVar("prelu_input", nvinfer1::DimsCHW(3, 2, 2)); + validator.DeclParamVar("prelu_alpha", nvinfer1::Dims4(10, 3, 2, 2)); + validator.DeclOutputVar("prelu_out", nvinfer1::DimsCHW(3, 2, 2)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("prelu"); + desc.SetInput("X", {"prelu_input"}); + desc.SetInput("Alpha", {"prelu_alpha"}); + desc.SetOutput("Out", {"prelu_out"}); + + desc.SetAttr("mode", std::string("element")); + + validator.SetOp(*desc.Proto()); + + validator.Execute(1); +} + +TEST(prelu_op, test_scalar) { + std::unordered_set parameters({"prelu_alpha"}); + framework::Scope scope; + TRTConvertValidation validator(10, parameters, scope, 1000); + validator.DeclInputVar("prelu_input", nvinfer1::DimsCHW(3, 2, 2)); + validator.DeclParamVar("prelu_alpha", nvinfer1::Dims3(1, 1, 1)); + validator.DeclOutputVar("prelu_out", nvinfer1::DimsCHW(3, 2, 2)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("prelu"); + desc.SetInput("X", {"prelu_input"}); + desc.SetInput("Alpha", {"prelu_alpha"}); + desc.SetOutput("Out", {"prelu_out"}); + + desc.SetAttr("mode", std::string("all")); + + validator.SetOp(*desc.Proto()); + + validator.Execute(1); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(prelu); diff --git a/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc new file mode 100644 index 00000000..503ce71f --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc @@ -0,0 +1,49 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(SoftMaxOpConverter, main) { + framework::Scope scope; + std::unordered_set parameters; + TRTConvertValidation validator(8, parameters, scope, 1000); + + std::vector tensor_shape{8, 10}; + validator.DeclInputVar("softmax-X", tensor_shape, + nvinfer1::DimsCHW(10, 1, 1)); + validator.DeclOutputVar("softmax-Out", nvinfer1::DimsCHW(10, 1, 1)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("softmax"); + desc.SetInput("X", {"softmax-X"}); + desc.SetOutput("Out", {"softmax-Out"}); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + + validator.Execute(3); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(softmax); diff --git a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc new file mode 100644 index 00000000..5aacc5c6 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc @@ -0,0 +1,115 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +template +void TensorRTSplitTest(const std::vector &in_shape, + const std::vector §ions) { + std::unordered_set parameters({""}); + framework::Scope scope; + TRTConvertValidation validator(BatchSize + 1, parameters, scope, 10000); + + auto make_dim = [](const std::vector &shape) { + nvinfer1::DimsCHW dim; + dim.c() = shape[0]; + dim.h() = shape[1]; + dim.w() = shape[2]; + return dim; + }; + validator.DeclInputVar("split_input", make_dim(in_shape)); + std::vector output_vars; + for (size_t i = 0; i < sections.size(); ++i) { + auto out_shape = in_shape; + out_shape[Axis - 1] = sections[i]; + std::string output_name = "split_out" + std::to_string(i); + validator.DeclOutputVar(output_name, make_dim(out_shape)); + output_vars.push_back(output_name); + } + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("split"); + desc.SetInput("X", {"split_input"}); + desc.SetOutput("Out", output_vars); + + desc.SetAttr("axis", Axis); + desc.SetAttr("num", 0); + desc.SetAttr("sections", sections); + + validator.SetOp(*desc.Proto()); + + validator.Execute(BatchSize); +} + +// batch = 0, axis = 1, same shape +TEST(split_op, test_same_shape_axis1_batch1) { + TensorRTSplitTest<1, 1>({4, 2, 2}, {2, 2}); +} +// batch = 0, axis = 1, different shape +TEST(split_op, test_different_shape_axis1_batch1) { + TensorRTSplitTest<1, 1>({3, 2, 2}, {2, 1}); +} +// batch = 10, axis = 1, same shape +TEST(split_op, test_same_shape_axis1_batch10) { + TensorRTSplitTest<10, 1>({4, 2, 2}, {2, 2}); +} +// batch = 10, axis = 1, different shape +TEST(split_op, test_different_shape_axis1_batch10) { + TensorRTSplitTest<10, 1>({3, 2, 2}, {2, 1}); +} +// batch = 0, axis = 2, same shape +TEST(split_op, test_same_shape_axis2_batch1) { + TensorRTSplitTest<1, 2>({3, 4, 2}, {2, 2}); +} +// batch = 0, axis = 2, different shape +TEST(split_op, test_different_shape_axis2_batch1) { + TensorRTSplitTest<1, 2>({3, 3, 2}, {2, 1}); +} +// batch = 10, axis = 2, same shape +TEST(split_op, test_same_shape_axis2_batch10) { + TensorRTSplitTest<10, 2>({3, 4, 2}, {2, 2}); +} +// batch = 10, axis = 2, different shape +TEST(split_op, test_different_shape_axis2_batch10) { + TensorRTSplitTest<10, 2>({3, 3, 2}, {2, 1}); +} +// batch = 0, axis = 3, same shape +TEST(split_op, test_same_shape_axis3_batch1) { + TensorRTSplitTest<1, 3>({3, 2, 4}, {2, 2}); +} +// batch = 0, axis = 3, different shape +TEST(split_op, test_different_shape_axis3_batch1) { + TensorRTSplitTest<1, 3>({3, 2, 3}, {2, 1}); +} +// batch = 10, axis = 3, same shape +TEST(split_op, test_same_shape_axis3_batch10) { + TensorRTSplitTest<10, 3>({3, 2, 4}, {2, 2}); +} +// batch = 10, axis = 3, different shape +TEST(split_op, test_different_shape_axis3_batch10) { + TensorRTSplitTest<10, 3>({3, 2, 3}, {2, 1}); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(split); diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h new file mode 100644 index 00000000..388d83d8 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -0,0 +1,241 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file implements a UT framework to make the validation of transforming + * Fluid Op to TRT Layer. + */ + +#pragma once + +#include +#include +#include +#include + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/utils/singleton.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * Get a random float value between [low, high] + */ +float random(float low, float high) { + static std::mt19937 mt(100); + std::uniform_real_distribution dist(low, high); + return dist(mt); +} + +void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place, + const platform::DeviceContext& ctx) { + auto dims = tensor->dims(); + size_t num_elements = analysis::AccuDims(dims, dims.size()); + PADDLE_ENFORCE_GT(num_elements, 0); + + platform::CPUPlace cpu_place; + framework::LoDTensor temp_tensor; + temp_tensor.Resize(dims); + auto* temp_data = temp_tensor.mutable_data(cpu_place); + + for (size_t i = 0; i < num_elements; i++) { + *(temp_data + i) = random(0., 1.); + } + + TensorCopySync(temp_tensor, place, tensor); +} + +/* + * Help to validate the correctness between Fluid Op and the corresponding TRT + * layer. + */ +class TRTConvertValidation { + public: + TRTConvertValidation() = delete; + + TRTConvertValidation(int max_batch_size, + const std::unordered_set& parameters, + framework::Scope& scope, // NOLINT + int workspace_size = 1 << 10, bool if_add_batch = true) + : parameters_(parameters), + scope_(scope), + if_add_batch_(if_add_batch), + max_batch_size_(max_batch_size) { + PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); + engine_.reset( + new TensorRTEngine(max_batch_size, workspace_size, false, nullptr, 0)); + engine_->InitNetwork(); + } + + // Declare a Variable as input with random initialization. + void DeclInputVar(const std::string& name, const std::vector tensor_dims, + const nvinfer1::Dims& trt_dims) { + DeclVar(name, tensor_dims); + engine_->DeclareInput(name, nvinfer1::DataType::kFLOAT, trt_dims); + } + + void DeclInputVar(const std::string& name, const nvinfer1::Dims& dims) { + DeclVar(name, dims); + // Declare TRT inputs. + engine_->DeclareInput(name, nvinfer1::DataType::kFLOAT, dims); + } + + void DeclParamVar(const std::string& name, const std::vector dim_vec) { + DeclVar(name, dim_vec); + } + + // Declare a parameter varaible in the scope. + void DeclParamVar(const std::string& name, const nvinfer1::Dims& dims) { + DeclVar(name, dims, true); + } + + void DeclOutputVar(const std::string& name, const std::vector dim_vec) { + DeclVar(name, dim_vec); + } + + void DeclOutputVar(const std::string& name, const nvinfer1::Dims& dims) { + DeclVar(name, dims); + } + + void DeclVar(const std::string& name, const std::vector dim_vec) { + platform::CUDADeviceContext ctx(place_); + + auto* x = scope_.Var(name); + auto* x_tensor = x->GetMutable(); + x_tensor->Resize(framework::make_ddim(dim_vec)); + RandomizeTensor(x_tensor, place_, ctx); + } + // Declare a variable in a fluid Scope. + void DeclVar(const std::string& name, const nvinfer1::Dims& dims, + bool is_param = false) { + // Init Fluid tensor. + std::vector dim_vec(dims.d, dims.d + dims.nbDims); + // There is no batchsize in ITensor's shape, but We should add it to + // tensor's shape of fluid. If the variable is not parameter and the + // if_add_batch_ flag is true, add the max batchsize to dim_vec. + if (is_param != true && if_add_batch_ == true) + dim_vec.insert(dim_vec.begin(), max_batch_size_); + + DeclVar(name, dim_vec); + } + + void SetOp(const framework::proto::OpDesc& desc) { + op_ = framework::OpRegistry::CreateOp(desc); + + Singleton::Global().ConvertOp( + desc, parameters_, scope_, engine_.get(), true /*test_mode*/); + + engine_->FreezeNetwork(); + + // Declare outputs. + op_desc_.reset(new framework::OpDesc(desc, nullptr)); + } + + // We use the set 'neglected_output' here, because some Ops like batch norm, + // the outputs specified in the op des are only used during training, + // so we should neglect those output during inference. + void Execute(int batch_size, + std::unordered_set neglected_output = {}) { + // Execute Fluid Op + PADDLE_ENFORCE_LE(batch_size, max_batch_size_); + platform::CUDADeviceContext ctx(place_); + op_->Run(scope_, place_); + cudaStreamSynchronize(stream_); + std::vector input_output_names; + + // Note: we need filter the parameter + for (const auto& input : op_desc_->InputArgumentNames()) { + if (parameters_.count(input)) continue; + input_output_names.push_back(input); + } + + // Collect the fluid outputs. + std::vector> fluid_outs; + for (const auto& output : op_desc_->OutputArgumentNames()) { + if (neglected_output.count(output)) continue; + input_output_names.push_back(output); + std::vector fluid_out; + auto* var = scope_.FindVar(output); + auto* tensor = var->GetMutable(); + framework::TensorToVector(*tensor, ctx, &fluid_out); + fluid_outs.push_back(fluid_out); + } + + // Bind input and output for TRT. + const int num_bindings = input_output_names.size(); + std::vector buffers(num_bindings); + + for (const std::string& name : input_output_names) { + auto* var = scope_.FindVar(name); + auto* tensor = var->GetMutable(); + const int bind_index = engine_->engine()->getBindingIndex(name.c_str()); + buffers[bind_index] = + static_cast(tensor->mutable_data(place_)); + } + + // Execute TRT. + engine_->Execute(batch_size, &buffers, stream_); + cudaStreamSynchronize(stream_); + + ASSERT_FALSE(op_desc_->OutputArgumentNames().empty()); + int index = 0; + for (const auto& output : op_desc_->OutputArgumentNames()) { + if (neglected_output.count(output)) continue; + std::vector trt_out; + auto* var = scope_.FindVar(output); + auto* tensor = var->GetMutable(); + framework::TensorToVector(*tensor, ctx, &trt_out); + + size_t fluid_out_size = fluid_outs[index].size(); + if (if_add_batch_ == true) { + fluid_out_size = + batch_size * (framework::product(tensor->dims()) / max_batch_size_); + } + + for (size_t i = 0; i < fluid_out_size; i++) { + // Loose the threshold for CI in different machine model. + EXPECT_LT(std::abs(fluid_outs[index][i] - trt_out[i]), 2e-5); + } + index += 1; + } + } + + framework::Scope& scope() { return scope_; } + + private: + platform::CUDAPlace place_; + std::unique_ptr engine_; + cudaStream_t stream_; + std::unique_ptr op_; + std::unique_ptr op_desc_; + const std::unordered_set& parameters_; + framework::Scope& scope_; + // The ITensor of trt does not cotain the batch size, + // bug, in most cases, we need to set batch size for + // fluid's tensor shape. This variable indicates + // whether to add batch size to tensor shape of fluid. + bool if_add_batch_; + int max_batch_size_; +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc new file mode 100644 index 00000000..c5ac6f38 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -0,0 +1,225 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); you may not use +this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/engine.h" + +#include +#include +#include +#include +#include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/inference/tensorrt/helper.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +int TensorRTEngine::runtime_batch_ = 1; + +void TensorRTEngine::Build(const DescType &paddle_model) { + PADDLE_ENFORCE(false, "not implemented"); +} + +void TensorRTEngine::Execute(int batch_size, std::vector *buffers, + cudaStream_t stream) { + freshDeviceId(); + batch_size_ = batch_size; + infer_context_->enqueue(batch_size, buffers->data(), stream, nullptr); + cudaStreamSynchronize(stream); + SetRuntimeBatch(batch_size); +} + +void TensorRTEngine::FreezeNetwork() { + freshDeviceId(); + VLOG(3) << "TRT to freeze network"; + PADDLE_ENFORCE(infer_builder_ != nullptr, + "Call InitNetwork first to initialize network."); + PADDLE_ENFORCE(infer_network_ != nullptr, + "Call InitNetwork first to initialize network."); + // build engine. + infer_builder_->setMaxBatchSize(max_batch_); + infer_builder_->setMaxWorkspaceSize(max_workspace_); + if (enable_int8_) { + infer_builder_->setInt8Mode(true); + if (calibrator_) { + infer_builder_->setInt8Calibrator(calibrator_); + } else { + infer_builder_->setInt8Calibrator(nullptr); + +#if IS_TRT_VERSION_GE(5000) + infer_builder_->setStrictTypeConstraints(true); + for (auto &quant_range : quant_dynamic_range_) { + auto tensor = quant_range.first; + float range = quant_range.second; + tensor->setDynamicRange(-range, range); + } + + std::unordered_set all_t; + for (int i = 0; i < infer_network_->getNbLayers(); i++) { + auto layer = infer_network_->getLayer(i); + for (int j = 0; j < layer->getNbOutputs(); j++) { + all_t.insert(layer->getOutput(j)); + } + } + for (int i = 0; i < infer_network_->getNbInputs(); i++) { + all_t.insert(infer_network_->getInput(i)); + } + + for (auto &t : all_t) { + if (!quant_dynamic_range_.count(t)) { + LOG(WARNING) + << "We are in trt int8 mode(not calibration), scale not setted" + << " for tensor " << t->getName() + << ", this might be ok when trt does not need this range"; + } + } +#endif + } + } + + infer_engine_.reset(infer_builder_->buildCudaEngine(*infer_network_)); + PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!"); + + infer_context_.reset(infer_engine_->createExecutionContext()); +} + +nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name, + nvinfer1::DataType dtype, + const nvinfer1::Dims &dims) { + PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate input name %s", + name); + + PADDLE_ENFORCE(infer_network_ != nullptr, "should initnetwork first"); + auto *input = infer_network_->addInput(name.c_str(), dtype, dims); + PADDLE_ENFORCE(input, "infer network add input %s failed", name); + buffer_sizes_[name] = kDataTypeSize[static_cast(dtype)] * + analysis::AccuDims(dims.d, dims.nbDims) * max_batch_; + PADDLE_ENFORCE(input->isNetworkInput()); + TensorRTEngine::SetITensor(name, input); + return input; +} + +void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset, + const std::string &name) { + PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s", + name); + + auto *output = layer->getOutput(offset); + SetITensor(name, output); + PADDLE_ENFORCE(output != nullptr); + output->setName(name.c_str()); + PADDLE_ENFORCE(!output->isNetworkInput()); + infer_network_->markOutput(*output); + PADDLE_ENFORCE(output->isNetworkOutput()); + // output buffers' size can only be decided latter, set zero here to mark this + // and will reset latter. + buffer_sizes_[name] = 0; +} + +bool TensorRTEngine::HasDeclared(const std::string &name) { + return buffer_sizes_.count(name) > 0; +} + +void TensorRTEngine::DeclareOutput(const std::string &name) { + PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s", + name); + + auto *output = TensorRTEngine::GetITensor(name); + PADDLE_ENFORCE(output != nullptr); + output->setName(name.c_str()); + PADDLE_ENFORCE(!output->isNetworkInput()); + infer_network_->markOutput(*output); + // output buffers' size can only be decided latter, set zero here to mark this + // and will reset latter. + buffer_sizes_[name] = 0; +} + +void TensorRTEngine::SetITensor(const std::string &name, + nvinfer1::ITensor *tensor) { + PADDLE_ENFORCE(tensor != nullptr); + PADDLE_ENFORCE_EQ(0, itensor_map_.count(name), "duplicate ITensor name %s", + name); + itensor_map_[name] = tensor; +} + +nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) { + PADDLE_ENFORCE(itensor_map_.count(name), "no ITensor %s", name); + return itensor_map_[name]; +} + +void TensorRTEngine::SetRuntimeBatch(size_t batch_size) { + runtime_batch_ = batch_size; +} + +float *TensorRTEngine::GetWeightCPUData(const std::string &name, + framework::Tensor *weight_tensor, + bool enable_int8, + const std::vector &scale) { + auto w_dims = weight_tensor->dims(); + platform::CPUPlace cpu_place; + PADDLE_ENFORCE(!weight_map.count(name), + "During TRT Op converter: We set weight %s with the same name " + "twice into the weight_map", + name); + weight_map[name].reset(new framework::Tensor()); + weight_map[name]->Resize(weight_tensor->dims()); + TensorCopySync(*weight_tensor, cpu_place, weight_map[name].get()); + float *weight_data = weight_map[name]->mutable_data(cpu_place); + + if (enable_int8) { + // when the op is fc, scale's size should be 1 + // when the op is conv, the scale's size should be w_dims[0] + bool valid_scale_size = + (scale.size() == 1 || scale.size() == static_cast(w_dims[0])); + PADDLE_ENFORCE(valid_scale_size, "TRT int8 quant: invalid scale size"); + for (int i = 0; i < weight_tensor->numel(); i++) { + bool is_valid_int8 = + ((weight_data[i] >= -128) && (weight_data[i] <= 127)); + PADDLE_ENFORCE(is_valid_int8, + "We are in anakin subgraph int8 mode, the weight of conv " + "should be in range [-128, 127]"); + if (scale.size() == 1) { + weight_data[i] *= (scale[0] / 127); + } else { + PADDLE_ENFORCE(w_dims.size() == 4, + "TRT int8 quant : We only use the channel quant for " + "conv op, so the weight dims should be 4."); + int inner_size = w_dims[1] * w_dims[2] * w_dims[3]; + weight_data[i] *= (scale[i / inner_size] / 127); + } + } + } + return weight_data; +} + +int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; } + +nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin( + nvinfer1::ITensor *const *inputs, int num_inputs, + plugin::PluginTensorRT *plugin) { + owned_plugin_.emplace_back(plugin); + return infer_network_.get()->addPluginExt(inputs, num_inputs, *plugin); +} + +void TensorRTEngine::freshDeviceId() { + int count; + cudaGetDeviceCount(&count); + PADDLE_ENFORCE_LT(device_id_, count); + cudaSetDevice(device_id_); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h new file mode 100644 index 00000000..80af463d --- /dev/null +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -0,0 +1,257 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/inference/engine.h" +#include "paddle/fluid/inference/tensorrt/helper.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" +#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" +#include "paddle/fluid/inference/utils/singleton.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class TRTInt8Calibrator; +/* + * TensorRT Engine. + * + * There are two alternative ways to use it, one is to build from a paddle + * protobuf model, another way is to manully construct the network. + */ +class TensorRTEngine { + using DescType = ::paddle::framework::proto::BlockDesc; + + public: + // Weight is model parameter. + class Weight { + public: + Weight() = default; + Weight(nvinfer1::DataType dtype, void* value, size_t num_elem) { + w_.type = dtype; + w_.values = value; + w_.count = num_elem; + } + const nvinfer1::Weights& get() { return w_; } + + std::vector dims; + + private: + nvinfer1::Weights w_; + }; + + TensorRTEngine(int max_batch, int max_workspace, bool enable_int8 = false, + TRTInt8Calibrator* calibrator = nullptr, int device_id = 0, + nvinfer1::ILogger& logger = NaiveLogger::Global()) + : max_batch_(max_batch), + max_workspace_(max_workspace), + enable_int8_(enable_int8), + calibrator_(calibrator), + device_id_(device_id), + logger_(logger) {} + + ~TensorRTEngine() {} + + // TODO(Superjomn) implement it later when graph segmentation is supported. + void Build(const DescType& paddle_model); + + void Execute(int batch_size, std::vector* buffers, + cudaStream_t stream); + + // Initialize the inference network, so that TensorRT layers can add to this + // network. + void InitNetwork() { + freshDeviceId(); + infer_builder_.reset(createInferBuilder(&logger_)); + infer_network_.reset(infer_builder_->createNetwork()); + } + // After finishing adding ops, freeze this network and creates the executation + // environment. + void FreezeNetwork(); + + // Add an input and set its name, data type and dimention. + nvinfer1::ITensor* DeclareInput(const std::string& name, + nvinfer1::DataType dtype, + const nvinfer1::Dims& dim); + // Set the offset-th output from a layer as the network's output, and set its + // name. + void DeclareOutput(const nvinfer1::ILayer* layer, int offset, + const std::string& name); + // Set the itensor_map_[name] as the network's output, and set its name. + void DeclareOutput(const std::string& name); + // Check if the ITensor has been declared + bool HasDeclared(const std::string& name); + + void SetITensor(const std::string& name, nvinfer1::ITensor* tensor); + // Get an ITensor called name. + nvinfer1::ITensor* GetITensor(const std::string& name); + + nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); } + nvinfer1::INetworkDefinition* network() { return infer_network_.get(); } + + nvinfer1::IHostMemory* Serialize() { + PADDLE_ENFORCE(infer_engine_ != nullptr, + "You should build engine first and then serialize"); + ihost_memory_.reset(infer_engine_->serialize()); + return ihost_memory_.get(); + } + + void Deserialize(const std::string& engine_serialized_data) { + freshDeviceId(); + infer_ptr runtime(createInferRuntime(&logger_)); + infer_engine_.reset(runtime->deserializeCudaEngine( + engine_serialized_data.c_str(), engine_serialized_data.size(), + &inference::Singleton::Global())); + PADDLE_ENFORCE(infer_engine_ != nullptr, + "build cuda engine failed when deserialize engine info.!"); + infer_context_.reset(infer_engine_->createExecutionContext()); + } + + void SetRuntimeBatch(size_t batch_size); + int GetRuntimeBatch(); + int GetDeviceId() { return device_id_; } + nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs, + int num_inputs, plugin::PluginTensorRT*); + void SetTensorDynamicRange(nvinfer1::ITensor* tensor, float range) { + quant_dynamic_range_[tensor] = range; + } + + float* GetWeightCPUData(const std::string& name, + framework::Tensor* weight_tensor, bool enable_int8, + const std::vector& scale = {}); + + // A pointer to CPU memory is needed of the TRT weight. + // Before TRT runs, fluid loads weight into GPU storage. + // so we need to copy the weights from GPU to CPU in our op converter. + // We use a map to store these weights for the weight memory is not released + // in advance, which affecting the construction of TRT Op. + std::unordered_map> + weight_map; + + void ClearWeights() { + for (auto& weight_pair : weight_map) { + weight_pair.second.reset(nullptr); + } + } + + private: + // Each ICudaEngine object is bound to a specific GPU when it is instantiated, + // ensure that the thread is associated with the correct device by calling + // freshDeviceId(). + void freshDeviceId(); + + // the max batch size + int max_batch_; + // the runtime batch size + static int runtime_batch_; + // the max memory size the engine uses + int max_workspace_; + + bool enable_int8_; + TRTInt8Calibrator* calibrator_; + // batch size of the current data, will be updated each Executation. + int batch_size_{-1}; + + int device_id_; + nvinfer1::ILogger& logger_; + + // max data size for the buffers. + std::unordered_map buffer_sizes_; + std::unordered_map + itensor_map_; + + std::vector> owned_plugin_; + + // TensorRT related internal members + template + struct Destroyer { + void operator()(T* x) { + if (x) { + x->destroy(); + } + } + }; + template + using infer_ptr = std::unique_ptr>; + infer_ptr infer_builder_; + infer_ptr infer_network_; + infer_ptr infer_engine_; + infer_ptr infer_context_; + infer_ptr ihost_memory_; + std::unordered_map quant_dynamic_range_; +}; // class TensorRTEngine + +#define IS_TRT_VERSION_GE(version) \ + ((NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 + \ + NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD) >= version) + +// Add an layer__ into engine__ with args ARGS. +// For example: +// +// Reference +// https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#charRNN_define_network +// +// will add a fully connected layer into the engine. +// TensorRT has too many layers, so that is not wise to add member functions for +// them, and an macro like this is more extensible when underlying TensorRT +// library add new layer supports. +#define TRT_ENGINE_ADD_LAYER(engine__, layer__, ARGS...) \ + engine__->network()->add##layer__(ARGS); + +class TRTEngineManager { + public: + bool Empty() const { return engines_.size() == 0; } + bool Has(const std::string& name) const { + if (engines_.count(name) == 0) return false; + return engines_.at(name).get() != nullptr; + } + + TensorRTEngine* Get(const std::string& name) const { + return engines_.at(name).get(); + } + + TensorRTEngine* Create(std::string name, int max_batch, int max_workspace, + bool enable_int8 = false, + TRTInt8Calibrator* calibrator = nullptr, + int device_id = 0, + nvinfer1::ILogger& logger = NaiveLogger::Global()) { + auto* p = new TensorRTEngine(max_batch, max_workspace, enable_int8, + calibrator, device_id, logger); + engines_[name].reset(p); + return p; + } + + void DeleteAll() { + for (auto& item : engines_) { + item.second.reset(nullptr); + } + } + + private: + std::unordered_map> engines_; +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h new file mode 100644 index 00000000..010942a0 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/helper.h @@ -0,0 +1,108 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/platform/dynload/tensorrt.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +namespace dy = paddle::platform::dynload; + +// TensorRT data type to size +const int kDataTypeSize[] = { + 4, // kFLOAT + 2, // kHALF + 1, // kINT8 + 4 // kINT32 +}; + +// The following two API are implemented in TensorRT's header file, cannot load +// from the dynamic library. So create our own implementation and directly +// trigger the method from the dynamic library. +static nvinfer1::IBuilder* createInferBuilder(nvinfer1::ILogger* logger) { + return static_cast( + dy::createInferBuilder_INTERNAL(logger, NV_TENSORRT_VERSION)); +} +static nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger* logger) { + return static_cast( + dy::createInferRuntime_INTERNAL(logger, NV_TENSORRT_VERSION)); +} + +// A logger for create TensorRT infer builder. +class NaiveLogger : public nvinfer1::ILogger { + public: + void log(nvinfer1::ILogger::Severity severity, const char* msg) override { + switch (severity) { + case Severity::kINFO: + VLOG(3) << msg; + break; + case Severity::kWARNING: + LOG(WARNING) << msg; + break; + case Severity::kINTERNAL_ERROR: + case Severity::kERROR: + LOG(ERROR) << msg; + break; + default: + break; + } + } + + static nvinfer1::ILogger& Global() { + static nvinfer1::ILogger* x = new NaiveLogger; + return *x; + } + + ~NaiveLogger() override {} +}; + +class NaiveProfiler : public nvinfer1::IProfiler { + public: + typedef std::pair Record; + std::vector mProfile; + + virtual void reportLayerTime(const char* layerName, float ms) { + auto record = + std::find_if(mProfile.begin(), mProfile.end(), + [&](const Record& r) { return r.first == layerName; }); + if (record == mProfile.end()) + mProfile.push_back(std::make_pair(layerName, ms)); + else + record->second += ms; + } + + void printLayerTimes() { + float totalTime = 0; + for (size_t i = 0; i < mProfile.size(); i++) { + printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(), + mProfile[i].second); + totalTime += mProfile[i].second; + } + printf("Time over all layers: %4.3f\n", totalTime); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc new file mode 100644 index 00000000..170ca40d --- /dev/null +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tensorrt/op_teller.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +// Just tell by the op_types. +struct SimpleOpTypeSetTeller : public Teller { + SimpleOpTypeSetTeller() {} + + bool operator()(const std::string& op_type, + const framework::OpDesc& desc) override { + return teller_set.count(op_type); + } + + private: + std::unordered_set teller_set{ + {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", + "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad", + "elementwise_add", "elementwise_mul", "dropout", "prelu", + "conv2d_transpose", "leaky_relu", "fc"}}; +}; + +bool OpTeller::Tell(const std::string& op_type, const framework::OpDesc& desc) { + for (auto& teller : tellers_) { + if ((*teller)(op_type, desc)) return true; + } + return false; +} + +OpTeller::OpTeller() { tellers_.emplace_back(new SimpleOpTypeSetTeller); } + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/op_teller.h b/paddle/fluid/inference/tensorrt/op_teller.h new file mode 100644 index 00000000..3363d77a --- /dev/null +++ b/paddle/fluid/inference/tensorrt/op_teller.h @@ -0,0 +1,70 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include "paddle/fluid/framework/op_desc.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * Single Op teller definition. + * One can override this and define a more complex tell logic, considerring more + * issues such as op_desc. + */ +struct Teller { + virtual bool operator()(const std::string& op_type, + const framework::OpDesc& desc) = 0; + + virtual ~Teller() = default; +}; +/* + * A real example: + * + * struct SomeTeller : public Teller { + * bool operator()(const std::string& op_type, + * const framework::OpDesc& desc) override { + * return op_type == "fc" && desc.Inputs().size() == 2; + * } + *}; + */ + +/* + * class OpTeller helps to tell whether a fluid + * operator can be transformed to a TensorRT layer. + */ +class OpTeller { + public: + static OpTeller& Global() { + static std::unique_ptr x(new OpTeller); + return *x; + } + + bool Tell(const std::string& op_type, const framework::OpDesc& desc); + + private: + OpTeller(); + + private: + std::vector> tellers_; +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt new file mode 100644 index 00000000..709aa103 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -0,0 +1,5 @@ +nv_library(tensorrt_plugin + SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu + prelu_op_plugin.cu trt_plugin_factory.cc + avg_pool_op_plugin.cu + DEPS enforce tensorrt_engine prelu) diff --git a/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu new file mode 100644 index 00000000..f27a8381 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu @@ -0,0 +1,71 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" +#include "paddle/fluid/operators/math/pooling.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +AvgPoolPlugin* CreateAvgPoolPluginDeserialize(const void* buffer, + size_t length) { + return new AvgPoolPlugin(buffer, length); +} +REGISTER_TRT_PLUGIN("avg_pool_plugin", CreateAvgPoolPluginDeserialize); + +nvinfer1::Dims AvgPoolPlugin::getOutputDimensions( + int index, const nvinfer1::Dims* inputDims, int nbInputs) { + assert(nbInputs == 1); + assert(index == 0); + assert(inputDims[0].nbDims == 3); + nvinfer1::Dims const& input_dims = inputDims[0]; + + nvinfer1::Dims output_dims = input_dims; + + output_dims.d[1] = output_shape_[1]; + output_dims.d[2] = output_shape_[2]; + return output_dims; +} + +int AvgPoolPlugin::enqueue(int batchSize, const void* const* inputs, + void** outputs, void* workspace, + cudaStream_t stream) { + auto const& input_dims = this->getInputDims(0); + int input_size = 0; + float const* idata = reinterpret_cast(inputs[0]); + float** odatas = reinterpret_cast(outputs); + + paddle::operators::math::AvgPool pool_process; + paddle::operators::math::Pool2dDirectCUDAFunctor< + paddle::operators::math::AvgPool, float> + pool2d_forward; + + std::vector input_shape = input_shape_; + std::vector output_shape = output_shape_; + input_shape.insert(input_shape.begin(), batchSize); + output_shape.insert(output_shape.begin(), batchSize); + + pool2d_forward(idata, input_shape, output_shape, ksize_, strides_, paddings_, + pool_process, true, odatas[0], stream); + + return cudaGetLastError() != cudaSuccess; +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h new file mode 100644 index 00000000..a7c0aa57 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h @@ -0,0 +1,115 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +class AvgPoolPlugin : public PluginTensorRT { + private: + bool ceil_mode_; + std::vector ksize_; + std::vector strides_; + std::vector paddings_; + std::vector input_shape_; + std::vector output_shape_; + + protected: + size_t getSerializationSize() override { + return SerializedSize(getPluginType()) + SerializedSize(ceil_mode_) + + SerializedSize(ksize_) + SerializedSize(strides_) + + SerializedSize(paddings_) + SerializedSize(input_shape_) + + SerializedSize(output_shape_) + getBaseSerializationSize(); + } + + // TRT will call this func when we need to serialize the configuration of + // tensorrt. + void serialize(void *buffer) override { + SerializeValue(&buffer, getPluginType()); + serializeBase(buffer); + SerializeValue(&buffer, ceil_mode_); + SerializeValue(&buffer, ksize_); + SerializeValue(&buffer, strides_); + SerializeValue(&buffer, paddings_); + SerializeValue(&buffer, input_shape_); + SerializeValue(&buffer, output_shape_); + } + + public: + AvgPoolPlugin() {} + AvgPoolPlugin(bool ceil_mode, std::vector ksize, + std::vector strides, std::vector paddings, + std::vector input_shape) + : ceil_mode_(ceil_mode), + ksize_(ksize), + strides_(strides), + paddings_(paddings), + input_shape_(input_shape) { + int output_h, output_w; + output_shape_ = input_shape_; + if (!ceil_mode_) { + output_h = + (input_shape[1] - ksize_[0] + 2 * paddings_[0]) / strides_[0] + 1; + output_w = + (input_shape[2] - ksize_[1] + 2 * paddings_[1]) / strides_[1] + 1; + } else { + output_h = + (input_shape[1] - ksize_[0] + 2 * paddings_[0] + strides_[0] - 1) / + strides_[0] + + 1; + output_w = + (input_shape[2] - ksize_[1] + 2 * paddings_[1] + strides_[1] - 1) / + strides_[1] + + 1; + } + output_shape_[1] = output_h; + output_shape_[2] = output_w; + } + + // It was used for tensorrt deserialization. + // It should not be called by users. + AvgPoolPlugin(void const *serialData, size_t serialLength) { + deserializeBase(serialData, serialLength); + DeserializeValue(&serialData, &serialLength, &ceil_mode_); + DeserializeValue(&serialData, &serialLength, &ksize_); + DeserializeValue(&serialData, &serialLength, &strides_); + DeserializeValue(&serialData, &serialLength, &paddings_); + DeserializeValue(&serialData, &serialLength, &input_shape_); + DeserializeValue(&serialData, &serialLength, &output_shape_); + } + + AvgPoolPlugin *clone() const override { + return new AvgPoolPlugin(ceil_mode_, ksize_, strides_, paddings_, + input_shape_); + } + + const char *getPluginType() const override { return "avg_pool_plugin"; } + int getNbOutputs() const override { return 1; } + nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs, + int nbInputDims) override; + int initialize() override { return 0; } + int enqueue(int batchSize, const void *const *inputs, void **outputs, + void *workspace, cudaStream_t stream) override; +}; + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu new file mode 100644 index 00000000..9aed3dda --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu @@ -0,0 +1,145 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +ElementWisePlugin* CreateElementWisePluginDeserialize(const void* buffer, + size_t length) { + return new ElementWisePlugin(buffer, length); +} +REGISTER_TRT_PLUGIN("elementwise_plugin", CreateElementWisePluginDeserialize); + +namespace details { + +template +struct Add { + __device__ T operator()(const T& a, const T& b) const { return a + b; } +}; + +template +struct Mul { + __device__ T operator()(const T& a, const T& b) const { return a * b; } +}; + +template +__global__ void ColumnWiseKernel(Operator op, const T* x, const T* y, T* out, + int batch_size, int num_rows, int num_cols) { + for (int batch_id = 0; batch_id < batch_size; ++batch_id) { + int row = blockIdx.x; + for (; row < num_rows; row += gridDim.x) { + T value_y = y[batch_id * num_rows + row]; + int col = threadIdx.x; + int offset = (batch_id * num_rows + row) * num_cols; + for (; col < num_cols; col += blockDim.x) { + T value_x = x[offset + col]; + out[offset + col] = op(value_x, value_y); + } + } + } +} + +template +static void ElementWise(Operator op, const T* x, const T* y, T* out, + int batch_size, int prev, int midd, int post, + cudaStream_t stream) { + const int kThreadsPerBlock = 1024; + const int kMaximumBlocks = 65535; + if (prev == 1) { + int num_threads = (post > kThreadsPerBlock) ? kThreadsPerBlock + : (((post + 31) >> 5) << 5); + int num_blocks = (midd < kMaximumBlocks) ? midd : kMaximumBlocks; + ColumnWiseKernel<<>>( + op, x, y, out, batch_size, midd, post); + } else if (post == 1) { + PADDLE_THROW("Not implemented."); + } else { + PADDLE_THROW("Not implemented."); + } +} + +} // namespace details + +nvinfer1::Dims ElementWisePlugin::getOutputDimensions( + int index, const nvinfer1::Dims* input_dims, int num_inputs) { + PADDLE_ENFORCE_EQ(index, 0); + PADDLE_ENFORCE_EQ(num_inputs, 2); + PADDLE_ENFORCE_NOT_NULL(input_dims); + return input_dims[0]; +} + +int ElementWisePlugin::initialize() { + PADDLE_ENFORCE_GT(dims_y_.nbDims, 0); + + axis_ = (axis_ == -1) ? dims_x_.nbDims - dims_y_.nbDims : axis_; + int trimed_nb_dims = dims_y_.nbDims; + for (; trimed_nb_dims > 0; --trimed_nb_dims) { + if (dims_y_.d[trimed_nb_dims - 1] != 1) { + break; + } + } + dims_y_.nbDims = trimed_nb_dims; + + PADDLE_ENFORCE_GE(dims_x_.nbDims, dims_y_.nbDims + axis_); + PADDLE_ENFORCE_LT(axis_, dims_x_.nbDims); + + prev_size_ = 1; + midd_size_ = 1; + post_size_ = 1; + for (int i = 0; i < axis_; ++i) { + prev_size_ *= dims_x_.d[i]; + } + + for (int i = 0; i < dims_y_.nbDims; ++i) { + PADDLE_ENFORCE_EQ(dims_x_.d[i + axis_], dims_y_.d[i], + "Broadcast dimension mismatch."); + midd_size_ *= dims_y_.d[i]; + } + + for (int i = axis_ + dims_y_.nbDims; i < dims_x_.nbDims; ++i) { + post_size_ *= dims_x_.d[i]; + } + return 0; +} + +int ElementWisePlugin::enqueue(int batch_size, const void* const* inputs, + void** outputs, void* workspace, + cudaStream_t stream) { + const float* x = reinterpret_cast(inputs[0]); + const float* y = reinterpret_cast(inputs[1]); + float* out = reinterpret_cast(outputs[0]); + + if (type_ == "add") { + details::ElementWise(details::Add(), x, y, out, batch_size, + prev_size_, midd_size_, post_size_, stream); + } else if (type_ == "mul") { + details::ElementWise(details::Mul(), x, y, out, batch_size, + prev_size_, midd_size_, post_size_, stream); + } else { + PADDLE_THROW("Not implemented."); + } + + return cudaGetLastError() != cudaSuccess; +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h new file mode 100644 index 00000000..3b040f14 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +class ElementWisePlugin : public PluginTensorRT { + public: + ElementWisePlugin(std::string type, nvinfer1::Dims const &dims_x, + nvinfer1::Dims const &dims_y, int axis) + : type_(type), + dims_x_(dims_x), + dims_y_(dims_y), + axis_(axis), + prev_size_(1), + midd_size_(1), + post_size_(1) {} + + ElementWisePlugin(void const *serial_data, size_t serial_length) { + deserializeBase(serial_data, serial_length); + const char *elementwise_type; + DeserializeValue(&serial_data, &serial_length, &elementwise_type); + type_ = std::string(elementwise_type); + DeserializeValue(&serial_data, &serial_length, &axis_); + DeserializeValue(&serial_data, &serial_length, &dims_x_); + DeserializeValue(&serial_data, &serial_length, &dims_y_); + } + + ElementWisePlugin *clone() const override { + // return new ElementWisePlugin(dims_x_, dims_y_, axis_); + return nullptr; + } + + const char *getPluginType() const override { return "elementwise_plugin"; } + + nvinfer1::Dims getOutputDimensions(int index, + const nvinfer1::Dims *input_dims, + int num_inputs) override; + + int initialize() override; + + // execute the layer + int enqueue(int batch_size, const void *const *inputs, void **outputs, + void *workspace, cudaStream_t stream); + + protected: + size_t getSerializationSize() override { + return SerializedSize(getPluginType()) + SerializedSize(axis_) + + SerializedSize(dims_x_) + SerializedSize(dims_y_) + + getBaseSerializationSize(); + } + + void serialize(void *buffer) override { + SerializeValue(&buffer, getPluginType()); + serializeBase(buffer); + SerializeValue(&buffer, type_.c_str()); + SerializeValue(&buffer, axis_); + SerializeValue(&buffer, dims_x_); + SerializeValue(&buffer, dims_y_); + } + + std::string type_; + nvinfer1::Dims dims_x_; + nvinfer1::Dims dims_y_; + int axis_; + int prev_size_; + int midd_size_; + int post_size_; +}; + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu new file mode 100644 index 00000000..b8a044fe --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu @@ -0,0 +1,82 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "glog/logging.h" +#include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" +#include "paddle/fluid/operators/math/prelu.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +PReluPlugin *CreatePreluPluginDeserialize(const void *buffer, size_t length) { + return new PReluPlugin(buffer, length); +} +REGISTER_TRT_PLUGIN("prelu_plugin", CreatePreluPluginDeserialize); + +int PReluPlugin::initialize() { + cudaMalloc(&p_gpu_weight_, sizeof(float) * weight_.size()); + cudaMemcpy(p_gpu_weight_, weight_.data(), weight_.size() * sizeof(float), + cudaMemcpyHostToDevice); +} + +nvinfer1::Dims PReluPlugin::getOutputDimensions(int index, + const nvinfer1::Dims *inputDims, + int nbInputs) { + assert(nbInputs == 1); + assert(index < this->getNbOutputs()); + nvinfer1::Dims const &input_dims = inputDims[0]; + nvinfer1::Dims output_dims = input_dims; + return output_dims; +} + +int PReluPlugin::enqueue(int batch_size, const void *const *inputs, + void **outputs, void *workspace, cudaStream_t stream) { + // input dims is CHW. + const auto &input_dims = this->getInputDims(0); + const float *input = reinterpret_cast(inputs[0]); + // const float *alpha = reinterpret_cast(alpha_.get().values); + const float *alpha = p_gpu_weight_; + float *output = reinterpret_cast(outputs)[0]; + + std::vector input_shape; + input_shape.push_back(batch_size); + for (int i = 0; i < input_dims.nbDims; i++) { + input_shape.push_back(input_dims.d[i]); + } + + if (mode_ == "channel") { + operators::math::PreluChannelWiseDirectCUDAFunctor + prelu_channel_wise; + prelu_channel_wise(stream, input, alpha, output, input_shape); + } else if (mode_ == "element") { + operators::math::PreluElementWiseDirectCUDAFunctor + prelu_element_wise; + prelu_element_wise(stream, input, alpha, output, input_shape); + } else { + operators::math::PreluScalarDirectCUDAFunctor prelu_scalar; + prelu_scalar(stream, input, alpha, output, input_shape); + } + return cudaGetLastError() != cudaSuccess; +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h new file mode 100644 index 00000000..a9664950 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h @@ -0,0 +1,87 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" + +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +class PReluPlugin : public PluginTensorRT { + std::vector weight_; + float *p_gpu_weight_; + std::string mode_; + + protected: + size_t getSerializationSize() override { + return getBaseSerializationSize() + SerializedSize(mode_.c_str()) + + SerializedSize(weight_) + SerializedSize(getPluginType()); + } + + // TRT will call this func when we need to serialize the configuration of + // tensorrt. + // It should not be called by users. + void serialize(void *buffer) override { + SerializeValue(&buffer, getPluginType()); + serializeBase(buffer); + SerializeValue(&buffer, weight_); + SerializeValue(&buffer, mode_.c_str()); + } + + public: + PReluPlugin(const float *weight, const int weight_num, + std::string const &mode) + : mode_(mode) { + weight_.resize(weight_num); + std::copy(weight, weight + weight_num, weight_.data()); + } + + // It was used for tensorrt deserialization. + // It should not be called by users. + PReluPlugin(void const *serialData, size_t serialLength) { + deserializeBase(serialData, serialLength); + DeserializeValue(&serialData, &serialLength, &weight_); + const char *prelu_mode; + DeserializeValue(&serialData, &serialLength, &prelu_mode); + mode_ = std::string(prelu_mode); + } + ~PReluPlugin() { cudaFree(p_gpu_weight_); } + int initialize() override; + + PReluPlugin *clone() const override { + return new PReluPlugin(weight_.data(), weight_.size(), mode_); + } + + const char *getPluginType() const override { return "prelu_plugin"; } + int getNbOutputs() const override { return 1; } + nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs, + int nbInputDims) override; + int enqueue(int batchSize, const void *const *inputs, void **outputs, + void *workspace, cudaStream_t stream) override; +}; + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu new file mode 100644 index 00000000..b5503c3b --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -0,0 +1,184 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +SplitPlugin* CreateSplitPluginDeserialize(const void* buffer, size_t length) { + return new SplitPlugin(buffer, length); +} +REGISTER_TRT_PLUGIN("split_plugin", CreateSplitPluginDeserialize); + +// copied from operators::math::SplitFunctor +template +__global__ void SplitKernel(const T* input_data, const int in_row, + const int in_col, const int* out_cols, + int out_cols_size, T** outputs_data) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + int curr_segment = 0; + int curr_offset = out_cols[0]; + for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) { + int curr_col_offset = out_cols[curr_segment + 1]; + while (curr_col_offset <= tid_x) { + curr_offset = curr_col_offset; + ++curr_segment; + curr_col_offset = out_cols[curr_segment + 1]; + } + + int local_col = tid_x - curr_offset; + int segment_width = curr_col_offset - curr_offset; + T* output_ptr = outputs_data[curr_segment]; + if (output_ptr != nullptr) { + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * segment_width + local_col] = + input_data[tid_y * in_col + tid_x]; + } + } +} + +template +__global__ void SplitKernel(const T* input_data, const int in_row, + const int in_col, const int fixed_out_col, + T** outputs_data) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) { + int split = tid_x / fixed_out_col; + int in_offset = tid_x - split * fixed_out_col; + T* output_ptr = outputs_data[split]; + if (output_ptr != nullptr) { + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * fixed_out_col + in_offset] = + input_data[tid_y * in_col + tid_x]; + } + } +} + +nvinfer1::Dims SplitPlugin::getOutputDimensions( + int index, const nvinfer1::Dims* input_dims, int num_inputs) { + PADDLE_ENFORCE_EQ(num_inputs, 1); + PADDLE_ENFORCE_LT(index, this->getNbOutputs()); + + nvinfer1::Dims output_dims = input_dims[0]; + output_dims.d[axis_] = output_length_.at(index); + return output_dims; +} + +int SplitPlugin::initialize() { + PADDLE_ENFORCE_LE(axis_, nvinfer1::Dims::MAX_DIMS); + // notice input dims is [C, H, W] + nvinfer1::Dims dims = this->getInputDims(0); + outer_rows_ = 1; + inner_cols_ = 1; + for (int i = 0; i < axis_; ++i) { + outer_rows_ *= dims.d[i]; + } + for (int i = axis_ + 1; i < dims.nbDims; ++i) { + inner_cols_ *= dims.d[i]; + } + same_shape_ = true; + std::vector segment_offsets(1, 0); + for (int i = 0; i < this->getNbOutputs(); ++i) { + if (output_length_[i] != output_length_[0]) { + same_shape_ = false; + } + segment_offsets.push_back(segment_offsets.back() + + output_length_[i] * inner_cols_); + } + inner_cols_ *= dims.d[axis_]; + d_segment_offsets_ = segment_offsets; + segment_offsets_ = std::move(segment_offsets); + d_output_ptrs_.resize(this->getNbOutputs(), nullptr); + return 0; +} + +template +inline void Split(cudaStream_t stream, const bool same_shape, + const int outer_rows, const int inner_cols, + const std::vector& segment_offsets, + const int* d_segment_offsets, const T* input, T** outputs) { + const int kThreadsPerBlock = 1024; + const int kMaxBlocks = 65535; + int block_cols = kThreadsPerBlock; + if (inner_cols < kThreadsPerBlock) { // block_cols is aligned by 32. + block_cols = ((inner_cols + 31) >> 5) << 5; + } + int block_rows = kThreadsPerBlock / block_cols; + dim3 block_size = dim3(block_cols, block_rows, 1); + + int grid_cols = + std::min((inner_cols + block_cols - 1) / block_cols, kMaxBlocks); + int grid_rows = + std::min(kMaxBlocks / grid_cols, std::max(outer_rows / block_rows, 1)); + dim3 grid_size = dim3(grid_cols, grid_rows, 1); + + if (same_shape) { + SplitKernel<<>>( + input, outer_rows, inner_cols, segment_offsets[1], outputs); + } else { + SplitKernel<<>>( + input, outer_rows, inner_cols, d_segment_offsets, + static_cast(segment_offsets.size()), outputs); + } +} + +int SplitPlugin::enqueue(int batchSize, const void* const* inputs, + void** outputs, void* workspace, cudaStream_t stream) { + float const* input_ptr = reinterpret_cast(inputs[0]); + if (((batchSize == 1 && axis_ == 0) || axis_ == -1) && + this->getNbOutputs() < 10) { + float** output_ptrs = reinterpret_cast(outputs); + int data_type_size = (this->getDataType() == nvinfer1::DataType::kFLOAT) + ? sizeof(float) + : sizeof(__half); + for (int i = 0; i < this->getNbOutputs(); ++i) { + PADDLE_ENFORCE( + cudaMemcpyAsync( + output_ptrs[i], input_ptr + segment_offsets_[i], + (segment_offsets_[i + 1] - segment_offsets_[i]) * data_type_size, + cudaMemcpyDeviceToDevice, stream) == cudaSuccess); + } + } else { + outer_rows_ *= batchSize; + const int* d_segment_offsets_ptr = + thrust::raw_pointer_cast(&d_segment_offsets_[0]); + float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs_[0]); + PADDLE_ENFORCE(cudaMemcpyAsync(output_ptrs, outputs, + this->getNbOutputs() * sizeof(float*), + cudaMemcpyHostToDevice, + stream) == cudaSuccess); + if (this->getDataType() == nvinfer1::DataType::kFLOAT) { + Split(stream, same_shape_, outer_rows_, inner_cols_, segment_offsets_, + d_segment_offsets_ptr, input_ptr, output_ptrs); + } else { + Split(stream, same_shape_, outer_rows_, inner_cols_, segment_offsets_, + d_segment_offsets_ptr, (__half*)input_ptr, // NOLINT + (__half**)output_ptrs); // NOLINT + } + } + return cudaGetLastError() != cudaSuccess; +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h new file mode 100644 index 00000000..cbb72590 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h @@ -0,0 +1,79 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +class SplitPlugin : public PluginTensorRT { + public: + SplitPlugin() {} + SplitPlugin(int axis, std::vector const &output_lengths) + : axis_(axis), same_shape_(true), output_length_(output_lengths) {} + + SplitPlugin(void const *serial_data, size_t serial_length) { + deserializeBase(serial_data, serial_length); + DeserializeValue(&serial_data, &serial_length, &axis_); + DeserializeValue(&serial_data, &serial_length, &output_length_); + } + + SplitPlugin *clone() const override { + return new SplitPlugin(axis_, output_length_); + } + + const char *getPluginType() const override { return "split_plugin"; } + int getNbOutputs() const override { return output_length_.size(); } + nvinfer1::Dims getOutputDimensions(int index, + const nvinfer1::Dims *input_dims, + int num_inputs) override; + + int initialize() override; + int enqueue(int batchSize, const void *const *inputs, void **outputs, + void *workspace, cudaStream_t stream) override; + + protected: + size_t getSerializationSize() override { + return SerializedSize(getPluginType()) + SerializedSize(axis_) + + SerializedSize(output_length_) + getBaseSerializationSize(); + } + + void serialize(void *buffer) override { + SerializeValue(&buffer, getPluginType()); + serializeBase(buffer); + SerializeValue(&buffer, axis_); + SerializeValue(&buffer, output_length_); + } + + int axis_; + int outer_rows_; + int inner_cols_; + bool same_shape_; + std::vector output_length_; + std::vector segment_offsets_; + thrust::device_vector d_segment_offsets_; + thrust::device_vector d_output_ptrs_; +}; + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc new file mode 100644 index 00000000..b0f4cff3 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc @@ -0,0 +1,61 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +void PluginTensorRT::serializeBase(void*& buffer) { + SerializeValue(&buffer, input_dims_); + SerializeValue(&buffer, max_batch_size_); + SerializeValue(&buffer, data_type_); + SerializeValue(&buffer, data_format_); +} + +void PluginTensorRT::deserializeBase(void const*& serial_data, + size_t& serial_length) { + DeserializeValue(&serial_data, &serial_length, &input_dims_); + DeserializeValue(&serial_data, &serial_length, &max_batch_size_); + DeserializeValue(&serial_data, &serial_length, &data_type_); + DeserializeValue(&serial_data, &serial_length, &data_format_); +} + +size_t PluginTensorRT::getBaseSerializationSize() { + return (SerializedSize(input_dims_) + SerializedSize(max_batch_size_) + + SerializedSize(data_type_) + SerializedSize(data_format_)); +} + +bool PluginTensorRT::supportsFormat(nvinfer1::DataType type, + nvinfer1::PluginFormat format) const { + return ((type == nvinfer1::DataType::kFLOAT) && + (format == nvinfer1::PluginFormat::kNCHW)); +} + +void PluginTensorRT::configureWithFormat( + const nvinfer1::Dims* input_dims, int num_inputs, + const nvinfer1::Dims* output_dims, int num_outputs, nvinfer1::DataType type, + nvinfer1::PluginFormat format, int max_batch_size) { + data_type_ = type; + data_format_ = format; + input_dims_.assign(input_dims, input_dims + num_inputs); + max_batch_size_ = max_batch_size; +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h new file mode 100644 index 00000000..3b737bd7 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h @@ -0,0 +1,118 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/profiler.h" + +DECLARE_bool(profile); + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +class PluginTensorRT; + +typedef std::function + PluginDeserializeFunc; + +typedef std::function PluginConstructFunc; + +class PluginTensorRT : public nvinfer1::IPluginExt { + public: + PluginTensorRT() {} + // It was used for TensorRT deserialization. + // It should not be called by users. + PluginTensorRT(const void* serialized_data, size_t length) {} + virtual ~PluginTensorRT() {} + + nvinfer1::Dims const& getInputDims(int index) const { + return input_dims_.at(index); + } + size_t getMaxBatchSize() const { return max_batch_size_; } + nvinfer1::DataType getDataType() const { return data_type_; } + nvinfer1::PluginFormat getDataFormat() const { return data_format_; } + virtual const char* getPluginVersion() const { return "1"; } + + void AddInput(nvinfer1::ITensor* input) { inputs_.push_back(input); } + std::vector& GetInputs() { return inputs_; } + + virtual nvinfer1::IPluginExt* clone() const = 0; + virtual const char* getPluginType() const = 0; + + // Following functions are inherit from nvinfer1::IPluginExt + // Get the number of outputs from the layer + int getNbOutputs() const { return 1; } + // Get the dimension of an output tensor + virtual nvinfer1::Dims getOutputDimensions(int index, + const nvinfer1::Dims* input_dims, + int num_inputs) = 0; + // Find the workspace size required by the layer + size_t getWorkspaceSize(int) const override { return 0; } + + // Initialize the layer for execution. + // This is called when the engine is created. + int initialize() override { return 0; } + // Shutdown the layer. This is called when the engine is destroyed + void terminate() override {} + // Execute the layer + virtual int enqueue(int batch_size, const void* const* inputs, void** outputs, + void* workspace, cudaStream_t stream) = 0; + + // Find the size of the serialization buffer required + virtual size_t getSerializationSize() = 0; + // Serialize the layer config to buffer. + // TensorRT will call this func to serialize the configuration of TensorRT + // engine. It should not be called by users. + virtual void serialize(void* buffer) = 0; + + // Check format support. The default is FLOAT32 and NCHW. + bool supportsFormat(nvinfer1::DataType type, + nvinfer1::PluginFormat format) const override; + // Configure the layer + void configureWithFormat(const nvinfer1::Dims* input_dims, int num_inputs, + const nvinfer1::Dims* output_dims, int num_outputs, + nvinfer1::DataType type, + nvinfer1::PluginFormat format, + int max_batch_size) override; + + protected: + // Deserialize input_dims, max_batch_size, data_type, data_format + void deserializeBase(void const*& serial_data, // NOLINT + size_t& serial_length); // NOLINT + size_t getBaseSerializationSize(); + // Serialize input_dims, max_batch_size, data_type, data_format + void serializeBase(void*& buffer); // NOLINT + + std::vector input_dims_; + size_t max_batch_size_; + nvinfer1::DataType data_type_; + nvinfer1::PluginFormat data_format_; + + std::vector inputs_; +}; + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc new file mode 100644 index 00000000..3c20b6d1 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc @@ -0,0 +1,48 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +PluginTensorRT* PluginFactoryTensorRT::createPlugin(const char* layer_name, + const void* serial_data, + size_t serial_length) { + const char* plugin_type; + DeserializeValue(&serial_data, &serial_length, &plugin_type); + + PADDLE_ENFORCE(Has(plugin_type), + "trt plugin type %s does not exists, check it.", plugin_type); + auto plugin = plugin_registry_[plugin_type](serial_data, serial_length); + owned_plugins_.emplace_back(plugin); + + return plugin; +} + +bool PluginFactoryTensorRT::RegisterPlugin( + const std::string& op_name, PluginDeserializeFunc deserialize_func) { + if (Has(op_name)) return false; + auto ret = plugin_registry_.emplace(op_name, deserialize_func); + return ret.second; +} + +void PluginFactoryTensorRT::DestroyPlugins() { owned_plugins_.clear(); } + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h new file mode 100644 index 00000000..139c7559 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h @@ -0,0 +1,78 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h" +#include "paddle/fluid/inference/utils/singleton.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +class PluginFactoryTensorRT : public nvinfer1::IPluginFactory, + public DeleteHelper { + public: + // Deserialization method + PluginTensorRT* createPlugin(const char* layer_name, const void* serial_data, + size_t serial_length) override; + + bool RegisterPlugin(const std::string& op_name, + PluginDeserializeFunc deserialize_func); + + bool Has(const std::string& op_name) { + return plugin_registry_.find(op_name) != plugin_registry_.end(); + } + + void DestroyPlugins(); + + protected: + std::unordered_map plugin_registry_; + + std::list> owned_plugins_; +}; + +class TrtPluginRegistrar { + public: + TrtPluginRegistrar(const std::string& name, + PluginDeserializeFunc deserialize_func) { + inference::Singleton::Global().RegisterPlugin( + name, deserialize_func); + } +}; + +#define REGISTER_TRT_PLUGIN(name, deserialize_func) \ + REGISTER_TRT_PLUGIN_UNIQ(__COUNTER__, name, deserialize_func) + +#define REGISTER_TRT_PLUGIN_UNIQ(ctr, name, deserialize_func) \ + static paddle::inference::tensorrt::plugin::TrtPluginRegistrar \ + trt_plugin_registrar##ctr __attribute__((unused)) = \ + paddle::inference::tensorrt::plugin::TrtPluginRegistrar( \ + name, deserialize_func) + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h new file mode 100644 index 00000000..1cae4cca --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h @@ -0,0 +1,134 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +// Some trt base classes lack of the destructor. +// We use a assisted class to fix this. +struct DeleteHelper { + protected: + virtual ~DeleteHelper() {} +}; + +template +inline void SerializeValue(void** buffer, T const& value); + +template +inline void DeserializeValue(void const** buffer, size_t* buffer_size, + T* value); + +namespace details { + +template +struct Serializer {}; + +template +struct Serializer::value || + std::is_enum::value || + std::is_pod::value>::type> { + static size_t SerializedSize(T const& value) { return sizeof(T); } + + static void Serialize(void** buffer, T const& value) { + std::memcpy(*buffer, &value, sizeof(T)); + reinterpret_cast(*buffer) += sizeof(T); + } + + static void Deserialize(void const** buffer, size_t* buffer_size, T* value) { + assert(*buffer_size >= sizeof(T)); + std::memcpy(value, *buffer, sizeof(T)); + reinterpret_cast(*buffer) += sizeof(T); + *buffer_size -= sizeof(T); + } +}; + +template <> +struct Serializer { + static size_t SerializedSize(const char* value) { return strlen(value) + 1; } + + static void Serialize(void** buffer, const char* value) { + std::strcpy(static_cast(*buffer), value); // NOLINT + reinterpret_cast(*buffer) += strlen(value) + 1; + } + + static void Deserialize(void const** buffer, size_t* buffer_size, + const char** value) { + *value = static_cast(*buffer); + size_t data_size = strnlen(*value, *buffer_size) + 1; + assert(*buffer_size >= data_size); + reinterpret_cast(*buffer) += data_size; + *buffer_size -= data_size; + } +}; + +template +struct Serializer, + typename std::enable_if::value || + std::is_enum::value || + std::is_pod::value>::type> { + static size_t SerializedSize(std::vector const& value) { + return sizeof(value.size()) + value.size() * sizeof(T); + } + + static void Serialize(void** buffer, std::vector const& value) { + SerializeValue(buffer, value.size()); + size_t nbyte = value.size() * sizeof(T); + std::memcpy(*buffer, value.data(), nbyte); + reinterpret_cast(*buffer) += nbyte; + } + + static void Deserialize(void const** buffer, size_t* buffer_size, + std::vector* value) { + size_t size; + DeserializeValue(buffer, buffer_size, &size); + value->resize(size); + size_t nbyte = value->size() * sizeof(T); + PADDLE_ENFORCE_GE(*buffer_size, nbyte); + std::memcpy(value->data(), *buffer, nbyte); + reinterpret_cast(*buffer) += nbyte; + *buffer_size -= nbyte; + } +}; + +} // namespace details + +template +inline size_t SerializedSize(T const& value) { + return details::Serializer::SerializedSize(value); +} + +template +inline void SerializeValue(void** buffer, T const& value) { + return details::Serializer::Serialize(buffer, value); +} + +template +inline void DeserializeValue(void const** buffer, size_t* buffer_size, + T* value) { + return details::Serializer::Deserialize(buffer, buffer_size, value); +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc new file mode 100644 index 00000000..a03dd45d --- /dev/null +++ b/paddle/fluid/inference/tensorrt/test_engine.cc @@ -0,0 +1,232 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include + +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class TensorRTEngineTest : public ::testing::Test { + protected: + void SetUp() override { + ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0)); + + engine_ = new TensorRTEngine(10, 1 << 10); + engine_->InitNetwork(); + } + + void TearDown() override { + if (engine_) { + delete engine_; + engine_ = nullptr; + } + } + + void PrepareInputOutput(const std::vector &input, + std::vector output_shape) { + TensorFromVector(input, *ctx_, &input_); + output_.Resize(framework::make_ddim(output_shape)); + } + + void GetOutput(std::vector *output) { + TensorToVector(output_, *ctx_, output); + } + + protected: + framework::Tensor input_; + framework::Tensor output_; + TensorRTEngine *engine_; + platform::CUDADeviceContext *ctx_; +}; + +TEST_F(TensorRTEngineTest, add_layer) { + const int size = 1; + + float raw_weight[size] = {2.}; // Weight in CPU memory. + float raw_bias[size] = {3.}; + + std::vector buffers(2); // TRT binded inputs + + LOG(INFO) << "create weights"; + TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, size); + TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, size); + auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, + nvinfer1::DimsCHW{1, 1, 1}); + auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, size, + weight.get(), bias.get()); + PADDLE_ENFORCE(fc_layer != nullptr); + + engine_->DeclareOutput(fc_layer, 0, "y"); + LOG(INFO) << "freeze network"; + engine_->FreezeNetwork(); + ASSERT_EQ(engine_->engine()->getNbBindings(), 2); + + // fill in real data + std::vector x_v = {1234}; + std::vector y_cpu; + PrepareInputOutput(x_v, {1}); + + auto *x_v_gpu_data = input_.mutable_data(ctx_->GetPlace()); + auto *y_gpu_data = output_.mutable_data(ctx_->GetPlace()); + + buffers[0] = reinterpret_cast(x_v_gpu_data); + buffers[1] = reinterpret_cast(y_gpu_data); + + LOG(INFO) << "to execute"; + engine_->Execute(1, &buffers, ctx_->stream()); + + LOG(INFO) << "to get output"; + GetOutput(&y_cpu); + + LOG(INFO) << "to checkout output"; + ASSERT_EQ(y_cpu[0], x_v[0] * 2 + 3); +} + +TEST_F(TensorRTEngineTest, add_layer_multi_dim) { + // Weight in CPU memory. + // It seems tensorrt FC use col-major: [[1.0, 3.3], [1.1, 4.4]] + // instead of row-major, which is [[1.0, 1.1], [3.3, 4.4]] + float raw_weight[4] = {1.0, 1.1, 3.3, 4.4}; + float raw_bias[2] = {1.3, 2.4}; + std::vector buffers(2); // TRT binded inputs + + TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 4); + TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 2); + auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, + nvinfer1::DimsCHW{1, 2, 1}); + auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, 2, + weight.get(), bias.get()); + PADDLE_ENFORCE(fc_layer != nullptr); + + engine_->DeclareOutput(fc_layer, 0, "y"); + engine_->FreezeNetwork(); + ASSERT_EQ(engine_->engine()->getNbBindings(), 2); + + // fill in real data + std::vector x_v = {1.0, 2.0}; + std::vector y_cpu; + PrepareInputOutput(x_v, {2}); + + auto *x_v_gpu_data = input_.mutable_data(ctx_->GetPlace()); + auto *y_gpu_data = output_.mutable_data(ctx_->GetPlace()); + + buffers[0] = reinterpret_cast(x_v_gpu_data); + buffers[1] = reinterpret_cast(y_gpu_data); + + engine_->Execute(1, &buffers, ctx_->stream()); + + LOG(INFO) << "to get output"; + GetOutput(&y_cpu); + + auto dims = engine_->GetITensor("y")->getDimensions(); + ASSERT_EQ(dims.nbDims, 3); + ASSERT_EQ(dims.d[0], 2); + ASSERT_EQ(dims.d[1], 1); + + ASSERT_EQ(y_cpu[0], 4.5); + ASSERT_EQ(y_cpu[1], 14.5); +} + +TEST_F(TensorRTEngineTest, test_conv2d) { + // Weight in CPU memory. + float raw_weight[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + float raw_bias[1] = {0}; + std::vector buffers(2); // TRT binded inputs + + TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 9); + TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 1); + auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, + nvinfer1::Dims3{1, 3, 3}); + auto *conv_layer = + TRT_ENGINE_ADD_LAYER(engine_, Convolution, *x, 1, nvinfer1::DimsHW{3, 3}, + weight.get(), bias.get()); + PADDLE_ENFORCE(conv_layer != nullptr); + conv_layer->setStride(nvinfer1::DimsHW{1, 1}); + conv_layer->setPadding(nvinfer1::DimsHW{1, 1}); + + engine_->DeclareOutput(conv_layer, 0, "y"); + engine_->FreezeNetwork(); + ASSERT_EQ(engine_->engine()->getNbBindings(), 2); + + // fill in real data + std::vector x_v = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + std::vector y_cpu; + PrepareInputOutput(x_v, {18}); + + auto *x_v_gpu_data = input_.mutable_data(ctx_->GetPlace()); + auto *y_gpu_data = output_.mutable_data(ctx_->GetPlace()); + + buffers[0] = reinterpret_cast(x_v_gpu_data); + buffers[1] = reinterpret_cast(y_gpu_data); + + engine_->Execute(2, &buffers, ctx_->stream()); + + LOG(INFO) << "to get output"; + GetOutput(&y_cpu); + + ASSERT_EQ(y_cpu[0], 4.0); + ASSERT_EQ(y_cpu[1], 6.0); +} + +TEST_F(TensorRTEngineTest, test_pool2d) { + // Weight in CPU memory. + auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, + nvinfer1::Dims3{1, 2, 2}); + + std::vector buffers(2); // TRT binded inputs + nvinfer1::PoolingType pool_t = nvinfer1::PoolingType::kAVERAGE; + auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *x, pool_t, + nvinfer1::DimsHW{2, 2}); + + PADDLE_ENFORCE(pool_layer != nullptr); + pool_layer->setStride(nvinfer1::DimsHW{1, 1}); + pool_layer->setPadding(nvinfer1::DimsHW{0, 0}); + + engine_->DeclareOutput(pool_layer, 0, "y"); + engine_->FreezeNetwork(); + ASSERT_EQ(engine_->engine()->getNbBindings(), 2); + + // fill in real data + std::vector x_v = {1.0, 2.0, 5.0, 0.0, 2.0, 3.0, 5.0, 10.0}; + std::vector y_cpu; + PrepareInputOutput(x_v, {2}); + + auto *x_v_gpu_data = input_.mutable_data(ctx_->GetPlace()); + auto *y_gpu_data = output_.mutable_data(ctx_->GetPlace()); + + buffers[0] = reinterpret_cast(x_v_gpu_data); + buffers[1] = reinterpret_cast(y_gpu_data); + + engine_->Execute(2, &buffers, ctx_->stream()); + + LOG(INFO) << "to get output"; + GetOutput(&y_cpu); + + ASSERT_EQ(y_cpu[0], 2.0); + ASSERT_EQ(y_cpu[1], 5.0); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/test_tensorrt.cc b/paddle/fluid/inference/tensorrt/test_tensorrt.cc new file mode 100644 index 00000000..a0753798 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/test_tensorrt.cc @@ -0,0 +1,155 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include "NvInfer.h" +#include "paddle/fluid/platform/dynload/tensorrt.h" + +namespace dy = paddle::platform::dynload; + +class Logger : public nvinfer1::ILogger { + public: + void log(nvinfer1::ILogger::Severity severity, const char* msg) override { + switch (severity) { + case Severity::kINFO: + LOG(INFO) << msg; + break; + case Severity::kWARNING: + LOG(WARNING) << msg; + break; + case Severity::kINTERNAL_ERROR: + case Severity::kERROR: + LOG(ERROR) << msg; + break; + default: + break; + } + } +}; + +class ScopedWeights { + public: + explicit ScopedWeights(float value) : value_(value) { + w.type = nvinfer1::DataType::kFLOAT; + w.values = &value_; + w.count = 1; + } + const nvinfer1::Weights& get() { return w; } + + private: + float value_; + nvinfer1::Weights w; +}; + +// The following two API are implemented in TensorRT's header file, cannot load +// from the dynamic library. So create our own implementation and directly +// trigger the method from the dynamic library. +nvinfer1::IBuilder* createInferBuilder(nvinfer1::ILogger* logger) { + return static_cast( + dy::createInferBuilder_INTERNAL(logger, NV_TENSORRT_VERSION)); +} +nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger* logger) { + return static_cast( + dy::createInferRuntime_INTERNAL(logger, NV_TENSORRT_VERSION)); +} + +const char* kInputTensor = "input"; +const char* kOutputTensor = "output"; + +// Creates a network to compute y = 2x + 3 +nvinfer1::IHostMemory* CreateNetwork() { + Logger logger; + // Create the engine. + nvinfer1::IBuilder* builder = createInferBuilder(&logger); + ScopedWeights weights(2.); + ScopedWeights bias(3.); + + nvinfer1::INetworkDefinition* network = builder->createNetwork(); + // Add the input + auto input = network->addInput(kInputTensor, nvinfer1::DataType::kFLOAT, + nvinfer1::DimsCHW{1, 1, 1}); + EXPECT_NE(input, nullptr); + // Add the hidden layer. + auto layer = network->addFullyConnected(*input, 1, weights.get(), bias.get()); + EXPECT_NE(layer, nullptr); + // Mark the output. + auto output = layer->getOutput(0); + output->setName(kOutputTensor); + network->markOutput(*output); + // Build the engine. + builder->setMaxBatchSize(1); + builder->setMaxWorkspaceSize(1 << 10); + auto engine = builder->buildCudaEngine(*network); + EXPECT_NE(engine, nullptr); + // Serialize the engine to create a model, then close. + nvinfer1::IHostMemory* model = engine->serialize(); + network->destroy(); + engine->destroy(); + builder->destroy(); + return model; +} + +void Execute(nvinfer1::IExecutionContext* context, const float* input, + float* output) { + const nvinfer1::ICudaEngine& engine = context->getEngine(); + // Two binds, input and output + ASSERT_EQ(engine.getNbBindings(), 2); + const int input_index = engine.getBindingIndex(kInputTensor); + const int output_index = engine.getBindingIndex(kOutputTensor); + // Create GPU buffers and a stream + void* buffers[2]; + ASSERT_EQ(0, cudaMalloc(&buffers[input_index], sizeof(float))); + ASSERT_EQ(0, cudaMalloc(&buffers[output_index], sizeof(float))); + cudaStream_t stream; + ASSERT_EQ(0, cudaStreamCreate(&stream)); + // Copy the input to the GPU, execute the network, and copy the output back. + ASSERT_EQ(0, cudaMemcpyAsync(buffers[input_index], input, sizeof(float), + cudaMemcpyHostToDevice, stream)); + context->enqueue(1, buffers, stream, nullptr); + ASSERT_EQ(0, cudaMemcpyAsync(output, buffers[output_index], sizeof(float), + cudaMemcpyDeviceToHost, stream)); + cudaStreamSynchronize(stream); + + // Release the stream and the buffers + cudaStreamDestroy(stream); + ASSERT_EQ(0, cudaFree(buffers[input_index])); + ASSERT_EQ(0, cudaFree(buffers[output_index])); +} + +TEST(TensorrtTest, BasicFunction) { + // Create the network serialized model. + nvinfer1::IHostMemory* model = CreateNetwork(); + + // Use the model to create an engine and an execution context. + Logger logger; + nvinfer1::IRuntime* runtime = createInferRuntime(&logger); + nvinfer1::ICudaEngine* engine = + runtime->deserializeCudaEngine(model->data(), model->size(), nullptr); + model->destroy(); + nvinfer1::IExecutionContext* context = engine->createExecutionContext(); + + // Execute the network. + float input = 1234; + float output; + Execute(context, &input, &output); + EXPECT_EQ(output, input * 2 + 3); + + // Destroy the engine. + context->destroy(); + engine->destroy(); + runtime->destroy(); +} diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc new file mode 100644 index 00000000..4a85c8b8 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc @@ -0,0 +1,147 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" +#include "glog/logging.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +// set the batch size before constructing the thread to execute engine +int TRTInt8Calibrator::getBatchSize() const { return batch_size_; } + +TRTInt8Calibrator::TRTInt8Calibrator( + const std::unordered_map& buffers, int batch_size, + std::string engine_name, const platform::Place place) + : batch_size_(batch_size), engine_name_(engine_name) { + int i = 0; + VLOG(4) << "Init a new calibrator: " << engine_name_; + for (const auto it : buffers) { + framework::Tensor temp_tensor; + std::string input_name = it.first; + int data_size = it.second; + int num_ele = data_size / sizeof(int16_t); + framework::DDim data_shape = framework::make_ddim({num_ele}); + temp_tensor.Resize(data_shape); + data_tensors_.push_back(temp_tensor); + data_buffers_[input_name] = std::pair( + static_cast(temp_tensor.mutable_data(place)), num_ele); + i += 1; + } +} + +TRTInt8Calibrator::TRTInt8Calibrator(const std::string& calib_data) + : batch_size_(0), + calib_running_(false), + data_is_set_(false), + done_(true), + calibration_table_(calib_data) {} + +void TRTInt8Calibrator::waitAndSetDone() { + std::unique_lock lk(mut_); + while ((calib_running_ || data_is_set_) && !done_) cond_.wait(lk); + if (!done_) { + done_ = true; + cond_.notify_all(); + } +} + +// There might be more than one input for trt subgraph, +// So, we use a map to store input information. +bool TRTInt8Calibrator::setBatch( + const std::unordered_map& data) { + VLOG(3) << "set batch: " << engine_name_; + std::unique_lock lk(mut_); + // There is a producer and a consumer. The producer set the batch data and + // the consumer get the batch data. The size of the data pool is one. + // So, the producer has to wait for the consumer to finish processing before + // they can set the data. + while ((calib_running_ || data_is_set_) && (!done_)) cond_.wait(lk); + // The done_ is set to true using waitAndSetDone, When all calibration data + // are processed. + if (done_) return false; + + // Sets the batch. + for (const auto& it : data) { + auto dataptr = data_buffers_.find(it.first); + if (dataptr == data_buffers_.end()) { + LOG(FATAL) << "FATAL " << engine_name_ << " input name '" << it.first + << "' does not match with the buffer names"; + } + const auto& d = dataptr->second; + PADDLE_ENFORCE( + cudaMemcpy(d.first, it.second, d.second, cudaMemcpyDeviceToDevice), + "Fail to cudaMemcpy %s for %s", engine_name_, it.first); + } + + data_is_set_ = true; + cond_.notify_all(); + return true; +} + +bool TRTInt8Calibrator::getBatch(void** bindings, const char** names, + int num_bindings) { + VLOG(4) << "get batch: " << engine_name_; + std::unique_lock lk(mut_); + // The consumer has just finished processing a data. + // The producer can set the data again. + calib_running_ = false; + cond_.notify_all(); + + // As long as there is data in the pool, the consumer can get it. + while (!data_is_set_ && !done_) cond_.wait(lk); + if (done_) return false; + + // Gets the batch + for (int i = 0; i < num_bindings; i++) { + auto it = data_buffers_.find(names[i]); + if (it == data_buffers_.end()) { + LOG(FATAL) << "Calibration engine asked for unknown tensor name '" + << names[i] << "' at position " << i; + } + bindings[i] = it->second.first; + } + + data_is_set_ = false; + calib_running_ = true; + VLOG(4) << "get batch done: " << engine_name_; + return true; +} + +void TRTInt8Calibrator::setDone() { + std::unique_lock lk(mut_); + done_ = true; + cond_.notify_all(); +} + +const void* TRTInt8Calibrator::readCalibrationCache(size_t& length) { + if (calibration_table_.empty()) return nullptr; + length = calibration_table_.size(); + return calibration_table_.data(); +} + +void TRTInt8Calibrator::writeCalibrationCache(const void* ptr, + std::size_t length) { + calibration_table_ = std::string((const char*)ptr, length); + VLOG(4) << "Got calibration data for " << engine_name_ << " " << ptr + << " length=" << length; +} +TRTInt8Calibrator::~TRTInt8Calibrator() { + VLOG(4) << "Destroying calibrator for " << engine_name_; +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h new file mode 100644 index 00000000..5815bc9a --- /dev/null +++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h @@ -0,0 +1,128 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include // NOLINT +#include +#include +#include +#include +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class TensorRTEngine; + +struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator { + public: + TRTInt8Calibrator(const std::unordered_map& buffers, + int batch_size, std::string engine_name, + const platform::Place place); + + explicit TRTInt8Calibrator(const std::string& calibration_data); + ~TRTInt8Calibrator(); + + int getBatchSize() const override; + + bool getBatch(void* bindings[], const char* names[], + int num_bindings) override; + + bool setBatch(const std::unordered_map& data); + void setDone(); + void waitAndSetDone(); + + const void* readCalibrationCache(std::size_t& length) override; + void writeCalibrationCache(const void* ptr, std::size_t length) override; + const std::string& getCalibrationTableAsString() { + return calibration_table_; + } + + private: + const int batch_size_; + + bool calib_running_{true}; + bool data_is_set_{false}; + bool done_{false}; + + std::mutex mut_; + std::condition_variable cond_; + + std::unordered_map> data_buffers_; + std::vector data_tensors_; + + std::string engine_name_; + std::string calibration_table_; +}; + +class TRTCalibratorEngine { + public: + TRTCalibratorEngine() {} + std::unique_ptr calib_; + std::unique_ptr thr_; + std::unique_ptr engine_; +}; +/* + * Manager to control the TensorRT Int8 calibration creation and deltetion. + */ +class TRTCalibratorEngineManager { + public: + bool Has() const { return res_.size() > 0; } + bool Has(const std::string& name) const { + if (res_.count(name) == 0) return false; + return res_.at(name).get() != nullptr; + } + + // Get Int8Calibrator via name + TRTCalibratorEngine* Get(const std::string& name) const { + return res_.at(name).get(); + } + + // Look up or create a calibrator. + TRTCalibratorEngine* LookupOrCreate(const std::string& engine_name) { + if (res_.count(engine_name) == 0) { + auto* p = new TRTCalibratorEngine; + res_[engine_name].reset(p); + } + return res_.at(engine_name).get(); + } + + // Create an Int8Calibrator + TRTCalibratorEngine* Create(const std::string& engine_name) { + auto* p = new TRTCalibratorEngine; + res_[engine_name].reset(p); + return p; + } + + void DeleteALL() { + for (auto& item : res_) { + item.second.reset(nullptr); + } + } + + private: + std::unordered_map> res_; +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt new file mode 100644 index 00000000..083e1bc5 --- /dev/null +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -0,0 +1,271 @@ +set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor benchmark) + +if(WITH_GPU AND TENSORRT_FOUND) + set(INFERENCE_EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps} ir_pass_manager analysis_predictor) +endif() + +function(download_data install_dir data_file) + if (NOT EXISTS ${install_dir}/${data_file}) + inference_download_and_uncompress(${install_dir} ${INFERENCE_URL} ${data_file}) + endif() +endfunction() + +function(download_int8_data install_dir data_file) + if (NOT EXISTS ${install_dir}/${data_file}) + inference_download_and_uncompress(${install_dir} ${INFERENCE_URL}/int8 ${data_file}) + endif() +endfunction() + +function(download_model_and_data install_dir model_name data_name) + download_data(${install_dir} ${model_name}) + download_data(${install_dir} ${data_name}) +endfunction() + +function(inference_analysis_api_test target install_dir filename) + inference_analysis_test(${target} SRCS ${filename} + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} benchmark + ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt) +endfunction() + +function(inference_analysis_api_int8_test_build TARGET_NAME filename) + inference_analysis_test_build(${TARGET_NAME} SRCS ${filename} + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} benchmark) +endfunction() + +function(inference_analysis_api_int8_test_run TARGET_NAME test_binary model_dir data_path) + inference_analysis_test_run(${TARGET_NAME} + COMMAND ${test_binary} + ARGS --infer_model=${model_dir}/model + --infer_data=${data_path} + --warmup_batch_size=100 + --batch_size=50 + --paddle_num_threads=${CPU_NUM_THREADS_ON_CI} + --iterations=2) +endfunction() + +function(inference_analysis_api_test_with_fake_data_build TARGET_NAME filename) + inference_analysis_test_build(${TARGET_NAME} SRCS ${filename} + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}) +endfunction() + +function(inference_analysis_api_test_with_fake_data_run TARGET_NAME test_binary model_dir disable_fc) + inference_analysis_test_run(${TARGET_NAME} + COMMAND ${test_binary} + ARGS --infer_model=${model_dir}/model + --disable_mkldnn_fc=${disable_fc}) +endfunction() + +function(inference_analysis_api_test_with_refer_result target install_dir filename) + inference_analysis_test(${target} SRCS ${filename} + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt + --refer_result=${install_dir}/result.txt) +endfunction() + +if(NOT APPLE AND WITH_MKLML) + # RNN1 + set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1") + download_model_and_data(${RNN1_INSTALL_DIR} "rnn1%2Fmodel.tar.gz" "rnn1%2Fdata.txt.tar.gz") + inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc) + + # seq_pool1 + set(SEQ_POOL1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_pool") + download_model_and_data(${SEQ_POOL1_INSTALL_DIR} "seq_pool1_model_.tar.gz" "seq_pool1_data.txt.tar.gz") + inference_analysis_api_test(test_analyzer_seq_pool1 ${SEQ_POOL1_INSTALL_DIR} analyzer_seq_pool1_tester.cc) +else() + # TODO: fix this test on MACOS and OPENBLAS, the reason is that + # fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS + message(WARNING "These tests has been disabled in OSX or WITH_MKL=OFF before being fixed: \n test_analyzer_rnn1") + message(WARNING "These tests has been disabled in OSX or WITH_MKL=OFF before being fixed: \n test_analyzer_seq_pool1") +endif() + + +# RNN2 +set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2") +download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz") +inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2_tester.cc) + +# TODO(luotao, Superjom) Disable DAM test, temporarily fix +# https://github.com/PaddlePaddle/Paddle/issues/15032#issuecomment-455990914. +# After inference framework refactor, will reopen it. +# normal DAM +set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam") +download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz") +#inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc EXTRA_DEPS legacy_allocator) + +# small DAM +set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam") +download_model_and_data(${DAM_SMALL_INSTALL_DIR} "dam_small_model.tar.gz" "dam_small_data.txt.tar.gz") +inference_analysis_test(test_analyzer_small_dam SRCS analyzer_dam_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt --max_turn_num=1) + +#save model +inference_analysis_api_test(test_analyzer_save_model ${DAM_SMALL_INSTALL_DIR} analyzer_save_model_tester.cc) + +# chinese_ner +set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner") +download_model_and_data(${CHINESE_NER_INSTALL_DIR} "chinese_ner_model.tar.gz" "chinese_ner-data.txt.tar.gz") +inference_analysis_api_test(test_analyzer_ner ${CHINESE_NER_INSTALL_DIR} analyzer_ner_tester.cc) + +# lac +set(LAC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lac") +download_model_and_data(${LAC_INSTALL_DIR} "lac_model.tar.gz" "lac_data.txt.tar.gz") +inference_analysis_api_test(test_analyzer_lac ${LAC_INSTALL_DIR} analyzer_lac_tester.cc) + +# MM DNN +set(MM_DNN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mm_dnn") +download_model_and_data(${MM_DNN_INSTALL_DIR} "MM_DNN_model.tar.gz" "MM_DNN_data.txt.tar.gz") +inference_analysis_api_test(test_analyzer_mm_dnn ${MM_DNN_INSTALL_DIR} analyzer_mm_dnn_tester.cc) + +# Pyramid DNN +set(PYRAMID_DNN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/pyramid_dnn") +download_model_and_data(${PYRAMID_DNN_INSTALL_DIR} "PyramidDNN_model.tar.gz" "PyramidDNN_data.txt.tar.gz") +inference_analysis_api_test(test_analyzer_pyramid_dnn ${PYRAMID_DNN_INSTALL_DIR} analyzer_pyramid_dnn_tester.cc) + +# text_classification +set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classification") +download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz") +inference_analysis_api_test(test_analyzer_text_classification ${TEXT_CLASSIFICATION_INSTALL_DIR} analyzer_text_classification_tester.cc) + +# seq_conv1 +set(SEQ_CONV1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_conv1") +download_model_and_data(${SEQ_CONV1_INSTALL_DIR} "seq_conv1_model.tar.gz" "seq_conv1_data.txt.tar.gz") +inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} analyzer_seq_conv1_tester.cc) + +# transformer, the dataset only works on batch_size=8 now +set(TRANSFORMER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/transformer") +download_model_and_data(${TRANSFORMER_INSTALL_DIR} "temp%2Ftransformer_model.tar.gz" "temp%2Ftransformer_data.txt.tar.gz") +inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8 + --paddle_num_threads=${CPU_NUM_THREADS_ON_CI}) + +# ocr +set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr") +if (NOT EXISTS ${OCR_INSTALL_DIR}) + inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos%2Focr.tar.gz") +endif() +inference_analysis_api_test_with_refer_result(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc) + +# mobilenet with transpose op +set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet") +if (NOT EXISTS ${MOBILENET_INSTALL_DIR}) + inference_download_and_uncompress(${MOBILENET_INSTALL_DIR} "http://paddlemodels.bj.bcebos.com/" "inference-vis-demos%2Fmobilenet.tar.gz") +endif() +inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc) + +### Image classification tests with fake data +set(IMG_CLASS_TEST_APP "test_analyzer_image_classification") +set(IMG_CLASS_TEST_APP_SRC "analyzer_image_classification_tester.cc") + +# build test binary to be used in subsequent tests +inference_analysis_api_test_with_fake_data_build(${IMG_CLASS_TEST_APP} ${IMG_CLASS_TEST_APP_SRC}) + +# googlenet +set(GOOGLENET_MODEL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/googlenet") +download_data(${GOOGLENET_MODEL_DIR} "googlenet.tar.gz") +inference_analysis_api_test_with_fake_data_run(test_analyzer_googlenet ${IMG_CLASS_TEST_APP} + ${GOOGLENET_MODEL_DIR} false) + +# resnet50 +set(RESNET50_MODEL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/resnet50") +download_data(${RESNET50_MODEL_DIR} "resnet50_model.tar.gz") +inference_analysis_api_test_with_fake_data_run(test_analyzer_resnet50 ${IMG_CLASS_TEST_APP} + ${RESNET50_MODEL_DIR} true) + +# mobilenet with depthwise_conv op +set(MOBILENET_MODEL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv") +download_data(${MOBILENET_MODEL_DIR} "mobilenet_model.tar.gz") +inference_analysis_api_test_with_fake_data_run(test_analyzer_mobilenet_depthwise_conv ${IMG_CLASS_TEST_APP} + ${MOBILENET_MODEL_DIR} false) + +### INT8 tests +if(WITH_MKLDNN) + + set(INT8_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/int8v2") + + ### Image classification tests + set(IMAGENET_DATA_PATH "${INT8_DATA_DIR}/data.bin") + set(INT8_IMG_CLASS_TEST_APP "test_analyzer_int8_image_classification") + set(INT8_IMG_CLASS_TEST_APP_SRC "analyzer_int8_image_classification_tester.cc") + + # download dataset if necessary + download_int8_data(${INT8_DATA_DIR} "imagenet_val_100_tail.tar.gz") + + # build test binary to be used in subsequent tests + inference_analysis_api_int8_test_build(${INT8_IMG_CLASS_TEST_APP} ${INT8_IMG_CLASS_TEST_APP_SRC}) + + # resnet50 int8 + set(INT8_RESNET50_MODEL_DIR "${INT8_DATA_DIR}/resnet50") + download_int8_data(${INT8_RESNET50_MODEL_DIR} "resnet50_int8_model.tar.gz" ) + inference_analysis_api_int8_test_run(test_analyzer_int8_resnet50 ${INT8_IMG_CLASS_TEST_APP} ${INT8_RESNET50_MODEL_DIR} ${IMAGENET_DATA_PATH}) + + # mobilenetv1 int8 + set(INT8_MOBILENETV1_MODEL_DIR "${INT8_DATA_DIR}/mobilenetv1") + download_int8_data(${INT8_MOBILENETV1_MODEL_DIR} "mobilenetv1_int8_model.tar.gz" ) + inference_analysis_api_int8_test_run(test_analyzer_int8_mobilenetv1 ${INT8_IMG_CLASS_TEST_APP} ${INT8_MOBILENETV1_MODEL_DIR} ${IMAGENET_DATA_PATH}) + + # mobilenetv2 int8 + set(INT8_MOBILENETV2_MODEL_DIR "${INT8_DATA_DIR}/mobilenetv2") + download_int8_data(${INT8_MOBILENETV2_MODEL_DIR} "mobilenet_v2_int8_model.tar.gz" ) + inference_analysis_api_int8_test_run(test_analyzer_int8_mobilenetv2 ${INT8_IMG_CLASS_TEST_APP} ${INT8_MOBILENETV2_MODEL_DIR} ${IMAGENET_DATA_PATH}) + + # resnet101 int8 + set(INT8_RESNET101_MODEL_DIR "${INT8_DATA_DIR}/resnet101") + download_int8_data(${INT8_RESNET101_MODEL_DIR} "Res101_int8_model.tar.gz" ) + inference_analysis_api_int8_test_run(test_analyzer_int8_resnet101 ${INT8_IMG_CLASS_TEST_APP} ${INT8_RESNET101_MODEL_DIR} ${IMAGENET_DATA_PATH}) + + # vgg16 int8 + set(INT8_VGG16_MODEL_DIR "${INT8_DATA_DIR}/vgg16") + download_int8_data(${INT8_VGG16_MODEL_DIR} "VGG16_int8_model.tar.gz" ) + inference_analysis_api_int8_test_run(test_analyzer_int8_vgg16 ${INT8_IMG_CLASS_TEST_APP} ${INT8_VGG16_MODEL_DIR} ${IMAGENET_DATA_PATH}) + + # vgg19 int8 + set(INT8_VGG19_MODEL_DIR "${INT8_DATA_DIR}/vgg19") + download_int8_data(${INT8_VGG19_MODEL_DIR} "VGG19_int8_model.tar.gz" ) + inference_analysis_api_int8_test_run(test_analyzer_int8_vgg19 ${INT8_IMG_CLASS_TEST_APP} ${INT8_VGG19_MODEL_DIR} ${IMAGENET_DATA_PATH}) + + # googlenet int8 + set(INT8_GOOGLENET_MODEL_DIR "${INT8_DATA_DIR}/googlenet") + download_int8_data(${INT8_GOOGLENET_MODEL_DIR} "GoogleNet_int8_model.tar.gz" ) + inference_analysis_api_int8_test_run(test_analyzer_int8_googlenet ${INT8_IMG_CLASS_TEST_APP} ${INT8_GOOGLENET_MODEL_DIR} ${IMAGENET_DATA_PATH}) + + ### Object detection models + set(PASCALVOC_DATA_PATH "${INT8_DATA_DIR}/pascalvoc_val_head_100.bin") + set(INT8_OBJ_DETECT_TEST_APP "test_analyzer_int8_object_detection") + set(INT8_OBJ_DETECT_TEST_APP_SRC "analyzer_int8_object_detection_tester.cc") + + # download dataset if necessary + download_int8_data(${INT8_DATA_DIR} "pascalvoc_val_head_100.tar.gz") + + # build test binary to be used in subsequent tests + inference_analysis_api_int8_test_build(${INT8_OBJ_DETECT_TEST_APP} ${INT8_OBJ_DETECT_TEST_APP_SRC}) + + # mobilenet-ssd int8 + set(INT8_MOBILENET_SSD_MODEL_DIR "${INT8_DATA_DIR}/mobilenet-ssd") + download_int8_data(${INT8_MOBILENET_SSD_MODEL_DIR} "mobilenet_ssd_int8_model.tar.gz" ) + inference_analysis_api_int8_test_run(test_analyzer_int8_mobilenet_ssd ${INT8_OBJ_DETECT_TEST_APP} ${INT8_MOBILENET_SSD_MODEL_DIR} ${PASCALVOC_DATA_PATH}) + +endif() + +# bert, max_len=20, embedding_dim=128 +set(BERT_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/bert_emb128") +download_model_and_data(${BERT_INSTALL_DIR} "bert_emb128_model.tar.gz" "bert_data_len20.txt.tar.gz") +inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR} analyzer_bert_tester.cc) + +if(WITH_GPU AND TENSORRT_FOUND) + set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt") + if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR}) + inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz") + endif() + inference_analysis_test(trt_mobilenet_test SRCS trt_mobilenet_test.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models) + inference_analysis_test(trt_resnet50_test SRCS trt_resnet50_test.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models) + inference_analysis_test(trt_resnext_test SRCS trt_resnext_test.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models) +endif() diff --git a/paddle/fluid/inference/tests/api/anakin_mlu_tester.cc b/paddle/fluid/inference/tests/api/anakin_mlu_tester.cc new file mode 100644 index 00000000..8094c744 --- /dev/null +++ b/paddle/fluid/inference/tests/api/anakin_mlu_tester.cc @@ -0,0 +1,98 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/api/paddle_inference_api.h" + +DEFINE_string(model, "", "Directory of the inference model."); + +namespace paddle { + +contrib::AnakinConfig Config() { + // Determine the use of memory here. + std::map> init_inputs_shape; + init_inputs_shape["input_0"] = std::vector({1, 3, 112, 112}); + + contrib::AnakinConfig config; + config.target_type = contrib::AnakinConfig::MLU; + config.model_file = FLAGS_model; + config.init_inputs_shape = init_inputs_shape; + + // Determine the device execution context. + config.device_id = 0; + config.data_stream_id = 0; + config.compute_stream_id = 0; + + // Set re_allocable and op_fuse TRUE. + config.re_allocable = true; + config.op_fuse = true; + + return config; +} + +void single_test() { + // 1. Defining basic data structures. + auto config = paddle::Config(); + auto predictor = + paddle::CreatePaddlePredictor(config); + + // 2. Define the data structure of the predictor inputs and outputs. + std::vector input_tensors; + std::vector output_tensors; + + // 3. Define and fill the inputs tensor. + int num = 1; + int channel = 3; + int height = 112; + int width = 112; + std::vector input(num * channel * height * width, 1); + std::vector> inputs({input}); + const std::vector input_names{"input_0"}; + for (auto& name : input_names) { + paddle::PaddleTensor tensor; + tensor.name = name; + tensor.dtype = PaddleDType::FLOAT32; + input_tensors.push_back(tensor); + } + for (size_t j = 0; j < input_tensors.size(); j++) { + input_tensors[j].data = + paddle::PaddleBuf(&inputs[j][0], inputs[j].size() * sizeof(float)); + // The shape of each execution can be changed. + input_tensors[j].shape = std::vector({num, channel, height, width}); + } + + // 4. Set the output placeholder of predictor. + PaddleTensor predict_out, score_out; + predict_out.name = "landmark_predict_out"; + score_out.name = "landmark_score_out"; + output_tensors.push_back(predict_out); + output_tensors.push_back(score_out); + + // 5. Execution predict. + predictor->Run(input_tensors, &output_tensors); + + // 6. Take out the output data. + for (auto out : output_tensors) { + float* data_o = static_cast(out.data.data()); + LOG(INFO) << out.name << " size = " << out.data.length() / sizeof(float); + } +} +} // namespace paddle + +int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + paddle::single_test(); + return 0; +} diff --git a/paddle/fluid/inference/tests/api/anakin_rnn2_tester.cc b/paddle/fluid/inference/tests/api/anakin_rnn2_tester.cc new file mode 100644 index 00000000..27abaa53 --- /dev/null +++ b/paddle/fluid/inference/tests/api/anakin_rnn2_tester.cc @@ -0,0 +1,261 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" + +#define BUFFER_SIZE (10000) +#define COMPARE_OUTPUTS (1) +#define PRINT_INPUTS (0) + +DEFINE_string(model, "", "Directory of the inference model."); +DEFINE_string(datapath, "", "Path of the dataset."); +DEFINE_string(truthpath, "", "Path of the dataset."); +DEFINE_int32(batch_size, 1, "Batch size per execution."); +DEFINE_int32(repeats, 1, "Number of iterations."); +DEFINE_int32( + start_line, 0, + "The starting line of the text file read (this line will be read)."); +DEFINE_int32(end_line, 1000000, + "The ending line of the text file read (this line will be read)."); +DEFINE_int32(init_batch_size, 40, + "Max batch size for Anakin memory allocation."); +DEFINE_int32(threads_num, 2, "Threads num for Anakin."); + +class Data { + public: + Data(std::string file_name, size_t batch_size, size_t start = 0, + size_t end = 1000000) + : _batch_size(batch_size), _total_length(0), _inputs_size(6) { + _file.open(file_name); + _file.seekg(_file.end); + _total_length = _file.tellg(); + _file.seekg(_file.beg); + read_file_to_vec(start, end); + reset_current_line(); + } + void reset_current_line(); + const std::vector& get_lines(); + void read_file_to_vec(const size_t start, const size_t end); + int get_next_batches(std::vector>* inputs, + std::vector>* seq_offsets); + + private: + std::fstream _file; + int _batch_size; + size_t _total_length; + size_t _inputs_size; + std::vector _lines; + size_t _current_line; +}; + +void Data::read_file_to_vec(const size_t start, const size_t end) { + std::string line; + size_t count = 0; + _lines.clear(); + while (std::getline(_file, line)) { + if (count >= start && count <= end) { + _lines.push_back(line); + } + count++; + } +} + +const std::vector& Data::get_lines() { return _lines; } + +void Data::reset_current_line() { _current_line = 0; } + +int Data::get_next_batches(std::vector>* data, + std::vector>* offsets) { + data->clear(); + offsets->clear(); + data->resize(_inputs_size); + offsets->resize(_inputs_size); + for (auto& offset : *offsets) { + offset.push_back(0); + } + + int seq_num = -1; + int pre_query_index = -1; + while (_current_line < _lines.size()) { + int cur_query_index = -1; + std::vector line; + paddle::inference::split(_lines[_current_line], ';', &line); + for (size_t i = 0; i < line.size(); i++) { + std::vector float_v; + paddle::inference::split_to_float(line[i], ' ', &float_v); + if (i == 0) { + cur_query_index = float_v[0]; + if (pre_query_index != -1 && cur_query_index != pre_query_index) { + return seq_num; + } + seq_num++; + _current_line++; + } else { + if (float_v.size() == 0) { + float_v.push_back(-1); + } + (*data)[i - 1].insert((*data)[i - 1].end(), float_v.begin(), + float_v.end()); + (*offsets)[i - 1].push_back((*offsets)[i - 1][seq_num] + + float_v.size()); + } + } + if (seq_num + 1 >= _batch_size) { + return seq_num; + } else { + pre_query_index = cur_query_index; + } + } + return seq_num; +} + +namespace paddle { + +contrib::AnakinConfig GetConfig() { + contrib::AnakinConfig config; + + std::map> init_inputs_shape; + init_inputs_shape["q_basic"] = std::vector({1000, 1, 1, 1}); + init_inputs_shape["q_bigram0"] = std::vector({1000, 1, 1, 1}); + init_inputs_shape["pt_basic"] = std::vector({2000, 1, 1, 1}); + init_inputs_shape["pa_basic"] = std::vector({4000, 1, 1, 1}); + init_inputs_shape["pa_bigram0"] = std::vector({4000, 1, 1, 1}); + init_inputs_shape["pt_bigram0"] = std::vector({2000, 1, 1, 1}); + + // using AnakinConfig::X86 if you need to use cpu to do inference + config.target_type = contrib::AnakinConfig::NVGPU; + config.model_file = FLAGS_model; + config.device_id = 0; + config.init_batch_size = FLAGS_init_batch_size; + config.init_inputs_shape = init_inputs_shape; + config.re_allocable = false; + return config; +} + +void single_test(PaddlePredictor* predictor_master) { + auto predictor = predictor_master->Clone(); + + Data data(FLAGS_datapath, FLAGS_batch_size, FLAGS_start_line, FLAGS_end_line); + + std::vector> inputs; + std::vector> seq_offsets; + std::vector compare_outputs; + + const std::vector input_names{"q_basic", "q_bigram0", + "pt_basic", "pt_bigram0", + "pa_basic", "pa_bigram0"}; + std::vector input_tensors; + std::vector output_tensors; + for (auto& name : input_names) { + PaddleTensor tensor; + tensor.name = name; + tensor.dtype = PaddleDType::FLOAT32; + input_tensors.push_back(tensor); + } + + PaddleTensor tensor_out; + tensor_out.name = "save_infer_model/scale_0"; + tensor_out.shape = std::vector({}); + tensor_out.data = PaddleBuf(); + tensor_out.dtype = PaddleDType::FLOAT32; + output_tensors.push_back(tensor_out); + + inference::Timer timer; + for (int i = 0; i < FLAGS_repeats; i++) { + data.reset_current_line(); + size_t count = 0; + float time_sum = 0; + while (data.get_next_batches(&inputs, &seq_offsets) >= 0) { +#if PRINT_INPUTS + for (size_t i = 0; i < inputs.size(); i++) { + LOG(INFO) << "data " << i; + for (size_t j = 0; j < inputs[i].size(); j++) { + LOG(INFO) << j << ": " << inputs[i][j]; + } + for (auto j : seq_offsets[i]) { + LOG(INFO) << "offsets: " << i << ": " << j; + } + } +#endif + for (size_t j = 0; j < input_tensors.size(); j++) { + input_tensors[j].data = + PaddleBuf(&inputs[j][0], inputs[j].size() * sizeof(float)); + input_tensors[j].lod = + std::vector>({seq_offsets[j]}); + input_tensors[j].shape = + std::vector({static_cast(inputs[j].size()), 1, 1, 1}); + } + timer.tic(); + predictor->Run(input_tensors, &output_tensors); + float time = timer.toc(); +#if COMPARE_OUTPUTS + float* data_o = static_cast(output_tensors[0].data.data()); + LOG(INFO) << "outputs[0].data.size() = " + << output_tensors[0].data.length() / sizeof(float); + size_t sum = 1; + for_each(output_tensors[0].shape.begin(), output_tensors[0].shape.end(), + [&](int n) { sum *= n; }); + for (size_t j = 0; j < sum; ++j) { + LOG(INFO) << "output[" << j << "]: " << data_o[j]; + compare_outputs.push_back(data_o[j]); + } +#endif + LOG(INFO) << "Single Time: " << time; + count++; + if (count > 10) { + time_sum += timer.toc(); + } + } + inference::PrintTime(FLAGS_batch_size, FLAGS_repeats, 1, 0, + time_sum / (count - 10)); +#if COMPARE_OUTPUTS + Data data(FLAGS_truthpath, 1); + const std::vector truth_vals = data.get_lines(); + for (size_t j = 0; j < truth_vals.size(); j++) { + float truth = std::atof(truth_vals[j].c_str()); + float compa = compare_outputs[j]; + float diff = std::abs(truth - compa); + LOG(INFO) << "[DIFF " << j << " ] " << diff; + if (diff > 0.0001) { + LOG(FATAL) << "The result is wrong!"; + } + } + LOG(INFO) << "The result is correct!"; +#endif + } +} +} // namespace paddle + +int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + std::vector threads; + + auto config = paddle::GetConfig(); + config.data_stream_id = 0; + config.compute_stream_id = 0; + std::unique_ptr predictor_master = + paddle::CreatePaddlePredictor(config); + + for (int i = 0; i < FLAGS_threads_num; i++) { + threads.push_back(std::thread(paddle::single_test, predictor_master.get())); + } + for (auto& t : threads) { + t.join(); + } + return 0; +} diff --git a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc new file mode 100644 index 00000000..406c028a --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc @@ -0,0 +1,296 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/transfer_scope_cache.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +using paddle::PaddleTensor; + +template +void GetValueFromStream(std::stringstream *ss, T *t) { + (*ss) >> (*t); +} + +template <> +void GetValueFromStream(std::stringstream *ss, std::string *t) { + *t = ss->str(); +} + +// Split string to vector +template +void Split(const std::string &line, char sep, std::vector *v) { + std::stringstream ss; + T t; + for (auto c : line) { + if (c != sep) { + ss << c; + } else { + GetValueFromStream(&ss, &t); + v->push_back(std::move(t)); + ss.str({}); + ss.clear(); + } + } + + if (!ss.str().empty()) { + GetValueFromStream(&ss, &t); + v->push_back(std::move(t)); + ss.str({}); + ss.clear(); + } +} + +// Parse tensor from string +template +bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) { + std::vector data; + Split(field, ':', &data); + if (data.size() < 2) return false; + + std::string shape_str = data[0]; + + std::vector shape; + Split(shape_str, ' ', &shape); + + std::string mat_str = data[1]; + + std::vector mat; + Split(mat_str, ' ', &mat); + + tensor->shape = shape; + auto size = + std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()) * + sizeof(T); + tensor->data.Resize(size); + std::copy(mat.begin(), mat.end(), static_cast(tensor->data.data())); + tensor->dtype = GetPaddleDType(); + + return true; +} + +// Parse input tensors from string +bool ParseLine(const std::string &line, + std::vector *tensors) { + std::vector fields; + Split(line, ';', &fields); + + if (fields.size() < 5) return false; + + tensors->clear(); + tensors->reserve(5); + + int i = 0; + // src_id + paddle::PaddleTensor src_id; + ParseTensor(fields[i++], &src_id); + tensors->push_back(src_id); + + // pos_id + paddle::PaddleTensor pos_id; + ParseTensor(fields[i++], &pos_id); + tensors->push_back(pos_id); + + // segment_id + paddle::PaddleTensor segment_id; + ParseTensor(fields[i++], &segment_id); + tensors->push_back(segment_id); + + // self_attention_bias + paddle::PaddleTensor self_attention_bias; + ParseTensor(fields[i++], &self_attention_bias); + tensors->push_back(self_attention_bias); + + // next_segment_index + paddle::PaddleTensor next_segment_index; + ParseTensor(fields[i++], &next_segment_index); + tensors->push_back(next_segment_index); + + return true; +} + +bool LoadInputData(std::vector> *inputs) { + if (FLAGS_infer_data.empty()) { + LOG(ERROR) << "please set input data path"; + return false; + } + + std::ifstream fin(FLAGS_infer_data); + std::string line; + int sample = 0; + + // The unit-test dataset only have 10 samples, each sample have 5 feeds. + while (std::getline(fin, line)) { + std::vector feed_data; + ParseLine(line, &feed_data); + inputs->push_back(std::move(feed_data)); + sample++; + if (!FLAGS_test_all_data && sample == FLAGS_batch_size) break; + } + LOG(INFO) << "number of samples: " << sample; + + return true; +} + +void SetConfig(AnalysisConfig *config) { config->SetModel(FLAGS_infer_model); } + +void profile(bool use_mkldnn = false, bool use_ngraph = false) { + AnalysisConfig config; + SetConfig(&config); + + if (use_mkldnn) { + config.EnableMKLDNN(); + config.pass_builder()->AppendPass("fc_mkldnn_pass"); + } + + if (use_ngraph) { + config.EnableNgraph(); + } + + std::vector> outputs; + std::vector> inputs; + LoadInputData(&inputs); + TestPrediction(reinterpret_cast(&config), + inputs, &outputs, FLAGS_num_threads); +} + +TEST(Analyzer_bert, profile) { profile(); } +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_bert, profile_mkldnn) { profile(true, false); } +#endif + +#ifdef PADDLE_WITH_NGRAPH +TEST(Analyzer_bert, profile_ngraph) { profile(false, true); } +#endif + +// Check the fuse status +TEST(Analyzer_bert, fuse_statis) { + AnalysisConfig cfg; + SetConfig(&cfg); + int num_ops; + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); + LOG(INFO) << "num_ops: " << num_ops; +} + +// Compare result of NativeConfig and AnalysisConfig +void compare(bool use_mkldnn = false, bool use_ngraph = false) { + AnalysisConfig cfg; + SetConfig(&cfg); + if (use_mkldnn) { + cfg.EnableMKLDNN(); + cfg.pass_builder()->AppendPass("fc_mkldnn_pass"); + } + + if (use_ngraph) { + cfg.EnableNgraph(); + } + + std::vector> inputs; + LoadInputData(&inputs); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), inputs); +} + +TEST(Analyzer_bert, compare) { compare(); } +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_bert, compare_mkldnn) { + compare(true, false /* use_mkldnn, no use_ngraph */); +} +#endif + +#ifdef PADDLE_WITH_NGRAPH +TEST(Analyzer_bert, compare_ngraph) { + compare(false, true /* no use_mkldnn, use_ngraph */); +} +#endif + +// Compare Deterministic result +TEST(Analyzer_bert, compare_determine) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> inputs; + LoadInputData(&inputs); + CompareDeterministic(reinterpret_cast(&cfg), + inputs); +} + +void verify_transfer_scope_cache(bool is_static = false) { + AnalysisConfig config; + SetConfig(&config); + + std::vector input, output; + auto predictor = CreatePaddlePredictor(config); + + int threads_num = 10; + std::vector threads; + std::unordered_set *> + global_transfer_scope_cache; + std::unordered_set *> + global_transfer_data_cache; + + std::ifstream fin(FLAGS_infer_data); + std::string line; + + for (int i = 0; i < threads_num; i++) { + threads.emplace_back([&, i]() { + std::getline(fin, line); + ParseLine(line, &input); +#ifdef PADDLE_WITH_MKLDNN + // Use static method to handle transfer_scope_cache() + // TODO(intel) explicit session id setting will be deprecated. + if (is_static) platform::set_cur_mkldnn_session_id(1); +#endif + predictor->Run(input, &output, FLAGS_batch_size); + global_transfer_scope_cache.insert( + &paddle::framework::global_transfer_scope_cache()); + global_transfer_data_cache.insert( + &paddle::framework::global_transfer_data_cache()); + }); + threads[0].join(); + threads.clear(); + std::vector().swap(input); + } +#ifdef PADDLE_WITH_MKLDNN + if (is_static) { + // Use static method to do transfer_scope_cache() instead of thread_local + // so paddle::framework::global_transfer_data_cache() should be 1 + PADDLE_ENFORCE(global_transfer_scope_cache.size(), 1); + PADDLE_ENFORCE(global_transfer_data_cache.size(), 1); + } else { +#endif + // Since paddle::framework::global_transfer_scope_cache() and + // paddle::framework::global_transfer_data_cache() are thread_local, + // their pointer should be different among different thread id. + PADDLE_ENFORCE(global_transfer_scope_cache.size(), threads_num); + PADDLE_ENFORCE(global_transfer_data_cache.size(), threads_num); +#ifdef PADDLE_WITH_MKLDNN + } +#endif +} + +TEST(Analyzer_bert, threadlocal_transfer_scope_cache) { + verify_transfer_scope_cache(); +} +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_bert, static_transfer_scope_cache) { + verify_transfer_scope_cache(true); +} +#endif +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc new file mode 100644 index 00000000..83bf99ec --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc @@ -0,0 +1,360 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +DEFINE_int32(max_turn_num, 9, + "The max turn number: 1 for the small and 9 for the normal."); + +namespace paddle { +namespace inference { + +constexpr int32_t kMaxTurnLen = 50; + +static std::vector result_data; + +struct DataRecord { + std::vector> *turns; + std::vector> *turns_mask; + std::vector> response; // response data : 1 + std::vector> response_mask; // response mask data : 1 + size_t batch_iter{0}; + size_t batch_size{1}; + size_t num_samples; // total number of samples + + DataRecord() { + turns = new std::vector>[FLAGS_max_turn_num]; // turns data : FLAGS_max_turn_num + turns_mask = new std::vector>[FLAGS_max_turn_num]; // turns mask data : FLAGS_max_turn_num + } + + explicit DataRecord(const std::string &path, int batch_size = 1) + : DataRecord() { + this->batch_size = batch_size; + Load(path); + } + + ~DataRecord() { + delete[] turns; + delete[] turns_mask; + } + + DataRecord NextBatch() { + DataRecord data; + size_t batch_end = batch_iter + batch_size; + // NOTE skip the final batch, if no enough data is provided. + if (batch_end <= response.size()) { + for (int i = 0; i < FLAGS_max_turn_num; ++i) { + data.turns[i].assign(turns[i].begin() + batch_iter, + turns[i].begin() + batch_end); + } + for (int i = 0; i < FLAGS_max_turn_num; ++i) { + data.turns_mask[i].assign(turns_mask[i].begin() + batch_iter, + turns_mask[i].begin() + batch_end); + } + data.response.assign(response.begin() + batch_iter, + response.begin() + batch_end); + data.response_mask.assign(response_mask.begin() + batch_iter, + response_mask.begin() + batch_end); + CHECK(!data.response.empty()); + CHECK(!data.response_mask.empty()); + CHECK_EQ(data.response.size(), data.response_mask.size()); + } + batch_iter += batch_size; + return data; + } + + void Load(const std::string &path) { + std::ifstream file(path); + std::string line; + size_t num_lines = 0; + result_data.clear(); + while (std::getline(file, line)) { + num_lines++; + std::vector data; + split(line, ',', &data); + CHECK_EQ(data.size(), (size_t)(2 * FLAGS_max_turn_num + 3)); + // load turn data + std::vector turns_tmp[FLAGS_max_turn_num]; + for (int i = 0; i < FLAGS_max_turn_num; ++i) { + split_to_int64(data[i], ' ', &turns_tmp[i]); + turns[i].push_back(std::move(turns_tmp[i])); + } + // load turn_mask data + std::vector turns_mask_tmp[FLAGS_max_turn_num]; + for (int i = 0; i < FLAGS_max_turn_num; ++i) { + split_to_float(data[FLAGS_max_turn_num + i], ' ', &turns_mask_tmp[i]); + turns_mask[i].push_back(std::move(turns_mask_tmp[i])); + } + // load response data + std::vector response_tmp; + split_to_int64(data[2 * FLAGS_max_turn_num], ' ', &response_tmp); + response.push_back(std::move(response_tmp)); + // load response_mask data + std::vector response_mask_tmp; + split_to_float(data[2 * FLAGS_max_turn_num + 1], ' ', &response_mask_tmp); + response_mask.push_back(std::move(response_mask_tmp)); + // load result data + float result_tmp; + result_tmp = std::stof(data[2 * FLAGS_max_turn_num + 2]); + result_data.push_back(result_tmp); + } + num_samples = num_lines; + } +}; + +void PrepareInputs(std::vector *input_slots, DataRecord *data, + int batch_size) { + PaddleTensor turns_tensor[FLAGS_max_turn_num]; + PaddleTensor turns_mask_tensor[FLAGS_max_turn_num]; + PaddleTensor response_tensor; + PaddleTensor response_mask_tensor; + std::string turn_pre = "turn_"; + std::string turn_mask_pre = "turn_mask_"; + + auto one_batch = data->NextBatch(); + PADDLE_ENFORCE(!one_batch.response.empty()); + int size = one_batch.response[0].size(); + CHECK_EQ(size, kMaxTurnLen); + // turn tensor assignment + for (int i = 0; i < FLAGS_max_turn_num; ++i) { + turns_tensor[i].name = turn_pre + std::to_string(i); + turns_tensor[i].shape.assign({batch_size, size, 1}); + turns_tensor[i].dtype = PaddleDType::INT64; + TensorAssignData(&turns_tensor[i], one_batch.turns[i]); + } + // turn mask tensor assignment + for (int i = 0; i < FLAGS_max_turn_num; ++i) { + turns_mask_tensor[i].name = turn_mask_pre + std::to_string(i); + turns_mask_tensor[i].shape.assign({batch_size, size, 1}); + turns_mask_tensor[i].dtype = PaddleDType::FLOAT32; + TensorAssignData(&turns_mask_tensor[i], one_batch.turns_mask[i]); + } + // response tensor assignment + response_tensor.name = "response"; + response_tensor.shape.assign({batch_size, size, 1}); + response_tensor.dtype = PaddleDType::INT64; + TensorAssignData(&response_tensor, one_batch.response); + // response mask tensor assignment + response_mask_tensor.name = "response_mask"; + response_mask_tensor.shape.assign({batch_size, size, 1}); + response_mask_tensor.dtype = PaddleDType::FLOAT32; + TensorAssignData(&response_mask_tensor, one_batch.response_mask); + + // Set inputs. + for (int i = 0; i < FLAGS_max_turn_num; ++i) { + input_slots->push_back(std::move(turns_tensor[i])); + } + for (int i = 0; i < FLAGS_max_turn_num; ++i) { + input_slots->push_back(std::move(turns_mask_tensor[i])); + } + input_slots->push_back(std::move(response_tensor)); + input_slots->push_back(std::move(response_mask_tensor)); +} + +void SetConfig(AnalysisConfig *cfg) { + cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param"); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(true); +} + +void SetOptimConfig(AnalysisConfig *cfg) { + std::string optimModelPath = FLAGS_infer_model + "/saved_optim_model"; + cfg->SetModel(optimModelPath + "/model", optimModelPath + "/params"); + cfg->SwitchIrOptim(true); + cfg->SwitchSpecifyInputNames(); +} + +void SetInput(std::vector> *inputs) { + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + std::vector input_slots; + int test_batch_num = + FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1; + LOG(INFO) << "The number of samples to be test: " + << test_batch_num * FLAGS_batch_size; + for (int bid = 0; bid < test_batch_num; ++bid) { + input_slots.clear(); + PrepareInputs(&input_slots, &data, FLAGS_batch_size); + (*inputs).emplace_back(input_slots); + } +} + +// Easy for profiling independently. +void profile(bool use_mkldnn = false) { + AnalysisConfig cfg; + SetConfig(&cfg); + + if (use_mkldnn) { + cfg.EnableMKLDNN(); + // Enable all the mkldnn supported ops except conv3d in dam + std::unordered_set op_list = {"softmax", "elementwise_add", + "relu", "fc"}; + cfg.SetMKLDNNOp(op_list); + cfg.pass_builder()->AppendPass("fc_mkldnn_pass"); + } + + std::vector> outputs; + std::vector> input_slots_all; + SetInput(&input_slots_all); + + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); + + if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { + PADDLE_ENFORCE_GT(outputs.size(), 0); + auto output = outputs.back(); + PADDLE_ENFORCE_GT(output.size(), 0); + size_t size = GetSize(output[0]); + PADDLE_ENFORCE_GT(size, 0); + float *result = static_cast(output[0].data.data()); + for (size_t i = 0; i < size; i++) { + EXPECT_NEAR(result[i], result_data[i], 1e-3); + } + } +} + +TEST(Analyzer_dam, profile) { profile(); } +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_dam, profile_mkldnn) { profile(true /* use_mkldnn */); } +#endif + +// Check the fuse status +TEST(Analyzer_dam, fuse_statis) { + AnalysisConfig cfg; + SetConfig(&cfg); + + int num_ops; + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); + ASSERT_TRUE(fuse_statis.count("fc_fuse")); +} + +// Compare result of NativeConfig and AnalysisConfig +void compare(bool use_mkldnn = false) { + AnalysisConfig cfg; + SetConfig(&cfg); + if (use_mkldnn) { + cfg.EnableMKLDNN(); + // Enable all the mkldnn supported ops except conv3d in dam + std::unordered_set op_list = {"softmax", "elementwise_add", + "relu"}; + cfg.SetMKLDNNOp(op_list); + cfg.pass_builder()->AppendPass("fc_mkldnn_pass"); + } + + std::vector> input_slots_all; + SetInput(&input_slots_all); + + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); +} + +// Compare result of NativeConfig and AnalysisConfig with memory optimization. +TEST(Analyzer_dam, compare_with_static_memory_optim) { + // The small dam will core in CI, but works in local. + if (FLAGS_max_turn_num == 9) { + AnalysisConfig cfg, cfg1; + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + // Run the first time to force to update memory cache + SetConfig(&cfg); + cfg.EnableMemoryOptim(true, true /*force update*/); + + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), + input_slots_all); + + // Run second time to use the memory cache and perform memory optimization. + SetConfig(&cfg1); + cfg1.EnableMemoryOptim(true, false /*do not force update*/); + + CompareNativeAndAnalysis( + reinterpret_cast(&cfg1), + input_slots_all); + } +} + +TEST(Analyzer_dam, compare_with_dynamic_memory_optim) { + // The small dam will core in CI, but works in local. + if (FLAGS_max_turn_num == 9) { + AnalysisConfig cfg, cfg1; + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + // Run the first time to force to update memory cache + SetConfig(&cfg); + cfg.EnableMemoryOptim(); + + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), + input_slots_all); + } +} + +TEST(Analyzer_dam, compare) { compare(); } + +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_dam, compare_mkldnn) { compare(true /* use_mkldnn */); } +#endif + +// Compare Deterministic result +TEST(Analyzer_dam, compare_determine) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareDeterministic(reinterpret_cast(&cfg), + input_slots_all); +} +// Save optim model +TEST(Analyzer_dam, save_optim_model) { + AnalysisConfig cfg; + std::string optimModelPath = FLAGS_infer_model + "/saved_optim_model"; + mkdir(optimModelPath.c_str(), 0777); + SetConfig(&cfg); + SaveOptimModel(&cfg, optimModelPath); +} + +void CompareOptimAndOrig(const PaddlePredictor::Config *orig_config, + const PaddlePredictor::Config *optim_config, + const std::vector> &inputs) { + PrintConfig(orig_config, true); + PrintConfig(optim_config, true); + std::vector> orig_outputs, optim_outputs; + TestOneThreadPrediction(orig_config, inputs, &orig_outputs, false); + TestOneThreadPrediction(optim_config, inputs, &optim_outputs, false); + CompareResult(orig_outputs.back(), optim_outputs.back()); +} + +TEST(Analyzer_dam, compare_optim_orig) { + AnalysisConfig orig_cfg; + AnalysisConfig optim_cfg; + SetConfig(&orig_cfg); + SetOptimConfig(&optim_cfg); + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareOptimAndOrig( + reinterpret_cast(&orig_cfg), + reinterpret_cast(&optim_cfg), + input_slots_all); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_image_classification_tester.cc new file mode 100644 index 00000000..07934f96 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_image_classification_tester.cc @@ -0,0 +1,146 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +DEFINE_bool(disable_mkldnn_fc, false, "Disable usage of MKL-DNN's FC op"); + +namespace paddle { +namespace inference { +namespace analysis { + +void SetConfig(AnalysisConfig *cfg) { + cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params"); + cfg->DisableGpu(); + cfg->SwitchIrOptim(); + cfg->SwitchSpecifyInputNames(); + cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); +} + +void SetInput(std::vector> *inputs) { + SetFakeImageInput(inputs, FLAGS_infer_model); +} + +void SetOptimConfig(AnalysisConfig *cfg) { + std::string optimModelPath = FLAGS_infer_model + "/saved_optim_model"; + cfg->SetModel(optimModelPath + "/model", optimModelPath + "/params"); + cfg->DisableGpu(); + cfg->SwitchIrOptim(); + cfg->SwitchSpecifyInputNames(); + cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); +} + +// Easy for profiling independently. +void profile(bool use_mkldnn = false) { + AnalysisConfig cfg; + SetConfig(&cfg); + + if (use_mkldnn) { + cfg.EnableMKLDNN(); + if (!FLAGS_disable_mkldnn_fc) + cfg.pass_builder()->AppendPass("fc_mkldnn_pass"); + } + std::vector> outputs; + + std::vector> input_slots_all; + SetInput(&input_slots_all); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); +} + +TEST(Analyzer_resnet50, profile) { profile(); } +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_resnet50, profile_mkldnn) { profile(true /* use_mkldnn */); } +#endif + +// Check the fuse status +TEST(Analyzer_resnet50, fuse_statis) { + AnalysisConfig cfg; + SetConfig(&cfg); + int num_ops; + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); + LOG(INFO) << "num_ops: " << num_ops; +} + +// Compare result of NativeConfig and AnalysisConfig +void compare(bool use_mkldnn = false) { + AnalysisConfig cfg; + SetConfig(&cfg); + if (use_mkldnn) { + cfg.EnableMKLDNN(); + if (!FLAGS_disable_mkldnn_fc) + cfg.pass_builder()->AppendPass("fc_mkldnn_pass"); + } + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); +} + +TEST(Analyzer_resnet50, compare) { compare(); } +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_resnet50, compare_mkldnn) { compare(true /* use_mkldnn */); } +#endif + +// Compare Deterministic result +TEST(Analyzer_resnet50, compare_determine) { + AnalysisConfig cfg; + SetConfig(&cfg); + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareDeterministic(reinterpret_cast(&cfg), + input_slots_all); +} + +// Save optim model +TEST(Analyzer_resnet50, save_optim_model) { + AnalysisConfig cfg; + std::string optimModelPath = FLAGS_infer_model + "/saved_optim_model"; + mkdir(optimModelPath.c_str(), 0777); + SetConfig(&cfg); + SaveOptimModel(&cfg, optimModelPath); +} + +void CompareOptimAndOrig(const PaddlePredictor::Config *orig_config, + const PaddlePredictor::Config *optim_config, + const std::vector> &inputs) { + PrintConfig(orig_config, true); + PrintConfig(optim_config, true); + std::vector> orig_outputs, optim_outputs; + TestOneThreadPrediction(orig_config, inputs, &orig_outputs, false); + TestOneThreadPrediction(optim_config, inputs, &optim_outputs, false); + CompareResult(orig_outputs.back(), optim_outputs.back()); +} + +TEST(Analyzer_resnet50, compare_optim_orig) { + AnalysisConfig orig_cfg; + AnalysisConfig optim_cfg; + SetConfig(&orig_cfg); + SetOptimConfig(&optim_cfg); + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareOptimAndOrig( + reinterpret_cast(&orig_cfg), + reinterpret_cast(&optim_cfg), + input_slots_all); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc new file mode 100644 index 00000000..3e4a8f3f --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc @@ -0,0 +1,171 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/inference/api/paddle_analysis_config.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void SetConfig(AnalysisConfig *cfg) { + cfg->SetModel(FLAGS_infer_model); + cfg->DisableGpu(); + cfg->SwitchIrOptim(); + cfg->SwitchSpecifyInputNames(); + cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); + cfg->EnableMKLDNN(); +} + +template +class TensorReader { + public: + TensorReader(std::ifstream &file, size_t beginning_offset, + std::vector shape, std::string name) + : file_(file), position(beginning_offset), shape_(shape), name_(name) { + numel = std::accumulate(shape_.begin(), shape_.end(), size_t{1}, + std::multiplies()); + } + + PaddleTensor NextBatch() { + PaddleTensor tensor; + tensor.name = name_; + tensor.shape = shape_; + tensor.dtype = GetPaddleDType(); + tensor.data.Resize(numel * sizeof(T)); + + file_.seekg(position); + file_.read(static_cast(tensor.data.data()), numel * sizeof(T)); + position = file_.tellg(); + + if (file_.eof()) LOG(ERROR) << name_ << ": reached end of stream"; + if (file_.fail()) + throw std::runtime_error(name_ + ": failed reading file."); + + return tensor; + } + + protected: + std::ifstream &file_; + size_t position; + std::vector shape_; + std::string name_; + size_t numel; +}; + +std::shared_ptr> GetWarmupData( + const std::vector> &test_data, + int num_images = FLAGS_warmup_batch_size) { + int test_data_batch_size = test_data[0][0].shape[0]; + auto iterations = test_data.size(); + PADDLE_ENFORCE( + static_cast(num_images) <= iterations * test_data_batch_size, + "The requested quantization warmup data size " + + std::to_string(num_images) + " is bigger than all test data size."); + + PaddleTensor images; + images.name = "image"; + images.shape = {num_images, 3, 224, 224}; + images.dtype = PaddleDType::FLOAT32; + images.data.Resize(sizeof(float) * num_images * 3 * 224 * 224); + + PaddleTensor labels; + labels.name = "label"; + labels.shape = {num_images, 1}; + labels.dtype = PaddleDType::INT64; + labels.data.Resize(sizeof(int64_t) * num_images); + + for (int i = 0; i < num_images; i++) { + auto batch = i / test_data_batch_size; + auto element_in_batch = i % test_data_batch_size; + std::copy_n(static_cast(test_data[batch][0].data.data()) + + element_in_batch * 3 * 224 * 224, + 3 * 224 * 224, + static_cast(images.data.data()) + i * 3 * 224 * 224); + + std::copy_n(static_cast(test_data[batch][1].data.data()) + + element_in_batch, + 1, static_cast(labels.data.data()) + i); + } + + auto warmup_data = std::make_shared>(2); + (*warmup_data)[0] = std::move(images); + (*warmup_data)[1] = std::move(labels); + return warmup_data; +} + +void SetInput(std::vector> *inputs, + int32_t batch_size = FLAGS_batch_size) { + std::ifstream file(FLAGS_infer_data, std::ios::binary); + if (!file) { + FAIL() << "Couldn't open file: " << FLAGS_infer_data; + } + + int64_t total_images{0}; + file.read(reinterpret_cast(&total_images), sizeof(total_images)); + LOG(INFO) << "Total images in file: " << total_images; + + std::vector image_batch_shape{batch_size, 3, 224, 224}; + std::vector label_batch_shape{batch_size, 1}; + auto images_offset_in_file = static_cast(file.tellg()); + auto labels_offset_in_file = + images_offset_in_file + sizeof(float) * total_images * 3 * 224 * 224; + + TensorReader image_reader(file, images_offset_in_file, + image_batch_shape, "image"); + TensorReader label_reader(file, labels_offset_in_file, + label_batch_shape, "label"); + + auto iterations_max = total_images / batch_size; + auto iterations = iterations_max; + if (FLAGS_iterations > 0 && FLAGS_iterations < iterations_max) { + iterations = FLAGS_iterations; + } + for (auto i = 0; i < iterations; i++) { + auto images = image_reader.NextBatch(); + auto labels = label_reader.NextBatch(); + inputs->emplace_back( + std::vector{std::move(images), std::move(labels)}); + } +} + +TEST(Analyzer_int8_image_classification, quantization) { + AnalysisConfig cfg; + SetConfig(&cfg); + + AnalysisConfig q_cfg; + SetConfig(&q_cfg); + + // read data from file and prepare batches with test data + std::vector> input_slots_all; + SetInput(&input_slots_all); + + // prepare warmup batch from input data read earlier + // warmup batch size can be different than batch size + std::shared_ptr> warmup_data = + GetWarmupData(input_slots_all); + + // configure quantizer + q_cfg.EnableMkldnnQuantizer(); + q_cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data); + q_cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(FLAGS_warmup_batch_size); + + CompareQuantizedAndAnalysis(&cfg, &q_cfg, input_slots_all); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc b/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc new file mode 100644 index 00000000..ccb50d40 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc @@ -0,0 +1,281 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/inference/api/paddle_analysis_config.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void SetConfig(AnalysisConfig *cfg) { + cfg->SetModel(FLAGS_infer_model); + cfg->DisableGpu(); + cfg->SwitchIrOptim(true); + cfg->SwitchSpecifyInputNames(false); + cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); + cfg->EnableMKLDNN(); +} + +std::vector ReadObjectsNum(std::ifstream &file, size_t offset, + int64_t total_images) { + std::vector num_objects; + num_objects.resize(total_images); + + file.clear(); + file.seekg(offset); + file.read(reinterpret_cast(num_objects.data()), + total_images * sizeof(size_t)); + + if (file.eof()) LOG(ERROR) << "Reached end of stream"; + if (file.fail()) throw std::runtime_error("Failed reading file."); + return num_objects; +} + +template +class TensorReader { + public: + TensorReader(std::ifstream &file, size_t beginning_offset, std::string name) + : file_(file), position(beginning_offset), name_(name) {} + + PaddleTensor NextBatch(std::vector shape, std::vector lod) { + int numel = + std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); + PaddleTensor tensor; + tensor.name = name_; + tensor.shape = shape; + tensor.dtype = GetPaddleDType(); + tensor.data.Resize(numel * sizeof(T)); + if (lod.empty() == false) { + tensor.lod.clear(); + tensor.lod.push_back(lod); + } + file_.seekg(position); + file_.read(reinterpret_cast(tensor.data.data()), numel * sizeof(T)); + position = file_.tellg(); + if (file_.eof()) LOG(ERROR) << name_ << ": reached end of stream"; + if (file_.fail()) + throw std::runtime_error(name_ + ": failed reading file."); + return tensor; + } + + protected: + std::ifstream &file_; + size_t position; + std::string name_; +}; + +void SetInput(std::vector> *inputs, + int32_t batch_size = FLAGS_batch_size) { + std::ifstream file(FLAGS_infer_data, std::ios::binary); + if (!file) { + FAIL() << "Couldn't open file: " << FLAGS_infer_data; + } + + int64_t total_images{0}; + file.read(reinterpret_cast(&total_images), sizeof(int64_t)); + LOG(INFO) << "Total images in file: " << total_images; + + size_t image_beginning_offset = static_cast(file.tellg()); + auto lod_offset_in_file = + image_beginning_offset + sizeof(float) * total_images * 3 * 300 * 300; + auto labels_beginning_offset = + lod_offset_in_file + sizeof(size_t) * total_images; + + std::vector lod_full = + ReadObjectsNum(file, lod_offset_in_file, total_images); + size_t sum_objects_num = + std::accumulate(lod_full.begin(), lod_full.end(), 0UL); + + auto bbox_beginning_offset = + labels_beginning_offset + sizeof(int64_t) * sum_objects_num; + auto difficult_beginning_offset = + bbox_beginning_offset + sizeof(float) * sum_objects_num * 4; + + TensorReader image_reader(file, image_beginning_offset, "image"); + TensorReader label_reader(file, labels_beginning_offset, "gt_label"); + TensorReader bbox_reader(file, bbox_beginning_offset, "gt_bbox"); + TensorReader difficult_reader(file, difficult_beginning_offset, + "gt_difficult"); + auto iterations_max = total_images / batch_size; + auto iterations = iterations_max; + if (FLAGS_iterations > 0 && FLAGS_iterations < iterations_max) { + iterations = FLAGS_iterations; + } + for (auto i = 0; i < iterations; i++) { + auto images_tensor = image_reader.NextBatch({batch_size, 3, 300, 300}, {}); + std::vector batch_lod(lod_full.begin() + i * batch_size, + lod_full.begin() + batch_size * (i + 1)); + size_t batch_num_objects = + std::accumulate(batch_lod.begin(), batch_lod.end(), 0UL); + batch_lod.insert(batch_lod.begin(), 0UL); + for (auto it = batch_lod.begin() + 1; it != batch_lod.end(); it++) { + *it = *it + *(it - 1); + } + auto labels_tensor = label_reader.NextBatch( + {static_cast(batch_num_objects), 1}, batch_lod); + auto bbox_tensor = bbox_reader.NextBatch( + {static_cast(batch_num_objects), 4}, batch_lod); + auto difficult_tensor = difficult_reader.NextBatch( + {static_cast(batch_num_objects), 1}, batch_lod); + + inputs->emplace_back(std::vector{ + std::move(images_tensor), std::move(bbox_tensor), + std::move(labels_tensor), std::move(difficult_tensor)}); + } +} + +std::shared_ptr> GetWarmupData( + const std::vector> &test_data, + int32_t num_images = FLAGS_warmup_batch_size) { + int test_data_batch_size = test_data[0][0].shape[0]; + auto iterations = test_data.size(); + PADDLE_ENFORCE( + static_cast(num_images) <= iterations * test_data_batch_size, + "The requested quantization warmup data size " + + std::to_string(num_images) + " is bigger than all test data size."); + + PaddleTensor images; + images.name = "image"; + images.shape = {num_images, 3, 300, 300}; + images.dtype = PaddleDType::FLOAT32; + images.data.Resize(sizeof(float) * num_images * 3 * 300 * 300); + + int batches = num_images / test_data_batch_size; + int batch_remain = num_images % test_data_batch_size; + size_t num_objects = 0UL; + std::vector accum_lod; + accum_lod.push_back(0UL); + for (int i = 0; i < batches; i++) { + std::transform(test_data[i][1].lod[0].begin() + 1, + test_data[i][1].lod[0].end(), std::back_inserter(accum_lod), + [&num_objects](size_t lodtemp) -> size_t { + return lodtemp + num_objects; + }); + num_objects += test_data[i][1].lod[0][test_data_batch_size]; + } + if (batch_remain > 0) { + std::transform(test_data[batches][1].lod[0].begin() + 1, + test_data[batches][1].lod[0].begin() + batch_remain + 1, + std::back_inserter(accum_lod), + [&num_objects](size_t lodtemp) -> size_t { + return lodtemp + num_objects; + }); + num_objects = num_objects + test_data[batches][1].lod[0][batch_remain]; + } + + PaddleTensor labels; + labels.name = "gt_label"; + labels.shape = {static_cast(num_objects), 1}; + labels.dtype = PaddleDType::INT64; + labels.data.Resize(sizeof(int64_t) * num_objects); + labels.lod.push_back(accum_lod); + + PaddleTensor bbox; + bbox.name = "gt_bbox"; + bbox.shape = {static_cast(num_objects), 4}; + bbox.dtype = PaddleDType::FLOAT32; + bbox.data.Resize(sizeof(float) * num_objects * 4); + bbox.lod.push_back(accum_lod); + + PaddleTensor difficult; + difficult.name = "gt_difficult"; + difficult.shape = {static_cast(num_objects), 1}; + difficult.dtype = PaddleDType::INT64; + difficult.data.Resize(sizeof(int64_t) * num_objects); + difficult.lod.push_back(accum_lod); + + size_t objects_accum = 0; + size_t objects_in_batch = 0; + for (int i = 0; i < batches; i++) { + objects_in_batch = test_data[i][1].lod[0][test_data_batch_size]; + std::copy_n(static_cast(test_data[i][0].data.data()), + test_data_batch_size * 3 * 300 * 300, + static_cast(images.data.data()) + + i * test_data_batch_size * 3 * 300 * 300); + std::copy_n(static_cast(test_data[i][1].data.data()), + objects_in_batch, + static_cast(labels.data.data()) + objects_accum); + std::copy_n(static_cast(test_data[i][2].data.data()), + objects_in_batch * 4, + static_cast(bbox.data.data()) + objects_accum * 4); + std::copy_n(static_cast(test_data[i][3].data.data()), + objects_in_batch, + static_cast(difficult.data.data()) + objects_accum); + objects_accum = objects_accum + objects_in_batch; + } + if (batch_remain > 0) { + size_t objects_remain = test_data[batches][1].lod[0][batch_remain]; + std::copy_n(static_cast(test_data[batches][0].data.data()), + batch_remain * 3 * 300 * 300, + static_cast(images.data.data()) + + objects_accum * 3 * 300 * 300); + std::copy_n(static_cast(test_data[batches][1].data.data()), + objects_remain, + static_cast(labels.data.data()) + objects_accum); + std::copy_n(static_cast(test_data[batches][2].data.data()), + objects_remain * 4, + static_cast(bbox.data.data()) + objects_accum * 4); + std::copy_n(static_cast(test_data[batches][3].data.data()), + objects_remain, + static_cast(difficult.data.data()) + objects_accum); + objects_accum = objects_accum + objects_remain; + } + PADDLE_ENFORCE( + static_cast(num_objects) == static_cast(objects_accum), + "The requested num of objects " + std::to_string(num_objects) + + " is the same as objects_accum."); + + auto warmup_data = std::make_shared>(4); + (*warmup_data)[0] = std::move(images); + (*warmup_data)[1] = std::move(bbox); + (*warmup_data)[2] = std::move(labels); + (*warmup_data)[3] = std::move(difficult); + + return warmup_data; +} + +TEST(Analyzer_int8_mobilenet_ssd, quantization) { + AnalysisConfig cfg; + SetConfig(&cfg); + + AnalysisConfig q_cfg; + SetConfig(&q_cfg); + + // read data from file and prepare batches with test data + std::vector> input_slots_all; + SetInput(&input_slots_all); + + // prepare warmup batch from input data read earlier + // warmup batch size can be different than batch size + std::shared_ptr> warmup_data = + GetWarmupData(input_slots_all); + + // configure quantizer + q_cfg.EnableMkldnnQuantizer(); + q_cfg.mkldnn_quantizer_config(); + std::unordered_set quantize_operators( + {"conv2d", "depthwise_conv2d", "prior_box"}); + q_cfg.mkldnn_quantizer_config()->SetEnabledOpTypes(quantize_operators); + q_cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data); + q_cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(FLAGS_warmup_batch_size); + + CompareQuantizedAndAnalysis(&cfg, &q_cfg, input_slots_all); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc new file mode 100644 index 00000000..142905dc --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc @@ -0,0 +1,193 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +struct DataRecord { + std::vector data; + std::vector lod; + // for dataset and nextbatch + size_t batch_iter{0}; + std::vector> batched_lods; + std::vector> batched_datas; + std::vector> datasets; + DataRecord() = default; + explicit DataRecord(const std::string &path, int batch_size = 1) { + Load(path); + Prepare(batch_size); + batch_iter = 0; + } + void Load(const std::string &path) { + std::ifstream file(path); + std::string line; + int num_lines = 0; + datasets.resize(0); + while (std::getline(file, line)) { + num_lines++; + std::vector data; + split(line, ';', &data); + std::vector words_ids; + split_to_int64(data[1], ' ', &words_ids); + datasets.emplace_back(words_ids); + } + } + void Prepare(int bs) { + if (bs == 1) { + batched_datas = datasets; + for (auto one_sentence : datasets) { + batched_lods.push_back({0, one_sentence.size()}); + } + } else { + std::vector one_batch; + std::vector lod{0}; + int bs_id = 0; + for (auto one_sentence : datasets) { + bs_id++; + one_batch.insert(one_batch.end(), one_sentence.begin(), + one_sentence.end()); + lod.push_back(lod.back() + one_sentence.size()); + if (bs_id == bs) { + bs_id = 0; + batched_datas.push_back(one_batch); + batched_lods.push_back(lod); + one_batch.clear(); + one_batch.resize(0); + lod.clear(); + lod.resize(0); + lod.push_back(0); + } + } + if (one_batch.size() != 0) { + batched_datas.push_back(one_batch); + batched_lods.push_back(lod); + } + } + } + + DataRecord NextBatch() { + DataRecord data; + data.data = batched_datas[batch_iter]; + data.lod = batched_lods[batch_iter]; + batch_iter++; + if (batch_iter >= batched_datas.size()) { + batch_iter = 0; + } + return data; + } +}; + +void GetOneBatch(std::vector *input_slots, DataRecord *data, + int batch_size) { + auto one_batch = data->NextBatch(); + PaddleTensor input_tensor; + input_tensor.name = "word"; + input_tensor.dtype = PaddleDType::INT64; + TensorAssignData(&input_tensor, {one_batch.data}, one_batch.lod); + PADDLE_ENFORCE_EQ(batch_size, static_cast(one_batch.lod.size() - 1)); + input_slots->assign({input_tensor}); +} + +void SetConfig(AnalysisConfig *cfg) { + cfg->SetModel(FLAGS_infer_model); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); +} + +void SetInput(std::vector> *inputs) { + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + std::vector input_slots; + int epoch = FLAGS_test_all_data ? data.batched_datas.size() : 1; + LOG(INFO) << "number of samples: " << epoch; + for (int bid = 0; bid < epoch; ++bid) { + GetOneBatch(&input_slots, &data, FLAGS_batch_size); + (*inputs).emplace_back(input_slots); + } +} + +// Easy for profiling independently. +TEST(Analyzer_LAC, profile) { + AnalysisConfig cfg; + SetConfig(&cfg); + std::vector> outputs; + + std::vector> input_slots_all; + SetInput(&input_slots_all); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); + + if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { + // the first inference result + const int64_t lac_ref_data[] = { + 24, 25, 25, 25, 38, 30, 31, 14, 15, 44, 24, 25, 25, 25, 25, 25, + 44, 24, 25, 25, 25, 36, 42, 43, 44, 14, 15, 44, 14, 15, 44, 14, + 15, 44, 38, 39, 14, 15, 44, 22, 23, 23, 23, 23, 23, 23, 23}; + PADDLE_ENFORCE_GT(outputs.size(), 0); + auto output = outputs.back(); + PADDLE_ENFORCE_EQ(output.size(), 1UL); + size_t size = GetSize(output[0]); + size_t batch1_size = sizeof(lac_ref_data) / sizeof(int64_t); + PADDLE_ENFORCE_GE(size, batch1_size); + int64_t *pdata = static_cast(output[0].data.data()); + for (size_t i = 0; i < batch1_size; ++i) { + EXPECT_EQ(pdata[i], lac_ref_data[i]); + } + } +} + +// Check the fuse status +TEST(Analyzer_LAC, fuse_statis) { + AnalysisConfig cfg; + SetConfig(&cfg); + + int num_ops; + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); + ASSERT_TRUE(fuse_statis.count("fc_fuse")); + ASSERT_TRUE(fuse_statis.count("fc_gru_fuse")); + EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); + EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 4); + EXPECT_EQ(num_ops, 11); +} + +// Compare result of NativeConfig and AnalysisConfig +TEST(Analyzer_LAC, compare) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); +} + +// Compare Deterministic result +TEST(Analyzer_LAC, compare_determine) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareDeterministic(reinterpret_cast(&cfg), + input_slots_all); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc new file mode 100644 index 00000000..70478d69 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc @@ -0,0 +1,268 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +struct DataRecord { + std::vector> query, title; + std::vector lod1, lod2; + size_t batch_iter{0}, batch_size{1}, num_samples; // total number of samples + DataRecord() = default; + explicit DataRecord(const std::string &path, int batch_size = 1) + : batch_size(batch_size) { + Load(path); + } + DataRecord NextBatch() { + DataRecord data; + size_t batch_end = batch_iter + batch_size; + // NOTE skip the final batch, if no enough data is provided. + if (batch_end <= query.size()) { + GetInputPerBatch(query, &data.query, &data.lod1, batch_iter, batch_end); + GetInputPerBatch(title, &data.title, &data.lod2, batch_iter, batch_end); + } + batch_iter += batch_size; + return data; + } + void Load(const std::string &path) { + std::ifstream file(path); + std::string line; + int num_lines = 0; + while (std::getline(file, line)) { + num_lines++; + std::vector data; + split(line, '\t', &data); + // load query data + std::vector query_data; + split_to_int64(data[0], ' ', &query_data); + // load title data + std::vector title_data; + split_to_int64(data[1], ' ', &title_data); + query.push_back(std::move(query_data)); + title.push_back(std::move(title_data)); + } + num_samples = num_lines; + } +}; + +void PrepareInputs(std::vector *input_slots, DataRecord *data, + int batch_size) { + PaddleTensor lod_query_tensor, lod_title_tensor; + lod_query_tensor.name = "left"; + lod_title_tensor.name = "right"; + auto one_batch = data->NextBatch(); + // assign data + TensorAssignData(&lod_query_tensor, one_batch.query, one_batch.lod1); + TensorAssignData(&lod_title_tensor, one_batch.title, one_batch.lod2); + // Set inputs. + input_slots->assign({lod_query_tensor, lod_title_tensor}); + for (auto &tensor : *input_slots) { + tensor.dtype = PaddleDType::INT64; + } +} + +void SetConfig(AnalysisConfig *cfg) { + cfg->SetModel(FLAGS_infer_model); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); +} + +void SetInput(std::vector> *inputs) { + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + std::vector input_slots; + int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1; + LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size; + for (int bid = 0; bid < epoch; ++bid) { + PrepareInputs(&input_slots, &data, FLAGS_batch_size); + (*inputs).emplace_back(input_slots); + } +} + +// Easy for profiling independently. +void profile(bool use_mkldnn = false) { + AnalysisConfig cfg; + SetConfig(&cfg); + std::vector> outputs; + + if (use_mkldnn) { + cfg.EnableMKLDNN(); + cfg.pass_builder()->AppendPass("fc_mkldnn_pass"); + } + + std::vector> input_slots_all; + SetInput(&input_slots_all); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); + + if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { + PADDLE_ENFORCE_GT(outputs.size(), 0); + PADDLE_ENFORCE_EQ(outputs.back().size(), 2UL); + for (auto &output : outputs.back()) { + size_t size = GetSize(output); + PADDLE_ENFORCE_GT(size, 0); + float *result = static_cast(output.data.data()); + // output is probability, which is in (-1, 1). + for (size_t i = 0; i < size; i++) { + EXPECT_GT(result[i], -1); + EXPECT_LT(result[i], 1); + } + } + } +} + +TEST(Analyzer_MM_DNN, profile) { profile(); } +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_MM_DNN, profile_mkldnn) { profile(true /* use_mkldnn */); } +#endif + +// Check the fuse status +TEST(Analyzer_MM_DNN, fuse_statis) { + AnalysisConfig cfg; + SetConfig(&cfg); + + int num_ops; + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); +} + +// Compare result of NativeConfig and AnalysisConfig +void compare(bool use_mkldnn = false) { + AnalysisConfig cfg; + SetConfig(&cfg); + + if (use_mkldnn) { + cfg.EnableMKLDNN(); + cfg.pass_builder()->AppendPass("fc_mkldnn_pass"); + } + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); +} + +TEST(Analyzer_MM_DNN, compare) { compare(); } +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_MM_DNN, compare_mkldnn) { compare(true /* use_mkldnn */); } +#endif + +// Compare Deterministic result +TEST(Analyzer_MM_DNN, compare_determine) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareDeterministic(reinterpret_cast(&cfg), + input_slots_all); +} + +#ifdef PADDLE_WITH_MKLDNN +void TestMkldnnCacheClear(int mkldnn_input_shape_cache_capacity, + std::vector> *outputs) { + AnalysisConfig config; + SetConfig(&config); + config.EnableMKLDNN(); + config.SetMkldnnCacheCapacity(mkldnn_input_shape_cache_capacity); + + std::vector input; + auto predictor = CreatePaddlePredictor(config); + + int sample_num = 10; + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + outputs->resize(sample_num); + + for (int i = 0; i < sample_num; i++) { + PrepareInputs(&input, &data, FLAGS_batch_size); + predictor->Run(input, &(*outputs)[i], 1); + } +} + +TEST(Analyzer_MM_DNN, mkldnn_cache_clear) { + std::vector> outputs, cache_outputs; + // 0 means do not use cache clear strategy. + TestMkldnnCacheClear(0, &outputs); + // 4 means use cache clear strategy, and the + // mkldnn_input_shape_cache_capacity is 4. + TestMkldnnCacheClear(4, &cache_outputs); + // compare the result. + for (size_t i = 0; i < outputs.size(); i++) { + CompareResult(outputs[i], cache_outputs[i]); + } +} + +void TestMkldnnShapeBlobSize(int mkldnn_input_shape_cache_capacity) { + AnalysisConfig config; + SetConfig(&config); + config.EnableMKLDNN(); + config.SwitchUseFeedFetchOps(false); + // Since AnalysisPredictor::Run() will reset cur_mkldnn_session_id to default + // before its finished, we use AnalysisPredictor::ZeroCopyRun() here to check + // the mkldnn_shape_blob_size. + if (mkldnn_input_shape_cache_capacity > 0) { + platform::set_cur_mkldnn_session_id( + platform::kMKLDNNSessionID_CacheClearing); + platform::set_cur_input_shape_cache_capacity( + mkldnn_input_shape_cache_capacity); + } + + std::vector input; + auto predictor = CreatePaddlePredictor(config); + + int sample_num = 10; + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + + auto &pool = platform::DeviceContextPool::Instance(); + auto *dev_ctx = dynamic_cast( + pool.Get(platform::CPUPlace())); + // clear before test + dev_ctx->ResetBlobMap(); + + for (int i = 0; i < sample_num; i++) { + PrepareInputs(&input, &data, FLAGS_batch_size); + ConvertPaddleTensorToZeroCopyTensor(predictor.get(), input); + if (mkldnn_input_shape_cache_capacity > 0) { + std::stringstream ss; + for (size_t i = 0; i < input.size(); i++) { + for (size_t j = 0; j < input[i].shape.size(); ++j) { + ss << input[i].shape[j] << "-"; + } + } + platform::set_cur_input_shape_str(ss.str()); + } + predictor->ZeroCopyRun(); + } + if (mkldnn_input_shape_cache_capacity > 0) { + PADDLE_ENFORCE_EQ(dev_ctx->GetShapeBlobSize(), + mkldnn_input_shape_cache_capacity); + } else { + PADDLE_ENFORCE_EQ(dev_ctx->GetShapeBlobSize(), 1UL); + } +} + +TEST(Analyzer_MM_DNN, mkldnn_shape_blob_size) { + // 0 means do not use cache clear strategy. + TestMkldnnShapeBlobSize(0); + // 4 means use cache clear strategy, and the + // mkldnn_input_shape_cache_capacity is 4. + TestMkldnnShapeBlobSize(4); +} +#endif + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc new file mode 100644 index 00000000..36e07d5f --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -0,0 +1,177 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +struct DataRecord { + std::vector> word, mention; + std::vector lod; // two inputs have the same lod info. + size_t batch_iter{0}, batch_size{1}, num_samples; // total number of samples + DataRecord() = default; + explicit DataRecord(const std::string &path, int batch_size = 1) + : batch_size(batch_size) { + Load(path); + } + DataRecord NextBatch() { + DataRecord data; + size_t batch_end = batch_iter + batch_size; + // NOTE skip the final batch, if no enough data is provided. + if (batch_end <= word.size()) { + GetInputPerBatch(word, &data.word, &data.lod, batch_iter, batch_end); + GetInputPerBatch(mention, &data.mention, &data.lod, batch_iter, + batch_end); + } + batch_iter += batch_size; + return data; + } + void Load(const std::string &path) { + std::ifstream file(path); + std::string line; + int num_lines = 0; + while (std::getline(file, line)) { + num_lines++; + std::vector data; + split(line, ';', &data); + // load word data + std::vector word_data; + split_to_int64(data[1], ' ', &word_data); + // load mention data + std::vector mention_data; + split_to_int64(data[3], ' ', &mention_data); + word.push_back(std::move(word_data)); + mention.push_back(std::move(mention_data)); + } + num_samples = num_lines; + } +}; + +void PrepareInputs(std::vector *input_slots, DataRecord *data) { + PaddleTensor lod_word_tensor, lod_mention_tensor; + lod_word_tensor.name = "word"; + lod_mention_tensor.name = "mention"; + auto one_batch = data->NextBatch(); + // assign data + TensorAssignData(&lod_word_tensor, one_batch.word, one_batch.lod); + TensorAssignData(&lod_mention_tensor, one_batch.mention, + one_batch.lod); + // Set inputs. + input_slots->assign({lod_word_tensor, lod_mention_tensor}); + for (auto &tensor : *input_slots) { + tensor.dtype = PaddleDType::INT64; + } +} + +void SetConfig(AnalysisConfig *cfg, bool memory_load = false) { + if (memory_load) { + std::string buffer_prog, buffer_param; + ReadBinaryFile(FLAGS_infer_model + "/__model__", &buffer_prog); + ReadBinaryFile(FLAGS_infer_model + "/param", &buffer_param); + cfg->SetModelBuffer(&buffer_prog[0], buffer_prog.size(), &buffer_param[0], + buffer_param.size()); + } else { + cfg->SetModel(FLAGS_infer_model + "/__model__", + FLAGS_infer_model + "/param"); + } + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); +} + +void SetInput(std::vector> *inputs) { + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + std::vector input_slots; + int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1; + LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size; + for (int bid = 0; bid < epoch; ++bid) { + PrepareInputs(&input_slots, &data); + (*inputs).emplace_back(input_slots); + } +} + +// Easy for profiling independently. +void profile(bool memory_load = false) { + AnalysisConfig cfg; + SetConfig(&cfg, memory_load); + std::vector> outputs; + + std::vector> input_slots_all; + SetInput(&input_slots_all); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); + + if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { + // the first inference result + const int chinese_ner_result_data[] = {30, 45, 41, 48, 17, 26, + 48, 39, 38, 16, 25}; + PADDLE_ENFORCE_GT(outputs.size(), 0); + auto output = outputs.back(); + PADDLE_ENFORCE_EQ(output.size(), 1UL); + size_t size = GetSize(output[0]); + PADDLE_ENFORCE_GT(size, 0); + int64_t *result = static_cast(output[0].data.data()); + for (size_t i = 0; i < std::min(11UL, size); i++) { + EXPECT_EQ(result[i], chinese_ner_result_data[i]); + } + } +} + +TEST(Analyzer_Chinese_ner, profile) { profile(); } + +TEST(Analyzer_Chinese_ner, profile_memory_load) { + profile(true /* memory_load */); +} + +// Check the fuse status +TEST(Analyzer_Chinese_ner, fuse_statis) { + AnalysisConfig cfg; + SetConfig(&cfg); + + int num_ops; + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); + ASSERT_TRUE(fuse_statis.count("fc_fuse")); + ASSERT_TRUE(fuse_statis.count("fc_gru_fuse")); + EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); + EXPECT_EQ(fuse_statis.at("fc_gru_fuse"), 2); + EXPECT_EQ(num_ops, 14); +} + +// Compare result of NativeConfig and AnalysisConfig +TEST(Analyzer_Chinese_ner, compare) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); +} + +// Compare Deterministic result +TEST(Analyzer_Chinese_ner, compare_determine) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareDeterministic(reinterpret_cast(&cfg), + input_slots_all); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc new file mode 100644 index 00000000..11a49ed2 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc @@ -0,0 +1,204 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +struct DataRecord { + std::vector> query_basic, query_phrase, title_basic, + title_phrase; + std::vector lod1, lod2, lod3, lod4; + size_t batch_iter{0}, batch_size{1}, num_samples; // total number of samples + DataRecord() = default; + explicit DataRecord(const std::string &path, int batch_size = 1) + : batch_size(batch_size) { + Load(path); + } + DataRecord NextBatch() { + DataRecord data; + size_t batch_end = batch_iter + batch_size; + // NOTE skip the final batch, if no enough data is provided. + if (batch_end <= query_basic.size()) { + GetInputPerBatch(query_basic, &data.query_basic, &data.lod1, batch_iter, + batch_end); + GetInputPerBatch(query_phrase, &data.query_phrase, &data.lod2, batch_iter, + batch_end); + GetInputPerBatch(title_basic, &data.title_basic, &data.lod3, batch_iter, + batch_end); + GetInputPerBatch(title_phrase, &data.title_phrase, &data.lod4, batch_iter, + batch_end); + } + batch_iter += batch_size; + return data; + } + void Load(const std::string &path) { + std::ifstream file(path); + std::string line; + int num_lines = 0; + while (std::getline(file, line)) { + std::vector data; + split(line, ';', &data); + // load query data + std::vector query_basic_data; + split_to_int64(data[1], ' ', &query_basic_data); + std::vector query_phrase_data; + split_to_int64(data[2], ' ', &query_phrase_data); + // load title data + std::vector title_basic_data; + split_to_int64(data[3], ' ', &title_basic_data); + std::vector title_phrase_data; + split_to_int64(data[4], ' ', &title_phrase_data); + // filter the empty data + bool flag = + data[1].size() && data[2].size() && data[3].size() && data[4].size(); + if (flag) { + query_basic.push_back(std::move(query_basic_data)); + query_phrase.push_back(std::move(query_phrase_data)); + title_basic.push_back(std::move(title_basic_data)); + title_phrase.push_back(std::move(title_phrase_data)); + num_lines++; + } + } + num_samples = num_lines; + } +}; + +void PrepareInputs(std::vector *input_slots, DataRecord *data, + int batch_size) { + PaddleTensor query_basic_tensor, query_phrase_tensor, title_basic_tensor, + title_phrase_tensor; + query_basic_tensor.name = "query_basic"; + query_phrase_tensor.name = "query_phrase"; + title_basic_tensor.name = "pos_title_basic"; + title_phrase_tensor.name = "pos_title_phrase"; + auto one_batch = data->NextBatch(); + // assign data + TensorAssignData(&query_basic_tensor, one_batch.query_basic, + one_batch.lod1); + TensorAssignData(&query_phrase_tensor, one_batch.query_phrase, + one_batch.lod2); + TensorAssignData(&title_basic_tensor, one_batch.title_basic, + one_batch.lod3); + TensorAssignData(&title_phrase_tensor, one_batch.title_phrase, + one_batch.lod4); + // Set inputs. + input_slots->assign({query_basic_tensor, query_phrase_tensor, + title_basic_tensor, title_phrase_tensor}); + for (auto &tensor : *input_slots) { + tensor.dtype = PaddleDType::INT64; + } +} + +void SetConfig(AnalysisConfig *cfg) { + cfg->SetModel(FLAGS_infer_model); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); + cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); + if (FLAGS_zero_copy) { + cfg->SwitchUseFeedFetchOps(false); + } +} + +void SetInput(std::vector> *inputs) { + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + std::vector input_slots; + int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1; + LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size; + for (int bid = 0; bid < epoch; ++bid) { + PrepareInputs(&input_slots, &data, FLAGS_batch_size); + (*inputs).emplace_back(input_slots); + } +} + +// Easy for profiling independently. +TEST(Analyzer_Pyramid_DNN, profile) { + AnalysisConfig cfg; + SetConfig(&cfg); + std::vector> outputs; + + std::vector> input_slots_all; + SetInput(&input_slots_all); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); + + if (FLAGS_num_threads == 1 && !FLAGS_test_all_data && !FLAGS_zero_copy) { + PADDLE_ENFORCE_GT(outputs.size(), 0); + auto output = outputs.back(); + PADDLE_ENFORCE_EQ(output.size(), 1UL); + size_t size = GetSize(output[0]); + PADDLE_ENFORCE_GT(size, 0); + float *result = static_cast(output[0].data.data()); + // output is probability, which is in (0, 1). + for (size_t i = 0; i < size; i++) { + EXPECT_GT(result[i], 0); + EXPECT_LT(result[i], 1); + } + } +} + +// Check the fuse status +TEST(Analyzer_Pyramid_DNN, fuse_statis) { + AnalysisConfig cfg; + SetConfig(&cfg); + + int num_ops; + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); +} + +// Compare result of NativeConfig and AnalysisConfig +TEST(Analyzer_Pyramid_DNN, compare) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); +} + +// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy +TEST(Analyzer_Pyramid_DNN, compare_zero_copy) { + AnalysisConfig cfg; + SetConfig(&cfg); + + AnalysisConfig cfg1; + SetConfig(&cfg1); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + std::vector outputs_name; + outputs_name.emplace_back("cos_sim_2.tmp_0"); + CompareAnalysisAndZeroCopy(reinterpret_cast(&cfg), + reinterpret_cast(&cfg1), + input_slots_all, outputs_name); +} + +// Compare Deterministic result +TEST(Analyzer_Pyramid_DNN, compare_determine) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareDeterministic(reinterpret_cast(&cfg), + input_slots_all); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc new file mode 100644 index 00000000..620a1d1f --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -0,0 +1,309 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +DEFINE_bool(with_precision_check, true, "turn on test"); + +namespace paddle { +namespace inference { + +using namespace framework; // NOLINT + +struct DataRecord { + std::vector>> link_step_data_all; + std::vector> week_data_all, minute_data_all; + std::vector lod1, lod2, lod3; + std::vector> rnn_link_data, rnn_week_datas, + rnn_minute_datas; + size_t num_samples; // total number of samples + size_t batch_iter{0}; + size_t batch_size{1}; + DataRecord() = default; + + explicit DataRecord(const std::string &path, int batch_size = 1) + : batch_size(batch_size) { + Load(path); + } + + DataRecord NextBatch() { + DataRecord data; + size_t batch_end = batch_iter + batch_size; + // NOTE skip the final batch, if no enough data is provided. + if (batch_end <= link_step_data_all.size()) { + data.link_step_data_all.assign(link_step_data_all.begin() + batch_iter, + link_step_data_all.begin() + batch_end); + data.week_data_all.assign(week_data_all.begin() + batch_iter, + week_data_all.begin() + batch_end); + data.minute_data_all.assign(minute_data_all.begin() + batch_iter, + minute_data_all.begin() + batch_end); + // Prepare LoDs + data.lod1.push_back(0); + data.lod2.push_back(0); + data.lod3.push_back(0); + CHECK(!data.link_step_data_all.empty()) << "empty"; + CHECK(!data.week_data_all.empty()); + CHECK(!data.minute_data_all.empty()); + CHECK_EQ(data.link_step_data_all.size(), data.week_data_all.size()); + CHECK_EQ(data.minute_data_all.size(), data.link_step_data_all.size()); + for (size_t j = 0; j < data.link_step_data_all.size(); j++) { + for (const auto &d : data.link_step_data_all[j]) { + data.rnn_link_data.push_back(d); + } + data.rnn_week_datas.push_back(data.week_data_all[j]); + data.rnn_minute_datas.push_back(data.minute_data_all[j]); + // calculate lod + data.lod1.push_back(data.lod1.back() + + data.link_step_data_all[j].size()); + data.lod3.push_back(data.lod3.back() + 1); + for (size_t i = 1; i < data.link_step_data_all[j].size() + 1; i++) { + data.lod2.push_back(data.lod2.back() + + data.link_step_data_all[j].size()); + } + } + } + batch_iter += batch_size; + return data; + } + void Load(const std::string &path) { + std::ifstream file(path); + std::string line; + int num_lines = 0; + while (std::getline(file, line)) { + num_lines++; + std::vector data; + split(line, ':', &data); + std::vector> link_step_data; + std::vector link_datas; + split(data[0], '|', &link_datas); + for (auto &step_data : link_datas) { + std::vector tmp; + split_to_float(step_data, ',', &tmp); + link_step_data.push_back(tmp); + } + // load week data + std::vector week_data; + split_to_float(data[2], ',', &week_data); + // load minute data + std::vector minute_data; + split_to_float(data[1], ',', &minute_data); + link_step_data_all.push_back(std::move(link_step_data)); + week_data_all.push_back(std::move(week_data)); + minute_data_all.push_back(std::move(minute_data)); + } + num_samples = num_lines; + } +}; + +void PrepareInputs(std::vector *input_slots, DataRecord *data, + int batch_size) { + PaddleTensor lod_attention_tensor, init_zero_tensor, lod_tensor_tensor, + week_tensor, minute_tensor; + lod_attention_tensor.name = "data_lod_attention"; + init_zero_tensor.name = "cell_init"; + lod_tensor_tensor.name = "data"; + week_tensor.name = "week"; + minute_tensor.name = "minute"; + auto one_batch = data->NextBatch(); + std::vector rnn_link_data_shape( + {static_cast(one_batch.rnn_link_data.size()), + static_cast(one_batch.rnn_link_data.front().size())}); + lod_attention_tensor.shape.assign({1, 2}); + lod_attention_tensor.lod.assign({one_batch.lod1, one_batch.lod2}); + init_zero_tensor.shape.assign({batch_size, 15}); + init_zero_tensor.lod.assign({one_batch.lod3}); + lod_tensor_tensor.shape = rnn_link_data_shape; + lod_tensor_tensor.lod.assign({one_batch.lod1}); + // clang-format off + week_tensor.shape.assign( + {static_cast(one_batch.rnn_week_datas.size()), + static_cast(one_batch.rnn_week_datas.front().size())}); + week_tensor.lod.assign({one_batch.lod3}); + minute_tensor.shape.assign( + {static_cast(one_batch.rnn_minute_datas.size()), + static_cast(one_batch.rnn_minute_datas.front().size())}); + minute_tensor.lod.assign({one_batch.lod3}); + // clang-format on + // assign data + TensorAssignData(&lod_attention_tensor, + std::vector>({{0, 0}})); + std::vector tmp_zeros(batch_size * 15, 0.); + TensorAssignData(&init_zero_tensor, {tmp_zeros}); + TensorAssignData(&lod_tensor_tensor, one_batch.rnn_link_data); + TensorAssignData(&week_tensor, one_batch.rnn_week_datas); + TensorAssignData(&minute_tensor, one_batch.rnn_minute_datas); + // Set inputs. + auto init_zero_tensor1 = init_zero_tensor; + init_zero_tensor1.name = "hidden_init"; + input_slots->assign({week_tensor, init_zero_tensor, minute_tensor, + init_zero_tensor1, lod_attention_tensor, + lod_tensor_tensor}); + for (auto &tensor : *input_slots) { + tensor.dtype = PaddleDType::FLOAT32; + } +} + +void PrepareZeroCopyInputs(ZeroCopyTensor *lod_attention_tensor, + ZeroCopyTensor *cell_init_tensor, + ZeroCopyTensor *data_tensor, + ZeroCopyTensor *hidden_init_tensor, + ZeroCopyTensor *week_tensor, + ZeroCopyTensor *minute_tensor, + DataRecord *data_record, int batch_size) { + auto one_batch = data_record->NextBatch(); + std::vector rnn_link_data_shape( + {static_cast(one_batch.rnn_link_data.size()), + static_cast(one_batch.rnn_link_data.front().size())}); + lod_attention_tensor->Reshape({1, 2}); + lod_attention_tensor->SetLoD({one_batch.lod1, one_batch.lod2}); + + cell_init_tensor->Reshape({batch_size, 15}); + cell_init_tensor->SetLoD({one_batch.lod3}); + + hidden_init_tensor->Reshape({batch_size, 15}); + hidden_init_tensor->SetLoD({one_batch.lod3}); + + data_tensor->Reshape(rnn_link_data_shape); + data_tensor->SetLoD({one_batch.lod1}); + + week_tensor->Reshape( + {static_cast(one_batch.rnn_week_datas.size()), + static_cast(one_batch.rnn_week_datas.front().size())}); + week_tensor->SetLoD({one_batch.lod3}); + + minute_tensor->Reshape( + {static_cast(one_batch.rnn_minute_datas.size()), + static_cast(one_batch.rnn_minute_datas.front().size())}); + minute_tensor->SetLoD({one_batch.lod3}); + + // assign data + float arr0[] = {0, 0}; + std::vector zeros(batch_size * 15, 0); + std::copy_n(arr0, 2, + lod_attention_tensor->mutable_data(PaddlePlace::kCPU)); + std::copy_n(arr0, 2, data_tensor->mutable_data(PaddlePlace::kCPU)); + std::copy_n(zeros.begin(), zeros.size(), + cell_init_tensor->mutable_data(PaddlePlace::kCPU)); + std::copy_n(zeros.begin(), zeros.size(), + hidden_init_tensor->mutable_data(PaddlePlace::kCPU)); + ZeroCopyTensorAssignData(data_tensor, one_batch.rnn_link_data); + ZeroCopyTensorAssignData(week_tensor, one_batch.rnn_week_datas); + ZeroCopyTensorAssignData(minute_tensor, one_batch.rnn_minute_datas); +} + +void SetConfig(AnalysisConfig *cfg) { + cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param"); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); + if (FLAGS_zero_copy) { + cfg->SwitchUseFeedFetchOps(false); + } +} + +void SetInput(std::vector> *inputs) { + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + std::vector input_slots; + int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1; + LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size; + for (int bid = 0; bid < epoch; ++bid) { + PrepareInputs(&input_slots, &data, FLAGS_batch_size); + (*inputs).emplace_back(input_slots); + } +} + +// Easy for profiling independently. +TEST(Analyzer_rnn1, profile) { + AnalysisConfig cfg; + SetConfig(&cfg); + cfg.DisableGpu(); + cfg.SwitchIrDebug(); + std::vector> outputs; + + std::vector> input_slots_all; + SetInput(&input_slots_all); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); +} + +// Check the fuse status +TEST(Analyzer_rnn1, fuse_statis) { + AnalysisConfig cfg; + SetConfig(&cfg); + + int num_ops; + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); + ASSERT_TRUE(fuse_statis.count("fc_fuse")); + EXPECT_EQ(fuse_statis.at("fc_fuse"), 1); + EXPECT_EQ(fuse_statis.at("fc_nobias_lstm_fuse"), 2); // bi-directional LSTM + EXPECT_EQ(fuse_statis.at("seq_concat_fc_fuse"), 1); + EXPECT_EQ(num_ops, + 13); // After graph optimization, only 13 operators exists. +} + +// Compare result of NativeConfig and AnalysisConfig +TEST(Analyzer_rnn1, compare) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); +} + +// Compare Deterministic result +TEST(Analyzer_rnn1, compare_determine) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareDeterministic(reinterpret_cast(&cfg), + input_slots_all); +} + +// Test Multi-Thread. +TEST(Analyzer_rnn1, multi_thread) { + AnalysisConfig cfg; + SetConfig(&cfg); + std::vector> outputs; + + std::vector> input_slots_all; + SetInput(&input_slots_all); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, 2 /* multi_thread */); +} + +// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy +TEST(Analyzer_rnn1, compare_zero_copy) { + AnalysisConfig cfg; + SetConfig(&cfg); + + AnalysisConfig cfg1; + SetConfig(&cfg1); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + std::vector outputs_name; + outputs_name.emplace_back("final_output.tmp_1"); + CompareAnalysisAndZeroCopy(reinterpret_cast(&cfg), + reinterpret_cast(&cfg1), + input_slots_all, outputs_name); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc new file mode 100644 index 00000000..9ccbf58c --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc @@ -0,0 +1,173 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +using namespace framework; // NOLINT +static std::vector result_data; + +struct DataRecord { + std::vector>> link_step_data_all; + std::vector lod; + std::vector> rnn_link_data; + size_t num_samples; // total number of samples + size_t batch_iter{0}; + size_t batch_size{1}; + DataRecord() = default; + explicit DataRecord(const std::string &path, int batch_size = 1) + : batch_size(batch_size) { + Load(path); + } + DataRecord NextBatch() { + DataRecord data; + size_t batch_end = batch_iter + batch_size; + // NOTE skip the final batch, if no enough data is provided. + if (batch_end <= link_step_data_all.size()) { + data.link_step_data_all.assign(link_step_data_all.begin() + batch_iter, + link_step_data_all.begin() + batch_end); + // Prepare LoDs + data.lod.push_back(0); + CHECK(!data.link_step_data_all.empty()) << "empty"; + for (size_t j = 0; j < data.link_step_data_all.size(); j++) { + for (const auto &d : data.link_step_data_all[j]) { + data.rnn_link_data.push_back(d); + // calculate lod + data.lod.push_back(data.lod.back() + 11); + } + } + } + batch_iter += batch_size; + return data; + } + void Load(const std::string &path) { + std::ifstream file(path); + std::string line; + int num_lines = 0; + result_data.clear(); + while (std::getline(file, line)) { + num_lines++; + std::vector data; + split(line, ':', &data); + if (num_lines % 2) { // feature + std::vector feature_data; + split(data[1], ' ', &feature_data); + std::vector> link_step_data; + int feature_count = 1; + std::vector feature; + for (auto &step_data : feature_data) { + std::vector tmp; + split_to_float(step_data, ',', &tmp); + feature.insert(feature.end(), tmp.begin(), tmp.end()); + if (feature_count % 11 == 0) { // each sample has 11 features + link_step_data.push_back(feature); + feature.clear(); + } + feature_count++; + } + link_step_data_all.push_back(std::move(link_step_data)); + } else { // result + std::vector tmp; + split_to_float(data[1], ',', &tmp); + result_data.insert(result_data.end(), tmp.begin(), tmp.end()); + } + } + num_samples = num_lines / 2; + } +}; +void PrepareInputs(std::vector *input_slots, DataRecord *data, + int batch_size) { + PaddleTensor feed_tensor; + feed_tensor.name = "feed"; + auto one_batch = data->NextBatch(); + int token_size = one_batch.rnn_link_data.size(); + // each token has 11 features, each feature's dim is 54. + std::vector rnn_link_data_shape({token_size * 11, 54}); + feed_tensor.shape = rnn_link_data_shape; + feed_tensor.lod.assign({one_batch.lod}); + feed_tensor.dtype = PaddleDType::FLOAT32; + TensorAssignData(&feed_tensor, one_batch.rnn_link_data); + // Set inputs. + input_slots->assign({feed_tensor}); +} + +void SetConfig(AnalysisConfig *cfg) { + cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param"); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); +} + +void SetInput(std::vector> *inputs) { + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + std::vector input_slots; + int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1; + LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size; + for (int bid = 0; bid < epoch; ++bid) { + PrepareInputs(&input_slots, &data, FLAGS_batch_size); + (*inputs).emplace_back(input_slots); + } +} + +// Easy for profiling independently. +TEST(Analyzer_rnn2, profile) { + AnalysisConfig cfg; + SetConfig(&cfg); + std::vector> outputs; + + std::vector> input_slots_all; + SetInput(&input_slots_all); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); + + if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { + // the first inference result + PADDLE_ENFORCE_GT(outputs.size(), 0); + auto output = outputs.back(); + PADDLE_ENFORCE_GT(output.size(), 0); + size_t size = GetSize(output[0]); + PADDLE_ENFORCE_GT(size, 0); + float *result = static_cast(output[0].data.data()); + for (size_t i = 0; i < size; i++) { + EXPECT_NEAR(result[i], result_data[i], 1e-3); + } + } +} + +// Compare result of NativeConfig and AnalysisConfig +TEST(Analyzer_rnn2, compare) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); +} + +// Compare Deterministic result +TEST(Analyzer_rnn2, compare_determine) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareDeterministic(reinterpret_cast(&cfg), + input_slots_all); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_save_model_tester.cc b/paddle/fluid/inference/tests/api/analyzer_save_model_tester.cc new file mode 100644 index 00000000..977b2ec8 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_save_model_tester.cc @@ -0,0 +1,57 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +void SetConfig(AnalysisConfig *cfg) { + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(true); + cfg->SwitchIrDebug(); +} + +int GetNumOps(const AnalysisConfig &cfg) { + int num_ops; + auto predictor = CreatePaddlePredictor(cfg); + GetFuseStatis(static_cast(predictor.get()), &num_ops); + return num_ops; +} + +TEST(Analyzer, save_model) { + AnalysisConfig cfg; + SetConfig(&cfg); + cfg.SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param"); + // ensure the path being unique + std::string optimModelPath = FLAGS_infer_model + "/only_for_save_model_test"; + mkdir(optimModelPath.c_str(), 0777); + SaveOptimModel(&cfg, optimModelPath); + + // Each config can only be applied to one predictor. + AnalysisConfig cfg2; + SetConfig(&cfg2); + cfg2.pass_builder()->ClearPasses(); + cfg2.SetModel(optimModelPath + "/model", optimModelPath + "/params"); + int origin_num_ops = GetNumOps(cfg2); + + AnalysisConfig cfg3; + SetConfig(&cfg3); + cfg3.SetModel(optimModelPath + "/model", optimModelPath + "/params"); + int fused_num_ops = GetNumOps(cfg3); + CHECK_LE(fused_num_ops, origin_num_ops); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc new file mode 100644 index 00000000..5ee848c3 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc @@ -0,0 +1,175 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +struct DataRecord { + std::vector> title1, title2, title3, l1; + std::vector lod1, lod2, lod3, l1_lod; + size_t batch_iter{0}, batch_size{1}, num_samples; // total number of samples + DataRecord() = default; + explicit DataRecord(const std::string &path, int batch_size = 1) + : batch_size(batch_size) { + Load(path); + } + DataRecord NextBatch() { + DataRecord data; + size_t batch_end = batch_iter + batch_size; + // NOTE skip the final batch, if no enough data is provided. + if (batch_end <= title1.size()) { + GetInputPerBatch(title1, &data.title1, &data.lod1, batch_iter, batch_end); + GetInputPerBatch(title2, &data.title2, &data.lod2, batch_iter, batch_end); + GetInputPerBatch(title3, &data.title3, &data.lod3, batch_iter, batch_end); + GetInputPerBatch(l1, &data.l1, &data.l1_lod, batch_iter, batch_end); + } + batch_iter += batch_size; + return data; + } + void Load(const std::string &path) { + std::ifstream file(path); + std::string line; + int num_lines = 0; + while (std::getline(file, line)) { + num_lines++; + std::vector data; + split(line, '\t', &data); + PADDLE_ENFORCE(data.size() >= 4); + // load title1 data + std::vector title1_data; + split_to_int64(data[0], ' ', &title1_data); + // load title2 data + std::vector title2_data; + split_to_int64(data[1], ' ', &title2_data); + // load title3 data + std::vector title3_data; + split_to_int64(data[2], ' ', &title3_data); + // load l1 data + std::vector l1_data; + split_to_int64(data[3], ' ', &l1_data); + title1.push_back(std::move(title1_data)); + title2.push_back(std::move(title2_data)); + title3.push_back(std::move(title3_data)); + l1.push_back(std::move(l1_data)); + } + num_samples = num_lines; + } +}; + +void PrepareInputs(std::vector *input_slots, DataRecord *data, + int batch_size) { + PaddleTensor title1_tensor, title2_tensor, title3_tensor, l1_tensor; + title1_tensor.name = "title1"; + title2_tensor.name = "title2"; + title3_tensor.name = "title3"; + l1_tensor.name = "l1"; + auto one_batch = data->NextBatch(); + // assign data + TensorAssignData(&title1_tensor, one_batch.title1, one_batch.lod1); + TensorAssignData(&title2_tensor, one_batch.title2, one_batch.lod2); + TensorAssignData(&title3_tensor, one_batch.title3, one_batch.lod3); + TensorAssignData(&l1_tensor, one_batch.l1, one_batch.l1_lod); + // Set inputs. + input_slots->assign({title1_tensor, title2_tensor, title3_tensor, l1_tensor}); + for (auto &tensor : *input_slots) { + tensor.dtype = PaddleDType::INT64; + } +} + +void SetConfig(AnalysisConfig *cfg) { + cfg->SetModel(FLAGS_infer_model); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); +} + +void SetInput(std::vector> *inputs) { + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + std::vector input_slots; + int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1; + LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size; + for (int bid = 0; bid < epoch; ++bid) { + PrepareInputs(&input_slots, &data, FLAGS_batch_size); + (*inputs).emplace_back(input_slots); + } +} + +// Easy for profiling independently. +TEST(Analyzer_seq_conv1, profile) { + AnalysisConfig cfg; + SetConfig(&cfg); + std::vector> outputs; + + std::vector> input_slots_all; + SetInput(&input_slots_all); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); + + if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { + // the first inference result + PADDLE_ENFORCE_GT(outputs.size(), 0); + auto output = outputs.back(); + PADDLE_ENFORCE_EQ(output.size(), 1UL); + size_t size = GetSize(output[0]); + PADDLE_ENFORCE_GT(size, 0); + float *result = static_cast(output[0].data.data()); + // output is probability, which is in (0, 1). + for (size_t i = 0; i < size; i++) { + EXPECT_GT(result[i], 0); + EXPECT_LT(result[i], 1); + } + } +} + +// Check the fuse status +TEST(Analyzer_seq_conv1, fuse_statis) { + AnalysisConfig cfg; + SetConfig(&cfg); + int num_ops; + auto predictor = CreatePaddlePredictor(cfg); + + auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops); + ASSERT_TRUE(fuse_statis.count("fc_fuse")); + ASSERT_TRUE(fuse_statis.count("seqconv_eltadd_relu_fuse")); + EXPECT_EQ(fuse_statis.at("fc_fuse"), 2); + EXPECT_EQ(fuse_statis.at("seqconv_eltadd_relu_fuse"), 6); + EXPECT_EQ(num_ops, 32); +} + +// Compare result of NativeConfig and AnalysisConfig +TEST(Analyzer_seq_conv1, compare) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); +} + +// Compare Deterministic result +TEST(Analyzer_seq_conv1, compare_determine) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareDeterministic(reinterpret_cast(&cfg), + input_slots_all); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc new file mode 100644 index 00000000..e6f2bfad --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc @@ -0,0 +1,232 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +// diff: similarity_norm.tmp_0, for speed: fc_4.tmp_1 +static const char out_var_name[] = "reduce_sum_0.tmp_0"; + +// for diff: 154, for speed 111 +constexpr int num_slots = 154; + +struct OneSlotInBatch { + std::string name; + std::vector> data; + std::vector shape; + std::vector lod; +}; + +struct DataRecord { + std::vector> batched_data; + std::map>> datasets; + size_t batch_iter{0}, num_samples; // total number of samples + + DataRecord() = default; + explicit DataRecord(const std::string &path, int batch_size = 1) { + Load(path); + Prepare(batch_size); + } + + void Load(const std::string &path) { + std::ifstream file(path); + std::string line; + int num_lines = 0; + while (std::getline(file, line)) { + num_lines++; + std::vector data; + split(line, '\t', &data); + std::vector slot_data; + split_to_float(data[1], ' ', &slot_data); + std::string name = data[0]; + PADDLE_ENFORCE_EQ(slot_data.size() % 11, 0UL, + "line %d, %s should be divisible", num_lines, name); + datasets[name].emplace_back(std::move(slot_data)); + } + num_samples = num_lines / num_slots; + PADDLE_ENFORCE_EQ(num_samples * num_slots, static_cast(num_lines), + "num samples should be divisible"); + PADDLE_ENFORCE_GT(num_samples, 0UL); + } + + void Prepare(int bs) { + for (auto it = datasets.begin(); it != datasets.end(); ++it) { + PADDLE_ENFORCE_EQ(it->second.size(), num_samples, + "size of each slot should be equal"); + } + size_t num_batches = num_samples / bs; + EXPECT_GT(num_batches, 0); + batched_data.resize(num_batches); + for (auto &one_batch : batched_data) { + one_batch.resize(datasets.size()); + size_t i = 0; + for (auto it = datasets.begin(); it != datasets.end(); ++it) { + auto &slot = one_batch[i]; + slot.name = it->first; + slot.data.resize(bs); + slot.lod.resize(bs + 1); + slot.lod[0] = 0; + auto &lod = slot.lod; + auto &datas = it->second; + for (int k = 0; k < bs; ++k) { + size_t id = k + batch_iter * bs; + std::copy(datas[id].begin(), datas[id].end(), + std::back_inserter(slot.data[k])); + size_t len = datas[id].size() / 11; + PADDLE_ENFORCE_EQ(len * 11, datas[id].size(), + "%s %d size should be divisible", slot.name, id); + lod[k + 1] = lod[k] + len; + } + slot.shape.assign({static_cast(lod[bs]), 11}); + i++; + } + } + } + + const std::vector &NextBatch() { + if (batch_iter >= batched_data.size() - 1) { + batch_iter = -1; + } + return batched_data[++batch_iter]; + } +}; + +static void TensorAssignSlot(PaddleTensor *tensor, const OneSlotInBatch &slot) { + tensor->name = slot.name + "_embed"; + tensor->shape = slot.shape; + tensor->dtype = PaddleDType::FLOAT32; + tensor->lod.clear(); + tensor->lod.emplace_back(slot.lod); + TensorAssignData(tensor, slot.data); +} + +void PrepareInputs(std::vector *input_slots, DataRecord *data) { + const auto &one_batch = data->NextBatch(); + input_slots->resize(one_batch.size()); + for (size_t i = 0; i < one_batch.size(); ++i) { + auto &slot = one_batch[i]; + TensorAssignSlot(&((*input_slots)[i]), slot); + } +} + +void SetInput(std::vector> *inputs) { + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + std::vector input_slots; + int epoch = FLAGS_test_all_data ? data.batched_data.size() : 1; + LOG(INFO) << "number of samples: " + << data.batched_data.size() * FLAGS_batch_size; + for (int bid = 0; bid < epoch; ++bid) { + PrepareInputs(&input_slots, &data); + (*inputs).emplace_back(input_slots); + } +} + +void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false) { + cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params"); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrDebug(); + cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); + if (FLAGS_zero_copy) { + cfg->SwitchUseFeedFetchOps(false); + } + if (use_mkldnn) { + cfg->EnableMKLDNN(); + cfg->pass_builder()->AppendPass("fc_mkldnn_pass"); + } + // Enable seqpool_concat_fuse_pass, disabled by default since it takes much + // time + cfg->pass_builder()->InsertPass(2, "seqpool_concat_fuse_pass"); +} + +void profile(bool use_mkldnn = false) { + AnalysisConfig cfg; + SetConfig(&cfg, use_mkldnn); + + std::vector> outputs; + std::vector> input_slots_all; + SetInput(&input_slots_all); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); +} + +TEST(Analyzer_seq_pool1, profile) { profile(); } + +// Compare result of NativeConfig and AnalysisConfig +TEST(Analyzer_seq_pool1, compare) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); +} + +// Compare Deterministic result +TEST(Analyzer_seq_pool1, compare_determine) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareDeterministic(reinterpret_cast(&cfg), + input_slots_all); +} + +// Check the fuse status +TEST(Analyzer_seq_pool1, fuse_statis) { + AnalysisConfig cfg; + SetConfig(&cfg); + int num_ops; + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis(predictor.get(), &num_ops); + ASSERT_TRUE(fuse_statis.count("fc_fuse")); + ASSERT_TRUE(fuse_statis.count("seqpool_concat_fuse")); + ASSERT_TRUE(fuse_statis.count("squared_mat_sub_fuse")); + ASSERT_TRUE(fuse_statis.count("repeated_fc_relu_fuse")); + ASSERT_EQ(fuse_statis.at("fc_fuse"), 10); + EXPECT_EQ(fuse_statis.at("seqpool_concat_fuse"), 2); + EXPECT_EQ(fuse_statis.at("squared_mat_sub_fuse"), 2); + EXPECT_EQ(fuse_statis.at("repeated_fc_relu_fuse"), 2); + LOG(INFO) << "num_ops: " << num_ops; + EXPECT_EQ(num_ops, 171); +} + +// Compare result of AnalysisConfig and AnalysisConfig + ZeroCopy +TEST(Analyzer_seq_pool1, compare_zero_copy) { + AnalysisConfig cfg; + SetConfig(&cfg); + + AnalysisConfig cfg1; + SetConfig(&cfg1); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + std::vector outputs_name; + outputs_name.emplace_back(out_var_name); + CompareAnalysisAndZeroCopy(reinterpret_cast(&cfg), + reinterpret_cast(&cfg1), + input_slots_all, outputs_name); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc new file mode 100644 index 00000000..78e500b2 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc @@ -0,0 +1,142 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +struct DataReader { + explicit DataReader(const std::string &path) + : file(new std::ifstream(path)) {} + + bool NextBatch(std::vector *input, int batch_size) { + PADDLE_ENFORCE_EQ(batch_size, 1); + std::string line; + PaddleTensor tensor; + tensor.dtype = PaddleDType::INT64; + tensor.lod.emplace_back(std::vector({0})); + std::vector data; + + for (int i = 0; i < batch_size; i++) { + if (!std::getline(*file, line)) return false; + inference::split_to_int64(line, ' ', &data); + } + tensor.lod.front().push_back(data.size()); + + tensor.data.Resize(data.size() * sizeof(int64_t)); + CHECK(tensor.data.data() != nullptr); + CHECK(data.data() != nullptr); + memcpy(tensor.data.data(), data.data(), data.size() * sizeof(int64_t)); + tensor.shape.push_back(data.size()); + tensor.shape.push_back(1); + input->assign({tensor}); + return true; + } + + std::unique_ptr file; +}; + +void SetConfig(AnalysisConfig *cfg) { + cfg->SetModel(FLAGS_infer_model); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); +} + +void SetInput(std::vector> *inputs) { + std::vector input_slots; + DataReader reader(FLAGS_infer_data); + int num_batches = 0; + while (reader.NextBatch(&input_slots, FLAGS_batch_size)) { + (*inputs).emplace_back(input_slots); + ++num_batches; + if (!FLAGS_test_all_data) return; + } + LOG(INFO) << "total number of samples: " << num_batches * FLAGS_batch_size; +} + +// Easy for profiling independently. +TEST(Analyzer_Text_Classification, profile) { + AnalysisConfig cfg; + SetConfig(&cfg); + cfg.SwitchIrDebug(); + std::vector> outputs; + + std::vector> input_slots_all; + SetInput(&input_slots_all); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); + + if (FLAGS_num_threads == 1) { + // Get output + PADDLE_ENFORCE_GT(outputs.size(), 0); + LOG(INFO) << "get outputs " << outputs.back().size(); + for (auto &output : outputs.back()) { + LOG(INFO) << "output.shape: " << to_string(output.shape); + // no lod ? + CHECK_EQ(output.lod.size(), 0UL); + LOG(INFO) << "output.dtype: " << output.dtype; + std::stringstream ss; + int num_data = 1; + for (auto i : output.shape) { + num_data *= i; + } + + for (int i = 0; i < num_data; i++) { + ss << static_cast(output.data.data())[i] << " "; + } + LOG(INFO) << "output.data summary: " << ss.str(); + // one batch ends + } + } +} + +// Compare result of NativeConfig and AnalysisConfig +TEST(Analyzer_Text_Classification, compare) { + AnalysisConfig cfg; + SetConfig(&cfg); + cfg.EnableMemoryOptim(); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); +} + +// Compare Deterministic result +TEST(Analyzer_Text_Classification, compare_determine) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareDeterministic(reinterpret_cast(&cfg), + input_slots_all); +} + +TEST(Analyzer_Text_Classification, compare_against_embedding_fc_lstm_fused) { + AnalysisConfig cfg; + SetConfig(&cfg); + // Enable embedding_fc_lstm_fuse_pass (disabled by default) + cfg.pass_builder()->InsertPass(2, "embedding_fc_lstm_fuse_pass"); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc b/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc new file mode 100644 index 00000000..f2195966 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc @@ -0,0 +1,238 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +struct DataRecord { + std::vector> src_word, src_pos, trg_word, init_idx; + std::vector> src_slf_attn_bias, init_score, + trg_src_attn_bias; + std::vector> batch_data_shape; + std::vector> lod; + size_t batch_iter{0}, batch_size{1}, num_samples; // total number of samples + DataRecord() = default; + explicit DataRecord(const std::string &path, int batch_size = 1) + : batch_size(batch_size) { + Load(path); + } + DataRecord NextBatch() { + DataRecord data; + size_t batch_end = batch_iter + batch_size; + // NOTE skip the final batch, if no enough data is provided. + if (batch_end <= src_word.size()) { + data.src_word.assign(src_word.begin() + batch_iter, + src_word.begin() + batch_end); + data.src_pos.assign(src_pos.begin() + batch_iter, + src_pos.begin() + batch_end); + data.src_slf_attn_bias.assign(src_slf_attn_bias.begin() + batch_iter, + src_slf_attn_bias.begin() + batch_end); + data.trg_word.assign(trg_word.begin() + batch_iter, + trg_word.begin() + batch_end); + data.init_score.assign(init_score.begin() + batch_iter, + init_score.begin() + batch_end); + data.init_idx.assign(init_idx.begin() + batch_iter, + init_idx.begin() + batch_end); + data.trg_src_attn_bias.assign(trg_src_attn_bias.begin() + batch_iter, + trg_src_attn_bias.begin() + batch_end); + std::vector batch_shape = + *(batch_data_shape.begin() + batch_iter); + data.batch_data_shape.push_back(batch_shape); + data.lod.resize(2); + for (int i = 0; i < batch_shape[0] + 1; i++) { + data.lod[0].push_back(i); + data.lod[1].push_back(i); + } + } + batch_iter += batch_size; + return data; + } + void Load(const std::string &path) { + std::ifstream file(path); + std::string line; + size_t num_lines = 0; + while (std::getline(file, line)) { + num_lines++; + std::vector data; + split(line, ',', &data); + CHECK_EQ(data.size(), static_cast(8)); + // load src_word + std::vector src_word_data; + split_to_int64(data[0], ' ', &src_word_data); + src_word.push_back(std::move(src_word_data)); + // load src_pos + std::vector src_pos_data; + split_to_int64(data[1], ' ', &src_pos_data); + src_pos.push_back(std::move(src_pos_data)); + // load src_slf_attn_bias + std::vector src_slf_attn_bias_data; + split_to_float(data[2], ' ', &src_slf_attn_bias_data); + src_slf_attn_bias.push_back(std::move(src_slf_attn_bias_data)); + // load trg_word + std::vector trg_word_data; + split_to_int64(data[3], ' ', &trg_word_data); + trg_word.push_back(std::move(trg_word_data)); + // load init_score + std::vector init_score_data; + split_to_float(data[4], ' ', &init_score_data); + init_score.push_back(std::move(init_score_data)); + // load init_idx + std::vector init_idx_data; + split_to_int64(data[5], ' ', &init_idx_data); + init_idx.push_back(std::move(init_idx_data)); + // load trg_src_attn_bias + std::vector trg_src_attn_bias_data; + split_to_float(data[6], ' ', &trg_src_attn_bias_data); + trg_src_attn_bias.push_back(std::move(trg_src_attn_bias_data)); + // load shape for variant data shape + std::vector batch_data_shape_data; + split_to_int(data[7], ' ', &batch_data_shape_data); + batch_data_shape.push_back(std::move(batch_data_shape_data)); + } + num_samples = num_lines; + } +}; + +void PrepareInputs(std::vector *input_slots, DataRecord *data, + int batch_size) { + auto one_batch = data->NextBatch(); + batch_size = one_batch.batch_data_shape[0][0]; + auto n_head = one_batch.batch_data_shape[0][1]; + auto trg_seq_len = one_batch.batch_data_shape[0][2]; // 1 for inference + auto src_seq_len = one_batch.batch_data_shape[0][3]; + + PaddleTensor src_word, src_pos, src_slf_attn_bias, trg_word, init_score, + init_idx, trg_src_attn_bias; + + src_word.name = "src_word"; + src_word.shape.assign({batch_size, src_seq_len, 1}); + src_word.dtype = PaddleDType::INT64; + TensorAssignData(&src_word, one_batch.src_word); + + src_pos.name = "src_pos"; + src_pos.shape.assign({batch_size, src_seq_len, 1}); + src_pos.dtype = PaddleDType::INT64; + TensorAssignData(&src_pos, one_batch.src_pos); + + src_slf_attn_bias.name = "src_slf_attn_bias"; + src_slf_attn_bias.shape.assign( + {batch_size, n_head, src_seq_len, src_seq_len}); + src_slf_attn_bias.dtype = PaddleDType::FLOAT32; + TensorAssignData(&src_slf_attn_bias, one_batch.src_slf_attn_bias); + + trg_word.name = "trg_word"; + trg_word.shape.assign({batch_size, 1}); + trg_word.dtype = PaddleDType::INT64; + trg_word.lod.assign(one_batch.lod.begin(), one_batch.lod.end()); + TensorAssignData(&trg_word, one_batch.trg_word); + + init_score.name = "init_score"; + init_score.shape.assign({batch_size, 1}); + init_score.dtype = PaddleDType::FLOAT32; + init_score.lod.assign(one_batch.lod.begin(), one_batch.lod.end()); + TensorAssignData(&init_score, one_batch.init_score); + + init_idx.name = "init_idx"; + init_idx.shape.assign({batch_size}); + init_idx.dtype = PaddleDType::INT32; + TensorAssignData(&init_idx, one_batch.init_idx); + + trg_src_attn_bias.name = "trg_src_attn_bias"; + trg_src_attn_bias.shape.assign( + {batch_size, n_head, trg_seq_len, src_seq_len}); + trg_src_attn_bias.dtype = PaddleDType::FLOAT32; + TensorAssignData(&trg_src_attn_bias, one_batch.trg_src_attn_bias); + + input_slots->assign({src_word, src_pos, src_slf_attn_bias, trg_word, + init_score, init_idx, trg_src_attn_bias}); +} + +void SetConfig(AnalysisConfig *cfg) { + cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params"); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); + cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); +} + +void SetInput(std::vector> *inputs) { + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + std::vector input_slots; + int test_batch_num = + FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1; + LOG(INFO) << "The number of samples to be test: " + << test_batch_num * FLAGS_batch_size; + for (int bid = 0; bid < test_batch_num; ++bid) { + input_slots.clear(); + PrepareInputs(&input_slots, &data, FLAGS_batch_size); + (*inputs).emplace_back(input_slots); + } +} + +// Easy for profiling independently. +void profile(bool use_mkldnn = false) { + AnalysisConfig cfg; + SetConfig(&cfg); + std::vector> outputs; + if (use_mkldnn) { + cfg.EnableMKLDNN(); + cfg.pass_builder()->AppendPass("fc_mkldnn_pass"); + } + + std::vector> input_slots_all; + SetInput(&input_slots_all); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); +} + +TEST(Analyzer_Transformer, profile) { profile(); } +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_Transformer, profile_mkldnn) { profile(true); } +#endif + +// Check the fuse status +TEST(Analyzer_Transformer, fuse_statis) { + AnalysisConfig cfg; + SetConfig(&cfg); + + int num_ops; + auto predictor = CreatePaddlePredictor(cfg); + auto fuse_statis = GetFuseStatis( + static_cast(predictor.get()), &num_ops); +} + +// Compare result of NativeConfig and AnalysisConfig +void compare(bool use_mkldnn = false) { + AnalysisConfig cfg; + SetConfig(&cfg); + if (use_mkldnn) { + cfg.EnableMKLDNN(); + cfg.pass_builder()->AppendPass("fc_mkldnn_pass"); + } + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); +} + +TEST(Analyzer_Transformer, compare) { compare(); } +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_Transformer, compare_mkldnn) { compare(true /* use_mkldnn */); } +#endif + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc new file mode 100644 index 00000000..5f65229e --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -0,0 +1,163 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +struct Record { + std::vector data; + std::vector shape; +}; + +Record ProcessALine(const std::string &line) { + VLOG(3) << "process a line"; + std::vector columns; + split(line, '\t', &columns); + CHECK_EQ(columns.size(), 2UL) + << "data format error, should be \t"; + + Record record; + std::vector data_strs; + split(columns[0], ' ', &data_strs); + for (auto &d : data_strs) { + record.data.push_back(std::stof(d)); + } + + std::vector shape_strs; + split(columns[1], ' ', &shape_strs); + for (auto &s : shape_strs) { + record.shape.push_back(std::stoi(s)); + } + VLOG(3) << "data size " << record.data.size(); + VLOG(3) << "data shape size " << record.shape.size(); + return record; +} + +void SetConfig(AnalysisConfig *cfg) { + cfg->SetModel(FLAGS_infer_model + "/__model__", + FLAGS_infer_model + "/__params__"); + cfg->DisableGpu(); + cfg->SwitchIrDebug(); + cfg->SwitchSpecifyInputNames(false); + // TODO(TJ): fix fusion gru + cfg->pass_builder()->DeletePass("fc_gru_fuse_pass"); +} + +void SetInput(std::vector> *inputs) { + PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data."); + std::string line; + std::ifstream file(FLAGS_infer_data); + std::getline(file, line); + auto record = ProcessALine(line); + + PaddleTensor input; + input.shape = record.shape; + input.dtype = PaddleDType::FLOAT32; + size_t input_size = record.data.size() * sizeof(float); + input.data.Resize(input_size); + memcpy(input.data.data(), record.data.data(), input_size); + std::vector input_slots; + input_slots.assign({input}); + (*inputs).emplace_back(input_slots); +} + +// Easy for profiling independently. +// ocr, mobilenet and se_resnext50 +void profile(bool use_mkldnn = false) { + AnalysisConfig cfg; + SetConfig(&cfg); + if (use_mkldnn) { + cfg.EnableMKLDNN(); + cfg.pass_builder()->AppendPass("fc_mkldnn_pass"); + } + // cfg.pass_builder()->TurnOnDebug(); + std::vector> outputs; + + std::vector> input_slots_all; + SetInput(&input_slots_all); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); + if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { + std::string line; + std::ifstream file(FLAGS_refer_result); + std::getline(file, line); + auto refer = ProcessALine(line); + file.close(); + + PADDLE_ENFORCE_GT(outputs.size(), 0); + auto &output = outputs.back().front(); + size_t numel = output.data.length() / PaddleDtypeSize(output.dtype); + CHECK_EQ(numel, refer.data.size()); + for (size_t i = 0; i < numel; ++i) { + EXPECT_NEAR(static_cast(output.data.data())[i], refer.data[i], + 1e-5); + } + } +} + +TEST(Analyzer_vis, profile) { profile(); } + +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_vis, profile_mkldnn) { profile(true /* use_mkldnn */); } +#endif + +// Check the fuse status +TEST(Analyzer_vis, fuse_statis) { + AnalysisConfig cfg; + SetConfig(&cfg); + int num_ops; + auto predictor = CreatePaddlePredictor(cfg); + GetFuseStatis(predictor.get(), &num_ops); +} + +// Compare result of NativeConfig and AnalysisConfig +void compare(bool use_mkldnn = false) { + AnalysisConfig cfg; + SetConfig(&cfg); + if (use_mkldnn) { + cfg.EnableMKLDNN(); + cfg.pass_builder()->AppendPass("fc_mkldnn_pass"); + } + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); +} + +TEST(Analyzer_vis, compare) { compare(); } +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_vis, compare_mkldnn) { compare(true /* use_mkldnn */); } +#endif + +// Compare Deterministic result +TEST(Analyzer_vis, compare_determine) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareDeterministic(reinterpret_cast(&cfg), + input_slots_all); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h new file mode 100644 index 00000000..de938669 --- /dev/null +++ b/paddle/fluid/inference/tests/api/config_printer.h @@ -0,0 +1,89 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/inference/api/paddle_inference_api.h" + +namespace paddle { +namespace inference { + +thread_local int num_spaces = 0; + +static std::string GenSpaces(int num_spaces) { + std::ostringstream os; + for (int i = 0; i < num_spaces; ++i) { + os << " "; + } + return os.str(); +} + +std::ostream &operator<<(std::ostream &os, + const PaddlePredictor::Config &config) { + os << GenSpaces(num_spaces) << "PaddlePredictor::Config {\n"; + num_spaces++; + os << GenSpaces(num_spaces) << "model_dir: " << config.model_dir << "\n"; + num_spaces--; + os << GenSpaces(num_spaces) << "}\n"; + return os; +} + +std::ostream &operator<<(std::ostream &os, const NativeConfig &config) { + os << GenSpaces(num_spaces) << "NativeConfig {\n"; + num_spaces++; + os << *reinterpret_cast(&config); + os << GenSpaces(num_spaces) << "use_gpu: " << config.use_gpu << "\n"; + os << GenSpaces(num_spaces) << "device: " << config.device << "\n"; + os << GenSpaces(num_spaces) + << "fraction_of_gpu_memory: " << config.fraction_of_gpu_memory << "\n"; + os << GenSpaces(num_spaces) + << "specify_input_name: " << config.specify_input_name << "\n"; + num_spaces--; + os << GenSpaces(num_spaces) << "}\n"; + return os; +} + +std::ostream &operator<<(std::ostream &os, const AnalysisConfig &config) { + os << GenSpaces(num_spaces) << "AnalysisConfig {\n"; + num_spaces++; + os << config.ToNativeConfig(); + if (!config.model_from_memory()) { + os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file() << "\n"; + os << GenSpaces(num_spaces) << "param_file: " << config.params_file() + << "\n"; + } else { + os << GenSpaces(num_spaces) + << "prog_file and param_file: load from memory \n"; + } + os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim() + << "\n"; + os << GenSpaces(num_spaces) + << "cpu_num_threads: " << config.cpu_math_library_num_threads() << "\n"; + os << GenSpaces(num_spaces) + << "use_feed_fetch_ops: " << config.use_feed_fetch_ops_enabled() << "\n"; + os << GenSpaces(num_spaces) + << "use_tensorrt: " << config.tensorrt_engine_enabled() << "\n"; + os << GenSpaces(num_spaces) << "use_mkldnn: " << config.mkldnn_enabled() + << "\n"; + os << GenSpaces(num_spaces) << "use_ngraph: " << config.ngraph_enabled() + << "\n"; + num_spaces--; + os << GenSpaces(num_spaces) << "}\n"; + return os; +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py b/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py new file mode 100644 index 00000000..826c4531 --- /dev/null +++ b/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py @@ -0,0 +1,213 @@ +# copyright (c) 2019 paddlepaddle authors. all rights reserved. +# licensed under the apache license, version 2.0 (the "license"); +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. +import hashlib +import unittest +import os +import numpy as np +import time +import sys +import random +import functools +import contextlib +from PIL import Image +import math +from paddle.dataset.common import download +import tarfile +import StringIO + +random.seed(0) +np.random.seed(0) + +DATA_DIM = 224 +SIZE_FLOAT32 = 4 +SIZE_INT64 = 8 +FULL_SIZE_BYTES = 30106000008 +FULL_IMAGES = 50000 +TARGET_HASH = '22d2e0008dca693916d9595a5ea3ded8' +FOLDER_NAME = "ILSVRC2012/" +VALLIST_TAR_NAME = "ILSVRC2012/val_list.txt" +CHUNK_SIZE = 8192 + +img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1)) +img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1)) + + +def resize_short(img, target_size): + percent = float(target_size) / min(img.size[0], img.size[1]) + resized_width = int(round(img.size[0] * percent)) + resized_height = int(round(img.size[1] * percent)) + img = img.resize((resized_width, resized_height), Image.LANCZOS) + return img + + +def crop_image(img, target_size, center): + width, height = img.size + size = target_size + if center == True: + w_start = (width - size) / 2 + h_start = (height - size) / 2 + else: + w_start = np.random.randint(0, width - size + 1) + h_start = np.random.randint(0, height - size + 1) + w_end = w_start + size + h_end = h_start + size + img = img.crop((w_start, h_start, w_end, h_end)) + return img + + +def process_image(img): + img = resize_short(img, target_size=256) + img = crop_image(img, target_size=DATA_DIM, center=True) + if img.mode != 'RGB': + img = img.convert('RGB') + img = np.array(img).astype('float32').transpose((2, 0, 1)) / 255 + img -= img_mean + img /= img_std + return img + + +def download_concat(cache_folder, zip_path): + data_urls = [] + data_md5s = [] + data_urls.append( + 'https://paddle-inference-dist.bj.bcebos.com/int8/ILSVRC2012_img_val.tar.gz.partaa' + ) + data_md5s.append('60f6525b0e1d127f345641d75d41f0a8') + data_urls.append( + 'https://paddle-inference-dist.bj.bcebos.com/int8/ILSVRC2012_img_val.tar.gz.partab' + ) + data_md5s.append('1e9f15f64e015e58d6f9ec3210ed18b5') + file_names = [] + print("Downloading full ImageNet Validation dataset ...") + for i in range(0, len(data_urls)): + download(data_urls[i], cache_folder, data_md5s[i]) + file_name = os.path.join(cache_folder, data_urls[i].split('/')[-1]) + file_names.append(file_name) + print("Downloaded part {0}\n".format(file_name)) + if not os.path.exists(zip_path): + with open(zip_path, "w+") as outfile: + for fname in file_names: + with open(fname) as infile: + outfile.write(infile.read()) + + +def print_processbar(done_percentage): + done_filled = done_percentage * '=' + empty_filled = (100 - done_percentage) * ' ' + sys.stdout.write("\r[%s%s]%d%%" % + (done_filled, empty_filled, done_percentage)) + sys.stdout.flush() + + +def check_integrity(filename, target_hash): + print('\nThe binary file exists. Checking file integrity...\n') + md = hashlib.md5() + count = 0 + onepart = FULL_SIZE_BYTES / CHUNK_SIZE / 100 + with open(filename) as ifs: + while True: + buf = ifs.read(CHUNK_SIZE) + if count % onepart == 0: + done = count / onepart + print_processbar(done) + count = count + 1 + if not buf: + break + md.update(buf) + hash1 = md.hexdigest() + if hash1 == target_hash: + return True + else: + return False + + +def convert(tar_file, output_file): + print('Converting 50000 images to binary file ...\n') + tar = tarfile.open(name=tar_file, mode='r:gz') + + print_processbar(0) + + dataset = {} + for tarInfo in tar: + if tarInfo.isfile() and tarInfo.name != VALLIST_TAR_NAME: + dataset[tarInfo.name] = tar.extractfile(tarInfo).read() + + with open(output_file, "w+b") as ofs: + ofs.seek(0) + num = np.array(int(FULL_IMAGES)).astype('int64') + ofs.write(num.tobytes()) + + per_percentage = FULL_IMAGES / 100 + + idx = 0 + for imagedata in dataset.values(): + img = Image.open(StringIO.StringIO(imagedata)) + img = process_image(img) + np_img = np.array(img) + ofs.write(np_img.astype('float32').tobytes()) + if idx % per_percentage == 0: + print_processbar(idx / per_percentage) + idx = idx + 1 + + val_info = tar.getmember(VALLIST_TAR_NAME) + val_list = tar.extractfile(val_info).read() + + lines = val_list.split('\n') + val_dict = {} + for line_idx, line in enumerate(lines): + if line_idx == FULL_IMAGES: + break + name, label = line.split() + val_dict[name] = label + + for img_name in dataset.keys(): + remove_len = (len(FOLDER_NAME)) + img_name_prim = img_name[remove_len:] + label = val_dict[img_name_prim] + label_int = (int)(label) + np_label = np.array(label_int) + ofs.write(np_label.astype('int64').tobytes()) + print_processbar(100) + tar.close() + print("Conversion finished.") + + +def run_convert(): + print('Start to download and convert 50000 images to binary file...') + cache_folder = os.path.expanduser('~/.cache/paddle/dataset/int8/download') + zip_path = os.path.join(cache_folder, 'full_imagenet_val.tar.gz.partaa') + output_file = os.path.join(cache_folder, 'int8_full_val.bin') + retry = 0 + try_limit = 3 + + while not (os.path.exists(output_file) and + os.path.getsize(output_file) == FULL_SIZE_BYTES and + check_integrity(output_file, TARGET_HASH)): + if os.path.exists(output_file): + sys.stderr.write( + "\n\nThe existing binary file is broken. Start to generate new one...\n\n". + format(output_file)) + os.remove(output_file) + if retry < try_limit: + retry = retry + 1 + else: + raise RuntimeError( + "Can not convert the dataset to binary file with try limit {0}". + format(try_limit)) + download_concat(cache_folder, zip_path) + convert(zip_path, output_file) + print("\nSuccess! The binary file can be found at {0}".format(output_file)) + + +if __name__ == '__main__': + run_convert() diff --git a/paddle/fluid/inference/tests/api/full_pascalvoc_test_preprocess.py b/paddle/fluid/inference/tests/api/full_pascalvoc_test_preprocess.py new file mode 100644 index 00000000..2ca8e582 --- /dev/null +++ b/paddle/fluid/inference/tests/api/full_pascalvoc_test_preprocess.py @@ -0,0 +1,187 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import xml.etree.ElementTree as ET +from PIL import Image +import numpy as np +import os +import sys +from paddle.dataset.common import download +import tarfile +import StringIO +import hashlib +import tarfile + +DATA_URL = "http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar" +DATA_DIR = os.path.expanduser("~/.cache/paddle/dataset/pascalvoc/") +TAR_FILE = "VOCtest_06-Nov-2007.tar" +TAR_PATH = os.path.join(DATA_DIR, TAR_FILE) +RESIZE_H = 300 +RESIZE_W = 300 +mean_value = [127.5, 127.5, 127.5] +ap_version = '11point' +DATA_OUT = 'pascalvoc_full.bin' +DATA_OUT_PATH = os.path.join(DATA_DIR, DATA_OUT) +BIN_TARGETHASH = "f6546cadc42f5ff13178b84ed29b740b" +TAR_TARGETHASH = "b6e924de25625d8de591ea690078ad9f" +TEST_LIST_KEY = "VOCdevkit/VOC2007/ImageSets/Main/test.txt" +BIN_FULLSIZE = 5348678856 + + +def preprocess(img): + img_width, img_height = img.size + + img = img.resize((RESIZE_W, RESIZE_H), Image.ANTIALIAS) + img = np.array(img) + + # HWC to CHW + if len(img.shape) == 3: + img = np.swapaxes(img, 1, 2) + img = np.swapaxes(img, 1, 0) + # RBG to BGR + img = img[[2, 1, 0], :, :] + img = img.astype('float32') + img_mean = np.array(mean_value)[:, np.newaxis, np.newaxis].astype('float32') + img -= img_mean + img = img * 0.007843 + return img + + +def print_processbar(done_percentage): + done_filled = done_percentage * '=' + empty_filled = (100 - done_percentage) * ' ' + sys.stdout.write("\r[%s%s]%d%%" % + (done_filled, empty_filled, done_percentage)) + sys.stdout.flush() + + +def convert_pascalvoc(tar_path, data_out_path): + print("Start converting ...\n") + images = {} + gt_labels = {} + boxes = [] + lbls = [] + difficults = [] + object_nums = [] + + # map label to number (index) + label_list = [ + "background", "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", + "car", "cat", "chair", "cow", "diningtable", "dog", "horse", + "motorbike", "person", "pottedplant", "sheep", "sofa", "train", + "tvmonitor" + ] + print_processbar(0) + #read from tar file and write to bin + tar = tarfile.open(tar_path, "r") + f_test = tar.extractfile(TEST_LIST_KEY).read() + lines = f_test.split('\n') + del lines[-1] + line_len = len(lines) + per_percentage = line_len / 100 + + f1 = open(data_out_path, "w+b") + f1.seek(0) + f1.write(np.array(line_len).astype('int64').tobytes()) + for tarInfo in tar: + if tarInfo.isfile(): + tmp_filename = tarInfo.name + name_arr = tmp_filename.split('/') + name_prefix = name_arr[-1].split('.')[0] + if name_arr[-2] == 'JPEGImages' and name_prefix in lines: + images[name_prefix] = tar.extractfile(tarInfo).read() + if name_arr[-2] == 'Annotations' and name_prefix in lines: + gt_labels[name_prefix] = tar.extractfile(tarInfo).read() + + for line_idx, name_prefix in enumerate(lines): + im = Image.open(StringIO.StringIO(images[name_prefix])) + if im.mode == 'L': + im = im.convert('RGB') + im_width, im_height = im.size + + im = preprocess(im) + np_im = np.array(im) + f1.write(np_im.astype('float32').tobytes()) + + # layout: label | xmin | ymin | xmax | ymax | difficult + bbox_labels = [] + root = ET.fromstring(gt_labels[name_prefix]) + + objects = root.findall('object') + objects_size = len(objects) + object_nums.append(objects_size) + + for object in objects: + bbox_sample = [] + bbox_sample.append( + float(label_list.index(object.find('name').text))) + bbox = object.find('bndbox') + difficult = float(object.find('difficult').text) + bbox_sample.append(float(bbox.find('xmin').text) / im_width) + bbox_sample.append(float(bbox.find('ymin').text) / im_height) + bbox_sample.append(float(bbox.find('xmax').text) / im_width) + bbox_sample.append(float(bbox.find('ymax').text) / im_height) + bbox_sample.append(difficult) + bbox_labels.append(bbox_sample) + + bbox_labels = np.array(bbox_labels) + if len(bbox_labels) == 0: continue + lbls.extend(bbox_labels[:, 0]) + boxes.extend(bbox_labels[:, 1:5]) + difficults.extend(bbox_labels[:, -1]) + + if line_idx % per_percentage: + print_processbar(line_idx / per_percentage) + + f1.write(np.array(object_nums).astype('uint64').tobytes()) + f1.write(np.array(lbls).astype('int64').tobytes()) + f1.write(np.array(boxes).astype('float32').tobytes()) + f1.write(np.array(difficults).astype('int64').tobytes()) + f1.close() + print_processbar(100) + print("Conversion finished!\n") + + +def download_pascalvoc(data_url, data_dir, tar_targethash, tar_path): + print("Downloading pascalvcoc test set...") + download(data_url, data_dir, tar_targethash) + if not os.path.exists(tar_path): + print("Failed in downloading pascalvoc test set. URL %s\n" % data_url) + else: + tmp_hash = hashlib.md5(open(tar_path, 'rb').read()).hexdigest() + if tmp_hash != tar_targethash: + print("Downloaded test set is broken, removing ...\n") + else: + print("Downloaded successfully. Path: %s\n" % tar_path) + + +def run_convert(): + try_limit = 2 + retry = 0 + while not (os.path.exists(DATA_OUT_PATH) and + os.path.getsize(DATA_OUT_PATH) == BIN_FULLSIZE and BIN_TARGETHASH + == hashlib.md5(open(DATA_OUT_PATH, 'rb').read()).hexdigest()): + if os.path.exists(DATA_OUT_PATH): + sys.stderr.write( + "The existing binary file is broken. It is being removed...\n") + os.remove(DATA_OUT_PATH) + if retry < try_limit: + retry = retry + 1 + else: + download_pascalvoc(DATA_URL, DATA_DIR, TAR_TARGETHASH, TAR_PATH) + convert_pascalvoc(TAR_PATH, DATA_OUT_PATH) + print("Success! \nThe binary file can be found at %s\n" % DATA_OUT_PATH) + + +if __name__ == "__main__": + run_convert() diff --git a/paddle/fluid/inference/tests/api/int8_mkldnn_quantization.md b/paddle/fluid/inference/tests/api/int8_mkldnn_quantization.md new file mode 100644 index 00000000..4add8bb2 --- /dev/null +++ b/paddle/fluid/inference/tests/api/int8_mkldnn_quantization.md @@ -0,0 +1,105 @@ +# INT8 MKL-DNN quantization + +This document describes how to use Paddle inference Engine to convert the FP32 model to INT8 model on ResNet-50 and MobileNet-V1. We provide the instructions on enabling INT8 MKL-DNN quantization in Paddle inference and show the ResNet-50 and MobileNet-V1 results in accuracy and performance. + +## 0. Install PaddlePaddle + +Follow PaddlePaddle [installation instruction](https://github.com/PaddlePaddle/models/tree/develop/fluid/PaddleCV/image_classification#installation) to install PaddlePaddle. If you build PaddlePaddle yourself, please use the following cmake arguments. + +```bash +cmake .. -DWITH_TESTING=ON -WITH_FLUID_ONLY=ON -DWITH_GPU=OFF -DWITH_MKL=ON -DWITH_MKLDNN=ON -DWITH_INFERENCE_API_TEST=ON -DON_INFER=ON + +``` + +Note: MKL-DNN and MKL are required. + +## 1. Enable INT8 MKL-DNN quantization + +For reference, please examine the code of unit test enclosed in [analyzer_int8_image_classification_tester.cc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/tests/api/analyzer_int8_image_classification_tester.cc). + +* ### Create Analysis config + +INT8 quantization is one of the optimizations in analysis config. More information about analysis config can be found [here](https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/advanced_usage/deploy/inference/native_infer_en.md#upgrade-performance-based-on-contribanalysisconfig-prerelease) + +* ### Create quantize config by analysis config + +We enable the MKL-DNN quantization procedure by calling an appropriate method from analysis config. Afterwards, all the required quantization parameters (quantization op names, quantization strategies etc.) can be set through quantizer config which is present in the analysis config. It is also necessary to specify a pre-processed warmup dataset and desired batch size. + +```cpp +//Enable MKL-DNN quantization +cfg.EnableMkldnnQuantizer(); + +//use analysis config to call the MKL-DNN quantization config +cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data); +cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(100); +``` + +## 2. Accuracy and Performance benchmark + +We provide the results of accuracy and performance measured on Intel(R) Xeon(R) Gold 6271 on single core. + +>**Dataset: ILSVRC2012 Validation dataset** + +>**I. Top-1 Accuracy on Intel(R) Xeon(R) Gold 6271** + +| Model | FP32 Accuracy | INT8 Accuracy | Accuracy Diff(FP32-INT8) | +| :----------: | :-------------: | :------------: | :--------------: | +| GoogleNet | 70.50% | 69.81% | 0.69% | +| MobileNet-V1 | 70.78% | 70.42% | 0.36% | +| MobileNet-V2 | 71.90% | 71.35% | 0.55% | +| ResNet-101 | 77.50% | 77.42% | 0.08% | +| ResNet-50 | 76.63% | 76.52% | 0.11% | +| VGG16 | 72.08% | 72.03% | 0.05% | +| VGG19 | 72.57% | 72.55% | 0.02% | + +>**II. Throughput on Intel(R) Xeon(R) Gold 6271 (batch size 1 on single core)** + +| Model | FP32 Throughput(images/s) | INT8 Throughput(images/s) | Ratio(INT8/FP32)| +| :-----------:| :------------: | :------------: | :------------: | +| GoogleNet | 34.06 | 72.79 | 2.14 | +| MobileNet-V1 | 80.02 | 230.65 | 2.88 | +| MobileNet-V2 | 99.38 | 206.92 | 2.08 | +| ResNet-101 | 7.38 | 27.31 | 3.70 | +| ResNet-50 | 13.71 | 50.55 | 3.69 | +| VGG16 | 3.64 | 10.56 | 2.90 | +| VGG19 | 2.95 | 9.02 | 3.05 | + +Notes: + +* Measurement of accuracy requires a model which accepts two inputs: data and labels. + +* Different sampling batch size data may cause slight difference on INT8 top accuracy. +* CAPI performance data is better than python API performance data because of the python overhead. Especially for the small computational model, python overhead will be more obvious. + +## 3. Commands to reproduce the above accuracy and performance benchmark + +Two steps to reproduce the above-mentioned accuracy results, and we take GoogleNet benchmark as an example: + +* ### Prepare dataset + +Running the following commands to download and preprocess the ILSVRC2012 Validation dataset. + +```bash +cd /PATH/TO/PADDLE/build +python ../paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py +``` + +Then the ILSVRC2012 Validation dataset will be preprocessed and saved by default in `~/.cache/paddle/dataset/int8/download/int8_full_val.bin` + +* ### Commands to reproduce benchmark + +You can run `test_analyzer_int8_imagenet_classification` with the following arguments to reproduce the accuracy result on GoogleNet. + +```bash +./paddle/fluid/inference/tests/api/test_analyzer_int8_image_classification --infer_model=third_party/inference_demo/int8v2/resnet50/model --infer_data=/~/.cache/paddle/dataset/int8/download/int8_full_val.bin --batch_size=1 --paddle_num_threads=1 +``` + +To verify all the 7 models, you need to set the parameter of `--infer_model` to one of the following values in command line: + +```bash +--infer_model /PATH/TO/PADDLE/build/third_party/inference_demo/int8v2/MODEL_NAME/model +``` + +```text +MODEL_NAME=googlenet, mobilenetv1, mobilenetv2, resnet101, resnet50, vgg16, vgg19 +``` diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h new file mode 100644 index 00000000..61cf10c3 --- /dev/null +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -0,0 +1,730 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include +#include +#include +#include // NOLINT +#include +#include +#ifdef WITH_GPERFTOOLS +#include +#endif +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/analysis/ut_helper.h" +#include "paddle/fluid/inference/api/analysis_predictor.h" +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_pass.h" +#include "paddle/fluid/inference/tests/api/config_printer.h" +#include "paddle/fluid/inference/tests/test_helper.h" +#include "paddle/fluid/inference/utils/benchmark.h" +#include "paddle/fluid/platform/profiler.h" + +DEFINE_string(model_name, "", "model name"); +DEFINE_string(infer_model, "", "model path"); +DEFINE_string(infer_data, "", "data file"); +DEFINE_string(refer_result, "", "reference result for comparison"); +DEFINE_int32(batch_size, 1, "batch size"); +DEFINE_int32(warmup_batch_size, 100, "batch size for quantization warmup"); +// setting iterations to 0 means processing the whole dataset +DEFINE_int32(iterations, 0, "number of batches to process"); +DEFINE_int32(repeat, 1, "Running the inference program repeat times."); +DEFINE_bool(test_all_data, false, "Test the all dataset in data file."); +DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads."); +DEFINE_bool(use_analysis, true, + "Running the inference program in analysis mode."); +DEFINE_bool(record_benchmark, false, + "Record benchmark after profiling the model"); +DEFINE_double(accuracy, 1e-3, "Result Accuracy."); +DEFINE_double(quantized_accuracy, 1e-2, "Result Quantized Accuracy."); +DEFINE_bool(zero_copy, false, "Use ZeroCopy to speedup Feed/Fetch."); +DEFINE_bool(warmup, false, + "Use warmup to calculate elapsed_time more accurately. " + "To reduce CI time, it sets false in default."); + +DECLARE_bool(profile); +DECLARE_int32(paddle_num_threads); + +namespace paddle { +namespace inference { + +using paddle::framework::proto::VarType; + +template +constexpr paddle::PaddleDType GetPaddleDType(); + +template <> +constexpr paddle::PaddleDType GetPaddleDType() { + return paddle::PaddleDType::INT64; +} + +template <> +constexpr paddle::PaddleDType GetPaddleDType() { + return paddle::PaddleDType::FLOAT32; +} + +void PrintConfig(const PaddlePredictor::Config *config, bool use_analysis) { + const auto *analysis_config = + reinterpret_cast(config); + if (use_analysis) { + LOG(INFO) << *analysis_config; + return; + } + LOG(INFO) << analysis_config->ToNativeConfig(); +} + +// Compare result between two PaddleTensor +void CompareResult(const std::vector &outputs, + const std::vector &ref_outputs) { + EXPECT_GT(outputs.size(), 0UL); + EXPECT_EQ(outputs.size(), ref_outputs.size()); + for (size_t i = 0; i < outputs.size(); i++) { + auto &out = outputs[i]; + auto &ref_out = ref_outputs[i]; + size_t size = VecReduceToInt(out.shape); + size_t ref_size = VecReduceToInt(ref_out.shape); + EXPECT_GT(size, 0UL); + EXPECT_EQ(size, ref_size); + EXPECT_EQ(out.dtype, ref_out.dtype); + switch (out.dtype) { + case PaddleDType::INT64: { + int64_t *pdata = static_cast(out.data.data()); + int64_t *pdata_ref = static_cast(ref_out.data.data()); + for (size_t j = 0; j < size; ++j) { + EXPECT_EQ(pdata_ref[j], pdata[j]); + } + break; + } + case PaddleDType::FLOAT32: { + float *pdata = static_cast(out.data.data()); + float *pdata_ref = static_cast(ref_out.data.data()); + for (size_t j = 0; j < size; ++j) { + CHECK_LE(std::abs(pdata_ref[j] - pdata[j]), FLAGS_accuracy); + } + break; + } + case PaddleDType::INT32: { + int32_t *pdata = static_cast(out.data.data()); + int32_t *pdata_ref = static_cast(ref_out.data.data()); + for (size_t j = 0; j < size; ++j) { + EXPECT_EQ(pdata_ref[j], pdata[j]); + } + break; + } + } + } +} + +// Compare result between a PaddleTensor and a ZeroCopyTensor +void CompareResult(const std::vector &outputs, + const std::vector &ref_outputs) { + EXPECT_GT(outputs.size(), 0UL); + EXPECT_EQ(outputs.size(), ref_outputs.size()); + for (size_t i = 0; i < outputs.size(); i++) { + auto &out = outputs[i]; + auto &ref_out = ref_outputs[i]; + size_t size = VecReduceToInt(out.shape); + EXPECT_GT(size, 0UL); + int ref_size = 0; // this is the number of elements not memory size + PaddlePlace place; + switch (out.dtype) { + case PaddleDType::INT64: { + int64_t *pdata = static_cast(out.data.data()); + int64_t *pdata_ref = ref_out.data(&place, &ref_size); + EXPECT_EQ(size, static_cast(ref_size)); + for (size_t j = 0; j < size; ++j) { + EXPECT_EQ(pdata_ref[j], pdata[j]); + } + break; + } + case PaddleDType::FLOAT32: { + float *pdata = static_cast(out.data.data()); + float *pdata_ref = ref_out.data(&place, &ref_size); + EXPECT_EQ(size, ref_size); + for (size_t j = 0; j < size; ++j) { + CHECK_LE(std::abs(pdata_ref[j] - pdata[j]), FLAGS_accuracy); + } + break; + } + case PaddleDType::INT32: { + int32_t *pdata = static_cast(out.data.data()); + int32_t *pdata_ref = ref_out.data(&place, &ref_size); + EXPECT_EQ(size, ref_size); + for (size_t j = 0; j < size; ++j) { + EXPECT_EQ(pdata_ref[j], pdata[j]); + } + break; + } + } + } +} + +std::unique_ptr CreateTestPredictor( + const PaddlePredictor::Config *config, bool use_analysis = true) { + const auto *analysis_config = + reinterpret_cast(config); + if (use_analysis) { + return CreatePaddlePredictor(*analysis_config); + } + auto native_config = analysis_config->ToNativeConfig(); + return CreatePaddlePredictor(native_config); +} + +size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); } + +std::unordered_map GetFuseStatis(PaddlePredictor *predictor, + int *num_ops) { + std::unordered_map res; + auto *analysis_predictor = static_cast(predictor); + auto *fusion_status = + analysis_predictor->analysis_argument().fusion_statis_ptr(); + if (!fusion_status) { + return res; + } + for (auto &item : *fusion_status) { + LOG(INFO) << "fused " << item.first << " " << item.second; + } + int num = 0; + for (auto &node : + analysis_predictor->analysis_argument().main_graph().Nodes()) { + if (node->IsOp()) { + ++num; + } + } + *num_ops = num; + return *fusion_status; +} + +void SetFakeImageInput(std::vector> *inputs, + const std::string &dirname, bool is_combined = true, + std::string model_filename = "model", + std::string params_filename = "params", + const std::vector *feed_names = nullptr, + const int continuous_inuput_index = 0) { + // Set fake_image_data + PADDLE_ENFORCE_EQ(FLAGS_test_all_data, 0, "Only have single batch of data."); + std::vector> feed_target_shapes = GetFeedTargetShapes( + dirname, is_combined, model_filename, params_filename); + std::ostringstream os; + for (size_t i = 0; i < feed_target_shapes.size(); ++i) { + os << "feed target " << i << ": {" << feed_target_shapes[i][0]; + for (size_t j = 1; j < feed_target_shapes[i].size(); ++j) { + os << ", " << feed_target_shapes[i][j]; + } + os << "}\n"; + } + LOG(INFO) << os.str(); + if (feed_names) { + PADDLE_ENFORCE_EQ(feed_names->size(), feed_target_shapes.size()); + } + std::vector input_slots(feed_target_shapes.size()); + for (size_t i = 0; i < feed_target_shapes.size(); ++i) { + const auto &feed_shape = feed_target_shapes[i]; + auto &input = input_slots[i]; + std::vector shape({FLAGS_batch_size}); + for (size_t s = 1; s < feed_shape.size(); ++s) { + shape.push_back(static_cast(feed_shape[s])); + } + if (feed_names) { + input.name = (*feed_names)[i]; + } + input.shape = shape; + input.dtype = PaddleDType::FLOAT32; + size_t len = std::accumulate(shape.begin(), shape.end(), size_t{1}, + [](int a, int b) { return a * b; }); + input.data.Resize(len * sizeof(float)); + input.lod.assign({{0, static_cast(FLAGS_batch_size)}}); + float *input_data = static_cast(input.data.data()); + // fill input data, for profile easily, do not use random data here. + for (size_t j = 0; j < len; ++j) { + *(input_data + j) = + static_cast((j + continuous_inuput_index) % len) / len; + } + } + (*inputs).emplace_back(input_slots); +} + +void GetInputPerBatch(const std::vector> &in, + std::vector> *out, + std::vector *lod, size_t batch_iter, + size_t batch_end) { + lod->clear(); + lod->push_back(0); + for (auto it = in.begin() + batch_iter; it < in.begin() + batch_end; it++) { + out->push_back(*it); + lod->push_back(lod->back() + (*it).size()); // calculate lod + } +} + +void ConvertPaddleTensorToZeroCopyTensor( + PaddlePredictor *predictor, const std::vector &inputs) { + for (size_t i = 0; i < inputs.size(); i++) { + auto input = inputs[i]; + auto tensor = predictor->GetInputTensor(input.name); + tensor->Reshape(input.shape); + tensor->SetLoD({input.lod}); + if (input.dtype == PaddleDType::INT64) { + ZeroCopyTensorAssignData(tensor.get(), input.data); + } else if (input.dtype == PaddleDType::FLOAT32) { + ZeroCopyTensorAssignData(tensor.get(), input.data); + } else if (input.dtype == PaddleDType::INT32) { + ZeroCopyTensorAssignData(tensor.get(), input.data); + } else { + LOG(ERROR) << "unsupported feed type " << input.dtype; + } + } +} + +void PredictionWarmUp(PaddlePredictor *predictor, + const std::vector> &inputs, + std::vector> *outputs, + int num_threads, int tid, + const VarType::Type data_type = VarType::FP32) { + int batch_size = FLAGS_batch_size; + LOG(INFO) << "Running thread " << tid << ", warm up run..."; + if (FLAGS_zero_copy) { + ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[0]); + } + outputs->resize(1); + Timer warmup_timer; + warmup_timer.tic(); + if (!FLAGS_zero_copy) { + predictor->Run(inputs[0], &(*outputs)[0], batch_size); + } else { + predictor->ZeroCopyRun(); + } + PrintTime(batch_size, 1, num_threads, tid, warmup_timer.toc(), 1, data_type); + if (FLAGS_profile) { + paddle::platform::ResetProfiler(); + } +} + +void PredictionRun(PaddlePredictor *predictor, + const std::vector> &inputs, + std::vector> *outputs, + int num_threads, int tid, + const VarType::Type data_type = VarType::FP32, + float *sample_latency = nullptr) { + int num_times = FLAGS_repeat; + int iterations = inputs.size(); // process the whole dataset ... + if (FLAGS_iterations > 0 && + FLAGS_iterations < static_cast(inputs.size())) + iterations = + FLAGS_iterations; // ... unless the number of iterations is set + outputs->resize(iterations); + LOG(INFO) << "Thread " << tid << ", number of threads " << num_threads + << ", run " << num_times << " times..."; + Timer run_timer; + double elapsed_time = 0; +#ifdef WITH_GPERFTOOLS + ProfilerStart("paddle_inference.prof"); +#endif + int predicted_num = 0; + if (!FLAGS_zero_copy) { + for (int i = 0; i < iterations; i++) { + run_timer.tic(); + for (int j = 0; j < num_times; j++) { + predictor->Run(inputs[i], &(*outputs)[i], FLAGS_batch_size); + } + elapsed_time += run_timer.toc(); + + predicted_num += FLAGS_batch_size; + if (predicted_num % 100 == 0) { + LOG(INFO) << predicted_num << " samples"; + } + } + } else { + for (int i = 0; i < iterations; i++) { + ConvertPaddleTensorToZeroCopyTensor(predictor, inputs[i]); + run_timer.tic(); + for (int j = 0; j < num_times; j++) { + predictor->ZeroCopyRun(); + } + elapsed_time += run_timer.toc(); + + predicted_num += FLAGS_batch_size; + if (predicted_num % 100 == 0) { + LOG(INFO) << predicted_num << " samples"; + } + } + } + +#ifdef WITH_GPERFTOOLS + ProfilerStop(); +#endif + + auto batch_latency = elapsed_time / (iterations * num_times); + PrintTime(FLAGS_batch_size, num_times, num_threads, tid, batch_latency, + iterations, data_type); + + if (sample_latency != nullptr) + *sample_latency = batch_latency / FLAGS_batch_size; + + if (FLAGS_record_benchmark) { + Benchmark benchmark; + benchmark.SetName(FLAGS_model_name); + benchmark.SetBatchSize(FLAGS_batch_size); + benchmark.SetLatency(batch_latency); + benchmark.PersistToFile("benchmark_record.txt"); + } +} + +void TestOneThreadPrediction( + const PaddlePredictor::Config *config, + const std::vector> &inputs, + std::vector> *outputs, bool use_analysis = true, + const VarType::Type data_type = VarType::FP32, + float *sample_latency = nullptr) { + auto predictor = CreateTestPredictor(config, use_analysis); + if (FLAGS_warmup) { + PredictionWarmUp(predictor.get(), inputs, outputs, 1, 0, data_type); + } + PredictionRun(predictor.get(), inputs, outputs, 1, 0, data_type, + sample_latency); +} + +void TestMultiThreadPrediction( + const PaddlePredictor::Config *config, + const std::vector> &inputs, + std::vector> *outputs, int num_threads, + bool use_analysis = true) { + std::vector threads; + std::vector> predictors; + predictors.emplace_back(CreateTestPredictor(config, use_analysis)); + for (int tid = 1; tid < num_threads; tid++) { + predictors.emplace_back(predictors.front()->Clone()); + } + + for (int tid = 0; tid < num_threads; ++tid) { + threads.emplace_back([&, tid]() { + // Each thread should have local inputs and outputs. + // The inputs of each thread are all the same. + std::vector> outputs_tid; + auto &predictor = predictors[tid]; + if (FLAGS_warmup) { + PredictionWarmUp(predictor.get(), inputs, &outputs_tid, num_threads, + tid); + } + PredictionRun(predictor.get(), inputs, &outputs_tid, num_threads, tid); + }); + } + for (int i = 0; i < num_threads; ++i) { + threads[i].join(); + } +} + +void TestPrediction(const PaddlePredictor::Config *config, + const std::vector> &inputs, + std::vector> *outputs, + int num_threads, bool use_analysis = FLAGS_use_analysis) { + PrintConfig(config, use_analysis); + if (num_threads == 1) { + TestOneThreadPrediction(config, inputs, outputs, use_analysis); + } else { + TestMultiThreadPrediction(config, inputs, outputs, num_threads, + use_analysis); + } +} + +void SummarizeAccuracy(float avg_acc1_fp32, float avg_acc1_int8) { + LOG(INFO) << "--- Accuracy summary --- "; + LOG(INFO) << "Accepted top1 accuracy drop threshold: " + << FLAGS_quantized_accuracy + << ". (condition: (FP32_top1_acc - INT8_top1_acc) <= threshold)"; + LOG(INFO) << "FP32: avg top1 accuracy: " << std::fixed << std::setw(6) + << std::setprecision(4) << avg_acc1_fp32; + LOG(INFO) << "INT8: avg top1 accuracy: " << std::fixed << std::setw(6) + << std::setprecision(4) << avg_acc1_int8; +} + +void SummarizePerformance(float sample_latency_fp32, + float sample_latency_int8) { + // sample latency in ms + auto throughput_fp32 = 1000.0 / sample_latency_fp32; + auto throughput_int8 = 1000.0 / sample_latency_int8; + LOG(INFO) << "--- Performance summary --- "; + LOG(INFO) << "FP32: avg fps: " << std::fixed << std::setw(6) + << std::setprecision(4) << throughput_fp32 + << ", avg latency: " << sample_latency_fp32 << " ms"; + LOG(INFO) << "INT8: avg fps: " << std::fixed << std::setw(6) + << std::setprecision(4) << throughput_int8 + << ", avg latency: " << sample_latency_int8 << " ms"; +} + +void CompareTopAccuracy( + const std::vector> &output_slots_quant, + const std::vector> &output_slots_ref) { + if (output_slots_quant.size() == 0 || output_slots_ref.size() == 0) + throw std::invalid_argument( + "CompareTopAccuracy: output_slots vector is empty."); + + float total_accs1_quant{0}; + float total_accs1_ref{0}; + for (size_t i = 0; i < output_slots_quant.size(); ++i) { + PADDLE_ENFORCE(output_slots_quant[i].size() >= 2UL); + PADDLE_ENFORCE(output_slots_ref[i].size() >= 2UL); + // second output: acc_top1 + if (output_slots_quant[i][1].lod.size() > 0 || + output_slots_ref[i][1].lod.size() > 0) + throw std::invalid_argument( + "CompareTopAccuracy: top1 accuracy output has nonempty LoD."); + if (output_slots_quant[i][1].dtype != paddle::PaddleDType::FLOAT32 || + output_slots_ref[i][1].dtype != paddle::PaddleDType::FLOAT32) + throw std::invalid_argument( + "CompareTopAccuracy: top1 accuracy output is of a wrong type."); + total_accs1_quant += + *static_cast(output_slots_quant[i][1].data.data()); + total_accs1_ref += + *static_cast(output_slots_ref[i][1].data.data()); + } + float avg_acc1_quant = total_accs1_quant / output_slots_quant.size(); + float avg_acc1_ref = total_accs1_ref / output_slots_ref.size(); + + SummarizeAccuracy(avg_acc1_ref, avg_acc1_quant); + CHECK_GT(avg_acc1_ref, 0.0); + CHECK_GT(avg_acc1_quant, 0.0); + CHECK_LE(avg_acc1_ref - avg_acc1_quant, FLAGS_quantized_accuracy); +} + +void CompareDeterministic( + const PaddlePredictor::Config *config, + const std::vector> &inputs) { + int batch_size = FLAGS_batch_size; + int num_times = FLAGS_repeat; + auto predictor = CreateTestPredictor(config, FLAGS_use_analysis); + + std::vector warmup_outputs, outputs; + // run num_times to Compare Deterministic Result. + for (size_t j = 0; j < inputs.size(); j++) { + // warmup run + predictor->Run(inputs[j], &warmup_outputs, batch_size); + for (int i = 0; i < num_times; i++) { + predictor->Run(inputs[j], &outputs, batch_size); + CompareResult(outputs, warmup_outputs); + } + } +} + +void CompareNativeAndAnalysis( + const PaddlePredictor::Config *config, + const std::vector> &inputs) { + PrintConfig(config, true); + std::vector> native_outputs, analysis_outputs; + TestOneThreadPrediction(config, inputs, &native_outputs, false); + TestOneThreadPrediction(config, inputs, &analysis_outputs, true); + PADDLE_ENFORCE(native_outputs.size() > 0, "Native output is empty."); + PADDLE_ENFORCE(analysis_outputs.size() > 0, "Analysis output is empty."); + CompareResult(analysis_outputs.back(), native_outputs.back()); +} + +void CompareQuantizedAndAnalysis( + const AnalysisConfig *config, const AnalysisConfig *qconfig, + const std::vector> &inputs) { + PADDLE_ENFORCE_EQ(inputs[0][0].shape[0], FLAGS_batch_size, + "Input data has to be packed batch by batch."); + LOG(INFO) << "FP32 & INT8 prediction run: batch_size " << FLAGS_batch_size + << ", warmup batch size " << FLAGS_warmup_batch_size << "."; + + LOG(INFO) << "--- FP32 prediction start ---"; + auto *cfg = reinterpret_cast(config); + PrintConfig(cfg, true); + std::vector> analysis_outputs; + float sample_latency_fp32{-1}; + TestOneThreadPrediction(cfg, inputs, &analysis_outputs, true, VarType::FP32, + &sample_latency_fp32); + + LOG(INFO) << "--- INT8 prediction start ---"; + auto *qcfg = reinterpret_cast(qconfig); + PrintConfig(qcfg, true); + std::vector> quantized_outputs; + float sample_latency_int8{-1}; + TestOneThreadPrediction(qcfg, inputs, &quantized_outputs, true, VarType::INT8, + &sample_latency_int8); + + SummarizePerformance(sample_latency_fp32, sample_latency_int8); + CompareTopAccuracy(quantized_outputs, analysis_outputs); +} + +void CompareNativeAndAnalysis( + PaddlePredictor *native_pred, PaddlePredictor *analysis_pred, + const std::vector> &inputs) { + int batch_size = FLAGS_batch_size; + std::vector native_outputs, analysis_outputs; + native_pred->Run(inputs[0], &native_outputs, batch_size); + analysis_pred->Run(inputs[0], &analysis_outputs, batch_size); + CompareResult(analysis_outputs, native_outputs); +} + +void CompareAnalysisAndZeroCopy( + PaddlePredictor::Config *config, PaddlePredictor::Config *config1, + const std::vector> &inputs, + const std::vector &outputs_name) { + int batch_size = FLAGS_batch_size; + // analysis + std::vector analysis_outputs; + auto predictor = CreateTestPredictor(config, true); + predictor->Run(inputs[0], &analysis_outputs, batch_size); + // analysis + zero_copy + std::vector zerocopy_outputs; + reinterpret_cast(config1)->SwitchUseFeedFetchOps(false); + predictor = CreateTestPredictor(config1, true); + ConvertPaddleTensorToZeroCopyTensor(predictor.get(), inputs[0]); + predictor->ZeroCopyRun(); + for (size_t i = 0; i < outputs_name.size(); i++) { + ZeroCopyTensor zerocopy_output = + *predictor->GetOutputTensor(outputs_name[i]).get(); + zerocopy_outputs.emplace_back(zerocopy_output); + LOG(INFO) << "ZeroCopy output: " << DescribeZeroCopyTensor(zerocopy_output); + } + // compare + CompareResult(analysis_outputs, zerocopy_outputs); +} + +void SaveOptimModel(AnalysisConfig *cfg, const std::string &dstPath) { + auto predictor = CreateTestPredictor( + reinterpret_cast(cfg), + FLAGS_use_analysis); + (static_cast(predictor.get()))->SaveOptimModel(dstPath); +} + +template +std::string LoDTensorSummary(const framework::LoDTensor &tensor) { + std::stringstream ss; + ss << "\n---- tensor ---" << '\n'; + ss << "lod: ["; + for (const auto &level : tensor.lod()) { + ss << "[ "; + for (auto i : level) { + ss << i << ", "; + } + ss << "]"; + } + ss << "]\n"; + + ss << "shape: ["; + int size = 1; + for (int i = 0; i < tensor.dims().size(); i++) { + int dim = tensor.dims()[i]; + ss << dim << ", "; + size *= dim; + } + ss << "]\n"; + + ss << "data: "; + for (int i = 0; i < std::min(20, size); i++) { + ss << tensor.data()[i] << " "; + } + ss << "\n"; + + return ss.str(); +} + +static bool CompareLoD(const framework::LoD &a, const framework::LoD &b) { + if (a.size() != b.size()) { + LOG(ERROR) << string::Sprintf("lod size not match %d != %d", a.size(), + b.size()); + return false; + } + for (size_t i = 0; i < a.size(); i++) { + auto &al = a[i]; + auto &bl = b[i]; + if (al.size() != bl.size()) { + LOG(ERROR) << string::Sprintf("level size %d != %d", al.size(), + bl.size()); + return false; + } + } + return true; +} + +static bool CompareShape(const std::vector &a, + const std::vector &b) { + if (a.size() != b.size()) { + LOG(ERROR) << string::Sprintf("shape size not match %d != %d", a.size(), + b.size()); + return false; + } + for (size_t i = 0; i < a.size(); i++) { + if (a[i] != b[i]) { + LOG(ERROR) << string::Sprintf("shape %d-th element not match %d != %d", i, + a[i], b[i]); + return false; + } + } + return true; +} + +static bool CompareTensorData(const framework::LoDTensor &a, + const framework::LoDTensor &b) { + auto a_shape = framework::vectorize(a.dims()); + auto b_shape = framework::vectorize(b.dims()); + size_t a_size = std::accumulate(a_shape.begin(), a_shape.end(), size_t{1}, + [](int a, int b) { return a * b; }); + size_t b_size = std::accumulate(b_shape.begin(), b_shape.end(), size_t{1}, + [](int a, int b) { return a * b; }); + if (a_size != b_size) { + LOG(ERROR) << string::Sprintf("tensor data size not match, %d != %d", + a_size, b_size); + } + + for (size_t i = 0; i < a_size; i++) { + if (a.type() == VarType::FP32) { + const auto *a_data = a.data(); + const auto *b_data = b.data(); + if (std::abs(a_data[i] - b_data[i]) > 1e-3) { + LOG(ERROR) << string::Sprintf( + "tensor data %d-th element not match, %f != %f", i, a_data[i], + b_data[i]); + return false; + } + } else if (a.type() == VarType::INT64) { + const auto *a_data = a.data(); + const auto *b_data = b.data(); + if (std::abs(a_data[i] - b_data[i]) > 1e-3) { + LOG(ERROR) << string::Sprintf( + "tensor data %d-th element not match, %f != %f", i, a_data[i], + b_data[i]); + return false; + } + } + } + + return true; +} + +static bool CompareTensor(const framework::LoDTensor &a, + const framework::LoDTensor &b) { + if (!CompareLoD(a.lod(), b.lod())) { + return false; + } + if (!CompareShape(framework::vectorize(a.dims()), + framework::vectorize(b.dims()))) { + return false; + } + + if (!CompareTensorData(a, b)) { + return false; + } + + return true; +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc b/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc new file mode 100644 index 00000000..14539a9d --- /dev/null +++ b/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc @@ -0,0 +1,49 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "paddle/fluid/inference/tests/api/trt_test_helper.h" + +namespace paddle { +namespace inference { + +TEST(TensorRT_mobilenet, compare) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + compare(model_dir, /* use_tensorrt */ true); + // Open it when need. + // profile(model_dir, /* use_analysis */ true, FLAGS_use_tensorrt); +} + +TEST(AnalysisPredictor, use_gpu) { + std::string model_dir = FLAGS_infer_model + "/" + "mobilenet"; + AnalysisConfig config; + config.EnableUseGpu(100, 0); + config.SetModel(model_dir); + config.pass_builder()->TurnOnDebug(); + + std::vector> inputs_all; + auto predictor = CreatePaddlePredictor(config); + SetFakeImageInput(&inputs_all, model_dir, false, "__model__", ""); + + std::vector outputs; + for (auto& input : inputs_all) { + ASSERT_TRUE(predictor->Run(input, &outputs)); + } +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/trt_resnet50_test.cc b/paddle/fluid/inference/tests/api/trt_resnet50_test.cc new file mode 100644 index 00000000..7dfcbb0d --- /dev/null +++ b/paddle/fluid/inference/tests/api/trt_resnet50_test.cc @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "paddle/fluid/inference/tests/api/trt_test_helper.h" + +namespace paddle { +namespace inference { + +TEST(resnet50, compare_continuous_input) { + std::string model_dir = FLAGS_infer_model + "/resnet50"; + compare_continuous_input(model_dir, true); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/trt_resnext_test.cc b/paddle/fluid/inference/tests/api/trt_resnext_test.cc new file mode 100644 index 00000000..588b5bff --- /dev/null +++ b/paddle/fluid/inference/tests/api/trt_resnext_test.cc @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "paddle/fluid/inference/tests/api/trt_test_helper.h" + +namespace paddle { +namespace inference { + +TEST(TensorRT_resnext50, compare) { + std::string model_dir = FLAGS_infer_model + "/resnext50"; + compare(model_dir, /* use_tensorrt */ true); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/trt_test_helper.h b/paddle/fluid/inference/tests/api/trt_test_helper.h new file mode 100644 index 00000000..0233cad0 --- /dev/null +++ b/paddle/fluid/inference/tests/api/trt_test_helper.h @@ -0,0 +1,137 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once +#include +#include + +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +DEFINE_bool(use_tensorrt, true, "Test the performance of TensorRT engine."); +DEFINE_string(prog_filename, "", "Name of model file."); +DEFINE_string(param_filename, "", "Name of parameters file."); + +template +void SetConfig(ConfigType* config, std::string model_dir, bool use_gpu, + bool use_tensorrt = false, int batch_size = -1) { + if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) { + config->prog_file = model_dir + "/" + FLAGS_prog_filename; + config->param_file = model_dir + "/" + FLAGS_param_filename; + } else { + config->model_dir = model_dir; + } + if (use_gpu) { + config->use_gpu = true; + config->device = 0; + config->fraction_of_gpu_memory = 0.15; + } +} + +template <> +void SetConfig(AnalysisConfig* config, std::string model_dir, + bool use_gpu, bool use_tensorrt, + int batch_size) { + if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) { + config->SetModel(model_dir + "/" + FLAGS_prog_filename, + model_dir + "/" + FLAGS_param_filename); + } else { + config->SetModel(model_dir); + } + if (use_gpu) { + config->EnableUseGpu(100, 0); + if (use_tensorrt) { + config->EnableTensorRtEngine(1 << 10, batch_size, 3, + AnalysisConfig::Precision::kFloat32, false); + config->pass_builder()->DeletePass("conv_bn_fuse_pass"); + config->pass_builder()->DeletePass("fc_fuse_pass"); + config->pass_builder()->TurnOnDebug(); + } else { + config->SwitchIrOptim(); + } + } +} + +void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) { + std::vector> inputs_all; + if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) { + SetFakeImageInput(&inputs_all, model_dir, true, FLAGS_prog_filename, + FLAGS_param_filename); + } else { + SetFakeImageInput(&inputs_all, model_dir, false, "__model__", ""); + } + + std::vector> outputs; + if (use_analysis || use_tensorrt) { + AnalysisConfig config; + config.EnableUseGpu(100, 0); + config.pass_builder()->TurnOnDebug(); + SetConfig(&config, model_dir, true, use_tensorrt, + FLAGS_batch_size); + TestPrediction(reinterpret_cast(&config), + inputs_all, &outputs, FLAGS_num_threads, true); + } else { + NativeConfig config; + SetConfig(&config, model_dir, true, false); + TestPrediction(reinterpret_cast(&config), + inputs_all, &outputs, FLAGS_num_threads, false); + } +} + +void compare(std::string model_dir, bool use_tensorrt) { + std::vector> inputs_all; + if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) { + SetFakeImageInput(&inputs_all, model_dir, true, FLAGS_prog_filename, + FLAGS_param_filename); + } else { + SetFakeImageInput(&inputs_all, model_dir, false, "__model__", ""); + } + + AnalysisConfig analysis_config; + SetConfig(&analysis_config, model_dir, true, use_tensorrt, + FLAGS_batch_size); + CompareNativeAndAnalysis( + reinterpret_cast(&analysis_config), + inputs_all); +} + +void compare_continuous_input(std::string model_dir, bool use_tensorrt) { + AnalysisConfig analysis_config; + SetConfig(&analysis_config, model_dir, true, use_tensorrt, + FLAGS_batch_size); + auto config = + reinterpret_cast(&analysis_config); + auto native_pred = CreateTestPredictor(config, false); + auto analysis_pred = CreateTestPredictor(config, true); + for (int i = 0; i < 20; i++) { + std::vector> inputs_all; + if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) { + SetFakeImageInput(&inputs_all, model_dir, true, FLAGS_prog_filename, + FLAGS_param_filename, nullptr, i); + } else { + SetFakeImageInput(&inputs_all, model_dir, false, "__model__", "", nullptr, + i); + } + CompareNativeAndAnalysis(native_pred.get(), analysis_pred.get(), + inputs_all); + } +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt new file mode 100644 index 00000000..87472794 --- /dev/null +++ b/paddle/fluid/inference/tests/book/CMakeLists.txt @@ -0,0 +1,50 @@ +function(inference_test TARGET_NAME) + set(options "") + set(oneValueArgs "") + set(multiValueArgs ARGS) + cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + set(arg_list "") + if(inference_test_ARGS) + foreach(arg ${inference_test_ARGS}) + list(APPEND arg_list "_${arg}") + endforeach() + else() + list(APPEND arg_list "_") + endif() + foreach(arg ${arg_list}) + string(REGEX REPLACE "^_$" "" arg "${arg}") + cc_test(test_inference_${TARGET_NAME}${arg} + SRCS test_inference_${TARGET_NAME}.cc + DEPS paddle_fluid_origin + ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.inference.model) + set_tests_properties(test_inference_${TARGET_NAME}${arg} + PROPERTIES DEPENDS test_${TARGET_NAME}) + set_tests_properties(test_inference_${TARGET_NAME}${arg} + PROPERTIES LABELS "RUN_TYPE=DIST") + endforeach() +endfunction(inference_test) + +#################### +# Inference tests here depend on fluid/tests/book. If users want to run +# individual test with ctest, they need to run tests in fluid/tests/book +# first to generate saved model. +#################### +# This unittest is buggy! +#inference_test(fit_a_line) +inference_test(image_classification ARGS vgg resnet) +inference_test(label_semantic_roles) +inference_test(recognize_digits ARGS mlp conv) +inference_test(recommender_system) +#inference_test(rnn_encoder_decoder) +#inference_test(understand_sentiment ARGS conv) +inference_test(word2vec) + +# This is an unly work around to make this test run +# TODO(TJ): clean me up +cc_test(test_inference_nlp + SRCS test_inference_nlp.cc + DEPS paddle_fluid_origin + ARGS + --model_path=${PADDLE_BINARY_DIR}/python/paddle/fluid/tests/book/recognize_digits_mlp.inference.model) +set_tests_properties(test_inference_nlp PROPERTIES LABELS "RUN_TYPE=DIST") diff --git a/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc b/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc new file mode 100644 index 00000000..2c5b66a3 --- /dev/null +++ b/paddle/fluid/inference/tests/book/test_inference_fit_a_line.cc @@ -0,0 +1,89 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "gflags/gflags.h" +#include "gtest/gtest.h" +#include "paddle/fluid/inference/tests/test_helper.h" +#include "paddle/fluid/inference/tests/test_multi_thread_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, fit_a_line) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + for (int num_threads : {1, 2}) { + std::vector> cpu_feeds; + cpu_feeds.resize(num_threads); + for (int i = 0; i < num_threads; ++i) { + auto* input = new paddle::framework::LoDTensor(); + // The second dim of the input tensor should be 13 + // The input data should be >= 0 + int64_t batch_size = 10; + SetupTensor(input, {batch_size, 13}, static_cast(0), + static_cast(10)); + cpu_feeds[i].push_back(input); + } + + std::vector> cpu_fetchs1; + cpu_fetchs1.resize(num_threads); + for (int i = 0; i < num_threads; ++i) { + auto* output = new paddle::framework::LoDTensor(); + cpu_fetchs1[i].push_back(output); + } + + // Run inference on CPU + LOG(INFO) << "--- CPU Runs (num_threads: " << num_threads << "): ---"; + if (num_threads == 1) { + TestInference(dirname, cpu_feeds[0], + cpu_fetchs1[0]); + } else { + TestMultiThreadInference( + dirname, cpu_feeds, cpu_fetchs1, num_threads); + } + +#ifdef PADDLE_WITH_CUDA + std::vector> cpu_fetchs2; + cpu_fetchs2.resize(num_threads); + for (int i = 0; i < num_threads; ++i) { + auto* output = new paddle::framework::LoDTensor(); + cpu_fetchs2[i].push_back(output); + } + + // Run inference on CUDA GPU + LOG(INFO) << "--- GPU Runs (num_threads: " << num_threads << "): ---"; + if (num_threads == 1) { + TestInference(dirname, cpu_feeds[0], + cpu_fetchs2[0]); + } else { + TestMultiThreadInference( + dirname, cpu_feeds, cpu_fetchs2, num_threads); + } + + for (int i = 0; i < num_threads; ++i) { + CheckError(*cpu_fetchs1[i][0], *cpu_fetchs2[i][0]); + delete cpu_fetchs2[i][0]; + } +#endif + + for (int i = 0; i < num_threads; ++i) { + delete cpu_feeds[i][0]; + delete cpu_fetchs1[i][0]; + } + } // num_threads-loop +} diff --git a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc new file mode 100644 index 00000000..60c761c5 --- /dev/null +++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc @@ -0,0 +1,98 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "gflags/gflags.h" +#include "gtest/gtest.h" +#include "paddle/fluid/inference/tests/test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); +DEFINE_string(fp16_dirname, "", "Directory of the float16 inference model."); +DEFINE_int32(batch_size, 1, "Batch size of input data"); +DEFINE_int32(repeat, 1, "Running the inference program repeat times"); +DEFINE_bool(skip_cpu, false, "Skip the cpu test"); + +TEST(inference, image_classification) { + if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model " + "--batch_size=1 --repeat=1"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + const bool is_combined = false; + std::vector> feed_target_shapes = + GetFeedTargetShapes(dirname, is_combined); + + paddle::framework::LoDTensor input; + // Use normilized image pixels as input data, + // which should be in the range [0.0, 1.0]. + feed_target_shapes[0][0] = FLAGS_batch_size; + paddle::framework::DDim input_dims = + paddle::framework::make_ddim(feed_target_shapes[0]); + LOG(INFO) << input_dims; + SetupTensor(&input, input_dims, static_cast(0), + static_cast(1)); + std::vector cpu_feeds; + cpu_feeds.push_back(&input); + + paddle::framework::LoDTensor output1; + if (!FLAGS_skip_cpu) { + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + LOG(INFO) << "--- CPU Runs: ---"; + LOG(INFO) << "Batch size is " << FLAGS_batch_size; + TestInference( + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined); + LOG(INFO) << output1.dims(); + } + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + LOG(INFO) << "--- GPU Runs: ---"; + LOG(INFO) << "Batch size is " << FLAGS_batch_size; + TestInference( + dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat, is_combined); + LOG(INFO) << output2.dims(); + + if (!FLAGS_skip_cpu) { + CheckError(output1, output2); + } + + // float16 inference requires cuda GPUs with >= 5.3 compute capability + if (!FLAGS_fp16_dirname.empty() && + paddle::platform::GetCUDAComputeCapability(0) >= 53) { + paddle::framework::LoDTensor output3; + std::vector cpu_fetchs3; + cpu_fetchs3.push_back(&output3); + + LOG(INFO) << "--- GPU Runs in float16 mode: ---"; + LOG(INFO) << "Batch size is " << FLAGS_batch_size; + + TestInference( + FLAGS_fp16_dirname, cpu_feeds, cpu_fetchs3, FLAGS_repeat); + + CheckError(output2, output3); + } +#endif +} diff --git a/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc b/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc new file mode 100644 index 00000000..84bb855f --- /dev/null +++ b/paddle/fluid/inference/tests/book/test_inference_label_semantic_roles.cc @@ -0,0 +1,87 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "gflags/gflags.h" +#include "gtest/gtest.h" +#include "paddle/fluid/inference/tests/test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, label_semantic_roles) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + paddle::framework::LoDTensor word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, + ctx_p2, mark; + paddle::framework::LoD lod{{0, 4, 10}}; + int64_t word_dict_len = 44068; + int64_t predicate_dict_len = 3162; + int64_t mark_dict_len = 2; + + SetupLoDTensor(&word, lod, static_cast(0), + static_cast(word_dict_len - 1)); + SetupLoDTensor(&predicate, lod, static_cast(0), + static_cast(predicate_dict_len - 1)); + SetupLoDTensor(&ctx_n2, lod, static_cast(0), + static_cast(word_dict_len - 1)); + SetupLoDTensor(&ctx_n1, lod, static_cast(0), + static_cast(word_dict_len - 1)); + SetupLoDTensor(&ctx_0, lod, static_cast(0), + static_cast(word_dict_len - 1)); + SetupLoDTensor(&ctx_p1, lod, static_cast(0), + static_cast(word_dict_len - 1)); + SetupLoDTensor(&ctx_p2, lod, static_cast(0), + static_cast(word_dict_len - 1)); + SetupLoDTensor(&mark, lod, static_cast(0), + static_cast(mark_dict_len - 1)); + + std::vector cpu_feeds; + cpu_feeds.push_back(&word); + cpu_feeds.push_back(&predicate); + cpu_feeds.push_back(&ctx_n2); + cpu_feeds.push_back(&ctx_n1); + cpu_feeds.push_back(&ctx_0); + cpu_feeds.push_back(&ctx_p1); + cpu_feeds.push_back(&ctx_p2); + cpu_feeds.push_back(&mark); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.lod(); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.lod(); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc new file mode 100644 index 00000000..5c1204b9 --- /dev/null +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -0,0 +1,243 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include // NOLINT +#include "gflags/gflags.h" +#include "gtest/gtest.h" +#include "paddle/fluid/inference/tests/test_helper.h" +#include "paddle/fluid/platform/cpu_helper.h" + +#include "paddle/fluid/framework/feed_fetch_method.h" + +DEFINE_string(model_path, "", "Directory of the inference model."); +DEFINE_string(data_file, "", "File of input index data."); +DEFINE_int32(repeat, 100, "Running the inference program repeat times"); +DEFINE_bool(prepare_vars, true, "Prepare variables before executor"); +DEFINE_int32(num_threads, 1, "Number of threads should be used"); +DECLARE_bool(use_mkldnn); +DECLARE_int32(paddle_num_threads); + +inline double GetCurrentMs() { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+3 * time.tv_sec + 1e-3 * time.tv_usec; +} + +// This function just give dummy data for recognize_digits model. +size_t DummyData(std::vector* out) { + paddle::framework::LoDTensor input; + SetupTensor(&input, {1, 1, 28, 28}, -1.f, 1.f); + out->emplace_back(input); + return 1; +} + +// Load the input word index data from file and save into LodTensor. +// Return the size of words. +size_t LoadData(std::vector* out, + const std::string& filename) { + if (filename.empty()) { + return DummyData(out); + } + + size_t sz = 0; + std::fstream fin(filename); + std::string line; + out->clear(); + while (getline(fin, line)) { + std::istringstream iss(line); + std::vector ids; + std::string field; + while (getline(iss, field, ' ')) { + ids.push_back(stoi(field)); + } + if (ids.size() >= 1024) { + // Synced with NLP guys, they will ignore input larger then 1024 + continue; + } + + paddle::framework::LoDTensor words; + paddle::framework::LoD lod{{0, ids.size()}}; + words.set_lod(lod); + int64_t* pdata = words.mutable_data( + {static_cast(ids.size()), 1}, paddle::platform::CPUPlace()); + memcpy(pdata, ids.data(), words.numel() * sizeof(int64_t)); + out->emplace_back(words); + sz += ids.size(); + } + return sz; +} + +// Split input data samples into small pieces jobs as balanced as possible, +// according to the number of threads. +void SplitData( + const std::vector& datasets, + std::vector>* jobs, + const int num_threads) { + size_t s = 0; + jobs->resize(num_threads); + while (s < datasets.size()) { + for (auto it = jobs->begin(); it != jobs->end(); it++) { + it->emplace_back(&datasets[s]); + s++; + if (s >= datasets.size()) { + break; + } + } + } +} + +void ThreadRunInfer( + const int tid, paddle::framework::Scope* scope, + const std::vector>& jobs) { + // maybe framework:ProgramDesc is not thread-safe + paddle::platform::CPUPlace place; + paddle::framework::Executor executor(place); + auto& sub_scope = scope->NewScope(); + auto inference_program = + paddle::inference::Load(&executor, scope, FLAGS_model_path); + + auto ctx = executor.Prepare(*inference_program, /*block_id*/ 0); + executor.CreateVariables(*inference_program, &sub_scope, /*block_id*/ 0); + + const std::vector& feed_target_names = + inference_program->GetFeedTargetNames(); + const std::vector& fetch_target_names = + inference_program->GetFetchTargetNames(); + + PADDLE_ENFORCE_EQ(fetch_target_names.size(), 1UL); + std::map fetch_targets; + paddle::framework::LoDTensor outtensor; + fetch_targets[fetch_target_names[0]] = &outtensor; + + std::map feed_targets; + PADDLE_ENFORCE_EQ(feed_target_names.size(), 1UL); + + // map the data of feed_targets to feed_holder + for (auto* op : inference_program->Block(0).AllOps()) { + if (op->Type() == "feed") { + std::string feed_target_name = op->Output("Out")[0]; + int idx = boost::get(op->GetAttr("col")); + paddle::framework::SetFeedVariable(scope, *feed_targets[feed_target_name], + "feed", idx); + } + } + + auto& inputs = jobs[tid]; + auto start_ms = GetCurrentMs(); + for (size_t i = 0; i < inputs.size(); ++i) { + feed_targets[feed_target_names[0]] = inputs[i]; + executor.RunPreparedContext(ctx.get(), &sub_scope, + false /*create_local_scope*/); + } + auto stop_ms = GetCurrentMs(); + + // obtain the data of fetch_targets from fetch_holder + for (auto* op : inference_program->Block(0).AllOps()) { + if (op->Type() == "fetch") { + std::string fetch_target_name = op->Input("X")[0]; + int idx = boost::get(op->GetAttr("col")); + *fetch_targets[fetch_target_name] = + paddle::framework::GetFetchVariable(*scope, "fetch", idx); + } + } + + scope->DeleteScope(&sub_scope); + LOG(INFO) << "Tid: " << tid << ", process " << inputs.size() + << " samples, avg time per sample: " + << (stop_ms - start_ms) / inputs.size() << " ms"; +} + +TEST(inference, nlp) { + if (FLAGS_model_path.empty()) { + LOG(FATAL) << "Usage: ./example --model_path=path/to/your/model"; + } + if (FLAGS_data_file.empty()) { + LOG(WARNING) << "No data file provided, will use dummy data!" + << "Note: if you use nlp model, please provide data file."; + } + LOG(INFO) << "Model Path: " << FLAGS_model_path; + LOG(INFO) << "Data File: " << FLAGS_data_file; + + std::vector datasets; + size_t num_total_words = LoadData(&datasets, FLAGS_data_file); + LOG(INFO) << "Number of samples (seq_len<1024): " << datasets.size(); + LOG(INFO) << "Total number of words: " << num_total_words; + + // 0. Call `paddle::framework::InitDevices()` initialize all the devices + std::unique_ptr scope( + new paddle::framework::Scope()); + + paddle::platform::SetNumThreads(FLAGS_paddle_num_threads); + + double start_ms = 0, stop_ms = 0; + if (FLAGS_num_threads > 1) { + std::vector> jobs; + SplitData(datasets, &jobs, FLAGS_num_threads); + std::vector> threads; + start_ms = GetCurrentMs(); + for (int i = 0; i < FLAGS_num_threads; ++i) { + threads.emplace_back( + new std::thread(ThreadRunInfer, i, scope.get(), std::ref(jobs))); + } + for (int i = 0; i < FLAGS_num_threads; ++i) { + threads[i]->join(); + } + stop_ms = GetCurrentMs(); + } else { + // 1. Define place, executor, scope + paddle::platform::CPUPlace place; + paddle::framework::Executor executor(place); + + // 2. Initialize the inference_program and load parameters + std::unique_ptr inference_program; + inference_program = InitProgram(&executor, scope.get(), FLAGS_model_path, + /*model combined*/ false); + // always prepare context + std::unique_ptr ctx; + ctx = executor.Prepare(*inference_program, 0); + if (FLAGS_prepare_vars) { + executor.CreateVariables(*inference_program, scope.get(), 0); + } + // preapre fetch + const std::vector& fetch_target_names = + inference_program->GetFetchTargetNames(); + PADDLE_ENFORCE_EQ(fetch_target_names.size(), 1UL); + std::map fetch_targets; + paddle::framework::LoDTensor outtensor; + fetch_targets[fetch_target_names[0]] = &outtensor; + + // prepare feed + const std::vector& feed_target_names = + inference_program->GetFeedTargetNames(); + PADDLE_ENFORCE_EQ(feed_target_names.size(), 1UL); + std::map feed_targets; + + // feed data and run + start_ms = GetCurrentMs(); + for (size_t i = 0; i < datasets.size(); ++i) { + feed_targets[feed_target_names[0]] = &(datasets[i]); + executor.RunPreparedContext(ctx.get(), scope.get(), &feed_targets, + &fetch_targets, !FLAGS_prepare_vars); + } + stop_ms = GetCurrentMs(); + LOG(INFO) << "Tid: 0, process " << datasets.size() + << " samples, avg time per sample: " + << (stop_ms - start_ms) / datasets.size() << " ms"; + } + LOG(INFO) << "Total inference time with " << FLAGS_num_threads + << " threads : " << (stop_ms - start_ms) / 1000.0 + << " sec, QPS: " << datasets.size() / ((stop_ms - start_ms) / 1000); +} diff --git a/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc b/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc new file mode 100644 index 00000000..f12828a2 --- /dev/null +++ b/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc @@ -0,0 +1,68 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "gflags/gflags.h" +#include "gtest/gtest.h" +#include "paddle/fluid/inference/tests/test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); +DEFINE_int32(batch_size, 1, "Batch size of input data"); +DEFINE_int32(repeat, 1, "Running the inference program repeat times"); + +TEST(inference, recognize_digits) { + if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model " + "--batch_size=1 --repeat=1"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + paddle::framework::LoDTensor input; + // Use normilized image pixels as input data, + // which should be in the range [-1.0, 1.0]. + SetupTensor(&input, {FLAGS_batch_size, 1, 28, 28}, + static_cast(-1), static_cast(1)); + std::vector cpu_feeds; + cpu_feeds.push_back(&input); + + for (auto is_combined : {false, true}) { + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + LOG(INFO) << "--- CPU Runs: is_combined=" << is_combined << " ---"; + TestInference(dirname, cpu_feeds, cpu_fetchs1, + FLAGS_repeat, is_combined); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + LOG(INFO) << "--- GPU Runs: is_combined=" << is_combined << " ---"; + TestInference(dirname, cpu_feeds, cpu_fetchs2, + FLAGS_repeat, is_combined); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif + } +} diff --git a/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc b/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc new file mode 100644 index 00000000..70aa6b19 --- /dev/null +++ b/paddle/fluid/inference/tests/book/test_inference_recommender_system.cc @@ -0,0 +1,87 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "gflags/gflags.h" +#include "gtest/gtest.h" +#include "paddle/fluid/inference/tests/test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, recommender_system) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + int64_t batch_size = 1; + + paddle::framework::LoDTensor user_id, gender_id, age_id, job_id, movie_id, + category_id, movie_title; + + // Use the first data from paddle.dataset.movielens.test() as input + std::vector user_id_data = {1}; + SetupTensor(&user_id, {batch_size, 1}, user_id_data); + + std::vector gender_id_data = {1}; + SetupTensor(&gender_id, {batch_size, 1}, gender_id_data); + + std::vector age_id_data = {0}; + SetupTensor(&age_id, {batch_size, 1}, age_id_data); + + std::vector job_id_data = {10}; + SetupTensor(&job_id, {batch_size, 1}, job_id_data); + + std::vector movie_id_data = {783}; + SetupTensor(&movie_id, {batch_size, 1}, movie_id_data); + + std::vector category_id_data = {10, 8, 9}; + SetupLoDTensor(&category_id, {3, 1}, {{0, 3}}, category_id_data); + + std::vector movie_title_data = {1069, 4140, 2923, 710, 988}; + SetupLoDTensor(&movie_title, {5, 1}, {{0, 5}}, movie_title_data); + + std::vector cpu_feeds; + cpu_feeds.push_back(&user_id); + cpu_feeds.push_back(&gender_id); + cpu_feeds.push_back(&age_id); + cpu_feeds.push_back(&job_id); + cpu_feeds.push_back(&movie_id); + cpu_feeds.push_back(&category_id); + cpu_feeds.push_back(&movie_title); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc b/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc new file mode 100644 index 00000000..e15c3f59 --- /dev/null +++ b/paddle/fluid/inference/tests/book/test_inference_rnn_encoder_decoder.cc @@ -0,0 +1,65 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "gflags/gflags.h" +#include "gtest/gtest.h" +#include "paddle/fluid/inference/tests/test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, rnn_encoder_decoder) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + paddle::framework::LoDTensor word_data, trg_word; + paddle::framework::LoD lod{{0, 4, 10}}; + + SetupLoDTensor(&word_data, lod, static_cast(0), + static_cast(1)); + SetupLoDTensor(&trg_word, lod, static_cast(0), + static_cast(1)); + + std::vector cpu_feeds; + cpu_feeds.push_back(&word_data); + cpu_feeds.push_back(&trg_word); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.lod(); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.lod(); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc b/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc new file mode 100644 index 00000000..0dbb6a30 --- /dev/null +++ b/paddle/fluid/inference/tests/book/test_inference_understand_sentiment.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "gflags/gflags.h" +#include "gtest/gtest.h" +#include "paddle/fluid/inference/tests/test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, understand_sentiment) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + paddle::framework::LoDTensor words; + paddle::framework::LoD lod{{0, 4, 10}}; + int64_t word_dict_len = 5147; + + SetupLoDTensor(&words, lod, static_cast(0), + static_cast(word_dict_len - 1)); + + std::vector cpu_feeds; + cpu_feeds.push_back(&words); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.lod(); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.lod(); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/fluid/inference/tests/book/test_inference_word2vec.cc b/paddle/fluid/inference/tests/book/test_inference_word2vec.cc new file mode 100644 index 00000000..c9328eb2 --- /dev/null +++ b/paddle/fluid/inference/tests/book/test_inference_word2vec.cc @@ -0,0 +1,68 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "gflags/gflags.h" +#include "gtest/gtest.h" +#include "paddle/fluid/inference/tests/test_helper.h" + +DEFINE_string(dirname, "", "Directory of the inference model."); + +TEST(inference, word2vec) { + if (FLAGS_dirname.empty()) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + } + + LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; + std::string dirname = FLAGS_dirname; + + // 0. Call `paddle::framework::InitDevices()` initialize all the devices + // In unittests, this is done in paddle/testing/paddle_gtest_main.cc + + paddle::framework::LoDTensor first_word, second_word, third_word, fourth_word; + paddle::framework::LoD lod{{0, 1}}; + int64_t dict_size = 2073; // The size of dictionary + + SetupLoDTensor(&first_word, lod, static_cast(0), dict_size - 1); + SetupLoDTensor(&second_word, lod, static_cast(0), dict_size - 1); + SetupLoDTensor(&third_word, lod, static_cast(0), dict_size - 1); + SetupLoDTensor(&fourth_word, lod, static_cast(0), dict_size - 1); + + std::vector cpu_feeds; + cpu_feeds.push_back(&first_word); + cpu_feeds.push_back(&second_word); + cpu_feeds.push_back(&third_word); + cpu_feeds.push_back(&fourth_word); + + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); + + // Run inference on CPU + TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << output1.lod(); + LOG(INFO) << output1.dims(); + +#ifdef PADDLE_WITH_CUDA + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); + + // Run inference on CUDA GPU + TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << output2.lod(); + LOG(INFO) << output2.dims(); + + CheckError(output1, output2); +#endif +} diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake new file mode 100644 index 00000000..444bab1b --- /dev/null +++ b/paddle/fluid/inference/tests/test.cmake @@ -0,0 +1,82 @@ +include(ExternalProject) +set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com" CACHE STRING "inference download url") +set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING + "A path setting inference demo download directories.") +set(CPU_NUM_THREADS_ON_CI 4 CACHE STRING "Run multi-threads on CI to reduce CI time.") + +function(inference_download INSTALL_DIR URL FILENAME) + message(STATUS "Download inference test stuff from ${URL}/${FILENAME}") + string(REGEX REPLACE "[-%.]" "_" FILENAME_EX ${FILENAME}) + ExternalProject_Add( + extern_inference_download_${FILENAME_EX} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${INSTALL_DIR} + URL ${URL}/${FILENAME} + DOWNLOAD_COMMAND wget --no-check-certificate -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} + DOWNLOAD_DIR ${INSTALL_DIR} + DOWNLOAD_NO_PROGRESS 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + UPDATE_COMMAND "" + INSTALL_COMMAND "" + ) +endfunction() + +function(inference_download_and_uncompress INSTALL_DIR URL FILENAME) + message(STATUS "Download inference test stuff from ${URL}/${FILENAME}") + string(REGEX REPLACE "[-%.]" "_" FILENAME_EX ${FILENAME}) + set(EXTERNAL_PROJECT_NAME "extern_inference_download_${FILENAME_EX}") + set(UNPACK_DIR "${INSTALL_DIR}/src/${EXTERNAL_PROJECT_NAME}") + ExternalProject_Add( + ${EXTERNAL_PROJECT_NAME} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${INSTALL_DIR} + DOWNLOAD_COMMAND wget --no-check-certificate -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} && + ${CMAKE_COMMAND} -E tar xzf ${INSTALL_DIR}/${FILENAME} + DOWNLOAD_DIR ${INSTALL_DIR} + DOWNLOAD_NO_PROGRESS 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + UPDATE_COMMAND "" + INSTALL_COMMAND "" + ) +endfunction() + +set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec") +if(NOT EXISTS ${WORD2VEC_INSTALL_DIR} AND NOT WIN32) + inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz") +endif() +set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model") + +function (inference_base_test_build TARGET) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(base_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + cc_test_build(${TARGET} SRCS ${base_test_SRCS} DEPS ${base_test_DEPS}) +endfunction() + +function (inference_base_test_run TARGET) + set(options "") + set(oneValueArgs "") + set(multiValueArgs COMMAND ARGS) + cmake_parse_arguments(base_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + if(WITH_GPU) + set(mem_opt "--fraction_of_gpu_memory_to_use=0.5") + endif() + cc_test_run(${TARGET} COMMAND ${base_test_COMMAND} ARGS ${mem_opt} ${base_test_ARGS}) +endfunction() + +function (inference_base_test TARGET) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS ARGS DEPS) + cmake_parse_arguments(base_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + inference_base_test_build(${TARGET} + SRCS ${base_test_SRCS} + DEPS ${base_test_DEPS}) + inference_base_test_run(${TARGET} + COMMAND ${TARGET} + ARGS ${base_test_ARGS}) +endfunction() + diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h new file mode 100644 index 00000000..861f69f4 --- /dev/null +++ b/paddle/fluid/inference/tests/test_helper.h @@ -0,0 +1,252 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include +#include +#include +#include + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/port.h" +#include "paddle/fluid/platform/profiler.h" + +DECLARE_bool(use_mkldnn); + +template +void SetupTensor(paddle::framework::LoDTensor* input, + paddle::framework::DDim dims, T lower, T upper) { + static unsigned int seed = 100; + std::mt19937 rng(seed++); + std::uniform_real_distribution uniform_dist(0, 1); + + T* input_ptr = input->mutable_data(dims, paddle::platform::CPUPlace()); + for (int i = 0; i < input->numel(); ++i) { + input_ptr[i] = static_cast(uniform_dist(rng) * (upper - lower) + lower); + } +} + +template +void SetupTensor(paddle::framework::LoDTensor* input, + paddle::framework::DDim dims, const std::vector& data) { + CHECK_EQ(paddle::framework::product(dims), static_cast(data.size())); + T* input_ptr = input->mutable_data(dims, paddle::platform::CPUPlace()); + memcpy(input_ptr, data.data(), input->numel() * sizeof(T)); +} + +template +void SetupLoDTensor(paddle::framework::LoDTensor* input, + const paddle::framework::LoD& lod, T lower, T upper) { + input->set_lod(lod); + int dim = lod[0][lod[0].size() - 1]; + SetupTensor(input, {dim, 1}, lower, upper); +} + +template +void SetupLoDTensor(paddle::framework::LoDTensor* input, + paddle::framework::DDim dims, + const paddle::framework::LoD lod, + const std::vector& data) { + const size_t level = lod.size() - 1; + CHECK_EQ(dims[0], static_cast((lod[level]).back())); + input->set_lod(lod); + SetupTensor(input, dims, data); +} + +template +void CheckError(const paddle::framework::LoDTensor& output1, + const paddle::framework::LoDTensor& output2) { + // Check lod information + EXPECT_EQ(output1.lod(), output2.lod()); + + EXPECT_EQ(output1.dims(), output2.dims()); + EXPECT_EQ(output1.numel(), output2.numel()); + + T err = static_cast(0); + if (typeid(T) == typeid(float)) { + err = 1E-3; + } else if (typeid(T) == typeid(double)) { + err = 1E-6; + } else { + err = 0; + } + + size_t count = 0; + for (int64_t i = 0; i < output1.numel(); ++i) { + if (fabs(output1.data()[i] - output2.data()[i]) > err) { + count++; + } + } + EXPECT_EQ(count, 0U) << "There are " << count << " different elements."; +} + +std::unique_ptr InitProgram( + paddle::framework::Executor* executor, paddle::framework::Scope* scope, + const std::string& dirname, const bool is_combined = false, + const std::string& prog_filename = "__model_combined__", + const std::string& param_filename = "__params_combined__") { + std::unique_ptr inference_program; + if (is_combined) { + // All parameters are saved in a single file. + // Hard-coding the file names of program and parameters in unittest. + // The file names should be consistent with that used in Python API + // `fluid.io.save_inference_model`. + inference_program = + paddle::inference::Load(executor, scope, dirname + "/" + prog_filename, + dirname + "/" + param_filename); + } else { + // Parameters are saved in separate files sited in the specified + // `dirname`. + inference_program = paddle::inference::Load(executor, scope, dirname); + } + return inference_program; +} + +std::vector> GetFeedTargetShapes( + const std::string& dirname, const bool is_combined = false, + const std::string& prog_filename = "__model_combined__", + const std::string& param_filename = "__params_combined__") { + auto place = paddle::platform::CPUPlace(); + auto executor = paddle::framework::Executor(place); + auto* scope = new paddle::framework::Scope(); + + auto inference_program = InitProgram(&executor, scope, dirname, is_combined, + prog_filename, param_filename); + auto& global_block = inference_program->Block(0); + + const std::vector& feed_target_names = + inference_program->GetFeedTargetNames(); + std::vector> feed_target_shapes; + for (size_t i = 0; i < feed_target_names.size(); ++i) { + auto* var = global_block.FindVar(feed_target_names[i]); + std::vector var_shape = var->GetShape(); + feed_target_shapes.push_back(var_shape); + } + + delete scope; + return feed_target_shapes; +} + +template +void TestInference(const std::string& dirname, + const std::vector& cpu_feeds, + const std::vector& cpu_fetchs, + const int repeat = 1, const bool is_combined = false) { + // 1. Define place, executor, scope + auto place = Place(); + auto executor = paddle::framework::Executor(place); + auto* scope = new paddle::framework::Scope(); + + // Profile the performance + paddle::platform::ProfilerState state; + if (paddle::platform::is_cpu_place(place)) { + state = paddle::platform::ProfilerState::kCPU; + } else { +#ifdef PADDLE_WITH_CUDA + state = paddle::platform::ProfilerState::kAll; + // The default device_id of paddle::platform::CUDAPlace is 0. + // Users can get the device_id using: + // int device_id = place.GetDeviceId(); + paddle::platform::SetDeviceId(0); +#else + PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); +#endif + } + + // 2. Initialize the inference_program and load parameters + std::unique_ptr inference_program; + + // Enable the profiler + paddle::platform::EnableProfiler(state); + { + paddle::platform::RecordEvent record_event("init_program"); + inference_program = InitProgram(&executor, scope, dirname, is_combined); + } + + // Disable the profiler and print the timing information + paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault, + "load_program_profiler"); + paddle::platform::ResetProfiler(); + + // 3. Get the feed_target_names and fetch_target_names + const std::vector& feed_target_names = + inference_program->GetFeedTargetNames(); + const std::vector& fetch_target_names = + inference_program->GetFetchTargetNames(); + + // 4. Prepare inputs: set up maps for feed targets + std::map feed_targets; + for (size_t i = 0; i < feed_target_names.size(); ++i) { + // Please make sure that cpu_feeds[i] is right for feed_target_names[i] + feed_targets[feed_target_names[i]] = cpu_feeds[i]; + } + + // 5. Define Tensor to get the outputs: set up maps for fetch targets + std::map fetch_targets; + for (size_t i = 0; i < fetch_target_names.size(); ++i) { + fetch_targets[fetch_target_names[i]] = cpu_fetchs[i]; + } + + // 6. If export Flags_use_mkldnn=True, use mkldnn related ops. + if (FLAGS_use_mkldnn) executor.EnableMKLDNN(*inference_program); + + // 7. Run the inference program + { + if (!CreateVars) { + // If users don't want to create and destroy variables every time they + // run, they need to set `create_vars` to false and manually call + // `CreateVariables` before running. + executor.CreateVariables(*inference_program, scope, 0); + } + + // Ignore the profiling results of the first run + std::unique_ptr ctx; + bool CreateLocalScope = CreateVars; + if (PrepareContext) { + ctx = executor.Prepare(*inference_program, 0); + executor.RunPreparedContext(ctx.get(), scope, &feed_targets, + &fetch_targets, CreateLocalScope, CreateVars); + } else { + executor.Run(*inference_program, scope, &feed_targets, &fetch_targets, + CreateLocalScope, CreateVars); + } + + // Enable the profiler + paddle::platform::EnableProfiler(state); + + // Run repeat times to profile the performance + for (int i = 0; i < repeat; ++i) { + paddle::platform::RecordEvent record_event("run_inference"); + + if (PrepareContext) { + // Note: if you change the inference_program, you need to call + // executor.Prepare() again to get a new ExecutorPrepareContext. + executor.RunPreparedContext(ctx.get(), scope, &feed_targets, + &fetch_targets, CreateLocalScope, + CreateVars); + } else { + executor.Run(*inference_program, scope, &feed_targets, &fetch_targets, + CreateLocalScope, CreateVars); + } + } + + // Disable the profiler and print the timing information + paddle::platform::DisableProfiler( + paddle::platform::EventSortingKey::kDefault, "run_inference_profiler"); + paddle::platform::ResetProfiler(); + } + + delete scope; +} diff --git a/paddle/fluid/inference/tests/test_multi_thread_helper.h b/paddle/fluid/inference/tests/test_multi_thread_helper.h new file mode 100644 index 00000000..56745f11 --- /dev/null +++ b/paddle/fluid/inference/tests/test_multi_thread_helper.h @@ -0,0 +1,90 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include // NOLINT +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/inference/io.h" + +void ThreadedRunInference( + const std::unique_ptr& inference_program, + paddle::framework::Executor* executor, paddle::framework::Scope* scope, + const int thread_id, + const std::vector& cpu_feeds, + const std::vector& cpu_fetchs) { + auto copy_program = std::unique_ptr( + new paddle::framework::ProgramDesc(*inference_program)); + + std::string feed_holder_name = "feed_" + paddle::string::to_string(thread_id); + std::string fetch_holder_name = + "fetch_" + paddle::string::to_string(thread_id); + copy_program->SetFeedHolderName(feed_holder_name); + copy_program->SetFetchHolderName(fetch_holder_name); + + // 3. Get the feed_target_names and fetch_target_names + const std::vector& feed_target_names = + copy_program->GetFeedTargetNames(); + const std::vector& fetch_target_names = + copy_program->GetFetchTargetNames(); + + // 4. Prepare inputs: set up maps for feed targets + std::map feed_targets; + for (size_t i = 0; i < feed_target_names.size(); ++i) { + // Please make sure that cpu_feeds[i] is right for feed_target_names[i] + feed_targets[feed_target_names[i]] = cpu_feeds[i]; + } + + // 5. Define Tensor to get the outputs: set up maps for fetch targets + std::map fetch_targets; + for (size_t i = 0; i < fetch_target_names.size(); ++i) { + fetch_targets[fetch_target_names[i]] = cpu_fetchs[i]; + } + + // 6. Run the inference program + executor->Run(*copy_program, scope, feed_targets, fetch_targets, true, + feed_holder_name, fetch_holder_name); +} + +template +void TestMultiThreadInference( + const std::string& dirname, + const std::vector>& cpu_feeds, + const std::vector>& cpu_fetchs, + const int num_threads) { + // 1. Define place, executor, scope + auto place = Place(); + auto executor = paddle::framework::Executor(place); + auto* scope = new paddle::framework::Scope(); + + // 2. Initialize the inference_program and load parameters + std::unique_ptr inference_program = + paddle::inference::Load(executor, *scope, dirname); + + std::vector threads; + for (int i = 0; i < num_threads; ++i) { + threads.push_back(new std::thread( + ThreadedRunInference, std::ref(inference_program), &executor, scope, i, + std::ref(cpu_feeds[i]), std::ref(cpu_fetchs[i]))); + } + for (int i = 0; i < num_threads; ++i) { + threads[i]->join(); + delete threads[i]; + } + + delete scope; +} diff --git a/paddle/fluid/inference/utils/CMakeLists.txt b/paddle/fluid/inference/utils/CMakeLists.txt new file mode 100644 index 00000000..2104e4ac --- /dev/null +++ b/paddle/fluid/inference/utils/CMakeLists.txt @@ -0,0 +1,2 @@ +cc_library(benchmark SRCS benchmark.cc DEPS enforce) +cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark) diff --git a/paddle/fluid/inference/utils/benchmark.cc b/paddle/fluid/inference/utils/benchmark.cc new file mode 100644 index 00000000..0bd526bc --- /dev/null +++ b/paddle/fluid/inference/utils/benchmark.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/utils/benchmark.h" +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { + +std::string Benchmark::SerializeToString() const { + std::stringstream ss; + ss << "-----------------------------------------------------\n"; + ss << "name\t"; + ss << "batch_size\t"; + ss << "num_threads\t"; + ss << "latency\t"; + ss << "qps"; + ss << '\n'; + + ss << name_ << "\t"; + ss << batch_size_ << "\t\t"; + ss << num_threads_ << "\t"; + ss << latency_ << "\t"; + ss << 1000.0 / latency_; + ss << '\n'; + return ss.str(); +} +void Benchmark::PersistToFile(const std::string &path) const { + std::ofstream file(path, std::ios::app); + PADDLE_ENFORCE(file.is_open(), "Can not open %s to add benchmark", path); + file << SerializeToString(); + file.flush(); + file.close(); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/utils/benchmark.h b/paddle/fluid/inference/utils/benchmark.h new file mode 100644 index 00000000..a1304cf4 --- /dev/null +++ b/paddle/fluid/inference/utils/benchmark.h @@ -0,0 +1,54 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include + +namespace paddle { +namespace inference { + +/* + * Helper class to calculate the performance. + */ +struct Benchmark { + int batch_size() const { return batch_size_; } + void SetBatchSize(int x) { batch_size_ = x; } + + int num_threads() const { return num_threads_; } + void SetNumThreads(int x) { num_threads_ = x; } + + bool use_gpu() const { return use_gpu_; } + void SetUseGpu() { use_gpu_ = true; } + + float latency() const { return latency_; } + void SetLatency(float x) { latency_ = x; } + + const std::string& name() const { return name_; } + void SetName(const std::string& name) { name_ = name; } + + std::string SerializeToString() const; + void PersistToFile(const std::string& path) const; + + private: + bool use_gpu_{false}; + int batch_size_{0}; + float latency_; + int num_threads_{1}; + std::string name_; +}; + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/utils/benchmark_tester.cc b/paddle/fluid/inference/utils/benchmark_tester.cc new file mode 100644 index 00000000..0c48c2db --- /dev/null +++ b/paddle/fluid/inference/utils/benchmark_tester.cc @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/utils/benchmark.h" +#include +#include + +using namespace paddle::inference; // NOLINT +TEST(Benchmark, basic) { + Benchmark benchmark; + benchmark.SetName("key0"); + benchmark.SetBatchSize(10); + benchmark.SetUseGpu(); + benchmark.SetLatency(220); + LOG(INFO) << "benchmark:\n" << benchmark.SerializeToString(); +} + +TEST(Benchmark, PersistToFile) { + Benchmark benchmark; + benchmark.SetName("key0"); + benchmark.SetBatchSize(10); + benchmark.SetUseGpu(); + benchmark.SetLatency(220); + + benchmark.PersistToFile("1.log"); + benchmark.PersistToFile("2.log"); + benchmark.PersistToFile("3.log"); +} diff --git a/paddle/fluid/inference/utils/singleton.h b/paddle/fluid/inference/utils/singleton.h new file mode 100644 index 00000000..990bef35 --- /dev/null +++ b/paddle/fluid/inference/utils/singleton.h @@ -0,0 +1,77 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { + +// NOTE not thread-safe. +template +struct Singleton { + static T& Global() { + static T* x = new T; + return *x; + } + + Singleton() = delete; + Singleton& operator=(const Singleton&) = delete; +}; + +/* + * An registor for any type. + * NOTE not thread-safe. + */ +template +struct Registry { + static Registry& Global() { + static auto* x = new Registry; + return *x; + } + + template + void Register(const std::string& name) { + PADDLE_ENFORCE_EQ(items_.count(name), 0); + items_[name] = new ItemChild; + } + + ItemParent* Lookup(const std::string& name, + const std::string& default_name = "") { + auto it = items_.find(name); + if (it == items_.end()) { + if (default_name == "") + return nullptr; + else + return items_.find(default_name)->second; + } + return it->second; + } + + ~Registry() { + for (auto& item : items_) { + delete item.second; + } + } + + private: + Registry() = default; + std::unordered_map items_; +}; + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt new file mode 100644 index 00000000..7eb663ea --- /dev/null +++ b/paddle/fluid/memory/CMakeLists.txt @@ -0,0 +1,12 @@ +add_subdirectory(detail) +add_subdirectory(allocation) +cc_library(malloc SRCS malloc.cc DEPS place enforce allocator_facade profiler) +cc_library(memcpy SRCS memcpy.cc DEPS place) + +cc_library(memory + DEPS + malloc + memcpy) +#if (WITH_GPU) +# nv_test(pinned_memory_test SRCS pinned_memory_test.cu DEPS place memory) +#endif() diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt new file mode 100644 index 00000000..888c214e --- /dev/null +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -0,0 +1,57 @@ +cc_library(allocator SRCS allocator.cc DEPS place) +cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator) +cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator) +cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator) +cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator) +cc_library(naive_best_fit_allocator SRCS naive_best_fit_allocator.cc DEPS allocator buddy_allocator profiler) +cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS locked_allocator buffered_allocator cpu_allocator best_fit_allocator) + +if (WITH_GPU) + nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard) +endif() + +cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator) + +nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator) +if (WITH_GPU) + set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard) +else () + set(AllocatorFacadeDeps) +endif() + +if (WITH_GPU) + nv_test(best_fit_allocator_test + SRCS best_fit_allocator_test.cc + best_fit_allocator_test.cu + DEPS best_fit_allocator + locked_allocator + cpu_allocator + cuda_allocator + device_context + memcpy) +else() + cc_test(best_fit_allocator_test + SRCS best_fit_allocator_test.cc + DEPS best_fit_allocator + locked_allocator + cpu_allocator) +endif() + +list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator aligned_allocator retry_allocator buffered_allocator naive_best_fit_allocator auto_growth_best_fit_allocator best_fit_allocator) + +cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator) +cc_test(test_aligned_allocator SRCS test_aligned_allocator.cc DEPS aligned_allocator) +cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps}) +cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy) + +cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator locked_allocator cpu_allocator) +if (WITH_TESTING) + set_tests_properties(retry_allocator_test PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") +endif() + +cc_test(allocator_facade_abs_flags_test SRCS allocator_facade_abs_flags_test.cc DEPS allocator_facade) + +cc_test(allocator_facade_frac_flags_test SRCS allocator_facade_frac_flags_test.cc DEPS allocator_facade) + +cc_library(auto_growth_best_fit_allocator SRCS auto_growth_best_fit_allocator.cc DEPS allocator aligned_allocator) +cc_test(auto_growth_best_fit_allocator_facade_test SRCS auto_growth_best_fit_allocator_facade_test.cc DEPS cpu_allocator auto_growth_best_fit_allocator) diff --git a/paddle/fluid/memory/allocation/aligned_allocator.cc b/paddle/fluid/memory/allocation/aligned_allocator.cc new file mode 100644 index 00000000..c9a031df --- /dev/null +++ b/paddle/fluid/memory/allocation/aligned_allocator.cc @@ -0,0 +1,58 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/aligned_allocator.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace memory { +namespace allocation { + +class AlignedAllocation : public Allocation { + public: + AlignedAllocation(AllocationPtr underlying_allocation, size_t offset) + : Allocation( + reinterpret_cast(underlying_allocation->ptr()) + offset, + underlying_allocation->size() - offset, + underlying_allocation->place()), + underlying_allocation_(std::move(underlying_allocation)) {} + + private: + AllocationPtr underlying_allocation_; +}; + +AlignedAllocator::AlignedAllocator( + const std::shared_ptr& underlyning_allocator, size_t alignment) + : underlying_allocator_(underlyning_allocator), alignment_(alignment) { + PADDLE_ENFORCE(alignment_ > 0, "alignment must be positive integer"); + if (alignment_ & (alignment_ - 1)) { + PADDLE_THROW("alignment must be 2^N, but got %d", alignment_); + } +} + +bool AlignedAllocator::IsAllocThreadSafe() const { + return underlying_allocator_->IsAllocThreadSafe(); +} + +Allocation* AlignedAllocator::AllocateImpl(size_t size) { + auto raw_allocation = underlying_allocator_->Allocate(size + alignment_); + size_t offset = AlignedPtrOffset(raw_allocation->ptr(), alignment_); + return new AlignedAllocation(std::move(raw_allocation), offset); +} + +void AlignedAllocator::FreeImpl(Allocation* allocation) { delete allocation; } + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/aligned_allocator.h b/paddle/fluid/memory/allocation/aligned_allocator.h new file mode 100644 index 00000000..71250766 --- /dev/null +++ b/paddle/fluid/memory/allocation/aligned_allocator.h @@ -0,0 +1,43 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "paddle/fluid/memory/allocation/allocator.h" + +namespace paddle { +namespace memory { +namespace allocation { + +class AlignedAllocator : public Allocator { + public: + AlignedAllocator(const std::shared_ptr& underlying_allocator, + size_t alignment); + + bool IsAllocThreadSafe() const override; + + protected: + Allocation* AllocateImpl(size_t size) override; + + void FreeImpl(Allocation* allocation) override; + + private: + std::shared_ptr underlying_allocator_; + size_t alignment_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator.cc b/paddle/fluid/memory/allocation/allocator.cc new file mode 100644 index 00000000..4998f3db --- /dev/null +++ b/paddle/fluid/memory/allocation/allocator.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/allocator.h" + +namespace paddle { +namespace memory { +namespace allocation { + +bool Allocator::IsAllocThreadSafe() const { return false; } + +void Allocator::FreeImpl(Allocation* allocation) { + Allocator* allocator = allocation->TopDecoratedAllocator(); + allocator->Free(allocation); +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h new file mode 100644 index 00000000..5d7c9bde --- /dev/null +++ b/paddle/fluid/memory/allocation/allocator.h @@ -0,0 +1,203 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/inlined_vector.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace memory { +namespace allocation { + +// Exception when `Alloc`/`AllocShared` failed +class BadAlloc : public std::exception { + public: + inline explicit BadAlloc(std::string msg) : msg_(std::move(msg)) {} + + inline const char* what() const noexcept override { return msg_.c_str(); } + + private: + std::string msg_; +}; + +class Allocator; + +// Allocation is the object holding the actually pointer. Use +// `Allocation::ptr()` will returns the pointer that allocated. +// +// NOTE: this is the base class of Allocation. Each allocator can use its own +// allocation object. +// NOTE: the `Allocation::ptr()` could be nullptr, if the allocation size is 0 + +/** + * Allocation is returned by Allocator::Allocate() method. + * + * An allocator may be decorated by another allocator. For example, we can + * decorate a RetryAllocator to any allocator to perform allocation retry when + * first allocation request fails. + * + * Explanations of Allocator design are as follows: + * + * Suppose we have an allocator which is decorated by several allocators: + * + * A(1) <- A(2) <- A(3) <- ... <- A(n) + * + * , and the public allocator is A(1). + * + * The allocation process would be: + * + * A(n).Allocate() -> ... -> A(2).Allocate() -> A(1).Allocate() + * + * , and the free process would be: + * + * A(1).Free() -> A(2).Free() -> ... -> A(n).Free() + * + * Therefore, we should record the allocator chain when allocating, so + * that we can free the allocation in the reverse order of allocator chain. + * The field `decorated_allocators_` is used to record this chain. + * + * Another example is that we want to add additional fields in Allocation, + * e.g., something what is done in AlignedAllocator, etc. + * In this case, we should declare a derived class of Allocation, which + * contains an underlying Allocation allocated by the underlying allocator. + * Therefore, `decorated_allocators_` of the new Allocation object would + * be a new chain, differing from the underlying Allocation object. + */ +class Allocation { + public: + inline Allocation(void* ptr, size_t size, platform::Place place) + : ptr_(ptr), size_(size), place_(place) {} + + Allocation(const Allocation& o) = delete; + Allocation& operator=(const Allocation& o) = delete; + Allocation(Allocation&& o) = delete; + Allocation& operator=(Allocation&& o) = delete; + + // Returns the holding pointer. + // NOTE: For performance consideration, it is better not to make this method + // as a virtual method. If we want to implement a `defragmentation` later, + // we might need to make `ptr_` field as a protected field, and add a virtual + // method like `defragmentation` to change `ptr_`. + inline void* ptr() const { return ptr_; } + + // Returns the size of this memory buffer, i.e., ptr() + size() - 1 is the + // last valid element. + // + // NOTE: Some allocator might alloc more memory than request. The size + // could larger than its request. For example, + // the AlignedAllocator will always allocate memory as size + kAlignment. + // The raw pointer might not aligned, so an offset might be added to raw + // the pointer. The size of this allocation will be + // `size + kAlignemnt - offset`. + inline size_t size() const { return size_; } + + inline const platform::Place& place() const { return place_; } + + virtual ~Allocation() {} + + private: + inline void RegisterDecoratedAllocator(Allocator* allocator) { + decorated_allocators_.emplace_back(allocator); + } + + inline void PopDecoratedAllocator() { decorated_allocators_.pop_back(); } + + inline Allocator* TopDecoratedAllocator() { + return decorated_allocators_.back(); + } + + private: + void* ptr_; + size_t size_; + platform::Place place_; + + /** + * NOTE(zjl): Since decorated_allocators_ is usually a small vector. + * We reserve a small buffer to it to prevent frequent heap allocation + * + * Instead, we can use a std::vector here, and reserve + * kReserveAllocatorNum in constructor of Allocation. + * But using std::vector would make ocr recognition model + * fail in CE. The train duration is 8% slower than KPI. + */ + static constexpr size_t kReserveAllocatorNum = 8; + using DecoratedAllocatorStack = + framework::InlinedVector; + + DecoratedAllocatorStack decorated_allocators_; + + friend class Allocator; +}; + +// Base interface class of memory Allocator. +class Allocator { + public: + virtual ~Allocator() {} + + class AllocationDeleter { + public: + inline void operator()(Allocation* allocation) const { + Allocator* allocator = allocation->TopDecoratedAllocator(); + allocator->Free(allocation); + } + }; + + using AllocationPtr = std::unique_ptr; + + // Allocate an allocation. + // size may be 0, but it would be too complex if we handle size == 0 + // in each Allocator. So we handle size == 0 inside AllocatorFacade + // in our design. + inline AllocationPtr Allocate(size_t size) { + auto ptr = AllocateImpl(size); + ptr->RegisterDecoratedAllocator(this); + return AllocationPtr(ptr); + } + + // This function should not be called outside Allocator class + inline void Free(Allocation* allocation) { + allocation->PopDecoratedAllocator(); + FreeImpl(allocation); + } + + // True if the `Allocate` is thread safe. + virtual bool IsAllocThreadSafe() const; + + protected: + virtual Allocation* AllocateImpl(size_t size) = 0; + virtual void FreeImpl(Allocation* allocation); +}; + +using AllocationDeleter = Allocator::AllocationDeleter; +using AllocationPtr = Allocator::AllocationPtr; + +inline size_t AlignedSize(size_t size, size_t alignment) { + auto remaining = size % alignment; + return remaining == 0 ? size : size + alignment - remaining; +} + +inline size_t AlignedPtrOffset(const void* ptr, size_t alignment) { + auto ptr_addr = reinterpret_cast(ptr); + auto diff = ptr_addr % alignment; + return diff == 0 ? 0 : alignment - diff; +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc new file mode 100644 index 00000000..77b95f71 --- /dev/null +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -0,0 +1,176 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/allocator.h" +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/fluid/memory/allocation/allocator_strategy.h" +#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h" +#include "paddle/fluid/memory/allocation/cpu_allocator.h" +#include "paddle/fluid/memory/allocation/locked_allocator.h" +#include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h" +#include "paddle/fluid/memory/allocation/retry_allocator.h" +#include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/memory/allocation/cuda_allocator.h" +#include "paddle/fluid/memory/allocation/pinned_allocator.h" +#include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/fluid/platform/gpu_info.h" +#endif + +DEFINE_int64( + gpu_allocator_retry_time, 0, + "The retry time (milliseconds) when allocator fails " + "to allocate memory. No retry if this value is not greater than 0"); + +namespace paddle { +namespace memory { +namespace allocation { + +class AllocatorFacadePrivate { + public: + AllocatorFacadePrivate() { + auto strategy = GetAllocatorStrategy(); + switch (strategy) { + case AllocatorStrategy::kNaiveBestFit: { + InitNaiveBestFitCPUAllocator(); +#ifdef PADDLE_WITH_CUDA + for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); + ++dev_id) { + InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id)); + } + InitNaiveBestFitCUDAPinnedAllocator(); +#endif + break; + } + + case AllocatorStrategy::kAutoGrowth: { + InitNaiveBestFitCPUAllocator(); +#ifdef PADDLE_WITH_CUDA + for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); + ++dev_id) { + InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id)); + } + InitNaiveBestFitCUDAPinnedAllocator(); +#endif + break; + } + + default: { + PADDLE_THROW("Unsupported allocator strategy: %d", + static_cast(strategy)); + } + } + InitZeroSizeAllocators(); + } + + inline const std::shared_ptr& GetAllocator( + const platform::Place& place, size_t size) { + const auto& allocators = (size > 0 ? allocators_ : zero_size_allocators_); + auto iter = allocators.find(place); + PADDLE_ENFORCE(iter != allocators.end(), + "No such allocator for the place, %s", place); + return iter->second; + } + + private: + void InitNaiveBestFitCPUAllocator() { + allocators_[platform::CPUPlace()] = + std::make_shared(platform::CPUPlace()); + } + +#ifdef PADDLE_WITH_CUDA + void InitNaiveBestFitCUDAPinnedAllocator() { + allocators_[platform::CUDAPinnedPlace()] = + std::make_shared(platform::CUDAPinnedPlace()); + } + + void InitNaiveBestFitCUDAAllocator(platform::CUDAPlace p) { + allocators_[p] = std::make_shared(p); + } + + void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p) { + auto cuda_allocator = std::make_shared(p); + allocators_[p] = std::make_shared( + cuda_allocator, platform::GpuMinChunkSize()); + } +#endif + + class ZeroSizeAllocator : public Allocator { + public: + explicit ZeroSizeAllocator(platform::Place place) : place_(place) {} + + protected: + Allocation* AllocateImpl(size_t size) override { + return new Allocation(nullptr, 0, place_); + } + + void FreeImpl(Allocation* allocation) override { delete allocation; } + + private: + platform::Place place_; + }; + + void InitZeroSizeAllocators() { + std::vector places; + places.emplace_back(platform::CPUPlace()); +#ifdef PADDLE_WITH_CUDA + int device_count = platform::GetCUDADeviceCount(); + for (int dev_id = 0; dev_id < device_count; ++dev_id) { + places.emplace_back(platform::CUDAPlace(dev_id)); + } + places.emplace_back(platform::CUDAPinnedPlace()); +#endif + + for (auto& p : places) { + zero_size_allocators_[p] = std::make_shared(p); + } + } + + private: + std::map> allocators_; + std::map> zero_size_allocators_; +}; + +// Pimpl. Make interface clean. +AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {} +// delete m_ may cause core dump when the destructor of python in conflict with +// cpp. +AllocatorFacade::~AllocatorFacade() {} + +AllocatorFacade& AllocatorFacade::Instance() { + static AllocatorFacade instance; + return instance; +} + +std::shared_ptr AllocatorFacade::AllocShared( + const platform::Place& place, size_t size) { + return std::shared_ptr(Alloc(place, size)); +} + +AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, + size_t size) { + return m_->GetAllocator(place, size)->Allocate(size); +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h new file mode 100644 index 00000000..64b6fe25 --- /dev/null +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -0,0 +1,55 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace memory { +namespace allocation { + +// Allocator Facade is the interface exposed to other modules. +// All the configuration or dirty code under development should +// be hidden behind this facade. +// +// NOTE(yy): This class is a singleton class. +// NOTE(yy): To create a stable ABI and make compilation faster. Here we use +// a Pimpl trick; +class AllocatorFacadePrivate; +class AllocatorFacade { + public: + ~AllocatorFacade(); + AllocatorFacade(const AllocatorFacade& o) = delete; + const AllocatorFacade& operator=(const AllocatorFacade& o) = delete; + + static AllocatorFacade& Instance(); + + // Allocate a shared allocation. + std::shared_ptr AllocShared(const platform::Place& place, + size_t size); + + // Allocate a unique allocation. + AllocationPtr Alloc(const platform::Place& place, size_t size); + + // TODO(yy): Allocate a Copy-On-Write allocation? + private: + AllocatorFacade(); + AllocatorFacadePrivate* m_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc b/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc new file mode 100644 index 00000000..3e10be03 --- /dev/null +++ b/paddle/fluid/memory/allocation/allocator_facade_abs_flags_test.cc @@ -0,0 +1,103 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#include +#include + +#ifdef PADDLE_WITH_CUDA +DECLARE_double(fraction_of_gpu_memory_to_use); +DECLARE_double(fraction_of_cuda_pinned_memory_to_use); +DECLARE_uint64(initial_gpu_memory_in_mb); +DECLARE_uint64(reallocate_gpu_memory_in_mb); +DECLARE_int64(gpu_allocator_retry_time); +#endif +DECLARE_string(allocator_strategy); + +namespace paddle { +namespace memory { +namespace allocation { + +//! Run allocate test cases for different places +void AllocateTestCases() { + auto &instance = AllocatorFacade::Instance(); + platform::Place place; + size_t size = 1024; + + { + place = platform::CPUPlace(); + size = 1024; + auto cpu_allocation = instance.Alloc(place, size); + ASSERT_NE(cpu_allocation, nullptr); + ASSERT_NE(cpu_allocation->ptr(), nullptr); + ASSERT_EQ(cpu_allocation->place(), place); + ASSERT_EQ(cpu_allocation->size(), size); + } + +#ifdef PADDLE_WITH_CUDA + { + place = platform::CUDAPlace(0); + size = 1024; + auto gpu_allocation = instance.Alloc(place, size); + ASSERT_NE(gpu_allocation, nullptr); + ASSERT_NE(gpu_allocation->ptr(), nullptr); + ASSERT_EQ(gpu_allocation->place(), place); + ASSERT_GE(gpu_allocation->size(), size); + } + + { + // Allocate 2GB gpu memory + place = platform::CUDAPlace(0); + size = 2 * static_cast(1 << 30); + auto gpu_allocation = instance.Alloc(place, size); + ASSERT_NE(gpu_allocation, nullptr); + ASSERT_NE(gpu_allocation->ptr(), nullptr); + ASSERT_EQ(gpu_allocation->place(), place); + ASSERT_GE(gpu_allocation->size(), size); + } + + { + place = platform::CUDAPinnedPlace(); + size = (1 << 20); + auto cuda_pinned_allocation = + instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20); + ASSERT_NE(cuda_pinned_allocation, nullptr); + ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr); + ASSERT_EQ(cuda_pinned_allocation->place(), place); + ASSERT_GE(cuda_pinned_allocation->size(), size); + } +#endif +} + +TEST(Allocator, SpecifyGpuMemory) { +#ifdef PADDLE_WITH_CUDA + // Set to 0.0 to test FLAGS_initial_gpu_memory_in_mb and + // FLAGS_reallocate_gpu_memory_in_mb + FLAGS_fraction_of_gpu_memory_to_use = 0.0; + // 512 MB + FLAGS_initial_gpu_memory_in_mb = 512; + // 4 MB + FLAGS_reallocate_gpu_memory_in_mb = 4; + FLAGS_gpu_allocator_retry_time = 500; + FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5; +#endif + + FLAGS_allocator_strategy = "naive_best_fit"; + + AllocateTestCases(); +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc b/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc new file mode 100644 index 00000000..3748805b --- /dev/null +++ b/paddle/fluid/memory/allocation/allocator_facade_frac_flags_test.cc @@ -0,0 +1,96 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#include +#include + +#ifdef PADDLE_WITH_CUDA +DECLARE_double(fraction_of_gpu_memory_to_use); +DECLARE_double(fraction_of_cuda_pinned_memory_to_use); +DECLARE_uint64(initial_gpu_memory_in_mb); +DECLARE_uint64(reallocate_gpu_memory_in_mb); +DECLARE_int64(gpu_allocator_retry_time); +#endif +DECLARE_string(allocator_strategy); + +namespace paddle { +namespace memory { +namespace allocation { + +//! Run allocate test cases for different places +void AllocateTestCases() { + auto &instance = AllocatorFacade::Instance(); + platform::Place place; + size_t size = 1024; + + { + place = platform::CPUPlace(); + size = 1024; + auto cpu_allocation = instance.Alloc(place, size); + ASSERT_NE(cpu_allocation, nullptr); + ASSERT_NE(cpu_allocation->ptr(), nullptr); + ASSERT_EQ(cpu_allocation->place(), place); + ASSERT_EQ(cpu_allocation->size(), size); + } + +#ifdef PADDLE_WITH_CUDA + { + place = platform::CUDAPlace(0); + size = 1024; + auto gpu_allocation = instance.Alloc(place, size); + ASSERT_NE(gpu_allocation, nullptr); + ASSERT_NE(gpu_allocation->ptr(), nullptr); + ASSERT_EQ(gpu_allocation->place(), place); + ASSERT_GE(gpu_allocation->size(), size); + } + + { + // Allocate 2GB gpu memory + place = platform::CUDAPlace(0); + size = 2 * static_cast(1 << 30); + auto gpu_allocation = instance.Alloc(place, size); + ASSERT_NE(gpu_allocation, nullptr); + ASSERT_NE(gpu_allocation->ptr(), nullptr); + ASSERT_EQ(gpu_allocation->place(), place); + ASSERT_GE(gpu_allocation->size(), size); + } + + { + place = platform::CUDAPinnedPlace(); + size = (1 << 20); + auto cuda_pinned_allocation = + instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20); + ASSERT_NE(cuda_pinned_allocation, nullptr); + ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr); + ASSERT_EQ(cuda_pinned_allocation->place(), place); + ASSERT_GE(cuda_pinned_allocation->size(), size); + } +#endif +} + +TEST(Allocator, Allocator) { +#ifdef PADDLE_WITH_CUDA + FLAGS_fraction_of_gpu_memory_to_use = 0.01; + FLAGS_gpu_allocator_retry_time = 500; + FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5; +#endif + FLAGS_allocator_strategy = "naive_best_fit"; + + AllocateTestCases(); +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator_strategy.cc b/paddle/fluid/memory/allocation/allocator_strategy.cc new file mode 100644 index 00000000..4e45cc4d --- /dev/null +++ b/paddle/fluid/memory/allocation/allocator_strategy.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/allocator_strategy.h" +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "paddle/fluid/platform/enforce.h" + +DEFINE_string(allocator_strategy, "naive_best_fit", + "The allocation strategy. naive_best_fit means the original best " + "fit allocator of Fluid. " + "auto_growth means the experimental auto-growth allocator. " + "Enum in [naive_best_fit, auto_growth]."); + +namespace paddle { +namespace memory { +namespace allocation { + +static AllocatorStrategy GetStrategyFromFlag() { + if (FLAGS_allocator_strategy == "naive_best_fit") { + return AllocatorStrategy::kNaiveBestFit; + } + + if (FLAGS_allocator_strategy == "auto_growth") { + return AllocatorStrategy::kAutoGrowth; + } + + PADDLE_THROW("Unsupported allocator strategy: %s", FLAGS_allocator_strategy); +} + +AllocatorStrategy GetAllocatorStrategy() { + static AllocatorStrategy strategy = GetStrategyFromFlag(); + return strategy; +} + +void UseAllocatorStrategyGFlag() {} +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator_strategy.h b/paddle/fluid/memory/allocation/allocator_strategy.h new file mode 100644 index 00000000..ff6e7839 --- /dev/null +++ b/paddle/fluid/memory/allocation/allocator_strategy.h @@ -0,0 +1,30 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace paddle { +namespace memory { +namespace allocation { + +enum class AllocatorStrategy { kNaiveBestFit, kAutoGrowth }; + +extern AllocatorStrategy GetAllocatorStrategy(); + +// Do nothing, just make sure linker do not prune this file. +extern void UseAllocatorStrategyGFlag(); + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc new file mode 100644 index 00000000..0d3b11f7 --- /dev/null +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc @@ -0,0 +1,124 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h" +#include +#include +#include +#include +#include // NOLINT +#include +#include "paddle/fluid/memory/allocation/aligned_allocator.h" + +namespace paddle { +namespace memory { +namespace allocation { + +AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator( + const std::shared_ptr &underlying_allocator, size_t alignment, + size_t chunk_size) + : underlying_allocator_( + std::make_shared(underlying_allocator, alignment)), + alignment_(alignment), + chunk_size_(std::max(AlignedSize(chunk_size, alignment), alignment)) {} + +Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) { + size = AlignedSize(size, alignment_); + + std::lock_guard guard(mtx_); + auto iter = free_blocks_.lower_bound(std::make_pair(size, nullptr)); + BlockIt block_it; + if (iter != free_blocks_.end()) { + block_it = iter->second; + free_blocks_.erase(iter); + auto *chunk = block_it->chunk_; + size_t remaining_size = block_it->size_ - size; + if (remaining_size == 0) { + block_it->is_free_ = false; + } else { + auto remaining_free_block = chunk->blocks_.insert( + block_it, Block(block_it->ptr_, remaining_size, true, chunk)); + free_blocks_.emplace(std::make_pair(remaining_size, block_it->ptr_), + remaining_free_block); + block_it->ptr_ = + reinterpret_cast(block_it->ptr_) + remaining_size; + block_it->size_ = size; + block_it->is_free_ = false; + } + } else { + size_t realloc_size = std::max(size, chunk_size_); + + try { + chunks_.emplace_back(underlying_allocator_->Allocate(realloc_size)); + } catch (BadAlloc &ex) { + if (size == realloc_size) throw ex; + realloc_size = size; + chunks_.emplace_back(underlying_allocator_->Allocate(realloc_size)); + } + + auto *chunk = &(*chunks_.rbegin()); + realloc_size = chunk->allocation_->size(); + uint8_t *p = reinterpret_cast(chunk->allocation_->ptr()); + auto &blocks = chunk->blocks_; + + size_t remaining_size = realloc_size - size; + if (remaining_size > 0) { + blocks.emplace_back(p, remaining_size, true, chunk); + free_blocks_.emplace(std::make_pair(remaining_size, p), --(blocks.end())); + } + blocks.emplace_back(p + remaining_size, size, false, chunk); + block_it = --(blocks.end()); + VLOG(2) << "Not found and reallocate " << realloc_size << ", and remaining " + << remaining_size; + } + return new BlockAllocation(block_it); +} + +void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) { + std::lock_guard guard(mtx_); + auto block_it = static_cast(allocation)->block_it_; + auto &blocks = block_it->chunk_->blocks_; + + block_it->is_free_ = true; + + if (block_it != blocks.begin()) { + auto prev_it = block_it; + --prev_it; + + if (prev_it->is_free_) { + free_blocks_.erase(std::make_pair(prev_it->size_, prev_it->ptr_)); + prev_it->size_ += block_it->size_; + blocks.erase(block_it); + block_it = prev_it; + } + } + + auto next_it = block_it; + ++next_it; + + if (next_it != blocks.end() && next_it->is_free_) { + free_blocks_.erase(std::make_pair(next_it->size_, next_it->ptr_)); + block_it->size_ += next_it->size_; + blocks.erase(next_it); + } + + free_blocks_.emplace(std::make_pair(block_it->size_, block_it->ptr_), + block_it); + + delete allocation; +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h new file mode 100644 index 00000000..a31dd7cf --- /dev/null +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h @@ -0,0 +1,86 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include // NOLINT +#include +#include "paddle/fluid/memory/allocation/allocator.h" + +namespace paddle { +namespace memory { +namespace allocation { + +class AutoGrowthBestFitAllocator : public Allocator { + public: + explicit AutoGrowthBestFitAllocator( + const std::shared_ptr &underlying_allocator, size_t alignment, + size_t chunk_size = 0); + + bool IsAllocThreadSafe() const override { return true; } + + protected: + Allocation *AllocateImpl(size_t size) override; + + void FreeImpl(Allocation *allocation) override; + + private: + template + using List = std::list; + + struct Chunk; + + struct Block { + Block(void *ptr, size_t size, bool is_free, Chunk *chunk) + : ptr_(ptr), size_(size), is_free_(is_free), chunk_(chunk) {} + + void *ptr_; + size_t size_; + bool is_free_; + Chunk *chunk_; // which chunk it is from + }; + + struct Chunk { + explicit Chunk(AllocationPtr allocation) + : allocation_(std::move(allocation)) {} + + AllocationPtr allocation_; + List blocks_; + }; + + struct BlockAllocation : public Allocation { + explicit BlockAllocation(const List::iterator &it) + : Allocation(it->ptr_, it->size_, it->chunk_->allocation_->place()), + block_it_(it) {} + + List::iterator block_it_; + }; + + using BlockIt = List::iterator; + + std::shared_ptr underlying_allocator_; + std::map, BlockIt> free_blocks_; + std::list chunks_; + size_t alignment_; + size_t chunk_size_; + + mutable std::mutex mtx_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc new file mode 100644 index 00000000..69de0273 --- /dev/null +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_facade_test.cc @@ -0,0 +1,148 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include +#include // NOLINT +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/fluid/platform/gpu_info.h" + +#ifdef PADDLE_WITH_CUDA +DECLARE_double(fraction_of_gpu_memory_to_use); +DECLARE_double(fraction_of_cuda_pinned_memory_to_use); +DECLARE_int64(gpu_allocator_retry_time); +#endif + +DECLARE_string(allocator_strategy); + +namespace paddle { +namespace memory { +namespace allocation { + +static inline size_t AlignTo(size_t size, size_t alignment) { + auto remaining = size % alignment; + return remaining == 0 ? size : size + alignment - remaining; +} + +TEST(allocator, allocator) { +#ifdef PADDLE_WITH_CUDA + FLAGS_fraction_of_gpu_memory_to_use = 0.01; + FLAGS_gpu_allocator_retry_time = 500; + FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5; +#endif + + FLAGS_allocator_strategy = "auto_growth"; + + auto &instance = AllocatorFacade::Instance(); + size_t size = 1024; + platform::Place place; + + { + place = platform::CPUPlace(); + size = 1024; + auto cpu_allocation = instance.Alloc(place, size); + ASSERT_NE(cpu_allocation, nullptr); + ASSERT_NE(cpu_allocation->ptr(), nullptr); + ASSERT_EQ(cpu_allocation->place(), place); + ASSERT_EQ(cpu_allocation->size(), AlignedSize(size, 1024)); + } + +#ifdef PADDLE_WITH_CUDA + { + place = platform::CUDAPlace(0); + size = 1024; + auto gpu_allocation = instance.Alloc(place, size); + ASSERT_NE(gpu_allocation, nullptr); + ASSERT_NE(gpu_allocation->ptr(), nullptr); + ASSERT_EQ(gpu_allocation->place(), place); + ASSERT_GE(gpu_allocation->size(), + AlignedSize(size, platform::GpuMinChunkSize())); + } + + { + // Allocate 2GB gpu memory + place = platform::CUDAPlace(0); + size = 2 * static_cast(1 << 30); + auto gpu_allocation = instance.Alloc(place, size); + ASSERT_NE(gpu_allocation, nullptr); + ASSERT_NE(gpu_allocation->ptr(), nullptr); + ASSERT_EQ(gpu_allocation->place(), place); + ASSERT_GE(gpu_allocation->size(), + AlignedSize(size, platform::GpuMinChunkSize())); + } + + { + place = platform::CUDAPinnedPlace(); + size = (1 << 20); + auto cuda_pinned_allocation = + instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20); + ASSERT_NE(cuda_pinned_allocation, nullptr); + ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr); + ASSERT_EQ(cuda_pinned_allocation->place(), place); + ASSERT_GE(cuda_pinned_allocation->size(), AlignedSize(size, 1 << 20)); + } +#endif +} + +TEST(multithread_allocate, test_segfault) { + FLAGS_allocator_strategy = "auto_growth"; +#ifdef PADDLE_WITH_CUDA + std::mutex mtx; + std::condition_variable cv; + bool flag = false; + + auto alloc_func = [&](int dev_id, unsigned int seed) { + auto &instance = AllocatorFacade::Instance(); + + std::mt19937 gen(seed); + std::uniform_int_distribution dist(1 << 20, 1 << 25); + + { + std::unique_lock lock(mtx); + cv.wait(lock, [&] { return flag; }); + } + + for (int i = 0; i < 50; i++) { + size_t size = dist(gen); + for (int j = 0; j < 10; j++) { + instance.Alloc(platform::CUDAPlace(dev_id), size); + } + } + }; + + std::vector ths; + for (size_t i = 0; i < 50; ++i) { + std::random_device rd; + ths.emplace_back(alloc_func, 0, rd()); + } + + { + std::lock_guard guard(mtx); + flag = true; + } + cv.notify_all(); + + for (auto &th : ths) { + th.join(); + } +#endif +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.cc b/paddle/fluid/memory/allocation/best_fit_allocator.cc new file mode 100644 index 00000000..72ee4e54 --- /dev/null +++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc @@ -0,0 +1,170 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/best_fit_allocator.h" +#include +#include +#include +#include + +namespace paddle { +namespace memory { +namespace allocation { + +static int HighestBitPos(size_t N) { + if (UNLIKELY(N == 0)) { + return 0; + } else { +#ifdef __GNUCC__ + return sizeof(unsigned int) * 8 - __builtin_clz(N); +#else + return static_cast(std::log2(N) + 1); +#endif + } +} + +BestFitAllocator::BestFitAllocator(Allocation* allocation) + : allocation_(allocation) { + details::Chunk chunk; + chunk.size_ = allocation_->size(); + chunk.offset_ = 0; + chunk.is_free = true; + chunks_.emplace_back(chunk); + free_chunks_[HighestBitPos(chunk.size_)].insert( + {chunk.size_, chunks_.begin()}); +} + +size_t BestFitAllocator::FreeSize() const { + size_t acc = 0; + for (auto& array_item : free_chunks_) { + for (auto& pair : array_item) { + acc += pair.second->size_; + } + } + return acc; +} + +BestFitAllocator::ListIt BestFitAllocator::SplitChunk(size_t request_size, + size_t free_chunk_offset, + MapIt bin_iterator) { + auto to_split_it = bin_iterator->second; + free_chunks_[free_chunk_offset].erase(bin_iterator); + + PADDLE_ENFORCE(to_split_it->is_free); + PADDLE_ENFORCE_GE(to_split_it->size_, request_size); + + auto remaining_size = to_split_it->size_ - request_size; + details::Chunk to_use; + details::Chunk remaining; + to_use.size_ = request_size; + to_use.is_free = false; + remaining.size_ = remaining_size; + remaining.is_free = true; + + // calc offsets + to_use.offset_ = to_split_it->offset_; + remaining.offset_ = to_use.offset_ + to_use.size_; + + // insert to chunk list + auto to_use_it = chunks_.insert(to_split_it, to_use); + if (remaining.size_ != 0) { + auto bit_size = static_cast(HighestBitPos(remaining.size_)); + free_chunks_[bit_size].insert( + {remaining.size_, chunks_.insert(to_split_it, remaining)}); + } + chunks_.erase(to_split_it); + return to_use_it; +} + +void BestFitAllocator::InsertFreeNode(const ListIt& it) { + auto pos = static_cast(HighestBitPos(it->size_)); + auto& free_map = free_chunks_[pos]; + free_map.insert({it->size_, it}); +} +void BestFitAllocator::EraseFreeNode(const ListIt& it) { + size_t pos = static_cast(HighestBitPos(it->size_)); + auto& free_map = free_chunks_[pos]; + auto map_it = free_map.find(it->size_); + while (map_it->second != it && map_it != free_map.end()) { + ++map_it; + } + PADDLE_ENFORCE(map_it != free_map.end()); + free_map.erase(map_it); +} +size_t BestFitAllocator::NumFreeChunks() const { + size_t num = 0; + for (auto& array_item : free_chunks_) { + num += array_item.size(); + } + return num; +} +void BestFitAllocator::FreeImpl(Allocation* allocation) { + auto* bf_allocation = dynamic_cast(allocation); + PADDLE_ENFORCE_NOT_NULL(bf_allocation, + "The input allocation is not BestFitAllocation."); + auto chunk_it = bf_allocation->ChunkIterator(); + PADDLE_ENFORCE(!chunk_it->is_free); + chunk_it->is_free = true; + if (chunk_it != chunks_.begin()) { + auto prev_it = chunk_it; + --prev_it; + + if (prev_it->is_free) { + // Merge Left. + EraseFreeNode(prev_it); + prev_it->size_ += chunk_it->size_; + chunks_.erase(chunk_it); + chunk_it = prev_it; + } + } + + auto next_it = chunk_it; + ++next_it; + if (next_it != chunks_.end() && next_it->is_free) { + EraseFreeNode(next_it); + chunk_it->size_ += next_it->size_; + chunks_.erase(next_it); + } + + InsertFreeNode(chunk_it); + delete allocation; +} +Allocation* BestFitAllocator::AllocateImpl(size_t size) { + auto highest_set_bit = static_cast(HighestBitPos(size)); + MapIt map_it; + for (; highest_set_bit < free_chunks_.size(); ++highest_set_bit) { + map_it = free_chunks_[highest_set_bit].lower_bound(size); + if (map_it != free_chunks_[highest_set_bit].end()) { + break; + } + } + if (UNLIKELY(highest_set_bit == free_chunks_.size())) { + throw BadAlloc(string::Sprintf( + "Cannot allocate %d, All fragments size is %d", size, FreeSize())); + } + auto chunk_it = SplitChunk(size, highest_set_bit, map_it); + return new BestFitAllocation(this, chunk_it); +} + +BestFitAllocation::BestFitAllocation( + paddle::memory::allocation::BestFitAllocator* allocator, + typename details::ChunkList::iterator chunk_it) + : Allocation(reinterpret_cast( + reinterpret_cast(allocator->BasePtr()) + + chunk_it->offset_), + chunk_it->size_, allocator->Place()), + chunk_it_(chunk_it) {} +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.h b/paddle/fluid/memory/allocation/best_fit_allocator.h new file mode 100644 index 00000000..64a552e4 --- /dev/null +++ b/paddle/fluid/memory/allocation/best_fit_allocator.h @@ -0,0 +1,132 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "paddle/fluid/memory/allocation/allocator.h" + +namespace paddle { +namespace memory { +namespace allocation { +namespace details { +struct Chunk { + bool is_free{true}; + // Offset to the base allocation. + uintptr_t offset_; + size_t size_; +}; + +// Here we use std::list to maintain chunk list. +// NOTE(yy): The traditional implementation of ChunkList is add `prev`/`next` +// pointers in `Chunk`, and split the allocation as `ChunkHeader` and +// `Payload`. Such as +// *-------*---------------*---------------*--------------* +// | Chunk | prev_ pointer | next_ pointer | payload .... | +// *-------*---------------*---------------*--------------* +// This implementation can just return a raw pointer, and we can get the list +// structure by the raw pointer. However, we cannot use the same code on GPU +// since CPU cannot access GPU memory directly. +// +// So we choose to use `std::list` and return an allocation instance, which +// contains the list node iterator, then we can unify CPU/GPU code. +// +// To return an allocation is not a bad idea, since Tensor/Vector should holds +// an allocation instead of raw pointer directly. +using ChunkList = std::list; + +// Here we use a multi-level map of free chunks. +// the map is +// MSB offset --> size --> [ChunkList::iterator] +// +// The time complexities: +// find a free chunk: +// O(logN), +// where N is the number of free nodes with the same MSB offset. +// find the position of a chunk iterator: +// O(logN + K), +// where N is the number of free nodes with the same MSB offset. +// where K is the number of free nodes with the same size. +// insert a free chunk: +// O(logN), +// where N is the number of free nodes with the same MSB offset. +// erase a free chunk: +// O(1) +using FreeChunkBin = + std::array, sizeof(size_t) * 8>; +} // namespace details + +class BestFitAllocator; + +// The BestFitAllocation maintain the List Node iterator. +class BestFitAllocation : public Allocation { + private: + using ListIt = typename details::ChunkList::iterator; + + public: + BestFitAllocation(BestFitAllocator* allocator, ListIt chunk_it); + + const ListIt& ChunkIterator() const { return chunk_it_; } + + private: + typename details::ChunkList::iterator chunk_it_; +}; + +// TODO(yy): Current BestFitAllocator is not thread-safe. To make it thread +// safe, we must wrap a locked_allocator. However, we can implement a thread +// safe allocator by locking each bin and chunks list independently. It will +// make BestFitAllocator faster in multi-thread situation. +// +// This allocator implements a best-fit allocator with merging the free nodes. +// +// To allocate a buffer, it will find the best-fit chunk. If the best-fit chunk +// is larger than request size, the original block will be split into two +// chunks. The first block will be used and the second block will be put into +// free chunks. +// +// To free an allocation, it will set the chunk of allocation to free and merge +// the prev-chunk and the next-chunk when possible. +class BestFitAllocator : public Allocator { + public: + explicit BestFitAllocator(Allocation* allocation); + + void* BasePtr() const { return allocation_->ptr(); } + + const platform::Place& Place() const { return allocation_->place(); } + + size_t NumFreeChunks() const; + + private: + size_t FreeSize() const; + using MapIt = typename details::FreeChunkBin::value_type::iterator; + using ListIt = typename details::ChunkList::iterator; + + ListIt SplitChunk(size_t request_size, size_t free_chunk_offset, + MapIt bin_iterator); + void EraseFreeNode(const ListIt& it); + void InsertFreeNode(const ListIt& it); + + protected: + void FreeImpl(Allocation* allocation) override; + Allocation* AllocateImpl(size_t size) override; + + private: + Allocation* allocation_; // not owned + details::ChunkList chunks_; + details::FreeChunkBin free_chunks_; +}; +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/best_fit_allocator_test.cc new file mode 100644 index 00000000..7e5207e6 --- /dev/null +++ b/paddle/fluid/memory/allocation/best_fit_allocator_test.cc @@ -0,0 +1,139 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/best_fit_allocator.h" +#include +#include +#include // NOLINT +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/memory/allocation/cpu_allocator.h" +#include "paddle/fluid/memory/allocation/locked_allocator.h" + +namespace paddle { +namespace memory { +namespace allocation { + +class StubAllocation : public Allocation { + public: + explicit StubAllocation(size_t size) + : Allocation(0, size, platform::CPUPlace()) {} +}; + +TEST(BestFitAllocator, test_allocation) { + StubAllocation stub(4UL * 1024 * 1024 * 1024); + BestFitAllocator allocator(&stub); + { auto allocation = allocator.Allocate(64); } + + { + auto allocation = allocator.Allocate(80); + + { + auto best_fit_allocation = + dynamic_cast(allocation.get()); + ASSERT_NE(best_fit_allocation, nullptr); + ASSERT_FALSE(best_fit_allocation->ChunkIterator()->is_free); + ASSERT_EQ(best_fit_allocation->ChunkIterator()->offset_, 0); + ASSERT_EQ(allocation->size(), 80); + ASSERT_EQ(allocation->ptr(), nullptr); + } + + auto allocation2 = allocator.Allocate(60); + auto allocation3 = allocator.Allocate(90); + allocation2.reset(); + allocation2 = allocator.Allocate(30); + + { + auto best_fit_allocation = + dynamic_cast(allocation2.get()); + ASSERT_EQ(best_fit_allocation->ChunkIterator()->offset_, 80); + } + allocation2.reset(); + allocation2 = allocator.Allocate(60); + + { + auto best_fit_allocation = + dynamic_cast(allocation2.get()); + ASSERT_EQ(best_fit_allocation->ChunkIterator()->offset_, 80); + } + + allocation.reset(); + allocation2.reset(); + + allocation = allocator.Allocate(80 + 60); + { + auto best_fit_allocation = + dynamic_cast(allocation.get()); + ASSERT_EQ(best_fit_allocation->ChunkIterator()->offset_, 0); + } + + allocation.reset(); + + allocation = allocator.Allocate(80); + allocation2 = allocator.Allocate(60); + allocation = nullptr; + allocation2 = nullptr; + allocation3 = nullptr; + + ASSERT_EQ(allocator.NumFreeChunks(), 1U); + } +} + +TEST(BestFitAllocator, test_concurrent_cpu_allocation) { + CPUAllocator allocator; + auto global_allocation = allocator.Allocate(256UL * 1024 * 1024); + + std::unique_ptr best_fit_allocator( + new BestFitAllocator(global_allocation.get())); + + LockedAllocator locked_allocator(std::move(best_fit_allocator)); + + auto th_main = [&](std::random_device::result_type seed) { + std::default_random_engine engine(seed); + std::uniform_int_distribution dist(1U, 1024U); + + for (size_t i = 0; i < 128; ++i) { + size_t allocate_size = dist(engine); + + auto allocation = + locked_allocator.Allocate(sizeof(size_t) * allocate_size); + + size_t* data = reinterpret_cast(allocation->ptr()); + + for (size_t j = 0; j < allocate_size; ++j) { + data[j] = j; + } + std::this_thread::yield(); + + for (size_t j = 0; j < allocate_size; ++j) { + ASSERT_EQ(data[j], j); + } + } + }; + { + std::vector threads; + for (size_t i = 0; i < 1024; ++i) { + std::random_device dev; + threads.emplace_back(th_main, dev()); + } + for (auto& th : threads) { + th.join(); + } + } +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/best_fit_allocator_test.cu b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu new file mode 100644 index 00000000..eb24ba84 --- /dev/null +++ b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu @@ -0,0 +1,88 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include // NOLINT +#include +#include "gtest/gtest.h" +#include "paddle/fluid/memory/allocation/best_fit_allocator.h" +#include "paddle/fluid/memory/allocation/cuda_allocator.h" +#include "paddle/fluid/memory/allocation/locked_allocator.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/for_range.h" +namespace paddle { +namespace memory { +namespace allocation { + +struct ForEachFill { + size_t* ptr_; + + explicit ForEachFill(size_t* ptr) : ptr_(ptr) {} + + __device__ void operator()(size_t i) { ptr_[i] = i; } +}; + +TEST(BestFitAllocator, concurrent_cuda) { + CUDAAllocator allocator(platform::CUDAPlace(0)); + // 256 MB + auto cuda_allocation = allocator.Allocate(256U * 1024 * 1024); + LockedAllocator concurrent_allocator( + std::unique_ptr(new BestFitAllocator(cuda_allocation.get()))); + + auto th_main = [&](std::random_device::result_type seed) { + std::default_random_engine engine(seed); + std::uniform_int_distribution dist(1U, 1024U); + platform::CUDAPlace gpu(0); + platform::CUDADeviceContext dev_ctx(gpu); + std::array buf; + for (size_t i = 0; i < 128; ++i) { + size_t allocate_size = dist(engine); + + auto allocation = + concurrent_allocator.Allocate(sizeof(size_t) * allocate_size); + + size_t* data = reinterpret_cast(allocation->ptr()); + + ForEachFill fill(data); + platform::ForRange for_range(dev_ctx, + allocate_size); + for_range(fill); + + memory::Copy(platform::CPUPlace(), buf.data(), gpu, data, + sizeof(size_t) * allocate_size, dev_ctx.stream()); + + dev_ctx.Wait(); + for (size_t j = 0; j < allocate_size; ++j) { + ASSERT_EQ(buf[j], j); + } + allocation = nullptr; + } + }; + + { + std::vector threads; + for (size_t i = 0; i < 1024; ++i) { + std::random_device dev; + threads.emplace_back(th_main, dev()); + } + for (auto& th : threads) { + th.join(); + } + } +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/buffered_allocator.cc b/paddle/fluid/memory/allocation/buffered_allocator.cc new file mode 100644 index 00000000..d80616b7 --- /dev/null +++ b/paddle/fluid/memory/allocation/buffered_allocator.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/buffered_allocator.h" +#include +#include +#include + +namespace paddle { +namespace memory { +namespace allocation { + +BufferedAllocator::BufferedAllocator(std::shared_ptr allocator) + : underlying_allocator_(std::move(allocator)) { + PADDLE_ENFORCE_NOT_NULL( + underlying_allocator_, + "Underlying allocator of BufferedAllocator must not be null"); + if (underlying_allocator_->IsAllocThreadSafe()) { + mtx_.reset(new std::mutex()); + } +} + +BufferedAllocator::~BufferedAllocator() { FreeCache(-1UL); } + +void BufferedAllocator::FreeCache(size_t size) { + platform::LockGuardPtr guard(mtx_); + if (UNLIKELY(size == 0)) return; + size_t cur = 0; + while (!allocations_.empty()) { // free the largest + auto it = --allocations_.end(); + cur += it->second->size(); + underlying_allocator_->Free(it->second.release()); + allocations_.erase(it); + if (cur >= size) return; + } +} + +bool BufferedAllocator::IsAllocThreadSafe() const { return mtx_ != nullptr; } + +void BufferedAllocator::FreeImpl(Allocation *allocation) { + platform::LockGuardPtr guard(mtx_); + allocations_.emplace(allocation->size(), AllocationPtr(allocation)); +} + +Allocation *BufferedAllocator::AllocateImpl(size_t size) { + { + platform::LockGuardPtr guard(mtx_); + auto it = allocations_.lower_bound(size); + if (it != allocations_.end() && it->first < size * 2) { + AllocationPtr result(std::move(it->second)); + allocations_.erase(it); + return result.release(); + } + } + + try { + return underlying_allocator_->Allocate(size).release(); + } catch (BadAlloc &) { + FreeCache(size); + return underlying_allocator_->Allocate(size).release(); + } +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/buffered_allocator.h b/paddle/fluid/memory/allocation/buffered_allocator.h new file mode 100644 index 00000000..fd0996f7 --- /dev/null +++ b/paddle/fluid/memory/allocation/buffered_allocator.h @@ -0,0 +1,58 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/platform/lock_guard_ptr.h" + +namespace paddle { +namespace memory { +namespace allocation { + +// NOTE(zjl): BufferedAllocator maintains a memory pool to accelerate +// memory allocation and reuse memory. +// BufferedAllocator provides the same thread-safety level as +// underlying_allocator_ +class BufferedAllocator : public Allocator { + public: + explicit BufferedAllocator(std::shared_ptr allocator); + + ~BufferedAllocator(); + + bool IsAllocThreadSafe() const override; + + // only used in unittest + inline void ClearCache() { FreeCache(-1UL); } + + private: + void FreeCache(size_t size); + + protected: + void FreeImpl(Allocation *allocation) override; + Allocation *AllocateImpl(size_t size) override; + + private: + std::shared_ptr underlying_allocator_; + std::multimap allocations_; + std::unique_ptr mtx_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/buffered_allocator_test.cc b/paddle/fluid/memory/allocation/buffered_allocator_test.cc new file mode 100644 index 00000000..e4825233 --- /dev/null +++ b/paddle/fluid/memory/allocation/buffered_allocator_test.cc @@ -0,0 +1,145 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/buffered_allocator.h" +#include +#include +#include "paddle/fluid/memory/allocation/best_fit_allocator.h" +#include "paddle/fluid/memory/allocation/cpu_allocator.h" +#include "paddle/fluid/memory/allocation/locked_allocator.h" + +namespace paddle { +namespace memory { +namespace allocation { + +inline std::unique_ptr GetBufferedAllocator( + Allocation *allocation, bool thread_safe) { + std::unique_ptr allocator(new BestFitAllocator(allocation)); + if (thread_safe) { + allocator.reset(new LockedAllocator(std::move(allocator))); + } + + return std::unique_ptr( + new BufferedAllocator(std::move(allocator))); +} + +TEST(buffered_allocator, thread_safety) { + std::unique_ptr allocator(new CPUAllocator()); + auto chunk = allocator->Allocate(1 << 20); + { + auto buf_allocator = GetBufferedAllocator(chunk.get(), true); + ASSERT_EQ(buf_allocator->IsAllocThreadSafe(), true); + } + + { + auto buf_allocator = GetBufferedAllocator(chunk.get(), false); + ASSERT_EQ(buf_allocator->IsAllocThreadSafe(), false); + } +} + +class StubAllocation : public Allocation { + public: + using Allocation::Allocation; +}; + +class StubAllocator : public Allocator { + public: + void ResetCounter() { + construct_count_ = 0; + destruct_count_ = 0; + } + + size_t GetAllocCount() const { return construct_count_; } + + size_t GetFreeCount() const { return destruct_count_; } + + protected: + void FreeImpl(Allocation *allocation) override { + auto *alloc = dynamic_cast(allocation); + PADDLE_ENFORCE_NOT_NULL(alloc); + if (alloc->ptr()) delete[] static_cast(alloc->ptr()); + ++destruct_count_; + delete allocation; + } + Allocation *AllocateImpl(size_t size) override { + ++construct_count_; + if (size == 0) { + return new StubAllocation(nullptr, 0, platform::CPUPlace()); + } else { + return new StubAllocation(new uint8_t[size], size, platform::CPUPlace()); + } + } + + private: + size_t construct_count_ = 0; + size_t destruct_count_ = 0; +}; + +constexpr size_t kZero = 0; +constexpr size_t kOne = 1; +constexpr size_t kTwo = 2; + +TEST(buffered_allocator, lazy_free) { + std::unique_ptr stub_allocator(new StubAllocator()); + auto *underlying_allocator = stub_allocator.get(); + std::unique_ptr allocator( + new BufferedAllocator(std::move(stub_allocator))); + + { + underlying_allocator->ResetCounter(); + auto x = allocator->Allocate(1025); + ASSERT_EQ(underlying_allocator->GetAllocCount(), kOne); + ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero); + x = nullptr; + ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero); + } + + { + underlying_allocator->ResetCounter(); + auto x = allocator->Allocate(900); + ASSERT_EQ(underlying_allocator->GetAllocCount(), kZero); + ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero); + auto y = allocator->Allocate(2048); + ASSERT_EQ(underlying_allocator->GetAllocCount(), kOne); + ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero); + x = nullptr; + ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero); + y = nullptr; + ASSERT_EQ(underlying_allocator->GetFreeCount(), kZero); + } + + { + underlying_allocator->ResetCounter(); + allocator->ClearCache(); + ASSERT_EQ(underlying_allocator->GetAllocCount(), kZero); + ASSERT_EQ(underlying_allocator->GetFreeCount(), kTwo); + } +} + +TEST(buffered_allocator, garbage_collection) { + std::unique_ptr cpu_allocator(new CPUAllocator()); + auto chunk = cpu_allocator->Allocate(2048); + auto allocator = GetBufferedAllocator(chunk.get(), false); + auto x1 = allocator->Allocate(1600); + auto x2 = allocator->Allocate(400); + x1 = nullptr; + x2 = nullptr; + auto x3 = allocator->Allocate(1600); + ASSERT_NE(x3, nullptr); + ASSERT_NE(x3->ptr(), nullptr); +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/cpu_allocator.cc b/paddle/fluid/memory/allocation/cpu_allocator.cc new file mode 100644 index 00000000..580cf1af --- /dev/null +++ b/paddle/fluid/memory/allocation/cpu_allocator.cc @@ -0,0 +1,47 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/cpu_allocator.h" +#include +#include + +namespace paddle { +namespace memory { +namespace allocation { + +bool CPUAllocator::IsAllocThreadSafe() const { return true; } + +void CPUAllocator::FreeImpl(Allocation *allocation) { + void *p = allocation->ptr(); +#ifdef _WIN32 + _aligned_free(p); +#else + free(p); +#endif + delete allocation; +} + +Allocation *CPUAllocator::AllocateImpl(size_t size) { + void *p; +#ifdef _WIN32 + p = _aligned_malloc(size, kAlignment); +#else + PADDLE_ENFORCE_EQ(posix_memalign(&p, kAlignment, size), 0, "Alloc %ld error!", + size); +#endif + return new Allocation(p, size, platform::CPUPlace()); +} +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/cpu_allocator.h b/paddle/fluid/memory/allocation/cpu_allocator.h new file mode 100644 index 00000000..058ff633 --- /dev/null +++ b/paddle/fluid/memory/allocation/cpu_allocator.h @@ -0,0 +1,45 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/memory/allocation/allocator.h" + +#ifdef _WIN32 +#define posix_memalign_free _aligned_free +#define posix_memalign(p, a, s) \ + (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno) +#endif + +namespace paddle { +namespace memory { +namespace allocation { +// CPU system allocator and allocation. +// +// NOTE(yy): Should we just use `malloc` here since there is an +// aligned_allocator. +// +// NOTE(yy): It is no need to use `BestFitAllocator` in CPU. We can import +// an open-sourced allocator into Paddle. +class CPUAllocator : public Allocator { + public: + constexpr static size_t kAlignment = 4096UL; + bool IsAllocThreadSafe() const override; + + protected: + void FreeImpl(Allocation* allocation) override; + Allocation* AllocateImpl(size_t size) override; +}; +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc new file mode 100644 index 00000000..349c71ce --- /dev/null +++ b/paddle/fluid/memory/allocation/cuda_allocator.cc @@ -0,0 +1,48 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/cuda_allocator.h" +#include +#include +#include +#include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace memory { +namespace allocation { +bool CUDAAllocator::IsAllocThreadSafe() const { return true; } +void CUDAAllocator::FreeImpl(Allocation* allocation) { + platform::CUDADeviceGuard guard(place_.device); + PADDLE_ENFORCE_EQ(boost::get(allocation->place()), + place_); + PADDLE_ENFORCE(cudaFree(allocation->ptr())); + delete allocation; +} + +Allocation* CUDAAllocator::AllocateImpl(size_t size) { + platform::CUDADeviceGuard guard(place_.device); + void* ptr; + auto status = cudaMalloc(&ptr, size); + if (UNLIKELY(status != cudaSuccess)) { + throw BadAlloc(string::Sprintf( + "Cannot allocate %d on GPU %d, cuda status %d, %s", size, place_.device, + status, cudaGetErrorString(status))); + } + return new Allocation(ptr, size, platform::Place(place_)); +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/cuda_allocator.h b/paddle/fluid/memory/allocation/cuda_allocator.h new file mode 100644 index 00000000..886f6e7a --- /dev/null +++ b/paddle/fluid/memory/allocation/cuda_allocator.h @@ -0,0 +1,40 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace memory { +namespace allocation { + +class CUDAAllocator : public Allocator { + public: + explicit CUDAAllocator(const platform::CUDAPlace& place) : place_(place) {} + explicit CUDAAllocator(const platform::Place& place) + : place_(boost::get(place)) {} + bool IsAllocThreadSafe() const override; + + protected: + void FreeImpl(Allocation* allocation) override; + Allocation* AllocateImpl(size_t size) override; + + private: + platform::CUDAPlace place_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/locked_allocator.cc b/paddle/fluid/memory/allocation/locked_allocator.cc new file mode 100644 index 00000000..a9128076 --- /dev/null +++ b/paddle/fluid/memory/allocation/locked_allocator.cc @@ -0,0 +1,47 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/locked_allocator.h" +#include // NOLINT +#include +#include "paddle/fluid/platform/lock_guard_ptr.h" + +namespace paddle { +namespace memory { +namespace allocation { + +bool LockedAllocator::IsAllocThreadSafe() const { return true; } + +LockedAllocator::LockedAllocator( + std::shared_ptr underlying_allocator) + : underlying_allocator_(std::move(underlying_allocator)) { + PADDLE_ENFORCE_NOT_NULL(underlying_allocator_); + if (!underlying_allocator_->IsAllocThreadSafe()) { + mtx_.reset(new std::mutex()); + } +} + +void LockedAllocator::FreeImpl(Allocation *allocation) { + platform::LockGuardPtr guard(mtx_); + underlying_allocator_->Free(allocation); +} + +Allocation *LockedAllocator::AllocateImpl(size_t size) { + platform::LockGuardPtr guard(mtx_); + return underlying_allocator_->Allocate(size).release(); +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/locked_allocator.h b/paddle/fluid/memory/allocation/locked_allocator.h new file mode 100644 index 00000000..4af77e6e --- /dev/null +++ b/paddle/fluid/memory/allocation/locked_allocator.h @@ -0,0 +1,41 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once +#include +#include // NOLINT +#include // NOLINT +#include "paddle/fluid/memory/allocation/allocator.h" + +namespace paddle { +namespace memory { +namespace allocation { + +// A allocator to make underlying allocator thread safe. +class LockedAllocator : public Allocator { + public: + explicit LockedAllocator(std::shared_ptr underlying_allocator); + bool IsAllocThreadSafe() const override; + + protected: + void FreeImpl(Allocation *allocation) override; + Allocation *AllocateImpl(size_t size) override; + + private: + std::shared_ptr underlying_allocator_; + std::unique_ptr mtx_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc new file mode 100644 index 00000000..2e4e7162 --- /dev/null +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -0,0 +1,312 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "glog/logging.h" +#include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h" +#include "paddle/fluid/memory/detail/buddy_allocator.h" +#include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/fluid/string/split.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cuda_device_guard.h" +#endif + +DEFINE_bool(init_allocated_mem, false, + "It is a mistake that the values of the memory allocated by " + "BuddyAllocator are always zeroed in some op's implementation. " + "To find this error in time, we use init_allocated_mem to indicate " + "that initializing the allocated memory with a small value " + "during unit testing."); +DECLARE_double(fraction_of_gpu_memory_to_use); +DECLARE_uint64(initial_gpu_memory_in_mb); +DECLARE_uint64(reallocate_gpu_memory_in_mb); +DECLARE_bool(benchmark); + +namespace paddle { +namespace memory { +namespace legacy { +template +void *Alloc(const Place &place, size_t size); + +template +void Free(const Place &place, void *p, size_t size); + +template +size_t Used(const Place &place); + +struct Usage : public boost::static_visitor { + size_t operator()(const platform::CPUPlace &cpu) const; + size_t operator()(const platform::CUDAPlace &gpu) const; + size_t operator()(const platform::CUDAPinnedPlace &cuda_pinned) const; +}; + +size_t memory_usage(const platform::Place &p); + +using BuddyAllocator = detail::BuddyAllocator; + +BuddyAllocator *GetCPUBuddyAllocator() { + // We tried thread_local for inference::RNN1 model, but that not works much + // for multi-thread test. + static std::once_flag init_flag; + static detail::BuddyAllocator *a = nullptr; + + std::call_once(init_flag, []() { + a = new detail::BuddyAllocator( + std::unique_ptr(new detail::CPUAllocator), + platform::CpuMinChunkSize(), platform::CpuMaxChunkSize()); + }); + + return a; +} + +template <> +void *Alloc(const platform::CPUPlace &place, size_t size) { + VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); + void *p = GetCPUBuddyAllocator()->Alloc(size); + if (FLAGS_init_allocated_mem) { + memset(p, 0xEF, size); + } + VLOG(10) << " pointer=" << p; + return p; +} + +template <> +void Free(const platform::CPUPlace &place, void *p, + size_t size) { + VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); + GetCPUBuddyAllocator()->Free(p); +} + +template <> +size_t Used(const platform::CPUPlace &place) { + return GetCPUBuddyAllocator()->Used(); +} + +#ifdef PADDLE_WITH_CUDA +BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) { + static std::once_flag init_flag; + static detail::BuddyAllocator **a_arr = nullptr; + static std::vector devices; + + std::call_once(init_flag, [gpu_id]() { + devices = platform::GetSelectedDevices(); + int gpu_num = devices.size(); + a_arr = new BuddyAllocator *[gpu_num]; + + for (size_t i = 0; i < devices.size(); ++i) { + int dev_id = devices[i]; + a_arr[i] = nullptr; + platform::SetDeviceId(dev_id); + a_arr[i] = new BuddyAllocator(std::unique_ptr( + new detail::GPUAllocator(dev_id)), + platform::GpuMinChunkSize(), + platform::GpuMaxChunkSize()); + + VLOG(10) << "\n\nNOTE:\n" + << "You can set GFlags environment variable " + << "'FLAGS_fraction_of_gpu_memory_to_use' " + << "or 'FLAGS_initial_gpu_memory_in_mb' " + << "or 'FLAGS_reallocate_gpu_memory_in_mb' " + << "to change the memory size for GPU usage.\n" + << "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is " + << FLAGS_fraction_of_gpu_memory_to_use + << ". Current 'FLAGS_initial_gpu_memory_in_mb' value is " + << FLAGS_initial_gpu_memory_in_mb + << ". Current 'FLAGS_reallocate_gpu_memory_in_mb' value is " + << FLAGS_reallocate_gpu_memory_in_mb << "\n\n"; + } + platform::SetDeviceId(gpu_id); + }); + + auto pos = std::distance(devices.begin(), + std::find(devices.begin(), devices.end(), gpu_id)); + return a_arr[pos]; +} +#endif + +template <> +size_t Used(const platform::CUDAPlace &place) { +#ifdef PADDLE_WITH_CUDA + return GetGPUBuddyAllocator(place.device)->Used(); +#else + PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); +#endif +} + +template <> +void *Alloc(const platform::CUDAPlace &place, + size_t size) { +#ifdef PADDLE_WITH_CUDA + auto *buddy_allocator = GetGPUBuddyAllocator(place.device); + auto *ptr = buddy_allocator->Alloc(size); + if (ptr == nullptr) { + platform::CUDADeviceGuard(place.device); + size_t avail, total; + platform::GpuMemoryUsage(&avail, &total); + LOG(FATAL) << "Cannot allocate " << string::HumanReadableSize(size) + << " in GPU " << place.device << ", available " + << string::HumanReadableSize(avail) << ", total " + << string::HumanReadableSize(total) << ", GpuMinChunkSize " + << string::HumanReadableSize(buddy_allocator->GetMinChunkSize()) + << ", GpuMaxChunkSize " + << string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()) + << ", GPU memory used: " + << string::HumanReadableSize(Used(place)); + } else { + if (FLAGS_init_allocated_mem) { + cudaMemset(ptr, 0xEF, size); + } + } + return ptr; +#else + PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); +#endif +} + +template <> +void Free(const platform::CUDAPlace &place, void *p, + size_t size) { +#ifdef PADDLE_WITH_CUDA + GetGPUBuddyAllocator(place.device)->Free(p); +#else + PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); +#endif +} + +#ifdef PADDLE_WITH_CUDA +BuddyAllocator *GetCUDAPinnedBuddyAllocator() { + static std::once_flag init_flag; + static BuddyAllocator *ba = nullptr; + + std::call_once(init_flag, []() { + ba = new BuddyAllocator(std::unique_ptr( + new detail::CUDAPinnedAllocator), + platform::CUDAPinnedMinChunkSize(), + platform::CUDAPinnedMaxChunkSize()); + }); + + return ba; +} +#endif + +template <> +size_t Used(const platform::CUDAPinnedPlace &place) { +#ifdef PADDLE_WITH_CUDA + return GetCUDAPinnedBuddyAllocator()->Used(); +#else + PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device."); +#endif +} + +template <> +void *Alloc(const platform::CUDAPinnedPlace &place, + size_t size) { +#ifdef PADDLE_WITH_CUDA + auto *buddy_allocator = GetCUDAPinnedBuddyAllocator(); + void *ptr = buddy_allocator->Alloc(size); + + if (ptr == nullptr) { + LOG(WARNING) << "cudaHostAlloc Cannot allocate " << size + << " bytes in CUDAPinnedPlace"; + } + if (FLAGS_init_allocated_mem) { + memset(ptr, 0xEF, size); + } + return ptr; +#else + PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device."); +#endif +} + +template <> +void Free(const platform::CUDAPinnedPlace &place, + void *p, size_t size) { +#ifdef PADDLE_WITH_CUDA + GetCUDAPinnedBuddyAllocator()->Free(p); +#else + PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device."); +#endif +} + +struct AllocVisitor : public boost::static_visitor { + inline explicit AllocVisitor(size_t size) : size_(size) {} + + template + inline void *operator()(const Place &place) const { + return Alloc(place, size_); + } + + private: + size_t size_; +}; + +struct FreeVisitor : public boost::static_visitor { + inline explicit FreeVisitor(void *ptr, size_t size) + : ptr_(ptr), size_(size) {} + + template + inline void operator()(const Place &place) const { + Free(place, ptr_, size_); + } + + private: + void *ptr_; + size_t size_; +}; + +size_t Usage::operator()(const platform::CPUPlace &cpu) const { + return Used(cpu); +} + +size_t Usage::operator()(const platform::CUDAPlace &gpu) const { +#ifdef PADDLE_WITH_CUDA + return Used(gpu); +#else + PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); +#endif +} + +size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const { +#ifdef PADDLE_WITH_CUDA + return Used(cuda_pinned); +#else + PADDLE_THROW("'CUDAPinnedPlace' is not supported in CPU only device."); +#endif +} +} // namespace legacy + +namespace allocation { + +Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) { + void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_); + return new Allocation(ptr, size, place_); +} + +void NaiveBestFitAllocator::FreeImpl(Allocation *allocation) { + boost::apply_visitor( + legacy::FreeVisitor(allocation->ptr(), allocation->size()), + allocation->place()); + delete allocation; +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h new file mode 100644 index 00000000..5b376e6c --- /dev/null +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h @@ -0,0 +1,41 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include // NOLINT +#include +#include +#include +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/platform/place.h" +namespace paddle { +namespace memory { +namespace allocation { + +class NaiveBestFitAllocator : public Allocator { + public: + explicit NaiveBestFitAllocator(const platform::Place &p) : place_(p) {} + + protected: + Allocation *AllocateImpl(size_t size) override; + void FreeImpl(Allocation *allocation) override; + + private: + platform::Place place_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc new file mode 100644 index 00000000..35391167 --- /dev/null +++ b/paddle/fluid/memory/allocation/pinned_allocator.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/pinned_allocator.h" +#include +#include + +namespace paddle { +namespace memory { +namespace allocation { +bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; } +void CPUPinnedAllocator::FreeImpl(Allocation *allocation) { + PADDLE_ENFORCE(cudaFreeHost(allocation->ptr())); + delete allocation; +} +Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { + void *ptr; + PADDLE_ENFORCE(cudaHostAlloc(&ptr, size, cudaHostAllocPortable)); + return new Allocation(ptr, size, platform::CUDAPinnedPlace()); +} +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/pinned_allocator.h b/paddle/fluid/memory/allocation/pinned_allocator.h new file mode 100644 index 00000000..4f535ef3 --- /dev/null +++ b/paddle/fluid/memory/allocation/pinned_allocator.h @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/memory/allocation/allocator.h" + +namespace paddle { +namespace memory { +namespace allocation { + +// Allocator uses `cudaHostAlloc` +class CPUPinnedAllocator : public Allocator { + public: + bool IsAllocThreadSafe() const override; + + protected: + void FreeImpl(Allocation *allocation) override; + Allocation *AllocateImpl(size_t size) override; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/retry_allocator.cc b/paddle/fluid/memory/allocation/retry_allocator.cc new file mode 100644 index 00000000..bf14ed5d --- /dev/null +++ b/paddle/fluid/memory/allocation/retry_allocator.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/retry_allocator.h" +namespace paddle { +namespace memory { +namespace allocation { + +void RetryAllocator::FreeImpl(Allocation* allocation) { + // Delete underlying allocation first. + underlying_allocator_->Free(allocation); + cv_.notify_all(); +} + +Allocation* RetryAllocator::AllocateImpl(size_t size) { + auto alloc_func = [&, this]() { + return underlying_allocator_->Allocate(size).release(); + }; + // In fact, we can unify the code of allocation success and failure + // But it would add lock even when allocation success at the first time + try { + return alloc_func(); + } catch (BadAlloc& bad_alloc) { + { + // We can just write allocation retry inside the predicate function of + // wait_until + // But it needs to acquire the lock when executing predicate function + // For better performance, we use loop here + auto end_time = std::chrono::high_resolution_clock::now() + retry_time_; + auto wait_until = [&, this] { + std::unique_lock lock(mutex_); + return cv_.wait_until(lock, end_time); + }; + while (wait_until() != std::cv_status::timeout) { + try { + return alloc_func(); + } catch (BadAlloc& ex) { + bad_alloc = ex; + } catch (...) { + throw; + } + } + + throw; // rethrow the original exception or throw the internal bad_alloc + } + } catch (...) { + throw; + } +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/retry_allocator.h b/paddle/fluid/memory/allocation/retry_allocator.h new file mode 100644 index 00000000..7840a834 --- /dev/null +++ b/paddle/fluid/memory/allocation/retry_allocator.h @@ -0,0 +1,58 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include // NOLINT +#include // NOLINT +#include +#include // NOLINT +#include +#include "paddle/fluid/memory/allocation/allocator.h" + +namespace paddle { +namespace memory { +namespace allocation { + +class RetryAllocator : public Allocator { + public: + RetryAllocator(std::shared_ptr allocator, size_t retry_ms) + : underlying_allocator_(std::move(allocator)), retry_time_(retry_ms) { + PADDLE_ENFORCE_NOT_NULL( + underlying_allocator_, + "UnderlyingAllocator of RetryAllocator must not be null"); + PADDLE_ENFORCE(underlying_allocator_->IsAllocThreadSafe(), + "UnderlyingAllocator of RetryAllocator must be thread-safe"); + } + + bool IsAllocThreadSafe() const override { return true; } + + protected: + void FreeImpl(Allocation* allocation) override; + Allocation* AllocateImpl(size_t size) override; + + private: + std::shared_ptr underlying_allocator_; + std::chrono::milliseconds retry_time_; + std::mutex mutex_; + std::condition_variable cv_; + + // For debug, We can add an atomic integer to record how many memory sizes are + // waited to allocate + // std::atomic waited_allocate_size_{0}; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/retry_allocator_test.cc b/paddle/fluid/memory/allocation/retry_allocator_test.cc new file mode 100644 index 00000000..4ac08d44 --- /dev/null +++ b/paddle/fluid/memory/allocation/retry_allocator_test.cc @@ -0,0 +1,98 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/retry_allocator.h" +#include +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include +#include "gtest/gtest.h" +#include "paddle/fluid/memory/allocation/best_fit_allocator.h" +#include "paddle/fluid/memory/allocation/cpu_allocator.h" +#include "paddle/fluid/memory/allocation/locked_allocator.h" + +namespace paddle { +namespace memory { +namespace allocation { + +TEST(RetryAllocator, RetryAllocator) { + CPUAllocator cpu_allocator; + + size_t size = (1 << 20); + auto cpu_allocation = cpu_allocator.Allocate(size); + + std::unique_ptr best_fit_allocator( + new BestFitAllocator(cpu_allocation.get())); + std::unique_ptr locked_allocator( + new LockedAllocator(std::move(best_fit_allocator))); + + size_t thread_num = 8; + size_t sleep_time = 40; + size_t extra_time = 10; + + // Reserve to perform more tests in the future + std::vector> allocators; + { + std::unique_ptr best_fit_allocator( + new BestFitAllocator(cpu_allocation.get())); + std::unique_ptr locked_allocator( + new LockedAllocator(std::move(best_fit_allocator))); + allocators.push_back(std::make_shared( + std::move(locked_allocator), + (thread_num - 1) * (sleep_time + extra_time))); + } + + for (auto &allocator : allocators) { + std::vector threads(thread_num); + std::vector addresses(threads.size(), nullptr); + + std::mutex mutex; + std::condition_variable cv; + bool flag = false; + + for (size_t i = 0; i < threads.size(); ++i) { + threads[i] = std::thread([&, i]() { + { + std::unique_lock lock(mutex); + cv.wait(lock, [&] { return flag; }); + } + + auto ret = allocator->Allocate(size - 1); + addresses[i] = ret->ptr(); + std::this_thread::sleep_for(std::chrono::milliseconds(sleep_time)); + }); + } + + { + std::lock_guard lock(mutex); + flag = true; + cv.notify_all(); + } + + for (auto &th : threads) { + th.join(); + } + + void *val = cpu_allocation->ptr(); + bool is_all_equal = std::all_of(addresses.begin(), addresses.end(), + [val](void *p) { return p == val; }); + ASSERT_TRUE(is_all_equal); + } +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/test_aligned_allocator.cc b/paddle/fluid/memory/allocation/test_aligned_allocator.cc new file mode 100644 index 00000000..41936ab3 --- /dev/null +++ b/paddle/fluid/memory/allocation/test_aligned_allocator.cc @@ -0,0 +1,81 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "gtest/gtest.h" +#include "paddle/fluid/memory/allocation/aligned_allocator.h" + +namespace paddle { +namespace memory { +namespace allocation { + +TEST(aligned, aligned_size) { + ASSERT_EQ(AlignedSize(1024, 1024), 1024); + ASSERT_EQ(AlignedSize(1023, 1024), 1024); + ASSERT_EQ(AlignedSize(1025, 1024), 2048); +} + +struct StubAllocator : public Allocator { + public: + StubAllocator() = default; + + size_t AllocNum() const { return alloc_num_; } + + protected: + Allocation *AllocateImpl(size_t size) override { + ++alloc_num_; + return new Allocation(new uint8_t[size], size, platform::CPUPlace()); + } + + void FreeImpl(Allocation *allocation) override { + delete[] static_cast(allocation->ptr()); + delete allocation; + --alloc_num_; + } + + private: + size_t alloc_num_{0}; +}; + +bool IsAligned(const AllocationPtr &alloc, size_t alignment) { + return reinterpret_cast(alloc->ptr()) % alignment == 0; +} + +TEST(aligned_allocator, aligned_allocator) { + size_t alignment = 1024; + auto allocator = std::make_shared(); + auto aligned_allocator = + std::make_shared(allocator, alignment); + + auto alloc1 = aligned_allocator->Allocate(1345); + ASSERT_EQ(allocator->AllocNum(), 1); + ASSERT_TRUE(IsAligned(alloc1, alignment)); + alloc1.reset(); + ASSERT_EQ(allocator->AllocNum(), 0); + + { + auto alloc2 = aligned_allocator->Allocate(200); + ASSERT_TRUE(IsAligned(alloc2, alignment)); + ASSERT_EQ(allocator->AllocNum(), 1); + + auto alloc3 = aligned_allocator->Allocate(3021); + ASSERT_TRUE(IsAligned(alloc3, alignment)); + ASSERT_EQ(allocator->AllocNum(), 2); + } + + ASSERT_EQ(allocator->AllocNum(), 0); +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/detail/CMakeLists.txt b/paddle/fluid/memory/detail/CMakeLists.txt new file mode 100644 index 00000000..a555b6b2 --- /dev/null +++ b/paddle/fluid/memory/detail/CMakeLists.txt @@ -0,0 +1,13 @@ +cc_library(memory_block SRCS memory_block.cc memory_block_desc.cc meta_cache.cc) + +if(${WITH_GPU}) + nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info) +else(${WITH_GPU}) + cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info) +endif(${WITH_GPU}) + +cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator) + +cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS memory_block system_allocator glog) + +cc_test(buddy_allocator_test SRCS buddy_allocator_test.cc DEPS buddy_allocator) diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc new file mode 100644 index 00000000..edd6ea4a --- /dev/null +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -0,0 +1,352 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/memory/detail/buddy_allocator.h" + +#include +#include + +#include "glog/logging.h" + +DEFINE_bool(free_idle_memory, false, + "If it is true, Paddle will try to free idle memory trunks during " + "running time."); + +namespace paddle { +namespace memory { +namespace detail { + +BuddyAllocator::BuddyAllocator( + std::unique_ptr system_allocator, size_t min_chunk_size, + size_t max_chunk_size) + : min_chunk_size_(min_chunk_size), + max_chunk_size_(max_chunk_size), + cache_(system_allocator->UseGpu()), + system_allocator_(std::move(system_allocator)) {} + +BuddyAllocator::~BuddyAllocator() { + VLOG(10) << "BuddyAllocator Disconstructor makes sure that all of these " + "have actually been freed"; + while (!pool_.empty()) { + auto block = static_cast(std::get<2>(*pool_.begin())); + VLOG(10) << "Free from block (" << block << ", " << block->size(cache_) + << ")"; + + system_allocator_->Free(block, block->size(cache_), block->index(cache_)); + cache_.invalidate(block); + pool_.erase(pool_.begin()); + } +} + +inline size_t align(size_t size, size_t alignment) { + size_t remaining = size % alignment; + return remaining == 0 ? size : size + (alignment - remaining); +} + +void* BuddyAllocator::Alloc(size_t unaligned_size) { + // adjust allocation alignment + size_t size = + align(unaligned_size + sizeof(MemoryBlock::Desc), min_chunk_size_); + + // acquire the allocator lock + std::lock_guard lock(mutex_); + + VLOG(10) << "Allocate " << unaligned_size << " bytes from chunk size " + << size; + + // if the allocation is huge, send directly to the system allocator + if (size > max_chunk_size_) { + VLOG(10) << "Allocate from system allocator."; + return SystemAlloc(size); + } + + // query and allocate from the existing chunk + auto it = FindExistChunk(size); + + // refill the pool if failure + if (it == pool_.end()) { + it = RefillPool(size); + // if still failure, fail fatally + if (it == pool_.end()) { + return nullptr; + } + } else { + VLOG(10) << "Allocation from existing memory block " << std::get<2>(*it) + << " at address " + << reinterpret_cast(std::get<2>(*it))->data(); + } + + total_used_ += size; + total_free_ -= size; + + // split the allocation and return data for use + return reinterpret_cast(SplitToAlloc(it, size))->data(); +} + +void BuddyAllocator::Free(void* p) { + // Point back to metadata + auto block = static_cast(p)->metadata(); + + // Acquire the allocator lock + std::lock_guard lock(mutex_); + + VLOG(10) << "Free from address " << block; + + if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) { + VLOG(10) << "Free directly from system allocator"; + system_allocator_->Free(block, block->total_size(cache_), + block->index(cache_)); + + // Invalidate GPU allocation from cache + cache_.invalidate(block); + + return; + } + + block->mark_as_free(&cache_); + + total_used_ -= block->total_size(cache_); + total_free_ += block->total_size(cache_); + + // Trying to merge the right buddy + if (block->has_right_buddy(cache_)) { + VLOG(10) << "Merging this block " << block << " with its right buddy " + << block->right_buddy(cache_); + + auto right_buddy = block->right_buddy(cache_); + + if (right_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) { + // Take away right buddy from pool + pool_.erase(IndexSizeAddress(right_buddy->index(cache_), + right_buddy->total_size(cache_), + right_buddy)); + + // merge its right buddy to the block + block->merge(&cache_, right_buddy); + } + } + + // Trying to merge the left buddy + if (block->has_left_buddy(cache_)) { + VLOG(10) << "Merging this block " << block << " with its left buddy " + << block->left_buddy(cache_); + + auto left_buddy = block->left_buddy(cache_); + + if (left_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) { + // Take away right buddy from pool + pool_.erase(IndexSizeAddress(left_buddy->index(cache_), + left_buddy->total_size(cache_), left_buddy)); + + // merge the block to its left buddy + left_buddy->merge(&cache_, block); + block = left_buddy; + } + } + + // Dumping this block into pool + VLOG(10) << "Inserting free block (" << block << ", " + << block->total_size(cache_) << ")"; + pool_.insert( + IndexSizeAddress(block->index(cache_), block->total_size(cache_), block)); + + if (FLAGS_free_idle_memory) { + // Clean up if existing too much free memory + // Prefer freeing fallback allocation first + CleanIdleFallBackAlloc(); + + // Free normal allocation + CleanIdleNormalAlloc(); + } +} + +size_t BuddyAllocator::Used() { return total_used_; } +size_t BuddyAllocator::GetMinChunkSize() { return min_chunk_size_; } +size_t BuddyAllocator::GetMaxChunkSize() { return max_chunk_size_; } + +void* BuddyAllocator::SystemAlloc(size_t size) { + size_t index = 0; + void* p = system_allocator_->Alloc(&index, size); + + VLOG(10) << "Allocated " << p << " from system allocator."; + + if (p == nullptr) return nullptr; + + static_cast(p)->init(&cache_, MemoryBlock::HUGE_CHUNK, index, + size, nullptr, nullptr); + + return static_cast(p)->data(); +} + +BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( + size_t request_bytes) { + size_t allocate_bytes = max_chunk_size_; + size_t index = 0; + +#ifdef PADDLE_WITH_CUDA + if (system_allocator_->UseGpu()) { + if ((total_used_ + total_free_) == 0) { + // Compute the allocation size for gpu for the first allocation. + allocate_bytes = std::max(platform::GpuInitAllocSize(), request_bytes); + } else { + // Reallocation size + if (realloc_size_ == 0) { + realloc_size_ = platform::GpuReallocSize(); + } + allocate_bytes = std::max(realloc_size_, request_bytes); + } + } +#endif + + // Allocate a new block + void* p = system_allocator_->Alloc(&index, allocate_bytes); + + if (p == nullptr) return pool_.end(); + + VLOG(10) << "Creating and inserting new block " << p + << " from system allocator"; + + static_cast(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index, + allocate_bytes, nullptr, nullptr); + + // gpu fallback allocation + if (system_allocator_->UseGpu() && + static_cast(p)->index(cache_) == 1) { + fallback_alloc_count_++; + } + + total_free_ += allocate_bytes; + + // dump the block into pool + return pool_.insert(IndexSizeAddress(index, allocate_bytes, p)).first; +} + +BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) { + size_t index = 0; + + while (1) { + auto it = pool_.lower_bound(IndexSizeAddress(index, size, nullptr)); + + // no match chunk memory + if (it == pool_.end()) return it; + + if (std::get<0>(*it) > index) { + // find suitable one + if (std::get<1>(*it) >= size) { + return it; + } + // update and continue + index = std::get<0>(*it); + continue; + } + return it; + } +} + +void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it, + size_t size) { + auto block = static_cast(std::get<2>(*it)); + pool_.erase(it); + + VLOG(10) << "Split block (" << block << ", " << block->total_size(cache_) + << ") into"; + block->split(&cache_, size); + + VLOG(10) << "Left block (" << block << ", " << block->total_size(cache_) + << ")"; + block->set_type(&cache_, MemoryBlock::ARENA_CHUNK); + + // the rest of memory if exist + if (block->has_right_buddy(cache_)) { + if (block->right_buddy(cache_)->type(cache_) == MemoryBlock::FREE_CHUNK) { + VLOG(10) << "Insert right block (" << block->right_buddy(cache_) << ", " + << block->right_buddy(cache_)->total_size(cache_) << ")"; + + pool_.insert( + IndexSizeAddress(block->right_buddy(cache_)->index(cache_), + block->right_buddy(cache_)->total_size(cache_), + block->right_buddy(cache_))); + } + } + + return block; +} + +void BuddyAllocator::CleanIdleFallBackAlloc() { + // If fallback allocation does not exist, return directly + if (!fallback_alloc_count_) return; + + for (auto pool = pool_.rbegin(); pool != pool_.rend();) { + // If free memory block less than max_chunk_size_, return directly + if (std::get<1>(*pool) < max_chunk_size_) return; + + MemoryBlock* block = static_cast(std::get<2>(*pool)); + + // If no GPU fallback allocator, return + if (!system_allocator_->UseGpu() || block->index(cache_) == 0) { + return; + } + + VLOG(10) << "Return block " << block << " to fallback allocator."; + + system_allocator_->Free(block, block->size(cache_), block->index(cache_)); + cache_.invalidate(block); + + pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base())); + + total_free_ -= block->size(cache_); + fallback_alloc_count_--; + + // If no fall allocation exists, return directly + if (!fallback_alloc_count_) return; + } +} + +void BuddyAllocator::CleanIdleNormalAlloc() { + auto shall_free_alloc = [&]() -> bool { + // free all fallback allocations + if (fallback_alloc_count_ > 0) { + return true; + } + // keep 2x overhead if we haven't fallen back + if ((total_used_ + max_chunk_size_) * 2 < total_free_) { + return true; + } + return false; + }; + + if (!shall_free_alloc()) return; + + for (auto pool = pool_.rbegin(); pool != pool_.rend();) { + // If free memory block less than max_chunk_size_, return directly + if (std::get<1>(*pool) < max_chunk_size_) return; + + MemoryBlock* block = static_cast(std::get<2>(*pool)); + + VLOG(10) << "Return block " << block << " to base allocator."; + + system_allocator_->Free(block, block->size(cache_), block->index(cache_)); + cache_.invalidate(block); + + pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base())); + + total_free_ -= block->size(cache_); + + if (!shall_free_alloc()) return; + } +} + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h new file mode 100644 index 00000000..bdc8cca4 --- /dev/null +++ b/paddle/fluid/memory/detail/buddy_allocator.h @@ -0,0 +1,117 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include // NOLINT +#include +#include +#include +#include + +#include "paddle/fluid/memory/detail/memory_block.h" +#include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace memory { +namespace detail { + +class BuddyAllocator { + public: + BuddyAllocator(std::unique_ptr system_allocator, + size_t min_chunk_size, size_t max_chunk_size); + + ~BuddyAllocator(); + + public: + void* Alloc(size_t unaligned_size); + void Free(void* ptr); + size_t Used(); + size_t GetMinChunkSize(); + size_t GetMaxChunkSize(); + + public: + // Disable copy and assignment + BuddyAllocator(const BuddyAllocator&) = delete; + BuddyAllocator& operator=(const BuddyAllocator&) = delete; + + private: + // Tuple (allocator index, memory size, memory address) + using IndexSizeAddress = std::tuple; + // Each element in PoolSet is a free allocation + using PoolSet = std::set; + + /*! \brief Allocate fixed-size memory from system */ + void* SystemAlloc(size_t size); + + /*! \brief If existing chunks are not suitable, refill pool */ + PoolSet::iterator RefillPool(size_t request_bytes); + + /** + * \brief Find the suitable chunk from existing pool and split + * it to left and right buddies + * + * \param it the iterator of pool list + * \param size the size of allocation + * + * \return the left buddy address + */ + void* SplitToAlloc(PoolSet::iterator it, size_t size); + + /*! \brief Find the existing chunk which used to allocation */ + PoolSet::iterator FindExistChunk(size_t size); + + /*! \brief Clean idle fallback allocation */ + void CleanIdleFallBackAlloc(); + + /*! \brief Clean idle normal allocation */ + void CleanIdleNormalAlloc(); + + private: + size_t total_used_ = 0; // the total size of used memory + size_t total_free_ = 0; // the total size of free memory + + size_t min_chunk_size_; // the minimum size of each chunk + size_t max_chunk_size_; // the maximum size of each chunk + + size_t realloc_size_ = 0; // the size of re-allocated chunk + + private: + /** + * \brief A list of free allocation + * + * \note Only store free chunk memory in pool + */ + PoolSet pool_; + + /*! Record fallback allocation count for auto-scaling */ + size_t fallback_alloc_count_ = 0; + + private: + /*! Unify the metadata format between GPU and CPU allocations */ + MetadataCache cache_; + + private: + /*! Allocate CPU/GPU memory from system */ + std::unique_ptr system_allocator_; + std::mutex mutex_; +}; + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/detail/buddy_allocator_test.cc b/paddle/fluid/memory/detail/buddy_allocator_test.cc new file mode 100644 index 00000000..1edc9f20 --- /dev/null +++ b/paddle/fluid/memory/detail/buddy_allocator_test.cc @@ -0,0 +1,133 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/memory/detail/buddy_allocator.h" + +#include + +#include "gflags/gflags.h" +#include "gtest/gtest.h" +#include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/platform/gpu_info.h" + +#ifdef PADDLE_WITH_CUDA +DECLARE_double(fraction_of_gpu_memory_to_use); +DECLARE_uint64(initial_gpu_memory_in_mb); +DECLARE_uint64(reallocate_gpu_memory_in_mb); +#endif + +namespace paddle { +namespace memory { +namespace detail { + +constexpr static int test_gpu_id = 0; + +void TestBuddyAllocator(BuddyAllocator* allocator, size_t size_bytes) { + bool freed = false; + size_t used_bytes = allocator->Used(); + + if (size_bytes > 0) { + void* p = allocator->Alloc(size_bytes); + + EXPECT_NE(p, nullptr); +#ifdef PADDLE_WITH_CUDA + if (size_bytes < platform::GpuMaxChunkSize()) { +#else + if (size_bytes < platform::CpuMaxChunkSize()) { +#endif + // Not allocate from SystemAllocator + EXPECT_GE(allocator->Used(), used_bytes + size_bytes); + } else { + // Allocate from SystemAllocator doesn't count in Used() + EXPECT_EQ(allocator->Used(), used_bytes); + } + + int* intp = static_cast(p); + std::shared_ptr ptr(intp, [&](void* p) { + allocator->Free(intp); + freed = true; + }); + } else { + freed = true; + } + + EXPECT_EQ(used_bytes, allocator->Used()); + EXPECT_TRUE(freed); +} + +#ifdef PADDLE_WITH_CUDA +TEST(BuddyAllocator, GpuFraction) { + FLAGS_fraction_of_gpu_memory_to_use = 0.01; + + BuddyAllocator buddy_allocator( + std::unique_ptr(new GPUAllocator(test_gpu_id)), + platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()); + + TestBuddyAllocator(&buddy_allocator, 10); + TestBuddyAllocator(&buddy_allocator, 10 << 10); + TestBuddyAllocator(&buddy_allocator, 10 << 20); + TestBuddyAllocator(&buddy_allocator, 2 * static_cast(1 << 30)); +} + +TEST(BuddyAllocator, InitRealloc) { + FLAGS_initial_gpu_memory_in_mb = 100; + FLAGS_reallocate_gpu_memory_in_mb = 50; + + EXPECT_EQ(platform::GpuMaxChunkSize(), static_cast(100 << 20)); + + BuddyAllocator buddy_allocator( + std::unique_ptr(new GPUAllocator(test_gpu_id)), + platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()); + + // Less then initial size and reallocate size + TestBuddyAllocator(&buddy_allocator, 10 << 20); + // Between initial size and reallocate size and not exceed pool + TestBuddyAllocator(&buddy_allocator, 80 << 20); + // Less then reallocate size and exceed pool + TestBuddyAllocator(&buddy_allocator, 40 << 20); + // Greater then reallocate size and exceed pool + TestBuddyAllocator(&buddy_allocator, 80 << 20); + // Greater then initial size and reallocate size + TestBuddyAllocator(&buddy_allocator, 2 * static_cast(1 << 30)); +} + +TEST(BuddyAllocator, ReallocSizeGreaterThanInit) { + FLAGS_initial_gpu_memory_in_mb = 5; + FLAGS_reallocate_gpu_memory_in_mb = 10; + + EXPECT_EQ(platform::GpuMaxChunkSize(), static_cast(10 << 20)); + + BuddyAllocator buddy_allocator( + std::unique_ptr(new GPUAllocator(test_gpu_id)), + platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()); + + // Less then initial size and reallocate size + TestBuddyAllocator(&buddy_allocator, 1 << 20); + // Between initial size and reallocate size and not exceed pool + TestBuddyAllocator(&buddy_allocator, 3 << 20); + // Less then initial size and exceed pool + TestBuddyAllocator(&buddy_allocator, 3 << 20); + // Less then reallocate size and not exceed pool (now pool is 15 MB, used 7 + // MB) + TestBuddyAllocator(&buddy_allocator, 7 << 20); + // Less then reallocate size and exceed pool + TestBuddyAllocator(&buddy_allocator, 8 << 20); + // Greater then initial size and reallocate size + TestBuddyAllocator(&buddy_allocator, 2 * static_cast(1 << 30)); +} +#endif + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/detail/memory_block.cc b/paddle/fluid/memory/detail/memory_block.cc new file mode 100644 index 00000000..f34b922b --- /dev/null +++ b/paddle/fluid/memory/detail/memory_block.cc @@ -0,0 +1,156 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/memory/detail/memory_block.h" +#include "paddle/fluid/platform/assert.h" + +namespace paddle { +namespace memory { +namespace detail { + +void MemoryBlock::init(MetadataCache* cache, Type t, size_t index, size_t size, + void* left_buddy, void* right_buddy) { + cache->save( + this, MemoryBlock::Desc(t, index, size - sizeof(MemoryBlock::Desc), size, + static_cast(left_buddy), + static_cast(right_buddy))); +} + +MemoryBlock::Type MemoryBlock::type(const MetadataCache& cache) const { + return cache.load(this).type; +} + +size_t MemoryBlock::size(const MetadataCache& cache) const { + return cache.load(this).size; +} + +size_t MemoryBlock::index(const MetadataCache& cache) const { + return cache.load(this).index; +} + +size_t MemoryBlock::total_size(const MetadataCache& cache) const { + return cache.load(this).total_size; +} + +bool MemoryBlock::has_left_buddy(const MetadataCache& cache) const { + return left_buddy(cache) != nullptr; +} + +bool MemoryBlock::has_right_buddy(const MetadataCache& cache) const { + return right_buddy(cache) != nullptr; +} + +MemoryBlock* MemoryBlock::left_buddy(const MetadataCache& cache) const { + return cache.load(this).left_buddy; +} + +MemoryBlock* MemoryBlock::right_buddy(const MetadataCache& cache) const { + return cache.load(this).right_buddy; +} + +void MemoryBlock::split(MetadataCache* cache, size_t size) { + // make sure the split fits + PADDLE_ASSERT(total_size(*cache) >= size); + + // bail out if there is no room for another partition + if (total_size(*cache) - size <= sizeof(MemoryBlock::Desc)) { + return; + } + + // find the position of the split + void* right_partition = reinterpret_cast(this) + size; + + size_t remaining_size = total_size(*cache) - size; + + // Add the new block as a buddy + auto metadata = cache->load(this); + + // Write the metadata for the new block + auto new_block_right_buddy = metadata.right_buddy; + + cache->save(static_cast(right_partition), + MemoryBlock::Desc(FREE_CHUNK, index(*cache), + remaining_size - sizeof(MemoryBlock::Desc), + remaining_size, this, new_block_right_buddy)); + + metadata.right_buddy = static_cast(right_partition); + metadata.size = size - sizeof(MemoryBlock::Desc); + metadata.total_size = size; + + cache->save(this, metadata); + + // Write metadata for the new block's right buddy + if (new_block_right_buddy != nullptr) { + auto buddy_metadata = cache->load(new_block_right_buddy); + + buddy_metadata.left_buddy = static_cast(right_partition); + + cache->save(new_block_right_buddy, buddy_metadata); + } +} + +void MemoryBlock::merge(MetadataCache* cache, MemoryBlock* right_buddy) { + // only free blocks can be merged + PADDLE_ASSERT(type(*cache) == FREE_CHUNK); + PADDLE_ASSERT(right_buddy->type(*cache) == FREE_CHUNK); + + auto metadata = cache->load(this); + + // link this->buddy's buddy + metadata.right_buddy = right_buddy->right_buddy(*cache); + + // link buddy's buddy -> this + if (metadata.right_buddy != nullptr) { + auto buddy_metadata = cache->load(metadata.right_buddy); + + buddy_metadata.left_buddy = this; + + cache->save(metadata.right_buddy, buddy_metadata); + } + + metadata.size += right_buddy->total_size(*cache); + metadata.total_size += right_buddy->total_size(*cache); + + cache->save(this, metadata); + cache->save(right_buddy, + MemoryBlock::Desc(INVALID_CHUNK, 0, 0, 0, nullptr, nullptr)); +} + +void MemoryBlock::mark_as_free(MetadataCache* cache) { + // check for double free or corruption + PADDLE_ASSERT(type(*cache) != FREE_CHUNK); + PADDLE_ASSERT(type(*cache) != INVALID_CHUNK); + set_type(cache, FREE_CHUNK); +} + +void MemoryBlock::set_type(MetadataCache* cache, Type t) { + auto metadata = cache->load(this); + metadata.type = t; + cache->save(this, metadata); +} + +void* MemoryBlock::data() const { + return const_cast( + reinterpret_cast(this)) + + 1; +} + +MemoryBlock* MemoryBlock::metadata() const { + return const_cast(reinterpret_cast( + reinterpret_cast(this) - 1)); +} + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/detail/memory_block.h b/paddle/fluid/memory/detail/memory_block.h new file mode 100644 index 00000000..5cceba65 --- /dev/null +++ b/paddle/fluid/memory/detail/memory_block.h @@ -0,0 +1,130 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include +#include + +namespace paddle { +namespace memory { +namespace detail { + +// Forward declaration. +class MetadataCache; + +// MemoryBlock represents Each allocated memory block, which contains +// MemoryBlock::Desc and the payload. +struct MemoryBlock { + enum Type { + FREE_CHUNK, // memory is free and idle + ARENA_CHUNK, // memory is being occupied + HUGE_CHUNK, // memory is out of management + INVALID_CHUNK // memory is invalid + }; + + // init saves the MemoryBlock::Desc of the memory block in a MetadataCache. + // If it is a CPU memory block, the MetadataCache writes the + // MemoryBlock::Desc to the beginning of the block; or, if it is a GPU memory + // block, the MetadataCache writes the Meatadata to a std::map in + // the CPU. + void init(MetadataCache* cache, Type t, size_t index, size_t size, + void* left_buddy, void* right_buddy); + + // All these accessors returns fields in the MemoryBlock::Desc of the memory + // block. They all need a MetadataCache instance as their first + // parameter because they read the MemoryBlock::Desc from the cache. + Type type(const MetadataCache& cache) const; + size_t size(const MetadataCache& cache) const; + size_t index(const MetadataCache& cache) const; + size_t total_size(const MetadataCache& cache) const; + bool has_left_buddy(const MetadataCache& cache) const; + bool has_right_buddy(const MetadataCache& cache) const; + MemoryBlock* left_buddy(const MetadataCache& cache) const; + MemoryBlock* right_buddy(const MetadataCache& cache) const; + + // Split the allocation into left/right blocks. + void split(MetadataCache* cache, size_t size); + + // Merge left and right blocks together. + void merge(MetadataCache* cache, MemoryBlock* right_buddy); + + // Mark the allocation as free. + void mark_as_free(MetadataCache* cache); + + // Change the type of the allocation. + void set_type(MetadataCache* cache, Type t); + + void* data() const; + MemoryBlock* metadata() const; + + // MemoryBlock::Desc describes a MemoryBlock. + struct Desc { + Desc(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock* l, + MemoryBlock* r); + Desc(); + + // Updates guard_begin and guard_end by hashes of the Metadata object. + void update_guards(); + + // Checks that guard_begin and guard_end are hashes of the Metadata object. + bool check_guards() const; + + // TODO(gangliao): compress this + size_t guard_begin = 0; + MemoryBlock::Type type = MemoryBlock::INVALID_CHUNK; + size_t index = 0; + size_t size = 0; + size_t total_size = 0; + MemoryBlock* left_buddy = nullptr; + MemoryBlock* right_buddy = nullptr; + size_t guard_end = 0; + }; +}; + +// A cache for accessing memory block meta-data that may be expensive +// to access directly. This class exists to unify the +// MemoryBlock::Desc format between GPU and CPU allocations. It should +// be removed when the CPU can access all GPU allocations directly via +// UVM. +class MetadataCache { + public: + explicit MetadataCache(bool uses_gpu); + + // Disable copying and assignment. + MetadataCache(const MetadataCache&) = delete; + MetadataCache& operator=(const MetadataCache&) = delete; + + // Returns the MemoryBlock::Desc for a memory block. When MetadataCache is + // used to manage CPU memory, the MemoryBlock::Desc resides at the beginning + // of the memory block; when used to manage GPU memory, the + // Meatadata resides in CPU memory indexed by cache_. + MemoryBlock::Desc load(const MemoryBlock* memory_block) const; + + // Saves the MemoryBlock::Desc of a memory block into the cache. For CPU + // memory block, writes the MemoryBlock::Desc to the beginning of the memory + // block; whereas for GPU memory, writes it to cache_. + void save(MemoryBlock* memory_block, const MemoryBlock::Desc& meta_data); + + // For GPU memory block, erases its MemoryBlock::Desc from cache_. + void invalidate(MemoryBlock* memory_block); + + private: + typedef std::unordered_map MetadataMap; + MetadataMap cache_; + bool uses_gpu_; +}; + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/detail/memory_block_desc.cc b/paddle/fluid/memory/detail/memory_block_desc.cc new file mode 100644 index 00000000..393dd920 --- /dev/null +++ b/paddle/fluid/memory/detail/memory_block_desc.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/memory/detail/memory_block.h" + +namespace paddle { +namespace memory { +namespace detail { + +MemoryBlock::Desc::Desc(MemoryBlock::Type t, size_t i, size_t s, size_t ts, + MemoryBlock* l, MemoryBlock* r) + : type(t), + index(i), + size(s), + total_size(ts), + left_buddy(l), + right_buddy(r) {} + +MemoryBlock::Desc::Desc() + : type(MemoryBlock::INVALID_CHUNK), + index(0), + size(0), + total_size(0), + left_buddy(nullptr), + right_buddy(nullptr) {} + +namespace { + +template +inline void hash_combine(std::size_t* seed, const T& v) { + std::hash hasher; + (*seed) ^= hasher(v) + 0x9e3779b9 + ((*seed) << 6) + ((*seed) >> 2); +} + +inline size_t hash(const MemoryBlock::Desc& metadata, size_t initial_seed) { + size_t seed = initial_seed; + + hash_combine(&seed, static_cast(metadata.type)); + hash_combine(&seed, metadata.index); + hash_combine(&seed, metadata.size); + hash_combine(&seed, metadata.total_size); + hash_combine(&seed, metadata.left_buddy); + hash_combine(&seed, metadata.right_buddy); + + return seed; +} + +} // namespace + +void MemoryBlock::Desc::update_guards() { + guard_begin = hash(*this, 1); + guard_end = hash(*this, 2); +} + +bool MemoryBlock::Desc::check_guards() const { + return guard_begin == hash(*this, 1) && guard_end == hash(*this, 2); +} + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/detail/meta_cache.cc b/paddle/fluid/memory/detail/meta_cache.cc new file mode 100644 index 00000000..b86e4f38 --- /dev/null +++ b/paddle/fluid/memory/detail/meta_cache.cc @@ -0,0 +1,58 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "glog/logging.h" +#include "paddle/fluid/memory/detail/memory_block.h" +#include "paddle/fluid/platform/assert.h" + +namespace paddle { +namespace memory { +namespace detail { + +MetadataCache::MetadataCache(bool uses_gpu) : uses_gpu_(uses_gpu) {} + +MemoryBlock::Desc MetadataCache::load(const MemoryBlock* block) const { + if (uses_gpu_) { + auto existing_desc = cache_.find(block); + PADDLE_ASSERT(existing_desc->second.check_guards()); + return existing_desc->second; + } else { + auto* desc = reinterpret_cast(block); + VLOG(10) << "Load MemoryBlock::Desc type=" << desc->type; + PADDLE_ASSERT(desc->check_guards()); + return *reinterpret_cast(block); + } +} + +void MetadataCache::save(MemoryBlock* block, + const MemoryBlock::Desc& original_desc) { + auto desc = original_desc; + desc.update_guards(); + + if (uses_gpu_) { + cache_[block] = desc; + } else { + *reinterpret_cast(block) = desc; + } +} + +void MetadataCache::invalidate(MemoryBlock* block) { + if (uses_gpu_) { + cache_.erase(block); + } +} + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc new file mode 100644 index 00000000..b0f48c45 --- /dev/null +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -0,0 +1,219 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#define GLOG_NO_ABBREVIATED_SEVERITIES + +#include "paddle/fluid/memory/detail/system_allocator.h" + +#ifdef _WIN32 +#include +#include // VirtualLock/VirtualUnlock +#else +#include // for mlock and munlock +#endif +#include // for malloc and free +#include // for std::max + +#include "gflags/gflags.h" +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/gpu_info.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cuda_device_guard.h" +#endif + +DECLARE_bool(use_pinned_memory); +DECLARE_double(fraction_of_gpu_memory_to_use); +DECLARE_uint64(initial_gpu_memory_in_mb); +DECLARE_uint64(reallocate_gpu_memory_in_mb); + +namespace paddle { +namespace memory { +namespace detail { + +void* AlignedMalloc(size_t size) { + void* p = nullptr; + size_t alignment = 32ul; +#ifdef PADDLE_WITH_MKLDNN + // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp + // memory alignment + alignment = 4096ul; +#endif +#ifdef _WIN32 + p = _aligned_malloc(size, alignment); +#else + PADDLE_ENFORCE_EQ(posix_memalign(&p, alignment, size), 0, "Alloc %ld error!", + size); +#endif + PADDLE_ENFORCE(p, "Fail to allocate CPU memory: size = %d .", size); + return p; +} + +void* CPUAllocator::Alloc(size_t* index, size_t size) { + // According to http://www.cplusplus.com/reference/cstdlib/malloc/, + // malloc might not return nullptr if size is zero, but the returned + // pointer shall not be dereferenced -- so we make it nullptr. + if (size <= 0) return nullptr; + + *index = 0; // unlock memory + + void* p = AlignedMalloc(size); + + if (p != nullptr) { + if (FLAGS_use_pinned_memory) { + *index = 1; +#ifdef _WIN32 + VirtualLock(p, size); +#else + mlock(p, size); // lock memory +#endif + } + } + + return p; +} + +void CPUAllocator::Free(void* p, size_t size, size_t index) { + if (p != nullptr && index == 1) { +#ifdef _WIN32 + VirtualUnlock(p, size); +#else + munlock(p, size); +#endif + } +#ifdef _WIN32 + _aligned_free(p); +#else + free(p); +#endif +} + +bool CPUAllocator::UseGpu() const { return false; } + +#ifdef PADDLE_WITH_CUDA + +void* GPUAllocator::Alloc(size_t* index, size_t size) { + // CUDA documentation doesn't explain if cudaMalloc returns nullptr + // if size is 0. We just make sure it does. + if (size <= 0) return nullptr; + + paddle::platform::CUDADeviceGuard guard(gpu_id_); + + void* p; + cudaError_t result = cudaMalloc(&p, size); + + if (result == cudaSuccess) { + *index = 0; + gpu_alloc_size_ += size; + return p; + } else { + LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0 + << " MB GPU memory. Please shrink " + "FLAGS_fraction_of_gpu_memory_to_use or " + "FLAGS_initial_gpu_memory_in_mb or " + "FLAGS_reallocate_gpu_memory_in_mb" + "environment variable to a lower value. " + << "Current FLAGS_fraction_of_gpu_memory_to_use value is " + << FLAGS_fraction_of_gpu_memory_to_use + << ". Current FLAGS_initial_gpu_memory_in_mb value is " + << FLAGS_initial_gpu_memory_in_mb + << ". Current FLAGS_reallocate_gpu_memory_in_mb value is " + << FLAGS_reallocate_gpu_memory_in_mb; + return nullptr; + } +} + +void GPUAllocator::Free(void* p, size_t size, size_t index) { + cudaError_t err; + if (index == 0) { + PADDLE_ASSERT(gpu_alloc_size_ >= size); + gpu_alloc_size_ -= size; + err = cudaFree(p); + } else { + PADDLE_ASSERT(fallback_alloc_size_ >= size); + fallback_alloc_size_ -= size; + err = cudaFreeHost(p); + } + + // Purposefully allow cudaErrorCudartUnloading, because + // that is returned if you ever call cudaFree after the + // driver has already shutdown. This happens only if the + // process is terminating, in which case we don't care if + // cudaFree succeeds. + if (err != cudaErrorCudartUnloading) { + PADDLE_ENFORCE(err, "cudaFree{Host} failed in GPUAllocator::Free."); + } +} + +bool GPUAllocator::UseGpu() const { return true; } + +// PINNED memory allows direct DMA transfers by the GPU to and from system +// memory. It’s locked to a physical address. +void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { + if (size <= 0) return nullptr; + + // NOTE: here, we use CUDAPinnedMaxAllocSize as the maximum memory size + // of host pinned allocation. Allocates too much would reduce + // the amount of memory available to the underlying system for paging. + size_t usable = + paddle::platform::CUDAPinnedMaxAllocSize() - cuda_pinnd_alloc_size_; + + if (size > usable) { + LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0 + << " MB pinned memory." + << ", available " << usable / 1024.0 / 1024.0 << " MB"; + return nullptr; + } + + void* p; + // PINNED memory is visible to all CUDA contexts. + cudaError_t result = cudaHostAlloc(&p, size, cudaHostAllocPortable); + + if (result == cudaSuccess) { + *index = 1; // PINNED memory + cuda_pinnd_alloc_size_ += size; + return p; + } else { + LOG(WARNING) << "cudaHostAlloc failed."; + return nullptr; + } + + return nullptr; +} + +void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) { + cudaError_t err; + PADDLE_ASSERT(index == 1); + + PADDLE_ASSERT(cuda_pinnd_alloc_size_ >= size); + cuda_pinnd_alloc_size_ -= size; + err = cudaFreeHost(p); + + // Purposefully allow cudaErrorCudartUnloading, because + // that is returned if you ever call cudaFreeHost after the + // driver has already shutdown. This happens only if the + // process is terminating, in which case we don't care if + // cudaFreeHost succeeds. + if (err != cudaErrorCudartUnloading) { + PADDLE_ENFORCE(err, "cudaFreeHost failed in GPUPinnedAllocator::Free."); + } +} + +bool CUDAPinnedAllocator::UseGpu() const { return false; } + +#endif + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h new file mode 100644 index 00000000..a0386a2d --- /dev/null +++ b/paddle/fluid/memory/detail/system_allocator.h @@ -0,0 +1,72 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include // for size_t + +namespace paddle { +namespace memory { +namespace detail { + +/** + * \brief SystemAllocator is the parent class of CPUAllocator, + * CUDAPinnedAllocator and GPUAllocator. A BuddyAllocator + * object uses a SystemAllocator* pointing to the + * underlying system allocator. + */ +class SystemAllocator { + public: + virtual ~SystemAllocator() {} + virtual void* Alloc(size_t* index, size_t size) = 0; + virtual void Free(void* p, size_t size, size_t index) = 0; + virtual bool UseGpu() const = 0; +}; + +class CPUAllocator : public SystemAllocator { + public: + virtual void* Alloc(size_t* index, size_t size); + virtual void Free(void* p, size_t size, size_t index); + virtual bool UseGpu() const; +}; + +#ifdef PADDLE_WITH_CUDA +class GPUAllocator : public SystemAllocator { + public: + explicit GPUAllocator(int gpu_id) : gpu_id_(gpu_id) {} + + virtual void* Alloc(size_t* index, size_t size); + virtual void Free(void* p, size_t size, size_t index); + virtual bool UseGpu() const; + + private: + size_t gpu_alloc_size_ = 0; + size_t fallback_alloc_size_ = 0; + int gpu_id_; +}; + +class CUDAPinnedAllocator : public SystemAllocator { + public: + virtual void* Alloc(size_t* index, size_t size); + virtual void Free(void* p, size_t size, size_t index); + virtual bool UseGpu() const; + + private: + size_t cuda_pinnd_alloc_size_ = 0; +}; +#endif + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/detail/system_allocator_test.cc b/paddle/fluid/memory/detail/system_allocator_test.cc new file mode 100644 index 00000000..26826014 --- /dev/null +++ b/paddle/fluid/memory/detail/system_allocator_test.cc @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/memory/detail/system_allocator.h" + +#include +#include + +#include "gflags/gflags.h" +#include "gtest/gtest.h" + +DECLARE_bool(use_pinned_memory); + +void TestAllocator(paddle::memory::detail::SystemAllocator* a, size_t size) { + bool freed = false; + { + size_t index; + void* p = a->Alloc(&index, size); + if (size > 0) { + EXPECT_NE(p, nullptr); + } else { + EXPECT_EQ(p, nullptr); + } + + int* i = static_cast(p); + std::shared_ptr ptr(i, [&](void* p) { + freed = true; + a->Free(p, size, index); + }); + } + EXPECT_TRUE(freed); +} + +TEST(CPUAllocator, NoLockMem) { + FLAGS_use_pinned_memory = false; + paddle::memory::detail::CPUAllocator a; + TestAllocator(&a, 2048); + TestAllocator(&a, 0); +} + +TEST(CPUAllocator, LockMem) { + FLAGS_use_pinned_memory = true; + paddle::memory::detail::CPUAllocator a; + TestAllocator(&a, 2048); + TestAllocator(&a, 0); +} + +#ifdef PADDLE_WITH_CUDA +TEST(GPUAllocator, Alloc) { + paddle::memory::detail::GPUAllocator a(0); + TestAllocator(&a, 2048); + TestAllocator(&a, 0); +} +#endif diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc new file mode 100644 index 00000000..5884433a --- /dev/null +++ b/paddle/fluid/memory/malloc.cc @@ -0,0 +1,33 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/memory/malloc.h" +#include +#include +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/fluid/memory/allocation/allocator_strategy.h" +#include "paddle/fluid/platform/place.h" +namespace paddle { +namespace memory { +std::shared_ptr AllocShared(const platform::Place& place, + size_t size) { + return allocation::AllocatorFacade::Instance().AllocShared(place, size); +} + +AllocationPtr Alloc(const platform::Place& place, size_t size) { + return allocation::AllocatorFacade::Instance().Alloc(place, size); +} + +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h new file mode 100644 index 00000000..6731203f --- /dev/null +++ b/paddle/fluid/memory/malloc.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/platform/place.h" +namespace paddle { +namespace memory { +using allocation::Allocation; +using allocation::Allocator; +using allocation::AllocationPtr; + +extern std::shared_ptr AllocShared(const platform::Place& place, + size_t size); + +extern AllocationPtr Alloc(const platform::Place& place, size_t size); + +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc new file mode 100644 index 00000000..c08d86eb --- /dev/null +++ b/paddle/fluid/memory/memcpy.cc @@ -0,0 +1,169 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/memory/memcpy.h" + +#include // for memcpy +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace memory { + +template <> +void Copy(platform::CPUPlace, void* dst, + platform::CPUPlace, + const void* src, size_t num) { + if (UNLIKELY(num == 0)) return; + std::memcpy(dst, src, num); +} + +#ifdef PADDLE_WITH_CUDA +static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K + +// NOTE(zcd): Do not use GpuMemcpySync as much as possible. +// because GpuMemcpySync issues the copying command to the default stream, +// which will make two commands from different streams cannot run concurrently. +// Reference: +// https://devblogs.nvidia.com/gpu-pro-tip-cuda-7-streams-simplify-concurrency/ + +template <> +void Copy( + platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place, + const void* src, size_t num, cudaStream_t stream) { + if (UNLIKELY(num == 0)) return; + platform::SetDeviceId(src_place.device); + + if (stream) { + platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CPU"); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); + } else { + platform::RecordEvent record_event("GpuMemcpySync:GPU->CPU"); + platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost); + // FIXME(zjl): do we really need it? + if (num <= kMaxGpuAsyncCopyBytes) { + cudaStreamSynchronize(0); + } + } +} + +template <> +void Copy( + platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place, + const void* src, size_t num, cudaStream_t stream) { + if (UNLIKELY(num == 0)) return; + + platform::SetDeviceId(dst_place.device); + if (stream) { + platform::RecordEvent record_event("GpuMemcpyAsync:CPU->GPU"); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); + } else { + platform::RecordEvent record_event("GpuMemcpySync:CPU->GPU"); + platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice); + // FIXME(zjl): do we really need it? + if (num <= kMaxGpuAsyncCopyBytes) { + cudaStreamSynchronize(0); + } + } +} + +template <> +void Copy( + platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place, + const void* src, size_t num, cudaStream_t stream) { + if (UNLIKELY(num == 0)) return; + + if (dst_place == src_place) { + platform::SetDeviceId(src_place.device); + if (stream) { + platform::RecordEvent record_event("GpuMemcpyAsync(same_gpu):GPU->GPU"); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream); + } else { + platform::RecordEvent record_event("GpuMemcpySync(same_gpu):GPU->GPU"); + platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice); + } + } else { + if (stream) { + platform::RecordEvent record_event("GpuMemcpyPeerAsync:GPU->GPU"); + platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device, + num, stream); + } else { + platform::RecordEvent record_event("GpuMemcpyPeerSync:GPU->GPU"); + platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device, + num); + } + } +} + +template <> +void Copy( + platform::CPUPlace dst_place, void* dst, + platform::CUDAPinnedPlace src_place, const void* src, size_t num) { + if (UNLIKELY(num == 0)) return; + std::memcpy(dst, src, num); +} + +template <> +void Copy( + platform::CUDAPinnedPlace dst_place, void* dst, + platform::CPUPlace src_place, const void* src, size_t num) { + if (UNLIKELY(num == 0)) return; + std::memcpy(dst, src, num); +} + +template <> +void Copy( + platform::CUDAPinnedPlace dst_place, void* dst, + platform::CUDAPinnedPlace src_place, const void* src, size_t num) { + if (UNLIKELY(num == 0)) return; + std::memcpy(dst, src, num); +} + +template <> +void Copy( + platform::CUDAPinnedPlace dst_place, void* dst, + platform::CUDAPlace src_place, const void* src, size_t num, + cudaStream_t stream) { + if (UNLIKELY(num == 0)) return; + platform::SetDeviceId(src_place.device); + if (stream) { + platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned"); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); + } else { + platform::RecordEvent record_event("GpuMemcpySync:GPU->CUDAPinned"); + platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToHost); + } +} + +template <> +void Copy( + platform::CUDAPlace dst_place, void* dst, + platform::CUDAPinnedPlace src_place, const void* src, size_t num, + cudaStream_t stream) { + if (UNLIKELY(num == 0)) return; + + platform::SetDeviceId(dst_place.device); + if (stream) { + platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU"); + platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); + } else { + platform::RecordEvent record_event("GpuMemcpySync:CUDAPinned->GPU"); + platform::GpuMemcpySync(dst, src, num, cudaMemcpyHostToDevice); + } +} + +#endif + +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/memcpy.h b/paddle/fluid/memory/memcpy.h new file mode 100644 index 00000000..7b2b8eb0 --- /dev/null +++ b/paddle/fluid/memory/memcpy.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace memory { + +/** + * \brief Copy memory from one place to another place. + * + * \param[in] DstPlace Destination allocation place (CPU). + * \param[in] dst Destination memory address. + * \param[in] SrcPlace Source allocation place (CPU). + * \param[in] src Source memory address. + * \param[in] num memory size in bytes to copy. + * + */ +template +void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num); + +#ifdef PADDLE_WITH_CUDA + +/** + * \brief Copy memory from one place to another place. + * + * \param[in] DstPlace Destination allocation place (CPU or GPU). + * \param[in] dst Destination memory address. + * \param[in] SrcPlace Source allocation place (CPU or GPU). + * \param[in] src Source memory address. + * \param[in] num memory size in bytes to copy. + * \param[in] stream CUDA stream. + * + * \note For GPU memory copy, CUDA stream need to be specified + * for asynchronously memory copy. + * + */ +template +void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num, + cudaStream_t stream); + +#endif +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/memory.h b/paddle/fluid/memory/memory.h new file mode 100644 index 00000000..8d904e3b --- /dev/null +++ b/paddle/fluid/memory/memory.h @@ -0,0 +1,18 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/memory/memcpy.h" diff --git a/paddle/fluid/memory/pinned_memory_test.cu b/paddle/fluid/memory/pinned_memory_test.cu new file mode 100644 index 00000000..0d898f59 --- /dev/null +++ b/paddle/fluid/memory/pinned_memory_test.cu @@ -0,0 +1,146 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include +#include + +#include "paddle/fluid/memory/detail/memory_block.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/memory/memory.h" + +#include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/place.h" + +// This unit test is an example comparing the performance between using pinned +// memory and not. In general, using pinned memory will be faster. +template +__global__ void Kernel(T* output, int dim) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < dim) { + output[tid] = output[tid] * output[tid] / 100; + } +} + +template +float test_pinned_memory() { + Place cpu_place; + paddle::platform::CUDAPlace cuda_place; + + const int data_size = 4096; + const int iteration = 10; + + // create event start and end + cudaEvent_t start_e, stop_e, copying_e; + float elapsedTime = 0; + cudaEventCreate(&start_e); + cudaEventCreate(&stop_e); + cudaEventCreate(©ing_e); + + // create computation stream, data copying stream + cudaStream_t computation_stream, copying_stream; + cudaStreamCreate(&computation_stream); + cudaStreamCreate(©ing_stream); + + // create record event, pinned memory, gpu memory + std::vector record_event(iteration); + std::vector input_pinned_mem(iteration); + std::vector gpu_mem(iteration); + std::vector output_pinned_mem(iteration); + + // initial data + for (int j = 0; j < iteration; ++j) { + cudaEventCreateWithFlags(&record_event[j], cudaEventDisableTiming); + cudaEventCreate(&(record_event[j])); + input_pinned_mem[j] = static_cast( + paddle::memory::Alloc(cpu_place, data_size * sizeof(float))); + output_pinned_mem[j] = static_cast( + paddle::memory::Alloc(cpu_place, data_size * sizeof(float))); + gpu_mem[j] = static_cast( + paddle::memory::Alloc(cuda_place, data_size * sizeof(float))); + + for (int k = 0; k < data_size; ++k) { + input_pinned_mem[j][k] = k; + } + } + + cudaEventRecord(start_e, computation_stream); + + // computation + for (int m = 0; m < 30; ++m) { + for (int i = 0; i < iteration; ++i) { + // cpu -> GPU on computation stream. + // note: this operation is async for pinned memory. + paddle::memory::Copy(cuda_place, gpu_mem[i], cpu_place, + input_pinned_mem[i], data_size * sizeof(float), + computation_stream); + + // call kernel on computation stream. + Kernel<<<4, 1024, 0, computation_stream>>>(gpu_mem[i], data_size); + + // record event_computation on computation stream + cudaEventRecord(record_event[i], computation_stream); + + // wait event_computation on copy stream. + // note: this operation is async. + cudaStreamWaitEvent(copying_stream, record_event[i], 0); + + // copy data GPU->CPU, on copy stream. + // note: this operation is async for pinned memory. + paddle::memory::Copy(cpu_place, output_pinned_mem[i], cuda_place, + gpu_mem[i], data_size * sizeof(float), + copying_stream); + } + } + + cudaEventRecord(copying_e, copying_stream); + cudaStreamWaitEvent(computation_stream, copying_e, 0); + + cudaEventRecord(stop_e, computation_stream); + + cudaEventSynchronize(start_e); + cudaEventSynchronize(stop_e); + cudaEventElapsedTime(&elapsedTime, start_e, stop_e); + + // std::cout << cpu_place << " " + // << "time consume:" << elapsedTime / 30 << std::endl; + + for (int l = 0; l < iteration; ++l) { + for (int k = 0; k < data_size; ++k) { + float temp = input_pinned_mem[l][k]; + temp = temp * temp / 100; + EXPECT_FLOAT_EQ(temp, output_pinned_mem[l][k]); + } + } + + // destroy resource + cudaEventDestroy(copying_e); + cudaEventDestroy(start_e); + cudaEventDestroy(stop_e); + for (int j = 0; j < 10; ++j) { + cudaEventDestroy((record_event[j])); + paddle::memory::Free(cpu_place, input_pinned_mem[j]); + paddle::memory::Free(cpu_place, output_pinned_mem[j]); + paddle::memory::Free(cuda_place, gpu_mem[j]); + } + return elapsedTime / 30; +} + +TEST(CPUANDCUDAPinned, CPUAllocatorAndCUDAPinnedAllocator) { + // Generally speaking, operation on pinned_memory is faster than that on + // unpinned-memory, but if this unit test fails frequently, please close this + // test for the time being. + float time1 = test_pinned_memory(); + float time2 = test_pinned_memory(); + EXPECT_GT(time1, time2); +} diff --git a/paddle/fluid/op_use_default_grad_op_maker.spec b/paddle/fluid/op_use_default_grad_op_maker.spec new file mode 100644 index 00000000..4ec0a35b --- /dev/null +++ b/paddle/fluid/op_use_default_grad_op_maker.spec @@ -0,0 +1,43 @@ +attention_lstm +conv_shift +cos_sim +dequantize +fc +flatten +fsp +fused_embedding_fc_lstm +fused_embedding_seq_pool +fusion_gru +fusion_lstm +fusion_repeated_fc_relu +fusion_seqconv_eltadd_relu +fusion_seqexpand_concat_fc +fusion_seqpool_concat +fusion_squared_mat_sub +gru +lrn +lstm_unit +max_pool2d_with_index +max_pool3d_with_index +maxout +modified_huber_loss +nce +pool2d +pool3d +prelu +quantize +rank_loss +reduce_max +reduce_min +reduce_prod +reduce_sum +requantize +reshape +rnn_memory_helper +sequence_softmax +spp +squeeze +tensor_array_to_tensor +transpose +unpool +unsqueeze diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt new file mode 100644 index 00000000..98ff3ea1 --- /dev/null +++ b/paddle/fluid/operators/CMakeLists.txt @@ -0,0 +1,118 @@ +include(operators) + +# clean cache and pybind_file content first when rebuild +unset(GLOB_OP_LIB CACHE) +unset(OP_LIBRARY CACHE) +set(pybind_file ${PADDLE_BINARY_DIR}/paddle/fluid/pybind/pybind.h CACHE INTERNAL "pybind.h file") +file(WRITE ${pybind_file} "// Generated by the paddle/fluid/operator/CMakeLists.txt. DO NOT EDIT!\n\n") + +add_subdirectory(math) +add_subdirectory(controlflow) +add_subdirectory(detection) +add_subdirectory(elementwise) +add_subdirectory(fused) +add_subdirectory(metrics) +add_subdirectory(ngraph) +add_subdirectory(optimizers) +add_subdirectory(reduce_ops) +add_subdirectory(sequence_ops) +add_subdirectory(jit) + +if(WITH_DISTRIBUTE) + add_subdirectory(distributed) + add_subdirectory(distributed_ops) + add_subdirectory(collective) +endif() + +add_subdirectory(reader) + +if (NOT WIN32) + add_subdirectory(nccl) +endif() + +if (WITH_GPU AND TENSORRT_FOUND) + add_subdirectory(tensorrt) +endif() + +if (ANAKIN_SUBGRAPH) + add_subdirectory(anakin) +endif() + +SET(OP_HEADER_DEPS xxhash) +if (WITH_GPU) + SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub) +endif() + +SET(OP_PREFETCH_DEPS "") +if (WITH_DISTRIBUTE) + SET(OP_PREFETCH_DEPS ${OP_PREFETCH_DEPS} parameter_prefetch) +endif() + +register_operators(EXCLUDES py_func_op warpctc_op dgc_op conv_fusion_op + sync_batch_norm_op deformable_conv_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS}) + +if (WITH_GPU) + # warpctc_op needs cudnn 7 above + if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7) + op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc) + else() + op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) + endif() + # conv_fusion_op needs cudnn 7 above + if (NOT ${CUDNN_VERSION} VERSION_LESS 7100) + op_library(conv_fusion_op) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_fusion);\n") + endif() + if (NOT WIN32) + op_library(sync_batch_norm_op) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sync_batch_norm);\n") + endif() + op_library(deformable_conv_op) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(deformable_conv);\n") +else() + op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) +endif() + +set(COMMON_OP_DEPS ${OP_HEADER_DEPS}) + +if (WITH_DGC) + op_library(dgc_op DEPS dgc) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(dgc);\n") + set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dgc) +endif() + +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search) +if (WITH_GPU) + set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu) +endif() +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} device_memory_aligment) + +# FIXME(typhoonzero): operator deps may not needed. +# op_library(lod_tensor_to_array_op DEPS lod_rank_table_op) +# op_library(array_to_lod_tensor_op DEPS lod_rank_table_op) +# op_library(unsqueeze_op DEPS reshape_op) +# op_library(squeeze_op DEPS reshape_op) +# op_library(flatten_op DEPS reshape_op) +# op_library(unstack_op DEPS stack_op) +# op_library(tensor_array_to_tensor_op DEPS concat_op) + +set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COMMON_OP_DEPS}) +set(GLOB_OPERATOR_DEPS ${OPERATOR_DEPS} CACHE INTERNAL "Global Op dependencies") + +cc_test(gather_test SRCS gather_test.cc DEPS tensor) +cc_test(scatter_test SRCS scatter_test.cc DEPS tensor math_function) +cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) +cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory) +cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) +cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) +nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor) + +if (WITH_PYTHON) + cc_library(py_func_op SRCS py_func_op.cc DEPS op_registry python pybind) +endif() + +set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") +add_subdirectory(benchmark) diff --git a/paddle/fluid/operators/activation_cudnn.cu.cc b/paddle/fluid/operators/activation_cudnn.cu.cc new file mode 100644 index 00000000..494c0237 --- /dev/null +++ b/paddle/fluid/operators/activation_cudnn.cu.cc @@ -0,0 +1,40 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/platform/cudnn_desc.h" + +namespace paddle { +namespace operators { +using framework::Tensor; +using platform::ActivationDescriptor; +using platform::TensorDescriptor; + +template +class CudnnActivationKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + framework::Tensor *X, *Out; + ExtractActivationTensor(context, X, Out); + ActivationDescriptor act_desc; + TensorDescriptor x_desc, out_desc; + x_desc.set(detail::Ref(X)); + out_desc.set(detail::Ref(Out)); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/activation_cudnn_op.cu.cc b/paddle/fluid/operators/activation_cudnn_op.cu.cc new file mode 100644 index 00000000..f03355eb --- /dev/null +++ b/paddle/fluid/operators/activation_cudnn_op.cu.cc @@ -0,0 +1,189 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/platform/cudnn_desc.h" + +namespace paddle { +namespace operators { +using framework::Tensor; +using platform::ActivationDescriptor; +using platform::TensorDescriptor; +using platform::CUDADeviceContext; + +template +struct CudnnActivationFunctor { + using ELEMENT_TYPE = T; + CudnnActivationFunctor(const CUDADeviceContext& ctx, const T& c, + const cudnnActivationMode_t& m) + : ctx_(ctx), coef_(c), mode_(m) {} + void operator()(const Tensor& x, Tensor* out) { + ActivationDescriptor act_desc; + act_desc.set(mode_, coef_); + TensorDescriptor x_desc, out_desc; + x_desc.set(x); + out_desc.set(detail::Ref(out)); + PADDLE_ENFORCE(platform::dynload::cudnnActivationForward( + ctx_.cudnn_handle(), act_desc.desc(), + platform::CudnnDataType::kOne(), x_desc.desc(), x.data(), + platform::CudnnDataType::kZero(), out_desc.desc(), + out->mutable_data(ctx_.GetPlace()))); + } + const CUDADeviceContext& ctx_; + const T coef_; + const cudnnActivationMode_t mode_; +}; + +template +struct CudnnActivationGradFunctor { + using ELEMENT_TYPE = T; + CudnnActivationGradFunctor(const CUDADeviceContext& ctx, const T& c, + const cudnnActivationMode_t& m) + : ctx_(ctx), coef_(c), mode_(m) {} + void operator()(const Tensor& x, const Tensor& out, const Tensor dout, + Tensor* dx) { + ActivationDescriptor act_desc; + act_desc.set(mode_, coef_); + TensorDescriptor x_desc, out_desc, dout_desc, dx_desc; + x_desc.set(x); + out_desc.set(out); + dout_desc.set(dout); + dx_desc.set(detail::Ref(dx)); + PADDLE_ENFORCE(platform::dynload::cudnnActivationBackward( + ctx_.cudnn_handle(), act_desc.desc(), + platform::CudnnDataType::kOne(), out_desc.desc(), out.data(), + dout_desc.desc(), dout.data(), x_desc.desc(), x.data(), + platform::CudnnDataType::kZero(), dx_desc.desc(), + dx->mutable_data(ctx_.GetPlace()))); + } + const CUDADeviceContext& ctx_; + const T coef_; + const cudnnActivationMode_t mode_; +}; + +template +struct CudnnReluFunctor : public CudnnActivationFunctor { + explicit CudnnReluFunctor(const CUDADeviceContext& ctx) + : CudnnActivationFunctor(ctx, 0.0, CUDNN_ACTIVATION_RELU) {} +}; +template +struct CudnnReluGradFunctor : public CudnnActivationGradFunctor { + explicit CudnnReluGradFunctor(const CUDADeviceContext& ctx) + : CudnnActivationGradFunctor(ctx, 0.0, CUDNN_ACTIVATION_RELU) {} + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct CudnnRelu6Functor : public CudnnActivationFunctor { + explicit CudnnRelu6Functor(const CUDADeviceContext& ctx) + : CudnnActivationFunctor(ctx, 6.0, CUDNN_ACTIVATION_CLIPPED_RELU) {} +}; +template +struct CudnnRelu6GradFunctor : public CudnnActivationGradFunctor { + explicit CudnnRelu6GradFunctor(const CUDADeviceContext& ctx) + : CudnnActivationGradFunctor(ctx, 6.0, CUDNN_ACTIVATION_CLIPPED_RELU) { + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct CudnnSigmoidFunctor : public CudnnActivationFunctor { + explicit CudnnSigmoidFunctor(const CUDADeviceContext& ctx) + : CudnnActivationFunctor(ctx, 0.0, CUDNN_ACTIVATION_SIGMOID) {} +}; +template +struct CudnnSigmoidGradFunctor : public CudnnActivationGradFunctor { + explicit CudnnSigmoidGradFunctor(const CUDADeviceContext& ctx) + : CudnnActivationGradFunctor(ctx, 0.0, CUDNN_ACTIVATION_SIGMOID) {} + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct CudnnTanhFunctor : public CudnnActivationFunctor { + explicit CudnnTanhFunctor(const CUDADeviceContext& ctx) + : CudnnActivationFunctor(ctx, 0.0, CUDNN_ACTIVATION_TANH) {} +}; +template +struct CudnnTanhGradFunctor : public CudnnActivationGradFunctor { + explicit CudnnTanhGradFunctor(const CUDADeviceContext& ctx) + : CudnnActivationGradFunctor(ctx, 0.0, CUDNN_ACTIVATION_TANH) {} + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +class CudnnActivationKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* X = nullptr; + framework::Tensor* Out = nullptr; + ExtractActivationTensor(context, &X, &Out); + Out->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + Functor functor(dev_ctx); + functor(detail::Ref(X), Out); + } +}; + +template +class CudnnActivationGradKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& context) const override { + static_assert(Functor::FwdDeps() == kDepOut, "Forward deps must be Out."); + + const framework::Tensor *X, *Out, *dOut; + X = Out = dOut = nullptr; + framework::Tensor* dX = nullptr; + ExtractActivationGradTensor(context, &X, &Out, &dOut, + &dX); + dX->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + Functor functor(dev_ctx); + functor(detail::Ref(X), detail::Ref(Out), detail::Ref(dOut), dX); + } +}; + +} // namespace operators +} // namespace paddle + +namespace plat = paddle::platform; +namespace ops = paddle::operators; + +#define FOR_EACH_CUDNN_OP_FUNCTOR(__macro) \ + __macro(relu, CudnnReluFunctor, CudnnReluGradFunctor); \ + __macro(relu6, CudnnRelu6Functor, CudnnRelu6GradFunctor); \ + __macro(sigmoid, CudnnTanhFunctor, CudnnTanhGradFunctor); \ + __macro(tanh, CudnnTanhFunctor, CudnnTanhGradFunctor) + +#define REGISTER_ACTIVATION_CUDNN_KERNEL(act_type, functor, grad_functor) \ + REGISTER_OP_KERNEL(act_type, CUDNN, plat::CUDAPlace, \ + ops::CudnnActivationKernel>, \ + ops::CudnnActivationKernel>); \ + REGISTER_OP_KERNEL( \ + act_type##_grad, CUDNN, plat::CUDAPlace, \ + ops::CudnnActivationGradKernel>, \ + ops::CudnnActivationGradKernel>); + +FOR_EACH_CUDNN_OP_FUNCTOR(REGISTER_ACTIVATION_CUDNN_KERNEL); diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc new file mode 100644 index 00000000..943c6f80 --- /dev/null +++ b/paddle/fluid/operators/activation_op.cc @@ -0,0 +1,890 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/activation_op.h" +#include +#include +#include +#include +#include +#include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h" +#include "paddle/fluid/platform/port.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cudnn_helper.h" +#endif + +namespace paddle { +namespace operators { + +using paddle::framework::Tensor; + +template +static constexpr bool CanInplaceAct() { + return GradFunctor::FwdDeps() == kDepOut || GradFunctor::FwdDeps() == kNoDeps; +} + +std::unique_ptr> GetInplaceOpSet() { + std::unique_ptr> ret( + new std::unordered_set()); +#define INSERT_INTO_INPLACE_OP_SET(op_type, __omitted, fwd_functor, \ + bwd_functor) \ + if (CanInplaceAct>()) { \ + ret->insert(#op_type); \ + } + + FOR_EACH_ACTIVATION_OP(INSERT_INTO_INPLACE_OP_SET); +#undef INSERT_INTO_INPLACE_OP_SET + return ret; +} + +#define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT) \ + class OP_NAME##OpMaker \ + : public ::paddle::framework::OpProtoAndCheckerMaker { \ + public: \ + void Make() override { \ + AddInput("X", "Input of " #OP_NAME " operator"); \ + AddOutput("Out", "Output of " #OP_NAME " operator"); \ + AddAttr("use_mkldnn", \ + "(bool, default false) Only used in mkldnn kernel") \ + .SetDefault(false); \ + AddAttr("use_cudnn", \ + "(bool, default false) Only used in cudnn kernel, need " \ + "install cudnn") \ + .SetDefault(false); \ + AddAttr( \ + "is_test", \ + "(bool, default false) Set to true for inference only, false " \ + "for training. Some layers may run faster when this is true.") \ + .SetDefault(false); \ + AddComment(OP_COMMENT); \ + } \ + } + +template +class ActivationGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType(ForwardOpType() + "_grad"); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepX)) { + op->SetInput("X", Input("X")); + } + + if (static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepOut)) { + op->SetInput("Out", Output("Out")); + } + + return op; + } +}; + +framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx, + const framework::OperatorWithKernel& oper, + const std::string& name) { + framework::LibraryType library{framework::LibraryType::kPlain}; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; +// FIXME(liuwei1031) temporarily disable the code to unblock users +// TODO(liuwei1031) figure out the reason behind +// https://github.com/PaddlePaddle/Paddle/issues/16096 +// and re-enable this in the future +// #ifdef PADDLE_WITH_CUDA +// auto it1 = oper.Attrs().find("use_cudnn"); +// if (it1 != oper.Attrs().end() && platform::CanCUDNNBeUsed(ctx)) { +// library = framework::LibraryType::kCUDNN; +// } +// #endif +#ifdef PADDLE_WITH_MKLDNN + auto it = oper.Attrs().find("use_mkldnn"); + if (library == framework::LibraryType::kPlain && it != oper.Attrs().end() && + platform::CanMKLDNNBeUsed(ctx)) { + library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; + } +#endif + return framework::OpKernelType( + framework::GetDataTypeOfVar(ctx.InputVar(name)), ctx.GetPlace(), layout, + library); +} + +class ActivationOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + ctx->ShareDim("X", /*->*/ "Out"); + ctx->ShareLoD("X", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return GetKernelType(ctx, *this, "X"); + } +}; + +class ActivationOpInferVarType + : public framework::PassInDtypeAndVarTypeToOutput { + protected: + std::unordered_map GetInputOutputWithSameType() + const override { + return std::unordered_map{{"X", /*->*/ "Out"}}; + } +}; + +class ActivationOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + auto out_grad_name = framework::GradVarName("Out"); + ctx->ShareDim(out_grad_name, framework::GradVarName("X")); + ctx->ShareLoD(out_grad_name, framework::GradVarName("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return GetKernelType(ctx, *this, framework::GradVarName("Out")); + } +}; + +UNUSED constexpr char SigmoidDoc[] = R"DOC( +Sigmoid Activation Operator + +$$out = \\frac{1}{1 + e^{-x}}$$ + +)DOC"; + +UNUSED constexpr char LogSigmoidDoc[] = R"DOC( +Logsigmoid Activation Operator + +$$out = \\log \\frac{1}{1 + e^{-x}}$$ + +)DOC"; + +UNUSED constexpr char ExpDoc[] = R"DOC( +Exp Activation Operator. + +$out = e^x$ + +)DOC"; + +UNUSED constexpr char ReluDoc[] = R"DOC( +Relu Activation Operator. + +$out = \max(x, 0)$ + +)DOC"; + +UNUSED constexpr char GeluDoc[] = R"DOC( +Gelu Activation Operator. + +$out = \\frac{1 + erf(\\frac{x}{\\sqrt{2}})}{2} x$ + +)DOC"; + +UNUSED constexpr char TanhDoc[] = R"DOC( +Tanh Activation Operator. + +$$out = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ + +)DOC"; + +UNUSED constexpr char TanhShrinkDoc[] = R"DOC( +TanhShrink Activation Operator. + +$$out = x - \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ + +)DOC"; + +UNUSED constexpr char SqrtDoc[] = R"DOC( +Sqrt Activation Operator. + +Please make sure legal input, when input a negative value closed to zero, +you should add a small epsilon(1e-12) to avoid negative number caused by numerical errors. + +$out = \sqrt{x}$ + +)DOC"; + +UNUSED constexpr char RsqrtDoc[] = R"DOC( +Rsqrt Activation Operator. + +Please make sure input is legal in case of numeric errors. + +$out = \frac{1}{\sqrt{x}}$ + +)DOC"; + +UNUSED constexpr char AbsDoc[] = R"DOC( +Abs Activation Operator. + +$out = |x|$ + +)DOC"; + +UNUSED constexpr char CeilDoc[] = R"DOC( +Ceil Activation Operator. + +$out = \left \lceil x \right \rceil$ + +)DOC"; + +UNUSED constexpr char FloorDoc[] = R"DOC( +Floor Activation Operator. + +$out = \left \lfloor x \right \rfloor$ + +)DOC"; + +UNUSED constexpr char CosDoc[] = R"DOC( +Cosine Activation Operator. + +$out = cos(x)$ + +)DOC"; + +UNUSED constexpr char SinDoc[] = R"DOC( +Sine Activation Operator. + +$out = sin(x)$ + +)DOC"; + +UNUSED constexpr char RoundDoc[] = R"DOC( +Round Activation Operator. + +$out = [x]$ + +)DOC"; + +UNUSED constexpr char ReciprocalDoc[] = R"DOC( +Reciprocal Activation Operator. + +$$out = \\frac{1}{x}$$ + +)DOC"; + +UNUSED constexpr char LogDoc[] = R"DOC( +Log Activation Operator. + +$out = \ln(x)$ + +Natural logarithm of x. + +)DOC"; + +UNUSED constexpr char SquareDoc[] = R"DOC( +Square Activation Operator. + +$out = x^2$ + +)DOC"; + +UNUSED constexpr char SoftplusDoc[] = R"DOC( +Softplus Activation Operator. + +$out = \ln(1 + e^{x})$ + +)DOC"; + +UNUSED constexpr char SoftsignDoc[] = R"DOC( +Softsign Activation Operator. + +$$out = \\frac{x}{1 + \|x\|}$$ + +)DOC"; + +class AcosOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Input of acos operator"); + AddOutput("Out", "Output of acos operator"); + AddComment(R"DOC( +Arccosine Activation Operator. + +$$out = \cos^{-1}(x)$$ + +)DOC"); + } +}; + +class AsinOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Input of asin operator"); + AddOutput("Out", "Output of asin operator"); + AddComment(R"DOC( +Arcsine Activation Operator. + +$$out = \sin^{-1}(x)$$ + +)DOC"); + } +}; + +class AtanOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Input of atan operator"); + AddOutput("Out", "Output of atan operator"); + AddComment(R"DOC( +Arctanh Activation Operator. + +$$out = \tanh^{-1}(x)$$ + +)DOC"); + } +}; + +class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Input of LeakyRelu operator"); + AddOutput("Out", "Output of LeakyRelu operator"); + AddAttr("alpha", "The small negative slope").SetDefault(0.02f); + AddComment(R"DOC( +LeakyRelu Activation Operator. + +$out = \max(x, \alpha * x)$ + +)DOC"); + } +}; + +class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Input of Softshrink operator"); + AddOutput("Out", "Output of Softshrink operator"); + AddAttr("lambda", "non-negative offset").SetDefault(0.5f); + AddComment(R"DOC( +:strong:`Softshrink Activation Operator` + +.. math:: + out = \begin{cases} + x - \lambda, \text{if } x > \lambda \\ + x + \lambda, \text{if } x < -\lambda \\ + 0, \text{otherwise} + \end{cases} + +)DOC"); + } +}; + +class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Input of HardShrink operator"); + AddOutput("Out", "Output of HardShrink operator"); + AddAttr("threshold", + "The value of threshold for HardShrink. [default: 0.5]") + .SetDefault(0.5f); + AddComment(R"DOC( +:strong:`HardShrink activation operator` + +.. math:: + out = \begin{cases} + x, \text{if } x > \lambda \\ + x, \text{if } x < -\lambda \\ + 0, \text{otherwise} + \end{cases} + +)DOC"); + } +}; + +class BReluOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Input of BRelu operator"); + AddOutput("Out", "Output of BRelu operator"); + AddAttr("t_min", "The min marginal value of BRelu") + .SetDefault(static_cast(0)); + AddAttr("t_max", "The max marginal value of BRelu") + .SetDefault(static_cast(24)); + AddComment(R"DOC( +BRelu Activation Operator. + +$out = \max(\min(x, t_{min}), t_{max})$ + +)DOC"); + } +}; + +class SoftReluOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Input of SoftRelu operator"); + AddOutput("Out", "Output of SoftRelu operator"); + AddAttr("threshold", "The threshold value of SoftRelu") + .SetDefault(40.0f); + AddComment(R"DOC( +SoftRelu Activation Operator. + +$out = \ln(1 + \exp(\max(\min(x, threshold), -threshold)))$ + +)DOC"); + } +}; + +class ELUOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Input of ELU operator"); + AddOutput("Out", "Output of ELU operator"); + AddAttr("alpha", "The alpha value of ELU").SetDefault(1.0f); + AddComment(R"DOC( +ELU Activation Operator. + +Applies the following element-wise computation on the input according to +https://arxiv.org/abs/1511.07289. + +$out = \max(0, x) + \min(0, \alpha * (e^x - 1))$ + +)DOC"); + } +}; + +class Relu6OpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Input of Relu6 operator"); + AddOutput("Out", "Output of Relu6 operator"); + AddAttr("threshold", "The threshold value of Relu6") + .SetDefault(6.0f); + AddComment(R"DOC( +Relu6 Activation Operator. + +$out = \min(\max(0, x), 6)$ + +)DOC"); + } +}; + +class PowOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Input of Pow operator"); + AddOutput("Out", "Output of Pow operator"); + AddAttr("factor", "The exponential factor of Pow").SetDefault(1.0f); + AddComment(R"DOC( +Pow Activation Operator. + +$out = x^{factor}$ + +)DOC"); + } +}; + +class STanhOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Input of STanh operator"); + AddOutput("Out", "Output of STanh operator"); + AddAttr("scale_a", "The scale parameter of a for the input") + .SetDefault(2.0f / 3.0f); + AddAttr("scale_b", "The scale parameter of b for the input") + .SetDefault(1.7159f); + AddComment(R"DOC( +STanh Activation Operator. + +$$out = b * \\frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$ + +)DOC"); + } +}; + +class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Input of ThresholdedRelu operator"); + AddOutput("Out", "Output of ThresholdedRelu operator"); + AddAttr("threshold", + "The threshold location of activation. [default 1.0].") + .SetDefault(1.0f); + AddComment(R"DOC( +:strong:`ThresholdedRelu activation operator` + +.. math:: + + out = \begin{cases} + x, \text{if } x > threshold \\ + 0, \text{otherwise} + \end{cases} +)DOC"); + } +}; + +class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Input of HardSigmoid operator"); + AddOutput("Out", "Output of HardSigmoid operator"); + AddAttr("slope", "Slope for linear approximation of sigmoid") + .SetDefault(0.2f); + AddAttr("offset", "Offset for linear approximation of sigmoid") + .SetDefault(0.5f); + AddComment(R"DOC( +HardSigmoid Activation Operator. + +Segment-wise linear approximation of sigmoid(https://arxiv.org/abs/1603.00391), +which is much faster than sigmoid. + +$out = \max(0, \min(1, slope * x + shift))$ + +The slope should be positive. The offset can be either positive or negative. +The default slope and shift are set according to the above reference. +It is recommended to use the defaults for this activation. + +)DOC"); + } +}; + +class SwishOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Input of Swish operator"); + AddOutput("Out", "Output of Swish operator"); + AddAttr("beta", "Constant beta of swish operator").SetDefault(1.0f); + AddComment(R"DOC( +Swish Activation Operator. + +$$out = \\frac{x}{1 + e^{- \beta x}}$$ + +)DOC"); + } +}; + +REGISTER_ACTIVATION_OP_MAKER(Sigmoid, SigmoidDoc); +REGISTER_ACTIVATION_OP_MAKER(LogSigmoid, LogSigmoidDoc); +REGISTER_ACTIVATION_OP_MAKER(Exp, ExpDoc); +REGISTER_ACTIVATION_OP_MAKER(Relu, ReluDoc); +REGISTER_ACTIVATION_OP_MAKER(Gelu, GeluDoc); +REGISTER_ACTIVATION_OP_MAKER(Tanh, TanhDoc); +REGISTER_ACTIVATION_OP_MAKER(TanhShrink, TanhShrinkDoc); +REGISTER_ACTIVATION_OP_MAKER(Sqrt, SqrtDoc); +REGISTER_ACTIVATION_OP_MAKER(Rsqrt, RsqrtDoc); +REGISTER_ACTIVATION_OP_MAKER(Abs, AbsDoc); +REGISTER_ACTIVATION_OP_MAKER(Ceil, CeilDoc); +REGISTER_ACTIVATION_OP_MAKER(Floor, FloorDoc); +REGISTER_ACTIVATION_OP_MAKER(Cos, CosDoc); +REGISTER_ACTIVATION_OP_MAKER(Sin, SinDoc); +REGISTER_ACTIVATION_OP_MAKER(Round, RoundDoc); +REGISTER_ACTIVATION_OP_MAKER(Reciprocal, ReciprocalDoc); +REGISTER_ACTIVATION_OP_MAKER(Log, LogDoc); +REGISTER_ACTIVATION_OP_MAKER(Square, SquareDoc); +REGISTER_ACTIVATION_OP_MAKER(Softplus, SoftplusDoc); +REGISTER_ACTIVATION_OP_MAKER(Softsign, SoftsignDoc); + +template +class ActivationOpDoubleGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + if (static_cast(kDepValue) & static_cast(kDepX)) { + if (ctx->HasOutput("DX")) { + ctx->ShareDim("X", "DX"); + ctx->ShareLoD("X", "DX"); + } + if (ctx->HasOutput("DDOut")) { + ctx->ShareDim("X", "DDOut"); + ctx->ShareLoD("X", "DDOut"); + } + } + if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (ctx->HasOutput("DOut")) { + ctx->ShareDim("Out", "DOut"); + ctx->ShareLoD("Out", "DOut"); + } + if (ctx->HasOutput("DDOut")) { + ctx->ShareDim("Out", "DDOut"); + ctx->ShareLoD("Out", "DDOut"); + } + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return GetKernelType(ctx, *this, "DDX"); + } +}; + +template +class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + if (static_cast(kDepValue) & static_cast(kDepX)) { + if (ctx->HasOutput("DDOut")) { + ctx->ShareDim("X", "DDOut"); + ctx->ShareLoD("X", "DDOut"); + } + } + if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (ctx->HasOutput("DDOut")) { + ctx->ShareDim("Out", "DDOut"); + ctx->ShareLoD("Out", "DDOut"); + } + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return GetKernelType(ctx, *this, "DDX"); + } +}; + +// +// ReluGrad: dx = dy if y >= 0 else 0 +// ReluGradGrad: ddy = ddx if y >= 0 else 0 +// +class ReluDoubleGradMaker : public ::paddle::framework::SingleGradOpDescMaker { + public: + using ::paddle::framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr<::paddle::framework::OpDesc> Apply() const override { + auto* op = new ::paddle::framework::OpDesc(); + op->SetType("relu_grad_grad"); + // input1: Out + op->SetInput("Out", Input("Out")); + // input2: ddx + op->SetInput("DDX", OutputGrad(framework::GradVarName("X"))); + op->SetAttrMap(Attrs()); + // output: ddy + op->SetOutput("DDOut", InputGrad(framework::GradVarName("Out"))); + return std::unique_ptr<::paddle::framework::OpDesc>(op); + } +}; + +// leaky_relu Grad: dx=dy if y>=0 else alpha * dy +// leaky_relu GradGrad: ddy=ddx if y>=0 else alpha * ddx +class LeakyReluDoubleGradMaker + : public ::paddle::framework::SingleGradOpDescMaker { + public: + using ::paddle::framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr<::paddle::framework::OpDesc> Apply() const override { + auto* op = new ::paddle::framework::OpDesc(); + op->SetType("leaky_relu_grad_grad"); + // input1: X + op->SetInput("X", Input("X")); + // X@GRAD@GRAD: ddx + op->SetInput("DDX", OutputGrad(framework::GradVarName("X"))); + op->SetAttrMap(Attrs()); + // Out@GRAD@GRAD: ddy + op->SetOutput("DDOut", InputGrad(framework::GradVarName("Out"))); + return std::unique_ptr<::paddle::framework::OpDesc>(op); + } +}; + +// sqrt Grad: dx = 0.5 * dy / y +// sqrt GradGrad: ddy = 0.5 * ddx / y, dy = -1 * dx * ddx +class SqrtDoubleGradMaker : public ::paddle::framework::SingleGradOpDescMaker { + public: + using ::paddle::framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr<::paddle::framework::OpDesc> Apply() const override { + auto* op = new ::paddle::framework::OpDesc(); + op->SetType("sqrt_grad_grad"); + op->SetInput("Out", Input("Out")); + op->SetInput("DX", Output(framework::GradVarName("X"))); + op->SetInput("DDX", OutputGrad(framework::GradVarName("X"))); + op->SetAttrMap(Attrs()); + op->SetOutput("DOut", InputGrad("Out")); + op->SetOutput("DDOut", InputGrad(framework::GradVarName("Out"))); + return std::unique_ptr<::paddle::framework::OpDesc>(op); + } +}; + +// square Grad: dx=2x*dy +// square GradGrad: ddy=2x*ddx, dx=2dy*ddx +class SquareDoubleGradMaker + : public ::paddle::framework::SingleGradOpDescMaker { + public: + using ::paddle::framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr<::paddle::framework::OpDesc> Apply() const override { + auto* op = new ::paddle::framework::OpDesc(); + op->SetType("square_grad_grad"); + op->SetInput("X", Input("X")); + // Out@GRAD: dy + op->SetInput("DOut", Input(framework::GradVarName("Out"))); + // X@GRAD@GRAD: ddx + op->SetInput("DDX", OutputGrad(framework::GradVarName("X"))); + + op->SetAttrMap(Attrs()); + + // X@GRAD: dx + op->SetOutput("DX", InputGrad("X")); + // Out@GRAD@GRAD: ddy + op->SetOutput("DDOut", InputGrad(framework::GradVarName("Out"))); + return std::unique_ptr<::paddle::framework::OpDesc>(op); + } +}; + +class ActivationGradOpInplaceInference : public framework::InplaceOpInference { + public: + std::unordered_map operator()( + const framework::OpDesc& op_desc, bool use_cuda) const override { + return {{framework::GradVarName("Out"), framework::GradVarName("X")}}; + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +#define REGISTER_ACTIVATION_OP(KERNEL_TYPE, OP_NAME, functor, grad_functor) \ + REGISTER_OPERATOR( \ + KERNEL_TYPE, ops::ActivationOp, ops::OP_NAME##OpMaker, \ + ops::ActivationOpInferVarType, \ + ops::ActivationGradOpDescMaker::FwdDeps()>, \ + std::conditional>(), \ + ::paddle::framework::SingleOpInplaceInToOut, \ + void>::type); \ + REGISTER_OPERATOR(KERNEL_TYPE##_grad, ops::ActivationOpGrad, \ + ops::ActivationGradOpInplaceInference); + +#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, op_name, functor, \ + grad_functor) \ + REGISTER_OP_CPU_KERNEL( \ + act_type, ops::ActivationKernel>, \ + ops::ActivationKernel>); \ + REGISTER_OP_CPU_KERNEL( \ + act_type##_grad, \ + ops::ActivationGradKernel>, \ + ops::ActivationGradKernel>); + +FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP); +FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CPU_KERNEL); + +/* ========================== relu register ============================= */ +REGISTER_OPERATOR( + relu, ops::ActivationOp, ops::ReluOpMaker, ops::ActivationOpInferVarType, + ops::ActivationGradOpDescMaker::FwdDeps()>, + paddle::framework::SingleOpInplaceInToOut); +REGISTER_OPERATOR(relu_grad, ops::ActivationOpGrad, + ops::ActivationGradOpInplaceInference, + ops::ReluDoubleGradMaker); +REGISTER_OPERATOR( + relu_grad_grad, + ops::ActivationOpDoubleGrad2::FwdDeps()>); + +REGISTER_ACTIVATION_CPU_KERNEL(relu, Relu, ReluFunctor, ReluGradFunctor); + +REGISTER_OP_CPU_KERNEL( + relu_grad_grad, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>); +/* ========================================================================== */ + +/* ======================== leaky relu register ============================ */ +REGISTER_OPERATOR( + leaky_relu, ops::ActivationOp, ops::LeakyReluOpMaker, + ops::ActivationOpInferVarType, + ops::ActivationGradOpDescMaker::FwdDeps()>, + paddle::framework::SingleOpInplaceInToOut); +REGISTER_OPERATOR(leaky_relu_grad, ops::ActivationOpGrad, + ops::ActivationGradOpInplaceInference, + ops::LeakyReluDoubleGradMaker); +REGISTER_OPERATOR( + leaky_relu_grad_grad, + ops::ActivationOpDoubleGrad2::FwdDeps()>); + +REGISTER_ACTIVATION_CPU_KERNEL(leaky_relu, LeakyRelu, LeakyReluFunctor, + LeakyReluGradFunctor); +REGISTER_OP_CPU_KERNEL( + leaky_relu_grad_grad, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel< + plat::CPUDeviceContext, ops::LeakyReluGradGradFunctor>); +/* ========================================================================== */ + +/* =========================== sqrt register ============================= */ +REGISTER_OPERATOR( + sqrt, ops::ActivationOp, ops::SqrtOpMaker, ops::ActivationOpInferVarType, + ops::ActivationGradOpDescMaker::FwdDeps()>, + paddle::framework::SingleOpInplaceInToOut); +REGISTER_OPERATOR(sqrt_grad, ops::ActivationOpGrad, + ops::ActivationGradOpInplaceInference, + ops::SqrtDoubleGradMaker); +REGISTER_OPERATOR( + sqrt_grad_grad, + ops::ActivationOpDoubleGrad::FwdDeps()>); +REGISTER_ACTIVATION_CPU_KERNEL(sqrt, Sqrt, SqrtFunctor, SqrtGradFunctor); +REGISTER_OP_CPU_KERNEL( + sqrt_grad_grad, ops::SqrtDoubleGradKernel>, + ops::SqrtDoubleGradKernel>, + ops::SqrtDoubleGradKernel>); +/* ========================================================================== */ + +/* ========================== square register ============================ */ +REGISTER_OPERATOR( + square, ops::ActivationOp, ops::SquareOpMaker, + ops::ActivationOpInferVarType, + ops::ActivationGradOpDescMaker::FwdDeps()>, + paddle::framework::SingleOpInplaceInToOut); +REGISTER_OPERATOR(square_grad, ops::ActivationOpGrad, + ops::ActivationGradOpInplaceInference, + ops::SquareDoubleGradMaker); +REGISTER_OPERATOR( + square_grad_grad, + ops::ActivationOpDoubleGrad::FwdDeps()>); + +REGISTER_ACTIVATION_CPU_KERNEL(square, Square, SquareFunctor, + SquareGradFunctor); + +REGISTER_OP_CPU_KERNEL( + square_grad_grad, + ops::SquareDoubleGradKernel>, + ops::SquareDoubleGradKernel>, + ops::SquareDoubleGradKernel>); +/* ========================================================================== */ diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu new file mode 100644 index 00000000..25514186 --- /dev/null +++ b/paddle/fluid/operators/activation_op.cu @@ -0,0 +1,88 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/platform/float16.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +#define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, op_name, functor, \ + grad_functor) \ + REGISTER_OP_CUDA_KERNEL( \ + act_type, \ + ops::ActivationKernel>, \ + ops::ActivationKernel>, \ + ops::ActivationKernel>); \ + REGISTER_OP_CUDA_KERNEL( \ + act_type##_grad, ops::ActivationGradKernel>, \ + ops::ActivationGradKernel>, \ + ops::ActivationGradKernel>); + +FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CUDA_KERNEL); + +/* ======================== leaky relu register ============================ */ +REGISTER_ACTIVATION_CUDA_KERNEL(leaky_relu, LeakyRelu, LeakyReluFunctor, + LeakyReluGradFunctor); + +REGISTER_OP_CUDA_KERNEL( + leaky_relu_grad_grad, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel< + plat::CUDADeviceContext, ops::LeakyReluGradGradFunctor>); +/* ========================================================================== */ + +/* =========================== relu register ============================ */ +REGISTER_ACTIVATION_CUDA_KERNEL(relu, Relu, ReluFunctor, ReluGradFunctor); + +REGISTER_OP_CUDA_KERNEL( + relu_grad_grad, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>, + ops::ActivationDoubleGradKernel>); +/* ========================================================================== */ + +/* =========================== sqrt register ============================= */ +REGISTER_ACTIVATION_CUDA_KERNEL(sqrt, Sqrt, SqrtFunctor, SqrtGradFunctor); + +REGISTER_OP_CUDA_KERNEL( + sqrt_grad_grad, + ops::SqrtDoubleGradKernel>, + ops::SqrtDoubleGradKernel>, + ops::SqrtDoubleGradKernel>); +/* ========================================================================== */ + +/* =========================== square register ============================ */ +REGISTER_ACTIVATION_CUDA_KERNEL(square, Square, SquareFunctor, + SquareGradFunctor); + +REGISTER_OP_CUDA_KERNEL( + square_grad_grad, + ops::SquareDoubleGradKernel>, + ops::SquareDoubleGradKernel>, + ops::SquareDoubleGradKernel>); +/* ========================================================================== */ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h new file mode 100644 index 00000000..b516fc8a --- /dev/null +++ b/paddle/fluid/operators/activation_op.h @@ -0,0 +1,1551 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES +#endif + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/platform/float16.h" + +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + +namespace paddle { +namespace operators { + +enum ActBwdOpFwdDeps { + kNoDeps = 0x00, // Do not need any forward input/output + kDepX = 0x01, // Only need forward input X + kDepOut = 0x02, // Only need forward output Out + + // Never add kDepXOut, because Out can be always calculated + // by forward input X in backward part. + // FIXME(zjl): but in MKLDNN abs, X and Out are all needed... + // Developers should not rely on this enum value! + kDepXOut = 0x03 +}; + +std::unique_ptr> GetInplaceOpSet(); + +static bool IsInplace(const std::string& op) { + static auto InplaceOpSet = GetInplaceOpSet(); + bool inplace = InplaceOpSet->count(op); + // for op_grad + const int kGradSuffixLen = 4; + if (op.size() > kGradSuffixLen && + op.compare(op.size() - kGradSuffixLen - 1, kGradSuffixLen, "grad")) { + inplace = + InplaceOpSet->count(op.substr(0, op.size() - (kGradSuffixLen + 1))); + } + return inplace; +} + +/* The following operator can be used to process SelectedRows, because the + * output of those operator for zero is zero too. + */ +static std::unordered_set CanBeUsedBySelectedRows = { + "abs", "abs_grad", "square", "square_grad", "sqrt", "sqrt_grad"}; + +inline void ExtractActivationTensor(const framework::ExecutionContext& context, + const framework::Tensor** X, + framework::Tensor** Out) { + auto x_var = context.InputVar("X"); + auto out_var = context.OutputVar("Out"); + PADDLE_ENFORCE(x_var != nullptr, + "Cannot get input Variable X, variable name = %s", + context.op().Input("X")); + PADDLE_ENFORCE(out_var != nullptr, + "Cannot get output Variable Out, variable name = %s", + context.op().Output("Out")); + if (CanBeUsedBySelectedRows.count(context.op().Type())) { + *X = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var); + *Out = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( + out_var); + } else { + *X = context.Input("X"); + *Out = context.Output("Out"); + } + + PADDLE_ENFORCE(*Out != nullptr, + "Cannot get output tensor Out, variable name = %s", + context.op().Output("Out")); +} + +template +inline void ExtractActivationGradTensor( + const framework::ExecutionContext& context, const framework::Tensor** X, + const framework::Tensor** Out, const framework::Tensor** dOut, + framework::Tensor** dX) { + auto out_grad_var = context.InputVar(framework::GradVarName("Out")); + auto x_grad_var = context.OutputVar(framework::GradVarName("X")); + const framework::Variable* out_var = nullptr; + + if (static_cast(kDepValue) & static_cast(kDepOut)) { + out_var = context.InputVar("Out"); + PADDLE_ENFORCE(out_var != nullptr, + "Cannot get input Variable Out, variable name = %s", + context.op().Input("Out")); + } + PADDLE_ENFORCE(out_grad_var != nullptr, + "Cannot get input Variable %s, variable name = %s", + framework::GradVarName("Out"), + context.op().Input(framework::GradVarName("Out"))); + PADDLE_ENFORCE(x_grad_var != nullptr, + "Cannot get output Variable %s, variable name = %s", + framework::GradVarName("X"), + context.op().Output(framework::GradVarName("X"))); + + if (CanBeUsedBySelectedRows.count(context.op().Type())) { + *dOut = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar( + *out_grad_var); + *dX = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( + x_grad_var); + + if (out_var) { + *Out = + paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*out_var); + } else { + *Out = *dOut; // fake out + } + + } else { + *Out = context.Input("Out"); + *dOut = context.Input(framework::GradVarName("Out")); + *dX = context.Output(framework::GradVarName("X")); + + if (out_var) { + *Out = &(out_var->Get()); + } else { + *Out = *dOut; // fake out + } + } + + PADDLE_ENFORCE(*dX != nullptr, + "Cannot get output tensor %s, variable name = %s", + framework::GradVarName("X"), + context.op().Output(framework::GradVarName("X"))); + + if (static_cast(kDepValue) & static_cast(kDepX)) { + auto x_var = context.InputVar("X"); + PADDLE_ENFORCE(x_var != nullptr, + "Cannot get input tensor X, variable name = %s", + context.op().Input("X")); + if (CanBeUsedBySelectedRows.count(context.op().Type())) { + *X = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var); + } else { + *X = context.Input("X"); + } + } else { + VLOG(10) << " Inplace activation of Op : " << context.op().Type(); + *X = *dX; + } +} + +template +class ActivationKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* X = nullptr; + framework::Tensor* Out = nullptr; + ExtractActivationTensor(context, &X, &Out); + Out->mutable_data(context.GetPlace()); + + auto x = framework::EigenVector::Flatten(detail::Ref(X)); + auto out = framework::EigenVector::Flatten(detail::Ref(Out)); + auto* place = + context.template device_context().eigen_device(); + Functor functor; + + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = context.Attr(attr.first); + } + functor(*place, x, out); + } +}; + +template +class ActivationGradKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor *X, *Out, *dOut; + framework::Tensor* dX = nullptr; + X = Out = dOut = nullptr; + ExtractActivationGradTensor(context, &X, &Out, &dOut, + &dX); + dX->mutable_data(context.GetPlace()); + auto dout = framework::EigenVector::Flatten(detail::Ref(dOut)); + auto out = framework::EigenVector::Flatten(detail::Ref(Out)); + auto dx = framework::EigenVector::Flatten(detail::Ref(dX)); + auto x = framework::EigenVector::Flatten(detail::Ref(X)); + auto* place = + context.template device_context().eigen_device(); + Functor functor; + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = context.Attr(attr.first); + } + functor(*place, x, out, dout, dx); + } +}; + +template +struct BaseActivationFunctor { + using ELEMENT_TYPE = T; + + using AttrPair = std::vector>; + + AttrPair GetAttrs() { return AttrPair(); } + + /* NOTE(*): Output reuse X memory if X is not dependented by its Gradient. + For example, sigmoid op's gradient didn't involve x, so its output can + reuse + input memory. But abs op's gradient use x, it can not be inplaced. + gradient did use x. + */ + bool Inplace() const { return false; } +}; + +// sigmoid(x) = 1 / (1 + exp(-x)) +template +struct SigmoidFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = static_cast(1) / (static_cast(1) + (-x).exp()); + } +}; + +template +struct SigmoidGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * out * (static_cast(1) - out); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +// Originally: logsigmoid(x) = -log (1 + exp(-x)) +// For numerical stability, we can use the log-sum-exp trick: +// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/ +// We can rewrite the above equation as: +// out = -log( exp(0) + exp(-x)) [since exp(0) = 1] +// = -log( exp(max(-x, 0) - max(-x, 0)) + exp(-x + max(-x, 0) - max(-x, 0))) +// = -log( exp(max(-x, 0)) * exp(-max(-x, 0)) - exp(max(-x, 0)) * exp(-x - +// max(-x, 0))) +// = -log( exp(max(-x, 0)) * (exp(-max(-x, 0)) + exp(-x - max(-x, 0)))) +// = -log( exp(max(-x, 0)) - log(exp(-max(-x, 0)) + exp(-x - max(-x, 0))) +// +// Hence, logsigmoid(x) = - (max(-x, 0) + log(exp(-max(-x, 0)) +// + exp(-x - max(-x, 0)))) +template +struct LogSigmoidFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + auto temp = (-x).cwiseMax(static_cast(0)); // temp = max(-x, 0) + out.device(d) = -temp - (((-temp).exp() + (-x - temp).exp()).log()); + } +}; + +// Originally: f' = exp(-x) / (1 + exp(-x)) +// For numerical stability: f' = exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + +// exp(-x - max(-x, 0))) +template +struct LogSigmoidGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp = (-x).cwiseMax(static_cast(0)); // temp = max(-x, 0) + dx.device(d) = + dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp())); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +// exp(x) = e^x +template +struct ExpFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.exp(); + } +}; + +template +struct ExpGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * out; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +// relu(x) = max(x, 0) +template +struct ReluFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.cwiseMax(static_cast(0)); + } +}; + +template +struct ReluGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * (out > static_cast(0)).template cast(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +// gelu(x) = 0.5 * x * (1 + erf(x / sqrt(2))) +template +struct GeluFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { +// Because the execute or device context can not be deliver here, it keep the +// marco for NVCC. +#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ + !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) + auto x_data = x.data(); + auto out_data = out.data(); + int n = std::min(x.size(), out.size()); + + std::memset(out_data, 0, n * sizeof(T)); + math::CBlas::AXPY(n, static_cast(M_SQRT1_2), x_data, 1, out_data, 1); + math::CBlas::VMERF(n, out_data, out_data, VML_LA); + for (int i = 0; i < n; i++) { + out_data[i] += static_cast(1); + } + math::CBlas::VMUL(n, x_data, out_data, out_data); + for (int i = 0; i < n; i++) { + out_data[i] *= static_cast(0.5); + } +#else + auto temp = (x * static_cast(M_SQRT1_2)).erf(); + out.device(d) = x * static_cast(0.5) * (static_cast(1) + temp); +#endif + } +}; + +template +struct GeluGradFunctor : BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto first = static_cast(0.5) * + (static_cast(1) + ((x * static_cast(M_SQRT1_2)).erf())); + + auto second = static_cast(0.5 * M_2_SQRTPI * M_SQRT1_2) * x * + (-static_cast(0.5) * x.square()).exp(); + dx.device(d) = dout * (first + second); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) +template +struct TanhFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.tanh(); + } +}; + +template +struct TanhGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * (static_cast(1) - out * out); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +// tanhshrink(x) = x - tanh(x) +// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) +template +struct TanhShrinkFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x - x.tanh(); + } +}; + +template +struct TanhShrinkGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * (x.tanh() * x.tanh()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +// tanhshrink(x) = x - tanh(x) +// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) +template +struct HardShrinkFunctor : public BaseActivationFunctor { + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + template + void operator()(Device d, X x, Out out) const { + auto temp1 = (x < static_cast(threshold * -1)).template cast().eval(); + auto temp2 = (x > static_cast(threshold)).template cast().eval(); + out.device(d) = x * (temp1 + temp2); + } +}; + +template +struct HardShrinkGradFunctor : public BaseActivationFunctor { + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp1 = (x < static_cast(threshold * -1)).template cast().eval(); + auto temp2 = (x > static_cast(threshold)).template cast().eval(); + dx.device(d) = dout * (temp1 + temp2).template cast(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +// softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0 +// otherwise +template +struct SoftShrinkFunctor : public BaseActivationFunctor { + float lambda; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"lambda", &lambda}}; + } + + template + void operator()(Device d, X x, Out out) const { + auto lambdaT = static_cast(lambda); + auto temp1 = (x > lambdaT).template cast().eval(); + auto temp2 = (x < -lambdaT).template cast().eval(); + out.device(d) = temp1 * (x - lambdaT) + temp2 * (x + lambdaT); + } +}; + +template +struct SoftShrinkGradFunctor : public BaseActivationFunctor { + float lambda; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"lambda", &lambda}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto lambdaT = static_cast(lambda); + auto temp1 = (x > lambdaT).template cast().eval(); + auto temp2 = (x < -lambdaT).template cast().eval(); + dx.device(d) = dout * (temp1 + temp2).template cast(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +// sqrt(x) = x^(1/2) +template +struct SqrtFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.sqrt(); + } +}; + +template +struct SqrtGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = static_cast(0.5) * dout / out; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +// rsqrt(x) = x^(-1/2) +template +struct RsqrtFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.rsqrt(); + } +}; + +template +struct RsqrtGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = static_cast(-0.5) * dout * out * out * out; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +// ceil(x) = ceiling(x) +template +struct CeilFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.ceil(); + } +}; + +template +struct ZeroGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = static_cast(0) * out; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kNoDeps; } +}; + +// floor(x) = flooring(x) +template +struct FloorFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.floor(); + } +}; + +template +struct Sine { + HOSTDEVICE T operator()(const T& val) const { return sin(val); } +}; + +template <> +struct Sine { + HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { + return platform::float16(sin(static_cast(val))); + } +}; + +template +struct Cosine { + HOSTDEVICE T operator()(const T& val) const { return cos(val); } +}; + +template <> +struct Cosine { + HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { + return platform::float16(cos(static_cast(val))); + } +}; + +// cosine'(x) = -sin(x) +template +struct CosGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = -dout * x.unaryExpr(Sine()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +// cosine(x) = cos(x) +template +struct CosFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Cosine()); + } +}; + +// sine'(x) = cos(x) +template +struct SinGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * x.unaryExpr(Cosine()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +// sine(x) = sin(x) +template +struct SinFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Sine()); + } +}; + +template +struct Acos { + HOSTDEVICE T operator()(const T& val) const { return acos(val); } +}; + +template <> +struct Acos { + HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { + return platform::float16(acos(static_cast(val))); + } +}; + +// Acos(x) = acos(x) +template +struct AcosFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Acos()); + } +}; + +// acos'(x) = -1/sqrt(1-x^2) +template +struct AcosGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = + -dout * static_cast(1) / (static_cast(1) - x.square()).sqrt(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct Asin { + HOSTDEVICE T operator()(const T& val) const { return asin(val); } +}; + +template <> +struct Asin { + HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { + return platform::float16(asin(static_cast(val))); + } +}; + +// Asin(x) = asin(x) +template +struct AsinFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Asin()); + } +}; + +// asin'(x) = 1/sqrt(1-x^2) +template +struct AsinGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = + dout * static_cast(1) / (static_cast(1) - x.square()).sqrt(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct Atan { + HOSTDEVICE T operator()(const T& val) const { return atan(val); } +}; + +template <> +struct Atan { + HOSTDEVICE platform::float16 operator()(const platform::float16& val) const { + return platform::float16(atan(static_cast(val))); + } +}; + +// Atan(x) = atan(x) +template +struct AtanFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Atan()); + } +}; + +// atan'(x) = 1 / (1 + x^2) +template +struct AtanGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * static_cast(1) / (static_cast(1) + x.square()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +// round(x) = [x] +template +struct RoundFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.round(); + } +}; + +// abs(x) = |x| +template +struct AbsFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.abs(); + } +}; + +template +struct AbsGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * x.sign(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepXOut; } +}; + +// reciprocal(x) = 1 / x +template +struct ReciprocalFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = static_cast(1) / x; + } +}; + +template +struct ReciprocalGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * static_cast(-1) * out * out; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +// log(x) = natural logarithm of x +template +struct LogFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.log(); + } +}; + +template +struct LogGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * (static_cast(1) / x); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +// square(x) = x^2 +template +struct SquareFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.square(); + } +}; + +template +struct SquareGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * static_cast(2) * x; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct BReluFunctor : public BaseActivationFunctor { + float t_min; + float t_max; + + // NOTE: Explicit hides the `BaseActivationFunctor::GetAttrs` + // not polymorphism for speed. + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"t_min", &t_min}, {"t_max", &t_max}}; + } + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = + x.cwiseMax(static_cast(t_min)).cwiseMin(static_cast(t_max)); + } +}; + +template +struct BReluGradFunctor : public BaseActivationFunctor { + float t_min; + float t_max; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"t_min", &t_min}, {"t_max", &t_max}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * + ((x > static_cast(t_min)) * (x < static_cast(t_max))) + .template cast(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +// relu6(x) = min(max(0, x), 6) +template +struct Relu6Functor : public BaseActivationFunctor { + float threshold; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = + x.cwiseMax(static_cast(0)).cwiseMin(static_cast(threshold)); + } +}; + +template +struct Relu6GradFunctor : public BaseActivationFunctor { + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = + dout * + ((out > static_cast(0)) * (out < static_cast(threshold))) + .template cast(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +// softplus(x) = log(1 + exp(x)) +// When x is a very large positive number, exp(x) may explode to inf, +// Using trick below for numerical stability +// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/ +// Then: softplus(x) = max(x, 0) + log(exp(-max(x, 0)) + exp(x - max(x, 0))) +template +struct SoftplusFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) { + auto temp = x.cwiseMax(static_cast(0)); // temp = max(x, 0) + out.device(d) = temp + (((-temp).exp() + (x - temp).exp()).log()); + } +}; + +// d(softplus(x))/dx = exp(x) / (1 + exp(x)) +// For numerical stability: +// d(softplus(x))/dx = exp(x - max(x, 0)) / (exp(-max(x, 0)) + +// exp(x - max(x, 0))) +template +struct SoftplusGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) { + auto temp = x.cwiseMax(static_cast(0)); // temp = max(x, 0) + dx.device(d) = + dout * ((x - temp).exp() / ((-temp).exp() + (x - temp).exp())); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +// softsign(x) = x / (1 + |x|) +template +struct SoftsignFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) { + out.device(d) = x / (static_cast(1) + x.abs()); + } +}; + +// d(softsign(x))/dx = 1 / (1 + |x|)^2 +// Taken from https://en.wikipedia.org/wiki/Activation_function +template +struct SoftsignGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) { + dx.device(d) = + dout * (static_cast(1) / (static_cast(1) + x.abs()).square()); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct SoftReluFunctor : public BaseActivationFunctor { + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Out out) const { + auto tmp = static_cast(threshold); + auto temp = x.cwiseMax(-tmp).cwiseMin(tmp); + out.device(d) = (static_cast(1) + temp.exp()).log(); + } +}; + +template +struct SoftReluGradFunctor : public BaseActivationFunctor { + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto tmp = static_cast(threshold); + auto temp = ((out > -tmp) * (out < tmp)).template cast().eval(); + dx.device(d) = dout * (static_cast(1) - (-out).exp()) * temp; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct LeakyReluFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.cwiseMax(static_cast(alpha) * x); + } +}; + +template +struct LeakyReluGradFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp1 = static_cast(alpha) * + (x < static_cast(0)).template cast().eval(); + auto temp2 = (x >= static_cast(0)).template cast().eval(); + dx.device(d) = dout * (temp1 + temp2).template cast(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct ELUFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.cwiseMax(static_cast(0)) + + (static_cast(alpha) * (x.exp() - static_cast(1))) + .cwiseMin(static_cast(0)); + } +}; + +template +struct ELUGradFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * (x > static_cast(0)).template cast() + + dout * static_cast(alpha) * x.exp() * + (x < static_cast(0)).template cast(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +// FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5198 +template +struct PowFunctor : public BaseActivationFunctor { + float factor; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"factor", &factor}}; + } + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.pow(static_cast(factor)); + } +}; + +template +struct PowGradFunctor : public BaseActivationFunctor { + float factor; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"factor", &factor}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * static_cast(factor) * + x.pow(static_cast(factor) - static_cast(1)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct STanhFunctor : public BaseActivationFunctor { + float scale_a; + float scale_b; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"scale_a", &scale_a}, {"scale_b", &scale_b}}; + } + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = + static_cast(scale_b) * (static_cast(scale_a) * x).tanh(); + } +}; + +template +struct STanhGradFunctor : public BaseActivationFunctor { + float scale_a; + float scale_b; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"scale_a", &scale_a}, {"scale_b", &scale_b}}; + } + + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto a = static_cast(scale_a); + auto b = static_cast(scale_b); + auto temp = (a * x).tanh() * (a * x).tanh(); + dx.device(d) = dout * a * b * (static_cast(1) - temp); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct ThresholdedReluFunctor : public BaseActivationFunctor { + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Out out) const { + auto th = static_cast(threshold); + out.device(d) = (x > th).template cast() * x; + } +}; + +template +struct ThresholdedReluGradFunctor : public BaseActivationFunctor { + float threshold; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}}; + } + + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto th = static_cast(threshold); + dx.device(d) = dout * (x > th).template cast(); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct HardSigmoidFunctor : public BaseActivationFunctor { + float slope; + float offset; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + + template + void operator()(Device d, X x, Out out) const { + auto temp = x * static_cast(slope) + static_cast(offset); + out.device(d) = + temp.cwiseMax(static_cast(0)).cwiseMin(static_cast(1)); + } +}; + +template +struct HardSigmoidGradFunctor : public BaseActivationFunctor { + float slope; + float offset; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"slope", &slope}, {"offset", &offset}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * + ((out > static_cast(0)) * (out < static_cast(1))) + .template cast() * + static_cast(slope); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct SwishFunctor : public BaseActivationFunctor { + float beta; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"beta", &beta}}; + } + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x / (static_cast(1) + (static_cast(-beta) * x).exp()); + } +}; + +template +struct SwishGradFunctor : public BaseActivationFunctor { + float beta; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"beta", &beta}}; + } + + template + void operator()(Device d, X x, Out fake_out, dOut dout, dX dx) const { + auto temp1 = static_cast(1) / + (static_cast(1) + (static_cast(-beta) * x).exp()); + auto out = x * temp1; + auto temp2 = temp1 * (static_cast(1) - (static_cast(beta) * out)); + dx.device(d) = dout * ((static_cast(beta) * out) + temp2); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +/* + * in arguments: x, out, ddx + * out arguments: ddout, dout, dx + */ +template +inline void ExtractActivationDoubleGradTensor( + const framework::ExecutionContext& ctx, const framework::Tensor** X, + const framework::Tensor** Out, const framework::Tensor** ddX, + framework::Tensor** dX, framework::Tensor** dOut, + framework::Tensor** ddOut) { + auto ddx_var = ctx.InputVar("DDX"); + auto ddo_var = ctx.OutputVar("DDOut"); + PADDLE_ENFORCE(ddx_var != nullptr, + "Cannot get input Variable Out, variable name = %s", + ctx.op().Input("DDX")); + if (CanBeUsedBySelectedRows.count(ctx.op().Type())) { + *ddX = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*ddx_var); + if (ddo_var) { + *ddOut = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( + ddo_var); + } + } else { + *ddX = ctx.Input("DDX"); + if (ddo_var) { + *ddOut = ctx.Output("DDOut"); + } + } + PADDLE_ENFORCE(*ddX != nullptr, + "Cannot get output tensor DDX, variable name = %s", + ctx.op().Output("DDX")); + + if (static_cast(kDepValue) & static_cast(kDepX)) { + auto x_var = ctx.InputVar("X"); + PADDLE_ENFORCE(x_var != nullptr, + "Cannot get input Variable Out, variable name = %s", + ctx.op().Input("X")); + auto dx_var = ctx.OutputVar("DX"); + if (CanBeUsedBySelectedRows.count(ctx.op().Type())) { + *X = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var); + if (dx_var) { + *dX = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( + dx_var); + } + } else { + *X = ctx.Input("X"); + if (dx_var) { + *dX = ctx.Output("DX"); + } + } + } else { + VLOG(10) << "Inplace activation of Op: " << ctx.op().Type(); + *X = *ddX; + } + if (static_cast(kDepValue) & static_cast(kDepOut)) { + auto out_var = ctx.InputVar("Out"); + PADDLE_ENFORCE(out_var != nullptr, + "Cannot get input tensor Out, variable name = %s", + ctx.op().Input("Out")); + auto dout_var = ctx.OutputVar("DOut"); + if (CanBeUsedBySelectedRows.count(ctx.op().Type())) { + *Out = + paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*out_var); + if (dout_var) { + *dOut = + paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( + dout_var); + } + } else { + *Out = ctx.Input("Out"); + if (dout_var) { + *dOut = ctx.Output("DOut"); + } + } + } else { + VLOG(10) << "Inplace activation of Op: " << ctx.op().Type(); + *Out = *ddX; + } +} + +template +class ActivationDoubleGradKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& ctx) const override { + const framework::Tensor *X, *Out, *ddX; + X = Out = ddX = nullptr; + framework::Tensor *ddOut, *dOut, *dX; + ddOut = dOut = dX = nullptr; + + ExtractActivationDoubleGradTensor(ctx, &X, &Out, &ddX, + &dX, &dOut, &ddOut); + + if (ddOut) ddOut->mutable_data(ctx.GetPlace()); + if (dOut) dOut->mutable_data(ctx.GetPlace()); + if (dX) dX->mutable_data(Out->dims(), ctx.GetPlace()); + + auto& place = ctx.template device_context(); + + Functor functor; + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = ctx.Attr(attr.first); + } + functor(place, X, Out, ddX, ddOut, dOut, dX); + } +}; + +template +struct ReluGradGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, const framework::Tensor* X, + const framework::Tensor* Out, const framework::Tensor* ddX, + framework::Tensor* ddOut, framework::Tensor* dOut, + framework::Tensor* dX) const { + auto* d = dev.eigen_device(); + auto ddx = framework::EigenVector::Flatten(detail::Ref(ddX)); + auto out = framework::EigenVector::Flatten(detail::Ref(Out)); + if (ddOut) { + auto ddout = framework::EigenVector::Flatten(detail::Ref(ddOut)); + ddout.device(*d) = ddx * (out > static_cast(0)).template cast(); + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct LeakyReluGradGradFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + template + void operator()(const Device& dev, const framework::Tensor* X, + const framework::Tensor* Out, const framework::Tensor* ddX, + framework::Tensor* ddOut, framework::Tensor* dOut, + framework::Tensor* dX) const { + auto* d = dev.eigen_device(); + auto ddx = framework::EigenVector::Flatten(detail::Ref(ddX)); + auto x = framework::EigenVector::Flatten(detail::Ref(X)); + if (ddOut) { + auto ddout = framework::EigenVector::Flatten(detail::Ref(ddOut)); + ddout.device(*d) = ddx * + ((x >= static_cast(0)).template cast().eval() + + static_cast(alpha) * + (x < static_cast(0)).template cast().eval()) + .template cast(); + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct SqrtGradGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, const framework::Tensor* Out, + const framework::Tensor* ddX, framework::Tensor* ddOut, + framework::Tensor* dOut, const framework::Tensor* dX) const { + auto* d = dev.eigen_device(); + auto ddx = framework::EigenVector::Flatten(detail::Ref(ddX)); + auto out = framework::EigenVector::Flatten(detail::Ref(Out)); + if (ddOut) { + auto ddout = framework::EigenVector::Flatten(detail::Ref(ddOut)); + ddout.device(*d) = ddx * static_cast(0.5) / out; + } + if (dOut) { + auto dx = framework::EigenVector::Flatten(detail::Ref(dX)); + auto dout = framework::EigenVector::Flatten(detail::Ref(dOut)); + dout.device(*d) = dx * ddx * static_cast(-1) / out; + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template +struct SquareGradGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, const framework::Tensor* X, + const framework::Tensor* ddX, framework::Tensor* ddOut, + const framework::Tensor* dOut, framework::Tensor* dX) const { + auto* d = dev.eigen_device(); + auto ddx = framework::EigenVector::Flatten(detail::Ref(ddX)); + auto x = framework::EigenVector::Flatten(detail::Ref(X)); + if (ddOut) { + auto ddout = framework::EigenVector::Flatten(detail::Ref(ddOut)); + ddout.device(*d) = ddx * static_cast(2) * x; + } + if (dX) { + auto dx = framework::EigenVector::Flatten(detail::Ref(dX)); + auto dout = framework::EigenVector::Flatten(detail::Ref(dOut)); + dx.device(*d) = ddx * static_cast(2) * dout; + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +// TODO(dengkaipeng): double gradient calculation for Square/Sqrt need +// DOut(dy) as input(not output), tensor extraction is different from +// others. Impliment extraction kernel seperately here. +inline void ExtractDoubleGradTensorWithInputDOut( + const framework::ExecutionContext& ctx, const framework::Tensor** X, + const framework::Tensor** ddX, framework::Tensor** dX, + const framework::Tensor** dOut, framework::Tensor** ddOut) { + // extract ddX(output), ddOut(input) + auto ddx_var = ctx.InputVar("DDX"); + auto ddo_var = ctx.OutputVar("DDOut"); + PADDLE_ENFORCE(ddx_var != nullptr, + "Cannot get input Variable Out, variable name = %s", + ctx.op().Input("DDX")); + *ddX = ctx.Input("DDX"); + if (ddo_var) { + *ddOut = ctx.Output("DDOut"); + } + PADDLE_ENFORCE(*ddX != nullptr, + "Cannot get output tensor DDX, variable name = %s", + ctx.op().Output("DDX")); + + // extract x(input), dx(output) + auto x_var = ctx.InputVar("X"); + PADDLE_ENFORCE(x_var != nullptr, + "Cannot get input Variable Out, variable name = %s", + ctx.op().Input("X")); + auto dx_var = ctx.OutputVar("DX"); + *X = ctx.Input("X"); + if (dx_var) { + *dX = ctx.Output("DX"); + } + + // extract dOut(input) + auto dout_var = ctx.InputVar("DOut"); + if (dout_var) { + *dOut = ctx.Input("DOut"); + } +} + +template +class SquareDoubleGradKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& ctx) const override { + const framework::Tensor *X, *ddX, *dOut; + X = ddX = dOut = nullptr; + framework::Tensor *dX, *ddOut; + dX = ddOut = nullptr; + + ExtractDoubleGradTensorWithInputDOut(ctx, &X, &ddX, &dX, &dOut, &ddOut); + + if (dX) dX->mutable_data(X->dims(), ctx.GetPlace()); + if (ddOut) ddOut->mutable_data(ctx.GetPlace()); + + auto& place = ctx.template device_context(); + + Functor functor; + functor(place, X, ddX, ddOut, dOut, dX); + } +}; + +template +class SqrtDoubleGradKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& ctx) const override { + const framework::Tensor *Out, *dX, *ddX; + Out = dX = ddX = nullptr; + framework::Tensor *ddOut, *dOut; + ddOut = dOut = nullptr; + + // extract ddx(input), ddout(output) + auto ddx_var = ctx.InputVar("DDX"); + auto ddo_var = ctx.OutputVar("DDOut"); + PADDLE_ENFORCE(ddx_var != nullptr, + "Cannot get input Variable DDX, variable name = %s", + ctx.op().Input("DDX")); + ddX = ctx.Input("DDX"); + if (ddo_var) { + ddOut = ctx.Output("DDOut"); + } + PADDLE_ENFORCE(ddX != nullptr, + "Cannot get input Variable DDX, variable name = %s", + ctx.op().Input("DDX")); + + // extract out(input), dout(output) + auto out_var = ctx.InputVar("Out"); + PADDLE_ENFORCE(out_var != nullptr, + "Cannot get input Variable Out, variable name = %s", + ctx.op().Input("Out")); + auto dout_var = ctx.OutputVar("DOut"); + Out = ctx.Input("Out"); + if (dout_var) { + dOut = ctx.Output("DOut"); + } + + // extract dx(input) + auto dx_var = ctx.InputVar("DX"); + PADDLE_ENFORCE(dx_var != nullptr, + "Cannot get input Variable DX, variable name = %s", + ctx.op().Input("DX")); + if (dx_var) { + dX = ctx.Input("DX"); + } + + if (dOut) dOut->mutable_data(Out->dims(), ctx.GetPlace()); + if (ddOut) ddOut->mutable_data(Out->dims(), ctx.GetPlace()); + + auto& place = ctx.template device_context(); + + Functor functor; + functor(place, Out, ddX, ddOut, dOut, dX); + } +}; + +} // namespace operators +} // namespace paddle + +#define FOR_EACH_ACTIVATION_OP(__macro) \ + __macro(sigmoid, Sigmoid, SigmoidFunctor, SigmoidGradFunctor); \ + __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ + __macro(exp, Exp, ExpFunctor, ExpGradFunctor); \ + __macro(gelu, Gelu, GeluFunctor, GeluGradFunctor); \ + __macro(tanh, Tanh, TanhFunctor, TanhGradFunctor); \ + __macro(atan, Atan, AtanFunctor, AtanGradFunctor); \ + __macro(softshrink, SoftShrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ + __macro(rsqrt, Rsqrt, RsqrtFunctor, RsqrtGradFunctor); \ + __macro(abs, Abs, AbsFunctor, AbsGradFunctor); \ + __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor); \ + __macro(floor, Floor, FloorFunctor, ZeroGradFunctor); \ + __macro(cos, Cos, CosFunctor, CosGradFunctor); \ + __macro(acos, Acos, AcosFunctor, AcosGradFunctor); \ + __macro(sin, Sin, SinFunctor, SinGradFunctor); \ + __macro(asin, Asin, AsinFunctor, AsinGradFunctor); \ + __macro(round, Round, RoundFunctor, ZeroGradFunctor); \ + __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ + __macro(log, Log, LogFunctor, LogGradFunctor); \ + __macro(brelu, BRelu, BReluFunctor, BReluGradFunctor); \ + __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \ + __macro(pow, Pow, PowFunctor, PowGradFunctor); \ + __macro(stanh, STanh, STanhFunctor, STanhGradFunctor); \ + __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor); \ + __macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor); \ + __macro(relu6, Relu6, Relu6Functor, Relu6GradFunctor); \ + __macro(tanh_shrink, TanhShrink, TanhShrinkFunctor, TanhShrinkGradFunctor); \ + __macro(elu, ELU, ELUFunctor, ELUGradFunctor); \ + __macro(hard_shrink, HardShrink, HardShrinkFunctor, HardShrinkGradFunctor); \ + __macro(hard_sigmoid, HardSigmoid, HardSigmoidFunctor, \ + HardSigmoidGradFunctor); \ + __macro(swish, Swish, SwishFunctor, SwishGradFunctor); \ + __macro(thresholded_relu, ThresholdedRelu, ThresholdedReluFunctor, \ + ThresholdedReluGradFunctor); diff --git a/paddle/fluid/operators/add_position_encoding_op.cc b/paddle/fluid/operators/add_position_encoding_op.cc new file mode 100644 index 00000000..2580c5a5 --- /dev/null +++ b/paddle/fluid/operators/add_position_encoding_op.cc @@ -0,0 +1,124 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/add_position_encoding_op.h" +#include + +namespace paddle { +namespace operators { + +class AddPositionEncodingOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "X(Input) of add_position_encoding_op should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("Out"), + "Out(Output) of add_position_encoding_op should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + platform::CPUPlace()); + } +}; + +class AddPositionEncodingOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + if (ctx->HasOutput(framework::GradVarName("X"))) { + auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); + ctx->SetOutputDim(framework::GradVarName("X"), out_dims); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + platform::CPUPlace()); + } +}; + +class AddPositionEncodingOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Input of AddPositionEncoding operator"); + AddOutput("Out", "Output of AddPositionEncoding operator"); + AddAttr("alpha", "The scale of Original Embedding.") + .SetDefault(1.0f) + .AddCustomChecker([](const float& alpha) { + PADDLE_ENFORCE(alpha >= 0.0f, "'alpha' must be above 0.0."); + }); + AddAttr("beta", "The scale of Position Embedding.") + .SetDefault(1.0f) + .AddCustomChecker([](const float& beta) { + PADDLE_ENFORCE(beta >= 0.0f, "'beta' must be between 0.0."); + }); + AddComment(R"DOC( + Add Position Encoding Operator. + + The add position encoding calculates the output based on the input, alpha, beta. + The size of each dimension of the parameters checked in the infer-shape. + )DOC"); + } +}; + +class AddPositionEncodingGradOpDescMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("add_position_encoding_grad"); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plt = paddle::platform; + +REGISTER_OPERATOR(add_position_encoding, ops::AddPositionEncodingOp, + ops::AddPositionEncodingOpMaker, + ops::AddPositionEncodingGradOpDescMaker); +REGISTER_OPERATOR(add_position_encoding_grad, ops::AddPositionEncodingOpGrad); + +REGISTER_OP_CPU_KERNEL( + add_position_encoding, + ops::AddPositionEncodingKernel, + ops::AddPositionEncodingKernel); + +REGISTER_OP_CPU_KERNEL( + add_position_encoding_grad, + ops::AddPositionEncodingGradKernel, + ops::AddPositionEncodingGradKernel); diff --git a/paddle/fluid/operators/add_position_encoding_op.h b/paddle/fluid/operators/add_position_encoding_op.h new file mode 100644 index 00000000..0b40d3de --- /dev/null +++ b/paddle/fluid/operators/add_position_encoding_op.h @@ -0,0 +1,106 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/safe_ref.h" + +namespace paddle { +namespace operators { + +template +class AddPositionEncodingKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* X = context.Input("X"); + auto& x_lod = X->lod(); + auto* src_ptr = X->data(); + + auto* Out = context.Output("Out"); + auto* dst_ptr = Out->mutable_data(context.GetPlace()); + + float alpha = context.Attr("alpha"); + float beta = context.Attr("beta"); + + auto x_dim = X->dims(); + int batch_size = 0; + int max_seq_len = 0; + int enc_size = 0; + + if (x_lod.empty()) { + PADDLE_ENFORCE( + x_dim.size() == 3UL, + "The input X of Add Position Encoding should be 3-D Tensor!"); + batch_size = x_dim[0]; + max_seq_len = x_dim[1]; + enc_size = x_dim[2]; + } else { + PADDLE_ENFORCE( + x_dim.size() == 2UL, + "The input X of Add Position Encoding should be 2-D LoDTensor!"); + PADDLE_ENFORCE( + x_lod.size() == 1UL, + "The Add Position Encoding Op only supports lod_level == 1!"); + batch_size = x_lod[0].size() - 1; + max_seq_len = -1; + enc_size = x_dim[1]; + } + + PADDLE_ENFORCE(enc_size % 2 == 0, "Only support even encode size!"); + + const int half_size = enc_size / 2; + for (int i = 0; i < batch_size; ++i) { + const int max_length = + x_lod.empty() ? max_seq_len : x_lod[0][i + 1] - x_lod[0][i]; + for (int j = 0; j < max_length; ++j) { + for (int k = 0; k < half_size; ++k) { + const double val = + (half_size > 1) + ? j / pow(10000.0, static_cast(k) / (half_size - 1)) + : j / 10000.0; + dst_ptr[k] = src_ptr[k] * alpha + sin(val) * beta; + dst_ptr[half_size + k] = + src_ptr[half_size + k] * alpha + cos(val) * beta; + } + src_ptr += enc_size; + dst_ptr += enc_size; + } + } + } +}; + +template +class AddPositionEncodingGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* dOut = + context.Input(framework::GradVarName("Out")); + auto dout = framework::EigenVector::Flatten(*dOut); + + auto* dX = + context.Output(framework::GradVarName("X")); + dX->mutable_data(context.GetPlace()); + auto dx = framework::EigenVector::Flatten(*dX); + + float alpha = context.Attr("alpha"); + + auto* place = + context.template device_context().eigen_device(); + dx.device(*place) = dout * static_cast(alpha); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/affine_channel_op.cc b/paddle/fluid/operators/affine_channel_op.cc new file mode 100644 index 00000000..da063541 --- /dev/null +++ b/paddle/fluid/operators/affine_channel_op.cc @@ -0,0 +1,334 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class AffineChannelOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor) Feature map input can be a 4D tensor with order NCHW " + "or NHWC. It also can be a 2D tensor and C is the second " + "dimension."); + AddInput("Scale", + "(Tensor) 1D input of shape (C), the c-th element " + "is the scale factor of the affine transformation " + "for the c-th channel of the input."); + AddInput("Bias", + "(Tensor) 1D input of shape (C), the c-th element " + "is the bias of the affine transformation for the " + "c-th channel of the input."); + AddAttr( + "data_layout", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + AddOutput("Out", "(Tensor) A tensor of the same shape and order with X."); + AddComment(R"DOC( + +Applies a separate affine transformation to each channel of the input. Useful +for replacing spatial batch norm with its equivalent fixed transformation. +The input also can be 2D tensor and applies a affine transformation in second +dimension. + +$$Out = Scale*X + Bias$$ + +)DOC"); + } +}; + +class AffineChannelOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of AffineChannelOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Scale"), + "Input(Scale) of AffineChannelOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Bias"), + "Input(Bias) of AffineChannelOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of AffineChannelOp should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto scale_dims = ctx->GetInputDim("Scale"); + auto b_dims = ctx->GetInputDim("Bias"); + const framework::DataLayout data_layout = framework::StringToDataLayout( + ctx->Attrs().Get("data_layout")); + + const int64_t C = (data_layout == framework::DataLayout::kNCHW + ? x_dims[1] + : x_dims[x_dims.size() - 1]); + + PADDLE_ENFORCE_EQ(scale_dims.size(), 1UL); + PADDLE_ENFORCE_EQ(b_dims.size(), 1UL); + if (ctx->IsRuntime() || scale_dims[0] > 0) { + PADDLE_ENFORCE_EQ(scale_dims[0], C); + } + if (ctx->IsRuntime() || b_dims[0] > 0) { + PADDLE_ENFORCE_EQ(b_dims[0], C); + } + + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", "Out"); + } +}; + +class AffineChannelOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null."); + if (ctx->HasOutput(framework::GradVarName("X"))) { + PADDLE_ENFORCE(ctx->HasInput("Scale"), + "Input(Scale) should not be null."); + ctx->SetOutputDim(framework::GradVarName("X"), + ctx->GetInputDim(framework::GradVarName("Out"))); + } + if (ctx->HasOutput(framework::GradVarName("Scale"))) { + // Scale@GRAD and Bias@GRAD must exist at the same time. + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")), + "Output(Scale@GRAD) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + ctx->SetOutputDim(framework::GradVarName("Scale"), + ctx->GetInputDim("Scale")); + ctx->SetOutputDim(framework::GradVarName("Bias"), + ctx->GetInputDim("Scale")); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.GetPlace()); + } +}; + +class AffineChannelGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType("affine_channel_grad"); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetInput("Scale", Input("Scale")); + + op->SetAttrMap(Attrs()); + + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale")); + op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias")); + + return std::unique_ptr(op); + } +}; + +template +using EigenArrayMap = + Eigen::Map>; +template +using ConstEigenArrayMap = + Eigen::Map>; +template +using EigenVectorArrayMap = Eigen::Map>; +template +using ConstEigenVectorArrayMap = + Eigen::Map>; + +template +class AffineChannelKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + + auto* y = ctx.Output("Out"); + y->mutable_data(ctx.GetPlace()); + + const framework::DataLayout layout = + framework::StringToDataLayout(ctx.Attr("data_layout")); + + auto dims = x->dims(); + int N = dims[0]; + int C = layout == framework::DataLayout::kNCHW ? dims[1] + : dims[dims.size() - 1]; + int HxW = x->numel() / N / C; + + auto* scale_d = scale->data(); + auto* bias_d = bias->data(); + ConstEigenVectorArrayMap a_e(scale_d, C); + ConstEigenVectorArrayMap b_e(bias_d, C); + + auto* x_d = x->data(); + auto* y_d = y->data(); + if (layout == framework::DataLayout::kNCHW) { + int stride = C * HxW; + for (int i = 0; i < N; i++) { + ConstEigenArrayMap x_e(x_d, HxW, C); + EigenArrayMap y_e(y_d, HxW, C); + y_e = (x_e.rowwise() * a_e.transpose()).rowwise() + b_e.transpose(); + x_d += stride; + y_d += stride; + } + } else { + int num = N * HxW; + ConstEigenArrayMap x_e(x_d, C, num); + EigenArrayMap y_e(y_d, C, num); + y_e = (x_e.colwise() * a_e).colwise() + b_e; + } + } +}; + +template +class AffineChannelGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* scale = ctx.Input("Scale"); + auto* dy = ctx.Input(framework::GradVarName("Out")); + + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dscale = + ctx.Output(framework::GradVarName("Scale")); + auto* dbias = ctx.Output(framework::GradVarName("Bias")); + + const framework::DataLayout layout = + framework::StringToDataLayout(ctx.Attr("data_layout")); + + auto dims = x->dims(); + int N = dims[0]; + int C = layout == framework::DataLayout::kNCHW ? dims[1] + : dims[dims.size() - 1]; + int HxW = x->numel() / N / C; + + auto* dy_d = dy->data(); + auto* scale_d = scale->data(); + ConstEigenVectorArrayMap scale_e(scale_d, C); + + T* dx_d = dx ? dx->mutable_data(ctx.GetPlace()) : nullptr; + T* dscale_d = dscale ? dscale->mutable_data(ctx.GetPlace()) : nullptr; + T* dbias_d = dbias ? dbias->mutable_data(ctx.GetPlace()) : nullptr; + EigenVectorArrayMap dscale_e(dscale_d, C); + EigenVectorArrayMap dbias_e(dbias_d, C); + + if (layout == framework::DataLayout::kNCHW) { + // compute dx + int stride = C * HxW; + if (dx) { + for (int i = 0; i < N; i++) { + ConstEigenArrayMap dy_e(dy_d, HxW, C); + EigenArrayMap dx_e(dx_d, HxW, C); + dx_e = dy_e.rowwise() * scale_e.transpose(); + dy_d += stride; + dx_d += stride; + } + } + // compute dscale and dbias + if (dscale && dbias) { + auto* x_d = x->data(); + dy_d = dy->data(); + for (int i = 0; i < N; i++) { + ConstEigenArrayMap x_e(x_d, HxW, C); + ConstEigenArrayMap dy_e(dy_d, HxW, C); + if (i == 0) { + dscale_e = (x_e * dy_e).colwise().sum(); + } else { + dscale_e += (x_e * dy_e).colwise().sum(); + } + if (i == 0) { + dbias_e = dy_e.colwise().sum(); + } else { + dbias_e += dy_e.colwise().sum(); + } + x_d += stride; + dy_d += stride; + } + } + } else { + int num = N * HxW; + ConstEigenArrayMap dy_e(dy_d, C, num); + // compute dx + if (dx) { + EigenArrayMap dx_e(dx_d, C, num); + dx_e = dy_e.colwise() * scale_e; + } + // compute dscale and dbias + if (dscale && dbias) { + auto* x_d = x->data(); + ConstEigenArrayMap x_e(x_d, C, num); + dscale_e = (x_e * dy_e).rowwise().sum(); + dbias_e = dy_e.rowwise().sum(); + } + } + } +}; + +class AffineChannelNoNeedBufferVarsInference + : public framework::NoNeedBufferVarsInference { + public: + using framework::NoNeedBufferVarsInference::NoNeedBufferVarsInference; + + private: + inline bool HasInput(const std::string& name) const { + auto& inputs = Inputs(); + auto iter = inputs.find(name); + if (iter == inputs.end() || iter->second.empty()) { + return false; + } else { + return iter->second[0] != framework::kEmptyVarName; + } + } + + public: + std::unordered_set operator()() const { + if (!HasInput(framework::GradVarName("Scale")) && + !HasInput(framework::GradVarName("Bias"))) { + return {"X"}; + } else { + return {}; + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CPU = paddle::platform::CPUDeviceContext; + +REGISTER_OPERATOR(affine_channel, ops::AffineChannelOp, + ops::AffineChannelOpMaker, ops::AffineChannelGradMaker); +REGISTER_OPERATOR(affine_channel_grad, ops::AffineChannelOpGrad, + ops::AffineChannelNoNeedBufferVarsInference); + +REGISTER_OP_CPU_KERNEL(affine_channel, ops::AffineChannelKernel, + ops::AffineChannelKernel); +REGISTER_OP_CPU_KERNEL(affine_channel_grad, + ops::AffineChannelGradKernel, + ops::AffineChannelGradKernel); diff --git a/paddle/fluid/operators/affine_channel_op.cu b/paddle/fluid/operators/affine_channel_op.cu new file mode 100644 index 00000000..6bc0a263 --- /dev/null +++ b/paddle/fluid/operators/affine_channel_op.cu @@ -0,0 +1,194 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "cub/cub.cuh" +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace paddle { +namespace operators { + +template +__global__ void KeAffineChannelCUDA(const T* x, const T* scale, const T* bias, + const int C, const int HxW, const int num, + T* y) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (int i = gid; i < num; i += stride) { + const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C; + if (HasBias) { + y[i] = scale[c] * x[i] + bias[c]; + } else { + y[i] = scale[c] * x[i]; + } + } +} + +template +class AffineChannelCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + + auto* y = ctx.Output("Out"); + y->mutable_data(ctx.GetPlace()); + + const framework::DataLayout layout = + framework::StringToDataLayout(ctx.Attr("data_layout")); + auto& dev_ctx = ctx.template device_context(); + + auto dims = x->dims(); + const int num = x->numel(); + int N = dims[0]; + int C = layout == framework::DataLayout::kNCHW ? dims[1] + : dims[dims.size() - 1]; + int HxW = num / N / C; + + const T* x_d = x->data(); + const T* scale_d = scale->data(); + const T* bias_d = bias->data(); + T* y_d = y->data(); + + int block = 1024; + int grid = (num + block - 1) / block; + + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + grid = std::min(std::max(max_threads / block, 1), grid); + if (layout == framework::DataLayout::kNCHW) { + KeAffineChannelCUDA<<>>( + x_d, scale_d, bias_d, C, HxW, num, y_d); + } else { + KeAffineChannelCUDA<<>>( + x_d, scale_d, bias_d, C, HxW, num, y_d); + } + } +}; + +template +__global__ void AffineChannelScaleBiasGradientCUDAKernel( + const T* dy, const T* x, const int N, const int C, const int HxW, T* dscale, + T* dbias) { + const int outer_size = C; + const int inner_size = N * HxW; + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage ds_storage; + __shared__ typename BlockReduce::TempStorage db_storage; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + T ds_sum = 0; + T db_sum = 0; + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + ds_sum += dy[index] * x[index]; + db_sum += dy[index]; + } + __syncthreads(); + auto ds_out = + BlockReduce(ds_storage).Reduce(static_cast(ds_sum), cub::Sum()); + auto db_out = + BlockReduce(db_storage).Reduce(static_cast(db_sum), cub::Sum()); + __syncthreads(); + if (threadIdx.x == 0) { + dscale[i] = ds_out; + dbias[i] = db_out; + } + } +} + +template +class AffineChannelGradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* dy = ctx.Input(framework::GradVarName("Out")); + + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dscale = + ctx.Output(framework::GradVarName("Scale")); + auto* dbias = ctx.Output(framework::GradVarName("Bias")); + + const framework::DataLayout layout = + framework::StringToDataLayout(ctx.Attr("data_layout")); + auto& dev_ctx = ctx.template device_context(); + + auto dims = dy->dims(); + const int num = dy->numel(); + int N = dims[0]; + int C = layout == framework::DataLayout::kNCHW ? dims[1] + : dims[dims.size() - 1]; + int HxW = num / N / C; + + const T* dy_d = dy->data(); + const T* s_d = scale->data(); + + T* dx_d = dx ? dx->mutable_data(ctx.GetPlace()) : nullptr; + T* ds_d = dscale ? dscale->mutable_data(ctx.GetPlace()) : nullptr; + T* db_d = dbias ? dbias->mutable_data(ctx.GetPlace()) : nullptr; + + const int block = 1024; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + int grid1 = (num + block - 1) / block; + int grid2 = std::min(C, max_blocks); + if (layout == framework::DataLayout::kNCHW) { + if (dx) { + KeAffineChannelCUDA<<>>( + dy_d, s_d, nullptr, C, HxW, num, dx_d); + } + if (dscale && dbias) { + const T* x_d = x->data(); + AffineChannelScaleBiasGradientCUDAKernel< + T, block, framework::DataLayout::kNCHW><<>>( + dy_d, x_d, N, C, HxW, ds_d, db_d); + } + } else { + if (dx) { + KeAffineChannelCUDA<<>>( + dy_d, s_d, nullptr, C, HxW, num, dx_d); + } + if (dscale && dbias) { + const T* x_d = x->data(); + AffineChannelScaleBiasGradientCUDAKernel< + T, block, framework::DataLayout::kNHWC><<>>( + dy_d, x_d, N, C, HxW, ds_d, db_d); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CUDA = paddle::platform::CUDADeviceContext; + +REGISTER_OP_CUDA_KERNEL(affine_channel, + ops::AffineChannelCUDAKernel, + ops::AffineChannelCUDAKernel); +REGISTER_OP_CUDA_KERNEL(affine_channel_grad, + ops::AffineChannelGradCUDAKernel, + ops::AffineChannelGradCUDAKernel); diff --git a/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc b/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc new file mode 100644 index 00000000..ed71594b --- /dev/null +++ b/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc @@ -0,0 +1,112 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using ScopedSpatialTransformerDescriptor = + platform::ScopedSpatialTransformerDescriptor; + +template +class CUDNNAffineGridOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); + auto* theta = ctx.Input("Theta"); + auto* output = ctx.Output("Output"); + const T* theta_data = theta->data(); + + int n = theta->dims()[0]; + auto size_attr = ctx.Attr>("output_shape"); + Tensor h_sizes; + int* h_size_data; + if (size_attr.size() == 0) { + auto* output_shape = ctx.Input("OutputShape"); + framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes); + h_size_data = h_sizes.data(); + } else { + h_size_data = h_sizes.mutable_data({4}, platform::CPUPlace()); + h_size_data[0] = n; + h_size_data[1] = size_attr[1]; + h_size_data[2] = size_attr[2]; + h_size_data[3] = size_attr[3]; + } + + T* output_data = output->mutable_data( + {n, h_size_data[2], h_size_data[3], 2}, ctx.GetPlace()); + ScopedSpatialTransformerDescriptor st_desc; + cudnnSpatialTransformerDescriptor_t cudnn_st_desc = + st_desc.descriptor(4, h_size_data); + + PADDLE_ENFORCE(platform::dynload::cudnnSpatialTfGridGeneratorForward( + handle, cudnn_st_desc, theta_data, output_data)); + } +}; + +template +class CUDNNAffineGridGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); + auto output_grad = ctx.Input(framework::GradVarName("Output")); + auto theta_grad = ctx.Output(framework::GradVarName("Theta")); + + int n = output_grad->dims()[0]; + auto size_attr = ctx.Attr>("output_shape"); + Tensor h_sizes; + int* h_size_data; + if (size_attr.size() == 0) { + auto* output_shape = ctx.Input("OutputShape"); + framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes); + h_size_data = h_sizes.data(); + } else { + h_size_data = h_sizes.mutable_data({4}, platform::CPUPlace()); + h_size_data[0] = n; + h_size_data[1] = size_attr[1]; + h_size_data[2] = size_attr[2]; + h_size_data[3] = size_attr[3]; + } + + ScopedSpatialTransformerDescriptor st_desc; + cudnnSpatialTransformerDescriptor_t cudnn_st_desc = + st_desc.descriptor(4, h_size_data); + + const T* output_grad_data = output_grad->data(); + T* theta_grad_data = theta_grad->mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE(platform::dynload::cudnnSpatialTfGridGeneratorBackward( + handle, cudnn_st_desc, output_grad_data, theta_grad_data)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace plat = paddle::platform; +REGISTER_OP_KERNEL(affine_grid, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNAffineGridOpKernel, + paddle::operators::CUDNNAffineGridOpKernel); +REGISTER_OP_KERNEL(affine_grid_grad, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNAffineGridGradOpKernel, + paddle::operators::CUDNNAffineGridGradOpKernel); diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc new file mode 100644 index 00000000..9d7100cc --- /dev/null +++ b/paddle/fluid/operators/affine_grid_op.cc @@ -0,0 +1,234 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/affine_grid_op.h" +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cudnn_helper.h" +#endif + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +struct Linspace { + void operator()(T start, T end, int count, framework::Tensor* numbers, + const framework::ExecutionContext& ctx) { + T* number_data = numbers->mutable_data({count}, platform::CPUPlace()); + T slice = (end - start) / (T)(count - 1); + for (int i = 0; i < count; ++i) { + number_data[i] = start + (T)i * slice; + } + } +}; + +class AffineGridOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Theta"), + "Input(Theta) of AffineGridOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Output"), + "Output(Output) of AffineGridOp should not be null."); + auto theta_dims = ctx->GetInputDim("Theta"); + PADDLE_ENFORCE(theta_dims.size() == 3, + "AffineGrid's Input(Theta) should be 3-D tensor."); + + auto output_shape = ctx->Attrs().Get>("output_shape"); + if (output_shape.size() == 0) { + PADDLE_ENFORCE(ctx->HasInput("OutputShape"), + "Input(OutputShape) of AffineGridOp should not be null if " + "attr(output_shape) is not configured."); + auto output_shape_dims = ctx->GetInputDim("OutputShape"); + PADDLE_ENFORCE(output_shape_dims.size() == 1, + "AffineGrid's Input(OutputShape) should be 1-D tensor."); + } else { + PADDLE_ENFORCE(output_shape.size() == 4, + "The size of attr(output_shape) should be 4."); + } + + PADDLE_ENFORCE(theta_dims[1] == 2, "Input(theta) dims[1] should be 2."); + PADDLE_ENFORCE(theta_dims[2] == 3, "Input(theta) dims[2] should be 3."); + // N * H * W * 2 + ctx->SetOutputDim("Output", + framework::make_ddim({theta_dims[0], -1, -1, 2})); + ctx->ShareLoD("Theta", "Output"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + framework::LibraryType library{framework::LibraryType::kPlain}; +#ifdef PADDLE_WITH_CUDA + if (platform::CanCUDNNBeUsed(ctx)) { + library = framework::LibraryType::kCUDNN; + } +#endif + auto data_type = ctx.Input("Theta")->type(); + return framework::OpKernelType(data_type, ctx.GetPlace(), + framework::DataLayout::kAnyLayout, library); + } +}; + +class AffineGridOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "Theta", + "(Tensor) A batch of affine transform parameters with shape [N, 2, 3]. " + "It is used to transform coordinate (x_0, y_0) to coordinate (x_1, " + "y_1)."); + AddInput("OutputShape", + "(Tensor) The shape of target image with format [N, C, H, W].") + .AsDispensable(); + AddOutput("Output", "(Tensor) Output Tensor with shape [N, H, W, 2]."); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(true); + AddAttr>( + "output_shape", + "The target output image shape with format [N, C, H, W].") + .SetDefault(std::vector()); + + AddComment(R"DOC( + It generates a grid of (x,y) coordinates using the parameters of the + affine transformation that correspond to a set of points where the input + feature map should be sampled to produce the transformed output feature map. + + Given: + Theta = [[[x_11, x_12, x_13] + [x_14, x_15, x_16]] + [[x_21, x_22, x_23] + [x_24, x_25, x_26]]] + + OutputShape = [2, 3, 5, 5] + + Step 1: + + Generate relative coordinates according to OutputShape. + The values of relative coordinates are in the interval between -1 and 1. + The shape of the relative coordinates is [2, H, W] as below: + + C = [[[-1. -1. -1. -1. -1. ] + [-0.5 -0.5 -0.5 -0.5 -0.5] + [ 0. 0. 0. 0. 0. ] + [ 0.5 0.5 0.5 0.5 0.5] + [ 1. 1. 1. 1. 1. ]] + [[-1. -0.5 0. 0.5 1. ] + [-1. -0.5 0. 0.5 1. ] + [-1. -0.5 0. 0.5 1. ] + [-1. -0.5 0. 0.5 1. ] + [-1. -0.5 0. 0.5 1. ]]] + C[0] is the coordinates in height axis and C[1] is the coordinates in width axis. + + Step2: + Tanspose and reshape C to shape [H * W, 2] and append ones to last dimension. The we get: + C_ = [[-1. -1. 1. ] + [-0.5 -1. 1. ] + [ 0. -1. 1. ] + [ 0.5 -1. 1. ] + [ 1. -1. 1. ] + [-1. -0.5 1. ] + [-0.5 -0.5 1. ] + [ 0. -0.5 1. ] + [ 0.5 -0.5 1. ] + [ 1. -0.5 1. ] + [-1. 0. 1. ] + [-0.5 0. 1. ] + [ 0. 0. 1. ] + [ 0.5 0. 1. ] + [ 1. 0. 1. ] + [-1. 0.5 1. ] + [-0.5 0.5 1. ] + [ 0. 0.5 1. ] + [ 0.5 0.5 1. ] + [ 1. 0.5 1. ] + [-1. 1. 1. ] + [-0.5 1. 1. ] + [ 0. 1. 1. ] + [ 0.5 1. 1. ] + [ 1. 1. 1. ]] + Step3: + Compute output by equation $$Output[i] = C_ * Theta[i]^T$$ + )DOC"); + } +}; + +class AffineGridOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + if (ctx->HasOutput(framework::GradVarName("Theta"))) { + auto output_dims = ctx->GetInputDim(framework::GradVarName("Output")); + ctx->SetOutputDim(framework::GradVarName("Theta"), + {output_dims[0], 2, 3}); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + framework::LibraryType library_{framework::LibraryType::kPlain}; +#ifdef PADDLE_WITH_CUDA + if (platform::CanCUDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kCUDNN; + } +#endif + return framework::OpKernelType(ctx.Input("Theta")->type(), + ctx.GetPlace(), + framework::DataLayout::kAnyLayout, library_); + } +}; + +class AffineGridGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType("affine_grid_grad"); + op->SetInput("Theta", Input("Theta")); + op->SetInput("OutputShape", Input("OutputShape")); + op->SetInput(framework::GradVarName("Output"), OutputGrad("Output")); + + op->SetAttrMap(Attrs()); + + op->SetOutput(framework::GradVarName("Theta"), InputGrad("Theta")); + return std::unique_ptr(op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(affine_grid, ops::AffineGridOp, ops::AffineGridOpMaker, + ops::AffineGridGradMaker); +REGISTER_OPERATOR(affine_grid_grad, ops::AffineGridOpGrad); + +REGISTER_OP_CPU_KERNEL( + affine_grid, + ops::AffineGridOpKernel, + ops::AffineGridOpKernel); +REGISTER_OP_CPU_KERNEL( + affine_grid_grad, + ops::AffineGridGradOpKernel, + ops::AffineGridGradOpKernel); diff --git a/paddle/fluid/operators/affine_grid_op.h b/paddle/fluid/operators/affine_grid_op.h new file mode 100644 index 00000000..73df8a38 --- /dev/null +++ b/paddle/fluid/operators/affine_grid_op.h @@ -0,0 +1,178 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenTensor = framework::EigenTensor; + +using Array1 = Eigen::DSizes; +using Array2 = Eigen::DSizes; +using Array3 = Eigen::DSizes; +using Array4 = Eigen::DSizes; + +/** + *Return a tensor with evenly spaced numbers over a specified interval. + */ +template +struct Linspace { + void operator()(T start, T end, int count, framework::Tensor* numbers, + const framework::ExecutionContext& ctx); +}; + +template +inline void GetIdxMap(int n, int h, int w, Tensor* grid, + const framework::ExecutionContext& ctx) { + auto& place = *ctx.template device_context().eigen_device(); + grid->mutable_data({n, h, w, 3}, ctx.GetPlace()); + auto grid_t = EigenTensor::From(*grid); + // Get indexes of height with shape [height, width, 1] + Tensor h_idx; + Linspace linspace; + linspace((T)-1, (T)1, h, &h_idx, ctx); + auto h_idx_t = EigenTensor::From(h_idx); + // Get indexes of width with shape [height, width, 1] + Tensor w_idx; + linspace((T)-1, (T)1, w, &w_idx, ctx); + auto w_idx_t = EigenTensor::From(w_idx); + // Get constant ones tensor with shape [height, width, 1] + Tensor ones; + ones.mutable_data({h, w, 1}, ctx.GetPlace()); + auto ones_t = EigenTensor::From(ones).setConstant((T)1); + // Get grid tensor with shape [n, h, w, 3] by concatenating h_idx, w_idx and + // ones + Tensor w_idx_map; + w_idx_map.mutable_data({h, w, 1}, ctx.GetPlace()); + auto w_idx_map_t = EigenTensor::From(w_idx_map); + Tensor h_idx_map; + h_idx_map.mutable_data({h, w, 1}, ctx.GetPlace()); + auto h_idx_map_t = EigenTensor::From(h_idx_map); + Tensor w_h_idx_map; + w_h_idx_map.mutable_data({h, w, 2}, ctx.GetPlace()); + auto w_h_idx_map_t = EigenTensor::From(w_h_idx_map); + Tensor w_h_one_idx_map; + w_h_one_idx_map.mutable_data({h, w, 3}, ctx.GetPlace()); + auto w_h_one_idx_map_t = EigenTensor::From(w_h_one_idx_map); + + w_idx_map_t.device(place) = w_idx_t.reshape(Array2(1, w)) + .broadcast(Array2(h, 1)) + .reshape(Array3(h, w, 1)); + + h_idx_map_t.device(place) = h_idx_t.reshape(Array2(1, h)) + .broadcast(Array2(w, 1)) + .shuffle(Array2(1, 0)) + .reshape(Array3(h, w, 1)); + + w_h_idx_map_t.device(place) = w_idx_map_t.concatenate(h_idx_map_t, 2); + w_h_one_idx_map_t.device(place) = w_h_idx_map_t.concatenate(ones_t, 2); + grid_t.device(place) = w_h_one_idx_map_t.reshape(Array4(1, h, w, 3)) + .broadcast(Array4(n, 1, 1, 1)); +} + +template +class AffineGridOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* theta = ctx.Input("Theta"); + int n = theta->dims()[0]; + auto size_attr = ctx.Attr>("output_shape"); + int h = 0; + int w = 0; + if (size_attr.size() == 0) { + auto* output_shape = ctx.Input("OutputShape"); + Tensor h_sizes; + framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes); + const int* h_size_data = h_sizes.data(); + h = h_size_data[2]; + w = h_size_data[3]; + } else { + h = size_attr[2]; + w = size_attr[3]; + } + auto* output = ctx.Output("Output"); + output->mutable_data({n, h, w, 2}, ctx.GetPlace()); + math::SetConstant()( + ctx.template device_context(), output, + static_cast(0)); + Tensor grid; + GetIdxMap(n, h, w, &grid, ctx); + // output = grid * theta.T + // TODO(wanghaoshuang): Refine batched matrix multiply + auto blas = math::GetBlas(ctx); + for (int i = 0; i < n; ++i) { + Tensor sliced_grid = grid.Slice(i, i + 1).Resize( + {static_cast(h) * static_cast(w), 3}); + Tensor sliced_theta = theta->Slice(i, i + 1).Resize({2, 3}); + Tensor sliced_out = output->Slice(i, i + 1).Resize( + {static_cast(h) * static_cast(w), 2}); + blas.MatMul(sliced_grid, false, sliced_theta, true, T(1), &sliced_out, + T(0)); + } + } +}; + +template +class AffineGridGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto output_grad = ctx.Input(framework::GradVarName("Output")); + auto theta_grad = ctx.Output(framework::GradVarName("Theta")); + int n = output_grad->dims()[0]; + auto size_attr = ctx.Attr>("output_shape"); + int h = 0; + int w = 0; + if (size_attr.size() == 0) { + auto* output_shape = ctx.Input("OutputShape"); + Tensor h_sizes; + framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes); + const int* h_size_data = h_sizes.data(); + h = h_size_data[2]; + w = h_size_data[3]; + } else { + h = size_attr[2]; + w = size_attr[3]; + } + theta_grad->mutable_data({n, 2, 3}, ctx.GetPlace()); + math::SetConstant()( + ctx.template device_context(), theta_grad, + static_cast(0)); + Tensor grid; + GetIdxMap(n, h, w, &grid, ctx); + // output = grid * theta.T + // TODO(wanghaoshuang): Refine batched matrix multiply + auto blas = math::GetBlas(ctx); + for (int i = 0; i < n; ++i) { + Tensor sliced_grid = grid.Slice(i, i + 1).Resize( + {static_cast(h) * static_cast(w), 3}); + Tensor sliced_out_grad = output_grad->Slice(i, i + 1).Resize( + {static_cast(h) * static_cast(w), 2}); + Tensor sliced_theta_grad = theta_grad->Slice(i, i + 1).Resize({2, 3}); + blas.MatMul(sliced_out_grad, true, sliced_grid, false, T(1), + &sliced_theta_grad, T(0)); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/anakin/CMakeLists.txt b/paddle/fluid/operators/anakin/CMakeLists.txt new file mode 100644 index 00000000..5eacefc6 --- /dev/null +++ b/paddle/fluid/operators/anakin/CMakeLists.txt @@ -0,0 +1,2 @@ +op_library(anakin_engine_op DEPS anakin_engine anakin_op_converter) +# file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(anakin_engine);\n") diff --git a/paddle/fluid/operators/anakin/anakin_engine_op.cc b/paddle/fluid/operators/anakin/anakin_engine_op.cc new file mode 100644 index 00000000..58db16ea --- /dev/null +++ b/paddle/fluid/operators/anakin/anakin_engine_op.cc @@ -0,0 +1,53 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_CUDA + +#include +#include + +#include "paddle/fluid/operators/anakin/anakin_engine_op.h" + +namespace paddle { + +namespace operators { + +class AnakinEngineOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Xs", "A list of inputs.").AsDuplicable(); + AddOutput("Ys", "A list of outputs").AsDuplicable(); + AddAttr("subgraph", "the subgraph."); + AddAttr( + "engine_key", + "The engine_key here is used to distinguish different TRT Engines"); + AddAttr("sub_block", "the trt block"); + AddComment("Anakin engine operator."); + } +}; + +class AnakinEngineInferVarType : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override {} +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(anakin_engine, ops::AnakinEngineOp, ops::AnakinEngineOpMaker, + ops::AnakinEngineOpMaker); + +#endif // PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/anakin/anakin_engine_op.h b/paddle/fluid/operators/anakin/anakin_engine_op.h new file mode 100644 index 00000000..b4aaa228 --- /dev/null +++ b/paddle/fluid/operators/anakin/anakin_engine_op.h @@ -0,0 +1,138 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifdef PADDLE_WITH_CUDA + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/inference/anakin/convert/op_converter.h" +#include "paddle/fluid/inference/anakin/engine.h" +#include "paddle/fluid/inference/analysis/helper.h" + +namespace paddle { +namespace operators { + +using inference::Singleton; +using inference::anakin::AnakinEngine; + +class AnakinEngineOp : public framework::OperatorBase { + private: + std::vector input_names_; + std::unordered_set param_names_; + std::string engine_key_; + std::string engine_serialized_data_; + bool use_gpu_; + bool enable_int8_; + + public: + AnakinEngineOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) { + input_names_ = Inputs("Xs"); + engine_key_ = Attr("engine_key"); + auto params = Attr>("parameters"); + use_gpu_ = Attr("use_gpu"); + enable_int8_ = Attr("enable_int8"); + for (const auto ¶m : params) { + param_names_.insert(param); + } + } + + protected: + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + RunAnakin(scope, dev_place); + } + + void RunAnakin(const framework::Scope &scope, + const platform::Place &dev_place) const { + PADDLE_ENFORCE(!input_names_.empty(), "should pass more than one inputs"); + + std::vector output_maps = + Attr>("output_name_mapping"); + + std::map inputs; + for (const auto &x : Inputs("Xs")) { + if (param_names_.count(x)) continue; + auto &t = + inference::analysis::GetFromScope(scope, x); + + inputs.insert({x, &t}); + } + + std::map outputs; + int output_index = 0; + for (const auto &y : Outputs("Ys")) { + auto *fluid_v = scope.FindVar(y); + PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y); + auto *fluid_t = fluid_v->GetMutable(); + outputs.insert({output_maps[output_index], fluid_t}); + output_index += 1; + } + if (enable_int8_) { + Execute<::anakin::Precision::INT8>(inputs, outputs, dev_place); + } else { + Execute<::anakin::Precision::FP32>(inputs, outputs, dev_place); + } + } + + template <::anakin::Precision PrecisionT> + void Execute(const std::map &inputs, + const std::map &outputs, + const platform::Place &dev_place) const { + if (use_gpu_) { +#ifdef PADDLE_WITH_CUDA + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + auto stream = + reinterpret_cast(dev_ctx) + .stream(); + auto *engine = + inference::Singleton>::Global() + .Get(engine_key_); + engine->Execute(inputs, outputs, stream); +#endif + } else { +#ifdef ANAKIN_X86_PLACE + auto *engine = + inference::Singleton>::Global() + .Get(engine_key_); + engine->Execute(inputs, outputs); +#else + LOG(FATAL) << "Unknown Platform for AnakinEngine!"; +#endif + } + } +}; + +} // namespace operators +} // namespace paddle + +#endif // PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/arg_max_op.cc b/paddle/fluid/operators/arg_max_op.cc new file mode 100644 index 00000000..7fe9a0df --- /dev/null +++ b/paddle/fluid/operators/arg_max_op.cc @@ -0,0 +1,32 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/arg_min_max_op_base.h" + +REGISTER_OPERATOR(arg_max, paddle::operators::ArgMinMaxOp, + paddle::operators::ArgMaxOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL( + arg_max, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel); diff --git a/paddle/fluid/operators/arg_max_op.cu b/paddle/fluid/operators/arg_max_op.cu new file mode 100644 index 00000000..85e4f981 --- /dev/null +++ b/paddle/fluid/operators/arg_max_op.cu @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/arg_min_max_op_base.h" + +REGISTER_OP_CUDA_KERNEL( + arg_max, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel); diff --git a/paddle/fluid/operators/arg_min_max_op_base.h b/paddle/fluid/operators/arg_min_max_op_base.h new file mode 100644 index 00000000..bf7b83bb --- /dev/null +++ b/paddle/fluid/operators/arg_min_max_op_base.h @@ -0,0 +1,162 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/string/printf.h" + +namespace paddle { +namespace operators { + +enum ArgMinMaxType { kArgMin, kArgMax }; + +template +struct ArgMinMaxFunctor {}; + +#define DECLARE_ARG_MIN_MAX_FUNCTOR(eigen_op_type, enum_argminmax_value) \ + template \ + struct ArgMinMaxFunctor { \ + void operator()(const DeviceContext& ctx, const framework::LoDTensor& in, \ + framework::LoDTensor* out, int64_t axis) { \ + auto in_eigen = framework::EigenTensor::From(in); \ + auto out_eigen = framework::EigenTensor::From(*out); \ + out_eigen.device(*(ctx.eigen_device())) = \ + in_eigen.eigen_op_type(axis).template cast(); \ + } \ + } + +DECLARE_ARG_MIN_MAX_FUNCTOR(argmin, ArgMinMaxType::kArgMin); +DECLARE_ARG_MIN_MAX_FUNCTOR(argmax, ArgMinMaxType::kArgMax); + +template +class ArgMinMaxKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& x = *(ctx.Input("X")); + auto& out = *(ctx.Output("Out")); + out.mutable_data(ctx.GetPlace()); + auto axis = ctx.Attr("axis"); + auto x_rank = x.dims().size(); + if (axis < 0) axis += x_rank; + auto& dev_ctx = ctx.template device_context(); + +#define CALL_ARG_MINMAX_FUNCTOR(rank) \ + ArgMinMaxFunctor \ + functor##rank; \ + functor##rank(dev_ctx, x, &out, axis) + + switch (x.dims().size()) { + case 1: + CALL_ARG_MINMAX_FUNCTOR(1); + break; + case 2: + CALL_ARG_MINMAX_FUNCTOR(2); + break; + case 3: + CALL_ARG_MINMAX_FUNCTOR(3); + break; + case 4: + CALL_ARG_MINMAX_FUNCTOR(4); + break; + case 5: + CALL_ARG_MINMAX_FUNCTOR(5); + break; + case 6: + CALL_ARG_MINMAX_FUNCTOR(6); + break; + default: + PADDLE_THROW( + "%s operator doesn't supports tensors whose ranks are greater " + "than 6.", + (EnumArgMinMaxValue == kArgMin ? "argmin" : "argmax")); + break; +#undef CALL_ARG_MINMAX_FUNCTOR + } + } +}; + +template +using ArgMinKernel = + ArgMinMaxKernel; + +template +using ArgMaxKernel = + ArgMinMaxKernel; + +class ArgMinMaxOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null"); + const auto& x_dims = ctx->GetInputDim("X"); + int64_t axis = ctx->Attrs().Get("axis"); + PADDLE_ENFORCE(axis >= -x_dims.size() && axis < x_dims.size(), + "'axis' must be inside [-Rank(X), Rank(X))"); + + auto x_rank = x_dims.size(); + if (axis < 0) axis += x_rank; + + std::vector vec; + for (int64_t i = 0; i < axis; i++) vec.push_back(x_dims[i]); + for (int64_t i = axis + 1; i < x_rank; i++) vec.push_back(x_dims[i]); + ctx->SetOutputDim("Out", framework::make_ddim(vec)); + } +}; + +class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker { + protected: + virtual const char* OpName() const = 0; + virtual const char* Name() const = 0; + + public: + void Make() override { + AddInput("X", "Input tensor."); + AddOutput("Out", "Output tensor."); + AddAttr("axis", "The axis in which to compute the arg indics."); + AddComment(string::Sprintf(R"DOC( + %s Operator. + + Computes the indices of the %s elements of the input tensor's element + along the provided axis. +)DOC", + OpName(), Name())); + } +}; + +class ArgMinOpMaker : public BaseArgMinMaxOpMaker { + protected: + const char* OpName() const override { return "ArgMin"; } + const char* Name() const override { return "min"; } +}; + +class ArgMaxOpMaker : public BaseArgMinMaxOpMaker { + protected: + const char* OpName() const override { return "ArgMax"; } + const char* Name() const override { return "max"; } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/arg_min_op.cc b/paddle/fluid/operators/arg_min_op.cc new file mode 100644 index 00000000..23b24735 --- /dev/null +++ b/paddle/fluid/operators/arg_min_op.cc @@ -0,0 +1,32 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/arg_min_max_op_base.h" + +REGISTER_OPERATOR(arg_min, paddle::operators::ArgMinMaxOp, + paddle::operators::ArgMinOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL( + arg_min, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel); diff --git a/paddle/fluid/operators/arg_min_op.cu b/paddle/fluid/operators/arg_min_op.cu new file mode 100644 index 00000000..47d7c8b1 --- /dev/null +++ b/paddle/fluid/operators/arg_min_op.cu @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/arg_min_max_op_base.h" + +REGISTER_OP_CUDA_KERNEL( + arg_min, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel); diff --git a/paddle/fluid/operators/argsort_op.cc b/paddle/fluid/operators/argsort_op.cc new file mode 100644 index 00000000..d25160f4 --- /dev/null +++ b/paddle/fluid/operators/argsort_op.cc @@ -0,0 +1,87 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/argsort_op.h" + +namespace paddle { +namespace operators { + +class ArgsortOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ArgsortOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ArgsortOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Indices"), + "Output(Indices) of ArgsortOp should not be null."); + + auto in_dims = ctx->GetInputDim("X"); + int axis = ctx->Attrs().Get("axis"); + + auto num_dims = in_dims.size(); + PADDLE_ENFORCE(axis < num_dims, + "Attr(axis) %d of ArgsortOp is out of bounds for Input(X)'s " + "rank %d.", + axis, num_dims); + PADDLE_ENFORCE(axis >= -num_dims, + "Attr(axis) %d of ArgsortOp must be not less than " + "-rank(Input(X)) (%d).", + axis, num_dims); + + ctx->ShareDim("X", "Out"); + ctx->ShareDim("X", "Indices"); + ctx->ShareLoD("X", "Out"); + ctx->ShareLoD("X", "Indices"); + } +}; + +class ArgsortOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor) The input of Argsort op."); + AddOutput("Out", + "(Tensor) The sorted tensor of Argsort op, with the same " + "shape as Input(X)."); + AddOutput("Indices", + "(Tensor) The indices of a tensor giving the sorted order, with " + "the same shape as Input(X)."); + AddComment(R"DOC( +Argsort operator + +Performs sorting on the input tensor along the given axis and outputs two +tensors, Output(Out) and Output(Indices). They reserve the same shape +with Input(X), and Output(Out) represents the sorted tensor while +Output(Indices) gives the sorted order along the given axis Attr(axis). + + )DOC"); + AddAttr("axis", + "(int, default -1) The axis along which to sort the tensor. " + "When axis < 0, the actual axis will be the |axis|'th " + "counting backwards. Default -1, the last dimension.") + .SetDefault(-1); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(argsort, ops::ArgsortOp, ops::ArgsortOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(argsort, + ops::ArgsortKernel, + ops::ArgsortKernel); diff --git a/paddle/fluid/operators/argsort_op.cu b/paddle/fluid/operators/argsort_op.cu new file mode 100644 index 00000000..7d5199aa --- /dev/null +++ b/paddle/fluid/operators/argsort_op.cu @@ -0,0 +1,151 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/argsort_op.h" +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using platform::PADDLE_CUDA_NUM_THREADS; + +const int kMaxRank = 9; // The max rank of a tensor allowed in Fluid + +__global__ void ComputeTargetIdx(const int64_t* in_dims, int dims_size, + int axis, int64_t n, int64_t* trg_idx, + int64_t* med_ids) { + int64_t index = threadIdx.x + blockDim.x * blockIdx.x; + if (index < n) { + int64_t shape_out_axis[kMaxRank - 1] = {0}; + int64_t dims_out_axis[kMaxRank - 1] = {0}; + int64_t tmp = index; + int64_t pos_in_axis = 0; + int64_t i = dims_size - 2; + int64_t dim_axis = 0; + for (int64_t j = dims_size - 1; j >= 0; --j) { + int64_t dim = in_dims[j]; + if (j != axis) { + shape_out_axis[i] = tmp % dim; + dims_out_axis[i] = dim; + i--; + } else { + dim_axis = dim; + pos_in_axis = tmp % dim_axis; + } + tmp /= dim; + } + int64_t group = (dims_size > 1) ? shape_out_axis[0] : 0; + for (int64_t j = 0; j < dims_size - 2; ++j) { + group = group * dims_out_axis[j + 1] + shape_out_axis[j + 1]; + } + + int64_t traget_idx = group * dim_axis + pos_in_axis; + trg_idx[index] = traget_idx; + med_ids[traget_idx] = pos_in_axis; + } +} + +template +__global__ void PermuteInData(const T* in, const int64_t* trg_idx, int64_t n, + T* med_out) { + int index = threadIdx.x + blockDim.x * blockIdx.x; + if (index < n) { + med_out[trg_idx[index]] = in[index]; + } +} + +template +__global__ void Sort(int64_t axis_dim, int64_t groups, T* med_out, + int64_t* med_ids) { + int index = threadIdx.x + blockDim.x * blockIdx.x; + if (index < groups) { + thrust::sort_by_key(thrust::device, med_out + index * axis_dim, + med_out + axis_dim * (1 + index), + med_ids + index * axis_dim); + } +} + +template +__global__ void PermuteMediateData(const T* med_out, const int64_t* med_ids, + const int64_t* trg_idx, int64_t n, T* out, + int64_t* indices) { + int index = threadIdx.x + blockDim.x * blockIdx.x; + if (index < n) { + out[index] = med_out[trg_idx[index]]; + indices[index] = med_ids[trg_idx[index]]; + } +} + +template +class ArgsortOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + auto* indices = ctx.Output("Indices"); + int axis = ctx.Attr("axis"); + + auto in_dims = input->dims(); + axis = (axis < 0) ? (in_dims.size() + axis) : axis; + + const T* in_data = input->data(); + T* out_data = output->mutable_data(ctx.GetPlace()); + int64_t* ids_data = indices->mutable_data(ctx.GetPlace()); + + int64_t numel = input->numel(); + int64_t groups = numel / in_dims[axis]; + + std::vector in_dims_vec = vectorize(in_dims); + thrust::device_vector in_dims_dev(in_dims_vec.begin(), + in_dims_vec.end()); + int64_t* in_dims_data = thrust::raw_pointer_cast(in_dims_dev.data()); + // Mediate tensor for sorting data and indices + Tensor mediate_output, mediate_indices; + T* med_out_data = + mediate_output.mutable_data(input->dims(), ctx.GetPlace()); + int64_t* med_ids_data = + mediate_indices.mutable_data(in_dims, ctx.GetPlace()); + // Target index of each element along the given axis in the mediate tensors + Tensor trg_idx_t; + int64_t* trg_idx = trg_idx_t.mutable_data(in_dims, ctx.GetPlace()); + + auto stream = ctx.cuda_device_context().stream(); + const int num_threads = PADDLE_CUDA_NUM_THREADS; + + ComputeTargetIdx<<<(numel - 1) / num_threads + 1, num_threads, 0, stream>>>( + in_dims_data, in_dims.size(), axis, numel, trg_idx, med_ids_data); + + PermuteInData<<<(numel - 1) / num_threads + 1, num_threads, 0, stream>>>( + in_data, trg_idx, numel, med_out_data); + + Sort<<<(groups - 1) / num_threads + 1, num_threads, 0, stream>>>( + in_dims[axis], groups, med_out_data, med_ids_data); + + PermuteMediateData<<<(numel - 1) / num_threads + 1, num_threads, 0, + stream>>>(med_out_data, med_ids_data, trg_idx, numel, + out_data, ids_data); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(argsort, paddle::operators::ArgsortOpCUDAKernel, + paddle::operators::ArgsortOpCUDAKernel); diff --git a/paddle/fluid/operators/argsort_op.h b/paddle/fluid/operators/argsort_op.h new file mode 100644 index 00000000..7e9112cf --- /dev/null +++ b/paddle/fluid/operators/argsort_op.h @@ -0,0 +1,81 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class ArgsortKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + auto* indices = ctx.Output("Indices"); + int axis = ctx.Attr("axis"); + + auto in_dims = input->dims(); + axis = (axis < 0) ? (in_dims.size() + axis) : axis; + + const T* in_data = input->data(); + T* out_data = output->mutable_data(ctx.GetPlace()); + int64_t* ids_data = indices->mutable_data(ctx.GetPlace()); + + int64_t groups = input->numel() / in_dims[axis]; + int64_t stride = (axis == in_dims.size() - 1) + ? 1 + : framework::product(framework::slice_ddim( + in_dims, axis + 1, in_dims.size())); + + for (int64_t i = 0; i < groups; ++i) { + int64_t idx = i; + std::vector shape_vec(in_dims.size(), 0); + for (int64_t dim = in_dims.size() - 1; dim >= 0; --dim) { + if (dim != axis) { + shape_vec[dim] = idx % in_dims[dim]; + idx /= in_dims[dim]; + } + } + + int64_t start_index = shape_vec[0]; + for (int64_t dim = 0; dim < in_dims.size() - 1; ++dim) { + start_index = start_index * in_dims[dim + 1] + shape_vec[dim + 1]; + } + + std::vector org_index_vec(in_dims[axis], start_index); + for (int64_t j = 1; j < in_dims[axis]; ++j) { + org_index_vec[j] += j * stride; + } + + std::sort(org_index_vec.begin(), org_index_vec.end(), + [in_data](const int64_t v1, const int64_t v2) { + return in_data[v1] < in_data[v2]; + }); + + for (size_t j = 0; j < org_index_vec.size(); ++j) { + int64_t index = start_index + j * stride; + out_data[index] = in_data[org_index_vec[j]]; + ids_data[index] = (org_index_vec[j] - start_index) / stride; + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/array_operator.h b/paddle/fluid/operators/array_operator.h new file mode 100644 index 00000000..4309f0a5 --- /dev/null +++ b/paddle/fluid/operators/array_operator.h @@ -0,0 +1,58 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +class ArrayOp : public framework::OperatorBase { + public: + ArrayOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + protected: + size_t GetOffset(const framework::Scope &scope, + const platform::Place &place) const { + auto *i = scope.FindVar(Input("I")); + PADDLE_ENFORCE(i != nullptr, "I must be set"); + auto &i_tensor = i->Get(); + PADDLE_ENFORCE_EQ(i_tensor.numel(), 1); + + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + size_t offset; + if (platform::is_gpu_place(i_tensor.place())) { + // FIXME: Avoid copy from GPU to CPU + framework::Tensor t; + framework::TensorCopy(i_tensor, platform::CPUPlace(), dev_ctx, &t); + dev_ctx.Wait(); + offset = static_cast(*t.data()); + } else { + offset = static_cast(*i_tensor.data()); + } + VLOG(10) << " Offset = " << offset; + return offset; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc new file mode 100644 index 00000000..d942391b --- /dev/null +++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc @@ -0,0 +1,222 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include +#include + +#include "paddle/fluid/framework/lod_rank_table.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +using LoD = framework::LoD; + +struct ArrayToLoDFunctor; +template +struct ArrayToLoDFunctorImpl { + const ArrayToLoDFunctor *prev_functor_; + DeviceContext *dev_ctx_; + + template + void apply(); +}; + +struct ArrayToLoDFunctor : public boost::static_visitor { + std::vector in; + mutable framework::Tensor *out; + + template + void operator()(Place place) const { + auto &pool = platform::DeviceContextPool::Instance(); + if (std::is_same::value) { + Apply(static_cast(pool.Get(place))); + } else { +#ifdef PADDLE_WITH_CUDA + Apply(static_cast(pool.Get(place))); +#else + PADDLE_THROW("Fluid is not compiled with CUDA"); +#endif + } + } + + template + void Apply(DeviceContext *dev_ctx) const { + ArrayToLoDFunctorImpl functor; + functor.dev_ctx_ = dev_ctx; + functor.prev_functor_ = this; + framework::VisitDataType(out->type(), functor); + } +}; + +template +template +void ArrayToLoDFunctorImpl::apply() { + math::ConcatFunctor func; + func(*dev_ctx_, prev_functor_->in, 0, prev_functor_->out); +} + +class ArrayToLoDTensorOp : public framework::OperatorBase { + public: + ArrayToLoDTensorOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + auto &x = scope.FindVar(Input("X"))->Get(); + auto &rank_table = + scope.FindVar(Input("RankTable"))->Get(); + auto *out = + scope.FindVar(Output("Out"))->GetMutable(); + + // Check dims, place and data type of input's elements and infer output's + // dim + PADDLE_ENFORCE(!x.empty(), "There's no element in the input array."); + int rank = x[0].dims().size(); + platform::Place place = x[0].place(); + auto data_type = x[0].type(); + int64_t batch_size = x[0].dims()[0]; + framework::DDim ins_dims = rank > 1 + ? framework::slice_ddim(x[0].dims(), 1, rank) + : framework::make_ddim({0}); + for (size_t i = 1; i < x.size(); ++i) { + auto ins_i_dims = rank > 1 ? framework::slice_ddim(x[i].dims(), 1, rank) + : framework::make_ddim({0}); + PADDLE_ENFORCE_EQ(ins_i_dims, ins_dims, + "The dimension of the %zu'th element in LoDTensorArray " + "differs from previous ones.", + i); + PADDLE_ENFORCE(x[i].place() == place, + "The place class of the %zu'th element in LoDTensorArray " + "differs from previous ones.", + i); + PADDLE_ENFORCE(x[i].type() == data_type, + "The date type of the %zu'th element in LoDTensorArray " + "differs from previous ones.", + i); + batch_size += x[i].dims()[0]; + } + auto ins_dim_vec = framework::vectorize(ins_dims); + ins_dim_vec.insert(ins_dim_vec.begin(), batch_size); + framework::DDim out_dims = framework::make_ddim(ins_dim_vec); + out->Resize(out_dims); + out->mutable_data(place, data_type); + + auto &table_items = rank_table.items(); + std::vector table_item_idx(table_items.size()); + // table_item_idx = range(table_items_idx.size()) + std::iota(table_item_idx.begin(), table_item_idx.end(), 0); + std::sort(table_item_idx.begin(), table_item_idx.end(), + [&](size_t a, size_t b) { + return table_items[a].index < table_items[b].index; + }); + + // Build LoDTensor `out` + framework::LoD *out_lod = out->mutable_lod(); + out_lod->clear(); + auto prefix_lod = rank_table.coarse_lod(); + prefix_lod.emplace_back(); + auto &cur_level_lod = prefix_lod.back(); + cur_level_lod.push_back(0); + ArrayToLoDFunctor functor; + for (size_t idx : table_item_idx) { + cur_level_lod.push_back(cur_level_lod.back() + table_items[idx].length); + PADDLE_ENFORCE_LE(table_items[idx].length, x.size()); + for (size_t x_idx = 0; x_idx < table_items[idx].length; ++x_idx) { + auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset( + x[x_idx].lod(), idx, idx + 1, 0); + + auto &lod_length = lod_and_offset.first; + framework::AppendLoD(out_lod, lod_length); + + size_t start_offset = lod_and_offset.second.first; + size_t end_offset = lod_and_offset.second.second; + VLOG(10) << "idx=" << idx << " x_idx=" << x_idx << " [" + << ", " << end_offset << "]"; + // Copy data + PADDLE_ENFORCE_GE(end_offset, start_offset); + size_t len = end_offset - start_offset; + if (len == 0) { + continue; + } + functor.in.emplace_back(x[x_idx].Slice(start_offset, end_offset)); + } + } + functor.out = out; + platform::VisitPlace(place, functor); + out_lod->insert(out_lod->begin(), prefix_lod.begin(), prefix_lod.end()); + } +}; + +class ArrayToLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(std::vector) A vector of tensors that is going to " + "be casted to a big LoDTensor."); + AddInput("RankTable", + "(LoDRankTable) RankTable provides the coarse lod infomation to " + "build the output LoDTensor. See " + "'paddle/framework/lod_rank_table.h' for more details."); + AddOutput("Out", "(LoDTensor) The LoDTensor formed by input tensor array."); + AddComment( + R"DOC(This Op build a big LoDTensor from a std::vector + and a LoDRankTable. It is supposed to be used in getting dynamic RNN's + outputs back to a normal LoDTensor. The std::vector + would be the output of RNN Op and the LoDRankTable would be build + with RNN's input.)DOC"); + } +}; + +class ArrayToLoDTensorInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("X"), + "ArrayToLoDTensorOp must has input X."); + PADDLE_ENFORCE(context->HasInput("RankTable"), + "ArrayToLoDTensorOp must has input RankTable."); + context->SetOutputDim("Out", context->GetInputDim("X")); + } +}; + +class ArrayToLoDTensorGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("lod_tensor_to_array"); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetInput("RankTable", Input("RankTable")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(array_to_lod_tensor, ops::ArrayToLoDTensorOp, + ops::ArrayToLoDTensorOpProtoMaker, + ops::ArrayToLoDTensorInferShape, + ops::ArrayToLoDTensorGradMaker); diff --git a/paddle/fluid/operators/assign_op.cc b/paddle/fluid/operators/assign_op.cc new file mode 100644 index 00000000..d9294048 --- /dev/null +++ b/paddle/fluid/operators/assign_op.cc @@ -0,0 +1,145 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +class AssignFunctor { + public: + AssignFunctor(framework::Variable *out, + const platform::DeviceContext &dev_ctx) + : out_(out), dev_ctx_(dev_ctx) {} + + void operator()(const framework::LoDTensor &lod_tensor) const { + auto &out_tensor = *out_->GetMutable(); + copy_tensor(lod_tensor, &out_tensor); + } + + void operator()(const framework::LoDTensorArray &array) const { + auto &out_array = *out_->GetMutable(); + out_array.resize(array.size()); + for (size_t i = 0; i < array.size(); ++i) { + copy_tensor(array[i], &out_array[i]); + } + } + + void operator()(const framework::SelectedRows &rows) const { + framework::SelectedRows &out_rows = + *out_->GetMutable(); + out_rows.set_rows(rows.rows()); + out_rows.set_height(rows.height()); + auto &t = rows.value(); + auto *m = out_rows.mutable_value(); + framework::TensorCopy(t, t.place(), dev_ctx_, m); + } + + template + void operator()(const T &v) const { + PADDLE_THROW("Not support type for assign op %s", typeid(T).name()); + } + + private: + void copy_tensor(const framework::LoDTensor &lod_tensor, + framework::LoDTensor *out) const { + if (lod_tensor.numel() == 0) return; + auto &out_tensor = *out; + TensorCopy(lod_tensor, lod_tensor.place(), dev_ctx_, &out_tensor); + out_tensor.set_lod(lod_tensor.lod()); + } + + framework::Variable *out_; + const platform::DeviceContext &dev_ctx_; +}; + +class AssignOp : public framework::OperatorBase { + public: + AssignOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto *x = scope.FindVar(Input("X")); + if (x == nullptr) { + return; + } + auto *out = scope.FindVar(Output("Out")); + PADDLE_ENFORCE( + out != nullptr, + "The Output(Out) should not be null if the Input(X) is set."); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + framework::VisitVarType(*x, AssignFunctor(out, dev_ctx)); + } +}; + +class AssignOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(LoDTensor, SelectedRows or LoDTensorArray) The input variable " + "could be LoDTensor, SelectedRows or LoDTensorArray.") + .AsDispensable(); + AddOutput("Out", + "(LoDTensor, SelectedRows or LoDTensorArray) The type of output " + "is the same as input X."); + AddComment(R"DOC(Assign Operator + +Out = X, when type in [LoDTensor/SelectedRows/LoDTensorArray] +raise error if the type is not listed above. +)DOC"); + } +}; + +class AssignInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + if (context->HasInput("X")) { + auto type = context->GetInputsVarType("X")[0]; + if (type == framework::proto::VarType::SELECTED_ROWS || + type == framework::proto::VarType::LOD_TENSOR) { + context->SetOutputDim("Out", context->GetInputDim("X")); + } + } + } +}; + +class AssignGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *op = new framework::OpDesc(); + op->SetType("assign"); + op->SetInput("X", OutputGrad("Out")); + op->SetOutput("Out", InputGrad("X")); + return std::unique_ptr(op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(assign, ops::AssignOp, ops::AssignGradMaker, + ops::AssignInferShape, ops::AssignOpProtoMaker); diff --git a/paddle/fluid/operators/assign_value_op.cc b/paddle/fluid/operators/assign_value_op.cc new file mode 100644 index 00000000..a757916b --- /dev/null +++ b/paddle/fluid/operators/assign_value_op.cc @@ -0,0 +1,76 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/assign_value_op.h" +#include +#include + +namespace paddle { +namespace operators { + +class AssignValueOp : public framework::OperatorWithKernel { + public: + AssignValueOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of AssignValueOp should not be null."); + auto shape = ctx->Attrs().Get>("shape"); + ctx->SetOutputDim("Out", framework::make_ddim(shape)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::proto::VarType::Type(ctx.Attr("dtype")), + ctx.GetPlace()); + } +}; + +class AssignValueOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddOutput("Out", "(Tensor) Output tensor of assign_value operator."); + AddAttr>("shape", + "(vector) " + "Shape of values."); + AddAttr("dtype", "data type of values") + .InEnum({framework::proto::VarType::INT32, + framework::proto::VarType::FP32}); + AddAttr>("fp32_values", "store the float values") + .SetDefault({}); + AddAttr>("int32_values", "store the int values") + .SetDefault({}); + AddComment(R"DOC( +AssignValue operator + +$$Out = values$$ +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(assign_value, ops::AssignValueOp, ops::AssignValueOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(assign_value, ops::AssignValueKernel, + ops::AssignValueKernel); diff --git a/paddle/fluid/operators/assign_value_op.cu.cc b/paddle/fluid/operators/assign_value_op.cu.cc new file mode 100644 index 00000000..08bfde5d --- /dev/null +++ b/paddle/fluid/operators/assign_value_op.cu.cc @@ -0,0 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/assign_value_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(assign_value, ops::AssignValueKernel, + ops::AssignValueKernel); diff --git a/paddle/fluid/operators/assign_value_op.h b/paddle/fluid/operators/assign_value_op.h new file mode 100644 index 00000000..e749d6f6 --- /dev/null +++ b/paddle/fluid/operators/assign_value_op.h @@ -0,0 +1,51 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { + +template +class AssignValueKernel : public framework::OpKernel { + public: + virtual void Compute(const framework::ExecutionContext& ctx) const { + auto shape = ctx.Attr>("shape"); + auto* out = ctx.Output("Out"); + int dtype = ctx.Attr("dtype"); + const char* value_name = nullptr; + switch (dtype) { + case framework::proto::VarType::INT32: + value_name = "int32_values"; + break; + case framework::proto::VarType::FP32: + value_name = "fp32_values"; + break; + default: + PADDLE_THROW("Unsupported dtype for assign_value_op: %d", dtype); + break; + } + auto values = ctx.Attr>(value_name); + framework::TensorFromVector(values, ctx.device_context(), out); + out->Resize(framework::make_ddim(shape)); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc new file mode 100644 index 00000000..f991bef9 --- /dev/null +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -0,0 +1,426 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/attention_lstm_op.h" +#include +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/cpu_vec.h" +#include "paddle/fluid/operators/math/fc_compute.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { + +void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Assert only one Input(X) of AttentionLSTM."); + PADDLE_ENFORCE(ctx->HasInput("C0"), + "Assert only one Input(C0) of AttentionLSTM."); + PADDLE_ENFORCE(ctx->HasInput("LSTMWeight"), + "Assert only one Input(LSTMWeight) of AttentionLSTM."); + PADDLE_ENFORCE(ctx->HasInput("LSTMBias"), + "Assert only one Input(LSTMBias) of AttentionLSTM."); + PADDLE_ENFORCE(ctx->HasInput("AttentionWeight"), + "Assert only one Input(AttentionWeight) of AttentionLSTM."); + + PADDLE_ENFORCE(ctx->HasOutput("Hidden"), + "Assert only one Output(Hidden) of AttentionLSTM."); + PADDLE_ENFORCE(ctx->HasOutput("Cell"), + "Assert only one Output(Cell) of AttentionLSTM."); + PADDLE_ENFORCE(ctx->HasOutput("AttentionedX"), + "Assert only one Output(AttentionedX) of AttentionLSTM."); + PADDLE_ENFORCE(ctx->HasOutput("AttentionFCOut"), + "Assert only one Output(AttentionFCOut) of AttentionLSTM."); + PADDLE_ENFORCE(ctx->HasOutput("LSTMX"), + "Assert only one Output(LSTMX) of AttentionLSTM."); + PADDLE_ENFORCE(ctx->HasOutput("LSTMOUT"), + "Assert only one Output(LSTMOUT) of AttentionLSTM."); + + auto x_dims = ctx->GetInputDim("X"); + const int M = x_dims[1]; + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2."); + + auto w_dims = ctx->GetInputDim("LSTMWeight"); + const int D = w_dims[1] / 4; + PADDLE_ENFORCE_EQ(w_dims.size(), 2, "Input(LSTMWeight)'s rank must be 2."); + PADDLE_ENFORCE_EQ(w_dims[0], D + M, + "LSTMWeight dims should be (%d + %d) * %d.", D, M, 4 * D); + + auto b_dims = ctx->GetInputDim("LSTMBias"); + PADDLE_ENFORCE_EQ(b_dims.size(), 2, "Input(LSTMBias)'s rank must be 2."); + PADDLE_ENFORCE_EQ(b_dims[0], 1, "LSTMBias dims should be 1 x %d.", 4 * D); + PADDLE_ENFORCE_EQ(b_dims[1], 4 * D, "LSTMBias dims should be 1 x %d.", 4 * D); + + auto c_dims = ctx->GetInputDim("C0"); + PADDLE_ENFORCE_EQ(c_dims.size(), 2, "Input(C0)'s rank must be 2."); + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ(c_dims[1], D, "C0 dims should be N x %d.", D); + } + + if (ctx->HasInput("H0")) { + auto h_dims = ctx->GetInputDim("H0"); + PADDLE_ENFORCE_EQ(h_dims.size(), 2UL, "Input(H0)'s rank must be 2."); + if (ctx->IsRuntime() || + (framework::product(c_dims) > 0 && framework::product(h_dims) > 0)) { + PADDLE_ENFORCE(h_dims == c_dims, + "The dimension of Input(H0) and Input(C0) " + "should be the same."); + } + } + + auto atten_w_dims = ctx->GetInputDim("AttentionWeight"); + PADDLE_ENFORCE_EQ(atten_w_dims.size(), 2, + "Input(AttentionWeight)'s rank must be 2."); + PADDLE_ENFORCE_EQ(atten_w_dims[0], M + D, + "AttentionWeight shapes must be (%d + %d) * 1.", M, D); + PADDLE_ENFORCE_EQ(atten_w_dims[1], 1, + "AttentionWeight shapes must be (%d + %d) * 1.", M, D); + + if (ctx->HasInput("AttentionBias")) { + auto atten_b_dims = ctx->GetInputDim("AttentionBias"); + PADDLE_ENFORCE_EQ(atten_b_dims.size(), 2, + "Input(AttentionBias)'s rank must be 2."); + PADDLE_ENFORCE_EQ(atten_b_dims[0], 1, + "AttentionBias shapes must be 1 * 1."); + PADDLE_ENFORCE_EQ(atten_b_dims[1], 1, + "AttentionBias shapes must be 1 * 1."); + } + + if (ctx->HasInput("AttentionScalar")) { + auto dims = ctx->GetInputDim("AttentionScalar"); + PADDLE_ENFORCE_EQ(dims.size(), 2, + "Input(AttentionScalar)'s rank must be 2."); + PADDLE_ENFORCE_EQ(dims[0], 1, "AttentionScalar shapes must be 1 * 1."); + PADDLE_ENFORCE_EQ(dims[1], 1, "AttentionScalar shapes must be 1 * 1."); + } + + if (ctx->HasInput("AttentionScalarBias")) { + auto dims = ctx->GetInputDim("AttentionScalarBias"); + PADDLE_ENFORCE( + ctx->HasInput("AttentionScalar"), + "AttentionScalar should not be null when have AttentionScalarBias."); + PADDLE_ENFORCE_EQ(dims.size(), 2, + "Input(AttentionScalarBias)'s rank must be 2."); + PADDLE_ENFORCE_EQ(dims[0], 1, "AttentionScalarBias shapes must be 1 * 1."); + PADDLE_ENFORCE_EQ(dims[1], 1, "AttentionScalarBias shapes must be 1 * 1."); + } + + framework::DDim out_dims({x_dims[0], D}); + ctx->SetOutputDim("Hidden", out_dims); + ctx->SetOutputDim("Cell", out_dims); + ctx->SetOutputDim("AttentionedX", {x_dims[0], 1}); + ctx->SetOutputDim("LSTMX", {1, M}); + ctx->SetOutputDim("LSTMOUT", {1, 4 * D}); + // AttentionFCOut should be reshape as (maxseqlen,1) in runtime + ctx->ShareLoD("X", "Hidden"); + ctx->ShareLoD("X", "Cell"); +} + +framework::OpKernelType AttentionLSTMOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); +} + +void AttentionLSTMOpMaker::Make() { + AddInput("X", + "(LoDTensor) the input is a LodTensor, which support " + "variable-time length input sequence. The underlying tensor in " + "this LoDTensor is a matrix with shape (T X M), where T is the " + "total time steps in this mini-batch, M is the dim size of x."); + AddInput("C0", + "(Tensor) LSTM C0" + "This is a tensor with shape (N x D), where N is the batch size, D " + "is the gate size." + "C0 is necessary because of attention."); + AddInput("H0", + "(Tensor, optional) LSTM H0" + "This is a tensor with shape (N x D), where N is the " + "batch size and D is the gate size.") + .AsDispensable(); + AddInput("AttentionWeight", + "(Tensor) the weights of attention fc. Always relu the fc result." + "The shape is ((M+D) x 1), where M is the dim size of x, D is the " + "gate size of LSTM."); + AddInput("AttentionBias", + "(Tensor, optional) the bias of attention fc." + "The shape is (1 x 1)") + .AsDispensable(); + AddInput("AttentionScalar", + "(Tensor, optional) the scalar on the result of attentioned fc. " + "Always relu the Scalar." + "The shape is (1 x 1)") + .AsDispensable(); + AddInput("AttentionScalarBias", + "(Tensor, optional) the scalar bias of attention fc." + "The shape is (1 x 1)") + .AsDispensable(); + AddInput("LSTMWeight", + "(Tensor) the combined weight of LSTM" + " - The shape is ((D+M) x 4D), where D is the hidden gate size, M " + "is the dim size of x" + " - Weight = {W_forget, W_input, W_output, W_cell}"); + AddInput("LSTMBias", + "(Tensor) the combined bias of LSTM, shape (1x4D)." + "Note: we should add the bias of hidden and context accorindg to " + "the same gate: " + "{B_forget, B_input, B_output, B_cell}"); + AddOutput("Hidden", + "(LoDTensor) (same as LSTMOp) the hidden state of LSTM operator. " + "The shape is (T x D), and lod is the same with the `Input`."); + AddOutput("Cell", + "(LoDTensor) (same as LSTMOp) the cell state of LSTM operator. " + "The shape is (T x D), and lod is the same with the `Input`."); + AddOutput("AttentionedX", + "(Tensor) shape is (T x 1), the result after X * AttentionWeight," + " where T is the total time steps in this mini-batch," + " D is the hidden size.") + .AsIntermediate(); + AddOutput("AttentionFCOut", + "(Tensor) (max_seq_len, 1), compute at each step.") + .AsIntermediate(); + AddOutput("LSTMX", + "(Tensor) the input X of LSTM for each step." + "Shape is (1 x M), where M is the x frame size") + .AsIntermediate(); + AddOutput( + "LSTMOUT", + "(Tensor) the output of LSTM X(1*(D+M))* weight((D+M)*4D) for each step." + "Shape is (1 x 4D), where M is the x frame size") + .AsIntermediate(); + AddAttr("gate_activation", + "(string, default: sigmoid)" + "The activation for input gate, forget gate and output " + "gate, `sigmoid` by default.") + .SetDefault("sigmoid") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("cell_activation", + "(string, default: tanh)" + "The activation for cell output, `tanh` by default.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddAttr("candidate_activation", + "(string, default: tanh)" + "The activation for candidate hidden state, " + "`tanh` by default.") + .SetDefault("tanh") + .InEnum({"sigmoid", "tanh", "relu", "identity"}); + AddComment(R"DOC( +Attention Long-Short Term Memory (LSTM) Operator. + +Attention part: +concat( x(seqlen * M), expand( cell_t-1(1,D) ) ) => tmp(seqlen*(M+D)) + +tmp(seqlen*(M+D)) * fc((M+D)*1) => fcout(seqlen*1) with bias, relu + +fcout(seqlen*1) * scalar => fcout(seqlen*1) with bias, relu + +dotmul and sum pool ( fcout(seqlen*1), x(seqlen * M) ) => lstm_x_t(1, M) + +LSTM part: +use lstm_x_t as input and compute as standard LSTM. + +)DOC"); +} + +// y[i] = (x[i] + bias[0]) > 0 ? (x[i] + bias[0]) : 0; +template +inline void bias_relu(const int n, const T* x, const T* bias, T* y) { + if (bias) { + math::vec_add_bias(n, *bias, x, y); + math::vec_relu(n, y, y); + } else { + math::vec_relu(n, x, y); + } +} + +template +inline void vec_softmax(const int n, const T* x, T* y) { + T scalar = x[0]; + // max + for (int i = 1; i < n; ++i) { + scalar = scalar < x[i] ? x[i] : scalar; + } + math::vec_add_bias(n, -scalar, x, y); // sub + math::vec_exp(n, y, y); // exp + // sum + scalar = T(0); + for (int i = 0; i < n; ++i) { + scalar += y[i]; + } + math::vec_scal(n, static_cast(1) / scalar, y); // scale +} + +template +class AttentionLSTMKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using DeviceContext = paddle::platform::CPUDeviceContext; + + auto* x = ctx.Input("X"); + auto* h0 = ctx.Input("H0"); + auto* c0 = ctx.Input("C0"); + auto* atten_w = ctx.Input("AttentionWeight"); + auto* atten_b = ctx.Input("AttentionBias"); + auto* atten_scalar = ctx.Input("AttentionScalar"); + auto* atten_scalar_bias = ctx.Input("AttentionScalarBias"); + auto* lstm_w = ctx.Input("LSTMWeight"); + auto* lstm_b = ctx.Input("LSTMBias"); + + auto* hidden_out = ctx.Output("Hidden"); + auto* cell_out = ctx.Output("Cell"); + auto* atted_x = ctx.Output("AttentionedX"); + auto* fc_out = ctx.Output("AttentionFCOut"); + auto* lstm_x = ctx.Output("LSTMX"); + auto* lstm_out = ctx.Output("LSTMOUT"); + + // some shape should be reshape here since infershape can not get lod info + auto x_lod = x->lod(); + const int N = x_lod[0].size() - 1; // batch size + auto x_dims = x->dims(); // T x M + auto w_dims = lstm_w->dims(); // (D+M) x 4D + const int total_T = x_dims[0]; + const int M = x_dims[1]; // x frame size + const int D = w_dims[1] / 4; // gate frame size + const int D2 = D * 2; + const int D3 = D * 3; + const int D4 = w_dims[1]; + int max_seq_len = x_lod[0][1]; + for (int i = 1; i < N; ++i) { + int len = x_lod[0][i + 1] - x_lod[0][i]; + max_seq_len = max_seq_len < len ? len : max_seq_len; + } + PADDLE_ENFORCE_EQ(x_lod.size(), 1UL, "Input(X)'s lod size must be 1."); + PADDLE_ENFORCE_EQ(c0->dims()[0], N, "C0 dims should be %d x %d.", N, D); + fc_out->Resize({max_seq_len, 1}); + + std::function act_gate, act_cell, act_cand; + auto& act_gate_str = ctx.Attr("gate_activation"); + auto& act_cell_str = ctx.Attr("cell_activation"); + auto& act_cand_str = ctx.Attr("candidate_activation"); + if (platform::MayIUse(platform::avx)) { + math::VecActivations act_functor; + act_gate = act_functor(act_gate_str); + act_cell = act_functor(act_cell_str); + act_cand = act_functor(act_cand_str); + } else { + math::VecActivations act_functor; + act_gate = act_functor(act_gate_str); + act_cell = act_functor(act_cell_str); + act_cand = act_functor(act_cand_str); + } + + const T* x_data = x->data(); + const T* h0_data = h0 ? h0->data() : NULL; + const T* c0_data = c0->data(); + const T* lstm_w_data = lstm_w->data(); + const T* lstm_b_data = lstm_b->data(); + const T* atten_w_data = atten_w->data(); + const T* atten_b_data = atten_b ? atten_b->data() : NULL; + const T* atten_scalar_data = atten_scalar ? atten_scalar->data() : NULL; + const T* atten_scalar_bias_data = + atten_scalar_bias ? atten_scalar_bias->data() : NULL; + + T* hidden_out_data = hidden_out->mutable_data(ctx.GetPlace()); + T* cell_out_data = cell_out->mutable_data(ctx.GetPlace()); + T* atted_x_data = atted_x->mutable_data(ctx.GetPlace()); + T* fc_out_data = fc_out->mutable_data(ctx.GetPlace()); + T* lstm_x_data = lstm_x->mutable_data(ctx.GetPlace()); + T* lstm_out_data = lstm_out->mutable_data(ctx.GetPlace()); + + // x(TxM) * fc (Mx1) part of atten_wgt(M+D)x1 + auto blas = math::GetBlas(ctx); + math::FCCompute(blas, total_T, 1, M, x_data, atten_w_data, + atted_x_data, atten_b_data); + + const T* cur_atten_x_data = atted_x_data; + const T* cur_x_data = x_data; + const T* prev_cell_data = NULL; + const T* prev_hidden_data = NULL; + T* cur_cell_out_data = cell_out_data; + T* cur_hidden_out_data = hidden_out_data; + for (int i = 0; i < N; ++i) { + int seq_len = x_lod[0][i + 1] - x_lod[0][i]; + prev_cell_data = c0_data + i * D; + prev_hidden_data = h0_data ? h0_data + i * D : NULL; + for (int step = 0; step < seq_len; ++step) { + /// 1. compute attention vector + // 1a. prev_cell(1xD) * fc(D) rest part of atten_wgt + T prev_cell_bias = blas.DOT(D, prev_cell_data, atten_w_data + M); + // 1b. add cell bias and relu + bias_relu(seq_len, cur_atten_x_data, &prev_cell_bias, fc_out_data); + // 1c. fc scalar + if (atten_scalar_data) { + blas.SCAL(seq_len, *atten_scalar_data, fc_out_data); + bias_relu(seq_len, fc_out_data, atten_scalar_bias_data, + fc_out_data); + } + // 1d. softmax + vec_softmax(seq_len, fc_out_data, fc_out_data); + // mul x(seq_len*M) and sum pool + math::FCCompute(blas, 1, M, seq_len, fc_out_data, + cur_x_data, lstm_x_data); + + /// 2. compute LSTM step + // lstm weight : concat[forget , input , output , tilde] + // shape : (D + M) x (4 * D) + // fc inputX(1xM) * weightX(M*(4D)) => 1 x 4D + blas.MatMul(1, D4, M, lstm_x_data, lstm_w_data + D * D4, lstm_out_data); + if (prev_hidden_data) { + blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D4, D, static_cast(1), + prev_hidden_data, D, lstm_w_data, D4, static_cast(1), + lstm_out_data, D4); + } + // since input is 1xM, so can use add bias + blas.VADD(D4, lstm_b_data, lstm_out_data, lstm_out_data); + + // gate act: sigmoid + act_gate(D3, lstm_out_data, lstm_out_data); + // candicate act: tanh + act_cand(D, lstm_out_data + D3, lstm_out_data + D3); + + // a = forget * prev_cell + blas.VMUL(D, lstm_out_data, prev_cell_data, lstm_out_data); + + // b = input * tilde + blas.VMUL(D, lstm_out_data + D, lstm_out_data + D3, lstm_out_data + D); + + // cell_out = a + b + blas.VADD(D, lstm_out_data, lstm_out_data + D, cur_cell_out_data); + + // state act tanh(cell_out) * output_gate + act_cell(D, cur_cell_out_data, lstm_out_data); + blas.VMUL(D, lstm_out_data, lstm_out_data + D2, cur_hidden_out_data); + + prev_hidden_data = cur_hidden_out_data; + prev_cell_data = cur_cell_out_data; + cur_cell_out_data = cur_cell_out_data + D; + cur_hidden_out_data = cur_hidden_out_data + D; + } + cur_x_data = cur_x_data + seq_len * M; + cur_atten_x_data = cur_atten_x_data + seq_len; + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(attention_lstm, ops::AttentionLSTMOp, + ops::AttentionLSTMOpMaker, + paddle::framework::DefaultGradOpDescMaker); + +REGISTER_OP_CPU_KERNEL(attention_lstm, ops::AttentionLSTMKernel, + ops::AttentionLSTMKernel); diff --git a/paddle/fluid/operators/attention_lstm_op.h b/paddle/fluid/operators/attention_lstm_op.h new file mode 100644 index 00000000..6ede3a7f --- /dev/null +++ b/paddle/fluid/operators/attention_lstm_op.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +class AttentionLSTMOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class AttentionLSTMOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/average_accumulates_op.cc b/paddle/fluid/operators/average_accumulates_op.cc new file mode 100644 index 00000000..0922b03b --- /dev/null +++ b/paddle/fluid/operators/average_accumulates_op.cc @@ -0,0 +1,214 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/average_accumulates_op.h" + +namespace paddle { +namespace operators { + +template <> +void GetAccumulators( + const framework::ExecutionContext& ctx, int64_t* num_updates, + int64_t* num_accumulates, int64_t* old_num_accumulates) { + auto* in_old_num_accumulates = ctx.Input("in_old_num_accumulates"); + auto* in_num_accumulates = ctx.Input("in_num_accumulates"); + auto* in_num_updates = ctx.Input("in_num_updates"); + + *old_num_accumulates = in_old_num_accumulates->data()[0]; + *num_accumulates = in_num_accumulates->data()[0]; + *num_updates = in_num_updates->data()[0]; +} + +template <> +void SetAccumulators( + const framework::ExecutionContext& ctx, int64_t num_updates, + int64_t num_accumulates, int64_t old_num_accumulates) { + auto* out_old_num_accumulates = ctx.Output("out_old_num_accumulates"); + auto* out_num_accumulates = ctx.Output("out_num_accumulates"); + auto* out_num_updates = ctx.Output("out_num_updates"); + + out_old_num_accumulates->data()[0] = old_num_accumulates; + out_num_accumulates->data()[0] = num_accumulates; + out_num_updates->data()[0] = num_updates; +} + +class AverageAccumulatesOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE( + ctx->HasInput("param"), + "Input (param) of average_accumulates op should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("in_sum_1"), + "Input (sum_1) of average_accumulates op should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("in_sum_2"), + "Input (sum_2) of average_accumulates op should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("in_sum_3"), + "Input (sum_3) of average_accumulates op should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("in_num_accumulates"), + "Input (in_num_accumulates) of average_accumulates op should " + "not be null."); + PADDLE_ENFORCE(ctx->HasInput("in_old_num_accumulates"), + "Input (old_num_accumulates) of average_accumulates op " + "should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("in_num_updates"), + "Input (num_updates) of average_accumulates op should not be null."); + + PADDLE_ENFORCE( + ctx->HasOutput("out_sum_1"), + "Output (sum_1) of average_accumulates op should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("out_sum_2"), + "Output (sum_2) of average_accumulates op should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("out_sum_3"), + "Output (sum_3) of average_accumulates op should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("out_num_accumulates"), + "Output (num_accumulates) of average_accumulates op should " + "not be null."); + PADDLE_ENFORCE(ctx->HasOutput("out_old_num_accumulates"), + "Output (old_num_accumulates) of average_accumulates op " + "should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("out_num_updates"), + "Output (num_updates) of average_accumulates op should not be null."); + + auto in_dim = ctx->GetInputDim("param"); + + ctx->SetOutputDim("out_sum_1", in_dim); + ctx->SetOutputDim("out_sum_2", in_dim); + ctx->SetOutputDim("out_sum_3", in_dim); + ctx->SetOutputDim("out_num_accumulates", {1}); + ctx->SetOutputDim("out_old_num_accumulates", {1}); + ctx->SetOutputDim("out_num_updates", {1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("param")->type(), + ctx.GetPlace()); + } +}; + +class AverageAccumulatesOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("param", "(Tensor), The parameter to be accumulated."); + AddInput("in_sum_1", + "(Tensor), A tensor used to store the parameter " + "sums with the same shape as input(param)."); + AddInput("in_sum_2", + "(Tensor), A auxiliary tensor to help " + "accumulating sums of parameter values with the same shape as " + "input(param). It is used to avoid loss of precision due to too " + "many sums."); + AddInput("in_sum_3", + "(Tensor), A auxiliary tensor to help " + "accumulating sums of parameter values with the same shape as " + "input(param)."); + AddInput("in_num_accumulates", + "(Tensor), The accumulating times of current window with " + "shape [1]."); + AddInput( + "in_old_num_accumulates", + "(Tensor), The accumulating times of previous window with " + "shape [1]."); + AddInput("in_num_updates", + "(Tensor), The total number of batches used by trainning " + "before this batch with shape [1]."); + + AddOutput("out_sum_1", + "(Tensor), A tensor used to store the " + "parameter sums with the same shape as input(param)."); + AddOutput("out_sum_2", + "(Tensor), A auxiliary tensor to help " + "accumulating sums of parameter values with the same shape as " + "input(param). It is used to avoid loss of precision due to too " + "many sums."); + AddOutput("out_sum_3", + "(Tensor), A auxiliary tensor to help " + "accumulating sums of parameter values with the same shape as " + "input(param)."); + AddOutput( + "out_num_accumulates", + "(Tensor), The accumulating times of current window with " + "shape [1]."); + AddOutput( + "out_old_num_accumulates", + "(Tensor) The accumulating times of previous window with " + "shape [1]."); + AddOutput( + "out_num_updates", + "(Tensor), The total number of batches used by trainning " + "before this batch with shape [1]."); + + AddAttr("average_window", + "(float, default 0) " + "The rate of average window size relative to num_updates.") + .SetDefault(0); + AddAttr("max_average_window", + "(int64_t) " + "Maximum size of average window. It suggests that the " + "number of mini-batches " + "in one pass is appropriate value to set."); + AddAttr("min_average_window", + "(int64_t, default 10000L) " + "Minimu size of average window.") + .SetDefault(10000L); + + AddComment(R"DOC( +AverageAccumulates Operator. +Accumulate the sum of parameter within sliding window. The size of sliding window is +determined by 'average_window', 'max_average_window' and 'min_average_window'. +Memory was shared by Input(in_sum_1) and Output(out_sum_1) which acts as an accumulator 'sum_1'. +'sum_2', 'sum_3', 'num_accumulates', 'old_num_accumulates' and 'num_updates' were the same as 'sum_1'. + +All the accumulators were inited to zero before training. + +And for a mini-batch in training, accumulators were computed as below steps: + num_updates += 1 + num_accumulates += 1 + sum_1 += param + if num_updates % kMaxNumAccumulates == 0: + sum_2 += sum_1 + sum_1 = 0 + if num_accumulates >= min_average_window && num_accumulates >= min(max_average_window, num_updates * average_window): + sum_3 = sum_1 + sum_2 + sum_1 = 0 + sum_2 = 0 + old_num_accumulates = num_accumulates + num_accumulates = 0 + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(average_accumulates, ops::AverageAccumulatesOp, + ops::AverageAccumulatesOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL( + average_accumulates, + ops::AverageAccumulatesKernel, + ops::AverageAccumulatesKernel); diff --git a/paddle/fluid/operators/average_accumulates_op.cu b/paddle/fluid/operators/average_accumulates_op.cu new file mode 100644 index 00000000..104e24f6 --- /dev/null +++ b/paddle/fluid/operators/average_accumulates_op.cu @@ -0,0 +1,67 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/average_accumulates_op.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace operators { +template <> +void GetAccumulators( + const framework::ExecutionContext& ctx, int64_t* num_updates_, + int64_t* num_accumulates_, int64_t* old_num_accumulates_) { + auto* in_old_num_accumulates = ctx.Input("in_old_num_accumulates"); + auto* in_num_accumulates = ctx.Input("in_num_accumulates"); + auto* in_num_updates = ctx.Input("in_num_updates"); + auto stream = ctx.cuda_device_context().stream(); + auto cuda_place = + boost::get(in_old_num_accumulates->place()); + memory::Copy(platform::CPUPlace(), old_num_accumulates_, cuda_place, + in_old_num_accumulates->data(), sizeof(int64_t), + stream); + memory::Copy(platform::CPUPlace(), num_accumulates_, cuda_place, + in_num_accumulates->data(), sizeof(int64_t), stream); + memory::Copy(platform::CPUPlace(), num_updates_, cuda_place, + in_num_updates->data(), sizeof(int64_t), stream); +} + +template <> +void SetAccumulators( + const framework::ExecutionContext& ctx, int64_t num_updates_, + int64_t num_accumulates_, int64_t old_num_accumulates_) { + auto stream = ctx.cuda_device_context().stream(); + auto* out_old_num_accumulates = ctx.Output("out_old_num_accumulates"); + auto* out_num_accumulates = ctx.Output("out_num_accumulates"); + auto* out_num_updates = ctx.Output("out_num_updates"); + auto cuda_place = + boost::get(out_old_num_accumulates->place()); + + memory::Copy(cuda_place, out_old_num_accumulates->data(), + platform::CPUPlace(), &old_num_accumulates_, sizeof(int64_t), + stream); + memory::Copy(cuda_place, out_num_accumulates->data(), + platform::CPUPlace(), &num_accumulates_, sizeof(int64_t), + stream); + memory::Copy(cuda_place, out_num_updates->data(), + platform::CPUPlace(), &num_updates_, sizeof(int64_t), stream); +} + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + average_accumulates, + ops::AverageAccumulatesKernel, + ops::AverageAccumulatesKernel); diff --git a/paddle/fluid/operators/average_accumulates_op.h b/paddle/fluid/operators/average_accumulates_op.h new file mode 100644 index 00000000..3958d3f6 --- /dev/null +++ b/paddle/fluid/operators/average_accumulates_op.h @@ -0,0 +1,114 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +using EigenVector = framework::EigenVector; + +template +void GetAccumulators(const framework::ExecutionContext& ctx, + int64_t* num_updates, int64_t* num_accumulates, + int64_t* old_num_accumulates); + +template +void SetAccumulators(const framework::ExecutionContext& ctx, + int64_t num_updates, int64_t num_accumulates, + int64_t old_num_accumulates); + +template +class AverageAccumulatesKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // It is used to avoid loss of precision + static const int64_t kMaxNumAccumulates = 16384; + // Get accumulators from input + int64_t num_updates = 0; + int64_t num_accumulates = 0; + int64_t old_num_accumulates = 0; + GetAccumulators(ctx, &num_updates, &num_accumulates, + &old_num_accumulates); + + // Get attrs + float average_window = ctx.Attr("average_window"); + int64_t max_average_window = ctx.Attr("max_average_window"); + int64_t min_average_window = ctx.Attr("min_average_window"); + PADDLE_ENFORCE_LE(min_average_window, max_average_window, + "min_average_window shouldn't be larger than " + "max_average_window"); + + // Get inputs + auto* param = ctx.Input("param"); + auto* in_sum_1 = ctx.Input("in_sum_1"); + auto* in_sum_2 = ctx.Input("in_sum_2"); + auto* in_sum_3 = ctx.Input("in_sum_3"); + auto param_tensor = EigenVector::Flatten(*param); + auto in_sum_1_tensor = EigenVector::Flatten(*in_sum_1); + auto in_sum_2_tensor = EigenVector::Flatten(*in_sum_2); + auto in_sum_3_tensor = EigenVector::Flatten(*in_sum_3); + + // Get outputs + auto* out_sum_1 = ctx.Output("out_sum_1"); + auto* out_sum_2 = ctx.Output("out_sum_2"); + auto* out_sum_3 = ctx.Output("out_sum_3"); + auto out_sum_1_tensor = EigenVector::Flatten(*out_sum_1); + auto out_sum_2_tensor = EigenVector::Flatten(*out_sum_2); + auto out_sum_3_tensor = EigenVector::Flatten(*out_sum_3); + + // Compute + auto& place = *ctx.template device_context().eigen_device(); + math::SetConstant constant_functor; + ++num_updates; + ++num_accumulates; + out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor; + out_sum_2_tensor.device(place) = in_sum_2_tensor; + out_sum_3_tensor.device(place) = in_sum_3_tensor; + if (num_updates % kMaxNumAccumulates == 0) { + // Move the sum to a different buffer to avoid loss of precision due to + // too many sums. + out_sum_2_tensor.device(place) = in_sum_2_tensor + in_sum_1_tensor; + constant_functor(ctx.template device_context(), out_sum_1, + 0.0); + } + if (num_accumulates >= min_average_window && + num_accumulates >= std::min(max_average_window, + num_updates * average_window)) { + // Now the average window is too long, discard the old sum. + out_sum_3_tensor.device(place) = in_sum_1_tensor + in_sum_2_tensor; + constant_functor(ctx.template device_context(), out_sum_1, + 0.0); + constant_functor(ctx.template device_context(), out_sum_2, + 0.0); + old_num_accumulates = num_accumulates; + num_accumulates = 0; + } + + // Set accumulators to output + SetAccumulators(ctx, num_updates, num_accumulates, + old_num_accumulates); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc new file mode 100644 index 00000000..f6295337 --- /dev/null +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -0,0 +1,638 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/batch_norm_op.h" +#include +#include +#include +#include "paddle/fluid/framework/data_layout.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + +namespace paddle { +namespace operators { + +void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Scale"), + "Input(Scale) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Bias"), + "Input(Bias) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Mean"), + "Input(Mean) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Variance"), + "Input(Variance) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Y"), + "Output(Y) of ConvOp should not be null."); + bool is_test = ctx->Attrs().Get("is_test"); + if (!is_test) { + PADDLE_ENFORCE(ctx->HasOutput("MeanOut"), + "Output(MeanOut) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("VarianceOut"), + "Output(VarianceOut) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("SavedMean"), + "Output(SavedMean) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("SavedVariance"), + "Output(SavedVariance) of ConvOp should not be null."); + } + + // make sure Mean/MeanOut and Variance/VarianceOut share memory in Python + PADDLE_ENFORCE_EQ(ctx->Inputs("Mean")[0], ctx->Outputs("MeanOut")[0], + "Mean and MeanOut should share the same memory"); + PADDLE_ENFORCE_EQ(ctx->Inputs("Variance")[0], ctx->Outputs("VarianceOut")[0], + "Variance and VarianceOut should share the same memory"); + + const auto x_dims = ctx->GetInputDim("X"); + const DataLayout data_layout = framework::StringToDataLayout( + ctx->Attrs().Get("data_layout")); + + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "Input X must have 2 to 5 dimensions."); + + const int64_t C = + (data_layout == DataLayout::kNCHW ? x_dims[1] + : x_dims[x_dims.size() - 1]); + + auto scale_dim = ctx->GetInputDim("Scale"); + auto bias_dim = ctx->GetInputDim("Bias"); + + PADDLE_ENFORCE_EQ(scale_dim.size(), 1UL); + PADDLE_ENFORCE_EQ(scale_dim.size(), 1UL); + + bool check = true; + if ((!ctx->IsRuntime()) && (framework::product(scale_dim) <= 0 || + framework::product(bias_dim) <= 0)) { + check = false; + } + + if (check) { + PADDLE_ENFORCE_EQ(scale_dim[0], C); + PADDLE_ENFORCE_EQ(scale_dim[0], C); + } + ctx->SetOutputDim("Y", x_dims); + ctx->SetOutputDim("MeanOut", {C}); + ctx->SetOutputDim("VarianceOut", {C}); + ctx->SetOutputDim("SavedMean", {C}); + ctx->SetOutputDim("SavedVariance", {C}); + ctx->ShareLoD("X", "Y"); +} + +framework::OpKernelType BatchNormOp::GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + auto input_data_type = ctx.Input("X")->type(); + // By default, the type of the scale, bias, mean, + // and var tensors should both be float. (For float or float16 input tensor) + // or double (For double input tensor). + auto bn_param_type = framework::proto::VarType::FP32; + if (input_data_type == framework::proto::VarType::FP64) { + bn_param_type = framework::proto::VarType::FP64; + } + PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("Scale")->type(), + "Scale input should be of float type"); + PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("Bias")->type(), + "Bias input should be of float type"); + PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("Mean")->type(), + "Mean input should be of float type"); + PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("Variance")->type(), + "Variance input should be of float type"); + + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; +#ifdef PADDLE_WITH_MKLDNN + if (library == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; + } +#endif + + return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, + library); +} + +void BatchNormOpMaker::Make() { + AddAttr("is_test", + "(bool, default false) Set to true for inference only, false " + "for training. Some layers may run faster when this is true.") + .SetDefault(false); + AddAttr("momentum", "").SetDefault(0.9); + AddAttr("epsilon", "") + .SetDefault(1e-5) + .AddCustomChecker([](const float &epsilon) { + PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f, + "'epsilon' should be between 0.0 and 0.001."); + }); + AddAttr("data_layout", "").SetDefault("NCHW"); + AddInput("X", "The input tensor"); + AddInput("Scale", + "Scale is a 1-dimensional tensor of size C " + "that is applied to the output"); + AddInput("Bias", + "Bias is a 1-dimensional tensor of size C " + "that is applied to the output"); + AddInput("Mean", + "The global mean (for training) or " + "estimated mean (for testing)"); + AddInput("Variance", + "The global variance (for training) " + "or estimated Variance (for testing)"); + AddOutput("Y", "result after normalization"); + AddOutput("MeanOut", + "Share memory with Mean. " + "Store the global mean when training"); + AddOutput("VarianceOut", + "Share memory with Variance. " + "Store the global Variance when training"); + AddOutput("SavedMean", + "Mean of the current mini batch, " + "will apply to output when training") + .AsIntermediate(); + AddOutput("SavedVariance", + "Variance of the current mini batch, " + "will apply to output when training") + .AsIntermediate(); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr("fuse_with_relu", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr("use_global_stats", + "(bool, default false) Whether to use global mean and " + "variance. In inference or test mode, set use_global_stats " + "to true or is_test true. the behavior is equivalent. " + "In train mode, when setting use_global_stats True, the " + "global mean and variance are also used during train time, " + "the BN acts as scaling and shiffting.") + .SetDefault(false); + AddComment(R"DOC( +Batch Normalization. + +Batch Norm has been implemented as discussed in the paper: +https://arxiv.org/pdf/1502.03167.pdf +Can be used as a normalizer function for conv2d and fully_connected operations. +The required data format for this layer is one of the following: +1. NHWC `[batch, in_height, in_width, in_channels]` +2. NCHW `[batch, in_channels, in_height, in_width]` + +)DOC"); +} + +template +class BatchNormKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const float epsilon = ctx.Attr("epsilon"); + const float momentum = ctx.Attr("momentum"); + const bool is_test = ctx.Attr("is_test"); + const bool use_global_stats = ctx.Attr("use_global_stats"); + + bool global_stats = is_test || use_global_stats; + + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + + const auto *x = ctx.Input("X"); + const auto &x_dims = x->dims(); + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "The Input dim size should be between 2 and 5"); + const int N = x_dims[0]; + const int C = + (data_layout == DataLayout::kNCHW ? x_dims[1] + : x_dims[x_dims.size() - 1]); + const int sample_size = x->numel() / N / C; + + auto *y = ctx.Output("Y"); + auto *mean_out = ctx.Output("MeanOut"); + auto *variance_out = ctx.Output("VarianceOut"); + auto *saved_mean = ctx.Output("SavedMean"); + auto *saved_variance = ctx.Output("SavedVariance"); + + // alloc memory + y->mutable_data(ctx.GetPlace()); + mean_out->mutable_data(ctx.GetPlace()); + variance_out->mutable_data(ctx.GetPlace()); + saved_mean->mutable_data(ctx.GetPlace()); + saved_variance->mutable_data(ctx.GetPlace()); + + if (!global_stats) { + // saved_xx is use just in this batch of data + EigenVectorArrayMap saved_mean_e( + saved_mean->mutable_data(ctx.GetPlace()), C); + EigenVectorArrayMap saved_variance_e( + saved_variance->mutable_data(ctx.GetPlace()), C); + saved_mean_e.setZero(); + saved_variance_e.setZero(); + + EigenVectorArrayMap running_mean_arr( + mean_out->mutable_data(ctx.GetPlace()), C); + EigenVectorArrayMap running_var_arr( + variance_out->mutable_data(ctx.GetPlace()), C); + + if ((N * sample_size) == 1) { + LOG(WARNING) << "Only 1 element in normalization dimension, " + << "we skip the batch norm calculation, let y = x."; + framework::TensorCopy(*x, ctx.GetPlace(), y); + return; + } + + switch (data_layout) { + case DataLayout::kNCHW: { + ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); + for (int nc = 0; nc < N * C; ++nc) { + saved_mean_e(nc % C) += x_arr.col(nc).sum(); + } + saved_mean_e /= N * sample_size; + for (int nc = 0; nc < N * C; ++nc) { + saved_variance_e(nc % C) += + (x_arr.col(nc) - saved_mean_e(nc % C)).matrix().squaredNorm(); + } + saved_variance_e /= N * sample_size; + break; + } + case DataLayout::kNHWC: { + ConstEigenArrayMap x_arr(x->data(), C, N * sample_size); + for (int i = 0; i < N * sample_size; ++i) { + saved_mean_e += x_arr.col(i); + } + saved_mean_e /= N * sample_size; + for (int i = 0; i < N * sample_size; ++i) { + saved_variance_e += + (x_arr.col(i) - saved_mean_e) * (x_arr.col(i) - saved_mean_e); + } + saved_variance_e /= N * sample_size; + break; + } + default: + PADDLE_THROW("Unknown storage order: %s", data_layout_str); + } + + running_mean_arr = + running_mean_arr * momentum + saved_mean_e * (1. - momentum); + running_var_arr = + running_var_arr * momentum + saved_variance_e * (1. - momentum); + } + + // use SavedMean and SavedVariance to do normalize + Eigen::Array inv_std(C); + if (global_stats) { + ConstEigenVectorArrayMap var_arr( + ctx.Input("Variance")->data(), C); + inv_std = (var_arr + epsilon).sqrt().inverse(); + } else { + EigenVectorArrayMap saved_inv_std( + ctx.Output("SavedVariance")->data(), C); + // inverse SavedVariance first, gradient will use it too. + saved_inv_std = (saved_inv_std + epsilon).inverse().sqrt(); + inv_std = saved_inv_std; + } + ConstEigenVectorArrayMap mean_arr( + global_stats ? ctx.Input("Mean")->data() + : ctx.Output("SavedMean")->data(), + C); + + // ((x - est_mean) * (inv_var) * scale + bias + // formula transform ====> + // (x * inv_var * scale) + (bias - est_mean * inv_var * scale) + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + ConstEigenVectorArrayMap scale_arr(scale->data(), C); + ConstEigenVectorArrayMap bias_arr(bias->data(), C); + Eigen::Array new_scale = inv_std * scale_arr; + Eigen::Array new_bias = + bias_arr - mean_arr * inv_std * scale_arr; + + switch (data_layout) { + case DataLayout::kNCHW: { + EigenArrayMap y_arr(y->mutable_data(ctx.GetPlace()), sample_size, + N * C); + ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); + for (int nc = 0; nc < N * C; ++nc) { + y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C); + } + break; + } + case DataLayout::kNHWC: { + EigenArrayMap(y->mutable_data(ctx.GetPlace()), C, + N * sample_size) = + (ConstEigenArrayMap(x->data(), C, N * sample_size).colwise() * + new_scale) + .colwise() + + new_bias; + break; + } + default: + PADDLE_THROW("Unknown storage order: %d", data_layout); + } + } +}; + +void BatchNormGradOp::InferShape(framework::InferShapeContext *ctx) const { + // check input + PADDLE_ENFORCE(ctx->HasInput("X")); + PADDLE_ENFORCE(ctx->HasInput("Scale"), "Input(scale) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), + "Input(Y@GRAD) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("SavedMean"), + "Input(SavedMean) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("SavedVariance"), + "Input(SavedVariance) should not be null"); + + // check output + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), ""); + if (ctx->HasOutput(framework::GradVarName("Scale"))) { + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")), + "Output(Scale@GRAD) and Output(Bias@GRAD) should not be " + "null at same time"); + } + const bool use_global_stats = ctx->Attrs().Get("use_global_stats"); + if (use_global_stats) { + PADDLE_ENFORCE(!ctx->Attrs().Get("use_mkldnn"), + "Using global stats during training is not supported " + "in gradient op kernel of batch_norm_mkldnn_op now."); + } + + const auto x_dims = ctx->GetInputDim("X"); + const DataLayout data_layout = framework::StringToDataLayout( + ctx->Attrs().Get("data_layout")); + const int C = (data_layout == DataLayout::kNCHW ? x_dims[1] + : x_dims[x_dims.size() - 1]); + + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + if (ctx->HasOutput(framework::GradVarName("Scale"))) { + ctx->SetOutputDim(framework::GradVarName("Scale"), {C}); + ctx->SetOutputDim(framework::GradVarName("Bias"), {C}); + } +} + +framework::OpKernelType BatchNormGradOp::GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + const auto *var = ctx.InputVar(framework::GradVarName("Y")); + if (var == nullptr) { + PADDLE_THROW("can't find Y@GRAD"); + } + const Tensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); + } else if (var->IsType()) { + t = &var->Get(); + } + if (t == nullptr) { + PADDLE_THROW("can't find Y@GRAD"); + } + + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + +#ifdef PADDLE_WITH_MKLDNN + if (library == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; + } +#endif + + return framework::OpKernelType(ctx.Input("X")->type(), ctx.GetPlace(), + layout, library); +} + +template +class BatchNormGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const auto *x = ctx.Input("X"); + const auto *d_y = ctx.Input(framework::GradVarName("Y")); + const auto *scale = ctx.Input("Scale"); + const auto *saved_mean = ctx.Input("SavedMean"); + // SavedVariance have been reverted in forward operator + const auto *saved_inv_variance = ctx.Input("SavedVariance"); + const std::string data_layout_str = ctx.Attr("data_layout"); + const bool use_global_stats = ctx.Attr("use_global_stats"); + const float epsilon = ctx.Attr("epsilon"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + + // Get the size for each dimension. + // NCHW [batch_size, in_channels, in_height, in_width] + const auto &x_dims = x->dims(); + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "The Input dim size should be between 2 and 5"); + const int N = x_dims[0]; + const int C = + (data_layout == DataLayout::kNCHW ? x_dims[1] + : x_dims[x_dims.size() - 1]); + const int sample_size = x->numel() / N / C; + + // init output + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + + d_x->mutable_data(ctx.GetPlace()); + + const T *mean_data = saved_mean->data(); + const T *inv_var_data = saved_inv_variance->data(); + Tensor inv_var_tensor; + if (use_global_stats) { + const auto *running_mean = ctx.Input("Mean"); + const auto *running_variance = ctx.Input("Variance"); + mean_data = running_mean->data(); + inv_var_tensor.Resize({C}); + T *running_inv_var_data = inv_var_tensor.mutable_data(ctx.GetPlace()); + EigenVectorArrayMap inv_var_tmp(running_inv_var_data, C); + ConstEigenVectorArrayMap var_arr(running_variance->data(), C); + + inv_var_tmp = (var_arr + epsilon).sqrt().inverse().eval(); + inv_var_data = running_inv_var_data; + } + + ConstEigenVectorArrayMap scale_arr(scale->data(), C); + ConstEigenVectorArrayMap mean_arr(mean_data, C); + ConstEigenVectorArrayMap inv_var_arr(inv_var_data, C); + + T *d_bias_data = nullptr; + T *d_scale_data = nullptr; + if (d_scale && d_bias) { + d_scale->mutable_data(ctx.GetPlace()); + d_bias->mutable_data(ctx.GetPlace()); + d_bias_data = d_bias->mutable_data(ctx.GetPlace()); + d_scale_data = d_scale->mutable_data(ctx.GetPlace()); + } + + // d_bias = np.sum(d_y, axis=0) + // d_scale = np.sum((X - mean) / inv_std * dy, axis=0) + // d_x = (1. / N) * scale * inv_var * (N * d_y - np.sum(d_y, axis=0) + // - (X - mean) * inv_var * inv_var * np.sum(d_y * (X - mean), axis=0)) + EigenVectorArrayMap d_bias_arr(d_bias_data, C); + EigenVectorArrayMap d_scale_arr(d_scale_data, C); + + if (d_scale && d_bias) { + d_bias_arr.setZero(); + d_scale_arr.setZero(); + } + + if ((N * sample_size) == 1 && !use_global_stats) { + framework::TensorCopy(*d_y, ctx.GetPlace(), d_x); + return; + } + + int scale_coefff = use_global_stats ? 1 : N * sample_size; + const auto scale_inv_var_nhw = scale_arr * inv_var_arr / scale_coefff; + + switch (data_layout) { + case DataLayout::kNCHW: { + ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); + ConstEigenArrayMap d_y_arr(d_y->data(), sample_size, N * C); + EigenArrayMap d_x_arr(d_x->mutable_data(ctx.GetPlace()), + sample_size, N * C); + d_x_arr.setZero(); + + if (d_scale && d_bias) { + for (int nc = 0; nc < N * C; ++nc) { + int c = nc % C; + d_bias_arr(c) += d_y_arr.col(nc).sum(); + d_scale_arr(c) += ((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) * + d_y_arr.col(nc)) + .sum(); + } + } + if (!use_global_stats) { + for (int nc = 0; nc < N * C; ++nc) { + int c = nc % C; + d_x_arr.col(nc) += + scale_inv_var_nhw(c) * + (d_y_arr.col(nc) * N * sample_size - d_bias_arr(c) - + (x_arr.col(nc) - mean_arr[c]) * d_scale_arr(c) * + inv_var_arr(c)); + } + } else { + for (int nc = 0; nc < N * C; ++nc) { + int c = nc % C; + d_x_arr.col(nc) += scale_inv_var_nhw(c) * d_y_arr.col(nc); + } + } + break; + } + case DataLayout::kNHWC: { + ConstEigenArrayMap x_arr(x->data(), C, N * sample_size); + ConstEigenArrayMap d_y_arr(d_y->data(), C, N * sample_size); + EigenArrayMap d_x_arr(d_x->mutable_data(ctx.GetPlace()), C, + N * sample_size); + d_x_arr.setZero(); + + const auto d_y_row_sum = d_y_arr.rowwise().sum(); + const auto x_minus_mean = x_arr.colwise() - mean_arr; + const auto d_y_mul_x_minus_mean_row_sum = + (d_y_arr * x_minus_mean).rowwise().sum(); + const auto inv_var_sqr = inv_var_arr * inv_var_arr; + + if (d_scale && d_bias) { + for (int nhw = 0; nhw < N * sample_size; ++nhw) { + d_bias_arr += d_y_arr.col(nhw); + d_scale_arr += + (x_arr.col(nhw) - mean_arr) * inv_var_arr * d_y_arr.col(nhw); + } + } + + if (!use_global_stats) { + for (int nhw = 0; nhw < N * sample_size; ++nhw) { + d_x_arr.col(nhw) += + scale_inv_var_nhw * + (d_y_arr.col(nhw) * N * sample_size - d_y_row_sum - + x_minus_mean.col(nhw) * inv_var_sqr * + d_y_mul_x_minus_mean_row_sum); + } + } else { + for (int nhw = 0; nhw < N * sample_size; ++nhw) { + d_x_arr.col(nhw) += scale_inv_var_nhw * d_y_arr.col(nhw); + } + } + break; + } + default: + PADDLE_THROW("Unknown storage order: %s", data_layout_str); + } + } +}; + +std::unique_ptr BatchNormGradMaker::Apply() const { + auto *op = new framework::OpDesc(); + op->SetType(GradOpType()); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); + + op->SetInput("Scale", Input("Scale")); + op->SetInput("Bias", Input("Bias")); + op->SetInput("SavedMean", Output("SavedMean")); + op->SetInput("SavedVariance", Output("SavedVariance")); + + // used when setting use_global_stats True during training + if (boost::get(GetAttr("use_global_stats"))) { + op->SetInput("Mean", Output("MeanOut")); + op->SetInput("Variance", Output("VarianceOut")); + } + + op->SetAttrMap(Attrs()); + + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale")); + op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias")); + + return std::unique_ptr(op); +} + +class BatchNormInplaceInToOut : public framework::InplaceOpInference { + public: + std::unordered_map operator()( + const framework::OpDesc &op_desc, bool use_cuda) const override { + return {{"Mean", "MeanOut"}, {"Variance", "VarianceOut"}, {"X", "Y"}}; + } +}; + +class BatchNormGradInplaceInToOut : public framework::InplaceOpInference { + public: + std::unordered_map operator()( + const framework::OpDesc &op_desc, bool use_cuda) const override { + // Scale, Bias, SavedMean, SavedVariance shape is [batch_size, C] + return { + {framework::GradVarName("Y"), framework::GradVarName("X")}, + {"SavedMean", framework::GradVarName("Scale")}, + {"SavedVariance", framework::GradVarName("Bias")}, + }; + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, + ops::BatchNormOpInferVarType, ops::BatchNormGradMaker) +// ops::BatchNormInplaceInToOut); +REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp) +// ops::BatchNormGradInplaceInToOut); + +REGISTER_OP_CPU_KERNEL( + batch_norm, ops::BatchNormKernel, + ops::BatchNormKernel); +REGISTER_OP_CPU_KERNEL( + batch_norm_grad, + ops::BatchNormGradKernel, + ops::BatchNormGradKernel); diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu new file mode 100644 index 00000000..a78a6726 --- /dev/null +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -0,0 +1,418 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include "cub/cub.cuh" +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/operators/batch_norm_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/float16.h" + +// CUDNN_BATCHNORM_SPATIAL_PERSISTENT in batchnorm. This mode can be faster in +// some tasks because an optimized path may be selected for CUDNN_DATA_FLOAT +// and CUDNN_DATA_HALF data types, compute capability 6.0 or higher. The +// reason we set it to false by default is that this mode may use scaled +// atomic integer reduction that may cause a numerical overflow for certain +// input data range. +DEFINE_bool(cudnn_batchnorm_spatial_persistent, false, + "Whether enable CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode for cudnn " + "batch_norm, default is False."); + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DataLayout = framework::DataLayout; +template +using CudnnDataType = platform::CudnnDataType; +template +using BatchNormParamType = typename CudnnDataType::BatchNormParamType; + +template +class BatchNormKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + double epsilon = static_cast(ctx.Attr("epsilon")); + const float momentum = ctx.Attr("momentum"); + const bool is_test = ctx.Attr("is_test"); + const bool use_global_stats = ctx.Attr("use_global_stats"); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + + // Get the size for each dimension. + // NCHW [batch_size, in_channels, in_height, in_width] + const auto *x = ctx.Input("X"); + const auto &x_dims = x->dims(); + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "The Input dim size should be between 2 and 5"); + int N, C, H, W, D; + ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + + auto *y = ctx.Output("Y"); + y->mutable_data(ctx.GetPlace()); + + // ------------------- cudnn descriptors --------------------- + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t bn_param_desc_; + cudnnBatchNormMode_t mode_; + + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); + + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); +#if CUDNN_VERSION_MIN(7, 0, 0) + if (FLAGS_cudnn_batchnorm_spatial_persistent) { + mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; + } else { + mode_ = CUDNN_BATCHNORM_SPATIAL; + } +#else + mode_ = CUDNN_BATCHNORM_SPATIAL; +#endif + + VLOG(3) << "Setting descriptors."; + std::vector dims; + std::vector strides; + if (data_layout == DataLayout::kNCHW) { + dims = {N, C, H, W, D}; + strides = {C * H * W * D, H * W * D, W * D, D, 1}; + } else { + dims = {N, C, H, W, D}; + strides = {H * W * D * C, 1, W * D * C, D * C, C}; + } + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, CudnnDataType::type, + x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); + // Note: PERSISTENT not implemented for inference + CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor( + bn_param_desc_, data_desc_, is_test ? CUDNN_BATCHNORM_SPATIAL : mode_)); + + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + + auto &dev_ctx = ctx.template device_context(); + + auto handle = dev_ctx.cudnn_handle(); + + // Now, depending on whether we are running test or not, we have two paths. + if (is_test || use_global_stats) { + // only when test we use input to do computation. + const auto *est_mean = ctx.Input("Mean"); + const auto *est_var = ctx.Input("Variance"); + // Run inference mode. + PADDLE_ENFORCE_EQ(est_mean->dims().size(), 1UL); + PADDLE_ENFORCE_EQ(est_var->dims().size(), 1UL); + PADDLE_ENFORCE_EQ(est_mean->dims()[0], C); + PADDLE_ENFORCE_EQ(est_var->dims()[0], C); + + CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardInference( + handle, + // Note: PERSISTENT not implemented for inference + CUDNN_BATCHNORM_SPATIAL, CudnnDataType::kOne(), + CudnnDataType::kZero(), data_desc_, x->template data(), + data_desc_, y->template mutable_data(ctx.GetPlace()), + bn_param_desc_, scale->template data>(), + bias->template data>(), + est_mean->template data>(), + est_var->template data>(), epsilon)); + } else { + // Run training mode. + // obtain running mean and running inv var, and see if we need to + // initialize them. + + auto *mean_out = ctx.Output("MeanOut"); + auto *variance_out = ctx.Output("VarianceOut"); + mean_out->mutable_data>(ctx.GetPlace()); + variance_out->mutable_data>(ctx.GetPlace()); + + auto *saved_mean = ctx.Output("SavedMean"); + auto *saved_variance = ctx.Output("SavedVariance"); + saved_mean->mutable_data>(ctx.GetPlace()); + saved_variance->mutable_data>(ctx.GetPlace()); + math::SetConstant> + functor; + functor(dev_ctx, saved_mean, static_cast>(0)); + functor(dev_ctx, saved_variance, static_cast>(0)); + + if ((N * H * W * D) == 1) { + LOG(WARNING) << "Only 1 element in normalization dimension, " + << "we skip the batch norm calculation, let y = x."; + framework::TensorCopy(*x, ctx.GetPlace(), y); + } else { + double this_factor = 1. - momentum; + + CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining( + handle, mode_, CudnnDataType::kOne(), CudnnDataType::kZero(), + data_desc_, x->template data(), data_desc_, + y->template mutable_data(ctx.GetPlace()), bn_param_desc_, + scale->template data>(), + bias->template data>(), this_factor, + mean_out->template mutable_data>( + ctx.GetPlace()), + variance_out->template mutable_data>( + ctx.GetPlace()), + epsilon, saved_mean->template mutable_data>( + ctx.GetPlace()), + saved_variance->template mutable_data>( + ctx.GetPlace()))); + } + } + + // clean when exit. + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); + } +}; + +template +static __global__ void KeBNBackwardScaleBias( + const T *dy, const T *x, const BatchNormParamType *mean, + const BatchNormParamType *variance, const double epsilon, const int N, + const int C, const int HxW, BatchNormParamType *dscale, + BatchNormParamType *dbias) { + const int outer_size = C; + const int inner_size = N * HxW; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage ds_storage; + __shared__ typename BlockReduce::TempStorage db_storage; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType ds_sum = static_cast>(0); + BatchNormParamType db_sum = static_cast>(0); + + BatchNormParamType inv_var_i = 1.0 / sqrt(variance[i] + epsilon); + BatchNormParamType mean_i = mean[i]; + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + ds_sum += static_cast>(dy[index]) * + (static_cast>(x[index]) - mean_i); + db_sum += static_cast>(dy[index]); + } + ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum()); + db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum()); + if (threadIdx.x == 0) { + dscale[i] = ds_sum * inv_var_i; + dbias[i] = db_sum; + } + __syncthreads(); + } +} + +template +static __global__ void KeBNBackwardData(const T *dy, + const BatchNormParamType *scale, + const BatchNormParamType *variance, + const double epsilon, const int C, + const int HxW, const int num, T *dx) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (int i = gid; i < num; i += stride) { + const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C; + BatchNormParamType inv_var = 1.0 / sqrt(variance[c] + epsilon); + dx[i] = static_cast(static_cast>(dy[i]) * + scale[c] * inv_var); + } +} + +template +class BatchNormGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + double epsilon = static_cast(ctx.Attr("epsilon")); + const std::string data_layout_str = ctx.Attr("data_layout"); + const bool use_global_stats = ctx.Attr("use_global_stats"); + + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + const auto *x = ctx.Input("X"); + const auto *d_y = ctx.Input(framework::GradVarName("Y")); + const auto *scale = ctx.Input("Scale"); + + const auto &x_dims = x->dims(); + + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "The Input dim size should be between 2 and 5"); + int N, C, H, W, D; + ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + + // init output + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + + d_x->mutable_data(ctx.GetPlace()); + if (d_scale && d_bias) { + d_scale->mutable_data>(ctx.GetPlace()); + d_bias->mutable_data>(ctx.GetPlace()); + } + PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL); + PADDLE_ENFORCE_EQ(scale->dims()[0], C); + + std::vector dims; + std::vector strides; + if (data_layout == DataLayout::kNCHW) { + dims = {N, C, H, W, D}; + strides = {C * H * W * D, H * W * D, W * D, D, 1}; + } else { + dims = {N, C, H, W, D}; + strides = {H * W * C * D, 1, W * D * C, D * C, C}; + } + + auto &dev_ctx = ctx.template device_context(); + if (!use_global_stats) { + if ((N * H * W * D) == 1) { + framework::TensorCopy(*d_y, ctx.GetPlace(), d_x); + math::SetConstant> + functor; + functor(dev_ctx, d_scale, static_cast>(0)); + functor(dev_ctx, d_bias, static_cast>(0)); + return; + } + + // ------------------- cudnn descriptors --------------------- + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t bn_param_desc_; + cudnnBatchNormMode_t mode_; + + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); +#if CUDNN_VERSION_MIN(7, 0, 0) + if (FLAGS_cudnn_batchnorm_spatial_persistent) { + mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; + } else { + mode_ = CUDNN_BATCHNORM_SPATIAL; + } +#else + mode_ = CUDNN_BATCHNORM_SPATIAL; +#endif + + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, CudnnDataType::type, + x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); + CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor( + bn_param_desc_, data_desc_, mode_)); + + const auto *saved_mean = ctx.Input("SavedMean"); + const auto *saved_var = ctx.Input("SavedVariance"); + const void *saved_mean_data = + saved_mean->template data>(); + const void *saved_var_data = + saved_var->template data>(); + + CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward( + dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), + CudnnDataType::kZero(), CudnnDataType::kOne(), + CudnnDataType::kZero(), data_desc_, x->template data(), + data_desc_, d_y->template data(), data_desc_, + d_x->template mutable_data(ctx.GetPlace()), bn_param_desc_, + scale->template data>(), + d_scale->template mutable_data>(ctx.GetPlace()), + d_bias->template mutable_data>(ctx.GetPlace()), + epsilon, saved_mean_data, saved_var_data)); + + // clean when exit. + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); + } else { + const auto *running_mean = ctx.Input("Mean"); + const auto *running_var = ctx.Input("Variance"); + + const auto *running_mean_data = + running_mean->template data>(); + const auto *running_var_data = + running_var->template data>(); + + const int num = x->numel(); + const int block = 512; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + int grid1 = (num + block - 1) / block; + int grid2 = std::min(C, max_blocks); + + if (data_layout == framework::DataLayout::kNCHW) { + if (d_x) { + KeBNBackwardData<<< + grid1, block, 0, dev_ctx.stream()>>>( + d_y->data(), scale->data>(), + running_var_data, epsilon, C, H * W, num, d_x->data()); + } + if (d_scale && d_bias) { + KeBNBackwardScaleBias<<< + grid2, block, 0, dev_ctx.stream()>>>( + d_y->data(), x->data(), running_mean_data, running_var_data, + epsilon, N, C, H * W * D, d_scale->data>(), + d_bias->data>()); + } + } else { + if (d_x) { + KeBNBackwardData<<< + grid1, block, 0, dev_ctx.stream()>>>( + d_y->data(), scale->data>(), + running_var_data, epsilon, C, H * W, num, d_x->data()); + } + if (d_scale && d_bias) { + KeBNBackwardScaleBias<<< + grid2, block, 0, dev_ctx.stream()>>>( + d_y->data(), x->data(), running_mean_data, running_var_data, + epsilon, N, C, H * W * D, d_scale->data>(), + d_bias->data>()); + } + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_CUDA_KERNEL( + batch_norm, ops::BatchNormKernel, + ops::BatchNormKernel, + ops::BatchNormKernel); +REGISTER_OP_CUDA_KERNEL( + batch_norm_grad, ops::BatchNormGradKernel, + ops::BatchNormGradKernel, + ops::BatchNormGradKernel); diff --git a/paddle/fluid/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h new file mode 100644 index 00000000..6e89d73e --- /dev/null +++ b/paddle/fluid/operators/batch_norm_op.h @@ -0,0 +1,121 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using DataLayout = framework::DataLayout; + +template +using EigenArrayMap = + Eigen::Map>; +template +using ConstEigenArrayMap = + Eigen::Map>; +template +using EigenVectorArrayMap = Eigen::Map>; +template +using ConstEigenVectorArrayMap = + Eigen::Map>; + +class BatchNormOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext *ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override; +}; + +class BatchNormGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext *ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override; +}; + +class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; + +class BatchNormGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override; + + virtual std::string GradOpType() const { + return this->ForwardOpType() + "_grad"; + } +}; + +class BatchNormOpInferVarType + : public framework::PassInDtypeAndVarTypeToOutput { + protected: + std::unordered_map GetInputOutputWithSameType() + const override { + return std::unordered_map{{"X", /*->*/ "Y"}}; + } +}; + +template +class BatchNormKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override; +}; + +template +class BatchNormGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override; +}; + +inline void ExtractNCWHD(const framework::DDim &dims, + const DataLayout &data_layout, int *N, int *C, int *H, + int *W, int *D) { + *N = dims[0]; + if (dims.size() == 2) { + *C = dims[1]; + *H = 1; + *W = 1; + *D = 1; + } else { + *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; + *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; + *W = dims.size() > 3 + ? (data_layout == DataLayout::kNCHW ? dims[3] : dims[2]) + : 1; + *D = dims.size() > 4 + ? (data_layout == DataLayout::kNCHW ? dims[4] : dims[3]) + : 1; + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/batch_size_like.h b/paddle/fluid/operators/batch_size_like.h new file mode 100644 index 00000000..7e2740f1 --- /dev/null +++ b/paddle/fluid/operators/batch_size_like.h @@ -0,0 +1,81 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +class BatchSizeLikeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of %s should not be null.", Type()); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of %s should not be null.", Type()); + + auto &shape = ctx->Attrs().Get>("shape"); + PADDLE_ENFORCE_GT(shape.size(), 0); + std::vector shape_int64(shape.size(), 0); + std::transform(shape.begin(), shape.end(), shape_int64.begin(), + [](int a) { return static_cast(a); }); + auto output_dim = framework::make_ddim(shape_int64); + + int input_dim_idx = ctx->Attrs().Get("input_dim_idx"); + PADDLE_ENFORCE_GE(input_dim_idx, 0); + PADDLE_ENFORCE_GT(ctx->GetInputDim("Input").size(), input_dim_idx); + + int output_dim_idx = ctx->Attrs().Get("output_dim_idx"); + PADDLE_ENFORCE_GE(output_dim_idx, 0); + PADDLE_ENFORCE_GT(static_cast(shape.size()), output_dim_idx); + + output_dim[output_dim_idx] = ctx->GetInputDim("Input")[input_dim_idx]; + ctx->SetOutputDim("Out", output_dim); + } +}; + +class BatchSizeLikeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() final { + AddInput( + "Input", + "Tensor whose input_dim_idx'th dimension specifies the batch_size"); + AddOutput("Out", + "Tensor of specified shape will be filled " + "with the specified value"); + AddAttr>("shape", "The shape of the output"); + AddAttr("input_dim_idx", + "default 0. The index of input's batch size dimension") + .SetDefault(0); + AddAttr("output_dim_idx", + "default 0. The index of output's batch size dimension") + .SetDefault(0); + Apply(); + } + + protected: + virtual void Apply() = 0; +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(BatchSizeLikeNoNeedBufferVarsInference, + "Input"); + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc new file mode 100644 index 00000000..4cef4928 --- /dev/null +++ b/paddle/fluid/operators/beam_search_decode_op.cc @@ -0,0 +1,223 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/fluid/operators/beam_search_decode_op.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +struct BeamSearchDecodeFunctor { + BeamSearchDecodeFunctor(const LoDTensorArray& step_ids, + const LoDTensorArray& step_scores, + LoDTensor* id_tensor, LoDTensor* score_tensor, + size_t beam_size, int end_id) + : beam_size_(beam_size), + end_id_(end_id), + step_ids_origin_(step_ids), + step_scores_origin_(step_scores), + id_tensor_(id_tensor), + score_tensor_(score_tensor) { + tensor_on_gpu_ = false; + // First make a copy of GPU data on CPU + if (platform::is_gpu_place(step_ids_origin_[0].place())) { + tensor_on_gpu_ = true; + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(step_ids_origin_[0].place()); + // Copy all tensors in the input tensor array + for (auto& step_id : step_ids_origin_) { + framework::LoDTensor out; + if (step_id.numel() > 0) { + dev_ctx->Wait(); + framework::TensorCopy(step_id, platform::CPUPlace(), *dev_ctx, &out); + dev_ctx->Wait(); + } + + out.set_lod(step_id.lod()); + step_ids_.push_back(out); + } + } + if (platform::is_gpu_place(step_scores_origin_[0].place())) { + tensor_on_gpu_ = true; + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(step_scores_origin_[0].place()); + // Copy all tensors in the input tensor array + for (auto& step_score : step_scores_origin_) { + framework::LoDTensor out; + if (step_score.numel() > 0) { + dev_ctx->Wait(); + framework::TensorCopy(step_score, platform::CPUPlace(), *dev_ctx, + &out); + dev_ctx->Wait(); + } + + out.set_lod(step_score.lod()); + step_scores_.push_back(out); + } + } + } + + template + void apply() const; + + bool tensor_on_gpu_; + size_t beam_size_; + int end_id_; + // TODO(Superjomn) Here might result serious performance issue in the + // concurrency + // scenarios. + const LoDTensorArray& step_ids_origin_; + const LoDTensorArray& step_scores_origin_; + LoDTensorArray step_ids_ = LoDTensorArray(); + LoDTensorArray step_scores_ = LoDTensorArray(); + LoDTensor* id_tensor_; + LoDTensor* score_tensor_; +}; + +template +void BeamSearchDecodeFunctor::apply() const { + BeamSearchDecoder beam_search_decoder(beam_size_, end_id_); + // Check if the tensor is on GPU. If so, use the CPU copy instead + if (tensor_on_gpu_) { + beam_search_decoder.Backtrace(step_ids_, step_scores_, id_tensor_, + score_tensor_); + } else { + beam_search_decoder.Backtrace(step_ids_origin_, step_scores_origin_, + id_tensor_, score_tensor_); + } +} + +template <> +void BeamSearchDecodeFunctor::apply() const { + PADDLE_THROW("beam search decode op does not support bool!"); +} + +class BeamSearchDecodeOp : public framework::OperatorBase { + public: + BeamSearchDecodeOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& dev_ctx = *pool.Get(dev_place); + + framework::RuntimeContext run_ctx(Inputs(), Outputs(), scope); + framework::ExecutionContext ctx(*this, scope, dev_ctx, run_ctx, nullptr); + + const LoDTensorArray* ids = ctx.Input("Ids"); + const LoDTensorArray* scores = ctx.Input("Scores"); + const size_t step_num = ids->size(); + PADDLE_ENFORCE_GT(step_num, 0UL, + "beam search steps should be larger than 0"); + const size_t source_num = ids->at(0).lod().at(0).size() - 1; + PADDLE_ENFORCE_GT(source_num, 0UL, "source num should be larger than 0"); + + for (size_t i = 0; i < step_num; ++i) { + PADDLE_ENFORCE_EQ(ids->at(i).lod().size(), 2UL, + "Level of LodTensor should be 2"); + } + + size_t beam_size = ctx.Attr("beam_size"); + int end_id = ctx.Attr("end_id"); + + // prepare output + LoDTensor* sentenceIds = ctx.Output("SentenceIds"); + LoDTensor* sentenceScores = ctx.Output("SentenceScores"); + + framework::VisitDataType( + scores->at(0).type(), + BeamSearchDecodeFunctor(*ids, *scores, sentenceIds, sentenceScores, + beam_size, end_id)); + } +}; + +class BeamSearchDecodeOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Ids", + "(LodTensorArray)" + "The LodTensorArray containing the selected ids of all steps"); + AddInput("Scores", + "(LodTensorArray)" + "The LodTensorArray containing the selected scores of all steps"); + AddOutput( + "SentenceIds", + "(LodTensor)" + "An LodTensor containing all generated id sequences for all source " + "sentences"); + AddOutput( + "SentenceScores", + "(LodTensor)" + "An LodTensor containing scores corresponding to Output(SentenceIds)"); + AddAttr("beam_size", "beam size for beam search"); + AddAttr("end_id", + "the token id which indicates the end of a sequence"); + AddComment(R"DOC( +Beam Search Decode Operator. This Operator constructs the full hypotheses for +each source sentence by walking back along the LoDTensorArray Input(ids) +whose lods can be used to restore the path in the beam search tree. + +The Output(SentenceIds) and Output(SentenceScores) separately contain the +generated id sequences and the corresponding scores. The shapes and lods of the +two LodTensor are same. The lod level is 2 and the two levels separately +indicate how many hypotheses each source sentence has and how many ids each +hypothesis has. +)DOC"); + } +}; + +class BeamSearchDecodeInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* context) const override { + PADDLE_ENFORCE(context->HasInput("Ids"), + "BeamSearchDecodeOp must has input Ids"); + PADDLE_ENFORCE(context->HasInput("Scores"), + "BeamSearchDecodeOp must has input Scores"); + PADDLE_ENFORCE(context->HasOutput("SentenceIds"), + "BeamSearchDecodeOp must has output SentenceIds"); + PADDLE_ENFORCE(context->HasOutput("SentenceScores"), + "BeamSearchDecodeOp must has output SentenceScores"); + } +}; + +class BeamSearchDecodeInferVarType : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext* ctx) const override { + for (auto& o : ctx->Output("SentenceIds")) { + ctx->SetType(o, framework::proto::VarType::LOD_TENSOR); + } + for (auto& o : ctx->Output("SentenceScores")) { + ctx->SetType(o, framework::proto::VarType::LOD_TENSOR); + } + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(beam_search_decode, paddle::operators::BeamSearchDecodeOp, + paddle::operators::BeamSearchDecodeOpProtoMaker, + paddle::operators::BeamSearchDecodeInferShape, + paddle::operators::BeamSearchDecodeInferVarType, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/beam_search_decode_op.h b/paddle/fluid/operators/beam_search_decode_op.h new file mode 100644 index 00000000..0b883c31 --- /dev/null +++ b/paddle/fluid/operators/beam_search_decode_op.h @@ -0,0 +1,215 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using LoDTensorArray = framework::LoDTensorArray; + +// all the lod have 2 levels. +// The first is source level, the second is sentence level. +// source level describe how many prefixes (branchs) for each source sentece +// (beam). sentence level describe how these candidates belong to the prefixes. +const size_t kSourceLevel = 0; +const size_t kSentenceLevel = 1; + +template +struct Sentence { + std::vector word_ids; + std::vector scores; +}; + +template +using SentenceVector = std::vector>; + +template +struct BeamSearchDecoder { + BeamSearchDecoder(size_t beam_size, int end_id) + : beam_size_(beam_size), end_id_(end_id) {} + + /** + * convert the result sentence_vector for each source sentence into two + * LodTensor. + * One is all candidate sentences with word id, one is all candidate sentences + * with word score. + * Param: + * sentence_vector_list: sentence_vector for each source sentence. + * id_tensor: result LoDTensor for sentences of id. + * score_tensor: result LoDTensor for sentences of score. + * reverse: whether ids of sentence in sentence_vector_list is reversed + * sort_by_score: whether to sort hypotheses of each sentence by scores. + */ + void ConvertSentenceVectorToLodTensor( + std::vector> sentence_vector_list, LoDTensor* id_tensor, + LoDTensor* score_tensor, bool reverse = true, + bool sort_by_score = true) const; + + /** + * Gather the hypotheses for each source sentence by backtrace though the + * LoDTensorArray step_ids whose lods reserve the path in the tree. + */ + void Backtrace(const LoDTensorArray& step_ids, + const LoDTensorArray& step_scores, LoDTensor* id_tensor, + LoDTensor* score_tensor) const; + + size_t beam_size_; + int end_id_; +}; + +template +void BeamSearchDecoder::ConvertSentenceVectorToLodTensor( + std::vector> sentence_vector_list, LoDTensor* id_tensor, + LoDTensor* score_tensor, bool reverse, bool sort_by_score) const { + size_t src_num = sentence_vector_list.size(); + + PADDLE_ENFORCE_NE(src_num, 0, "src_num should not be 0"); + + std::vector source_level_lod = {0}; + std::vector sentence_level_lod = {0}; + std::vector id_data; + std::vector score_data; + + for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { + if (sort_by_score) { + sort(sentence_vector_list[src_idx].begin(), + sentence_vector_list[src_idx].end(), + [reverse](const Sentence& a, const Sentence& b) { + if (reverse) + return a.scores.front() > b.scores.front(); + else + return a.scores.back() > b.scores.back(); + }); + } + for (Sentence& sentence : sentence_vector_list[src_idx]) { + if (reverse) { + id_data.insert(id_data.end(), sentence.word_ids.rbegin(), + sentence.word_ids.rend()); + score_data.insert(score_data.end(), sentence.scores.rbegin(), + sentence.scores.rend()); + } else { + id_data.insert(id_data.end(), sentence.word_ids.begin(), + sentence.word_ids.end()); + score_data.insert(score_data.end(), sentence.scores.begin(), + sentence.scores.end()); + } + + sentence_level_lod.push_back(sentence_level_lod.back() + + sentence.word_ids.size()); + } + source_level_lod.push_back(source_level_lod.back() + + sentence_vector_list[src_idx].size()); + } + + auto cpu_place = std::unique_ptr( + new paddle::platform::CPUPlace()); + paddle::platform::CPUDeviceContext cpu_ctx(*cpu_place); + + framework::LoD lod; + lod.push_back(source_level_lod); + lod.push_back(sentence_level_lod); + + id_tensor->set_lod(lod); + id_tensor->Resize({static_cast(id_data.size())}); + id_tensor->mutable_data(paddle::platform::CPUPlace()); + framework::TensorFromVector(id_data, cpu_ctx, id_tensor); + + score_tensor->set_lod(lod); + score_tensor->Resize({static_cast(score_data.size())}); + score_tensor->mutable_data(paddle::platform::CPUPlace()); + framework::TensorFromVector(score_data, cpu_ctx, score_tensor); +} + +template +void BeamSearchDecoder::Backtrace(const LoDTensorArray& step_ids, + const LoDTensorArray& step_scores, + LoDTensor* id_tensor, + LoDTensor* score_tensor) const { + PADDLE_ENFORCE(!step_ids.empty(), "step num should be larger than 0"); + PADDLE_ENFORCE_EQ(step_ids.size(), step_scores.size(), + "step_ids and step_scores should be the same"); + const size_t step_num = step_ids.size(); + const size_t src_num = step_ids.at(0).lod().at(kSourceLevel).size() - 1; + std::vector> sentence_vector_list( + src_num, SentenceVector(beam_size_)); + std::vector> prefix_idx_vector_list(src_num); + for (int step_id = step_num - 1; step_id >= 0; --step_id) { + auto& cur_ids = step_ids.at(step_id); + auto& cur_scores = step_scores.at(step_id); + for (size_t src_idx = 0; src_idx < src_num; ++src_idx) { + // for each source sentence + auto& sentence_vector = sentence_vector_list.at(src_idx); + auto& prefix_idx_vector = prefix_idx_vector_list.at(src_idx); + size_t src_prefix_start = cur_ids.lod().at(kSourceLevel)[src_idx]; + size_t src_prefix_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1]; + if (prefix_idx_vector.empty()) { // be finished and pruned at this step + // or the last time step + for (size_t prefix_idx = src_prefix_start; prefix_idx < src_prefix_end; + ++prefix_idx) { + size_t candidate_start = cur_ids.lod().at(kSentenceLevel)[prefix_idx]; + size_t candidate_end = + cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1]; + for (size_t candidate_idx = candidate_start; + candidate_idx < candidate_end; ++candidate_idx) { + prefix_idx_vector.push_back(prefix_idx); + size_t idx = prefix_idx_vector.size() - 1; + auto cur_id = cur_ids.data()[candidate_idx]; + auto cur_score = cur_scores.data()[candidate_idx]; + sentence_vector.at(idx).word_ids.push_back(cur_id); + sentence_vector.at(idx).scores.push_back(cur_score); + } + } + } else { // use prefix_idx_vector to backtrace + size_t src_candidate_start = + cur_ids.lod().at(kSentenceLevel)[src_prefix_start]; + size_t prefix_idx = src_prefix_start; + size_t candidate_num = + cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] - + cur_ids.lod().at(kSentenceLevel)[prefix_idx]; + for (size_t idx = 0; idx < prefix_idx_vector.size(); ++idx) { + auto candidate_idx = prefix_idx_vector.at(idx); + auto cur_id = cur_ids.data()[candidate_idx]; + auto cur_score = cur_scores.data()[candidate_idx]; + if (cur_id != end_id_ || sentence_vector.at(idx).word_ids.empty()) { + // to skip redundant end tokens + sentence_vector.at(idx).word_ids.push_back(cur_id); + sentence_vector.at(idx).scores.push_back(cur_score); + } + + while (src_candidate_start + candidate_num <= + candidate_idx) { // search the corresponding prefix + prefix_idx++; + candidate_num += cur_ids.lod().at(kSentenceLevel)[prefix_idx + 1] - + cur_ids.lod().at(kSentenceLevel)[prefix_idx]; + } + prefix_idx_vector.at(idx) = prefix_idx; + } + } + } + } + + ConvertSentenceVectorToLodTensor(sentence_vector_list, id_tensor, + score_tensor, true, true); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/beam_search_decode_op_test.cc b/paddle/fluid/operators/beam_search_decode_op_test.cc new file mode 100644 index 00000000..88339e38 --- /dev/null +++ b/paddle/fluid/operators/beam_search_decode_op_test.cc @@ -0,0 +1,129 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/beam_search_decode_op.h" +#include "gtest/gtest.h" + +using CPUPlace = paddle::platform::CPUPlace; +using LoD = paddle::framework::LoD; +using LoDTensor = paddle::framework::LoDTensor; +using LoDTensorArray = paddle::framework::LoDTensorArray; + +template +using BeamSearchDecoder = paddle::operators::BeamSearchDecoder; +template +using Sentence = paddle::operators::Sentence; +template +using SentenceVector = paddle::operators::SentenceVector; + +namespace paddle { +namespace test { + +void GenerateExample(const std::vector& level_0, + const std::vector& level_1, + const std::vector& data, LoDTensorArray* ids, + LoDTensorArray* scores) { + PADDLE_ENFORCE_EQ(level_0.back(), level_1.size() - 1, + "source level is used to describe candidate set"); + PADDLE_ENFORCE_EQ(level_1.back(), data.size(), + "the lowest level is used to describe data" + ", so it's last element should be data length"); + + CPUPlace place; + + LoD lod; + lod.push_back(level_0); + lod.push_back(level_1); + + // Ids + LoDTensor tensor_id; + tensor_id.set_lod(lod); + tensor_id.Resize({static_cast(data.size())}); + // malloc memory + int64_t* id_ptr = tensor_id.mutable_data(place); + for (size_t i = 0; i < data.size(); ++i) { + id_ptr[i] = static_cast(data.at(i)); + } + + // Scores + LoDTensor tensor_score; + tensor_score.set_lod(lod); + tensor_score.Resize({static_cast(data.size())}); + // malloc memory + float* score_ptr = tensor_score.mutable_data(place); + for (size_t i = 0; i < data.size(); ++i) { + score_ptr[i] = static_cast(data.at(i)); + } + + ids->push_back(tensor_id); + scores->push_back(tensor_score); +} + +} // namespace test +} // namespace paddle + +TEST(BeamSearchDecodeOp, Backtrace) { + CPUPlace place; + + // Construct sample data with 5 steps and 2 source sentences + // beam_size = 2, start_id = 0, end_id = 1 + LoDTensorArray ids; + LoDTensorArray scores; + + paddle::test::GenerateExample( + std::vector{0, 1, 2}, std::vector{0, 1, 2}, + std::vector{0, 0}, &ids, &scores); // start with start_id + paddle::test::GenerateExample(std::vector{0, 1, 2}, + std::vector{0, 2, 4}, + std::vector{2, 3, 4, 5}, &ids, &scores); + paddle::test::GenerateExample(std::vector{0, 2, 4}, + std::vector{0, 2, 2, 4, 4}, + std::vector{3, 1, 5, 4}, &ids, &scores); + paddle::test::GenerateExample(std::vector{0, 2, 4}, + std::vector{0, 1, 2, 3, 4}, + std::vector{1, 1, 3, 5}, &ids, &scores); + paddle::test::GenerateExample( + std::vector{0, 2, 4}, + std::vector{0, 0, 0, 2, + 2}, // the branchs of the first source sentence + // are pruned since finished + std::vector{5, 1}, + &ids, &scores); + + ASSERT_EQ(ids.size(), 5UL); + ASSERT_EQ(scores.size(), 5UL); + + BeamSearchDecoder helper(2, 1); // beam_size = 2, end_id = 1 + + LoDTensor id_tensor; + LoDTensor score_tensor; + helper.Backtrace(ids, scores, &id_tensor, &score_tensor); + + LoD lod = id_tensor.lod(); + std::vector expect_source_lod = {0, 2, 4}; + EXPECT_EQ(lod[0], expect_source_lod); + std::vector expect_sentence_lod = {0, 4, 7, 12, 17}; + EXPECT_EQ(lod[1], expect_sentence_lod); + std::vector expect_data = {0, 2, 3, 1, 0, 2, 1, 0, 4, + 5, 3, 5, 0, 4, 5, 3, 1}; + ASSERT_EQ(id_tensor.dims()[0], static_cast(expect_data.size())); + for (size_t i = 0; i < expect_data.size(); ++i) { + ASSERT_EQ(id_tensor.data()[i], + static_cast(expect_data[i])); + } + for (int64_t i = 0; i < id_tensor.dims()[0]; ++i) { + ASSERT_EQ(score_tensor.data()[i], + static_cast(id_tensor.data()[i])); + } +} diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc new file mode 100644 index 00000000..a6aa35e0 --- /dev/null +++ b/paddle/fluid/operators/beam_search_op.cc @@ -0,0 +1,145 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/beam_search_op.h" + +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + // inputs and outputs stored in proto + AddInput("pre_ids", + "(LoDTensor) The LoDTensor containing the selected ids at the " + "previous step. It should be a tensor with shape (batch_size, 1) " + "and lod `[[0, 1, ... , batch_size], [0, 1, ..., batch_size]]` at " + "the first step."); + AddInput("pre_scores", + "(LoDTensor) The LoDTensor containing the accumulated " + "scores corresponding to the selected ids at the previous step."); + AddInput("ids", + "(LoDTensor) The LoDTensor containing the candidates ids. Its " + "shape should be (batch_size * beam_size, W). If not set, it will " + "be calculated out according to Input(scores) in this operator.") + .AsDispensable(); + AddInput("scores", + "(LoDTensor) The LoDTensor containing the current scores " + "corresponding to Input(ids). If Input(ids) is not nullptr, its " + "shape is the same as that of Input(ids)." + "If is_accumulated is true, Input(scores) is accumulated scores " + "and will be used derectedly. Else, each score will be " + "transformed to the log field and accumulate Input(pre_sores) " + "first."); + AddOutput("selected_ids", + "A LodTensor that stores the IDs selected by beam search."); + AddOutput("selected_scores", + "A LoDTensor containing the accumulated scores corresponding to " + "Output(selected_ids)."); + AddOutput("parent_idx", + "A Tensor preserving the selected_ids' parent indice in pre_ids.") + .AsDispensable(); + + // Attributes stored in AttributeMap + AddAttr("level", "the level of LoDTensor"); + AddAttr("beam_size", "beam size for beam search"); + AddAttr("end_id", + "the token id which indicates the end of a sequence"); + AddAttr("is_accumulated", + "Whether the Input(scores) is accumulated scores.") + .SetDefault(true); + + AddComment(R"DOC( +This operator does the search in beams for one time step. +Specifically, it selects the top-K candidate word ids of current step from +Input(ids) according to their Input(scores) for all source sentences, +where K is Attr(beam_size) and Input(ids), Input(scores) are predicted results +from the computation cell. Additionally, Input(pre_ids) and Input(pre_scores) +are the output of beam_search at previous step, they are needed for special use +to handle ended candidate translations. The paths linking prefixes and selected +candidates are organized and reserved in lod. + +Note that the Input(scores) passed in should be accumulated scores, and +length penalty should be done with extra operators before calculating the +accumulated scores if needed, also suggest finding top-K before it and +using the top-K candidates following. +)DOC"); + } +}; + +class BeamSearchOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + for (const std::string &arg : + std::vector({"pre_ids", "scores"})) { + PADDLE_ENFORCE(ctx->HasInput(arg), "BeamSearch need input argument '%s'", + arg); + } + for (const std::string &arg : + std::vector({"selected_ids", "selected_scores"})) { + PADDLE_ENFORCE(ctx->HasOutput(arg), + "BeamSearch need output argument '%s'", arg); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto *scores = ctx.Input("scores"); + size_t level = ctx.Attr("level"); + size_t batch_size = scores->lod()[level].size() - 1; + // The current CUDA kernel only support cases with batch_size < 4. + // Compute on CPU for cases with batch_size > 4. + if (batch_size <= 4) { + return framework::OpKernelType( + ctx.Input("pre_ids")->type(), ctx.GetPlace()); + } else { + return framework::OpKernelType( + ctx.Input("pre_ids")->type(), + platform::CPUPlace()); + } + } +}; + +class BeamSearchInferVarType : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + for (auto &o : ctx->Output("selected_ids")) { + ctx->SetType(o, framework::proto::VarType::LOD_TENSOR); + } + for (auto &o : ctx->Output("selected_scores")) { + ctx->SetType(o, framework::proto::VarType::LOD_TENSOR); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(beam_search, ops::BeamSearchOp, ops::BeamSearchOpMaker, + ops::BeamSearchInferVarType); +REGISTER_OP_CPU_KERNEL( + beam_search, + ops::BeamSearchOpKernel, + ops::BeamSearchOpKernel, + ops::BeamSearchOpKernel, + ops::BeamSearchOpKernel); diff --git a/paddle/fluid/operators/beam_search_op.cu.cc b/paddle/fluid/operators/beam_search_op.cu.cc new file mode 100644 index 00000000..4ef9476e --- /dev/null +++ b/paddle/fluid/operators/beam_search_op.cu.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/beam_search_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + beam_search, + ops::BeamSearchOpKernel, + ops::BeamSearchOpKernel, + ops::BeamSearchOpKernel, + ops::BeamSearchOpKernel); diff --git a/paddle/fluid/operators/beam_search_op.h b/paddle/fluid/operators/beam_search_op.h new file mode 100644 index 00000000..3d32ea0c --- /dev/null +++ b/paddle/fluid/operators/beam_search_op.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/math/beam_search.h" + +namespace paddle { +namespace operators { + +template +class BeamSearchOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* ids = context.Input("ids"); + auto* scores = context.Input("scores"); + auto* pre_ids = context.Input("pre_ids"); + auto* pre_scores = context.Input("pre_scores"); + + PADDLE_ENFORCE_NOT_NULL(scores); + PADDLE_ENFORCE_NOT_NULL(pre_ids); + PADDLE_ENFORCE_NOT_NULL(pre_scores); + + size_t level = context.Attr("level"); + size_t beam_size = context.Attr("beam_size"); + int end_id = context.Attr("end_id"); + bool is_accumulated = context.Attr("is_accumulated"); + + auto selected_ids = context.Output("selected_ids"); + auto selected_scores = + context.Output("selected_scores"); + auto* parent_idx = context.Output("parent_idx"); + PADDLE_ENFORCE_NOT_NULL(selected_ids); + PADDLE_ENFORCE_NOT_NULL(selected_scores); + + math::BeamSearchFunctor alg; + alg(context.template device_context(), pre_ids, pre_scores, + ids, scores, selected_ids, selected_scores, parent_idx, level, + beam_size, end_id, is_accumulated); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/benchmark/CMakeLists.txt b/paddle/fluid/operators/benchmark/CMakeLists.txt new file mode 100644 index 00000000..54008336 --- /dev/null +++ b/paddle/fluid/operators/benchmark/CMakeLists.txt @@ -0,0 +1,3 @@ +cc_test(op_tester SRCS op_tester.cc op_tester_config.cc + DEPS memory timer framework_proto proto_desc lod_tensor op_registry + device_context scope ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) diff --git a/paddle/fluid/operators/benchmark/op_tester.cc b/paddle/fluid/operators/benchmark/op_tester.cc new file mode 100644 index 00000000..ac487223 --- /dev/null +++ b/paddle/fluid/operators/benchmark/op_tester.cc @@ -0,0 +1,522 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/benchmark/op_tester.h" +#include +#include "gflags/gflags.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/platform/init.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/timer.h" +#include "paddle/fluid/pybind/pybind.h" + +namespace paddle { +namespace operators { +namespace benchmark { + +DEFINE_string(op_config_list, "", "Path of op config file."); +DEFINE_int32(specified_config_id, -1, "Test the specified op config."); + +void OpTester::Init(const std::string &filename) { + Init(OpTesterConfig(filename)); +} + +void OpTester::Init(const OpTesterConfig &config) { + config_ = config; + + auto &op_desc_info = framework::OpInfoMap::Instance(); + // Initialize the OpDesc + if (op_desc_info.Has(config_.op_type)) { + type_ = config_.op_type; + + CreateOpDesc(); + CreateInputVarDesc(); + CreateOutputVarDesc(); + } else { + LOG(FATAL) << "Op \"" << config_.op_type << "\" is not registered."; + } + + if (config_.device_id >= 0) { + place_ = paddle::platform::CUDAPlace(config_.device_id); + } else { + place_ = paddle::platform::CPUPlace(); + } + + framework::InitDevices(false); + scope_.reset(new paddle::framework::Scope()); + + op_ = framework::OpRegistry::CreateOp(op_desc_); + CreateVariables(scope_.get()); +} + +void OpTester::Run() { + if (config_.print_debug_string) { + LOG(INFO) << DebugString(); + } + + // Warm up + RunImpl(); + + platform::Timer timer; + if (config_.profile) { + if (platform::is_cpu_place(place_)) { + platform::EnableProfiler(platform::ProfilerState::kCPU); + } else { +#ifdef PADDLE_WITH_CUDA + platform::EnableProfiler(platform::ProfilerState::kAll); + platform::SetDeviceId(config_.device_id); +#else + PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); +#endif + } + + timer.Start(); + for (int i = config_.repeat; i > 0; --i) { + RunImpl(); + } + timer.Pause(); + platform::DisableProfiler(platform::EventSortingKey::kDefault, + "op_tester_profiler"); + } else { + timer.Start(); + for (int i = config_.repeat; i > 0; --i) { + RunImpl(); + } + timer.Pause(); + } + config_.runtime = timer.ElapsedMS() / config_.repeat; + LOG(INFO) << "=== Run " << config_.repeat + << " times, latency: " << config_.runtime << " ms ==="; +} + +void OpTester::RunImpl() { + op_->Run(*scope_, place_); + platform::DeviceContextPool::Instance().Get(place_)->Wait(); + scope_->DropKids(); +} + +std::vector OpTester::GetOpProtoInputNames() { + std::vector input_names; + const framework::proto::OpProto &proto = + framework::OpInfoMap::Instance().Get(type_).Proto(); + for (int i = 0; i != proto.inputs_size(); ++i) { + const auto &input = proto.inputs(i); + input_names.push_back(input.name()); + } + return input_names; +} + +std::vector OpTester::GetOpProtoOutputNames() { + std::vector output_names; + const framework::proto::OpProto &proto = + framework::OpInfoMap::Instance().Get(type_).Proto(); + for (int i = 0; i != proto.outputs_size(); ++i) { + const auto &output = proto.outputs(i); + output_names.push_back(output.name()); + } + return output_names; +} + +std::unordered_map +OpTester::GetOpProtoAttrNames() { + std::unordered_map attr_types; + const framework::proto::OpProto &proto = + framework::OpInfoMap::Instance().Get(type_).Proto(); + const std::vector skipped_attrs = { + framework::OpProtoAndCheckerMaker::OpRoleAttrName(), + framework::OpProtoAndCheckerMaker::OpRoleVarAttrName(), + framework::OpProtoAndCheckerMaker::OpNamescopeAttrName(), + framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName()}; + for (int i = 0; i != proto.attrs_size(); ++i) { + const auto &attr = proto.attrs(i); + if (!Has(skipped_attrs, attr.name())) { + VLOG(4) << "attr: " << attr.name() << ", type: " << attr.type(); + attr_types[attr.name()] = attr.type(); + } + } + return attr_types; +} + +framework::proto::VarType::Type OpTester::TransToVarType(std::string str) { + if (str == "int32") { + return framework::proto::VarType::INT32; + } else if (str == "int64") { + return framework::proto::VarType::INT64; + } else if (str == "fp32") { + return framework::proto::VarType::FP32; + } else if (str == "fp64") { + return framework::proto::VarType::FP64; + } else { + PADDLE_THROW("Unsupported dtype %s.", str.c_str()); + } +} + +void OpTester::CreateInputVarDesc() { + std::vector input_names = GetOpProtoInputNames(); + for (auto &name : input_names) { + const OpInputConfig *input = config_.GetInput(name); + if (input == nullptr) { + LOG(FATAL) << "The input " << name << " of op " << config_.op_type + << " is not correctlly provided."; + } + + std::string var_name = config_.op_type + "." + name; + framework::VarDesc *var = Var(var_name); + // Need to support more type + var->SetType(framework::proto::VarType::LOD_TENSOR); + var->SetPersistable(false); + var->SetDataType(TransToVarType(input->dtype)); + var->SetShape(input->dims); + + op_desc_.SetInput(name, {var_name}); + inputs_[var_name] = *input; + } +} + +void OpTester::CreateOutputVarDesc() { + std::vector output_names = GetOpProtoOutputNames(); + for (auto &name : output_names) { + std::string var_name = config_.op_type + "." + name; + framework::VarDesc *var = Var(var_name); + // Need to support more type + var->SetType(framework::proto::VarType::LOD_TENSOR); + var->SetPersistable(false); + var->SetDataType(framework::proto::VarType::FP32); + + op_desc_.SetOutput(name, {var_name}); + } +} + +void OpTester::CreateOpDesc() { + op_desc_.SetType(config_.op_type); + std::unordered_map attr_types = + GetOpProtoAttrNames(); + for (auto item : config_.attrs) { + const std::string &name = item.first; + if (attr_types.find(name) == attr_types.end()) { + LOG(FATAL) << "Operator " << type_ << " do not have attr " << name; + } + + const std::string &value_str = item.second; + const framework::proto::AttrType &type = attr_types[name]; + switch (type) { + case framework::proto::AttrType::BOOLEAN: + break; + case framework::proto::AttrType::INT: { + int value = StringTo(value_str); + op_desc_.SetAttr(name, {value}); + } break; + case framework::proto::AttrType::FLOAT: { + float value = StringTo(value_str); + op_desc_.SetAttr(name, {value}); + } break; + case framework::proto::AttrType::STRING: { + op_desc_.SetAttr(name, {value_str}); + } break; + case framework::proto::AttrType::BOOLEANS: + case framework::proto::AttrType::INTS: + case framework::proto::AttrType::FLOATS: + case framework::proto::AttrType::STRINGS: + LOG(FATAL) << "Not supported yet."; + break; + case framework::proto::AttrType::LONG: { + int64_t value = StringTo(value_str); + op_desc_.SetAttr(name, value); + } break; + case framework::proto::AttrType::LONGS: + default: + PADDLE_THROW("Unsupport attr type %d", type); + } + } +} + +framework::VarDesc *OpTester::Var(const std::string &name) { + auto it = vars_.find(name); + if (it != vars_.end()) { + return it->second.get(); + } + auto *var = new framework::VarDesc(name); + vars_[name].reset(var); + return var; +} + +template +void OpTester::SetupTensor(framework::LoDTensor *tensor, + const std::vector &shape, T lower, T upper, + const std::string &initializer, + const std::string &filename) { + static unsigned int seed = 100; + std::mt19937 rng(seed++); + std::uniform_real_distribution uniform_dist(0, 1); + + T *ptr = tensor->mutable_data(framework::make_ddim(shape), place_); + + framework::LoDTensor cpu_tensor; + T *cpu_ptr = nullptr; + + if (!platform::is_cpu_place(place_)) { + cpu_ptr = cpu_tensor.mutable_data(framework::make_ddim(shape), + platform::CPUPlace()); + } else { + cpu_ptr = ptr; + } + + if (initializer == "random") { + for (int i = 0; i < cpu_tensor.numel(); ++i) { + cpu_ptr[i] = static_cast(uniform_dist(rng) * (upper - lower) + lower); + } + } else if (initializer == "natural") { + for (int i = 0; i < cpu_tensor.numel(); ++i) { + cpu_ptr[i] = static_cast(lower + i); + } + } else if (initializer == "zeros") { + for (int i = 0; i < cpu_tensor.numel(); ++i) { + cpu_ptr[i] = static_cast(0); + } + } else if (initializer == "file") { + std::ifstream is(filename); + for (size_t i = 0; i < cpu_tensor.numel(); ++i) { + T value; + is >> value; + cpu_ptr[i] = static_cast(value); + } + is.close(); + } else { + PADDLE_THROW("Unsupported initializer %s.", initializer.c_str()); + } + + if (!platform::is_cpu_place(place_)) { + TensorCopySync(cpu_tensor, place_, tensor); + } +} + +void OpTester::CreateVariables(framework::Scope *scope) { + for (auto &item : vars_) { + auto &var = item.second; + if (var->Name() == framework::kEmptyVarName) { + continue; + } + + auto *ptr = scope->Var(var->Name()); + framework::InitializeVariable(ptr, var->GetType()); + if (var->Persistable()) { + VLOG(3) << "Create Variable " << var->Name() + << " global, which pointer is " << ptr; + } else { + VLOG(3) << "Create Variable " << var->Name() + << " locally, which pointer is " << ptr; + } + } + + for (auto &item : inputs_) { + // Allocate memory for input tensor + auto &var_name = item.first; + VLOG(3) << "Allocate memory for tensor " << var_name; + + auto &var_desc = vars_[var_name]; + std::vector shape = var_desc->GetShape(); + + auto *var = scope->Var(var_name); + auto *tensor = var->GetMutable(); + const auto &data_type = var_desc->GetDataType(); + if (data_type == framework::proto::VarType::INT32) { + SetupTensor(tensor, shape, 0, 1, item.second.initializer, + item.second.filename); + } else if (data_type == framework::proto::VarType::INT64) { + SetupTensor(tensor, shape, 0, 1, item.second.initializer, + item.second.filename); + } else if (data_type == framework::proto::VarType::FP32) { + SetupTensor(tensor, shape, static_cast(0.0), + static_cast(1.0), item.second.initializer, + item.second.filename); + } else if (data_type == framework::proto::VarType::FP64) { + SetupTensor(tensor, shape, static_cast(0.0), + static_cast(1.0), item.second.initializer, + item.second.filename); + } else { + PADDLE_THROW("Unsupported dtype %d.", data_type); + } + + VLOG(3) << "Set lod for tensor " << var_name; + std::vector> &lod_vec = item.second.lod; + framework::LoD lod; + for (size_t i = 0; i < lod_vec.size(); ++i) { + lod.push_back(lod_vec[i]); + } + tensor->set_lod(lod); + } +} + +static std::string GenSpaces(int count) { + std::stringstream ss; + for (int i = 0; i < count; ++i) { + ss << " "; + } + return ss.str(); +} + +std::string OpTester::DebugString() { + std::stringstream ss; + int count = 0; + for (auto &item : vars_) { + auto &var = item.second; + ss << GenSpaces(count++) << "vars {\n"; + ss << GenSpaces(count) << "name: \"" << var->Name() << "\"\n"; + ss << GenSpaces(count++) << "type: {\n"; + ss << GenSpaces(count) << "type: LOD_TENSOR\n"; + ss << GenSpaces(count++) << "lod_tensor {\n"; + ss << GenSpaces(count++) << "tensor {\n"; + const auto &data_type = var->GetDataType(); + if (data_type == framework::proto::VarType::INT32) { + ss << GenSpaces(count) << "data_type: INT32\n"; + } else if (data_type == framework::proto::VarType::INT64) { + ss << GenSpaces(count) << "data_type: INT64\n"; + } else if (data_type == framework::proto::VarType::FP32) { + ss << GenSpaces(count) << "data_type: FP32\n"; + } else if (data_type == framework::proto::VarType::FP64) { + ss << GenSpaces(count) << "data_type: FP64\n"; + } + std::vector shape = var->GetShape(); + for (auto d : shape) { + ss << GenSpaces(count) << "dims: " << d << "\n"; + } + ss << GenSpaces(--count) << "}\n"; + ss << GenSpaces(--count) << "}\n"; + ss << GenSpaces(--count) << "}\n"; + ss << GenSpaces(count) << "persistable: " << var->Persistable() << "\n"; + ss << GenSpaces(--count) << "}\n"; + } + ss << GenSpaces(count++) << "ops {\n"; + for (auto &name : op_desc_.InputNames()) { + ss << GenSpaces(count++) << "inputs {\n"; + ss << GenSpaces(count) << "parameters: \"" << name << "\"\n"; + ss << GenSpaces(count) << "arguments: \"" << op_desc_.Input(name)[0] + << "\"\n"; + ss << GenSpaces(--count) << "}\n"; + } + for (auto &name : op_desc_.OutputNames()) { + ss << GenSpaces(count++) << "outputs {\n"; + ss << GenSpaces(count) << "parameters: \"" << name << "\"\n"; + ss << GenSpaces(count) << "arguments: \"" << op_desc_.Output(name)[0] + << "\"\n"; + ss << GenSpaces(--count) << "}\n"; + } + ss << GenSpaces(count) << "type: " << op_desc_.Type() << "\n"; + for (auto &name : op_desc_.AttrNames()) { + ss << GenSpaces(count++) << "attrs {\n"; + const auto &attr_type = op_desc_.GetAttrType(name); + const auto &attr = op_desc_.GetAttr(name); + ss << GenSpaces(count) << "name: \"" << name << "\"\n"; + switch (attr_type) { + case framework::proto::AttrType::BOOLEAN: { + ss << GenSpaces(count) << "type: BOOLEAN\n"; + ss << GenSpaces(count) << "b: " << boost::get(attr) << "\n"; + } break; + case framework::proto::AttrType::INT: { + ss << GenSpaces(count) << "type: INT\n"; + ss << GenSpaces(count) << "i: " << boost::get(attr) << "\n"; + } break; + case framework::proto::AttrType::FLOAT: { + ss << GenSpaces(count) << "type: FLOAT\n"; + ss << GenSpaces(count) << "f: " << boost::get(attr) << "\n"; + } break; + case framework::proto::AttrType::STRING: { + ss << GenSpaces(count) << "type: STRING\n"; + ss << GenSpaces(count) << "s: \"" << boost::get(attr) + << "\"\n"; + } break; + case framework::proto::AttrType::BOOLEANS: { + ss << GenSpaces(count) << "type: BOOLEANS\n"; + ss << GenSpaces(count) << "bools: " + << "\n"; + } break; + case framework::proto::AttrType::INTS: { + ss << GenSpaces(count) << "type: INTS\n"; + ss << GenSpaces(count) << "ints: " + << "\n"; + } break; + case framework::proto::AttrType::FLOATS: { + ss << GenSpaces(count) << "type: FLOATS\n"; + ss << GenSpaces(count) << "floats: " + << "\n"; + } break; + case framework::proto::AttrType::STRINGS: { + ss << GenSpaces(count) << "type: STRINGS\n"; + ss << GenSpaces(count) << "strings: " + << "\n"; + } break; + case framework::proto::AttrType::LONG: { + ss << GenSpaces(count) << "type: LONG\n"; + ss << GenSpaces(count) << "l: " << boost::get(attr) << "\n"; + } break; + case framework::proto::AttrType::LONGS: { + ss << GenSpaces(count) << "type: LONGS\n"; + ss << GenSpaces(count) << "longs: " + << "\n"; + } break; + default: + PADDLE_THROW("Unsupport attr type %d", attr_type); + } + ss << GenSpaces(--count) << "}\n"; + } + ss << GenSpaces(--count) << "}\n"; + return ss.str(); +} + +TEST(op_tester, base) { + if (!FLAGS_op_config_list.empty()) { + std::ifstream fin(FLAGS_op_config_list, std::ios::in | std::ios::binary); + PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", + FLAGS_op_config_list.c_str()); + std::vector op_configs; + while (!fin.eof()) { + VLOG(4) << "Reading config " << op_configs.size() << "..."; + OpTesterConfig config; + bool result = config.Init(fin); + if (result) { + op_configs.push_back(config); + } + } + if (FLAGS_specified_config_id >= 0 && + FLAGS_specified_config_id < static_cast(op_configs.size())) { + OpTester tester; + tester.Init(op_configs[FLAGS_specified_config_id]); + tester.Run(); + } else { + for (size_t i = 0; i < op_configs.size(); ++i) { + OpTester tester; + tester.Init(op_configs[i]); + tester.Run(); + } + } + } else { + OpTester tester; + OpTesterConfig config; + config.op_type = "elementwise_add"; + config.inputs.resize(2); + config.inputs[0].name = "X"; + config.inputs[0].dims = {64, 64}; + config.inputs[1].name = "Y"; + config.inputs[1].dims = {64, 1}; + tester.Init(config); + tester.Run(); + } +} + +} // namespace benchmark +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/benchmark/op_tester.h b/paddle/fluid/operators/benchmark/op_tester.h new file mode 100644 index 00000000..a6d21573 --- /dev/null +++ b/paddle/fluid/operators/benchmark/op_tester.h @@ -0,0 +1,75 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/benchmark/op_tester_config.h" + +namespace paddle { +namespace operators { +namespace benchmark { + +class OpTester { + public: + OpTester() {} + + void Init(const std::string &filename); + void Init(const OpTesterConfig &config); + + void Run(); + + std::string DebugString(); + + private: + std::vector GetOpProtoInputNames(); + std::vector GetOpProtoOutputNames(); + std::unordered_map + GetOpProtoAttrNames(); + + framework::proto::VarType::Type TransToVarType(std::string str); + void CreateInputVarDesc(); + void CreateOutputVarDesc(); + void CreateOpDesc(); + + framework::VarDesc *Var(const std::string &name); + void CreateVariables(framework::Scope *scope); + + template + void SetupTensor(framework::LoDTensor *input, + const std::vector &shape, T lower, T upper, + const std::string &initializer, const std::string &filename); + + void RunImpl(); + + private: + OpTesterConfig config_; + std::string type_; + framework::OpDesc op_desc_; + std::unordered_map> vars_; + std::unordered_map inputs_; + std::unique_ptr op_; + platform::Place place_; + std::unique_ptr scope_; +}; + +} // namespace benchmark +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/benchmark/op_tester_config.cc b/paddle/fluid/operators/benchmark/op_tester_config.cc new file mode 100644 index 00000000..818e5f64 --- /dev/null +++ b/paddle/fluid/operators/benchmark/op_tester_config.cc @@ -0,0 +1,229 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/benchmark/op_tester_config.h" +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace benchmark { + +static const char kStartSeparator[] = "{"; +static const char kEndSeparator[] = "}"; +static const char kSepBetweenItems[] = ";"; + +static bool StartWith(const std::string& str, const std::string& substr) { + return str.find(substr) == 0; +} + +static bool EndWith(const std::string& str, const std::string& substr) { + return str.rfind(substr) == (str.length() - substr.length()); +} + +static void EraseEndSep(std::string* str, + std::string substr = kSepBetweenItems) { + if (EndWith(*str, substr)) { + str->erase(str->length() - substr.length(), str->length()); + } +} + +OpInputConfig::OpInputConfig(std::istream& is) { + std::string sep; + is >> sep; + if (sep == kStartSeparator) { + while (sep != kEndSeparator) { + is >> sep; + if (sep == "name" || sep == "name:") { + is >> name; + EraseEndSep(&name); + } else if (sep == "dtype" || sep == "dtype:") { + ParseDType(is); + } else if (sep == "initializer" || sep == "initializer:") { + ParseInitializer(is); + } else if (sep == "dims" || sep == "dims:") { + ParseDims(is); + } else if (sep == "lod" || sep == "lod:") { + ParseLoD(is); + } else if (sep == "filename") { + is >> filename; + EraseEndSep(&filename); + } + } + } +} + +void OpInputConfig::ParseDType(std::istream& is) { + std::string dtype_str; + is >> dtype_str; + EraseEndSep(&dtype_str); + + if (dtype_str == "int32" || dtype_str == "int") { + dtype = "int32"; + } else if (dtype_str == "int64" || dtype_str == "long") { + dtype = "int64"; + } else if (dtype_str == "fp32" || dtype_str == "float") { + dtype = "fp32"; + } else if (dtype_str == "fp64" || dtype_str == "double") { + dtype = "fp64"; + } else { + PADDLE_THROW("Unsupported dtype %s", dtype_str.c_str()); + } + VLOG(4) << "dtype of input " << name << " is: " << dtype; +} + +void OpInputConfig::ParseInitializer(std::istream& is) { + std::string initializer_str; + is >> initializer_str; + EraseEndSep(&initializer_str); + + const std::vector supported_initializers = {"random", "natural", + "zeros", "file"}; + if (!Has(supported_initializers, initializer_str)) { + PADDLE_THROW("Unsupported initializer %s", initializer_str.c_str()); + } + + initializer = initializer_str; + VLOG(4) << "initializer of input " << name << " is: " << initializer; +} + +void OpInputConfig::ParseDims(std::istream& is) { + std::string dims_str; + is >> dims_str; + + dims.clear(); + std::string token; + std::istringstream token_stream(dims_str); + while (std::getline(token_stream, token, 'x')) { + dims.push_back(std::stoi(token)); + } +} + +void OpInputConfig::ParseLoD(std::istream& is) { + std::string lod_str; + std::string start_sep = + std::string(kStartSeparator) + std::string(kStartSeparator); + std::string end_sep = std::string(kEndSeparator) + std::string(kEndSeparator); + + std::string sep; + is >> sep; + if (StartWith(sep, start_sep)) { + lod_str += sep; + while (!EndWith(sep, end_sep)) { + is >> sep; + lod_str += sep; + } + } + EraseEndSep(&lod_str); + PADDLE_ENFORCE_GE(lod_str.length(), 4U); + VLOG(4) << "lod: " << lod_str << ", length: " << lod_str.length(); + + // Parse the lod_str + lod.clear(); + for (size_t i = 1; i < lod_str.length() - 1;) { + if (lod_str[i] == '{') { + std::vector level; + while (lod_str[i] != '}') { + ++i; + + std::string number; + while (lod_str[i] >= '0' && lod_str[i] <= '9') { + number += lod_str[i]; + ++i; + } + level.push_back(StringTo(number)); + } + lod.push_back(level); + } else if (lod_str[i] == '}') { + ++i; + } + } +} + +OpTesterConfig::OpTesterConfig(const std::string& filename) { + std::ifstream fin(filename, std::ios::in | std::ios::binary); + PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", + filename.c_str()); + + Init(fin); +} + +bool OpTesterConfig::Init(std::istream& is) { + std::string sep; + is >> sep; + if (sep == kStartSeparator) { + while (sep != kEndSeparator) { + is >> sep; + if (sep == "op_type" || sep == "op_type:") { + is >> op_type; + } else if (sep == "device_id" || sep == "device_id:") { + is >> device_id; + } else if (sep == "repeat" || sep == "repeat:") { + is >> repeat; + } else if (sep == "profile" || sep == "profile:") { + is >> profile; + } else if (sep == "print_debug_string" || sep == "print_debug_string:") { + is >> print_debug_string; + } else if (sep == "input" || sep == "input:") { + OpInputConfig input_config(is); + inputs.push_back(input_config); + } else if (sep == "attrs" || sep == "attrs:") { + ParseAttrs(is); + } else { + if (sep != kEndSeparator) { + return false; + } + } + } + } else { + return false; + } + return true; +} + +bool OpTesterConfig::ParseAttrs(std::istream& is) { + std::string sep; + is >> sep; + if (sep == kStartSeparator) { + while (true) { + std::string key; + is >> key; + if (key == kEndSeparator) { + break; + } + + std::string value; + is >> value; + EraseEndSep(&key, ":"); + EraseEndSep(&value); + VLOG(4) << "attrs: " << key << ", " << value; + + attrs[key] = value; + } + } + return true; +} + +const OpInputConfig* OpTesterConfig::GetInput(const std::string& name) { + for (size_t i = 0; i < inputs.size(); ++i) { + if (inputs[i].name == name) { + return &inputs[i]; + } + } + return nullptr; +} + +} // namespace benchmark +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/benchmark/op_tester_config.h b/paddle/fluid/operators/benchmark/op_tester_config.h new file mode 100644 index 00000000..3956bc0a --- /dev/null +++ b/paddle/fluid/operators/benchmark/op_tester_config.h @@ -0,0 +1,83 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace paddle { +namespace operators { +namespace benchmark { + +struct OpInputConfig { + OpInputConfig() {} + explicit OpInputConfig(std::istream& is); + + void ParseDType(std::istream& is); + void ParseInitializer(std::istream& is); + void ParseDims(std::istream& is); + void ParseLoD(std::istream& is); + + std::string name; + std::string dtype{"fp32"}; // int32/int, int64/long, fp32/float, fp64/double + std::string initializer{"random"}; // random, natural, zeros, file + std::string filename{""}; + std::vector dims; + std::vector> lod; +}; + +struct OpTesterConfig { + OpTesterConfig() {} + explicit OpTesterConfig(const std::string& filename); + + bool Init(std::istream& is); + + bool ParseAttrs(std::istream& is); + + const OpInputConfig* GetInput(const std::string& name); + + std::string op_type; + std::vector inputs; + std::unordered_map attrs; + int device_id{-1}; // CPU: -1 + int repeat{1}; + int profile{0}; + int print_debug_string{0}; + double runtime{0.0}; +}; + +static bool Has(const std::vector& vec, const std::string& item) { + for (size_t i = 0; i < vec.size(); ++i) { + if (vec[i] == item) { + return true; + } + } + return false; +} + +template +T StringTo(const std::string& str) { + std::istringstream is(str); + T value; + is >> value; + return value; +} + +} // namespace benchmark +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.cc b/paddle/fluid/operators/bilinear_tensor_product_op.cc new file mode 100644 index 00000000..f2c30cd7 --- /dev/null +++ b/paddle/fluid/operators/bilinear_tensor_product_op.cc @@ -0,0 +1,195 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/bilinear_tensor_product_op.h" +#include +#include +#include + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class BilinearTensorProductOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(Weight) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null."); + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + auto weight_dims = ctx->GetInputDim("Weight"); + + PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "The input(X) must be a 2D Tensor."); + PADDLE_ENFORCE_EQ(y_dims.size(), 2UL, "The input(Y) must be a 2D Tensor."); + PADDLE_ENFORCE_EQ(weight_dims.size(), 3UL, + "The input(Weight) must be a 3D tensor."); + if (ctx->IsRuntime() || (x_dims[0] > 0 && y_dims[0] > 0)) { + PADDLE_ENFORCE_EQ(x_dims[0], y_dims[0], + "The first dimension(batch_size) of input(X) must be " + "equal to the first dimension of the input(Y)."); + } + PADDLE_ENFORCE_EQ(x_dims[1], weight_dims[1], + "The second dimension of input(X) must be equal to " + "the second dimension of the input(Weight)."); + PADDLE_ENFORCE_EQ(y_dims[1], weight_dims[2], + "The second dimension of input(Y) must be equal to " + "the third dimension of the input(Weight)."); + + if (ctx->HasInput("Bias")) { + auto bias_dims = ctx->GetInputDim("Bias"); + PADDLE_ENFORCE(bias_dims.size() == 2UL && bias_dims[0] == 1UL, + "The Input(Bias) must be a 2-D tensor with " + "the 2nd dimension fixed to 1 (a row vector)."); + PADDLE_ENFORCE_EQ(bias_dims[1], weight_dims[0], + "The second dimension of input(Bias) must be equal " + "to the first dimension of the input(Weight)."); + } + + ctx->SetOutputDim("Out", {x_dims[0], weight_dims[0]}); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class BilinearTensorProductOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The first input of bilinear_tensor_product operator."); + AddInput("Y", "The second input of bilinear_tensor_product operator."); + AddInput("Weight", + "The learnable parameters of bilinear_tensor_product operator."); + AddInput("Bias", "The learnable bias of bilinear_tensor_product operator.") + .AsDispensable(); + AddOutput("Out", "The output of bilinear_tensor_product operator."); + AddComment(R"DOC( +Bilinear Tensor Product operator. +Given input X and Y, a 3D tensor Weight and a Bias. Each column of the +Output is computed by one slice $i = 1, . . . , k$ of the tensor: + +$$ +M = (X W_i) * Y \\ +Out_i = \sum_j {M_j} + Bias_i +$$ + +Where $W_i$ is the $i$-th slice of Input(Weight); + $M_j$ is the $j$-th column of $M$; + $Out_i$ is the $i$-th column of Output(Out); + $Bias_i$ is a column vector, each element of it is equal to + the $i$-th element of $Bias$; + +)DOC"); + } +}; + +class BilinearTensorProductOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Weight"), + "Input(Weight) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null."); + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + auto weight_dims = ctx->GetInputDim("Weight"); + auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); + + PADDLE_ENFORCE_EQ(out_dims.size(), 2UL, + "The input(Out@GRAD) must be a 2D Tensor."); + PADDLE_ENFORCE_EQ( + x_dims[0], out_dims[0], + "The first dimension(batch_size) of input(Out@GRAD) must be " + "equal to the first dimension of the Input(X)."); + PADDLE_ENFORCE_EQ( + weight_dims[0], out_dims[1], + "The second dimension of input(Out@GRAD) must be equal to " + "the third dimension of the Input(Weight)."); + + auto bias_grad_name = framework::GradVarName("Bias"); + if (ctx->HasOutput(bias_grad_name)) { + ctx->SetOutputDim(bias_grad_name, {1, out_dims[1]}); + } + + auto x_grad_name = framework::GradVarName("X"); + auto y_grad_name = framework::GradVarName("Y"); + auto weight_grad_name = framework::GradVarName("Weight"); + + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + if (ctx->HasOutput(y_grad_name)) { + ctx->SetOutputDim(y_grad_name, y_dims); + } + if (ctx->HasOutput(weight_grad_name)) { + ctx->SetOutputDim(weight_grad_name, weight_dims); + } + } +}; + +class BilinearTensorProductGradOpDescMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("bilinear_tensor_product_grad"); + op->SetAttrMap(Attrs()); + op->SetInput("X", Input("X")); + op->SetInput("Y", Input("Y")); + op->SetInput("Weight", Input("Weight")); + if (ForwardOp().Inputs().count("Bias") > 0) { + op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias")); + } + + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Y"), InputGrad("Y")); + op->SetOutput(framework::GradVarName("Weight"), InputGrad("Weight")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + + return op; + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(bilinear_tensor_product, ops::BilinearTensorProductOp, + ops::BilinearTensorProductOpMaker, + ops::BilinearTensorProductGradOpDescMaker); +REGISTER_OPERATOR(bilinear_tensor_product_grad, + ops::BilinearTensorProductOpGrad); +REGISTER_OP_CPU_KERNEL( + bilinear_tensor_product, + ops::BilinearTensorProductKernel, + ops::BilinearTensorProductKernel); +REGISTER_OP_CPU_KERNEL( + bilinear_tensor_product_grad, + ops::BilinearTensorProductGradKernel, + ops::BilinearTensorProductGradKernel); diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.cu b/paddle/fluid/operators/bilinear_tensor_product_op.cu new file mode 100644 index 00000000..c2b4f69e --- /dev/null +++ b/paddle/fluid/operators/bilinear_tensor_product_op.cu @@ -0,0 +1,29 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/bilinear_tensor_product_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + bilinear_tensor_product, + ops::BilinearTensorProductKernel, + ops::BilinearTensorProductKernel); +REGISTER_OP_CUDA_KERNEL( + bilinear_tensor_product_grad, + ops::BilinearTensorProductGradKernel, + ops::BilinearTensorProductGradKernel); diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.h b/paddle/fluid/operators/bilinear_tensor_product_op.h new file mode 100644 index 00000000..5017c3a4 --- /dev/null +++ b/paddle/fluid/operators/bilinear_tensor_product_op.h @@ -0,0 +1,183 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/blas.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +template +using EigenMatrix = framework::EigenMatrix; + +template +class BilinearTensorProductKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* weight = ctx.Input("Weight"); + auto* bias = ctx.Input("Bias"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + auto y_mat = EigenMatrix::From(*y); + auto output_mat = EigenMatrix::From(*out); + + auto batch_size = x->dims()[0]; + auto weight_dims = weight->dims(); + int out_dim = weight_dims[0]; + auto x_dim = weight_dims[1]; + auto y_dim = weight_dims[2]; + auto& place = *ctx.template device_context().eigen_device(); + auto& dev_ctx = ctx.template device_context(); + + // Create the intermediate variable to caculate the result of + // Input(X) multiplied by Input(Weight_i), the formula is: + // left_mul = X Weight_i. + Tensor left_mul; + left_mul.mutable_data(framework::make_ddim({batch_size, y_dim}), + ctx.GetPlace()); + auto left_mul_mat = EigenMatrix::From(left_mul); + + for (int i = 0; i < out_dim; ++i) { + auto output_col_vec = output_mat.chip(i, 1); + Tensor weight_mat = + weight->Slice(i, i + 1).Resize(framework::make_ddim({x_dim, y_dim})); + math::GetBlas(dev_ctx).GEMM( + CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1, x->data(), + weight_mat.data(), 0, left_mul.data()); + output_col_vec.device(place) = + (left_mul_mat * y_mat).sum(Eigen::DSizes(1)); + } + if (bias) { + auto bias_vec = EigenMatrix::From(*bias); + Eigen::DSizes bcast(batch_size, 1); + output_mat.device(place) = bias_vec.broadcast(bcast).eval() + output_mat; + } + } +}; + +template +class BilinearTensorProductGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor* x = ctx.Input("X"); + const Tensor* y = ctx.Input("Y"); + const Tensor* weight = ctx.Input("Weight"); + Tensor* d_x = ctx.Output(framework::GradVarName("X")); + Tensor* d_y = ctx.Output(framework::GradVarName("Y")); + Tensor* d_weight = ctx.Output(framework::GradVarName("Weight")); + Tensor* d_bias = ctx.Output(framework::GradVarName("Bias")); + const Tensor* d_out = ctx.Input(framework::GradVarName("Out")); + + auto batch_size = x->dims()[0]; + auto weight_dims = weight->dims(); + int out_dim = weight_dims[0]; + auto x_dim = weight_dims[1]; + auto y_dim = weight_dims[2]; + + auto x_mat = EigenMatrix::From(*x); + auto y_mat = EigenMatrix::From(*y); + auto d_out_mat = EigenMatrix::From(*d_out); + auto& place = *ctx.template device_context().eigen_device(); + auto& dev_ctx = ctx.template device_context(); + // Create the intermediate variable to calculate the Output(Y@Grad). + Tensor x_scale; + x_scale.mutable_data(framework::make_ddim({batch_size, x_dim}), + ctx.GetPlace()); + auto x_scale_mat = EigenMatrix::From(x_scale); + + // Create the intermediate variable to calculate the Output(X@Grad). + Tensor y_scale; + y_scale.mutable_data(framework::make_ddim({batch_size, y_dim}), + ctx.GetPlace()); + auto y_scale_mat = EigenMatrix::From(y_scale); + + math::SetConstant set_zero; + + if (d_x) { + d_x->mutable_data(ctx.GetPlace()); + set_zero(dev_ctx, d_x, static_cast(0)); + } + + if (d_y) { + d_y->mutable_data(ctx.GetPlace()); + set_zero(dev_ctx, d_y, static_cast(0)); + } + + if (d_weight) { + d_weight->mutable_data(ctx.GetPlace()); + } + + auto blas = math::GetBlas(ctx); + + // Caculate the Output(X@Grad) and Output(Y@Grad). + if (d_x || d_y || d_weight) { + Eigen::DSizes bcast_for_x(1, y_dim); + Eigen::DSizes bcast_for_y(1, x_dim); + Eigen::DSizes bcast_for_weight(1, x_dim); + + for (int i = 0; i < out_dim; ++i) { + Tensor weight_i = weight->Slice(i, i + 1).Resize( + framework::make_ddim({x_dim, y_dim})); + auto output_vec = d_out_mat.chip(i, 1); + + if (d_x) { + y_scale_mat.device(place) = + output_vec.reshape(Eigen::DSizes(batch_size, 1)) + .broadcast(bcast_for_x) + .eval() * + y_mat; + blas.GEMM(CblasNoTrans, CblasTrans, batch_size, x_dim, y_dim, 1, + y_scale.data(), weight_i.data(), 1, d_x->data()); + } + + if (d_y || d_weight) { + auto output_vec_y = + output_vec.reshape(Eigen::DSizes(batch_size, 1)) + .broadcast(bcast_for_y) + .eval(); + x_scale_mat.device(place) = output_vec_y * x_mat; + if (d_y) { + blas.GEMM(CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1, + x_scale.data(), weight_i.data(), 1, d_y->data()); + } + if (d_weight) { + Tensor d_weight_i = d_weight->Slice(i, i + 1).Resize( + framework::make_ddim({x_dim, y_dim})); + blas.GEMM(CblasTrans, CblasNoTrans, x_dim, y_dim, batch_size, 1, + x_scale.data(), y->data(), 0, d_weight_i.data()); + } + } + } + } + + // calculate the gradient of Input(Bias). + if (d_bias) { + d_bias->mutable_data(ctx.GetPlace()); + auto d_bias_mat = framework::EigenVector::Flatten(*d_bias); + d_bias_mat.device(place) = d_out_mat.sum(Eigen::DSizes(0)); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/bpr_loss_op.cc b/paddle/fluid/operators/bpr_loss_op.cc new file mode 100644 index 00000000..51c4d878 --- /dev/null +++ b/paddle/fluid/operators/bpr_loss_op.cc @@ -0,0 +1,165 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/bpr_loss_op.h" +#include + +namespace paddle { +namespace operators { + +class BprLossOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null."); + + auto x_dims = ctx->GetInputDim("X"); + auto label_dims = ctx->GetInputDim("Label"); + int rank = x_dims.size(); + PADDLE_ENFORCE_EQ(rank, label_dims.size(), + "Input(X) and Input(Label) shall have the same rank."); + + if (ctx->IsRuntime() || (framework::product(x_dims) > 0 && + framework::product(label_dims) > 0)) { + PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), + framework::slice_ddim(label_dims, 0, rank - 1), + "Input(X) and Input(Label) shall have the same shape " + "except the last dimension."); + } + + auto y_dims = x_dims; + y_dims[rank - 1] = 1; + ctx->SetOutputDim("Y", y_dims); + ctx->ShareLoD("X", /*->*/ "Y"); + } + + protected: + // Explicitly set that the data type of computation kernel of Seq-bpr + // is determined by its input "X". + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + platform::CPUPlace()); + } +}; + +class BprLossGradientOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), + "Input(Y@GRAD) shoudl be not null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@GRAD) should be not null."); + + auto x_dims = ctx->GetInputDim("X"); + auto label_dims = ctx->GetInputDim("Label"); + auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y")); + int rank = x_dims.size(); + PADDLE_ENFORCE_EQ(dy_dims.size(), rank, + "Input(Y@Grad) and Input(X) should have the same rank."); + PADDLE_ENFORCE_EQ(label_dims.size(), rank, + "Input(Label) and Input(X) should have the same rank."); + PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), + framework::slice_ddim(label_dims, 0, rank - 1), + "The Input(X) and Input(Label) should have the same " + "shape except the last dimension."); + PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), + framework::slice_ddim(dy_dims, 0, rank - 1), + "The Input(X) and Input(Y@Grad) should have the same " + "shape except the last dimension."); + PADDLE_ENFORCE_EQ(dy_dims[rank - 1], 1, + "The last dimension of Input(Y@Grad) should be 1."); + PADDLE_ENFORCE_EQ(label_dims[rank - 1], 1, + " the last dimension of Input(Label) should be 1."); + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + ctx->ShareLoD("X", framework::GradVarName("X")); + } + + protected: + // Explicitly set that the data type of computation kernel of cross_entropy + // is determined by its input "X". + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + platform::CPUPlace()); + } +}; + +class BprLossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor, default Tensor), a tensor whose last dimension " + "size is equal to the number of classes. This input is a " + "real number."); + AddInput( + "Label", + "(Tensor), the tensor which represents the ground truth. It has the " + "same shape with 'X' except the last dimension. the last dimension " + "size is 1."); + AddOutput("Y", + "(Tensor, default Tensor), a tensor whose shape is same " + "with 'X' except that the last dimension size is 1. It " + "represents the sequence bpr loss."); + AddComment(R"DOC( +Bayesian Personalized Ranking Loss Operator. + +This operator belongs to pairwise ranking loss. Label is the desired item. +The loss at a given point in one session is defined as: +$Y[i] = -\frac{1}{N_{i}} * \sum_{j=0}^{N_{i}}\log(\sigma(X[i, Label[i]]-X[i, j]))$ + +Learn more details by reading paper (https://arxiv.org/abs/1511.06939) + +)DOC"); + } +}; + +class BprLossGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("bpr_loss_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Label", Input("Label")); + op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CPUCtx = paddle::platform::CPUDeviceContext; + +REGISTER_OPERATOR(bpr_loss, ops::BprLossOp, ops::BprLossOpMaker, + ops::BprLossGradDescMaker); +REGISTER_OPERATOR(bpr_loss_grad, ops::BprLossGradientOp); +REGISTER_OP_CPU_KERNEL(bpr_loss, ops::BprLossOpKernel, + ops::BprLossOpKernel); +REGISTER_OP_CPU_KERNEL(bpr_loss_grad, + ops::BprLossGradientOpKernel, + ops::BprLossGradientOpKernel); diff --git a/paddle/fluid/operators/bpr_loss_op.h b/paddle/fluid/operators/bpr_loss_op.h new file mode 100644 index 00000000..f9570e4e --- /dev/null +++ b/paddle/fluid/operators/bpr_loss_op.h @@ -0,0 +1,118 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +/*Todo: + *Find a way to adapt TolerableValue, using blas or eigen. + */ +template +struct TolerableValue { + HOSTDEVICE T operator()(const T& x) const { + PADDLE_ASSERT(std::is_floating_point::value); + const T kApproInf = 1e20; + if (x == INFINITY) return kApproInf; + if (x == -INFINITY) return -kApproInf; + return x; + } +}; + +template +class BprLossOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* label = ctx.Input("Label"); + auto* y = ctx.Output("Y"); + y->mutable_data(ctx.GetPlace()); + int rank = x->dims().size(); + + Tensor x_2d = framework::ReshapeToMatrix(*x, rank - 1); + Tensor labels_2d = framework::ReshapeToMatrix(*label, rank - 1); + Tensor y_2d = framework::ReshapeToMatrix(*y, rank - 1); + + const framework::Tensor* logits = &x_2d; + const framework::Tensor* labels = &labels_2d; + framework::Tensor* out = &y_2d; + + const int step_size = logits->dims()[0]; + const int class_num = logits->dims()[1]; + const T* logits_data = logits->data(); + T* loss_data = out->data(); + + const int64_t* label_data = labels->data(); + for (int i = 0; i < step_size; ++i) { + int lbl_pos = label_data[i]; + PADDLE_ENFORCE_GE(lbl_pos, 0); + PADDLE_ENFORCE_LT(lbl_pos, class_num); + int index_pos = i * class_num + lbl_pos; + T sum = static_cast(0); + for (int j = 0; j < class_num; j++) { + if (j == lbl_pos) continue; + int index_neg = i * class_num + j; + sum += TolerableValue()(-std::log( + 1.0f + TolerableValue()(std::exp(logits_data[index_neg] - + logits_data[index_pos])))); + } + loss_data[i] = -sum / (class_num - 1); + } + } +}; + +template +class BprLossGradientOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* dy = ctx.Input(framework::GradVarName("Y")); + auto* label = ctx.Input("Label"); + auto* dx = ctx.Output(framework::GradVarName("X")); + + const size_t step_size = static_cast(x->dims()[0]); + const size_t num_classes = static_cast(x->dims()[1]); + T* dx_data = dx->mutable_data(ctx.GetPlace()); + const T* dy_data = dy->data(); + const T* x_data = x->data(); + const int64_t* label_data = label->data(); + + for (size_t sample_id = 0; sample_id < step_size; sample_id++) { + for (size_t x_offset = sample_id * num_classes; + x_offset < (sample_id + 1) * num_classes; x_offset++) { + dx_data[x_offset] = static_cast(0); + } + auto p_index = sample_id * num_classes + label_data[sample_id]; + for (size_t ni = 0; ni < num_classes; ni++) { + if (label_data[sample_id] == ni) continue; + auto n_index = sample_id * num_classes + ni; + auto grad_ = -dy_data[sample_id] / + ((num_classes - 1) * + (1.0f + TolerableValue()(std::exp(x_data[p_index] - + x_data[n_index])))); + dx_data[p_index] += grad_; + dx_data[n_index] -= grad_; + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc new file mode 100644 index 00000000..0c517cc7 --- /dev/null +++ b/paddle/fluid/operators/cast_op.cc @@ -0,0 +1,95 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/cast_op.h" +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input tensor of cast op"); + AddOutput("Out", "The output tensor of cast op"); + AddAttr("out_dtype", "output data type"); + AddAttr("in_dtype", "input data type"); + AddComment(R"DOC( +Cast Operator. + +This Operator casts the input tensor to another data type and +returns the Output Tensor. It's meaningless if the output dtype equals +the input dtype, but it's fine if you do so. + +)DOC"); + } +}; + +class CastOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("X"), "The input of cast op must be set"); + PADDLE_ENFORCE(context->HasOutput("Out"), + "The output of cast op must be set"); + context->SetOutputDim("Out", context->GetInputDim("X")); + context->ShareLoD("X", "Out"); + } +}; + +class CastOpGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto grad = new framework::OpDesc(); + grad->SetType("cast"); + grad->SetInput("X", OutputGrad("Out")); + grad->SetOutput("Out", InputGrad("X")); + grad->SetAttr("out_dtype", GetAttr("in_dtype")); + grad->SetAttr("in_dtype", GetAttr("out_dtype")); + return std::unique_ptr(grad); + } +}; + +class CastOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); + // CastOp kernel's device type is decided by input tensor place + kt.place_ = ctx.Input("X")->place(); + return kt; + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CPU = paddle::platform::CPUDeviceContext; +REGISTER_OPERATOR(cast, ops::CastOp, ops::CastOpGradMaker, + ops::CastOpInferShape, ops::CastOpProtoMaker); +REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel, + ops::CastOpKernel); diff --git a/paddle/fluid/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu new file mode 100644 index 00000000..657d1628 --- /dev/null +++ b/paddle/fluid/operators/cast_op.cu @@ -0,0 +1,25 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/cast_op.h" +#include "paddle/fluid/platform/float16.h" + +template +using CastOpKernel = + paddle::operators::CastOpKernel; + +REGISTER_OP_CUDA_KERNEL(cast, CastOpKernel, CastOpKernel, + CastOpKernel, CastOpKernel, + CastOpKernel, CastOpKernel, + CastOpKernel); diff --git a/paddle/fluid/operators/cast_op.h b/paddle/fluid/operators/cast_op.h new file mode 100644 index 00000000..8fa04160 --- /dev/null +++ b/paddle/fluid/operators/cast_op.h @@ -0,0 +1,66 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace operators { + +template +struct CastOpTransformFunctor { + HOSTDEVICE OutT operator()(InT in) const { return static_cast(in); } +}; + +template +struct CastOpFunctor { + const framework::Tensor* in_; + framework::Tensor* out_; + const DeviceContext& ctx_; + CastOpFunctor(const framework::Tensor* in, framework::Tensor* out, + const DeviceContext& ctx) + : in_(in), out_(out), ctx_(ctx) {} + + template + void apply() const { + auto* in_begin = in_->data(); + auto numel = in_->numel(); + auto* in_end = in_begin + numel; + auto* out_begin = out_->mutable_data(ctx_.GetPlace()); + platform::Transform trans; + trans(ctx_, in_begin, in_end, out_begin, + CastOpTransformFunctor()); + } +}; + +template +class CastOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out = context.Output("Out"); + framework::VisitDataType( + static_cast( + context.Attr("out_dtype")), + CastOpFunctor( + in, out, context.template device_context())); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/chunk_eval_op.cc b/paddle/fluid/operators/chunk_eval_op.cc new file mode 100644 index 00000000..21dfaf91 --- /dev/null +++ b/paddle/fluid/operators/chunk_eval_op.cc @@ -0,0 +1,178 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/chunk_eval_op.h" +#include +#include + +namespace paddle { +namespace operators { + +class ChunkEvalOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Inference"), + "Input(Inference) of ChunkEvalOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), + "Input(Label) of ChunkEvalOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Precision"), + "Output(Precision) of ChunkEvalOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Recall"), + "Output(Recall) of ChunkEvalOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("F1-Score"), + "Output(F1-Score) of ChunkEvalOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("NumInferChunks"), + "Output(NumInferChunks) of ChunkEvalOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("NumLabelChunks"), + "Output(NumLabelChunks) of ChunkEvalOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("NumCorrectChunks"), + "Output(NumCorrectChunks) of ChunkEvalOp should not be null."); + + auto inference_dim = ctx->GetInputDim("Inference"); + auto label_dim = ctx->GetInputDim("Label"); + + PADDLE_ENFORCE(inference_dim == label_dim, + "Inference's shape must be the same as Label's shape."); + + bool use_padding = ctx->HasInput("SeqLength"); + if (use_padding) { + PADDLE_ENFORCE(inference_dim.size() == 3, + "when SeqLength is provided, Inference should be of dim 3 " + "(batch, bucket, 1)"); + auto seq_length_dim = ctx->GetInputDim("SeqLength"); + PADDLE_ENFORCE(seq_length_dim.size() == 1, "seq_length should be rank 1"); + } + + ctx->SetOutputDim("Precision", {1}); + ctx->SetOutputDim("Recall", {1}); + ctx->SetOutputDim("F1-Score", {1}); + ctx->SetOutputDim("NumInferChunks", {1}); + ctx->SetOutputDim("NumLabelChunks", {1}); + ctx->SetOutputDim("NumCorrectChunks", {1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType(framework::proto::VarType::FP32, + platform::CPUPlace()); + } +}; + +class ChunkEvalOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Inference", + "(Tensor, default: Tensor). " + "Predictions from the network."); + AddInput("Label", + "(Tensor, default: Tensor). The true tag sequences."); + AddInput("SeqLength", + "(Tensor, default: Tensor). The length of each sequence, " + "used when Inference and Label are Tensor type .") + .AsDispensable(); + AddOutput("Precision", + "(float). The evaluated precision (called positive predictive " + "value) of chunks on the given mini-batch."); + AddOutput("Recall", + "(float). The evaluated recall (true positive rate or " + "sensitivity) of chunks on the given mini-batch."); + AddOutput("F1-Score", + "(float). The evaluated F1-Score on the given mini-batch."); + AddOutput("NumInferChunks", + "(int64_t). The number of chunks in Inference on the given " + "mini-batch."); + AddOutput( + "NumLabelChunks", + "(int64_t). The number of chunks in Label on the given mini-batch."); + AddOutput( + "NumCorrectChunks", + "(int64_t). The number of chunks both in Inference and Label on the " + "given mini-batch."); + AddAttr("num_chunk_types", + "The number of chunk type. See the description for details."); + AddAttr("chunk_scheme", + "The labeling scheme indicating " + "how to encode the chunks. Must be IOB, IOE, IOBES or " + "plain. See the description" + "for details.") + .SetDefault("IOB"); + AddAttr>("excluded_chunk_types", + "A list including chunk type ids " + "indicating chunk types that are not counted. " + "See the description for details.") + .SetDefault(std::vector{}); + AddComment(R"DOC( +For some basics of chunking, please refer to +'Chunking with Support Vector Machines '. + +ChunkEvalOp computes the precision, recall, and F1-score of chunk detection, +and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes. +Here is a NER example of labeling for these tagging schemes: + + Li Ming works at Agricultural Bank of China in Beijing. + IO I-PER I-PER O O I-ORG I-ORG I-ORG I-ORG O I-LOC + IOB B-PER I-PER O O B-ORG I-ORG I-ORG I-ORG O B-LOC + IOE I-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O E-LOC + IOBES B-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O S-LOC + +There are three chunk types(named entity types) including PER(person), ORG(organization) +and LOC(LOCATION), and we can see that the labels have the form -. + +Since the calculations actually use label ids rather than labels, extra attention +should be paid when mapping labels to ids to make CheckEvalOp work. The key point +is that the listed equations are satisfied by ids. + + tag_type = label % num_tag_type + chunk_type = label / num_tag_type + +where `num_tag_type` is the num of tag types in the tagging scheme, `num_chunk_type` +is the num of chunk types, and `tag_type` get its value from the following table. + + Scheme Begin Inside End Single + plain 0 - - - + IOB 0 1 - - + IOE - 0 1 - + IOBES 0 1 2 3 + +Still use NER as example, assuming the tagging scheme is IOB while chunk types are ORG, +PER and LOC. To satisfy the above equations, the label map can be like this: + + B-ORG 0 + I-ORG 1 + B-PER 2 + I-PER 3 + B-LOC 4 + I-LOC 5 + O 6 + +It's not hard to verify the equations noting that the num of chunk types +is 3 and the num of tag types in IOB scheme is 2. For example, the label +id of I-LOC is 5, the tag type id of I-LOC is 1, and the chunk type id of +I-LOC is 2, which consistent with the results from the equations. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(chunk_eval, ops::ChunkEvalOp, + ops::ChunkEvalOpMaker); +REGISTER_OP_CPU_KERNEL(chunk_eval, + ops::ChunkEvalKernel); diff --git a/paddle/fluid/operators/chunk_eval_op.h b/paddle/fluid/operators/chunk_eval_op.h new file mode 100644 index 00000000..63c77e52 --- /dev/null +++ b/paddle/fluid/operators/chunk_eval_op.h @@ -0,0 +1,266 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class ChunkEvalKernel : public framework::OpKernel { + public: + struct Segment { + int begin; + int end; + int type; + bool operator==(const Segment& y) const { + return begin == y.begin && end == y.end && type == y.type; + } + }; + + void GetSegments(const int64_t* label, int length, + std::vector* segments, int num_chunk_types, + int num_tag_types, int other_chunk_type, int tag_begin, + int tag_inside, int tag_end, int tag_single) const { + segments->clear(); + segments->reserve(length); + int chunk_start = 0; + bool in_chunk = false; + int tag = -1; + int type = other_chunk_type; + for (int i = 0; i < length; ++i) { + int prev_tag = tag; + int prev_type = type; + PADDLE_ENFORCE_LE(label[i], num_chunk_types * num_tag_types); + tag = label[i] % num_tag_types; + type = label[i] / num_tag_types; + if (in_chunk && ChunkEnd(prev_tag, prev_type, tag, type, other_chunk_type, + tag_begin, tag_inside, tag_end, tag_single)) { + Segment segment{ + chunk_start, // begin + i - 1, // end + prev_type, + }; + segments->push_back(segment); + in_chunk = false; + } + if (ChunkBegin(prev_tag, prev_type, tag, type, other_chunk_type, + tag_begin, tag_inside, tag_end, tag_single)) { + chunk_start = i; + in_chunk = true; + } + } + if (in_chunk) { + Segment segment{ + chunk_start, // begin + length - 1, // end + type, + }; + segments->push_back(segment); + } + } + + bool ChunkEnd(int prev_tag, int prev_type, int tag, int type, + int other_chunk_type, int tag_begin, int tag_inside, + int tag_end, int tag_single) const { + if (prev_type == other_chunk_type) return false; + if (type == other_chunk_type) return true; + if (type != prev_type) return true; + if (prev_tag == tag_begin) return tag == tag_begin || tag == tag_single; + if (prev_tag == tag_inside) return tag == tag_begin || tag == tag_single; + if (prev_tag == tag_end) return true; + if (prev_tag == tag_single) return true; + return false; + } + + bool ChunkBegin(int prev_tag, int prev_type, int tag, int type, + int other_chunk_type, int tag_begin, int tag_inside, + int tag_end, int tag_single) const { + if (prev_type == other_chunk_type) return type != other_chunk_type; + if (type == other_chunk_type) return false; + if (type != prev_type) return true; + if (tag == tag_begin) return true; + if (tag == tag_inside) return prev_tag == tag_end || prev_tag == tag_single; + if (tag == tag_end) return prev_tag == tag_end || prev_tag == tag_single; + if (tag == tag_single) return true; + return false; + } + + void Compute(const framework::ExecutionContext& context) const override { + // initialize to parse configurations + int num_chunk_types, num_tag_types; + int other_chunk_type; + int tag_begin, tag_inside, tag_end, tag_single; + std::vector label_segments; + std::vector output_segments; + std::set excluded_chunk_types; + + if (context.Attr("chunk_scheme") == "IOB") { + num_tag_types = 2; + tag_begin = 0; + tag_inside = 1; + tag_end = -1; + tag_single = -1; + } else if (context.Attr("chunk_scheme") == "IOE") { + num_tag_types = 2; + tag_begin = -1; + tag_inside = 0; + tag_end = 1; + tag_single = -1; + } else if (context.Attr("chunk_scheme") == "IOBES") { + num_tag_types = 4; + tag_begin = 0; + tag_inside = 1; + tag_end = 2; + tag_single = 3; + } else if (context.Attr("chunk_scheme") == "plain") { + num_tag_types = 1; + tag_begin = -1; + tag_inside = -1; + tag_end = -1; + tag_single = -1; + } else { + PADDLE_THROW("Unknown chunk scheme."); + } + other_chunk_type = num_chunk_types = context.Attr("num_chunk_types"); + excluded_chunk_types.insert( + context.Attr>("excluded_chunk_types").begin(), + context.Attr>("excluded_chunk_types").end()); + + auto* inference = context.Input("Inference"); + auto place = inference->place(); + auto* label = context.Input("Label"); + auto* precision = context.Output("Precision"); + auto* recall = context.Output("Recall"); + auto* f1 = context.Output("F1-Score"); + auto* num_infer_chunks = context.Output("NumInferChunks"); + auto* num_label_chunks = context.Output("NumLabelChunks"); + auto* num_correct_chunks = context.Output("NumCorrectChunks"); + + const int64_t* inference_data = inference->data(); + const int64_t* label_data = label->data(); + T* precision_data = precision->mutable_data(place); + T* racall_data = recall->mutable_data(place); + T* f1_data = f1->mutable_data(place); + int64_t* num_infer_chunks_data = + num_infer_chunks->mutable_data(place); + int64_t* num_label_chunks_data = + num_label_chunks->mutable_data(place); + int64_t* num_correct_chunks_data = + num_correct_chunks->mutable_data(place); + *num_infer_chunks_data = 0; + *num_label_chunks_data = 0; + *num_correct_chunks_data = 0; + + auto lod = label->lod(); + bool use_padding = lod.empty(); + int num_sequences = 0; + + if (use_padding) { + auto dim1 = inference->dims()[1]; + auto* seq_length_t = context.Input("SeqLength"); + auto* seq_length_data = seq_length_t->data(); + num_sequences = seq_length_t->dims()[0]; + + for (int i = 0; i < num_sequences; ++i) { + int seq_length = seq_length_data[i]; + EvalOneSeq(inference_data + i * dim1, label_data + i * dim1, seq_length, + &output_segments, &label_segments, num_infer_chunks_data, + num_label_chunks_data, num_correct_chunks_data, + num_chunk_types, num_tag_types, other_chunk_type, tag_begin, + tag_inside, tag_end, tag_single, excluded_chunk_types); + } + } else { + PADDLE_ENFORCE_EQ(lod.size(), 1UL, + "Only support one level sequence now."); + PADDLE_ENFORCE(lod == inference->lod(), + "LoD must be same between Inference and Label."); + num_sequences = lod[0].size() - 1; + + for (int i = 0; i < num_sequences; ++i) { + int seq_length = lod[0][i + 1] - lod[0][i]; + EvalOneSeq(inference_data + lod[0][i], label_data + lod[0][i], + seq_length, &output_segments, &label_segments, + num_infer_chunks_data, num_label_chunks_data, + num_correct_chunks_data, num_chunk_types, num_tag_types, + other_chunk_type, tag_begin, tag_inside, tag_end, tag_single, + excluded_chunk_types); + } + } + + *precision_data = !(*num_infer_chunks_data) + ? 0 + : static_cast(*num_correct_chunks_data) / + (*num_infer_chunks_data); + *racall_data = !(*num_label_chunks_data) + ? 0 + : static_cast(*num_correct_chunks_data) / + (*num_label_chunks_data); + *f1_data = !(*num_correct_chunks_data) + ? 0 + : 2 * (*precision_data) * (*racall_data) / + ((*precision_data) + (*racall_data)); + } + + void EvalOneSeq(const int64_t* output, const int64_t* label, int length, + std::vector* output_segments, + std::vector* label_segments, + int64_t* num_output_segments, int64_t* num_label_segments, + int64_t* num_correct, int num_chunk_types, int num_tag_types, + int other_chunk_type, int tag_begin, int tag_inside, + int tag_end, int tag_single, + const std::set& excluded_chunk_types) const { + GetSegments(output, length, output_segments, num_chunk_types, num_tag_types, + other_chunk_type, tag_begin, tag_inside, tag_end, tag_single); + GetSegments(label, length, label_segments, num_chunk_types, num_tag_types, + other_chunk_type, tag_begin, tag_inside, tag_end, tag_single); + size_t i = 0, j = 0; + while (i < output_segments->size() && j < label_segments->size()) { + if (output_segments->at(i) == label_segments->at(j) && + excluded_chunk_types.count(output_segments->at(i).type) != 1) { + ++(*num_correct); + } + if (output_segments->at(i).end < label_segments->at(j).end) { + ++i; + } else if (output_segments->at(i).end > label_segments->at(j).end) { + ++j; + } else { + ++i; + ++j; + } + } + for (auto& segment : (*label_segments)) { + if (excluded_chunk_types.count(segment.type) != 1) { + ++(*num_label_segments); + } + } + for (auto& segment : (*output_segments)) { + if (excluded_chunk_types.count(segment.type) != 1) { + ++(*num_output_segments); + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/clip_by_norm_op.cc b/paddle/fluid/operators/clip_by_norm_op.cc new file mode 100644 index 00000000..5720b295 --- /dev/null +++ b/paddle/fluid/operators/clip_by_norm_op.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/clip_by_norm_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(clip_by_norm, ops::ClipByNormOp, + ops::ClipByNormOpMaker); + +REGISTER_OP_CPU_KERNEL( + clip_by_norm, + ops::ClipByNormKernel); diff --git a/paddle/fluid/operators/clip_by_norm_op.cu b/paddle/fluid/operators/clip_by_norm_op.cu new file mode 100644 index 00000000..788eab7c --- /dev/null +++ b/paddle/fluid/operators/clip_by_norm_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/clip_by_norm_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + clip_by_norm, + ops::ClipByNormKernel); diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h new file mode 100644 index 00000000..b35e9c72 --- /dev/null +++ b/paddle/fluid/operators/clip_by_norm_op.h @@ -0,0 +1,133 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using SelectedRows = framework::SelectedRows; +template +using EigenVector = framework::EigenVector; + +template +class ClipByNormKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto max_norm = context.Attr("max_norm"); + auto in_var = context.InputVar("X"); + + Tensor* output = nullptr; + const Tensor* input = nullptr; + if (in_var->IsType()) { + input = context.Input("X"); + + output = context.Output("Out"); + output->mutable_data(context.GetPlace()); + } else if (in_var->IsType()) { + auto* x = context.Input("X"); + + // merge ids in selected rows first + math::scatter::MergeAdd merge_func; + SelectedRows* merged_input = + const_cast(context.scope()) + .Var() + ->GetMutable(); + merge_func(context.template device_context(), *x, + merged_input); + input = &(merged_input->value()); + + SelectedRows* output_selected_rows = context.Output("Out"); + output_selected_rows->set_rows(merged_input->rows()); + output_selected_rows->set_height(merged_input->height()); + output = output_selected_rows->mutable_value(); + output->Resize(merged_input->value().dims()); + output->mutable_data(context.GetPlace()); + } else { + PADDLE_THROW("Unexpected branch, input variable type is %s", + framework::ToTypeName(in_var->Type())); + } + + PADDLE_ENFORCE_NOT_NULL(input); + + auto x = EigenVector::Flatten(*input); + auto out = EigenVector::Flatten(*output); + auto x_norm = x.square().sum().sqrt(); + auto& place = + *context.template device_context().eigen_device(); + + auto temp = (x_norm <= max_norm).template cast().eval(); + auto scaling = temp + (static_cast(1) - temp) * max_norm / x_norm; + Eigen::array one_dim{{1}}; + Eigen::DSizes m_dsize(input->numel()); + out.device(place) = x * scaling.reshape(one_dim).broadcast(m_dsize); + } +}; + +class ClipByNormOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ClipByNormOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ClipByNormOp should not be null."); + auto max_norm = ctx->Attrs().Get("max_norm"); + PADDLE_ENFORCE_GT(max_norm, 0, "max_norm should be greater than 0."); + auto x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class ClipByNormOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor) The input of clip_by_norm op." + "The number of dimensions must be between [1, 9]."); + AddOutput("Out", + "(Tensor) The output of clip_by_norm op with shape as input(X)"); + AddAttr("max_norm", "(float) The maximum norm value."); + AddComment(R"DOC( +ClipByNorm Operator. + +This operator limits the L2 norm of the input $X$ within $max\_norm$. +If the L2 norm of $X$ is less than or equal to $max\_norm$, $Out$ will be +the same as $X$. If the L2 norm of $X$ is greater than $max\_norm$, $X$ will +be linearly scaled to make the L2 norm of $Out$ equal to $max\_norm$, as +shown in the following formula: + +$$ +Out = \\frac{max\\_norm * X}{norm(X)}, +$$ + +where $norm(X)$ represents the L2 norm of $X$. + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/clip_op.cc b/paddle/fluid/operators/clip_op.cc new file mode 100644 index 00000000..4fc6ae36 --- /dev/null +++ b/paddle/fluid/operators/clip_op.cc @@ -0,0 +1,106 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/clip_op.h" +#include + +namespace paddle { +namespace operators { + +class ClipOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ClipOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ClipOp should not be null."); + auto x_dims = ctx->GetInputDim("X"); + auto max = ctx->Attrs().Get("max"); + auto min = ctx->Attrs().Get("min"); + PADDLE_ENFORCE_LT(min, max, "max should be greater than min."); + ctx->SetOutputDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +template +class ClipOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor)The input of clip op." + "The number of dimensions must be between [1, 9]."); + AddOutput("Out", "(Tensor)The output of clip op with shape as input(X)"); + AddAttr( + "min", "(float)Minimum value, under which element is replaced by min."); + AddAttr( + "max", "(float)Maximum value, above which element is replaced by max"); + AddComment(R"DOC( +Clip Operator. + +The clip operator limits the value of given input within an interval. The +interval is specified with arguments 'min' and 'max': + +$$ +Out = \min(\max(X, min), max) +$$ + +)DOC"); + } +}; + +class ClipOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto x_dims = ctx->GetInputDim("X"); + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + } + } +}; + +class ClipGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("clip_grad"); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(clip, ops::ClipOp, ops::ClipOpMaker, + ops::ClipGradOpDescMaker); +REGISTER_OPERATOR(clip_grad, ops::ClipOpGrad); +REGISTER_OP_CPU_KERNEL( + clip, ops::ClipKernel); +REGISTER_OP_CPU_KERNEL( + clip_grad, ops::ClipGradKernel); diff --git a/paddle/fluid/operators/clip_op.cu b/paddle/fluid/operators/clip_op.cu new file mode 100644 index 00000000..10bee444 --- /dev/null +++ b/paddle/fluid/operators/clip_op.cu @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/clip_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + clip, ops::ClipKernel); +REGISTER_OP_CUDA_KERNEL( + clip_grad, ops::ClipGradKernel); diff --git a/paddle/fluid/operators/clip_op.h b/paddle/fluid/operators/clip_op.h new file mode 100644 index 00000000..daf06f37 --- /dev/null +++ b/paddle/fluid/operators/clip_op.h @@ -0,0 +1,118 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; +using platform::Transform; + +template +class ClipFunctor { + public: + explicit ClipFunctor(const T min, const T max) : min_(min), max_(max) {} + HOSTDEVICE T operator()(const T& x) const { + if (x < min_) + return min_; + else if (x > max_) + return max_; + else + return x; + } + + private: + T min_; + T max_; +}; + +template +class ClipGradFunctor { + public: + explicit ClipGradFunctor(const T min, const T max) : min_(min), max_(max) {} + HOSTDEVICE T operator()(const T& x, const T& y) const { + return (y > min_ && y < max_) ? x : 0; + } + + private: + T min_; + T max_; +}; + +template +class ClipKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto max = context.Attr("max"); + auto min = context.Attr("min"); + auto* x_var = context.InputVar("X"); + if (x_var->IsType()) { + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + T* out_data = out->mutable_data(context.GetPlace()); + const T* x_data = x->data(); + int64_t numel = x->numel(); + Transform trans; + trans(context.template device_context(), x_data, + x_data + numel, out_data, ClipFunctor(min, max)); + } else if (x_var->IsType()) { + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + PADDLE_ENFORCE_NE(x, out, + "Inplace clip is not allowed when x is SelectedRows"); + math::scatter::MergeAdd merge_func; + merge_func(context.template device_context(), *x, out); + auto* out_tensor = out->mutable_value(); + auto* out_data = out_tensor->data(); + int64_t numel = out_tensor->numel(); + Transform trans; + trans(context.template device_context(), out_data, + out_data + numel, out_data, ClipFunctor(min, max)); + } else { + PADDLE_THROW("ClipOp only supports LoDTensor and SelectedRows"); + } + } +}; + +template +class ClipGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto max = context.Attr("max"); + auto min = context.Attr("min"); + auto* d_out = + context.Input(framework::GradVarName("Out")); + auto* d_x = + context.Output(framework::GradVarName("X")); + if (d_x != nullptr) { + auto* x = context.Input("X"); + int64_t numel = d_out->numel(); + auto* d_x_data = d_x->mutable_data(context.GetPlace()); + const T* d_out_data = d_out->data(); + const T* x_data = x->data(); + Transform trans; + trans(context.template device_context(), d_out_data, + d_out_data + numel, x_data, d_x_data, ClipGradFunctor(min, max)); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc new file mode 100644 index 00000000..ce425e7e --- /dev/null +++ b/paddle/fluid/operators/coalesce_tensor_op.cc @@ -0,0 +1,234 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/device_memory_aligment.h" + +namespace paddle { +namespace operators { + +static framework::proto::VarType::Type kDefaultDtype = + framework::proto::VarType::Type::VarType_Type_BOOL; + +template +class CoalesceTensorOp : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto &in_var_names = context.Inputs("Input"); + auto &out_var_names = context.Outputs("Output"); + auto &in_vars = context.MultiInputVar("Input"); + auto out_vars = context.MultiOutputVar("Output"); + + PADDLE_ENFORCE_GT(in_var_names.size(), static_cast(0)); + PADDLE_ENFORCE_EQ(in_var_names.size(), out_var_names.size()); + + for (size_t i = 0; i < in_var_names.size(); ++i) { + // Only support LoDTensor + PADDLE_ENFORCE_NOT_NULL(in_vars[i], "%s should not be nullptr,", + in_var_names[i]); + PADDLE_ENFORCE_NOT_NULL(out_vars[i], "%s should not be nullptr,", + out_var_names[i]); + PADDLE_ENFORCE(in_vars[i]->IsType()); + PADDLE_ENFORCE(out_vars[i]->IsType()); + } + + auto in_tensors = context.MultiInput("Input"); + + if (context.Attr("check_name")) { + for (size_t i = 0; i < in_var_names.size(); ++i) { + PADDLE_ENFORCE_EQ(in_var_names[i], out_var_names[i]); + } + } else { + // Init the output as input + for (size_t i = 0; i < in_tensors.size(); ++i) { + out_vars[i]->GetMutable()->Resize( + in_tensors[i]->dims()); + } + } + + auto &dev_ctx = context.template device_context(); + + // Get numel and dtype + size_t numel = 0; + auto dtype = kDefaultDtype; + GetMemSizeAndDtype(in_tensors, in_var_names, &numel, &dtype, + context.GetPlace()); + + // Alloc the continuous space + auto fused_tensor = context.Output("FusedOutput"); + fused_tensor->Resize(framework::make_ddim({static_cast(numel)})) + .mutable_data(context.GetPlace(), dtype); + + // Init the continuous space + auto out_tensors = context.MultiOutput("Output"); + size_t offset = 0; + size_t size_of_dtype = framework::SizeOfType(dtype); + if (context.Attr("copy_data")) { + for (size_t i = 0; i < in_var_names.size(); ++i) { + size_t len = static_cast(in_tensors[i]->numel()); + auto sub_tensor = fused_tensor->Slice( + static_cast(offset), static_cast(offset + len)); + framework::TensorCopy(*in_tensors[i], context.GetPlace(), dev_ctx, + &sub_tensor); + + offset += platform::Alignment(len * size_of_dtype, context.GetPlace()) / + size_of_dtype; + } + } else if (context.Attr("set_constant")) { + math::SetConstant set_constant; + set_constant(dev_ctx, fused_tensor, + static_cast(context.Attr("constant"))); + } + + // Make the outputs point to the continuous space. + offset = 0; + std::stringstream ss; + ss << "alloc_space_for_vars: "; + for (size_t i = 0; i < out_tensors.size(); ++i) { + size_t len = static_cast(out_tensors[i]->numel()); + auto dim = out_tensors[i]->dims(); + out_tensors[i] + ->ShareDataWith(fused_tensor->Slice( + static_cast(offset), static_cast(offset + len))) + .Resize(dim); + len = platform::Alignment(len * size_of_dtype, context.GetPlace()) / + size_of_dtype; + offset += len; + ss << "output(" << out_var_names[i] << ") dim:(" << dim << ")" + << " address: " << out_tensors[i]->data() << ", "; + } + VLOG(10) << ss.str(); + } + + private: + void GetMemSizeAndDtype( + const std::vector &lod_tensors, + const std::vector var_names, size_t *numel, + framework::proto::VarType::Type *dtype, + const platform::Place &place) const { + PADDLE_ENFORCE_EQ(lod_tensors.size(), var_names.size()); + *numel = 0; + size_t size_of_dtype = 0; + + std::stringstream ss; + ss << "alloc_space_for_vars: "; + for (size_t i = 0; i < var_names.size(); ++i) { + PADDLE_ENFORCE(lod_tensors[i]->IsInitialized(), "%s is not initialized.", + var_names[i]); + + auto p_dtype = lod_tensors[i]->type(); + if (*dtype == kDefaultDtype) { + PADDLE_ENFORCE_NE(p_dtype, kDefaultDtype, "%s's type should not be %s.", + var_names[i], kDefaultDtype); + *dtype = p_dtype; + size_of_dtype = framework::SizeOfType(p_dtype); + } + PADDLE_ENFORCE_EQ(p_dtype, *dtype, "Input vars is not equal."); + + auto size = lod_tensors[i]->numel(); + PADDLE_ENFORCE_GT(size, 0); + ss << "input(" << var_names[i] << ") dim:(" << lod_tensors[i]->dims() + << "), "; + *numel += platform::Alignment(static_cast(size) * size_of_dtype, + place) / + size_of_dtype; + } + + VLOG(10) << ss.str(); + } +}; + +class AllocContinuousSpaceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override {} +}; + +class AllocContinuousSpaceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Input", + "(vector) The input tensors of" + " coalesce_tensor operator.") + .AsDuplicable(); + AddOutput("Output", + "(vector) The output " + "tensors of coalesce_tensor operator. And the address " + "of output tensors are continuous, they are sliced from the " + "tensor of FusedOutput.") + .AsDuplicable(); + AddOutput("FusedOutput", + "(LoDTensor) The output tensor " + "of coalesce_tensor operator. And the tensors of" + " Output is sliced from the tensor of FusedOutput."); + AddAttr("copy_data", "Whether to copy the Input value to Output.") + .SetDefault(false); + AddAttr("set_constant", + "Whether to set the Output with a constant value.") + .SetDefault(false); + AddAttr("constant", + "If set_constant is true, the constant value will be used " + "to set the Output.") + .SetDefault(0.0); + AddAttr("check_name", + "Whether to check the name of Input and Output to ensure " + "they are the same separately.") + .SetDefault(false); + AddComment(R"DOC( +AllocContinuousSpace Operator. + +coalesce_tensor is used to make the address of Output +continuous according to the Input. This Op will alloc a big tensor +according to the tensors of Input, the dtype is the same with those input tensors, +the size is the sum of those input tensors' numel, and the dim of the big +tensor is {sum(numel)}. And the big tensor is stored in FusedOutput. +The tensors of Output are sliced from the tensor of FusedOutput. +Note that, the dtype of Input should be the same, and the dim of Input +and Output should equal. +The tensors of Input and Output could be the same or different. And +coalesce_tensor allows copying the value of Input to Output, or +setting the Output with a constant value. + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(coalesce_tensor, paddle::operators::AllocContinuousSpaceOp, + paddle::operators::AllocContinuousSpaceOpMaker); +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_CPU_KERNEL( + coalesce_tensor, + ops::CoalesceTensorOp, + ops::CoalesceTensorOp, + ops::CoalesceTensorOp, + ops::CoalesceTensorOp); + +#ifdef PADDLE_WITH_CUDA +REGISTER_OP_CUDA_KERNEL( + coalesce_tensor, + ops::CoalesceTensorOp, + ops::CoalesceTensorOp, + ops::CoalesceTensorOp, + ops::CoalesceTensorOp); +#endif diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt new file mode 100644 index 00000000..89103f63 --- /dev/null +++ b/paddle/fluid/operators/collective/CMakeLists.txt @@ -0,0 +1,39 @@ +include(operators) + +set(COLLECTIVE_DEPS "") +if(WITH_GRPC) + set(COLLECTIVE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator async_sparse_param_update_recorder grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node) +else() + set(COLLECTIVE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator async_sparse_param_update_recorder brpc leveldb snappystream snappy protobuf ssl crypto zlib node) + if(WITH_BRPC_RDMA) + find_library(IBVERBS_LIBRARY NAMES ibverbs) + ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET ibverbs PROPERTY IMPORTED_LOCATION ${IBVERBS_LIBRARY}) + + + find_library(RDMACM_LIBRARY NAMES rdmacm) + ADD_LIBRARY(rdmacm SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET rdmacm PROPERTY IMPORTED_LOCATION ${RDMACM_LIBRARY}) + + set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} ibverbs rdmacm) + endif() +endif() + +set(COLLECTIVE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + +file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc") +list(REMOVE_DUPLICATES OPS) + +foreach(src ${OPS}) + set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS ${COLLECTIVE_COMPILE_FLAGS}) +endforeach() + +register_operators(EXCLUDES c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) + +if(WITH_GPU AND NOT WIN32) + set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper) + op_library(c_gen_nccl_id_op DEPS ${COLLECTIVE_DEPS} nccl_common) +endif() + +set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COLLECTIVE_DEPS} PARENT_SCOPE) +set(GLOB_COLLECTIVE_DEPS ${COLLECTIVE_DEPS} CACHE INTERNAL "collective dependency") diff --git a/paddle/fluid/operators/collective/c_allgather_op.cc b/paddle/fluid/operators/collective/c_allgather_op.cc new file mode 100644 index 00000000..18c8f5d6 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allgather_op.cc @@ -0,0 +1,86 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allgather_op.h" + +#include + +namespace paddle { +namespace operators { + +class CAllGatherOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null."); + int nranks = ctx->Attrs().Get("nranks"); + PADDLE_ENFORCE_GE(nranks, 2, "nranks should be >=2"); + framework::DDim dim = ctx->GetInputDim("X"); + dim[0] = dim[0] * nranks; + ctx->SetOutputDim("Out", dim); + } +}; + +class CAllGatherOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "(Tensor) tensor to be allgather"); + AddOutput("Out", "(Tensor) the allgather result"); + AddAttr("ring_id", "(int default 0) communication ring id.") + .SetDefault(0); + AddAttr( + "use_calc_stream", + "(bool default false) eject CUDA operations to calculation stream.") + .SetDefault(false); + AddAttr("nranks", + "Total trainer count of the distributed training job"); + AddComment(R"DOC( +CAllGather Operator +each rank receives the aggregation of data from all ranks in the order of the ranks + +reference: https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/usage/operations.html#allgather +)DOC"); + } +}; + +class CAllGatherOpGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr retv(new framework::OpDesc()); + retv->SetType("c_reducescatter"); + retv->SetInput("X", OutputGrad("Out")); + retv->SetOutput("Out", InputGrad("X")); + retv->SetAttrMap(Attrs()); + return retv; + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(c_allgather, ops::CAllGatherOp, ops::CAllGatherOpGradMaker, + ops::CAllGatherOpMaker); + +REGISTER_OP_CPU_KERNEL(c_allgather, ops::CAllGatherOpCPUKernel, + ops::CAllGatherOpCPUKernel, + ops::CAllGatherOpCPUKernel, + ops::CAllGatherOpCPUKernel, + ops::CAllGatherOpCPUKernel); diff --git a/paddle/fluid/operators/collective/c_allgather_op.cu.cc b/paddle/fluid/operators/collective/c_allgather_op.cu.cc new file mode 100644 index 00000000..330219cd --- /dev/null +++ b/paddle/fluid/operators/collective/c_allgather_op.cu.cc @@ -0,0 +1,77 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allgather_op.h" + +#include + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CAllGatherOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + ncclDataType_t dtype = platform::ToNCCLDataType(in->type()); + + int nranks = ctx.Attr("nranks"); + int rid = ctx.Attr("ring_id"); + auto comm = platform::NCCLCommContext::Instance().Get(rid); + PADDLE_ENFORCE_EQ(nranks, comm->nranks()); + + auto place = ctx.GetPlace(); + framework::DDim out_dims = in->dims(); + out_dims[0] *= nranks; + out->mutable_data(out_dims, place); + + int64_t send_numel = in->numel(); + const T* send_buff = in->data(); + T* recv_buff = out->data(); + + cudaStream_t stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + PADDLE_ENFORCE(platform::dynload::ncclAllGather( + send_buff, recv_buff, send_numel, static_cast(dtype), + comm->comm(), stream)); +#else + PADDLE_THROW("PaddlePaddle should compile with GPU."); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(c_allgather, ops::CAllGatherOpCUDAKernel, + ops::CAllGatherOpCUDAKernel, + ops::CAllGatherOpCUDAKernel, + ops::CAllGatherOpCUDAKernel, + ops::CAllGatherOpCUDAKernel); diff --git a/paddle/fluid/operators/collective/c_allgather_op.h b/paddle/fluid/operators/collective/c_allgather_op.h new file mode 100644 index 00000000..fe99a9e1 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allgather_op.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class CAllGatherOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW("unimplemented cpu kernel for CAllGatherOp."); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op.cc b/paddle/fluid/operators/collective/c_allreduce_max_op.cc new file mode 100644 index 00000000..bcb529f1 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_max_op.cc @@ -0,0 +1,39 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace operators { + +class CAllReduceMaxOpMaker : public CAllReduceOpMaker { + protected: + std::string GetName() const override { return "Max"; } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_max, ops::CAllReduceOp, + ops::CAllReduceMaxOpMaker); + +REGISTER_OP_CPU_KERNEL(c_allreduce_max, + ops::CAllReduceOpCPUKernel, + ops::CAllReduceOpCPUKernel, + ops::CAllReduceOpCPUKernel, + ops::CAllReduceOpCPUKernel, + ops::CAllReduceOpCPUKernel); diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc new file mode 100644 index 00000000..34054103 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc @@ -0,0 +1,25 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + c_allreduce_max, ops::CAllReduceOpCUDAKernel, + ops::CAllReduceOpCUDAKernel, + ops::CAllReduceOpCUDAKernel, + ops::CAllReduceOpCUDAKernel, + ops::CAllReduceOpCUDAKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op.cc b/paddle/fluid/operators/collective/c_allreduce_min_op.cc new file mode 100644 index 00000000..9d27a9ce --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_min_op.cc @@ -0,0 +1,39 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace operators { + +class CAllReduceMinOpMaker : public CAllReduceOpMaker { + protected: + std::string GetName() const override { return "Min"; } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_min, ops::CAllReduceOp, + ops::CAllReduceMinOpMaker); + +REGISTER_OP_CPU_KERNEL(c_allreduce_min, + ops::CAllReduceOpCPUKernel, + ops::CAllReduceOpCPUKernel, + ops::CAllReduceOpCPUKernel, + ops::CAllReduceOpCPUKernel, + ops::CAllReduceOpCPUKernel); diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc new file mode 100644 index 00000000..4e8b6f9d --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc @@ -0,0 +1,25 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + c_allreduce_min, ops::CAllReduceOpCUDAKernel, + ops::CAllReduceOpCUDAKernel, + ops::CAllReduceOpCUDAKernel, + ops::CAllReduceOpCUDAKernel, + ops::CAllReduceOpCUDAKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h new file mode 100644 index 00000000..1db5f155 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -0,0 +1,139 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +enum ReduceType { kRedSum, kRedMax, kRedMin, kRedProd }; + +class CAllReduceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); + } +}; + +template +class CAllReduceOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW("CAllReduce op do not support CPUKernel for now."); + } +}; + +template +class CAllReduceOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + ncclDataType_t dtype = platform::ToNCCLDataType(in->type()); + int64_t numel = in->numel(); + const void* sendbuff = in->data(); + out->Resize(in->dims()); + void* recvbuff = out->mutable_data(place); + + int rid = ctx.Attr("ring_id"); + auto comm = platform::NCCLCommContext::Instance().Get(rid); + + cudaStream_t stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + ncclRedOp_t nccl_red_type = ncclSum; + switch (red_type) { + case kRedSum: + nccl_red_type = ncclSum; + break; + + case kRedMax: + nccl_red_type = ncclMax; + break; + + case kRedMin: + nccl_red_type = ncclMin; + break; + + case kRedProd: + nccl_red_type = ncclProd; + break; + + default: + PADDLE_THROW("Invalid reduce type: %d", red_type); + } + + PADDLE_ENFORCE(platform::dynload::ncclAllReduce( + sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream)); +#else + PADDLE_THROW("PaddlePaddle should compile with GPU."); +#endif + } +}; + +class CAllReduceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "(Tensor), tensor to be allreduced."); + AddOutput("Out", "(Tensor) the allreduced result."); + AddAttr("ring_id", "(int default 0) communication ring id.") + .SetDefault(0); + AddAttr( + "use_calc_stream", + "(bool default false) eject CUDA operations to calculation stream.") + .SetDefault(false); + AddComment(string::Sprintf(R"DOC( +CAllReduce %s Operator + +Call collective AllReduce with reduce type %s. If input and output are +the same variable, in-place allreduce will be used. +Reference: https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/usage/operations.html#allreduce +)DOC", + GetName(), GetName())); + } + + protected: + virtual std::string GetName() const = 0; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op.cc new file mode 100644 index 00000000..3cfb1723 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op.cc @@ -0,0 +1,39 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace operators { + +class CAllReduceProdOpMaker : public CAllReduceOpMaker { + protected: + std::string GetName() const override { return "Prod"; } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_prod, ops::CAllReduceOp, + ops::CAllReduceProdOpMaker); + +REGISTER_OP_CPU_KERNEL(c_allreduce_prod, + ops::CAllReduceOpCPUKernel, + ops::CAllReduceOpCPUKernel, + ops::CAllReduceOpCPUKernel, + ops::CAllReduceOpCPUKernel, + ops::CAllReduceOpCPUKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc new file mode 100644 index 00000000..61f76c17 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc @@ -0,0 +1,25 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + c_allreduce_prod, ops::CAllReduceOpCUDAKernel, + ops::CAllReduceOpCUDAKernel, + ops::CAllReduceOpCUDAKernel, + ops::CAllReduceOpCUDAKernel, + ops::CAllReduceOpCUDAKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op.cc new file mode 100644 index 00000000..c80c585a --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.cc @@ -0,0 +1,54 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace operators { + +class CAllReduceSumOpGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr retv(new framework::OpDesc()); + retv->SetType("c_allreduce_sum"); + retv->SetInput("X", OutputGrad("Out")); + retv->SetOutput("Out", InputGrad("X")); + retv->SetAttrMap(Attrs()); + return retv; + } +}; + +class CAllReduceSumOpMaker : public CAllReduceOpMaker { + protected: + std::string GetName() const override { return "Sum"; } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(c_allreduce_sum, ops::CAllReduceOp, + ops::CAllReduceSumOpGradMaker, ops::CAllReduceSumOpMaker); + +REGISTER_OP_CPU_KERNEL(c_allreduce_sum, + ops::CAllReduceOpCPUKernel, + ops::CAllReduceOpCPUKernel, + ops::CAllReduceOpCPUKernel, + ops::CAllReduceOpCPUKernel, + ops::CAllReduceOpCPUKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc new file mode 100644 index 00000000..8fe7fce2 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc @@ -0,0 +1,25 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + c_allreduce_sum, ops::CAllReduceOpCUDAKernel, + ops::CAllReduceOpCUDAKernel, + ops::CAllReduceOpCUDAKernel, + ops::CAllReduceOpCUDAKernel, + ops::CAllReduceOpCUDAKernel) diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cc b/paddle/fluid/operators/collective/c_broadcast_op.cc new file mode 100644 index 00000000..72d33030 --- /dev/null +++ b/paddle/fluid/operators/collective/c_broadcast_op.cc @@ -0,0 +1,70 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_broadcast_op.h" + +namespace paddle { +namespace operators { + +class CBroadcastOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); + } +}; + +class CBroadcastOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "(Tensor) tensor to be broadcasted."); + AddOutput("Out", "(Tensor) the result of broadcast."); + AddAttr("ring_id", "(int default 0) nccl communication ring id.") + .SetDefault(0); + AddAttr("root", "(int default 0) root id for broadcasting.") + .SetDefault(0); + AddAttr( + "use_calc_stream", + "(bool default false) eject CUDA operations to calculation stream.") + .SetDefault(false); + AddComment(R"DOC( +CBroadcast Operator + +Reference: https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/usage/operations.html#broadcast +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_WITHOUT_GRADIENT(c_broadcast, ops::CBroadcastOp, + ops::CBroadcastOpMaker); + +REGISTER_OP_CPU_KERNEL(c_broadcast, ops::CBroadcastOpCPUKernel, + ops::CBroadcastOpCPUKernel, + ops::CBroadcastOpCPUKernel, + ops::CBroadcastOpCPUKernel, + ops::CBroadcastOpCPUKernel); diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc new file mode 100644 index 00000000..c0f5bbd2 --- /dev/null +++ b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc @@ -0,0 +1,87 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_broadcast_op.h" + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CBroadcastOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + int numel = x->numel(); + ncclDataType_t dtype = platform::ToNCCLDataType(x->type()); + + int rid = ctx.Attr("ring_id"); + auto comm = platform::NCCLCommContext::Instance().Get(rid); + + auto place = ctx.GetPlace(); + cudaStream_t stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + int root = ctx.Attr("root"); + if (root == comm->rank()) { + PADDLE_ENFORCE(platform::dynload::ncclBcast( + reinterpret_cast(const_cast(x->data())), numel, dtype, + root, comm->comm(), stream)); + VLOG(3) << "rank " << comm->rank() << " invoke Bcast. sent " + << x->numel(); + + if (out != x) { + framework::TensorCopy( + *static_cast(x), place, + *platform::DeviceContextPool::Instance().Get(place), + static_cast(out)); + } + } else { + PADDLE_ENFORCE(platform::dynload::ncclBcast(out->mutable_data(place), + numel, dtype, root, + comm->comm(), stream)); + VLOG(3) << "rank " << comm->rank() << " invoke Bcast. recieved " + << framework::product(out->dims()); + } + + out->Resize(x->dims()); + out->set_lod(x->lod()); +#else + PADDLE_THROW("PaddlePaddle should compile with GPU."); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(c_broadcast, ops::CBroadcastOpCUDAKernel, + ops::CBroadcastOpCUDAKernel, + ops::CBroadcastOpCUDAKernel, + ops::CBroadcastOpCUDAKernel, + ops::CBroadcastOpCUDAKernel); diff --git a/paddle/fluid/operators/collective/c_broadcast_op.h b/paddle/fluid/operators/collective/c_broadcast_op.h new file mode 100644 index 00000000..4ceb0aa8 --- /dev/null +++ b/paddle/fluid/operators/collective/c_broadcast_op.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class CBroadcastOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW("Unimplemented cpu kernel for CBroadcastOp."); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_comm_init_op.cc b/paddle/fluid/operators/collective/c_comm_init_op.cc new file mode 100644 index 00000000..16ca6e52 --- /dev/null +++ b/paddle/fluid/operators/collective/c_comm_init_op.cc @@ -0,0 +1,87 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#include +#endif +#include +#include +#include + +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/operators/distributed/distributed.h" +#include "paddle/fluid/operators/distributed/request_handler_impl.h" +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +class CCommInitOp : public framework::OperatorBase { + public: + CCommInitOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& place) const override { + PADDLE_ENFORCE(is_gpu_place(place), + "CCommInitOp can run on gpu place only."); + + auto var = scope.FindVar(Input("X")); + PADDLE_ENFORCE_NOT_NULL(var); +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + ncclUniqueId* nccl_id = var->GetMutable(); + + int nranks = Attr("nranks"); + int rank_id = Attr("rank"); + int rid = Attr("ring_id"); + + platform::NCCLCommContext::Instance().CreateNCCLComm( + nccl_id, nranks, rank_id, boost::get(place).device, + rid); +#else + PADDLE_THROW("PaddlePaddle should compile with GPU."); +#endif + } +}; + +class CCommInitOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Raw variable contains a NCCL UniqueId instaces."); + AddComment(R"DOC( +CCommInit operator + +Initialize collective communicatoin context within this trainer +)DOC"); + AddAttr("nranks", "(int) The number of ranks of distributed trainers"); + AddAttr("rank", + "(int) The rank of the trainer in distributed training."); + AddAttr("ring_id", "(int default 0) user specified ring id") + .SetDefault(0); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(c_comm_init, ops::CCommInitOp, ops::CCommInitOpMaker); diff --git a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc new file mode 100644 index 00000000..d576ca7d --- /dev/null +++ b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc @@ -0,0 +1,150 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#include +#endif + +#include +#include +#include + +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/operators/distributed/distributed.h" +#include "paddle/fluid/operators/distributed/request_handler_impl.h" + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +class CGenNCCLIdOp : public framework::OperatorBase { + public: + CGenNCCLIdOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + // put nccl id in CPUPlace + auto& dev_ctx = *pool.Get(platform::CPUPlace()); + int rank = Attr("rank"); + framework::Scope& local_scope = scope.NewScope(); + + if (rank == 0) { + GenerateAndSend(&local_scope, dev_ctx); + } else { + GetIdByServer(&local_scope, dev_ctx); + } + scope.DeleteScope(&local_scope); + } + + private: + void GenerateAndSend(framework::Scope* scope, + const platform::DeviceContext& dev_ctx) const { + std::string var_name = Output("Out"); + auto var = scope->FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL(var); + auto id = var->GetMutable(); + PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(id)); + + std::vector endpoint_list = + Attr>("other_endpoints"); + distributed::RPCClient* client = + distributed::RPCClient::GetInstance(0); + + for (auto& ep : endpoint_list) { + VLOG(3) << "sending nccl id to " << ep; + client->AsyncSendVar(ep, dev_ctx, *scope, var_name); + } + client->Wait(); + for (auto& ep : endpoint_list) { + client->AsyncSendBatchBarrier(ep); + } + client->Wait(); + VLOG(3) << "sending completed..."; + } + + void GetIdByServer(framework::Scope* scope, + const platform::DeviceContext& dev_ctx) const { + std::string endpoint = Attr("endpoint"); + // NOTE: Can not use unique_ptr here because the default + // deleter will call GRPC Server's base class's dtor and + // that will cause a wired crash. + distributed::RequestSendHandler rpc_h(true); + std::unique_ptr rpc_service( + new RPCSERVER_T(endpoint, 1)); + + rpc_service->RegisterRPC(distributed::kRequestSend, &rpc_h); + rpc_h.SetRPCServer(rpc_service.get()); + + framework::ProgramDesc empty_program; + framework::Executor executor(dev_ctx.GetPlace()); + rpc_h.SetScope(scope); + rpc_h.SetDevCtx(&dev_ctx); + rpc_h.SetProgram(&empty_program); + rpc_h.SetExecutor(&executor); + + std::thread server_thread( + std::bind(&distributed::RPCServer::StartServer, rpc_service.get())); + + rpc_service->SetCond(distributed::kRequestSend); + VLOG(3) << "start getting nccl id from trainer 0..."; + rpc_service->WaitBarrier(distributed::kRequestSend); + VLOG(3) << "got nccl id and stop server..."; + rpc_service->ShutDown(); + VLOG(3) << "rpc server stopped"; + server_thread.join(); + } +}; + +class CGenNCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddOutput("Out", "Raw variable contains a NCCL UniqueId instaces."); + AddComment(R"DOC( +CGenNCCLId operator + +For trainer 0: generate a new UniqueId and send it to all the other trainers. +For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the server. +)DOC"); + AddAttr("endpoint", + "(string), e.g. 127.0.0.1:6175 " + "current listen endpoint"); + AddAttr>( + "other_endpoints", + "['trainer1_ip:port', 'trainer2_ip:port', ...] " + "list of other trainer endpoints") + .SetDefault({}); + AddAttr("rank", + "(int default 0) " + "The rank of the trainer in distributed training.") + .SetDefault(0); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(c_gen_nccl_id, ops::CGenNCCLIdOp, ops::CGenNCCLIdOpMaker); diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.cc b/paddle/fluid/operators/collective/c_reducescatter_op.cc new file mode 100644 index 00000000..1194ac71 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reducescatter_op.cc @@ -0,0 +1,91 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reducescatter_op.h" + +#include + +namespace paddle { +namespace operators { + +class CReduceScatterOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null."); + int nranks = ctx->Attrs().Get("nranks"); + framework::DDim dim = ctx->GetInputDim("X"); + if (dim[0] > 0 || dim[0] < -1) { + PADDLE_ENFORCE(dim[0] % nranks == 0, + "dim[0] (%d) is not divisible by nranks(%d)", dim[0], + nranks); + dim[0] /= nranks; + } + ctx->SetOutputDim("Out", dim); + } +}; + +class CReduceScatterOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "(Tensor) tensor to be allgather"); + AddOutput("Out", "(Tensor) the allgather result"); + AddAttr("ring_id", "(int default 0) communication ring id.") + .SetDefault(0); + AddAttr("nranks", + "Total trainer count of the distributed training job") + .SetDefault(1); + AddAttr( + "use_calc_stream", + "(bool default false) eject CUDA operations to calculation stream.") + .SetDefault(false); + AddComment(R"DOC( +CReduceScatter Operator + +Reference: https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/usage/operations.html#reducescatter +)DOC"); + } +}; + +class CReduceScatterOpGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr retv(new framework::OpDesc()); + retv->SetType("c_allgather"); + retv->SetInput("X", OutputGrad("Out")); + retv->SetOutput("Out", InputGrad("X")); + retv->SetAttrMap(Attrs()); + return retv; + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(c_reducescatter, ops::CReduceScatterOp, + ops::CReduceScatterOpMaker); + +REGISTER_OP_CPU_KERNEL(c_reducescatter, ops::CReduceScatterOpCPUKernel, + ops::CReduceScatterOpCPUKernel, + ops::CReduceScatterOpCPUKernel, + ops::CReduceScatterOpCPUKernel, + ops::CReduceScatterOpCPUKernel); diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc new file mode 100644 index 00000000..7244aa94 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reducescatter_op.h" + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CReduceScatterOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + + int rid = ctx.Attr("ring_id"); + auto comm = platform::NCCLCommContext::Instance().Get(rid); + int nranks = comm->nranks(); + + auto place = ctx.GetPlace(); + auto out_dims = in->dims(); + out_dims[0] = out_dims[0] / nranks; + out->mutable_data(out_dims, place); + + int64_t recv_numel = in->numel() / nranks; + const T* send_buff = in->data(); + T* recv_buff = out->data(); + int dtype = platform::ToNCCLDataType(in->type()); + + cudaStream_t stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + PADDLE_ENFORCE(platform::dynload::ncclReduceScatter( + send_buff, recv_buff, recv_numel, static_cast(dtype), + ncclSum, comm->comm(), stream)); +#else + PADDLE_THROW("PaddlePaddle should compile with GPU."); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(c_reducescatter, ops::CReduceScatterOpCUDAKernel, + ops::CReduceScatterOpCUDAKernel, + ops::CReduceScatterOpCUDAKernel, + ops::CReduceScatterOpCUDAKernel, + ops::CReduceScatterOpCUDAKernel); diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.h b/paddle/fluid/operators/collective/c_reducescatter_op.h new file mode 100644 index 00000000..ee308080 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reducescatter_op.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class CReduceScatterOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW("Unimplemented cpu kernel for CReduceScatterOp."); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc new file mode 100644 index 00000000..fe74fc59 --- /dev/null +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc @@ -0,0 +1,75 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#include +#endif + +#include + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#include "paddle/fluid/platform/collective_helper.h" +#endif + +namespace paddle { +namespace operators { + +class CSyncCalcStreamOp : public framework::OperatorBase { + public: + CSyncCalcStreamOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& place) const override { + PADDLE_ENFORCE(is_gpu_place(place), + "Sync stream op can run on gpu place only for now."); +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + auto dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + cudaError_t e_sync = cudaStreamSynchronize(dev_ctx->stream()); + if (e_sync != 0) { + LOG(FATAL) << "Fail to sync cuda stream: " << cudaGetErrorString(e_sync); + } +#else + PADDLE_THROW("PaddlePaddle should compile with GPU."); +#endif + } +}; + +class CSyncCalcStreamOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "(Tensor) Dependency of the variable need to sync"); + AddOutput("Out", "(Tensor) Dependency of the variable need to sync"); + AddComment(R"DOC( +CSyncCalcStream Operator + +Call calculation stream synchronization. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(c_sync_calc_stream, ops::CSyncCalcStreamOp, + ops::CSyncCalcStreamOpMaker); diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc new file mode 100644 index 00000000..51703561 --- /dev/null +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc @@ -0,0 +1,77 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#include +#endif + +#include + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +class CSyncCommStreamOp : public framework::OperatorBase { + public: + CSyncCommStreamOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& place) const override { + PADDLE_ENFORCE(is_gpu_place(place), + "Sync stream op can run on gpu place only for now."); + +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + int ring_id = Attr("ring_id"); + auto stream = platform::NCCLCommContext::Instance().Get(ring_id)->stream(); + cudaError_t e_sync = cudaStreamSynchronize(stream); + if (e_sync != 0) { + LOG(FATAL) << "Fail to sync nccl stream: " << cudaGetErrorString(e_sync); + } +#else + PADDLE_THROW("PaddlePaddle should compile with GPU."); +#endif + } +}; + +class CSyncCommStreamOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "(Tensor) Dependency of the variable need to sync"); + AddOutput("Out", "(Tensor) Dependency of the variable need to sync"); + AddAttr("ring_id", "(int default 0) ring id.").SetDefault(0); + AddComment(R"DOC( +CSyncCommStream Operator + +Call communication stream synchronization. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(c_sync_comm_stream, ops::CSyncCommStreamOp, + ops::CSyncCommStreamOpMaker); diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc new file mode 100644 index 00000000..7f249924 --- /dev/null +++ b/paddle/fluid/operators/concat_op.cc @@ -0,0 +1,215 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/concat_op.h" +#include +#include +#include + +#ifdef PADDLE_WITH_MKLDNN +#include +#endif + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; + +class ConcatOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL, + "Inputs(X) of ConcatOp should be empty."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ConcatOp should not be null."); + + auto ins = ctx->GetInputsDim("X"); + size_t axis = + ComputeAxis(static_cast(ctx->Attrs().Get("axis")), + static_cast(ins[0].size())); + + const size_t n = ins.size(); + + PADDLE_ENFORCE_GT(n, 0, "Input tensors count should > 0."); + if (n == 1) { + VLOG(3) << "Warning: concat op have only one input, may waste memory"; + } + + auto out_dims = ins[0]; + size_t in_zero_dims_size = out_dims.size(); + for (size_t i = 1; i < n; i++) { + for (size_t j = 0; j < in_zero_dims_size; j++) { + if (j == axis) { + if (ctx->IsRuntime()) { + out_dims[axis] += ins[i][j]; + } else { + if (ins[i][j] == -1) { + out_dims[axis] = -1; + } else { + out_dims[axis] += ins[i][j]; + } + } + } else { + bool check_shape = + ctx->IsRuntime() || (out_dims[j] > 0 && ins[i][j] > 0); + if (check_shape) { + // check all shape in run time + PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j], + "Input tensors should have the same " + "elements except the specify axis."); + } + } + } + } + if (out_dims[axis] < 0) { + out_dims[axis] = -1; + } + ctx->SetOutputDim("Out", out_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto inputs = ctx.MultiInput("X"); + auto input_data_type = framework::proto::VarType::Type(0); + bool flag = 0; + for (auto *input : inputs) { + if (input->IsInitialized() && input->numel() > 0) { + input_data_type = input->type(); + flag = 1; + break; + } + } + if (flag == 0) { + PADDLE_THROW("All Inputs of Concat OP are Empty!"); + } + +#ifdef PADDLE_WITH_MKLDNN + if (platform::CanMKLDNNBeUsed(ctx)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } +}; + +class ConcatOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Input tensors of concat operator.").AsDuplicable(); + AddOutput("Out", "Output tensor of concat operator."); + AddAttr( + "use_mkldnn", + "(bool, default false) Indicates if MKL-DNN kernel will be used") + .SetDefault(false); + AddAttr("axis", + "The axis along which the input tensors will be concatenated." + "The axis could also be negative numbers. Negative axis is " + "interpreted as counting from the end of the rank." + "i.e., axis + rank(X) th dimension.") + .SetDefault(0); + AddAttr("use_quantizer", + "(bool, default false) " + "Set to true for operators that should be quantized and use " + "int8 kernel. " + "Only used on CPU.") + .SetDefault(false); + AddComment(R"DOC( +Concat Operator. + +Concatenate the input tensors along dimension axis. +Examples: + Input[0] = [[1,2],[3,4]] + Input[1] = [[5,6]] + axis = 0 + Output = [[1,2], + [3,4], + [5,6]] + +)DOC"); + } +}; + +class ConcatOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + auto in_x = "X"; + auto out_x_g_n = framework::GradVarName(in_x); + ctx->SetOutputsDim(out_x_g_n, ctx->GetInputsDim(in_x)); + auto &in_names = ctx->Inputs(in_x); + auto &out_names = ctx->Outputs(out_x_g_n); + PADDLE_ENFORCE_EQ( + in_names.size(), out_names.size(), + "The number of arguments in %s[%d] and %s[%d] is not equal.", in_x, + in_names.size(), out_x_g_n, out_names.size()); + for (size_t i = 0; i < in_names.size(); ++i) { + if (out_names[i] != framework::kEmptyVarName) { + ctx->ShareLoD(in_x, out_x_g_n, i, i); + } + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.GetPlace()); + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ConcatOpGradNoNeedBufferVarInference, + "X"); + +class ConcatGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("concat_grad"); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X", false)); + op->SetAttrMap(Attrs()); + return op; + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(concat, ops::ConcatOp, ops::ConcatOpMaker, + ops::ConcatGradOpDescMaker); +REGISTER_OPERATOR(concat_grad, ops::ConcatOpGrad, + ops::ConcatOpGradNoNeedBufferVarInference); +REGISTER_OP_CPU_KERNEL( + concat, ops::ConcatKernel, + ops::ConcatKernel, + ops::ConcatKernel, + ops::ConcatKernel); +REGISTER_OP_CPU_KERNEL( + concat_grad, + ops::ConcatGradKernel, + ops::ConcatGradKernel, + ops::ConcatGradKernel, + ops::ConcatGradKernel); diff --git a/paddle/fluid/operators/concat_op.cu.cc b/paddle/fluid/operators/concat_op.cu.cc new file mode 100644 index 00000000..334126c4 --- /dev/null +++ b/paddle/fluid/operators/concat_op.cu.cc @@ -0,0 +1,32 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/concat_op.h" +#include "paddle/fluid/platform/float16.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_CUDA_KERNEL( + concat, ops::ConcatKernel, + ops::ConcatKernel, + ops::ConcatKernel, + ops::ConcatKernel, + ops::ConcatKernel); +REGISTER_OP_CUDA_KERNEL( + concat_grad, + ops::ConcatGradKernel, + ops::ConcatGradKernel, + ops::ConcatGradKernel, + ops::ConcatGradKernel, + ops::ConcatGradKernel); diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h new file mode 100644 index 00000000..4a371de3 --- /dev/null +++ b/paddle/fluid/operators/concat_op.h @@ -0,0 +1,126 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/concat_and_split.h" +#include "paddle/fluid/operators/strided_memcpy.h" + +namespace paddle { +namespace operators { + +static inline int64_t ComputeAxis(int64_t axis, int64_t rank) { + if (axis < 0) { + axis = axis + rank; + } + return axis > 0 ? axis : 0; +} + +template +class ConcatKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto ins = ctx.MultiInput("X"); + framework::Tensor* out = ctx.Output("Out"); + PADDLE_ENFORCE(ins[0], "The input should not be null."); + auto axis = ComputeAxis(static_cast(ctx.Attr("axis")), + static_cast(ins[0]->dims().size())); + auto place = ctx.GetPlace(); + out->mutable_data(place); + + // Sometimes direct copies will be faster, this maybe need deeply analysis. + if (axis == 0 && ins.size() < 10) { + size_t output_offset = 0; + for (auto* in : ins) { + if (!in || in->numel() == 0UL) { + continue; + } + auto in_stride = framework::stride_numel(in->dims()); + auto out_stride = framework::stride_numel(out->dims()); + StridedNumelCopyWithAxis(ctx.device_context(), axis, + out->data() + output_offset, out_stride, + in->data(), in_stride, in_stride[axis]); + output_offset += in_stride[axis]; + } + } else { + std::vector inputs; + for (size_t j = 0; j < ins.size(); ++j) { + if (ins[j] && ins[j]->numel() > 0) { + inputs.push_back(*ins[j]); + } else { + continue; + } + } + auto& dev_ctx = ctx.template device_context(); + paddle::operators::math::ConcatFunctor concat_functor; + concat_functor(dev_ctx, inputs, static_cast(axis), out); + } + } +}; + +template +class ConcatGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto ins = ctx.MultiInput("X"); + auto out_var_names = ctx.Outputs(framework::GradVarName("X")); + auto outs = + ctx.MultiOutput(framework::GradVarName("X")); + + { + auto dx = outs; + auto x = ins; + for (size_t i = 0; i < dx.size(); ++i) { + if (dx[i] != nullptr) { + dx[i]->set_lod(x[i]->lod()); + } + } + } + PADDLE_ENFORCE(ins[0], "The input should not be null."); + auto axis = ComputeAxis(static_cast(ctx.Attr("axis")), + static_cast(ins[0]->dims().size())); + + // get output tensor that the name is not kEmptyVarName + std::vector outputs; + for (size_t j = 0; j < outs.size(); ++j) { + if (out_var_names[j] != framework::kEmptyVarName && + outs[j]->numel() != 0UL) { + outs[j]->mutable_data(ctx.GetPlace()); + outputs.push_back(outs[j]); + } else { + outputs.push_back(nullptr); + } + } + auto& dev_ctx = ctx.template device_context(); + + // Sometimes direct copies will be faster, this maybe need deeply analysis. + if (axis == 0 && outs.size() < 10) { + std::vector ref_shape; + ref_shape.insert(ref_shape.begin(), ins.begin(), ins.end()); + StridedMemcpyWithAxis0(dev_ctx, *out_grad, ref_shape, &outputs); + } else { + math::SplitFunctor split_functor; + split_functor(dev_ctx, *out_grad, ctx.MultiInput("X"), + static_cast(axis), &outputs); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt new file mode 100644 index 00000000..f7281a2d --- /dev/null +++ b/paddle/fluid/operators/controlflow/CMakeLists.txt @@ -0,0 +1,7 @@ +include(operators) +register_operators(DEPS naive_executor) +cc_library(op_variant SRCS op_variant.cc DEPS operator proto_desc) +cc_library(recurrent_op_helper SRCS recurrent_op_helper.cc DEPS operator op_variant recurrent_op) +cc_library(while_op_helper SRCS while_op_helper.cc DEPS operator op_variant) + +file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n") diff --git a/paddle/fluid/operators/controlflow/compare_op.cc b/paddle/fluid/operators/controlflow/compare_op.cc new file mode 100644 index 00000000..5d3f9b43 --- /dev/null +++ b/paddle/fluid/operators/controlflow/compare_op.cc @@ -0,0 +1,139 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/controlflow/compare_op.h" +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class CompareOpKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + using T = typename Functor::ELEM_TYPE; + using Tensor = framework::Tensor; + + auto* x = context.Input("X"); + auto* y = context.Input("Y"); + auto* z = context.Output("Out"); + int axis = context.Attr("axis"); + + if (x->numel() == 1 && y->numel() == 1) { + bool* z_data = z->mutable_data(context.GetPlace()); + z_data[0] = Functor()(x->data()[0], y->data()[0]); + } else { + ElementwiseComputeEx( + context, x, y, axis, Functor(), z); + } + } +}; + +template +class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + OpComment comment; + AddInput("X", string::Sprintf("the left hand operand of %s operator", + comment.type)); + AddInput("Y", string::Sprintf("the right hand operand of %s operator", + comment.type)); + AddAttr( + "axis", + "The start dimension index for broadcasting Y onto X. [default -1]") + .SetDefault(-1) + .EqualGreaterThan(-1); + AddAttr("force_cpu", + "Force fill output variable to cpu " + "memory. Otherwise, fill output variable to the running " + "device [default true].") + .SetDefault(true); + AddOutput("Out", string::Sprintf("n-dim bool tensor. Each element is %s", + comment.equation)); + AddComment(string::Sprintf(R"DOC( +It operates element-wise on X and Y, and returns the Out. Each of them is a +N-dim tensor. X and Y could be any type. The each element of the Out tensor is +calculated by $%s$ +)DOC", + comment.equation)); + } +}; + +template +class CompareOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* context) const override { + OpComment comment; + PADDLE_ENFORCE(context->HasInput("X"), "%s operator must has input X", + comment.type); + PADDLE_ENFORCE(context->HasInput("Y"), "%s operator must has input Y", + comment.type); + auto dim_x = context->GetInputDim("X"); + auto dim_y = context->GetInputDim("Y"); + PADDLE_ENFORCE_GE(dim_x.size(), dim_y.size(), + "The size of dim_y should not be greater than dim_x's."); + + context->SetOutputDim("Out", context->GetInputDim("X")); + context->ShareLoD("X", "Out"); + } +}; + +class CompareOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); + // CompareOp kernel's device type is decided by input tensor place + bool force_cpu = ctx.Attr("force_cpu"); + kt.place_ = force_cpu ? platform::CPUPlace() + : ctx.Input("X")->place(); + return kt; + } +}; + +} // namespace operators +} // namespace paddle + +#define REGISTER_COMPARE_OP(op_type, _equation) \ + struct _##op_type##Comment { \ + static char type[]; \ + static char equation[]; \ + }; \ + char _##op_type##Comment::type[]{#op_type}; \ + char _##op_type##Comment::equation[]{_equation}; \ + REGISTER_OPERATOR( \ + op_type, ::paddle::operators::CompareOp, \ + ::paddle::operators::CompareOpProtoMaker<_##op_type##Comment>, \ + ::paddle::operators::CompareOpInferShape<_##op_type##Comment>, \ + ::paddle::framework::EmptyGradOpMaker); + +REGISTER_COMPARE_OP(less_than, "Out = X < Y"); +REGISTER_COMPARE_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor); +REGISTER_COMPARE_OP(less_equal, "Out = X <= Y"); +REGISTER_COMPARE_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor); +REGISTER_COMPARE_OP(greater_than, "Out = X > Y"); +REGISTER_COMPARE_KERNEL(greater_than, CPU, + paddle::operators::GreaterThanFunctor); +REGISTER_COMPARE_OP(greater_equal, "Out = X >= Y"); +REGISTER_COMPARE_KERNEL(greater_equal, CPU, + paddle::operators::GreaterEqualFunctor); +REGISTER_COMPARE_OP(equal, "Out = X == Y"); +REGISTER_COMPARE_KERNEL(equal, CPU, paddle::operators::EqualFunctor); +REGISTER_COMPARE_OP(not_equal, "Out = X != Y"); +REGISTER_COMPARE_KERNEL(not_equal, CPU, paddle::operators::NotEqualFunctor); diff --git a/paddle/fluid/operators/controlflow/compare_op.cu b/paddle/fluid/operators/controlflow/compare_op.cu new file mode 100644 index 00000000..b1f30635 --- /dev/null +++ b/paddle/fluid/operators/controlflow/compare_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/controlflow/compare_op.h" + +REGISTER_COMPARE_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor); +REGISTER_COMPARE_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor); +REGISTER_COMPARE_KERNEL(greater_than, CUDA, + paddle::operators::GreaterThanFunctor); +REGISTER_COMPARE_KERNEL(greater_equal, CUDA, + paddle::operators::GreaterEqualFunctor); +REGISTER_COMPARE_KERNEL(equal, CUDA, paddle::operators::EqualFunctor); +REGISTER_COMPARE_KERNEL(not_equal, CUDA, paddle::operators::NotEqualFunctor); diff --git a/paddle/fluid/operators/controlflow/compare_op.h b/paddle/fluid/operators/controlflow/compare_op.h new file mode 100644 index 00000000..b7529e4a --- /dev/null +++ b/paddle/fluid/operators/controlflow/compare_op.h @@ -0,0 +1,100 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace operators { + +template +struct LessThanFunctor { + using ELEM_TYPE = T; + HOSTDEVICE bool operator()(const T& a, const T& b) const { return a < b; } +}; + +template +struct LessEqualFunctor { + using ELEM_TYPE = T; + HOSTDEVICE bool operator()(const T& a, const T& b) const { return a <= b; } +}; + +template +struct GreaterThanFunctor { + using ELEM_TYPE = T; + HOSTDEVICE bool operator()(const T& a, const T& b) const { return a > b; } +}; + +template +struct GreaterEqualFunctor { + using ELEM_TYPE = T; + HOSTDEVICE bool operator()(const T& a, const T& b) const { return a >= b; } +}; + +template +struct EqualFunctor { + using ELEM_TYPE = T; + HOSTDEVICE bool operator()(const T& a, const T& b) const { + if (std::is_floating_point::value) { + // This branch will be optimized while compiling if T is integer. It is + // safe to cast a and b to double. + return fabs(static_cast(a - b)) < 1e-8; + } else { + return (a == b); + } + } +}; + +template +struct NotEqualFunctor { + using ELEM_TYPE = T; + HOSTDEVICE bool operator()(const T& a, const T& b) const { + return !EqualFunctor()(a, b); + } +}; + +template +class CompareOpKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + using T = typename Functor::ELEM_TYPE; + using Tensor = framework::Tensor; + + auto* x = context.Input("X"); + auto* y = context.Input("Y"); + auto* z = context.Output("Out"); + int axis = context.Attr("axis"); + ElementwiseComputeEx(context, x, y, axis, + Functor(), z); + } +}; + +} // namespace operators +} // namespace paddle + +#define REGISTER_COMPARE_KERNEL(op_type, dev, functor) \ + REGISTER_OP_##dev##_KERNEL( \ + op_type, ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>); diff --git a/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc b/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc new file mode 100644 index 00000000..8ad2f793 --- /dev/null +++ b/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/controlflow/conditional_block_op.h" + +namespace paddle { +namespace operators { + +/* We will implement the op with block separately in the future. + * The main reason is that some of the training requirements + * in these OPS can lead to problems(such as memory leaks) during inference. + */ +class ConditionalBlockInferOp : public ConditionalOp { + public: + ConditionalBlockInferOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ConditionalOp(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + bool need_run; + if (Attr("is_scalar_condition")) { + // When is_scalar_condition is True, the conditional variable is a scalar, + // whether need to execute the operators in sub-block depends on the + // conditional variable (Cond). + auto xs = InputTensors(scope, "Cond"); + need_run = ScalarCondition(xs); + } else { + // When is_scalar_condition is False, the conditional variable maybe a + // vector or tensor, whether need to execute the operators in sub-block + // depends on the input variables (Input). + auto xs = InputTensors(scope, "Input"); + need_run = std::all_of( + xs.begin(), xs.end(), + [](const framework::LoDTensor *t) { return t->numel() != 0; }); + } + + if (need_run) { + auto *scope_var = scope.FindVar(Output("Scope")); + PADDLE_ENFORCE(scope_var != nullptr, "Must set scope"); + auto *scopes = scope_var->GetMutable>(); + scopes->resize(1); + scopes->front() = &scope.NewScope(); + auto &cur_scope = *scopes->front(); + + framework::Executor exec(dev_place); + auto *block = Attr("sub_block"); + exec.Run(*block->Program(), &cur_scope, block->ID(), false); + scope.DeleteScope(scopes->front()); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(conditional_block_infer, ops::ConditionalBlockInferOp, + ops::ConditionalBlockOpProtoMaker, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.cc b/paddle/fluid/operators/controlflow/conditional_block_op.cc new file mode 100644 index 00000000..8358ef75 --- /dev/null +++ b/paddle/fluid/operators/controlflow/conditional_block_op.cc @@ -0,0 +1,189 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/controlflow/conditional_block_op.h" + +namespace paddle { +namespace operators { + +class ConditionalBlockOp : public ConditionalOp { + public: + ConditionalBlockOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ConditionalOp(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + bool need_run; + if (Attr("is_scalar_condition")) { + // When is_scalar_condition is True, the conditional variable is a scalar, + // whether need to execute the operators in sub-block depends on the + // conditional variable (Cond). + auto xs = InputTensors(scope, "Cond"); + need_run = ScalarCondition(xs); + } else { + // When is_scalar_condition is False, the conditional variable maybe a + // vector or tensor, whether need to execute the operators in sub-block + // depends on the input variables (Input). + auto xs = InputTensors(scope, "Input"); + need_run = std::all_of( + xs.begin(), xs.end(), + [](const framework::LoDTensor *t) { return t->numel() != 0; }); + } + + if (need_run) { + auto *scope_var = scope.FindVar(Output("Scope")); + PADDLE_ENFORCE(scope_var != nullptr, "Must set scope"); + auto *scopes = scope_var->GetMutable>(); + scopes->resize(1); + scopes->front() = &scope.NewScope(); + auto &cur_scope = *scopes->front(); + + framework::Executor exec(dev_place); + auto *block = Attr("sub_block"); + exec.Run(*block->Program(), &cur_scope, block->ID(), false); + } + } +}; + +class ConditionalBlockGradOp : public ConditionalOp { + public: + ConditionalBlockGradOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ConditionalOp(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + bool need_run; + if (Attr("is_scalar_condition")) { + auto xs = this->InputTensors(scope, "Cond"); + need_run = ScalarCondition(xs); + } else { + auto xs = this->InputTensors(scope, "Input"); + need_run = std::all_of( + xs.begin(), xs.end(), + [](const framework::LoDTensor *t) { return t->numel() != 0; }); + } + + if (need_run) { + auto *scope_var = scope.FindVar(Input("Scope")); + PADDLE_ENFORCE(scope_var != nullptr, "Must set scope"); + auto &scopes = scope_var->Get>(); + framework::Scope &cur_scope = *scopes[0]; + + framework::Executor exec(dev_place); + auto *block = Attr("sub_block"); + + const auto &ins = Inputs("Input"); + const auto &d_ins = Outputs(framework::GradVarName("Input")); + const auto &conds = Inputs("Cond"); + const auto &d_conds = Outputs(framework::GradVarName("Cond")); + + std::vector ins_conds_grads; + ins_conds_grads.reserve(ins.size() + conds.size()); + for (auto &in : ins) { + ins_conds_grads.emplace_back(framework::GradVarName(in)); + } + for (auto &cond : conds) { + ins_conds_grads.emplace_back(framework::GradVarName(cond)); + } + + exec.Run(*block->Program(), &cur_scope, block->ID(), false, true, + ins_conds_grads); + + AssignLocalGradientToGlobal(dev_place, cur_scope, ins_conds_grads.data(), + ins.size(), d_ins); + + AssignLocalGradientToGlobal(dev_place, cur_scope, + ins_conds_grads.data() + ins.size(), + conds.size(), d_conds); + } + } + + private: + void AssignLocalGradientToGlobal( + const platform::Place &place, const framework::Scope &cur_scope, + const std::string *p_grad_names, size_t p_grad_names_num, + const std::vector &pg_names) const { + for (size_t i = 0; i < p_grad_names_num; ++i) { + auto out_grad_name = pg_names[i]; + const auto &in_grad_name = p_grad_names[i]; + auto *in_var = cur_scope.FindVar(in_grad_name); + if (in_var == nullptr) { + continue; + } + auto new_in_grad_name = cur_scope.Rename(in_grad_name); + auto assign = framework::OpRegistry::CreateOp( + "assign", {{"X", {new_in_grad_name}}}, {{"Out", {out_grad_name}}}, + framework::AttributeMap{}); + assign->Run(cur_scope, place); + cur_scope.Rename(new_in_grad_name, in_grad_name); + } + } +}; + +class ConditionalBlockGradInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInputs("Cond")); + if (context->HasInputs("Input")) { + PADDLE_ENFORCE(context->HasOutputs(framework::GradVarName("Input"))); + context->SetOutputsDim(framework::GradVarName("Input"), + context->GetInputsDim("Input")); + } + if (context->HasOutputs(framework::GradVarName("Cond"))) { + context->SetOutputsDim(framework::GradVarName("Cond"), + context->GetInputsDim("Cond")); + } + } +}; + +class ConditionalBlockGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto grad_op = new framework::OpDesc(); + grad_op->SetType("conditional_block_grad"); + grad_op->SetInput("Cond", Input("Cond")); + grad_op->SetInput("Input", Input("Input")); + grad_op->SetInput("Out", Output("Out")); + grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + grad_op->SetInput("Scope", Output("Scope")); + grad_op->SetOutput(framework::GradVarName("Cond"), + InputGrad("Cond", false)); + grad_op->SetOutput(framework::GradVarName("Input"), + InputGrad("Input", false)); + grad_op->SetBlockAttr("sub_block", this->grad_block_[0]); + grad_op->SetAttr("is_scalar_condition", GetAttr("is_scalar_condition")); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(conditional_block, ops::ConditionalBlockOp, + ops::ConditionalBlockOpProtoMaker, + ops::ConditionalBlockGradMaker); +REGISTER_OPERATOR(conditional_block_grad, ops::ConditionalBlockGradOp, + ops::ConditionalBlockGradInferShape); diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.h b/paddle/fluid/operators/controlflow/conditional_block_op.h new file mode 100644 index 00000000..9a079c84 --- /dev/null +++ b/paddle/fluid/operators/controlflow/conditional_block_op.h @@ -0,0 +1,111 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/var_type.h" + +namespace paddle { +namespace operators { + +class ConditionalOp : public framework::OperatorBase { + public: + ConditionalOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + protected: + std::vector InputTensors( + const framework::Scope &scope, const std::string &in_name) const { + std::vector retv; + auto xs = Inputs(in_name); + retv.resize(xs.size(), nullptr); + std::transform( + xs.begin(), xs.end(), retv.begin(), + [&scope](const std::string &var_name) -> const framework::LoDTensor * { + auto *var = scope.FindVar(var_name); + PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", var_name); + return &var->Get(); + }); + return retv; + } + + bool ScalarCondition( + const std::vector &ips) const { + if (!(ips.size() == 1UL && ips[0]->IsInitialized())) { + PADDLE_THROW("should have one initialized input as condition"); + } + + PADDLE_ENFORCE(ips[0]->type() == framework::proto::VarType::BOOL && + ips[0]->numel() == 1, + "condition input's data type should be bool, " + "numel should be 1, actual numel is %d", + ips[0]->numel()); + bool res = false; + if (platform::is_gpu_place(ips[0]->place())) { +#ifdef PADDLE_WITH_CUDA + framework::LoDTensor cpu_tensor; + framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor); + platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait(); + res = cpu_tensor.data()[0]; +#endif + } else { + res = ips[0]->data()[0]; + } + return res; + } +}; + +class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Cond", + "The conditional variable of this operator. If Cond is empty, the " + "whole sub-block will not be executed.") + .AsDuplicable(); + AddInput("Input", "The input variables of the sub-block.").AsDuplicable(); + AddOutput("Out", "The output variables of the sub-block.").AsDuplicable(); + AddOutput("Scope", + "(std::vector) The step scope of conditional block. To " + "unify the conditional block, rnn and while op, the type of " + "scope is std::vector"); + AddAttr( + "sub_block", "The step block of conditional block operator"); + AddAttr("is_scalar_condition", + "The conditional variable (Cond) is used as scalar " + "condition.") + .SetDefault(false); + AddComment(R"DOC(Conditional block operator + +If `is_scalar_condition` is True, the conditional variable (Cond) is a scalar, +run the operators in sub-block if Cond is True. + +If `is_scalar_condition` is False, the conditional variable (Cond) is a vector or +tensor, run the operators in sub-block if all of input variables are not empty. + + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc new file mode 100644 index 00000000..0dfed7f5 --- /dev/null +++ b/paddle/fluid/operators/controlflow/feed_op.cc @@ -0,0 +1,86 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { +class FeedOp : public framework::OperatorBase { + public: + FeedOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + // get device context from pool + auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place); + + auto feed_var_name = Input("X"); + auto *feed_var = scope.FindVar(feed_var_name); + + PADDLE_ENFORCE(feed_var != nullptr, + "Cannot find feed_var in scope, feed_var_name is %s", + feed_var_name); + + auto out_name = this->Output("Out"); + auto *out_var = scope.FindVar(out_name); + PADDLE_ENFORCE(out_var != nullptr, + "Cannot find out_var in scope, out_var_name is %s", + out_name); + + auto col = Attr("col"); + + VLOG(3) << "Feed Var " << feed_var_name << "'s " << col << " column to var " + << out_name; + + auto &feed_list = feed_var->Get(); + PADDLE_ENFORCE_LT(static_cast(col), feed_list.size()); + auto &feed_item = feed_list.at(static_cast(col)); + auto *out_item = out_var->GetMutable(); + + if (platform::is_same_place(feed_item.place(), place)) { + out_item->ShareDataWith(feed_item); + } else { + framework::TensorCopy(feed_item, place, *dev_ctx, out_item); + } + out_item->set_lod(feed_item.lod()); + } +}; + +class FeedOpInfoMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input of feed op"); + AddOutput("Out", "The output of feed op"); + AddAttr("col", "(int) The column of feed"); + AddComment(R"DOC( +Feed Operator. + +It should not be configured by users directly. + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(feed, paddle::operators::FeedOp, + paddle::framework::EmptyGradOpMaker, + paddle::operators::FeedOpInfoMaker); diff --git a/paddle/fluid/operators/controlflow/fetch_op.cc b/paddle/fluid/operators/controlflow/fetch_op.cc new file mode 100644 index 00000000..85d36c5c --- /dev/null +++ b/paddle/fluid/operators/controlflow/fetch_op.cc @@ -0,0 +1,89 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace operators { + +class FetchOp : public framework::OperatorBase { + public: + FetchOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto fetch_var_name = Input("X"); + auto *fetch_var = scope.FindVar(fetch_var_name); + PADDLE_ENFORCE(fetch_var != nullptr, + "Cannot find fetch variable in scope, fetch_var_name is %s", + fetch_var_name); + + auto out_name = this->Output("Out"); + auto *out_var = scope.FindVar(out_name); + PADDLE_ENFORCE(out_var != nullptr, + "Cannot find out_var in scope, out_var_name is %s", + out_name); + + auto col = static_cast(Attr("col")); + + auto *fetch_list = out_var->GetMutable(); + auto &src_item = fetch_var->Get(); + + if (col >= fetch_list->size()) { + fetch_list->resize(col + 1); + } + auto &dst_item = fetch_list->at(col); + + // FIXME(yuyang18): Should we assume the fetch operator always generate + // CPU outputs? + if (src_item.IsInitialized() && src_item.numel() > 0) { + TensorCopySync(src_item, platform::CPUPlace(), &dst_item); + } else { + // Not copy, if the src tensor is empty. + dst_item.clear(); + dst_item.Resize({0}); + } + dst_item.set_lod(src_item.lod()); + + VLOG(3) << "Fetch variable " << fetch_var_name << " to " << out_name; + } +}; + +class FetchOpInfoMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input of fetch op"); + AddOutput("Out", "The output of fetch op"); + AddAttr("col", "(int) The column of fetch"); + AddComment(R"DOC( +Fetch Operator. + +It should not be configured by users directly. + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(fetch, paddle::operators::FetchOp, + paddle::framework::EmptyGradOpMaker, + paddle::operators::FetchOpInfoMaker); diff --git a/paddle/fluid/operators/controlflow/get_places_op.cc b/paddle/fluid/operators/controlflow/get_places_op.cc new file mode 100644 index 00000000..fa77f974 --- /dev/null +++ b/paddle/fluid/operators/controlflow/get_places_op.cc @@ -0,0 +1,116 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include // NOLINT +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/platform/place.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/gpu_info.h" +#endif + +namespace paddle { +namespace operators { + +static size_t CUDADevCount() { +#ifdef PADDLE_WITH_CUDA + return platform::GetCUDADeviceCount(); +#else + return 0UL; +#endif +} + +class GetPlacesOp : public framework::OperatorBase { + public: + GetPlacesOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + bool is_gpu; + if (Attr("device_type") == "AUTO") { + is_gpu = platform::is_gpu_place(place); + } else { + is_gpu = Attr("device_type") == "CUDA"; + } + auto device_count = static_cast(Attr("device_count")); + if (device_count == 0) { + device_count = + is_gpu ? CUDADevCount() : std::thread::hardware_concurrency(); + } + PADDLE_ENFORCE_NE(device_count, 0UL, "Cannot indicate %s device count", + is_gpu ? "GPU" : "CPU"); + + auto out_var_name = Output("Out"); + auto &places = + *(detail::Ref(scope.FindVar(out_var_name), + "Output variable %s cannot be found", out_var_name) + .GetMutable()); + places.reserve(device_count); + if (is_gpu) { + PADDLE_ENFORCE_LE(device_count, CUDADevCount(), + "Only %d CUDA devices found, cannot set to %d", + CUDADevCount(), device_count); + for (size_t i = 0; i < device_count; ++i) { + places.emplace_back(platform::CUDAPlace(static_cast(i))); + } + } else { + for (size_t i = 0; i < device_count; ++i) { + places.emplace_back(platform::CPUPlace()); + } + } + } +}; + +class GetPlacesOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddOutput("Out", "vector of Place"); + AddAttr("device_count", "device count").SetDefault(0); + AddAttr("device_type", "device type") + .InEnum({"CUDA", "CPU", "AUTO"}) + .SetDefault("AUTO"); + AddComment(R"DOC( +Returns a list of places based on arguments. The list will be used for parallel +execution. +)DOC"); + } +}; + +class GetPlacesInferVarType : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + for (auto &o_name : ctx->Output("Out")) { + ctx->SetType(o_name, framework::proto::VarType::PLACE_LIST); + } + } +}; + +class GetPlacesInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + // Do nothing + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OPERATOR(get_places, ops::GetPlacesOp, ops::GetPlacesOpProtoMaker, + ops::GetPlacesInferVarType, ops::GetPlacesInferShape, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/controlflow/logical_op.cc b/paddle/fluid/operators/controlflow/logical_op.cc new file mode 100644 index 00000000..37a82a80 --- /dev/null +++ b/paddle/fluid/operators/controlflow/logical_op.cc @@ -0,0 +1,157 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/controlflow/logical_op.h" +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { +template +class BinaryLogicalOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + OpComment comment; + AddInput("X", + string::Sprintf("(LoDTensor) Left hand operand of %s operator", + comment.type)); + AddInput("Y", + string::Sprintf("(LoDTensor) Right hand operand of %s operator", + comment.type)); + AddOutput("Out", string::Sprintf( + "(LoDTensor) n-dim bool tensor. Each element is %s", + comment.equation)); + AddComment(string::Sprintf(R"DOC(%s Operator + +It operates element-wise on X and Y, and returns the Out. X, Y and Out are N-dim boolean tensors. +Each element of Out is calculated by %s +)DOC", + comment.type, comment.equation)); + } +}; + +template +class UnaryLogicalOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + OpComment comment; + AddInput("X", string::Sprintf("(LoDTensor) Operand of %s operator", + comment.type)); + AddOutput("Out", string::Sprintf( + "(LoDTensor) n-dim bool tensor. Each element is %s", + comment.equation)); + AddComment(string::Sprintf(R"DOC(%s Operator + +It operates element-wise on X, and returns the Out. X and Out are N-dim boolean tensors. +Each element of Out is calculated by %s +)DOC", + comment.type, comment.equation)); + } +}; + +template +class BinaryLogicalOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + OpComment comment; + PADDLE_ENFORCE(context->HasInput("X"), + "Input(X) of %s operator must not be null", comment.type); + PADDLE_ENFORCE(context->HasInput("Y"), + "Input(Y) of %s operator must not be null", comment.type); + auto dim_x = context->GetInputDim("X"); + auto dim_y = context->GetInputDim("Y"); + + int product_x = framework::product(dim_x); + int product_y = framework::product(dim_y); + bool check = context->IsRuntime() || (product_x >= 0 && product_y >= 0); + if (check) { + PADDLE_ENFORCE_EQ( + product_x, product_y, + "The number of elements in X and Y should be same, %d != %d", + product_x, product_y); + } + + context->SetOutputDim("Out", context->GetInputDim("X")); + context->ShareLoD("X", "Out"); + } +}; + +template +class UnaryLogicalOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + OpComment comment; + PADDLE_ENFORCE(context->HasInput("X"), + "Input(X) of %s operator must not be null", comment.type); + context->SetOutputDim("Out", context->GetInputDim("X")); + context->ShareLoD("X", "Out"); + } +}; + +class LogicalOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); + // LogicalOp kernel's device type is decided by input tensor place + kt.place_ = ctx.Input("X")->place(); + return kt; + } +}; + +} // namespace operators +} // namespace paddle + +#define REGISTER_BINARY_LOGICAL_OP(op_type, _equation) \ + struct _##op_type##Comment { \ + static char type[]; \ + static char equation[]; \ + }; \ + char _##op_type##Comment::type[]{#op_type}; \ + char _##op_type##Comment::equation[]{_equation}; \ + REGISTER_OPERATOR( \ + op_type, ::paddle::operators::LogicalOp, \ + ::paddle::operators::BinaryLogicalOpProtoMaker<_##op_type##Comment>, \ + ::paddle::operators::BinaryLogicalOpInferShape<_##op_type##Comment>, \ + ::paddle::framework::EmptyGradOpMaker); + +#define REGISTER_UNARY_LOGICAL_OP(op_type, _equation) \ + struct _##op_type##Comment { \ + static char type[]; \ + static char equation[]; \ + }; \ + char _##op_type##Comment::type[]{#op_type}; \ + char _##op_type##Comment::equation[]{_equation}; \ + REGISTER_OPERATOR( \ + op_type, ::paddle::operators::LogicalOp, \ + ::paddle::operators::UnaryLogicalOpProtoMaker<_##op_type##Comment>, \ + ::paddle::operators::UnaryLogicalOpInferShape<_##op_type##Comment>, \ + ::paddle::framework::EmptyGradOpMaker); + +REGISTER_BINARY_LOGICAL_OP(logical_and, "$$Out = X \\&\\& Y$$"); +REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CPU, + paddle::operators::LogicalAndFunctor); +REGISTER_BINARY_LOGICAL_OP(logical_or, "$$Out = X || Y$$"); +REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CPU, + paddle::operators::LogicalOrFunctor); +REGISTER_UNARY_LOGICAL_OP(logical_not, "$$Out = !X$$"); +REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CPU, + paddle::operators::LogicalNotFunctor); +REGISTER_BINARY_LOGICAL_OP(logical_xor, + "$$Out = (X || Y) \\&\\& !(X \\&\\& Y)$$"); +REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CPU, + paddle::operators::LogicalXorFunctor); diff --git a/paddle/fluid/operators/controlflow/logical_op.cu b/paddle/fluid/operators/controlflow/logical_op.cu new file mode 100644 index 00000000..7ca54b48 --- /dev/null +++ b/paddle/fluid/operators/controlflow/logical_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/controlflow/logical_op.h" + +REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CUDA, + paddle::operators::LogicalAndFunctor); +REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CUDA, + paddle::operators::LogicalOrFunctor); +REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CUDA, + paddle::operators::LogicalNotFunctor); +REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CUDA, + paddle::operators::LogicalXorFunctor); diff --git a/paddle/fluid/operators/controlflow/logical_op.h b/paddle/fluid/operators/controlflow/logical_op.h new file mode 100644 index 00000000..4a83e0fd --- /dev/null +++ b/paddle/fluid/operators/controlflow/logical_op.h @@ -0,0 +1,94 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace operators { + +template +struct LogicalAndFunctor { + using ELEM_TYPE = T; + HOSTDEVICE bool operator()(const T& a, const T& b) const { return a && b; } +}; + +template +struct LogicalOrFunctor { + using ELEM_TYPE = T; + HOSTDEVICE bool operator()(const T& a, const T& b) const { return a || b; } +}; + +template +struct LogicalNotFunctor { + using ELEM_TYPE = T; + HOSTDEVICE bool operator()(const T& a) const { return !a; } +}; + +template +struct LogicalXorFunctor { + using ELEM_TYPE = T; + HOSTDEVICE bool operator()(const T& a, const T& b) const { + return (a || b) && !(a && b); + } +}; + +template +class BinaryLogicalOpKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + using T = typename Functor::ELEM_TYPE; + auto* x = context.Input("X"); + auto* y = context.Input("Y"); + auto* out = context.Output("Out"); + Functor binary_func; + platform::Transform trans; + trans(context.template device_context(), x->data(), + x->data() + x->numel(), y->data(), + out->mutable_data(context.GetPlace()), binary_func); + } +}; + +template +class UnaryLogicalOpKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + using T = typename Functor::ELEM_TYPE; + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + Functor unary_func; + platform::Transform trans; + trans(context.template device_context(), x->data(), + x->data() + x->numel(), + out->mutable_data(context.GetPlace()), unary_func); + } +}; + +} // namespace operators +} // namespace paddle + +#define REGISTER_BINARY_LOGICAL_KERNEL(op_type, dev, functor) \ + REGISTER_OP_##dev##_KERNEL( \ + op_type, ::paddle::operators::BinaryLogicalOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>); + +#define REGISTER_UNARY_LOGICAL_KERNEL(op_type, dev, functor) \ + REGISTER_OP_##dev##_KERNEL( \ + op_type, ::paddle::operators::UnaryLogicalOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>); diff --git a/paddle/fluid/operators/controlflow/op_variant.cc b/paddle/fluid/operators/controlflow/op_variant.cc new file mode 100644 index 00000000..d6eea8c4 --- /dev/null +++ b/paddle/fluid/operators/controlflow/op_variant.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/controlflow/op_variant.h" + +namespace paddle { +namespace operators { + +struct InputsVisitor + : public boost::static_visitor { + template + const framework::VariableNameMap *operator()(const OpType *op) const { + return &(op->Inputs()); + } +}; + +struct OutputsVisitor + : public boost::static_visitor { + template + const framework::VariableNameMap *operator()(const OpType *op) const { + return &(op->Outputs()); + } +}; + +struct AttributeMapVisitor + : public boost::static_visitor { + const framework::AttributeMap *operator()(const framework::OpDesc *op) const { + return &(op->GetAttrMap()); + } + + const framework::AttributeMap *operator()( + const framework::OperatorBase *op) const { + return &(op->Attrs()); + } +}; + +struct RawPointerVisitor : public boost::static_visitor { + template + const void *operator()(const OpType *op) const { + return op; + } +}; + +const framework::VariableNameMap &OpVariant::Inputs() const { + return *boost::apply_visitor(InputsVisitor(), op_); +} + +const framework::VariableNameMap &OpVariant::Outputs() const { + return *boost::apply_visitor(OutputsVisitor(), op_); +} + +const framework::AttributeMap &OpVariant::Attrs() const { + return *boost::apply_visitor(AttributeMapVisitor(), op_); +} + +const void *OpVariant::RawPointer() const { + return boost::apply_visitor(RawPointerVisitor(), op_); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/controlflow/op_variant.h b/paddle/fluid/operators/controlflow/op_variant.h new file mode 100644 index 00000000..26c70589 --- /dev/null +++ b/paddle/fluid/operators/controlflow/op_variant.h @@ -0,0 +1,69 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/platform/variant.h" + +namespace paddle { +namespace operators { + +// OpVariant is a wrapper class of OpDesc and OperatorBase pointer +// So that API would be the same. +class OpVariant { + public: + OpVariant(const framework::OperatorBase *op) : op_(op) {} // NOLINT + + OpVariant(const framework::OpDesc *op) : op_(op) {} // NOLINT + + const framework::VariableNameMap &Inputs() const; + + const framework::VariableNameMap &Outputs() const; + + const framework::AttributeMap &Attrs() const; + + const void *RawPointer() const; + + template + const AttrType &Attr(const std::string &name) const { + auto &attrs = Attrs(); + auto it = attrs.find(name); + PADDLE_ENFORCE(it != attrs.end(), "Cannot find attribute %s", name); + return boost::get(it->second); + } + + bool operator==(const OpVariant &other) const { + return RawPointer() == other.RawPointer(); + } + + int which() const { return static_cast(op_.which()); } + + struct Hasher { + size_t operator()(const OpVariant &op) const { + return reinterpret_cast(op.RawPointer()); + } + }; + + private: + const boost::variant + op_; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/controlflow/recurrent_op_helper.cc b/paddle/fluid/operators/controlflow/recurrent_op_helper.cc new file mode 100644 index 00000000..69250866 --- /dev/null +++ b/paddle/fluid/operators/controlflow/recurrent_op_helper.cc @@ -0,0 +1,265 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/controlflow/recurrent_op_helper.h" + +#include +#include +#include +#include + +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/recurrent_op.h" + +namespace paddle { +namespace operators { + +static bool IsMatchedRecurrentOpAndRecurrentGradOp(const OpVariant &fwd_op, + const OpVariant &grad_op) { + return fwd_op.Inputs().at(RecurrentBase::kInputs) == + grad_op.Inputs().at(RecurrentBase::kInputs) && + fwd_op.Outputs().at(RecurrentBase::kOutputs) == + grad_op.Inputs().at(RecurrentBase::kOutputs); +} + +// Returns whether the variable is skippable in forward recurrent op +// The variable is skippable in recurrent_op when the variable used in +// recurrent_grad is not from grad_block. +static bool IsSkippableVar(const std::string &name, + framework::BlockDesc *grad_block) { + return name != framework::kEmptyVarName && !grad_block->HasVar(name); +} + +static void ClearSkipVars(const OpVariant &op) { + auto &attrs = const_cast(op.Attrs()); + std::vector &attr_skip_vars = + boost::get>( + attrs[RecurrentBase::kSkipEagerDeletionVars]); + attr_skip_vars.clear(); +} + +// Add skip vars into op's attribute +template +static void AddSkipVars(const OpVariant &op, const Container &skip_vars) { + auto &attrs = const_cast(op.Attrs()); + VLOG(2) << "Prepare to add " << skip_vars.size() + << " skip var(s): " << paddle::string::join_strings(skip_vars, ' '); + std::vector &attr_skip_vars = + boost::get>( + attrs[RecurrentBase::kSkipEagerDeletionVars]); + attr_skip_vars.insert(attr_skip_vars.end(), skip_vars.cbegin(), + skip_vars.cend()); +} + +// Find all ops and grad ops with given type name. The ops and grad ops +// may locate in different blocks so we should traverse all blocks in the +// program and find them out +static void FindAllOpAndGradOp(OpAndGradOpPair *op_and_grad_op, + const std::string &type_name, + const std::string &backward_type_name) { + OpVariantSet &ops = op_and_grad_op->first; + OpVariantSet &grad_ops = op_and_grad_op->second; + + PADDLE_ENFORCE_GE(ops.size(), grad_ops.size(), + "There are extra grad ops in the graph or program"); + + if (ops.empty()) return; + + const auto *program = + ops.begin() + ->Attr(RecurrentBase::kStepBlock) + ->Program(); + for (size_t i = 1; i < program->Size(); ++i) { + auto &block = program->Block(i); + for (size_t j = 0; j < block.OpSize(); ++j) { + auto *op = block.Op(j); + if (op->Type() == type_name) { + ops.emplace(op); + } else if (op->Type() == backward_type_name) { + grad_ops.emplace(op); + } + } + } + + PADDLE_ENFORCE_GE(ops.size(), grad_ops.size(), + "There are extra grad ops in the graph or program"); +} + +// Returns GradVarName of input var names +static std::vector GradVarLists( + const std::vector &var_names) { + std::vector retv; + retv.reserve(var_names.size()); + std::transform(var_names.begin(), var_names.end(), std::back_inserter(retv), + framework::GradVarName); + return retv; +} + +// Add memory vars in recurrent op as skip vars. +static void AddOpMemVarsAsSkip(const OpVariant &op, bool set_grad_mem_vars) { + bool has_state = op.Attr(RecurrentBase::kHasStates); + if (has_state) { + std::unordered_set skip_vars; + + auto &mem_vars = op.Attr>(RecurrentBase::kStates); + skip_vars.insert(mem_vars.begin(), mem_vars.end()); + + auto &pre_mem_vars = + op.Attr>(RecurrentBase::kExStates); + skip_vars.insert(pre_mem_vars.begin(), pre_mem_vars.end()); + + if (set_grad_mem_vars) { + auto mem_grad_vars = GradVarLists(mem_vars); + skip_vars.insert(mem_grad_vars.begin(), mem_grad_vars.end()); + auto pre_mem_grad_vars = GradVarLists(pre_mem_vars); + skip_vars.insert(pre_mem_grad_vars.begin(), pre_mem_grad_vars.end()); + } + AddSkipVars(op, skip_vars); + } +} + +// Set outputs and memory vars of the input forward op as skip vars +static void SetRecurrentForwardOpOnlySkipVarAttr(const OpVariant &fwd_op) { + ClearSkipVars(fwd_op); + + AddOpMemVarsAsSkip(fwd_op, /* set_grad_mem_vars = */ false); + auto &output_vars = fwd_op.Outputs().at(RecurrentBase::kOutputs); + AddSkipVars(fwd_op, output_vars); +} + +// Set skip vars of matched recurrent op and recurrent_grad op +static void SetRecurrentOpAndRecurrentGradOpSkipVarAttr( + const OpVariant &fwd_op, const OpVariant &bwd_op) { + // Find all skippable variables in forward recurrent_op + ClearSkipVars(fwd_op); + AddOpMemVarsAsSkip(fwd_op, /* set_grad_mem_vars = */ false); + + auto *grad_block = + bwd_op.Attr(RecurrentBase::kStepBlock); + std::unordered_set fwd_skip_vars; + for (auto *op_desc : grad_block->AllOps()) { + for (auto &in_arg_name : op_desc->InputArgumentNames()) { + if (IsSkippableVar(in_arg_name, grad_block)) { + fwd_skip_vars.insert(in_arg_name); + } + } + for (auto &out_arg_name : op_desc->OutputArgumentNames()) { + if (IsSkippableVar(out_arg_name, grad_block)) { + fwd_skip_vars.insert(out_arg_name); + } + } + } + AddSkipVars(fwd_op, fwd_skip_vars); + + // Find all skippable variables in recurrent_grad_op + // The skippable variables are those which would be used across time steps + ClearSkipVars(bwd_op); + AddOpMemVarsAsSkip(bwd_op, /* set_grad_mem_vars = */ true); + std::unordered_set bwd_skip_vars; + + auto &fwd_input = fwd_op.Inputs().at(RecurrentBase::kInputs); + auto &in_grads = + bwd_op.Outputs().at(framework::GradVarName(RecurrentBase::kInputs)); + + PADDLE_ENFORCE_EQ( + fwd_input.size(), in_grads.size(), + "Backward input gradient number does not match forward input number."); + for (size_t i = 0; i < in_grads.size(); ++i) { + if (in_grads[i] == framework::kEmptyVarName) { + continue; + } + bwd_skip_vars.insert(in_grads[i]); + bwd_skip_vars.insert(framework::GradVarName(fwd_input[i])); + } + + auto &fwd_param = fwd_op.Inputs().at(RecurrentBase::kParameters); + auto ¶m_grads = + bwd_op.Outputs().at(framework::GradVarName(RecurrentBase::kParameters)); + PADDLE_ENFORCE_EQ(fwd_param.size(), param_grads.size(), + "Backward parameter gradient number does not match forward " + "parameter number."); + for (size_t i = 0; i < fwd_param.size(); ++i) { + if (param_grads[i] == framework::kEmptyVarName) { + continue; + } + bwd_skip_vars.insert(param_grads[i]); + bwd_skip_vars.insert(framework::GradVarName(fwd_param[i])); + } + + AddSkipVars(bwd_op, bwd_skip_vars); +} + +void PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp( + int block_id, + const std::vector> + &all_ops) { + // If block_id is not 0, returns + // This is because all recurrent_ops and recurrent_grad_ops in the whole + // program would be processed when block_id is 0 (i.e. when Executor::Run() + // or ParallelExecutor constructs). + + // What's more, all recurrent_ops and recurrent_grad_ops must be processed + // when block_id is zero. If not, recurrent_op may run first and erase + // variables + // used in recurrent_grad_op, and in this moment, recurrent_grad_ops may be + // not constructed yet. + if (block_id != 0) return; + + OpAndGradOpPair op_pair; + for (auto &op : all_ops) { + if (op->Type() == "recurrent") { + op_pair.first.emplace(op.get()); + } else if (op->Type() == "recurrent_grad") { + op_pair.second.emplace(op.get()); + } + } + PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(&op_pair); +} + +void PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp( + OpAndGradOpPair *op_pair) { + // Find all ops and grad ops at all blocks + FindAllOpAndGradOp(op_pair, "recurrent", "recurrent_grad"); + + OpVariantSet &recurrent_ops = op_pair->first; + OpVariantSet &recurrent_grad_ops = op_pair->second; + + VLOG(2) << "Found recurrent op num: " << recurrent_ops.size() + << ", recurrent grad op num: " << recurrent_grad_ops.size(); + + if (recurrent_ops.empty()) { + return; + } + + for (auto &bwd_op : recurrent_grad_ops) { + const OpVariant *matched_fwd_op = nullptr; + for (auto &fwd_op : recurrent_ops) { + if (IsMatchedRecurrentOpAndRecurrentGradOp(fwd_op, bwd_op)) { + PADDLE_ENFORCE(matched_fwd_op == nullptr, + "Found multiple matched recurrent op"); + matched_fwd_op = &fwd_op; + } + } + PADDLE_ENFORCE_NOT_NULL(matched_fwd_op, "Cannot find matched forward op"); + SetRecurrentOpAndRecurrentGradOpSkipVarAttr(*matched_fwd_op, bwd_op); + recurrent_ops.erase(*matched_fwd_op); + } + + for (auto &fwd_op : recurrent_ops) { + SetRecurrentForwardOpOnlySkipVarAttr(fwd_op); + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/controlflow/recurrent_op_helper.h b/paddle/fluid/operators/controlflow/recurrent_op_helper.h new file mode 100644 index 00000000..b1e6e662 --- /dev/null +++ b/paddle/fluid/operators/controlflow/recurrent_op_helper.h @@ -0,0 +1,52 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/controlflow/op_variant.h" +#include "paddle/fluid/operators/recurrent_op.h" +#include "paddle/fluid/platform/variant.h" +#include "paddle/fluid/string/string_helper.h" + +namespace paddle { +namespace operators { + +using OpVariantSet = std::unordered_set; +using OpAndGradOpPair = std::pair; + +// Set vars to skip eager deletion on input recurrent and recurrent_grad for +// preparing safe eager deletion. Input contains all recurrent and +// recurrent_grad ops at block 0 and the function will find all recurrent and +// recurrent_grad ops across blocks. +void PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp( + OpAndGradOpPair *op_pair); + +// Set vars to skip eager deletion on input recurrent and recurrent_grad for +// preparing safe eager deletion. The input block_id must be 0 and caller can +// input all ops in the block. The function will find all recurrent and +// recurrent_grad ops across blocks. +void PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp( + int block_id, + const std::vector> + &all_ops); + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc b/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc new file mode 100644 index 00000000..2ca5242c --- /dev/null +++ b/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc @@ -0,0 +1,232 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/array_operator.h" +#include "paddle/fluid/operators/detail/safe_ref.h" +namespace paddle { +namespace operators { + +class WriteToArrayOp : public ArrayOp { + public: + WriteToArrayOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ArrayOp(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto *x = scope.FindVar(Input("X")); + if (x == nullptr) return; + auto &x_tensor = x->Get(); + size_t offset = GetOffset(scope, place); + auto *out = + scope.FindVar(Output("Out"))->GetMutable(); + if (offset >= out->size()) { + VLOG(10) << "Resize " << Output("Out") << " from " << out->size() + << " to " << offset + 1; + out->resize(offset + 1); + } + auto *out_tensor = &out->at(offset); + out_tensor->set_lod(x_tensor.lod()); + if (x_tensor.memory_size() > 0) { + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + + TensorCopy(x_tensor, place, dev_ctx, out_tensor); + } else { + VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so " + "nothing has been written to output array[" + << offset << "]."; + } + } +}; + +class WriteToArrayOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(LoDTensor) the tensor will be written to tensor array"); + AddInput( + "I", + "(Tensor) the subscript index in tensor array. The number of element " + "should be 1"); + AddOutput("Out", "(TensorArray) the tensor array will be written"); + AddComment(R"DOC( +WriteToArray Operator. + +This operator writes a LoDTensor to a LoDTensor array. + +Assume $T$ is LoDTensor, $i$ is the subscript of the array, and $A$ is the array. The +equation is + +$$A[i] = T$$ + +)DOC"); + } +}; + +class WriteToArrayInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("I"), "Must set the subscript index"); + if (context->IsRuntime()) { + PADDLE_ENFORCE_EQ(framework::product(context->GetInputDim("I")), 1, + "The number of element of subscript index must be 1"); + } + if (!context->HasInput("X")) { + return; + } + PADDLE_ENFORCE(context->HasOutput("Out"), NotHasOutError()); + context->SetOutputDim("Out", context->GetInputDim("X")); + } + + protected: + virtual const char *NotHasXError() const { return "Must set the lod tensor"; } + + virtual const char *NotHasOutError() const { + return "Must set the lod tensor array"; + } +}; + +class WriteToArrayInferVarType : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = ctx->Input("X")[0]; + auto out_name = ctx->Output("Out")[0]; + VLOG(10) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY"; + ctx->SetType(out_name, framework::proto::VarType::LOD_TENSOR_ARRAY); + if (ctx->HasVar(x_name)) { + ctx->SetDataType(out_name, ctx->GetDataType(x_name)); + } + } +}; + +class ReadFromArrayOp : public ArrayOp { + public: + ReadFromArrayOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : ArrayOp(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto *x = scope.FindVar(Input("X")); + PADDLE_ENFORCE(x != nullptr, "X must be set"); + auto &x_array = x->Get(); + auto *out = scope.FindVar(Output("Out")); + PADDLE_ENFORCE(out != nullptr, "Out must be set"); + size_t offset = GetOffset(scope, place); + if (offset < x_array.size()) { + auto *out_tensor = out->GetMutable(); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + framework::TensorCopy(x_array[offset], place, dev_ctx, out_tensor); + out_tensor->set_lod(x_array[offset].lod()); + } else { + VLOG(10) << "offset " << offset << " >= " << x_array.size(); + } + } +}; + +class ReadFromArrayProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(TensorArray) the array will be read from."); + AddInput("I", + "(Tensor) the subscript index in tensor array. The number of " + "element should be 1"); + AddOutput("Out", "(LoDTensor) the tensor will be read from."); + AddComment(R"DOC( +ReadFromArray Operator. + +Read a LoDTensor from a LoDTensor Array. + +Assume $T$ is LoDTensor, $i$ is the subscript of the array, and $A$ is the array. The +equation is + +$$T = A[i]$$ + +)DOC"); + } +}; + +class ReadFromArrayInferShape : public WriteToArrayInferShape { + public: + void operator()(framework::InferShapeContext *context) const override { + WriteToArrayInferShape::operator()(context); + if (!context->HasInput("X")) { + return; + } + + // FIXME: just for compile time. + if (!context->IsRuntime()) { + context->ShareLoD("X", /*->*/ "Out"); + } + } + + protected: + const char *NotHasXError() const override { + return "The input array X must be set"; + } + const char *NotHasOutError() const override { + return "The output tensor out must be set"; + } +}; + +class WriteToArrayGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("read_from_array"); + grad_op->SetInput("I", Input("I")); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +class ReadFromArrayGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("write_to_array"); + grad_op->SetInput("I", Input("I")); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttrMap(Attrs()); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(write_to_array, ops::WriteToArrayOp, + ops::WriteToArrayInferShape, ops::WriteToArrayOpProtoMaker, + ops::WriteToArrayGradMaker, ops::WriteToArrayInferVarType); +REGISTER_OPERATOR(read_from_array, ops::ReadFromArrayOp, + ops::ReadFromArrayInferShape, ops::ReadFromArrayProtoMaker, + ops::ReadFromArrayGradMaker); diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc new file mode 100644 index 00000000..b3219208 --- /dev/null +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -0,0 +1,459 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/controlflow/while_op_helper.h" +#include "paddle/fluid/operators/detail/safe_ref.h" + +namespace paddle { +namespace operators { + +using StepScopeVar = std::vector; +using LoDTensor = framework::LoDTensor; + +namespace { // NOLINT +static std::string GetSkipEagerDeletionVarsDebugString( + const std::vector &vars) { + std::string str = "Skip " + std::to_string(vars.size()) + + " var(s) in eager deletion mode: "; + for (auto &var : vars) { + str.append(var); + str.push_back(' '); + } + return str; +} +} // NOLINT + +class WhileOp : public framework::OperatorBase { + public: + WhileOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition))); + + auto &cond = scope.FindVar(Input(kCondition))->Get(); + PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1})); + + framework::Executor executor(dev_place); + auto *block = Attr(kStepBlock); + + auto *program = block->Program(); + + auto step_scopes = + scope.FindVar(Output(kStepScopes))->GetMutable(); + + PADDLE_ENFORCE(platform::is_cpu_place(cond.place()), + "Condition of while op must in CPU memory."); + + bool is_test = Attr("is_test"); + auto &skip_vars = Attr>(kSkipEagerDeletionVars); + VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); + + auto ctx = executor.Prepare(*program, block->ID(), skip_vars); + if (!is_test) { + while (cond.data()[0]) { + auto ¤t_scope = scope.NewScope(); + step_scopes->push_back(¤t_scope); + executor.RunPreparedContext(ctx.get(), ¤t_scope, false, true, + true); + } + } else { + auto ¤t_scope = scope.NewScope(); + executor.CreateVariables(*program, ¤t_scope, block->ID()); + while (cond.data()[0]) { + for (auto &name : current_scope.LocalVarNames()) { + auto *var = current_scope.Var(name); + if (var->IsType()) { + // Clear all lod information for all lod_tensors. + auto *t = var->GetMutable(); + framework::LoD empty_lod; + t->set_lod(empty_lod); + } else if (var->IsType()) { + // Clear elements of all tensor arrays. + auto *t = var->GetMutable(); + t->clear(); + } + } + executor.RunPreparedContext(ctx.get(), ¤t_scope, false, false, + false); + } + scope.DeleteScope(¤t_scope); + } + } +}; + +class WhileOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput(kX, + "A set of variables, which are required by operators inside the " + "block of While Op.") + .AsDuplicable(); + AddInput( + kCondition, + "(Bool) An scalar. When it's False, the While Op will be terminated.") + .AsDuplicable(); + AddOutput(kOutputs, + "A set of variables, which will be assigned with values " + "generated by the operators inside the block of While Op.") + .AsDuplicable(); + AddOutput(kStepScopes, + "(StepScopeVar) A vector of local scope, which size equals the " + "step number of While Op. The i'th scope storages temporary " + "variables generated in the i'th step."); + AddAttr(kStepBlock, + "The step block inside WhileOp"); + AddAttr("is_test", + "(bool, default false) Set to true for inference only, false " + "for training. Some layers may run faster when this is true.") + .SetDefault(false); + AddAttr>(kSkipEagerDeletionVars, + "Vars that would skip eager deletion." + "Users should not set this manually.") + .SetDefault(std::vector()); + AddComment(R"DOC( +)DOC"); + } +}; + +class WhileGradOp : public framework::OperatorBase { + public: + WhileGradOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_ENFORCE(!Attr("is_test"), + "GradOp is only callable when is_test is false"); + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + framework::Executor executor(dev_place); + auto *block = Attr(kStepBlock); + auto *program = block->Program(); + + auto &skip_vars = Attr>(kSkipEagerDeletionVars); + VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); + auto ctx = executor.Prepare(*program, block->ID(), skip_vars); + + auto *step_scopes = + scope.FindVar(Input(kStepScopes))->GetMutable(); + + auto outside_og_names = Inputs(framework::GradVarName(kOutputs)); + auto inside_og_names = + Attr>("original_output_grad"); + + PADDLE_ENFORCE_EQ(outside_og_names.size(), inside_og_names.size()); + + for (auto cur_scope_iter = step_scopes->rbegin(); + cur_scope_iter != step_scopes->rend(); ++cur_scope_iter) { + VLOG(3) << "Start backward at time_step " + << cur_scope_iter - step_scopes->rbegin(); + framework::Scope &cur_scope = **cur_scope_iter; + // Link OG from outside to inside + for (size_t i = 0; i < outside_og_names.size(); ++i) { + auto outside_og_name = outside_og_names[i]; + auto inside_og_name = inside_og_names[i]; + VLOG(8) << "Linking outside " << outside_og_name << " --> inside " + << inside_og_name; + if (scope.FindVar(outside_og_name) == nullptr) { + continue; + } + + auto &og_outside = + detail::Ref(scope.FindVar(outside_og_name), + "Cannot find Outside Gradient %s", outside_og_name); + auto &og_inside = + detail::Ref(cur_scope.Var(inside_og_name), + "Cannot find inside gradient %s", inside_og_name); + if (og_outside.IsType()) { + auto &outside_tensor = og_outside.Get(); + auto &inside_tensor = + detail::Ref(og_inside.GetMutable()); + inside_tensor.set_lod(outside_tensor.lod()); + inside_tensor.ShareDataWith(outside_tensor); + } else if (og_outside.IsType()) { + auto &outside_array = og_outside.Get(); + auto &inside_array = + detail::Ref(og_inside.GetMutable()); + VLOG(8) << outside_og_name << " size = " << outside_array.size(); + inside_array.resize(outside_array.size()); + + for (size_t j = 0; j < inside_array.size(); ++j) { + VLOG(8) << j << " " << outside_array[j].numel(); + if (outside_array[j].numel() != 0) { + inside_array[j].set_lod(outside_array[j].lod()); + inside_array[j].ShareDataWith(outside_array[j]); + } else { + PADDLE_ENFORCE_EQ(inside_array[j].numel(), 0); + } + } + } else { + PADDLE_THROW("Currently only support LoDTensor and LoDTensorArray."); + } + } + executor.RunPreparedContext(ctx.get(), *cur_scope_iter, false, true, + true); + + // The Outputs(kXGRAD) contains the names of the gradient of parameters + // and inputs. + auto &pg_ig_names = Outputs(kXGRAD); + auto &p_names = Inputs(kX); + PADDLE_ENFORCE_EQ(pg_ig_names.size(), p_names.size()); + for (size_t param_id = 0; param_id < pg_ig_names.size(); ++param_id) { + if (pg_ig_names[param_id] == framework::kEmptyVarName) { + continue; // parameter doesn't have gradient + } + auto inside_grad_name = framework::GradVarName(p_names[param_id]); + + // for some grad_op, their input doesn't have gradient, + // for example lookup_table_grad_op, the input(Idx) doesn't have + // gradient. + auto pg_ig_var = cur_scope.FindVar(inside_grad_name); + PADDLE_ENFORCE(pg_ig_var != nullptr); + if (pg_ig_var->IsType()) { + auto pg_ig_lod_t_arr = + pg_ig_var->GetMutable(); + bool empty = true; + for (auto &each : *pg_ig_lod_t_arr) { + if (each.numel() != 0) { + empty = false; + break; + } + } + if (empty) { + LOG(WARNING) << pg_ig_names[param_id] + << " is not found in cur_scope."; + continue; + } + } + + // // TODO(tonyyang-svail): Not sure we need the following + // // If does not compute gradient of that variable inside rnn, + // just + // // continue + // if (local_var_names.find(inside_grad_name) == + // local_var_names.end()) { + // continue; + // } + + // zero gradient variable in step 0 + if (cur_scope_iter == step_scopes->rbegin()) { + auto *var = (*cur_scope_iter)->FindVar(inside_grad_name); + PADDLE_ENFORCE_NOT_NULL(var, "Can not find var %s", inside_grad_name); + PADDLE_ENFORCE( + var->IsType() || + var->IsType(), + "Currently the type of var only can be LoDTensorArray, " + "or LoDTensor, but the received var[%s] is %s.", + inside_grad_name, framework::ToTypeName(var->Type())); + + if (var->IsType()) { + auto &inside_tensor = var->Get(); + framework::AttributeMap attrs; + attrs["dtype"] = inside_tensor.type(); + attrs["shape"] = framework::vectorize2int(inside_tensor.dims()); + attrs["value"] = 0.0f; + + auto var_name = pg_ig_names[param_id]; + auto zero_op = framework::OpRegistry::CreateOp( + "fill_constant", framework::VariableNameMap{}, + {{"Out", {var_name}}}, attrs); + zero_op->Run(scope, dev_place); + scope.FindVar(var_name) + ->GetMutable() + ->set_lod(inside_tensor.lod()); + } + } + auto new_inside_name = cur_scope.Rename(inside_grad_name); + auto sum_op = framework::OpRegistry::CreateOp( + "sum", {{"X", {pg_ig_names[param_id], new_inside_name}}}, + {{"Out", {pg_ig_names[param_id]}}}, + framework::AttributeMap{{"use_mkldnn", {false}}}); + sum_op->Run(cur_scope, dev_place); + cur_scope.Rename(new_inside_name, inside_grad_name); + } + dev_ctx.Wait(); + const_cast(scope).DeleteScope(&cur_scope); + } + } +}; + +class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *while_grad = new framework::OpDesc(); + while_grad->SetType("while_grad"); + while_grad->SetInput(kX, Input(kX)); + while_grad->SetInput(kOutputs, Output(kOutputs)); + while_grad->SetInput(kStepScopes, Output(kStepScopes)); + + auto *grad_block = this->grad_block_[0]; + auto *fwd_block = grad_block->ForwardBlock(); + auto *parent_block = grad_block->ParentBlock(); + + // Not all of IGs will be generated by inner gradient operators of while op. + // Ignore IGs that is not generated by the inside block. + std::unordered_set inner_op_outputs; + for (const auto *op : grad_block->AllOps()) { + for (auto &oname : op->OutputArgumentNames()) { + inner_op_outputs.insert(oname); + } + } + auto igs = InputGrad(kX, /*do not drop empty gradient*/ false); + for (auto &each_ig : igs) { + if (inner_op_outputs.find(each_ig) == inner_op_outputs.end()) { + VLOG(8) << "Ignore " << each_ig; + each_ig = framework::kEmptyVarName; + } + } + while_grad->SetOutput(framework::GradVarName(kX), igs); + + // OG should be re-calculated by step blocks, since many outputs of while op + // do not need to calculate gradients. + std::unordered_set block_ins; + block_ins.reserve(Input(kX).size() + Output(kOutputs).size()); + for (auto &p : Input(kX)) { + block_ins.insert(p); + } + for (auto &o : Output(kOutputs)) { + block_ins.insert(o); + } + std::unordered_set output_grads; + for (const auto *op : grad_block->AllOps()) { + for (auto &input_name : op->InputArgumentNames()) { + // If the input of Op has been recorded or is generated by the forward + // block, do not make it as input again. + + // The input is located in I/O or other op's outputs or the variable is + // located in grad_block's parents + if (block_ins.find(input_name) != block_ins.end() || + (fwd_block->FindVarRecursive(input_name) != nullptr || + parent_block->FindVarRecursive(input_name) != nullptr)) { + continue; + } + + output_grads.insert(input_name); + } + for (auto &output_name : op->OutputArgumentNames()) { + block_ins.insert(output_name); + } + } + + std::vector output_grads_list; + output_grads_list.resize(output_grads.size()); + std::copy(output_grads.begin(), output_grads.end(), + output_grads_list.begin()); + while_grad->SetInput(framework::GradVarName(kOutputs), output_grads_list); + + while_grad->SetAttrMap(this->Attrs()); + while_grad->SetBlockAttr(kStepBlock, grad_block); + // record the original output gradient names, since the gradient name of + // while operator could be renamed. + while_grad->SetAttr("original_output_grad", output_grads_list); + + while_grad->SetAttr(kSkipEagerDeletionVars, std::vector()); + + return std::unique_ptr(while_grad); + } +}; + +class WhileGradOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto p_names = ctx->Input(kX); + auto pg_ig_names = ctx->Output(framework::GradVarName(kX)); + + for (size_t i = 0; i < p_names.size(); ++i) { + if (ctx->HasVar(pg_ig_names[i])) { + VLOG(5) << "Setting " << pg_ig_names[i] << " following " << p_names[i] + << " type: " << ctx->GetType(p_names[i]); + ctx->SetType(pg_ig_names[i], ctx->GetType(p_names[i])); + ctx->SetDataType(pg_ig_names[i], ctx->GetDataType(p_names[i])); + } + } + } +}; + +class WhileGradOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + ctx->HasInputs(kX); + ctx->HasOutputs(framework::GradVarName(kX)); + ctx->HasInputs(kOutputs); + ctx->HasInputs(framework::GradVarName(kOutputs)); + + auto pg_ig_names = ctx->Outputs(kXGRAD); + std::vector in_var_ptrs = + ctx->GetInputVarPtrs(kX); + std::vector out_var_ptrs = + ctx->GetOutputVarPtrs(kXGRAD); + PADDLE_ENFORCE(in_var_ptrs.size() == out_var_ptrs.size()); + + for (size_t i = 0; i < in_var_ptrs.size(); ++i) { + if (pg_ig_names[i] == framework::kEmptyVarName) { + continue; + } + if (ctx->IsRuntime()) { + framework::Variable *in_var = + boost::get(in_var_ptrs[i]); + framework::Variable *out_var = + boost::get(out_var_ptrs[i]); + + auto type = framework::ToVarType(in_var->Type()); + if (type == framework::proto::VarType::LOD_TENSOR) { + out_var->GetMutable()->Resize( + in_var->Get().dims()); + } else if (type == framework::proto::VarType::SELECTED_ROWS) { + out_var->GetMutable()->set_height( + in_var->Get().GetCompleteDims()[0]); + } else if (type == framework::proto::VarType::LOD_TENSOR_ARRAY) { + PADDLE_THROW("WhileGradOp doesn't support type %d", + static_cast(type)); + } + } else { + framework::VarDesc *in_var = + boost::get(in_var_ptrs[i]); + boost::get(out_var_ptrs[i]) + ->SetShape(in_var->GetShape()); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(while, paddle::operators::WhileOp, + paddle::operators::WhileOpMaker, + paddle::operators::WhileGradOpDescMaker); +REGISTER_OPERATOR(while_grad, paddle::operators::WhileGradOp, + paddle::operators::WhileGradOpShapeInference, + paddle::operators::WhileGradOpVarTypeInference); diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc new file mode 100644 index 00000000..009bc579 --- /dev/null +++ b/paddle/fluid/operators/controlflow/while_op_helper.cc @@ -0,0 +1,200 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/controlflow/while_op_helper.h" + +#include +#include +#include + +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/controlflow/op_variant.h" +#include "paddle/fluid/string/string_helper.h" + +namespace paddle { +namespace operators { + +// Set skip variables of while_op and while_grad_op +// These variables should be skipped when eager deletion enables. +// It is because: +// 1. while_grad_op needs some variables defined in while_op. +// 2. while_grad_op needs variables from the previous time step. +static void SetSkipVars(const OpVariant &op, std::vector attr) { + auto &attrs = const_cast(op.Attrs()); + VLOG(2) << "Prepare to skip " << attr.size() + << " var(s): " << string::join_strings(attr, ' '); + attrs[kSkipEagerDeletionVars] = std::move(attr); +} + +// Check whether the forward while_op and while_grad_op match +// The program may have many while_ops. +static bool IsMatchedWhileOpAndWhileGradOp(const OpVariant &fwd_op, + const OpVariant &grad_op) { + return fwd_op.Inputs().at(kX) == grad_op.Inputs().at(kX) && + fwd_op.Outputs().at(kOutputs) == grad_op.Inputs().at(kOutputs); +} + +// Test whether the variable is skippable in forward while_op +// The variable is skippable in while_op when the variable used in while_grad +// is not from grad_block. +static bool IsSkippableVar(const std::string &name, + framework::BlockDesc *grad_block) { + return name != framework::kEmptyVarName && !grad_block->HasVar(name); +} + +static void ModifyWhileOpAndWhileGradOpAttr(const OpVariant &fwd_op, + const OpVariant &bwd_op) { + auto *grad_block = bwd_op.Attr(kStepBlock); + + // Find all skippable variables in forward while_op + std::unordered_set forward_skip_vars; + for (auto *op_desc : grad_block->AllOps()) { + for (auto &in_arg_name : op_desc->InputArgumentNames()) { + if (IsSkippableVar(in_arg_name, grad_block)) { + forward_skip_vars.insert(in_arg_name); + } + } + + for (auto &out_arg_name : op_desc->OutputArgumentNames()) { + if (IsSkippableVar(out_arg_name, grad_block)) { + forward_skip_vars.insert(out_arg_name); + } + } + } + + SetSkipVars(fwd_op, std::vector(forward_skip_vars.begin(), + forward_skip_vars.end())); + + // Find all skippable variables in while_grad_op + // The skipped variables are those which would be used across time steps. + auto &fwd_input = fwd_op.Inputs().at(kX); + auto &in_grads = bwd_op.Outputs().at(framework::GradVarName(kX)); + PADDLE_ENFORCE_EQ( + fwd_input.size(), in_grads.size(), + "Backward input gradient number does not match forward input number."); + + std::unordered_set backward_skip_vars; + for (size_t i = 0; i < in_grads.size(); ++i) { + if (in_grads[i] == framework::kEmptyVarName) { + continue; + } + backward_skip_vars.insert(in_grads[i]); + backward_skip_vars.insert(framework::GradVarName(fwd_input[i])); + } + + SetSkipVars(bwd_op, std::vector(backward_skip_vars.begin(), + backward_skip_vars.end())); +} + +// Find all while_ops and while_grad_ops in the graph or program +// The while_grad_op and while_op may located in different blocks +// So we should traverse all blocks in the program and find them out. +static void FindAllWhileAndWhileGradOp(std::vector *while_ops, + std::vector *while_grad_ops) { + PADDLE_ENFORCE_GE(while_ops->size(), while_grad_ops->size()); + + if (while_ops->empty()) return; + + const auto *program = + while_ops->front().Attr(kStepBlock)->Program(); + for (size_t i = 1; i < program->Size(); ++i) { + auto &block = program->Block(i); + for (size_t j = 0; j < block.OpSize(); ++j) { + auto *op = block.Op(j); + if (op->Type() == "while") { + while_ops->emplace_back(op); + } else if (op->Type() == "while_grad") { + while_grad_ops->emplace_back(op); + } + } + } + + PADDLE_ENFORCE_GE(while_ops->size(), while_grad_ops->size(), + "There are extra while_grad ops in the graph or program"); +} + +static void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl( + std::vector *while_ops, std::vector *while_grad_ops) { + FindAllWhileAndWhileGradOp(while_ops, while_grad_ops); + + VLOG(2) << "Found while op num: " << while_ops->size() + << ", while grad op num: " << while_grad_ops->size(); + + if (while_grad_ops->empty()) { + return; + } + + std::unordered_set while_op_set( + while_ops->begin(), while_ops->end()); + + for (auto &bwd_op : *while_grad_ops) { + const OpVariant *matched_fwd_op = nullptr; + for (auto &fwd_op : while_op_set) { + if (IsMatchedWhileOpAndWhileGradOp(fwd_op, bwd_op)) { + PADDLE_ENFORCE(matched_fwd_op == nullptr, + "Found multiple matched while ops"); + matched_fwd_op = &fwd_op; + } + } + PADDLE_ENFORCE_NOT_NULL(matched_fwd_op, + "Cannot find matched forward while op."); + ModifyWhileOpAndWhileGradOpAttr(*matched_fwd_op, bwd_op); + while_op_set.erase(*matched_fwd_op); + } +} + +void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( + int block_id, + const std::vector> &all_ops) { + // If block_id is not 0, returns + // This is because all while_ops and while_grad_ops in the whole program + // would be processed when block_id is 0 (i.e. when Executor::Run() or + // ParallelExecutor constructs). + + // What's more, all while_ops and while_grad_ops must be processed when + // block_id is zero. If not, while_op may run first and erase variables + // used in while_grad_op, and in this moment, while_grad_ops may be not + // constructed yet. + if (block_id != 0) return; + + std::vector fwd_ops, bwd_ops; + for (auto &op : all_ops) { + if (op->Type() == "while") { + fwd_ops.emplace_back(op.get()); + } else if (op->Type() == "while_grad") { + bwd_ops.emplace_back(op.get()); + } + } + PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(&fwd_ops, &bwd_ops); +} + +void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( + const std::vector &while_ops, + const std::vector &while_grad_ops) { + std::vector fwd_ops, bwd_ops; + fwd_ops.reserve(while_ops.size()); + for (auto *op : while_ops) { + fwd_ops.emplace_back(op); + } + + bwd_ops.reserve(while_grad_ops.size()); + for (auto *op : while_grad_ops) { + bwd_ops.emplace_back(op); + } + + PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(&fwd_ops, &bwd_ops); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/controlflow/while_op_helper.h b/paddle/fluid/operators/controlflow/while_op_helper.h new file mode 100644 index 00000000..456ba864 --- /dev/null +++ b/paddle/fluid/operators/controlflow/while_op_helper.h @@ -0,0 +1,43 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/variant.h" + +namespace paddle { +namespace operators { + +static constexpr char kStepBlock[] = "sub_block"; +static constexpr char kCondition[] = "Condition"; +static constexpr char kStepScopes[] = "StepScopes"; +static constexpr char kX[] = "X"; +static constexpr char kXGRAD[] = "X@GRAD"; +static constexpr char kOutputs[] = "Out"; +static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars"; + +void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( + int block_id, + const std::vector> &all_ops); + +void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( + const std::vector &while_ops, + const std::vector &while_grad_ops); + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h new file mode 100644 index 00000000..4a5cd326 --- /dev/null +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -0,0 +1,413 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/operator_kernel_configs.h" +#include "paddle/fluid/operators/conv_cudnn_op_cache.h" +#include "paddle/fluid/platform/cudnn_desc.h" +namespace paddle { +namespace operators { + +using framework::AlgorithmsCache; + +struct ConvArgs { + cudnnHandle_t handle; + platform::TensorDescriptor idesc, odesc; + platform::FilterDescriptor wdesc; + platform::ConvolutionDescriptor cdesc; + const framework::Tensor *x, *w, *o; + + // strides + std::vector s; + // paddings + std::vector p; + // dilations + std::vector d; + + ConvArgs(const framework::Tensor* x, const framework::Tensor* w, + const framework::Tensor* o, const std::vector s, + const std::vector p, const std::vector d) + : x(x), w(w), o(o), s(s), p(p), d(d) {} +}; + +template +struct SearchAlgorithm {}; + +template <> +struct SearchAlgorithm { + using perf_t = cudnnConvolutionFwdAlgoPerf_t; + using algo_t = cudnnConvolutionFwdAlgo_t; + + template + static algo_t Find(const ConvArgs& args, bool exhaustive_search, + bool deterministic, int algo_cache_id, + const framework::ExecutionContext& ctx) { + auto dtype = platform::CudnnDataType::type; + bool has_got_workspace_size = true; + bool exhaustive = (exhaustive_search) & (dtype != CUDNN_DATA_HALF); + size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; + size_t workspace_size = 0; + algo_t algo; + +#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) + auto& dev_ctx = ctx.template device_context(); + if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( + args.cdesc.desc(), CUDNN_TENSOR_OP_MATH)); + VLOG(5) << "use cudnn_tensor_op_math"; + } else { + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( + args.cdesc.desc(), CUDNN_DEFAULT_MATH)); + VLOG(5) << "NOT use cudnn_tensor_op_math"; + } +#endif + + if (!exhaustive) { +#if CUDNN_VERSION >= 7001 + int perf_count; + int best_algo_idx = 0; + std::unique_ptr perf_results(new perf_t[kNUM_CUDNN_FWD_ALGS]); + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm_v7( + args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(), + args.odesc.desc(), kNUM_CUDNN_FWD_ALGS, &perf_count, + perf_results.get())); + algo = (perf_results.get())[best_algo_idx].algo; + workspace_size = GetWorkspaceSize(args, algo); + + if (workspace_size > workspace_size_limit) { + has_got_workspace_size = false; + VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue " + "the workspace size request(" + << workspace_size << ") exceeds the limit(" + << workspace_size_limit << ")"; + } + if (!has_got_workspace_size) { + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( + args.handle, args.idesc.desc(), args.wdesc.desc(), + args.cdesc.desc(), args.odesc.desc(), + CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace_size_limit, + &algo)); + } +#else + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( + args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(), + args.odesc.desc(), CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &algo)); +#endif + VLOG(3) << "choose algo " << algo; + } else { + AlgorithmsCache& algo_cache = + ctx.GetKernelConfig>(algo_cache_id); + auto& dev_ctx = + ctx.template device_context(); + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + + auto x_dims = framework::vectorize(args.x->dims()); + auto w_dims = framework::vectorize(args.w->dims()); + + algo = algo_cache.GetAlgorithm( + x_dims, w_dims, args.s, args.p, args.d, 0, [&]() { + int returned_algo_count; + std::array perf_stat; + + auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { + CUDNN_ENFORCE( + platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( + args.handle, args.idesc.desc(), args.x->data(), + args.wdesc.desc(), args.w->data(), args.cdesc.desc(), + args.odesc.desc(), const_cast(args.o->data()), + kNUM_CUDNN_FWD_ALGS, &returned_algo_count, + perf_stat.data(), cudnn_workspace_ptr, + workspace_size_limit)); + }; + workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit); + + VLOG(3) << "FwdAlgo Perf result: (algo: stat, time, memory)"; + for (int i = 0; i < returned_algo_count; ++i) { + const auto& stat = perf_stat[i]; + VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time + << " " << stat.memory; + } + return perf_stat[0].algo; + }); + } + VLOG(3) << "choose algo " << algo; + return algo; + } + + static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) { + size_t workspace_size = 0; + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( + args.handle, args.idesc.desc(), args.wdesc.desc(), args.cdesc.desc(), + args.odesc.desc(), algo, &workspace_size)); + return workspace_size; + } +}; + +template <> +struct SearchAlgorithm { + using perf_t = cudnnConvolutionBwdDataAlgoPerf_t; + using algo_t = cudnnConvolutionBwdDataAlgo_t; + + template + static algo_t Find(const ConvArgs& args, bool exhaustive_search, + bool deterministic, int algo_cache_id, + const framework::ExecutionContext& ctx) { + auto dtype = platform::CudnnDataType::type; + bool exhaustive = (exhaustive_search) & (dtype != CUDNN_DATA_HALF); + size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; + size_t workspace_size = 0; + bool has_got_workspace_size = true; + algo_t algo; + +#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) + auto& dev_ctx = ctx.template device_context(); + if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( + args.cdesc.desc(), CUDNN_TENSOR_OP_MATH)); + VLOG(5) << "use cudnn_tensor_op_math"; + } else { + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( + args.cdesc.desc(), CUDNN_DEFAULT_MATH)); + VLOG(5) << "NOT use cudnn_tensor_op_math"; + } +#endif + + if (!exhaustive && !deterministic) { +#if CUDNN_VERSION >= 7001 + int perf_count; + int best_algo_idx = 0; + std::unique_ptr perf_results( + new perf_t[kNUM_CUDNN_BWD_DATA_ALGS]); + CUDNN_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm_v7( + args.handle, args.wdesc.desc(), args.odesc.desc(), + args.cdesc.desc(), args.idesc.desc(), kNUM_CUDNN_BWD_DATA_ALGS, + &perf_count, perf_results.get())); + algo = (perf_results.get())[best_algo_idx].algo; + +#if CUDNN_VERSION < 7500 + int stride_dim = args.x->dims().size() - 2; + bool blacklist = std::any_of(args.s.begin(), args.s.begin() + stride_dim, + [=](int n) { return n != 1; }); + if (blacklist && (static_cast( + perf_results[best_algo_idx].algo) == + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING || + static_cast( + perf_results[best_algo_idx].algo) == + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT)) { + algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; + } +#endif + workspace_size = GetWorkspaceSize(args, algo); + if (workspace_size > workspace_size_limit) { + has_got_workspace_size = false; + VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue " + "the workspace size request(" + << workspace_size << ") exceeds the limit(" + << workspace_size_limit << ")"; + } + if (!has_got_workspace_size) { + CUDNN_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( + args.handle, args.wdesc.desc(), args.odesc.desc(), + args.cdesc.desc(), args.idesc.desc(), + CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &algo)); + } +#else + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( + args.handle, args.wdesc.desc(), args.odesc.desc(), args.cdesc.desc(), + args.idesc.desc(), CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &algo)); +#endif + } else if (deterministic) { + return CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; + } else { + AlgorithmsCache& algo_cache = + ctx.GetKernelConfig>(algo_cache_id); + auto& dev_ctx = + ctx.template device_context(); + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + + auto x_dims = framework::vectorize(args.x->dims()); + auto w_dims = framework::vectorize(args.w->dims()); + + algo = algo_cache.GetAlgorithm( + x_dims, w_dims, args.s, args.p, args.d, 0, [&]() { + int returned_algo_count; + std::array perf_stat; + + auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { + CUDNN_ENFORCE( + platform::dynload:: + cudnnFindConvolutionBackwardDataAlgorithmEx( + args.handle, args.wdesc.desc(), args.w->data(), + args.odesc.desc(), args.o->data(), + args.cdesc.desc(), args.idesc.desc(), + const_cast(args.x->data()), + kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count, + perf_stat.data(), cudnn_workspace_ptr, + workspace_size_limit)); + }; + workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit); + + VLOG(3) << "BwdDataAlgo Perf result: (algo: stat, time, memory)"; + for (int i = 0; i < returned_algo_count; ++i) { + const auto& stat = perf_stat[i]; + VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time + << " " << stat.memory; + } + + return perf_stat[0].algo; + }); + } + VLOG(3) << "choose algo " << algo; + return algo; + } + + static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) { + size_t workspace_size = 0; + CUDNN_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( + args.handle, args.wdesc.desc(), args.odesc.desc(), + args.cdesc.desc(), args.idesc.desc(), algo, &workspace_size)); + return workspace_size; + } +}; + +template <> +struct SearchAlgorithm { + using perf_t = cudnnConvolutionBwdFilterAlgoPerf_t; + using algo_t = cudnnConvolutionBwdFilterAlgo_t; + + template + static algo_t Find(const ConvArgs& args, bool exhaustive_search, + bool deterministic, int algo_cache_id, + const framework::ExecutionContext& ctx) { + auto dtype = platform::CudnnDataType::type; + bool exhaustive = (exhaustive_search) & (dtype != CUDNN_DATA_HALF); + size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; + size_t workspace_size = 0; + bool has_got_workspace_size = true; + +#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) + auto& dev_ctx = ctx.template device_context(); + if (dev_ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( + args.cdesc.desc(), CUDNN_TENSOR_OP_MATH)); + VLOG(5) << "use cudnn_tensor_op_math"; + } else { + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( + args.cdesc.desc(), CUDNN_DEFAULT_MATH)); + VLOG(5) << "NOT use cudnn_tensor_op_math"; + } +#endif + + algo_t algo; + if (!exhaustive && !deterministic) { +#if CUDNN_VERSION >= 7001 + using perf_t = cudnnConvolutionBwdFilterAlgoPerf_t; + int perf_count; + int best_algo_idx = 0; + std::unique_ptr perf_results( + new perf_t[kNUM_CUDNN_BWD_FILTER_ALGS]); + CUDNN_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm_v7( + args.handle, args.idesc.desc(), args.odesc.desc(), + args.cdesc.desc(), args.wdesc.desc(), kNUM_CUDNN_BWD_FILTER_ALGS, + &perf_count, perf_results.get())); + algo = (perf_results.get())[best_algo_idx].algo; + workspace_size = GetWorkspaceSize(args, algo); + if (workspace_size > workspace_size_limit) { + has_got_workspace_size = false; + VLOG(1) << "Fallback to non-v7 method to find conv algorithm becasue " + "the workspace size request(" + << workspace_size << ") exceeds the limit(" + << workspace_size_limit << ")"; + } + if (!has_got_workspace_size) { + CUDNN_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( + args.handle, args.idesc.desc(), args.odesc.desc(), + args.cdesc.desc(), args.wdesc.desc(), + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &algo)); + } +#else + CUDNN_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( + args.handle, args.idesc.desc(), args.odesc.desc(), + args.cdesc.desc(), args.wdesc.desc(), + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &algo)); +#endif + } else if (deterministic) { + return CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; + } else { + AlgorithmsCache& algo_cache = + ctx.GetKernelConfig>(algo_cache_id); + auto& dev_ctx = + ctx.template device_context(); + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + + auto x_dims = framework::vectorize(args.x->dims()); + auto w_dims = framework::vectorize(args.w->dims()); + + algo = algo_cache.GetAlgorithm( + x_dims, w_dims, args.s, args.p, args.d, 0, [&]() { + int returned_algo_count; + std::array perf_stat; + auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { + CUDNN_ENFORCE( + platform::dynload:: + cudnnFindConvolutionBackwardFilterAlgorithmEx( + args.handle, args.idesc.desc(), args.x->data(), + args.odesc.desc(), args.o->data(), + args.cdesc.desc(), args.wdesc.desc(), + const_cast(args.w->data()), + kNUM_CUDNN_BWD_FILTER_ALGS, &returned_algo_count, + perf_stat.data(), cudnn_workspace_ptr, + workspace_size_limit)); + }; + workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit); + + VLOG(3) << "BwdFilterAlgo Perf result: (algo: stat, time, memory)"; + for (int i = 0; i < returned_algo_count; ++i) { + const auto& stat = perf_stat[i]; + VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time + << " " << stat.memory; + } + return perf_stat[0].algo; + }); + } + VLOG(3) << "choose algo " << algo; + return algo; + } + + static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) { + size_t workspace_size = 0; + CUDNN_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( + args.handle, args.idesc.desc(), args.odesc.desc(), + args.cdesc.desc(), args.wdesc.desc(), algo, &workspace_size)); + return workspace_size; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc new file mode 100644 index 00000000..ec0278e5 --- /dev/null +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -0,0 +1,520 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/operators/conv_cudnn_helper.h" +#include "paddle/fluid/operators/conv_cudnn_op_cache.h" +#include "paddle/fluid/operators/conv_op.h" +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/cudnn_workspace_helper.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/platform/profiler.h" + +DEFINE_bool(cudnn_deterministic, false, + "Whether allow using an autotuning algorithm for convolution " + "operator. The autotuning algorithm may be non-deterministic. If " + "true, the algorithm is deterministic."); +DEFINE_uint64(conv_workspace_size_limit, + paddle::platform::kDefaultConvWorkspaceSizeLimitMB, + "cuDNN convolution workspace limit in MB unit."); +DEFINE_bool(cudnn_exhaustive_search, false, + "Whether enable exhaustive search for cuDNN convolution or " + "not, default is False."); + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; +using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; +using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; +using DataLayout = platform::DataLayout; +template +using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; +using framework::AlgorithmsCache; + +static inline void GetNCDHW(const framework::DDim& dims, + const DataLayout& layout, int* N, int* C, int* D, + int* H, int* W) { + *N = dims[0]; + *C = layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; + int i = layout == DataLayout::kNCHW ? 0 : 1; + if (dims.size() == 5) { + *D = dims[2 - i]; + *H = dims[3 - i]; + *W = dims[4 - i]; + } else { + *D = 1; + *H = dims[2 - i]; + *W = dims[3 - i]; + } +} + +template +class CUDNNConvOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* output = ctx.Output("Output"); + + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + int groups = ctx.Attr("groups"); + bool exhaustive_search = + FLAGS_cudnn_exhaustive_search || ctx.Attr("exhaustive_search"); + + if (exhaustive_search && FLAGS_cudnn_deterministic) { + PADDLE_THROW( + "Cann't set exhaustive_search True and " + "FLAGS_cudnn_deterministic True at same time."); + } + + const T* input_data = input->data(); + const T* filter_data = filter->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + // ------------------- cudnn descriptors --------------------- + ConvArgs args{input, filter, output, strides, paddings, dilations}; + auto handle = dev_ctx.cudnn_handle(); + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + auto dtype = platform::CudnnDataType::type; + DataLayout layout = DataLayout::kNCHW; + if (input->dims().size() == 5) { + layout = DataLayout::kNCDHW; + } + auto layout_format = GetCudnnTensorFormat(layout); + + args.handle = handle; + args.cdesc.set(dtype, paddings, strides, dilations); +#if CUDNN_VERSION_MIN(7, 0, 1) + // cudnn 7 can support groups, no need to do it manually + // FIXME(typhoonzero): find a better way to disable groups + // rather than setting it to 1. + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount( + args.cdesc.desc(), groups)); + groups = 1; +#endif + args.idesc.set(*input, groups); + args.wdesc.set(*filter, layout_format, groups); + args.odesc.set(*output, groups); + int i_n, i_c, i_d, i_h, i_w; + GetNCDHW(input->dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, &i_h, &i_w); + int o_n, o_c, o_d, o_h, o_w; + GetNCDHW(output->dims(), DataLayout::kNCHW, &o_n, &o_c, &o_d, &o_h, &o_w); + + int group_offset_in = i_c / groups * i_h * i_w * i_d; + int group_offset_out = o_c / groups * o_h * o_w * o_d; + int group_offset_filter = filter->numel() / groups; + // ------------------- cudnn conv workspace --------------------- + size_t workspace_size = 0; // final workspace to allocate. + // ------------------- cudnn conv algorithm --------------------- + cudnnConvolutionFwdAlgo_t algo{}; + + using search = SearchAlgorithm; + algo = search::Find(args, exhaustive_search, false, 0, ctx); + workspace_size = search::GetWorkspaceSize(args, algo); + + // ------------------- cudnn conv forward --------------------- + ScalingParamType alpha = 1.0f, beta = 0.0f; + for (int i = 0; i < groups; i++) { + workspace_handle.RunFunc( + [&](void* workspace_ptr) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, args.idesc.desc(), + input_data + i * group_offset_in, args.wdesc.desc(), + filter_data + i * group_offset_filter, args.cdesc.desc(), algo, + workspace_ptr, workspace_size, &beta, args.odesc.desc(), + output_data + i * group_offset_out)); + }, + workspace_size); + } + } +}; + +template +class CUDNNConvGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + auto input = ctx.Input("Input"); + auto filter = ctx.Input("Filter"); + auto output_grad = ctx.Input(framework::GradVarName("Output")); + auto input_grad = ctx.Output(framework::GradVarName("Input")); + auto filter_grad = ctx.Output(framework::GradVarName("Filter")); + + const T* input_data = input->data(); + const T* output_grad_data = output_grad->data(); + const T* filter_data = filter->data(); + + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + int groups = ctx.Attr("groups"); + bool exhaustive_search = + FLAGS_cudnn_exhaustive_search || ctx.Attr("exhaustive_search"); + bool deterministic = FLAGS_cudnn_deterministic; + if (exhaustive_search && deterministic) { + PADDLE_THROW( + "Can't set exhaustive_search True and " + "FLAGS_cudnn_deterministic True at same time."); + } + + T* filter_grad_data = nullptr; + T* input_grad_data = nullptr; + ConvArgs args1{input_grad, filter, output_grad, + strides, paddings, dilations}; + ConvArgs args2{input, filter_grad, output_grad, + strides, paddings, dilations}; + // conv_cudnn_helper.h + auto handle = dev_ctx.cudnn_handle(); + auto dtype = platform::CudnnDataType::type; + DataLayout layout = DataLayout::kNCHW; + if (input->dims().size() == 5) { + layout = DataLayout::kNCDHW; + } + auto layout_tensor = GetCudnnTensorFormat(layout); + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + + int i_n, i_c, i_d, i_h, i_w; + GetNCDHW(input->dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, &i_h, &i_w); + int o_n, o_c, o_d, o_h, o_w; + GetNCDHW(output_grad->dims(), DataLayout::kNCHW, &o_n, &o_c, &o_d, &o_h, + &o_w); + + int group_offset_in = i_c / groups * i_h * i_w * i_d; + int group_offset_out = o_c / groups * o_h * o_w * o_d; + int group_offset_filter = filter->numel() / groups; + // ------------------- cudnn backward algorithm --------------------- + cudnnConvolutionBwdDataAlgo_t data_algo = + static_cast(0); + cudnnConvolutionBwdFilterAlgo_t filter_algo = + static_cast(0); + size_t workspace_size = 0; + int iwo_groups, c_groups; + +#if CUDNN_VERSION_MIN(7, 0, 1) + iwo_groups = 1; + c_groups = groups; + groups = 1; +#endif + + if (input_grad) { + // ------------------- cudnn descriptors --------------------- + input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + args1.handle = handle; + args1.idesc.set(*input_grad, iwo_groups); + args1.wdesc.set(*filter, layout_tensor, iwo_groups); + args1.odesc.set(*output_grad, iwo_groups); + args1.cdesc.set(dtype, paddings, strides, dilations, c_groups); + + using search1 = SearchAlgorithm; + data_algo = + search1::Find(args1, exhaustive_search, deterministic, 0, ctx); + workspace_size = + std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo)); + } + + if (filter_grad) { + // ------------------- cudnn descriptors --------------------- + filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); + args2.handle = handle; + args2.idesc.set(*input, iwo_groups); + args2.wdesc.set(*filter_grad, layout_tensor, iwo_groups); + args2.odesc.set(*output_grad, iwo_groups); + args2.cdesc.set(dtype, paddings, strides, dilations, c_groups); + + using search2 = SearchAlgorithm; + filter_algo = + search2::Find(args2, exhaustive_search, deterministic, 1, ctx); + workspace_size = std::max(workspace_size, + search2::GetWorkspaceSize(args2, filter_algo)); + } + + // ------------------- cudnn conv backward data --------------------- + ScalingParamType alpha = 1.0f, beta = 0.0f; + if (input_grad) { + // Because beta is zero, it is unnecessary to reset input_grad. + for (int i = 0; i < groups; i++) { + workspace_handle.RunFunc( + [&](void* cudnn_workspace_ptr) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( + handle, &alpha, args1.wdesc.desc(), + filter_data + i * group_offset_filter, args1.odesc.desc(), + output_grad_data + i * group_offset_out, args1.cdesc.desc(), + data_algo, cudnn_workspace_ptr, workspace_size, &beta, + args1.idesc.desc(), input_grad_data + i * group_offset_in)); + }, + workspace_size); + } + } + // ------------------- cudnn conv backward filter --------------------- + if (filter_grad) { + // Because beta is zero, it is unnecessary to reset filter_grad. + for (int i = 0; i < groups; i++) { + workspace_handle.RunFunc( + [&](void* cudnn_workspace_ptr) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( + handle, &alpha, args2.idesc.desc(), + input_data + i * group_offset_in, args2.odesc.desc(), + output_grad_data + i * group_offset_out, args2.cdesc.desc(), + filter_algo, cudnn_workspace_ptr, workspace_size, &beta, + args2.wdesc.desc(), + filter_grad_data + i * group_offset_filter)); + }, + workspace_size); + } + } + } +}; + +/* + * Inputs: I, W, dO, ddI, ddW + * Outputs: ddO, dW, dI + * ddo = conv(ddI, W) + conv(I, ddW) + * dW = conv_bp_filter(ddI, dO) + * dI = conv_bp_data(ddW, dO) + */ +template +class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + auto X = ctx.Input("Input"); + auto W = ctx.Input("Filter"); + auto dO = ctx.Input("DOutput"); + auto ddX = ctx.Input("DDInput"); + auto ddW = ctx.Input("DDFilter"); + + auto ddO = ctx.Output("DDOutput"); + auto dW = ctx.Output("DFilter"); + auto dX = ctx.Output("DInput"); + + const T* x = X->data(); + const T* dy = dO->data(); + const T* w = W->data(); + + const T* ddx = nullptr; + const T* ddw = nullptr; + T *dw, *dx, *ddy; + dw = dx = ddy = nullptr; + + const std::vector& strides = ctx.Attr>("strides"); + const std::vector& paddings = ctx.Attr>("paddings"); + const std::vector& dilations = ctx.Attr>("dilations"); + int groups = ctx.Attr("groups"); + bool exhaustive_search = + FLAGS_cudnn_exhaustive_search || ctx.Attr("exhaustive_search"); + bool deterministic = FLAGS_cudnn_deterministic; + if (exhaustive_search && deterministic) { + PADDLE_THROW( + "Can't set exhaustive_search True and " + "FLAGS_cudnn_deterministic True at same time."); + } + + int iwo_group = groups; + int c_group = 1; +#if CUDNN_VERSION_MIN(7, 0, 1) + iwo_group = 1; + c_group = groups; +#endif + auto dtype = platform::CudnnDataType::type; + + auto handle = dev_ctx.cudnn_handle(); + + ConvArgs args1{ddX, W, ddO, strides, paddings, dilations}; + ConvArgs args2{X, ddW, ddO, strides, paddings, dilations}; + ConvArgs args3{ddX, dW, dO, strides, paddings, dilations}; + ConvArgs args4{dX, ddW, dO, strides, paddings, dilations}; + + cudnnConvolutionFwdAlgo_t fwd_algo1 = + static_cast(0); + cudnnConvolutionFwdAlgo_t fwd_algo2 = + static_cast(0); + cudnnConvolutionBwdDataAlgo_t data_algo = + static_cast(0); + cudnnConvolutionBwdFilterAlgo_t filter_algo = + static_cast(0); + + auto layout = GetCudnnTensorFormat(DataLayout::kNCHW); + + // ddo = conv(ddI, W) + conv(I, ddW) + size_t workspace_size = 0; + if (ddO) { + ddy = ddO->mutable_data(ctx.GetPlace()); + args1.handle = handle; + args1.idesc.set(*ddX, iwo_group); + args1.wdesc.set(*W, layout, iwo_group); + args1.odesc.set(*ddO, iwo_group); + args1.cdesc.set(dtype, paddings, strides, dilations, c_group); + + using search1 = SearchAlgorithm; + fwd_algo1 = search1::Find(args1, exhaustive_search, false, 0, ctx); + workspace_size = search1::GetWorkspaceSize(args1, fwd_algo1); + + if (ddW) { + ddw = ddW->data(); + args2.handle = handle; + args2.idesc.set(*X, iwo_group); + args2.wdesc.set(*ddW, layout, iwo_group); + args2.odesc.set(*ddO, iwo_group); + args2.cdesc.set(dtype, paddings, strides, dilations, c_group); + + using search2 = SearchAlgorithm; + fwd_algo2 = search2::Find(args2, exhaustive_search, false, 0, ctx); + workspace_size = std::max(workspace_size, + search2::GetWorkspaceSize(args2, fwd_algo2)); + } + } + + if (dW) { + dw = dW->mutable_data(ctx.GetPlace()); + args3.handle = handle; + args3.idesc.set(*ddX, iwo_group); + args3.wdesc.set(*dW, layout, iwo_group); + args3.odesc.set(*dO, iwo_group); + args3.cdesc.set(dtype, paddings, strides, dilations, c_group); + + using search3 = SearchAlgorithm; + filter_algo = + search3::Find(args3, exhaustive_search, deterministic, 1, ctx); + workspace_size = std::max(workspace_size, + search3::GetWorkspaceSize(args3, filter_algo)); + } + + if (ddW && dX) { + dx = dX->mutable_data(ctx.GetPlace()); + args4.handle = handle; + args4.idesc.set(*dX, iwo_group); + args4.wdesc.set(*ddW, layout, iwo_group); + args4.odesc.set(*dO, iwo_group); + args4.cdesc.set(dtype, paddings, strides, dilations, c_group); + + using search4 = SearchAlgorithm; + data_algo = + search4::Find(args4, exhaustive_search, deterministic, 2, ctx); + workspace_size = + std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo)); + } + + int i_n, i_c, i_d, i_h, i_w; + GetNCDHW(X->dims(), DataLayout::kNCHW, &i_n, &i_c, &i_d, &i_h, &i_w); + int o_n, o_c, o_d, o_h, o_w; + GetNCDHW(dO->dims(), DataLayout::kNCHW, &o_n, &o_c, &o_d, &o_h, &o_w); + + int group_offset_in = i_c / groups * i_h * i_w * i_d; + int group_offset_out = o_c / groups * o_h * o_w * o_d; + int group_offset_filter = W->numel() / groups; + + ScalingParamType alpha = 1.0f, beta = 0.0f; + auto wkspace_handle = dev_ctx.cudnn_workspace_handle(); + + if (ddO) { + ddx = ddX->data(); + for (int i = 0; i < groups; i++) { + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, args1.idesc.desc(), ddx + i * group_offset_in, + args1.wdesc.desc(), w + i * group_offset_filter, + args1.cdesc.desc(), fwd_algo1, workspace_ptr, workspace_size, + &beta, args1.odesc.desc(), ddy + i * group_offset_out)); + }, + workspace_size); + } + if (ddW) { + for (int i = 0; i < groups; i++) { + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, args2.idesc.desc(), x + i * group_offset_in, + args2.wdesc.desc(), ddw + i * group_offset_filter, + args2.cdesc.desc(), fwd_algo2, workspace_ptr, + workspace_size, &alpha, args2.odesc.desc(), + ddy + i * group_offset_out)); + }, + workspace_size); + } + } + } + + if (dW) { + ddx = ddX->data(); + for (int i = 0; i < groups; i++) { + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( + handle, &alpha, args3.idesc.desc(), ddx + i * group_offset_in, + args3.odesc.desc(), dy + i * group_offset_out, + args3.cdesc.desc(), filter_algo, workspace_ptr, + workspace_size, &beta, args3.wdesc.desc(), + dw + i * group_offset_filter)); + }, + workspace_size); + } + } + + if (dX && ddW) { + ddw = ddW->data(); + for (int i = 0; i < groups; i++) { + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( + handle, &alpha, args4.wdesc.desc(), + ddw + i * group_offset_filter, args4.odesc.desc(), + dy + i * group_offset_out, args4.cdesc.desc(), data_algo, + workspace_ptr, workspace_size, &beta, args4.idesc.desc(), + dx + i * group_offset_in)); + }, + workspace_size); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace plat = paddle::platform; +REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel); +REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNConvGradOpKernel, + paddle::operators::CUDNNConvGradOpKernel, + paddle::operators::CUDNNConvGradOpKernel); +REGISTER_OP_KERNEL( + conv2d_grad_grad, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNConvDoubleGradOpKernel, + paddle::operators::CUDNNConvDoubleGradOpKernel, + paddle::operators::CUDNNConvDoubleGradOpKernel); + +REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel); +REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNConvGradOpKernel, + paddle::operators::CUDNNConvGradOpKernel); diff --git a/paddle/fluid/operators/conv_cudnn_op_cache.h b/paddle/fluid/operators/conv_cudnn_op_cache.h new file mode 100644 index 00000000..1158dc2d --- /dev/null +++ b/paddle/fluid/operators/conv_cudnn_op_cache.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +DECLARE_uint64(conv_workspace_size_limit); +DECLARE_bool(cudnn_exhaustive_search); +DECLARE_int64(cudnn_exhaustive_search_times); + +namespace paddle { +namespace operators { + +static constexpr char kCUDNNFwdAlgoCache[] = "kCUDNNFwdAlgoCache"; +static constexpr char kCUDNNBwdDataAlgoCache[] = "kCUDNNBwdDataAlgoCache"; +static constexpr char kCUDNNBwdFilterAlgoCache[] = "kCUDNNBwdFilterAlgoCache"; + +#if CUDNN_VERSION_MIN(6, 0, 5) +static constexpr size_t kNUM_CUDNN_FWD_ALGS = CUDNN_CONVOLUTION_FWD_ALGO_COUNT; +static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT; +static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = + CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT; +#else +// cuDNN v5 has no CUDNN_CONVOLUTION_FWD_ALGO_COUNT etc. +static constexpr size_t kNUM_CUDNN_FWD_ALGS = 7; +static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 4; +static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5; +#endif + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/conv_fusion_op.cc b/paddle/fluid/operators/conv_fusion_op.cc new file mode 100644 index 00000000..23b8087e --- /dev/null +++ b/paddle/fluid/operators/conv_fusion_op.cc @@ -0,0 +1,108 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/operators/conv_op.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cudnn_helper.h" +#endif + +namespace paddle { +namespace operators { + +// This fused conv follows the equation: +// y = act ( alpha1 * conv(x) + alpha2 * z + bias ). +// here, y is Output, +// x is Input, +// z is ResidualData, +// bias is Bias +// When `split_channels` is set, y will be splitted into multiple outputs, +// each output has split_channels[i] number of channels. +class Conv2DFusionOpMaker : public Conv2DOpMaker { + protected: + void Apply() override { + AddAttr( + "activation", + "The activation type can be 'identity', 'sigmoid', 'relu', 'relu6' " + "'relux' , 'tanh', 'band_pass'") + .SetDefault("relu"); + AddAttr>( + "split_channels", + "When `split_channels` are set, there will be multiple outputs, the " + "output size is equal to the number of `split_channels`.") + .SetDefault({}); + AddOutput("Outputs", + "This Outputs is used when setting `split_channels`." + "Usually used to fuse conv with same input and same filter size, " + "padding, stride, dilation size.") + .AsDuplicable() + .AsDispensable(); + AddInput("AlgoCache", + "The cache of convolution algorithm, a RAW type variable.") + .AsDispensable(); + AddAttr( + "search_times", + "The number of exhaustive search times for convolution algorithm.") + .SetDefault(-1); + } +}; + +class Conv2DFusionOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Filter"), + "Input(Filter) of ConvOp should not be null."); + auto in_dims = ctx->GetInputDim("Input"); + auto filter_dims = ctx->GetInputDim("Filter"); + + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); + std::vector dilations = + ctx->Attrs().Get>("dilations"); + + std::vector oshape({in_dims[0], filter_dims[0]}); + for (size_t i = 0; i < strides.size(); ++i) { + oshape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], + dilations[i], paddings[i], strides[i])); + } + PADDLE_ENFORCE(ctx->HasOutput("Output"), + "Output(Output) of ConvOp should not be null."); + ctx->SetOutputDim("Output", framework::make_ddim(oshape)); + std::vector channels = + ctx->Attrs().Get>("split_channels"); + if (channels.size()) { + PADDLE_ENFORCE(ctx->HasOutputs("Outputs"), + "Output(Outputs) of ConvOp should not be null."); + std::vector oshapes; + oshapes.reserve(channels.size()); + for (size_t i = 0; i < channels.size(); ++i) { + oshapes.push_back({oshape[0], channels[i], oshape[2], oshape[3]}); + } + ctx->SetOutputsDim("Outputs", oshapes); + } + } +}; + +// TODO(qingqing): add gradient operator for conv2d_fusion + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(conv2d_fusion, ops::ConvOp, ops::Conv2DFusionOpMaker, + ops::Conv2DFusionOpInferShape, ops::ConvOpInferVarType, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc new file mode 100644 index 00000000..d1fa7b9d --- /dev/null +++ b/paddle/fluid/operators/conv_fusion_op.cu.cc @@ -0,0 +1,235 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/conv_cudnn_op_cache.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +DEFINE_int64(cudnn_exhaustive_search_times, -1, + "Exhaustive search times for cuDNN convolution, " + "default is -1, not exhaustive search"); + +namespace paddle { +namespace operators { + +#if CUDNN_VERSION >= 7100 +using Tensor = framework::Tensor; +using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; +using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; +using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; +using ScopedActivationDescriptor = platform::ScopedActivationDescriptor; +using DataLayout = platform::DataLayout; +using framework::AlgorithmsCache; + +template +using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; + +template +class CUDNNConvFusionOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* bias = ctx.Input("Bias"); + PADDLE_ENFORCE(bias, "The bias should not be null."); + auto* residual = ctx.Input("ResidualData"); + auto* output = ctx.Output("Output"); + + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + const std::string activation = ctx.Attr("activation"); + int groups = ctx.Attr("groups"); + int64_t user_workspace_size = + static_cast(ctx.Attr("workspace_size_MB")); + bool exhaustive_search = + FLAGS_cudnn_exhaustive_search || ctx.Attr("exhaustive_search"); + + const T* input_data = input->data(); + const T* filter_data = filter->data(); + const T* bias_data = bias->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + const T* residual_data = residual ? residual->data() : output_data; + + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor input_desc; + ScopedTensorDescriptor output_desc; + ScopedFilterDescriptor filter_desc; + ScopedTensorDescriptor bias_desc; + ScopedConvolutionDescriptor conv_desc; + ScopedActivationDescriptor act_desc; + DataLayout layout = DataLayout::kNCHW; + if (input->dims().size() == 5) { + layout = DataLayout::kNCDHW; + } + + cudnnConvolutionDescriptor_t cudnn_conv_desc = + conv_desc.descriptor(paddings, strides, dilations); + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount( + cudnn_conv_desc, groups)); + + cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, framework::vectorize2int(input->dims())); + cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, framework::vectorize2int(output->dims())); + cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor( + layout, framework::vectorize2int(filter->dims())); + // Now only support NCHW + std::vector bias_dim = {1, static_cast(output->dims()[1]), 1, 1}; + cudnnTensorDescriptor_t cudnn_bias_desc = + bias_desc.descriptor(layout, bias_dim); + cudnnActivationDescriptor_t cudnn_act_desc = + act_desc.descriptor(activation); + + // ------------------- cudnn conv workspace --------------------- + size_t workspace_size_in_bytes; // final workspace to allocate. + size_t workspace_size_limit = 0; + if (FLAGS_conv_workspace_size_limit > 0 || user_workspace_size > 0) { + int64_t max_user_size = + std::min(static_cast(FLAGS_conv_workspace_size_limit), + user_workspace_size); + workspace_size_limit = max_user_size * 1024 * 1024; + } + + // ------------------- cudnn conv algorithm --------------------- + cudnnConvolutionFwdAlgo_t algo; + auto handle = dev_ctx.cudnn_handle(); + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + + CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType( + cudnn_conv_desc, CUDNN_DEFAULT_MATH)); + + auto x_dims = framework::vectorize(input->dims()); + auto f_dims = framework::vectorize(filter->dims()); + if (!exhaustive_search) { + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( + handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, + cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &algo)); + VLOG(3) << "cuDNN forward algo " << algo; + } else { + auto search_func = [&]() { + int returned_algo_count; + std::array + fwd_perf_stat; + auto cudnn_find_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE( + platform::dynload::cudnnFindConvolutionForwardAlgorithmEx( + handle, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, cudnn_output_desc, output_data, + kNUM_CUDNN_FWD_ALGS, &returned_algo_count, + fwd_perf_stat.data(), cudnn_workspace, workspace_size_limit)); + }; + workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit); + VLOG(3) << "Perf result: (algo: stat, time, memory)"; + for (int i = 0; i < returned_algo_count; ++i) { + const auto& stat = fwd_perf_stat[i]; + VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time << " " + << stat.memory; + } + return fwd_perf_stat[0].algo; + }; + AlgorithmsCache& algo_cache = + ctx.GetKernelConfig>(0); + int search_times = ctx.Attr("search_times"); + search_times = std::max( + static_cast(FLAGS_cudnn_exhaustive_search_times), search_times); + // TODO(dangqingqing): Unify this if-else. + if (search_times > 0) { + // The searched algo will be cached by `search_times` times for + // different input dimension. For other dimensions, select the algo + // of closest area. + algo = algo_cache.GetAlgorithm(x_dims[2] * x_dims[3], search_times, 0, + search_func); + } else { + algo = algo_cache.GetAlgorithm(x_dims, f_dims, strides, paddings, + dilations, 0, search_func); + } + VLOG(3) << "choose algo " << algo; + } + + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( + handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, + cudnn_output_desc, algo, &workspace_size_in_bytes)); + PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit, + "workspace_size to be allocated exceeds the limit"); + + if ((activation == "identity") && (!residual)) { + // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is + // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib. + // But test in some case, the speed is slower, change to use + // cudnnConvolutionForward and cudnnAddTensor + // ------------- cudnn conv forward and bias add --------------------- + ScalingParamType alpha = 1.0f, beta = 0.0f; + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, algo, cudnn_workspace, + workspace_size_in_bytes, &beta, cudnn_output_desc, output_data)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); + CUDNN_ENFORCE(platform::dynload::cudnnAddTensor( + handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc, + output_data)); + } else { + if (activation == "identity") { + algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; + } + // ------------------- cudnn conv+bias+act forward -------------------- + ScalingParamType alpha1 = 1.0f; + ScalingParamType alpha2 = residual ? 1.0f : 0.0f; + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( + handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, algo, cudnn_workspace, + workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data, + cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc, + output_data)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); + } + std::vector channels = ctx.Attr>("split_channels"); + if (channels.size()) { + auto outs = ctx.MultiOutput("Outputs"); + if (x_dims[0] == 1) { + // share data with Output + framework::Tensor t; + t.ShareDataWith(*output); + auto y_dims = output->dims(); + t.Resize({y_dims[1], y_dims[2], y_dims[3]}); + int s = 0; + for (size_t i = 0; i < channels.size(); ++i) { + int e = s + channels[i]; + outs[i]->ShareDataWith(t.Slice(s, e)); + outs[i]->Resize({x_dims[0], channels[i], y_dims[2], y_dims[3]}); + s = e; + } + } else { + // TODO(qingiqng): do copy when batch size large than 1 + PADDLE_THROW("Batch size greater than 1 is Unsupported"); + } + } + } +}; +#endif + +} // namespace operators +} // namespace paddle + +#if CUDNN_VERSION >= 7100 +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(conv2d_fusion, ops::CUDNNConvFusionOpKernel, + ops::CUDNNConvFusionOpKernel); +#endif diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc new file mode 100644 index 00000000..d2036c61 --- /dev/null +++ b/paddle/fluid/operators/conv_op.cc @@ -0,0 +1,652 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/conv_op.h" + +#include +#include +#include + +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/operators/conv_cudnn_op_cache.h" +#include "paddle/fluid/platform/cudnn_helper.h" +#endif +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif +#include "paddle/fluid/platform/cudnn_workspace_helper.h" + +namespace paddle { +namespace operators { + +void ConvOp::InferShape(framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Filter"), + "Input(Filter) of ConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Output"), + "Output(Output) of ConvOp should not be null."); + + auto in_dims = ctx->GetInputDim("Input"); + auto filter_dims = ctx->GetInputDim("Filter"); + + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); + int groups = ctx->Attrs().Get("groups"); + std::vector dilations = ctx->Attrs().Get>("dilations"); + + PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5, + "Conv intput should be 4-D or 5-D tensor, get %u", + in_dims.size()); + + PADDLE_ENFORCE_EQ( + in_dims.size(), filter_dims.size(), + "Conv input dimension and filter dimension should be the same."); + PADDLE_ENFORCE( + in_dims.size() - strides.size() == 2U, + "Conv input dimension and strides dimension should be consistent."); + PADDLE_ENFORCE_EQ( + paddings.size(), strides.size(), + "Conv paddings dimension and Conv strides dimension should be the same."); + + PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[1] * groups, + "The number of input channels should be equal to filter " + "channels * groups."); + PADDLE_ENFORCE_EQ( + filter_dims[0] % groups, 0, + "The number of output channels should be divided by groups."); + + std::vector output_shape({in_dims[0], filter_dims[0]}); + for (size_t i = 0; i < strides.size(); ++i) { + if ((!ctx->IsRuntime()) && + (in_dims[i + 2] <= 0 || filter_dims[i + 2] <= 0)) { + output_shape.push_back(-1); + } else { + output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], + dilations[i], paddings[i], + strides[i])); + } + } + ctx->SetOutputDim("Output", framework::make_ddim(output_shape)); + ctx->ShareLoD("Input", "Output"); +} + +framework::OpKernelType ConvOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + int customized_type_value = + framework::OpKernelType::kDefaultCustomizedTypeValue; + framework::LibraryType library{framework::LibraryType::kPlain}; + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + auto input_data_type = ctx.Input("Input")->type(); + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout = framework::StringToDataLayout(data_format); + +#ifdef PADDLE_WITH_CUDA + if (platform::CanCUDNNBeUsed(ctx)) { + library = framework::LibraryType::kCUDNN; + } +#endif +#ifdef PADDLE_WITH_MKLDNN + if (library == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; + customized_type_value = + (input_data_type == framework::DataTypeTrait::DataType() || + input_data_type == framework::DataTypeTrait::DataType()) + ? kConvMKLDNNINT8 + : kConvMKLDNNFP32; + } +#endif + + if (input_data_type != framework::proto::VarType::INT8 && + input_data_type != framework::proto::VarType::UINT8) { + auto filter_data_type = ctx.Input("Filter")->type(); + PADDLE_ENFORCE_EQ(input_data_type, filter_data_type, + "input and filter data type should be consistent"); + } + if (input_data_type == framework::proto::VarType::FP16) { + PADDLE_ENFORCE_EQ(library, framework::LibraryType::kCUDNN, + "float16 can only be used when CUDNN is used"); + } + + auto type = framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, + library, customized_type_value); +#ifdef PADDLE_WITH_CUDA + std::vector& configs = kernel_configs_map_[type]; + // TODO(dangqingqing): Currently conv_fusion_op use cudnn but sets use_cudnn + // to false. It should be fixed and then here should only create if library + // is kCUDNN. + if (configs.empty()) { + std::shared_ptr> p( + new framework::AlgorithmsCache()); + configs.push_back(p); + } +#endif + return type; +} + +void Conv2DOpMaker::Make() { + AddAttr("is_test", + "(bool, default false) Set to true for inference only, false " + "for training. Some layers may run faster when this is true.") + .SetDefault(false); + AddInput( + "Input", + "(Tensor) The input tensor of convolution operator. " + "The format of input tensor is NCHW, where N is batch size, C is the " + "number of channels, H is the height of the feature, " + "and W is the width of the feature."); + AddInput("Filter", + "(Tensor) The filter tensor of convolution operator. " + "The format of the filter tensor is MCHW, where M is the number of " + "output image channels, C is the number of input image channels, " + "H is the height of the filter, and W is the width of the filter. " + "If the groups attribute is greater than 1, C equals the number of " + "input image channels divided by the groups."); + AddInput("Bias", + "(Tensor) Bias to be added to each output of filter application." + "The format of output tensor is X (one-dimensional) of size equal" + "to the number of output channels. Only used with MKL-DNN.") + .AsDispensable(); + AddInput("ResidualData", + "(Tensor) Tensor with residual data " + "to which convolution output will be added." + "Used with fuse_residual_connection fusion.") + .AsDispensable(); + AddOutput("Output", + "(Tensor) The output tensor of convolution operator. " + "The format of output tensor is also NCHW."); + AddAttr>("strides", + "(vector default:{1, 1}), the " + "strides(h_stride, w_stride) of " + "convolution operator.") + .SetDefault({1, 1}); + AddAttr>("paddings", + "(vector default:{0, 0}), the " + "paddings(h_pad, w_pad) of " + "convolution operator.") + .SetDefault({0, 0}); + AddAttr( + "groups", + "(int default:1), the groups number of the convolution operator. " + "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: " + "when group=2, the first half of the filters is only connected to the " + "first half of the input channels, while the second half of the filters " + "is only connected to the second half of the input channels.") + .SetDefault(1); + AddAttr>("dilations", + "(vector default:{1, 1}), the " + "dilations(h_dilation, w_dilation) of " + "convolution operator.") + .SetDefault({1, 1}); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(false); + AddAttr("fuse_relu_before_depthwise_conv", + "(bool, default false) Only used in cuda depthwise kernel") + .SetDefault(false); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr("use_quantizer", + "(bool, default false) " + "Set to true for operators that should be quantized and use " + "int8 kernel. " + "Only used on CPU.") + .SetDefault(false); + AddAttr("fuse_relu", "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr("fuse_brelu", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr("fuse_brelu_threshold", + "(float, default false 6.0) Only used in mkldnn kernel") + .SetDefault(6.0f); + AddAttr("fuse_residual_connection", + "(bool, default false) Only used in mkldnn kernel. Used " + "whenever convolution output is as an input to residual " + "connection.") + .SetDefault(false); + AddAttr("Scale_in", + "Scale_in to be used for int8 input data." + "Only used with MKL-DNN INT8.") + .SetDefault(1.0f); + AddAttr("Scale_out", + "Scale_out to be used for int8 output data." + "Only used with MKL-DNN INT8.") + .SetDefault(1.0f); + AddAttr("Scale_in_eltwise", + "Scale_in_eltwise to be used for int8 eltwise input data." + "Only used with MKL-DNN INT8.") + .SetDefault(1.0f); + AddAttr>("Scale_weights", + "Scale_weights to be used for int8 weights data." + "Only used with MKL-DNN INT8.") + .SetDefault({1.0f}); + AddAttr("force_fp32_output", + "(bool, default false) Force INT8 kernel output FP32, only " + "used in MKL-DNN INT8") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + // TODO(dzhwinter): need to registered layout transform function + AddAttr("workspace_size_MB", + "Only used in cudnn kernel. Need set use_cudnn to true." + "workspace size for cudnn, in MB, " + "workspace is a section of GPU memory which will be " + "allocated/freed each time the operator runs, larger " + "workspace size can increase performance but also requires " + "better hardware. This size should be chosen carefully.") + .SetDefault(platform::kDefaultConvWorkspaceSizeLimitMB); + AddAttr("exhaustive_search", + "(bool, default false) cuDNN has many algorithm to calculation " + "convolution, whether enable exhaustive search " + "for cuDNN convolution or not, default is False.") + .SetDefault(false); + AddComment(R"DOC( +Convolution Operator. + +The convolution operation calculates the output based on the input, filter +and strides, paddings, dilations, groups parameters. The size of each dimension of the +parameters is checked in the infer-shape. +Input(Input) and Output(Output) are in NCHW format. Where N is batch +size, C is the number of channels, H is the height of the feature, and W is +the width of the feature. +Filters(Input) is MCHW format. Where M is the number of output image channels, C is +the number of input image channels, H is the height of the filter, and W +is the width of the filter. +Parameters(strides, paddings, dilations) are two elements. These two elements represent +height and width, respectively. +The input(X) size and output(Out) size may be different. + +Example: + Input: + Input shape: $(N, C_{in}, H_{in}, W_{in})$ + Filter shape: $(C_{out}, C_{in}, H_f, W_f)$ + Output: + Output shape: $(N, C_{out}, H_{out}, W_{out})$ + Where +$$ + H_{out}= \frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]}+ 1 \\ + W_{out}= \frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]}+ 1 +$$ +)DOC"); + Apply(); +} + +void Conv3DOpMaker::Make() { + AddAttr("is_test", + "(bool, default false) Set to true for inference only, false " + "for training. Some layers may run faster when this is true.") + .SetDefault(false); + AddInput( + "Input", + "(Tensor) The input tensor of convolution operator. " + "The format of input tensor is NCDHW. Where N is batch size, C is the " + "number of channels, D is the depth of the feature, H is the height of " + "the feature, " + "and W is the width of the feature."); + AddInput("Filter", + "(Tensor) The filter tensor of convolution operator. " + "The format of the filter tensor is MCDHW, where M is the number of " + "output image channels, C is the number of input image channels, " + "D is the depth of the filter, H is the height of the filter, and W " + "is the width of the filter." + "If the groups attribute is greater than 1, C equals the number of " + "input image channels divided by the groups."); + AddInput("ResidualData", + "(Tensor) Tensor with residual data " + "to which convolution output will be added." + "Used with fuse_residual_connection fusion.") + .AsDispensable(); + AddOutput("Output", + "(Tensor) The output tensor of convolution operator." + "The format of output tensor is also NCDHW."); + AddAttr>("strides", + "(vector, default:{1, 1, 1}), the " + "strides(d_stride, h_stride, w_stride) of " + "convolution operator.") + .SetDefault({1, 1, 1}); + AddAttr>("paddings", + "(vector, default:{0, 0, 0}), the " + "paddings(d_pad, h_pad, w_pad) of convolution " + "operator.") + .SetDefault({0, 0, 0}); + AddAttr( + "groups", + "(int default:1), the groups number of the convolution operator. " + "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: " + "when group=2, the first half of the filters is only connected to the " + "first half of the input channels, while the second half of the filters " + "is only connected to the second half of the input channels.") + .SetDefault(1); + AddAttr>("dilations", + "(vector default:{1, 1, 1}), the " + "dilations(d_dilation, h_dilation, w_dilation) of " + "convolution operator.") + .SetDefault({1, 1, 1}); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(false); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr("fuse_relu", "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr("fuse_residual_connection", + "(bool, default false) Only used in mkldnn kernel. Used " + "whenever convolution output is as an input to residual " + "connection.") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + AddAttr("force_fp32_output", + "(bool, default false) Only used in mkldnn INT8 kernel") + .SetDefault(false); + // TODO(dzhwinter): need to registered layout transform function + AddAttr("workspace_size_MB", + "Only used in cudnn kernel. workspace size for cudnn, in MB, " + "workspace is a section of GPU memory which will be " + "allocated/freed each time the operator runs, larger " + "workspace size can increase performance but also requires " + "better hardware. This size should be chosen carefully.") + .SetDefault(platform::kDefaultConvWorkspaceSizeLimitMB); + AddAttr("exhaustive_search", + "(bool, default false) cuDNN has many algorithm to calculation " + "convolution, whether enable exhaustive search " + "for cuDNN convolution or not, default is False.") + .SetDefault(false); + AddComment(R"DOC( +Convolution3D Operator. + +The convolution operation calculates the output based on the input, filter +and strides, paddings, dilations, groups parameters. The size of each dimension of the +parameters is checked in the infer-shape. +Input(Input) and output(Output) are in NCDHW format, where N is batch +size, C is the number of channels,D is the depth of the feature, H is the height of +the feature, and W is the width of the feature. +Filters(Input) is MCDHW format, where M is the number of output image channels, +C is the number of input image channels, D is the depth of the filter, +H is the height of the filter, and W is the width of the filter. +Parameters(strides, paddings, dilations) are three elements. These three elements +represent depth, height and width, respectively. +The input(X) size and output(Out) size may be different. + +Example: + Input: + Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$ + Filter shape: $(C_{out}, C_{in}, D_f, H_f, W_f)$ + Output: + Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$ + Where + $$ + D_{out}= \frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{ strides[0]}+ 1 \\ + H_{out}= \frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{ strides[1]}+ 1 \\ + W_{out}= \frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{ strides[2]}+ 1 + $$ +)DOC"); + Apply(); +} + +void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const { + auto in_dims = ctx->GetInputDim("Input"); + auto filter_dims = ctx->GetInputDim("Filter"); + if (ctx->HasOutput(framework::GradVarName("Input"))) { + ctx->SetOutputDim(framework::GradVarName("Input"), in_dims); + } + if (ctx->HasOutput(framework::GradVarName("Filter"))) { + ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims); + } +} + +framework::OpKernelType ConvOpGrad::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + int customized_type_value = + framework::OpKernelType::kDefaultCustomizedTypeValue; + framework::LibraryType library_{framework::LibraryType::kPlain}; + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + +#ifdef PADDLE_WITH_CUDA + if (platform::CanCUDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kCUDNN; + } +#endif +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; + customized_type_value = kConvMKLDNNFP32; + } +#endif + + auto type = framework::OpKernelType(ctx.Input("Input")->type(), + ctx.GetPlace(), layout_, library_, + customized_type_value); +#ifdef PADDLE_WITH_CUDA + if (library_ == framework::LibraryType::kCUDNN) { + std::vector& configs = kernel_configs_map_[type]; + if (configs.empty()) { + std::shared_ptr> + p(new framework::AlgorithmsCache()); + configs.push_back(p); + + std::shared_ptr< + framework::AlgorithmsCache> + p2(new framework::AlgorithmsCache()); + configs.push_back(p2); + } + } +#endif + return type; +} + +class Conv2DGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType(this->ForwardOpType() + "_grad"); + op->SetInput("Input", Input("Input")); + op->SetInput("Filter", Input("Filter")); + op->SetInput("Bias", Input("Bias")); + op->SetInput(framework::GradVarName("Output"), OutputGrad("Output")); + + op->SetOutput(framework::GradVarName("Input"), InputGrad("Input")); + op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter")); + op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias")); + op->SetAttrMap(Attrs()); + + return std::unique_ptr(op); + } +}; + +class Conv3DGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType(this->ForwardOpType() + "_grad"); + op->SetInput("Input", Input("Input")); + op->SetInput("Filter", Input("Filter")); + op->SetInput(framework::GradVarName("Output"), OutputGrad("Output")); + + op->SetOutput(framework::GradVarName("Input"), InputGrad("Input")); + op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter")); + + if (ForwardOp().Inputs().count("ResidualData") != 0) { + op->SetInput("ResidualData", Input("ResidualData")); + } + + op->SetAttrMap(Attrs()); + + return std::unique_ptr(op); + } +}; + +/* + * Inputs: I, W, dO, ddI, ddW + * Outputs: ddO, dW, dI + */ +class Conv2DDoubleGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType(this->ForwardOpType() + "_grad"); + // I, W, dO, ddI, ddW + op->SetInput("Input", Input("Input")); + op->SetInput("Filter", Input("Filter")); + op->SetInput("DOutput", Input(framework::GradVarName("Output"))); + op->SetInput("DDInput", OutputGrad(framework::GradVarName("Input"))); + op->SetInput("DDFilter", OutputGrad(framework::GradVarName("Filter"))); + + // ddO, dI, dW + // Unlike grad op, double grad op does not use name@GRAD@GRAD + // as key of ops' inputs and outputs. + auto ddx = OutputGrad(framework::GradVarName("Input")); + auto ddw = OutputGrad(framework::GradVarName("Filter")); + std::vector empty_str = {}; + + op->SetOutput( + "DDOutput", + ddx.empty() ? empty_str : InputGrad(framework::GradVarName("Output"))); + op->SetOutput("DFilter", ddx.empty() ? empty_str : InputGrad("Filter")); + op->SetOutput("DInput", ddw.empty() ? empty_str : InputGrad("Input")); + + op->SetAttrMap(Attrs()); + + return std::unique_ptr(op); + } +}; + +void ConvOpDoubleGrad::InferShape(framework::InferShapeContext* ctx) const { + auto x_dims = ctx->GetInputDim("Input"); + auto w_dims = ctx->GetInputDim("Filter"); + auto do_dims = ctx->GetInputDim("DOutput"); + + if (ctx->HasOutput("DDOutput") && ctx->HasInput("DDInput")) { + ctx->SetOutputDim("DDOutput", do_dims); + } + if (ctx->HasOutput("DFilter") && ctx->HasInput("DDInput")) { + ctx->SetOutputDim("DFilter", w_dims); + } + if (ctx->HasOutput("DInput") && ctx->HasInput("DDFilter")) { + ctx->SetOutputDim("DInput", x_dims); + } +} + +framework::OpKernelType ConvOpDoubleGrad::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + int customized_type_value = + framework::OpKernelType::kDefaultCustomizedTypeValue; + framework::LibraryType library_{framework::LibraryType::kPlain}; + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + +#ifdef PADDLE_WITH_CUDA + if (platform::CanCUDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kCUDNN; + } else { + PADDLE_THROW("Now ConvDoubleGrad only supports cuDNN."); + } +#endif + auto type = framework::OpKernelType(ctx.Input("Input")->type(), + ctx.GetPlace(), layout_, library_, + customized_type_value); +#ifdef PADDLE_WITH_CUDA + if (library_ == framework::LibraryType::kCUDNN) { + std::vector& configs = kernel_configs_map_[type]; + if (configs.empty()) { + std::shared_ptr> p0( + new framework::AlgorithmsCache()); + configs.push_back(p0); + + std::shared_ptr< + framework::AlgorithmsCache> + p1(new framework::AlgorithmsCache()); + configs.push_back(p1); + + std::shared_ptr> + p2(new framework::AlgorithmsCache()); + configs.push_back(p2); + } + } +#endif + return type; +} + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(conv2d, ops::ConvOp, ops::Conv2DOpMaker, + ops::ConvOpInferVarType, ops::Conv2DGradMaker); +REGISTER_OPERATOR(conv2d_grad, ops::ConvOpGrad, ops::Conv2DDoubleGradMaker); +REGISTER_OPERATOR(conv2d_grad_grad, ops::ConvOpDoubleGrad); + +// depthwise convolution op +REGISTER_OPERATOR(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker, + ops::ConvOpInferVarType, ops::Conv2DGradMaker); +REGISTER_OPERATOR(depthwise_conv2d_grad, ops::ConvOpGrad); + +REGISTER_OPERATOR(conv3d, ops::ConvOp, ops::Conv3DOpMaker, + ops::ConvOpInferVarType, ops::Conv3DGradMaker); +REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad); + +// depthwise conv kernel +// TODO(xingzhaolong): neon kernel for mobile +REGISTER_OP_CPU_KERNEL( + depthwise_conv2d, + ops::GemmConvKernel, + ops::GemmConvKernel); + +REGISTER_OP_CPU_KERNEL( + depthwise_conv2d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); + +REGISTER_OP_CPU_KERNEL( + conv2d, ops::GemmConvKernel, + ops::GemmConvKernel); +REGISTER_OP_CPU_KERNEL( + conv2d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); + +REGISTER_OP_CPU_KERNEL( + conv3d, ops::GemmConvKernel, + ops::GemmConvKernel); +REGISTER_OP_CPU_KERNEL( + conv3d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); diff --git a/paddle/fluid/operators/conv_op.cu.cc b/paddle/fluid/operators/conv_op.cu.cc new file mode 100644 index 00000000..d07593f5 --- /dev/null +++ b/paddle/fluid/operators/conv_op.cu.cc @@ -0,0 +1,43 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/conv_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + depthwise_conv2d, + ops::DepthwiseConvKernel, + ops::DepthwiseConvKernel); + +REGISTER_OP_CUDA_KERNEL( + depthwise_conv2d_grad, + ops::DepthwiseConvGradKernel, + ops::DepthwiseConvGradKernel); + +REGISTER_OP_CUDA_KERNEL( + conv2d, ops::GemmConvKernel, + ops::GemmConvKernel); +REGISTER_OP_CUDA_KERNEL( + conv2d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); + +REGISTER_OP_CUDA_KERNEL( + conv3d, ops::GemmConvKernel, + ops::GemmConvKernel); +REGISTER_OP_CUDA_KERNEL( + conv3d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h new file mode 100644 index 00000000..4df47ef2 --- /dev/null +++ b/paddle/fluid/operators/conv_op.h @@ -0,0 +1,485 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/depthwise_conv.h" +#include "paddle/fluid/operators/math/im2col.h" +#include "paddle/fluid/operators/math/vol2col.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +constexpr int kConvMKLDNNFP32 = 1; +constexpr int kConvMKLDNNINT8 = 2; +constexpr int MaxKeyLength = 256; + +// Base convolution operator definations for other conv +// like operators to reuse the implementation. +inline int ConvOutputSize(int input_size, int filter_size, int dilation, + int padding, int stride) { + const int dkernel = dilation * (filter_size - 1) + 1; + int output_size = (input_size + 2 * padding - dkernel) / stride + 1; + PADDLE_ENFORCE( + output_size > 0, + "Due to the settings of padding(%d), filter_size(%d), dilation(%d) and " + "stride(%d), the output size is less than 0, please check " + "again. Input_size:%d", + padding, filter_size, dilation, stride, input_size); + + return output_size; +} +inline bool IsExpand(const std::vector& filter_dim, + const std::vector& strides, + const std::vector& paddings, + const std::vector& dilations) { + bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true; + for (size_t j = 0; j < strides.size(); ++j) { + filter_1 = filter_1 && (static_cast(filter_dim[j + 2]) == 1); + strides_1 = strides_1 && (strides[j] == 1); + padding_0 = padding_0 && (paddings[j] == 0); + dilation_1 = dilation_1 && (dilations[j] == 1); + } + return !(filter_1 && strides_1 && padding_0 && dilation_1); +} + +// Define Op classes in .h file so that other conv +// operator implementations can reuse the code. +class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() final; + + protected: + virtual void Apply() {} +}; + +class Conv3DOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() final; + + protected: + virtual void Apply() {} +}; + +class ConvOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput { + protected: + std::unordered_map GetInputOutputWithSameType() + const override { + return std::unordered_map{ + {"Input", /*->*/ "Output"}}; + } +}; + +class ConvOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class ConvOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class ConvOpDoubleGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +template +class GemmConvKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + // The filter will be reshaped in the calculations, + // so here use an assignment operation, + // that avoids modifying the variable in the Scope. + Tensor filter = *context.Input("Filter"); + Tensor* output = context.Output("Output"); + output->mutable_data(context.GetPlace()); + + int groups = context.Attr("groups"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + + auto& dev_ctx = context.template device_context(); + + const int batch_size = static_cast(input->dims()[0]); + + // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w} + std::vector output_shape_vec(framework::vectorize(output->dims())); + + // use col_shape in the im2col calculation + // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d, + // o_h, o_w} + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + // use col_matrix_shape in the gemm calculation + // size: (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d * + // o_h * o_w) + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + // col_matrix shares the same piece of data with col, + // but will be reshaped into a two-dimensional matrix shape + // to call the matrix multiplication interface. + Tensor col_matrix; + if (is_expand) { + col = context.AllocateTmpTensor(col_shape, dev_ctx); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + framework::DDim input_shape = + framework::slice_ddim(input->dims(), 1, input->dims().size()); + + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + + framework::DDim output_matrix_shape = { + output->dims()[1], + output->numel() / (output->dims()[0] * output->dims()[1])}; + + // convolution operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + + auto blas = math::GetBlas(dev_ctx); + for (int i = 0; i < batch_size; i++) { + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); + + for (int g = 0; g < groups; g++) { + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(dev_ctx, in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); + } + + // gemm + Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + blas.MatMul(filter_slice, false, col_matrix, false, T(1.0), &out_slice, + T(0.0)); + } + } + } +}; + +template +class GemmConvGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + const Tensor* output_grad = + context.Input(framework::GradVarName("Output")); + Tensor* input_grad = + context.Output(framework::GradVarName("Input")); + Tensor* filter_grad = + context.Output(framework::GradVarName("Filter")); + // The filter and filter_grad will be reshaped in the calculations, + // so here use an assignment operation, + // that avoids modifying the variable in the Scope. + Tensor filter = *context.Input("Filter"); + + if (!input_grad && !filter_grad) return; + + int groups = context.Attr("groups"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + + const int batch_size = static_cast(input->dims()[0]); + + auto& dev_ctx = context.template device_context(); + + // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w} + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w} + std::vector output_shape_vec( + framework::vectorize(output_grad->dims())); + + // use col_shape in the im2col calculation + // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d, + // o_h, o_w} + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = input->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + + // use col_matrix_shape in the gemm calculation + // size: (i_c/g * k_h * k_w, o_h * o_w) + // or + // (i_c/g * k_d * k_h * k_w, o_d * o_h * o_w) + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + + framework::DDim input_shape = + framework::slice_ddim(input->dims(), 1, input->dims().size()); + + framework::DDim filter_matrix_shape = {filter.dims()[0], + filter.numel() / filter.dims()[0]}; + filter.Resize(filter_matrix_shape); + + framework::DDim output_matrix_shape = { + output_grad->dims()[1], + output_grad->numel() / + (output_grad->dims()[0] * output_grad->dims()[1])}; + + // convolution backward input operator: gemm + col2im(or col2vol) + // convolution backward weight operator: im2col(or vol2col) + gemm + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output_grad->dims()[1]) / groups; + + bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + // col_matrix shares the same piece of data with col, + // but will be reshaped into a two-dimensional matrix shape + // to call the matrix multiplication interface. + Tensor col_matrix; + if (is_expand) { + col = context.AllocateTmpTensor(col_shape, dev_ctx); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + math::SetConstant set_zero; + auto blas = math::GetBlas(dev_ctx); + + if (input_grad) { + input_grad->mutable_data(context.GetPlace()); + + // if is_expand is false, the operation of set_zero is unnecessary, + // because math::matmul will reset input_grad. + if (is_expand) { + set_zero(dev_ctx, input_grad, static_cast(0)); + } + math::Col2VolFunctor col2vol; + math::Col2ImFunctor col2im; + + for (int i = 0; i < batch_size; i++) { + Tensor out_grad_batch = + output_grad->Slice(i, i + 1).Resize(output_matrix_shape); + Tensor in_grad_batch = input_grad->Slice(i, i + 1).Resize(input_shape); + for (int g = 0; g < groups; g++) { + // gemm + Tensor out_grad_slice = + out_grad_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); + + Tensor in_grad_slice = + in_grad_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col_matrix.ShareDataWith(in_grad_slice); + col_matrix.Resize(col_matrix_shape); + } + blas.MatMul(filter_slice, true, out_grad_slice, false, T(1.0), + &col_matrix, T(0.0)); + + if (is_expand && data_dim == 2U) { + col2im(dev_ctx, col, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &in_grad_slice); + } else if (is_expand && data_dim == 3U) { + col2vol(dev_ctx, col, dilations, strides, paddings, &in_grad_slice); + } + } + } + } + + if (filter_grad) { + filter_grad->mutable_data(context.GetPlace()); + Tensor filter_grad_ = *filter_grad; + filter_grad_.Resize(filter_matrix_shape); + set_zero(dev_ctx, filter_grad, static_cast(0)); + math::Im2ColFunctor im2col; + math::Vol2ColFunctor vol2col; + for (int i = 0; i < batch_size; i++) { + Tensor out_grad_batch = + output_grad->Slice(i, i + 1).Resize(output_matrix_shape); + Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); + for (int g = 0; g < groups; g++) { + // im2col + Tensor out_grad_slice = + out_grad_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step); + + if (!is_expand) { + col.ShareDataWith(in_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + im2col(dev_ctx, in_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); + } + + // gemm + Tensor filter_grad_slice = + filter_grad_.Slice(g * out_step, (g + 1) * out_step); + blas.MatMul(out_grad_slice, false, col_matrix, true, T(1.0), + &filter_grad_slice, T(1.0)); + } + } + } + } +}; + +template +class DepthwiseConvKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + Tensor filter = *context.Input("Filter"); + Tensor* output = context.Output("Output"); + output->mutable_data(context.GetPlace()); + + PADDLE_ENFORCE_EQ( + output->dims()[1] % input->dims()[1], 0, + "The output channels must be a multiple of the input channels"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + bool fuse_relu = context.Attr("fuse_relu_before_depthwise_conv"); + auto& dev_ctx = context.template device_context(); + + if (fuse_relu) { + math::DepthwiseConvFunctor depthwiseConv; + depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, + output); + } else { + math::DepthwiseConvFunctor depthwiseConv; + depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, + output); + } + } +}; + +template +class DepthwiseConvGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + const Tensor* output_grad = + context.Input(framework::GradVarName("Output")); + Tensor* input_grad = + context.Output(framework::GradVarName("Input")); + Tensor* filter_grad = + context.Output(framework::GradVarName("Filter")); + Tensor filter = *context.Input("Filter"); + + if (!input_grad && !filter_grad) return; + + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + bool fuse_relu = context.Attr("fuse_relu_before_depthwise_conv"); + + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + + if (input_grad) { + input_grad->mutable_data(context.GetPlace()); + set_zero(dev_ctx, input_grad, static_cast(0)); + + if (fuse_relu) { + math::DepthwiseConvInputGradFunctor + depthwiseConvInputGrad; + depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, + paddings, dilations, input_grad); + } else { + math::DepthwiseConvInputGradFunctor + depthwiseConvInputGrad; + depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, + paddings, dilations, input_grad); + } + } + + if (filter_grad) { + filter_grad->mutable_data(context.GetPlace()); + set_zero(dev_ctx, filter_grad, static_cast(0)); + if (fuse_relu) { + math::DepthwiseConvFilterGradFunctor + depthwiseConvFilterGrad; + depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, + paddings, dilations, filter_grad); + } else { + math::DepthwiseConvFilterGradFunctor + depthwiseConvFilterGrad; + depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, + paddings, dilations, filter_grad); + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/conv_shift_op.cc b/paddle/fluid/operators/conv_shift_op.cc new file mode 100644 index 00000000..fa4edb70 --- /dev/null +++ b/paddle/fluid/operators/conv_shift_op.cc @@ -0,0 +1,205 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/conv_shift_op.h" +#include "paddle/fluid/framework/eigen.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; +template +using EigenMatrix = framework::EigenMatrix; + +class ConvShiftOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null."); + + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); + PADDLE_ENFORCE_EQ(y_dims.size(), 2, "Input(Y)'s rank should be 2."); + if (ctx->IsRuntime() || (x_dims[0] > 0 && y_dims[0] > 0)) + PADDLE_ENFORCE_EQ(x_dims[0], y_dims[0], + "The 1st dimension of Input(X) and Input(Y) should " + "be equal."); + if (ctx->IsRuntime() || y_dims[1] > 0) + PADDLE_ENFORCE_EQ(y_dims[1] % 2, 1, + "The 2nd dimension of Input(Y) should be odd."); + if (ctx->IsRuntime() || (x_dims[1] > 0 && y_dims[1] > 0)) + PADDLE_ENFORCE_LE(y_dims[1], x_dims[1], + "The 2nd dimension of Input(Y) should be less than or " + "equal to the 2nd dimension of Input(X)."); + ctx->ShareDim("X", /*->*/ "Out"); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class ConvShiftGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should be not null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should be not null."); + + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + auto x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim(x_grad_name, x_dims); + } + + auto y_grad_name = framework::GradVarName("Y"); + if (ctx->HasOutput(y_grad_name)) { + auto y_dims = ctx->GetInputDim("Y"); + ctx->SetOutputDim(y_grad_name, y_dims); + } + } +}; + +class ConvShiftOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor, default Tensor), a 2-D tensor with shape B x M, " + "where B is the batch size and M is the data dimension."); + AddInput("Y", + "(Tensor, default Tensor), a 2-D tensor with shape B x N, " + "where B is the batch size and N is the data dimension. N must " + "be odd."); + AddOutput("Out", + "(Tensor, default Tensor), a 2-D tensor with shape B x M, " + "i.e., the same shape as X."); + AddComment(R"DOC( +ConvShift Operator. + +A layer for circular convolution of two vectors, +as used in the Neural Turing Machine: https://arxiv.org/abs/1410.5401 + +The equation is: + +$$Out[i] = \sum_{j=-(N-1)/2}^{(N-1)/2} X_{i+j} * Y_{j}$$ + +where X's index is computed modulo M, and Y's index is computed modulo N. + +Both inputs X and Y can carry LoD (Level of Details) information. +However, the output only shares the LoD information with input X. + +)DOC"); + } +}; + +template +class ConvShiftKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *X = context.Input("X"); + auto *Y = context.Input("Y"); + auto *Out = context.Output("Out"); + Out->mutable_data(context.GetPlace()); + + auto x = EigenMatrix::From(*X); + auto y = EigenMatrix::From(*Y); + auto out = EigenMatrix::From(*Out); + out.setZero(); + + size_t batch_size = X->dims()[0]; + size_t x_width = X->dims()[1]; + size_t y_width = Y->dims()[1]; + size_t y_half_width = (y_width - 1) / 2; + + for (size_t k = 0; k < batch_size; ++k) { + for (size_t i = 0; i < x_width; ++i) { + for (size_t j = 0; j < y_width; ++j) { + int index = (i + j - y_half_width + x_width) % x_width; + out(k, i) += x(k, index) * y(k, j); + } + } + } + } +}; + +template +class ConvShiftGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *X = context.Input("X"); + auto *Y = context.Input("Y"); + auto *dOut = context.Input(framework::GradVarName("Out")); + auto *dX = context.Output(framework::GradVarName("X")); + auto *dY = context.Output(framework::GradVarName("Y")); + + auto x = EigenMatrix::From(*X); + auto y = EigenMatrix::From(*Y); + auto dout = EigenMatrix::From(*dOut); + + auto x_dims = X->dims(); + auto y_dims = Y->dims(); + size_t batch_size = x_dims[0]; + size_t x_width = x_dims[1]; + size_t y_width = y_dims[1]; + size_t y_half_width = (y_width - 1) / 2; + + // The below trades code duplication for efficiency (keeping the if + // statement outside of the loop). + if (dX) { + dX->mutable_data(context.GetPlace()); + auto dx = EigenMatrix::From(*dX); + dx.setZero(); + for (size_t k = 0; k < batch_size; ++k) { + for (size_t i = 0; i < x_width; ++i) { + for (size_t j = 0; j < y_width; ++j) { + int index = (i + j - y_half_width + x_width) % x_width; + dx(k, index) += dout(k, i) * y(k, j); + } + } + } + } + + if (dY) { + dY->mutable_data(context.GetPlace()); + auto dy = EigenMatrix::From(*dY); + dy.setZero(); + for (size_t k = 0; k < batch_size; ++k) { + for (size_t i = 0; i < x_width; ++i) { + for (size_t j = 0; j < y_width; ++j) { + int index = (i + j - y_half_width + x_width) % x_width; + dy(k, j) += x(k, index) * dout(k, i); + } + } + } + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(conv_shift, ops::ConvShiftOp, ops::ConvShiftOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(conv_shift_grad, ops::ConvShiftGradOp); +REGISTER_OP_CPU_KERNEL(conv_shift, + ops::ConvShiftKernel); +REGISTER_OP_CPU_KERNEL( + conv_shift_grad, + ops::ConvShiftGradKernel); diff --git a/paddle/fluid/operators/conv_shift_op.cu b/paddle/fluid/operators/conv_shift_op.cu new file mode 100644 index 00000000..314d3331 --- /dev/null +++ b/paddle/fluid/operators/conv_shift_op.cu @@ -0,0 +1,197 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/conv_shift_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +namespace { + +inline int DivUp(int x, int y) { return (x + y - 1) / y; } + +// Some notes on the design: +// +// Each thread is responsible for computing a single output out[k, i]. +// Thread blocks are based on tiles of x with height 1 in the batch dimension. +// +// This design is based on the typical use case where the filter +// y is fairly small. For large y, it would probably be more efficient +// to also tile across y. +template +__global__ void ConvShiftForward(const T *x, const T *y, int x_width, + int y_width, int y_half_width, int batch_size, + T *out) { + extern __shared__ T mem[]; + + int tx = threadIdx.x; + int i = blockIdx.x * blockDim.x + tx; // global x index + int k = blockIdx.y; // batch index + + // Check if we are in a boundary block with fewer x's to process than + // blockDim.x. + int num_x = + (blockIdx.x == gridDim.x - 1) ? (x_width % blockDim.x) : blockDim.x; + + T *sx = mem; + T *sx_pad = &mem[num_x]; + T *sy = &mem[blockDim.x + y_width]; + + // Collaboratively load y[k, :] and length-y padding of x into shared memory. + int pad_start = blockIdx.x * blockDim.x + num_x + x_width - y_half_width; + for (int j = tx; j < y_width; j += blockDim.x) { + sy[j] = y[k * y_width + j]; + sx_pad[j] = x[k * x_width + (pad_start + j) % x_width]; + } + + // Load a cyclically shifted slice of x into shared memory. + if (tx < num_x) { + int load_i = (i - y_half_width + x_width) % x_width; + sx[tx] = x[k * x_width + load_i]; + } + __syncthreads(); + + if (tx < num_x) { + // Compute dot product of sx[tx:tx + y_width] and sy. + T sum = 0; + for (int j = 0; j < y_width; ++j) { + sum += sx[tx + j] * sy[j]; + } + + // Save to out[k, i]. + out[k * x_width + i] = sum; + } +} + +// Compute x gradient - initial naive implementation with atomic add. +template +__global__ void ConvShiftGradX(const T *dout, const T *y, int x_width, + int y_width, int y_half_width, int batch_size, + T *dx) { + int i = blockIdx.x * blockDim.x + threadIdx.x; // x index + int j = blockIdx.y; // y index + int k = blockIdx.z; // batch index + + if (i < x_width) { + int index = (i + j - y_half_width + x_width) % x_width; + atomicAdd(&dx[k * x_width + index], + dout[k * x_width + i] * y[k * y_width + j]); + } +} + +// Compute y gradient - initial naive implementation with atomic add. +template +__global__ void ConvShiftDy(const T *x, const T *dout, int x_width, int y_width, + int y_half_width, int batch_size, T *dy) { + int i = blockIdx.x * blockDim.x + threadIdx.x; // x index + int j = blockIdx.y; // y index + int k = blockIdx.z; // batch index + + if (i < x_width) { + int index = (i + j - y_half_width + x_width) % x_width; + atomicAdd(&dy[k * y_width + j], + x[k * x_width + index] * dout[k * x_width + i]); + } +} +} // namespace + +template +class ConvShiftKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const Tensor *X = context.Input("X"); + const Tensor *Y = context.Input("Y"); + Tensor *Out = context.Output("Out"); + const T *x_data = X->data(); + const T *y_data = Y->data(); + T *out_data = Out->mutable_data(context.GetPlace()); + + int batch_size = X->dims()[0]; + int x_width = X->dims()[1]; + int y_width = Y->dims()[1]; + int y_half_width = (y_width - 1) / 2; + + const int x_per_block = 256; + int num_x_blocks = DivUp(x_width, x_per_block); + int mem_per_block = (x_per_block + 2 * y_width) * sizeof(T); + + dim3 grid_dim(num_x_blocks, batch_size); + + auto stream = + context.template device_context().stream(); + + ConvShiftForward<<>>( + x_data, y_data, x_width, y_width, y_half_width, batch_size, out_data); + } +}; + +template +class ConvShiftGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const Tensor *X = context.Input("X"); + const Tensor *Y = context.Input("Y"); + const Tensor *dOut = context.Input(framework::GradVarName("Out")); + const T *x_data = X->data(); + const T *y_data = Y->data(); + const T *dout_data = dOut->data(); + + Tensor *dX = context.Output(framework::GradVarName("X")); + Tensor *dY = context.Output(framework::GradVarName("Y")); + + int batch_size = X->dims()[0]; + int x_width = X->dims()[1]; + int y_width = Y->dims()[1]; + int y_half_width = (y_width - 1) / 2; + + auto &device_ctx = + context.template device_context(); + math::SetConstant zero; + + const int x_per_block = 256; + int num_x_blocks = DivUp(x_width, x_per_block); + dim3 grid_dim(num_x_blocks, y_width, batch_size); + + if (dX) { + T *dx_data = dX->mutable_data(context.GetPlace()); + zero(device_ctx, dX, static_cast(0.0)); + ConvShiftGradX<<>>( + dout_data, y_data, x_width, y_width, y_half_width, batch_size, + dx_data); + } + if (dY) { + T *dy_data = dY->mutable_data(context.GetPlace()); + zero(device_ctx, dY, static_cast(0.0)); + ConvShiftDy<<>>( + x_data, dout_data, x_width, y_width, y_half_width, batch_size, + dy_data); + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + conv_shift, + ops::ConvShiftKernel); +REGISTER_OP_CUDA_KERNEL( + conv_shift_grad, + ops::ConvShiftGradKernel); diff --git a/paddle/fluid/operators/conv_shift_op.h b/paddle/fluid/operators/conv_shift_op.h new file mode 100644 index 00000000..6d8ddd79 --- /dev/null +++ b/paddle/fluid/operators/conv_shift_op.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class ConvShiftKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override; +}; + +template +class ConvShiftGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override; +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc new file mode 100644 index 00000000..f44094ca --- /dev/null +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc @@ -0,0 +1,266 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/operators/conv_transpose_op.h" +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; +using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; +using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; +using DataLayout = platform::DataLayout; + +static constexpr size_t kConvCUDNNWorkspaceLimitBytes = 1024 * 1024 * 1024; + +template +class CUDNNConvTransposeOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* output = ctx.Output("Output"); + + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + // cudnn v5 does not support dilations + std::vector dilations = ctx.Attr>("dilations"); + int groups = ctx.Attr("groups"); + int user_workspace_size = ctx.Attr("workspace_size_MB"); + + const T* input_data = input->data(); + const T* filter_data = filter->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor input_desc; + ScopedTensorDescriptor output_desc; + ScopedFilterDescriptor filter_desc; + ScopedConvolutionDescriptor conv_desc; + DataLayout layout; + + if (strides.size() == 2U) { + layout = DataLayout::kNCHW; + } else { + layout = DataLayout::kNCDHW; + } + + // (N, M, H, W) or (N, M, D, H, W) + cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, framework::vectorize2int(input->dims()), groups); + // (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w) + cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, framework::vectorize2int(output->dims()), groups); + // (M, C, K_h, K_w) or (M, C, K_d, K_h, K_w) + cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor( + layout, framework::vectorize2int(filter->dims()), groups); + cudnnConvolutionDescriptor_t cudnn_conv_desc = + conv_desc.descriptor(paddings, strides, dilations); + + // ------------------- cudnn conv workspace --------------------- + size_t workspace_size_in_bytes; // final workspace to allocate. + size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes; + if (user_workspace_size > 0) { + workspace_size_limit = user_workspace_size * 1024 * 1024; + } + // ------------------- cudnn conv algorithm --------------------- + cudnnConvolutionBwdDataAlgo_t algo; + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); + // Get the algorithm + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( + handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, + // dxDesc: Handle to the previously initialized output tensor + // descriptor. + cudnn_output_desc, CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &algo)); + + // get workspace size able to allocate + CUDNN_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( + handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, + cudnn_output_desc, algo, &workspace_size_in_bytes)); + + // ------------------- cudnn conv transpose forward --------------------- + int input_offset = input->numel() / input->dims()[0] / groups; + int output_offset = output->numel() / output->dims()[0] / groups; + int filter_offset = filter->numel() / groups; + T alpha = 1.0f, beta = 0.0f; + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + for (int g = 0; g < groups; g++) { + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardData( + handle, &alpha, cudnn_filter_desc, filter_data + filter_offset * g, + cudnn_input_desc, input_data + input_offset * g, cudnn_conv_desc, + algo, cudnn_workspace, workspace_size_in_bytes, &beta, + cudnn_output_desc, output_data + output_offset * g)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); + } + } +}; + +template +class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + auto input = ctx.Input("Input"); + auto filter = ctx.Input("Filter"); + auto output_grad = ctx.Input(framework::GradVarName("Output")); + auto input_grad = ctx.Output(framework::GradVarName("Input")); + auto filter_grad = ctx.Output(framework::GradVarName("Filter")); + const T* input_data = input->data(); + const T* output_grad_data = output_grad->data(); + const T* filter_data = filter->data(); + + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + // cudnn v5 does not support dilations + std::vector dilations = ctx.Attr>("dilations"); + int groups = ctx.Attr("groups"); + int user_workspace_size = ctx.Attr("workspace_size_MB"); + + // ------------------- cudnn descriptors --------------------- + ScopedTensorDescriptor input_desc; + ScopedTensorDescriptor output_desc; + ScopedFilterDescriptor filter_desc; + ScopedConvolutionDescriptor conv_desc; + DataLayout layout = DataLayout::kNCHW; + + // Input: (N, M, H, W) or (N, M, D, H, W) + cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, framework::vectorize2int(input->dims()), groups); + // Output: (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w) + cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, framework::vectorize2int(output_grad->dims()), groups); + // Filter (M, C, K_h, K_w) or (M, C, K_d K_h, K_w) + cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor( + layout, framework::vectorize2int(filter->dims()), groups); + + cudnnConvolutionDescriptor_t cudnn_conv_desc = + conv_desc.descriptor(paddings, strides, dilations); + + // ------------------- cudnn backward algorithm --------------------- + cudnnConvolutionFwdAlgo_t data_algo; + cudnnConvolutionBwdFilterAlgo_t filter_algo; + size_t bwd_filter_ws_size, fwd_ws_size; + size_t workspace_size_in_bytes = 0; + size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes; + if (user_workspace_size > 0) { + workspace_size_limit = user_workspace_size * 1024 * 1024; + } + + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); + if (input_grad) { + // choose backward algorithm for data + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( + handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc, + cudnn_input_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &data_algo)); + CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( + handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc, + cudnn_input_desc, data_algo, &fwd_ws_size)); + workspace_size_in_bytes = std::max(workspace_size_in_bytes, fwd_ws_size); + } + + if (filter_grad) { + // choose backward algorithm for filter + CUDNN_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( + handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc, + cudnn_filter_desc, + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, &filter_algo)); + + // get workspace for backwards filter algorithm + CUDNN_ENFORCE( + platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( + handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc, + cudnn_filter_desc, filter_algo, &bwd_filter_ws_size)); + workspace_size_in_bytes = + std::max(workspace_size_in_bytes, bwd_filter_ws_size); + } + + // ------------------- cudnn conv backward data --------------------- + // FIXME(typhoonzero): template type T may not be the same as cudnn call. + int input_offset = input->numel() / input->dims()[0] / groups; + int output_grad_offset = + output_grad->numel() / output_grad->dims()[0] / groups; + int filter_offset = filter->numel() / groups; + T alpha = 1.0f, beta = 0.0f; + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + if (input_grad) { + T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + // Because beta is zero, it is unnecessary to reset input_grad. + for (int g = 0; g < groups; g++) { + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_output_desc, + output_grad_data + output_grad_offset * g, cudnn_filter_desc, + filter_data + filter_offset * g, cudnn_conv_desc, data_algo, + cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, + input_grad_data + input_offset * g)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); + } + } + + // ------------------- cudnn conv backward filter --------------------- + if (filter_grad) { + T* filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); + // Because beta is zero, it is unnecessary to reset filter_grad. + // Gradient with respect to the filter + for (int g = 0; g < groups; g++) { + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( + handle, &alpha, cudnn_output_desc, + output_grad_data + output_grad_offset * g, cudnn_input_desc, + input_data + input_offset * g, cudnn_conv_desc, filter_algo, + cudnn_workspace, workspace_size_in_bytes, &beta, + cudnn_filter_desc, filter_grad_data + filter_offset * g)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(conv2d_transpose, CUDNN, ::paddle::platform::CUDAPlace, + ops::CUDNNConvTransposeOpKernel, + ops::CUDNNConvTransposeOpKernel); +REGISTER_OP_KERNEL(conv2d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace, + ops::CUDNNConvTransposeGradOpKernel, + ops::CUDNNConvTransposeGradOpKernel); + +REGISTER_OP_KERNEL(conv3d_transpose, CUDNN, ::paddle::platform::CUDAPlace, + ops::CUDNNConvTransposeOpKernel, + ops::CUDNNConvTransposeOpKernel); +REGISTER_OP_KERNEL(conv3d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace, + ops::CUDNNConvTransposeGradOpKernel, + ops::CUDNNConvTransposeGradOpKernel); diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc new file mode 100644 index 00000000..01afdd28 --- /dev/null +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -0,0 +1,422 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/conv_transpose_op.h" +#include +#include +#include +#include "paddle/fluid/platform/cudnn_workspace_helper.h" + +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + +namespace paddle { +namespace operators { + +void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of ConvTransposeOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Filter"), + "Input(Filter) of ConvTransposeOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Output"), + "Output(Output) of ConvTransposeOp should not be null."); + + auto in_dims = ctx->GetInputDim("Input"); + auto filter_dims = ctx->GetInputDim("Filter"); + std::vector output_size = + ctx->Attrs().Get>("output_size"); + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); + std::vector dilations = ctx->Attrs().Get>("dilations"); + int groups = ctx->Attrs().Get("groups"); + + PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5, + "ConvTransposeOp intput should be 4-D or 5-D tensor."); + PADDLE_ENFORCE_EQ(in_dims.size(), filter_dims.size(), + "ConvTransposeOp input dimension and filter dimension " + "should be the same."); + PADDLE_ENFORCE(in_dims.size() - strides.size() == 2U, + "ConvTransposeOp input dimension and strides dimension should " + "be consistent."); + if (output_size.size()) + PADDLE_ENFORCE_EQ(output_size.size(), strides.size(), + "ConvTransposeOp output_size dimension and strides " + "dimension should be the same."); + PADDLE_ENFORCE_EQ(paddings.size(), strides.size(), + "ConvTransposeOp paddings dimension and strides " + "dimension should be the same."); + PADDLE_ENFORCE_EQ(paddings.size(), dilations.size(), + "ConvTransposeOp paddings dimension and dilations " + "dimension should be the same."); + PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0], + "In ConvTransposeOp, The number of input channels should " + "be equal to the number of filter's channels."); + + std::vector output_shape({in_dims[0], filter_dims[1] * groups}); + for (size_t i = 0; i < strides.size(); ++i) { + auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1; + auto infer_shape = + (in_dims[i + 2] - 1) * strides[i] - 2 * paddings[i] + filter_extent; + if (output_size.size()) { + PADDLE_ENFORCE((output_size[i] >= infer_shape && + output_size[i] < infer_shape + strides[i]), + "ConvTransposeOp output_size should be " + "in appropriate range."); + output_shape.push_back(output_size[i]); + } else { + output_shape.push_back(infer_shape); + } + } + ctx->SetOutputDim("Output", framework::make_ddim(output_shape)); +} + +framework::OpKernelType ConvTransposeOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + framework::LibraryType library_{framework::LibraryType::kPlain}; + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + bool use_cudnn = ctx.Attr("use_cudnn"); + use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(ctx.GetPlace())) { + auto& dev_ctx = ctx.template device_context(); + use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + if (use_cudnn) { + library_ = framework::LibraryType::kCUDNN; + } + } +#endif +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; + } +#endif + + return framework::OpKernelType(ctx.Input("Input")->type(), + ctx.GetPlace(), layout_, library_); +} + +void Conv2DTransposeOpMaker::Make() { + AddAttr("is_test", + "(bool, default false) Set to true for inference only, false " + "for training. Some layers may run faster when this is true.") + .SetDefault(false); + AddInput( + "Input", + "(Tensor) The input tensor of convolution transpose operator. " + "The format of input tensor is NCHW. Where N is batch size, C is the " + "number of input channels, H is the height of the feature, and " + "W is the width of the feature."); + AddInput( + "Filter", + "(Tensor) The filter tensor of convolution transpose operator. " + "The format of the filter tensor is MCHW, where M is the number of " + "input feature channels, C is the number of " + "output feature channels," + "H is the height of the filter, and W is the width of the filter. " + "We enforce groups number == 1 in the convolution transpose scenario."); + AddInput("Bias", + "(Tensor) Bias to be added to each output of filter application." + "The format of output tensor is X (one-dimensional) of size equal" + "to the number of output channels. Only used with MKL-DNN.") + .AsDispensable(); + + AddOutput("Output", + "(Tensor) The output tensor of convolution transpose operator. " + "The format of output tensor is also NCHW."); + AddAttr>("output_size", + "(vector default: []), the " + "size of the output tensor") + .SetDefault({}); + AddAttr("groups", + "(int default:1), the groups number of the convolution " + "transpose operator. ") + .SetDefault(1); + AddAttr>("dilations", + "(vector default:{1, 1}), the " + "dilations(h_dilation, w_dilation) of convolution " + "transpose operator.") + .SetDefault({1, 1}); + AddAttr>( + "strides", + "(vector default:{1, 1}), the strides(h_stride, w_stride) of " + "convolution transpose operator.") + .SetDefault({1, 1}); + AddAttr>( + "paddings", + "(vector default:{0, 0}), the paddings(h_pad, w_pad) of convolution " + "transpose operator.") + .SetDefault({0, 0}); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(false); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr("fuse_relu", "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + // TODO(dzhwinter): need to registered layout transform function + AddAttr("workspace_size_MB", + "Used in cudnn kernel only. workspace size for cudnn, in MB, " + "workspace is a section of GPU memory which will be " + "allocated/freed each time the operator runs, larger " + "workspace size can increase performance but also requires " + "better hardward. This size should be carefully setted.") + .SetDefault(platform::kDefaultConvWorkspaceSizeLimitMB); + AddComment(R"DOC( +Convolution2D Transpose Operator. + +The convolution transpose operation calculates the output based on the input, filter +and dilations, strides, paddings, groups parameters. The size of each dimension of the +parameters is checked in the infer-shape. +Input(Input) and output(Output) are in NCHW format. Where N is batchsize, C is the +number of channels, H is the height of the feature, and W is the width of the feature. +Filter(Input) is in MCHW format. Where M is the number of input feature channels, +C is the number of output feature channels, H is the height of the filter, +and W is the width of the filter. +Parameters(strides, paddings) are two elements. These two elements represent height +and width, respectively. +The input(X) size and output(Out) size may be different. + +For an example: + Input: + Input shape: $(N, C_{in}, H_{in}, W_{in})$ + Filter shape: $(C_{in}, C_{out}, H_f, W_f)$ + Output: + Output shape: $(N, C_{out}, H_{out}, W_{out})$ + Where + $$ + H_{out} = (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\ + W_{out} = (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 + $$ +)DOC"); +} + +void Conv3DTransposeOpMaker::Make() { + AddInput("Input", + "(Tensor) The input tensor of convolution transpose operator." + "The format of input tensor is NCDHW. Where N is batch size, C is " + "the number of channels, D is the depth of the feature, H is the " + "height of the feature, and " + "W is the width of the feature."); + AddInput("Filter", + "(Tensor) The filter tensor of convolution transpose operator." + "The format of the filter tensor is MCDHW, where M is the number of " + "input feature channels, C is the number of " + "output feature channels, D " + "is the depth of the filter, H is the height of the filter, and " + "W is the width of the filter." + "We enforce groups number == 1 and padding == 0 in " + "the convolution3d transpose scenario."); + AddOutput("Output", + "(Tensor) The output tensor of convolution transpose operator." + "The format of output tensor is also NCDHW." + "Where N is batch size, C is " + "the number of channels, D is the depth of the feature, H is the " + "height of the feature, and W is the width of the feature."); + AddAttr>("output_size", + "(vector default: []), the " + "size of the output tensor") + .SetDefault({}); + AddAttr>( + "dilations", + "(vector default:{1, 1, 1}), the " + "dilations(d_dilation,h_dilation, w_dilation) of convolution " + "transpose operator.") + .SetDefault({1, 1, 1}); + AddAttr>("strides", + "(vector default:{1, 1, 1}), the " + "strides{d_stride, h_stride, w_stride} of " + "convolution transpose operator.") + .SetDefault({1, 1, 1}); + AddAttr>("paddings", + "(vector default:{0, 0, 0}), paddings(d_pad, " + "h_pad, w_pad) of convolution transpose operator.") + .SetDefault({0, 0, 0}); + AddAttr("groups", + "(int default:1), the groups number of the convolution3d " + "transpose operator. ") + .SetDefault(1); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(false); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + // TODO(dzhwinter): need to registered layout transform function + AddAttr("workspace_size_MB", + "Used in cudnn kernel only. workspace size for cudnn, in MB, " + "workspace is a section of GPU memory which will be " + "allocated/freed each time the operator runs, larger " + "workspace size can increase performance but also requires " + "better hardward. This size should be carefully setted.") + .SetDefault(platform::kDefaultConvWorkspaceSizeLimitMB); + AddComment(R"DOC( +Convolution3D Transpose Operator. + +The convolution transpose operation calculates the output based on the input, filter +and dilations, strides, paddings, groups parameters. The size of each dimension of the +parameters is checked in the infer-shape. +Input(Input) and output(Output) are in NCDHW format. Where N is batch size, C is the +number of channels, D is the depth of the feature, H is the height of the feature, +and W is the width of the feature. +Filter(Input) is in MCDHW format. Where M is the number of input feature channels, +C is the number of output feature channels, D is the depth of the filter,H is the +height of the filter, and W is the width of the filter. +Parameters(strides, paddings) are three elements. These three elements represent +depth, height and width, respectively. +The input(X) size and output(Out) size may be different. + +Example: + Input: + Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$ + Filter shape: $(C_{in}, C_{out}, D_f, H_f, W_f)$ + Output: + Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$ + Where + $$ + D_{out} = (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (D_f - 1) + 1 \\ + H_{out} = (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (H_f - 1) + 1 \\ + W_{out} = (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (W_f - 1) + 1 + $$ +)DOC"); +} + +void ConvTransposeOpGrad::InferShape(framework::InferShapeContext* ctx) const { + auto in_dims = ctx->GetInputDim("Input"); + auto filter_dims = ctx->GetInputDim("Filter"); + if (ctx->HasOutput(framework::GradVarName("Input"))) { + ctx->SetOutputDim(framework::GradVarName("Input"), in_dims); + } + if (ctx->HasOutput(framework::GradVarName("Filter"))) { + ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims); + } +} + +framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + bool use_cudnn = ctx.Attr("use_cudnn"); + use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(ctx.GetPlace())) { + auto& dev_ctx = ctx.template device_context(); + use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + } +#endif + framework::LibraryType library_; + if (use_cudnn) { + library_ = framework::LibraryType::kCUDNN; + } else { + library_ = framework::LibraryType::kPlain; + } + + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return framework::OpKernelType(ctx.Input("Input")->type(), + ctx.GetPlace(), layout_, library_); +} + +class ConvTransposeGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType(ForwardOp().Type() + "_grad"); + op->SetInput("Input", Input("Input")); + op->SetInput("Filter", Input("Filter")); + op->SetOutput(framework::GradVarName("Input"), InputGrad("Input")); + op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter")); + if (ForwardOp().Inputs().count("Bias") > 0) { + op->SetInput("Bias", Input("Bias")); + op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias")); + } + op->SetInput(framework::GradVarName("Output"), OutputGrad("Output")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +// conv2d_transpose +REGISTER_OPERATOR(conv2d_transpose, ops::ConvTransposeOp, + ops::Conv2DTransposeOpMaker, + ops::ConvTransposeGradOpDescMaker); +REGISTER_OPERATOR(conv2d_transpose_grad, ops::ConvTransposeOpGrad); + +REGISTER_OP_CPU_KERNEL( + conv2d_transpose, + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); +REGISTER_OP_CPU_KERNEL( + conv2d_transpose_grad, + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); + +// conv3d_transpose +REGISTER_OPERATOR(conv3d_transpose, ops::ConvTransposeOp, + ops::Conv3DTransposeOpMaker, + ops::ConvTransposeGradOpDescMaker); +REGISTER_OPERATOR(conv3d_transpose_grad, ops::ConvTransposeOpGrad); + +REGISTER_OP_CPU_KERNEL( + conv3d_transpose, + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); +REGISTER_OP_CPU_KERNEL( + conv3d_transpose_grad, + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); + +// depthwise conv2d_transpose +REGISTER_OPERATOR(depthwise_conv2d_transpose, ops::ConvTransposeOp, + ops::Conv2DTransposeOpMaker, + ops::ConvTransposeGradOpDescMaker); +REGISTER_OPERATOR(depthwise_conv2d_transpose_grad, ops::ConvTransposeOpGrad); + +REGISTER_OP_CPU_KERNEL( + depthwise_conv2d_transpose, + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); +REGISTER_OP_CPU_KERNEL( + depthwise_conv2d_transpose_grad, + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); diff --git a/paddle/fluid/operators/conv_transpose_op.cu.cc b/paddle/fluid/operators/conv_transpose_op.cu.cc new file mode 100644 index 00000000..a6d5665d --- /dev/null +++ b/paddle/fluid/operators/conv_transpose_op.cu.cc @@ -0,0 +1,42 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/conv_transpose_op.h" + +namespace ops = paddle::operators; +using CUDA = paddle::platform::CUDADeviceContext; + +// conv2d +REGISTER_OP_CUDA_KERNEL(conv2d_transpose, + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); +REGISTER_OP_CUDA_KERNEL(conv2d_transpose_grad, + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); + +// conv3d +REGISTER_OP_CUDA_KERNEL(conv3d_transpose, + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); +REGISTER_OP_CUDA_KERNEL(conv3d_transpose_grad, + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); + +// depthwise conv2d +REGISTER_OP_CUDA_KERNEL(depthwise_conv2d_transpose, + ops::DepthwiseConvTransposeKernel, + ops::DepthwiseConvTransposeKernel); +REGISTER_OP_CUDA_KERNEL(depthwise_conv2d_transpose_grad, + ops::DepthwiseConvTransposeGradKernel, + ops::DepthwiseConvTransposeGradKernel); diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h new file mode 100644 index 00000000..88c578b1 --- /dev/null +++ b/paddle/fluid/operators/conv_transpose_op.h @@ -0,0 +1,391 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/depthwise_conv.h" +#include "paddle/fluid/operators/math/im2col.h" +#include "paddle/fluid/operators/math/vol2col.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DDim = framework::DDim; + +// Define Op classes in .h file so that other conv transpose +// operator implementations can reuse the code. +class Conv2DTransposeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; + +class Conv3DTransposeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; + +class ConvTransposeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class ConvTransposeOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +template +class GemmConvTransposeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + // The filter will be reshaped, so it should not be constant pointer + Tensor filter = *context.Input("Filter"); + Tensor* output = context.Output("Output"); + + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + int groups = context.Attr("groups"); + + const int batch_size = static_cast(input->dims()[0]); + + // input_shape_vec: {n, c, h, w} or {n, c, d, h, w} + std::vector input_shape_vec = framework::vectorize(input->dims()); + // filter_shape_vec: {k_o, k_c, k_h, k_w} or {k_o, k_c, k_d, k_h, k_w} + std::vector filter_shape_vec = framework::vectorize(filter.dims()); + + // use col_shape in the im2col and col2im (or vol2col and col2vol) + // calculation + // col_shape_vec: {c/g, k_h, k_w, h, w} or {c/g, k_d, k_h, k_w, d, h, w} + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = output->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 2]; + } + DDim col_shape(framework::make_ddim(col_shape_vec)); + + // use col_matrix_shape in the gemm calculation + // size: (c/g * k_h * k_w, h * w) or (c/g * k_d * k_h * k_w, d * h * w) + DDim col_matrix_shape = framework::flatten_to_2d(col_shape, data_dim + 1); + + Tensor col; + col.mutable_data(col_shape, context.GetPlace()); + // col_matrix shares the same piece of data with col, + // but will be reshaped into a two-dimensional matrix shape + // to call the matrix multiplication interface. + Tensor col_matrix; + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + + // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w) + DDim output_shape = + framework::slice_ddim(output->dims(), 1, output->dims().size()); + + // input matrix size: (m, h * w) or (m, d * h * w) + DDim input_matrix_shape = {input->dims()[1], col_matrix_shape[1]}; + + // filter size: (m, c/g * k_h * k_w) or (m, c/g * k_d * k_h * k_w) + DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0]}; + filter.Resize(filter_matrix_shape); + + output->mutable_data(context.GetPlace()); + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + auto blas = math::GetBlas(dev_ctx); + set_zero(dev_ctx, output, static_cast(0)); + + int in_step = static_cast(input->dims()[1]) / groups; + int out_step = static_cast(output->dims()[1]) / groups; + math::Col2ImFunctor col2im; + math::Col2VolFunctor col2vol; + + // convolution transpose: gemm + col2im or col2vol (similar to conv-backward + // on input) + for (int i = 0; i < batch_size; i++) { + // batch with size (m, h * w) or (m, d * h * w) + Tensor input_batch = input->Slice(i, i + 1).Resize(input_matrix_shape); + + // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w) + Tensor output_batch = output->Slice(i, i + 1).Resize(output_shape); + + for (int g = 0; g < groups; g++) { + Tensor in_slice = input_batch.Slice(g * in_step, (g + 1) * in_step); + Tensor filter_slice = filter.Slice(g * in_step, (g + 1) * in_step); + Tensor out_slice = output_batch.Slice(g * out_step, (g + 1) * out_step); + + // col_matrix = filter_slice * input_slice + // of shape (c/g * k_h * k_w, h * w) + // or (c/g * k_d * k_h * k_w, d * h * w) + blas.MatMul(filter_slice, true, in_slice, false, static_cast(1.0), + &col_matrix, static_cast(0.0)); + + if (data_dim == 2U) { + // col2im: col_matrix -> dy + // from (c/g * k_h * k_w, h * w) to (c/g, o_h, o_w) + col2im(dev_ctx, col, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &out_slice); + } else if (data_dim == 3U) { + // col2vol: col_matrix -> dy + // from (c/g * k_d * k_h * k_w, d * h * w) to (c/g, o_d, o_h, o_w) + col2vol(dev_ctx, col, dilations, strides, paddings, &out_slice); + } + } + } + } +}; + +template +class GemmConvTransposeGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + const Tensor* output_grad = + context.Input(framework::GradVarName("Output")); + // For filter, we do not use const pointer b/c we will do reshape, + // but we should avoid modifying its value. + Tensor filter = *context.Input("Filter"); + Tensor* input_grad = + context.Output(framework::GradVarName("Input")); + Tensor* filter_grad = + context.Output(framework::GradVarName("Filter")); + + if ((!input_grad) && (!filter_grad)) return; + + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + int groups = context.Attr("groups"); + + const int batch_size = static_cast(input->dims()[0]); + + // input_shape_vec: {n, c, h, w} or {n, c, d, h, w} + std::vector input_shape_vec = framework::vectorize(input->dims()); + // filter_shape_vec: {k_o, k_c, k_h, k_w} or {k_o, k_c, k_d, k_h, k_w} + std::vector filter_shape_vec = framework::vectorize(filter.dims()); + + // use col_shape in the im2col and col2im (or vol2col and col2vol) + // calculation + // col_shape_vec: {c, k_h, k_w, h, w} or {c, k_d, k_h, k_w, d, h, w} + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + col_shape_vec[0] = output_grad->dims()[1]; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 2]; + } + DDim col_shape(framework::make_ddim(col_shape_vec)); + + // use col_matrix_shape in the gemm calculation + // size: (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w) + DDim col_matrix_shape = framework::flatten_to_2d(col_shape, data_dim + 1); + + // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w) + DDim output_shape = framework::slice_ddim(output_grad->dims(), 1, + output_grad->dims().size()); + + // input matrix size: (m, h * w) or (m, d * h * w) + DDim input_matrix_shape = {input->dims()[1], col_matrix_shape[1]}; + + // filter size: (m, c/g * k_h * k_w) or (m, c/g * k_d * k_h * k_w) + DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0] / groups}; + filter.Resize(filter_matrix_shape); + int in_step = static_cast(input->dims()[1]) / groups; + int col_step = static_cast(col_matrix_shape[0]) / groups; + + // convolution transpose grad on input: + // im2col + gemm (similar to conv-forward) + // input need to compute gradient + auto& dev_ctx = context.template device_context(); + auto blas = math::GetBlas(dev_ctx); + if (input_grad || filter_grad) { + Tensor col; + col.mutable_data(col_shape, context.GetPlace()); + // col_matrix shares the same piece of data with col, + // but will be reshaped into a two-dimensional matrix shape + // to call the matrix multiplication interface. + Tensor col_matrix; + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + + Tensor filter_grad_; + math::SetConstant set_zero; + + math::Im2ColFunctor im2col; + math::Vol2ColFunctor vol2col; + + if (input_grad) { + input_grad->mutable_data(context.GetPlace()); + } + if (filter_grad) { // filter size (m, c/g, k_h, k_w) + filter_grad->mutable_data(context.GetPlace()); + set_zero(dev_ctx, filter_grad, static_cast(0)); + filter_grad_ = *filter_grad; + filter_grad_.Resize(filter_matrix_shape); + } + + for (int i = 0; i < batch_size; i++) { + // batch with size (c, o_h * o_w) + Tensor output_grad_batch = + output_grad->Slice(i, i + 1).Resize(output_shape); + + if (data_dim == 2U) { + // im2col: dy -> col matrix + // from (c, o_h, o_w) to (c * k_h * k_w, h * w) + im2col(dev_ctx, output_grad_batch, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col: dy -> col_matrix + // from (c, o_d, o_h, o_w) to (c * k_d * k_h * k_w, d * h * w) + vol2col(dev_ctx, output_grad_batch, dilations, strides, paddings, + &col); + } + + if (input_grad) { + // batch with size (m, h, w) + Tensor input_grad_batch = + input_grad->Slice(i, i + 1).Resize(input_matrix_shape); + // gemm: dx = filter * dy + // (m, c * k_h * k_w) * (c * k_h * k_w, h * w) -> (m, h * w) + // or + // (m, c * k_d * k_h * k_w) * (c * k_d * k_h * k_w, d * h * w) -> (m, + // d, h, w) + for (int g = 0; g < groups; g++) { + Tensor input_grad_slice = + input_grad_batch.Slice(g * in_step, (g + 1) * in_step); + Tensor filter_slice = filter.Slice(g * in_step, (g + 1) * in_step); + Tensor col_matrix_slice = + col_matrix.Slice(g * col_step, (g + 1) * col_step); + + blas.MatMul(filter_slice, false, col_matrix_slice, false, + static_cast(1.0), &input_grad_slice, + static_cast(0.0)); + } + } + if (filter_grad) { + // input batch + Tensor in_batch = input->Slice(i, i + 1).Resize(input_matrix_shape); + // gemm: d_filter = x * dy^T + // (m, c * h * w) * (k_h * k_w, c * h * w) -> (m, k_h * k_w) + // or + // (m, d * h * w) * (d * h * w, c * k_d * k_h * k_w) -> (m, c * k_d * + // k_h * k_w) + for (int g = 0; g < groups; g++) { + Tensor in_batch_slice = + in_batch.Slice(g * in_step, (g + 1) * in_step); + Tensor filter_grad_slice = + filter_grad_.Slice(g * in_step, (g + 1) * in_step); + Tensor col_matrix_slice = + col_matrix.Slice(g * col_step, (g + 1) * col_step); + blas.MatMul(in_batch_slice, false, col_matrix_slice, true, + static_cast(1.0), &filter_grad_slice, + static_cast(1.0)); + } + } + } + } + } +}; + +template +class DepthwiseConvTransposeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + Tensor filter = *context.Input("Filter"); + Tensor* output = context.Output("Output"); + output->mutable_data(context.GetPlace()); + + int groups = context.Attr("groups"); + PADDLE_ENFORCE_EQ(groups, filter.dims()[0]); + + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + for (auto v : dilations) { + PADDLE_ENFORCE_EQ(v, 1); + } + + output->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); + math::SetConstant set_zero; + set_zero(dev_ctx, output, static_cast(0)); + + math::DepthwiseConvInputGradFunctor + depthwiseConvInputGrad; + depthwiseConvInputGrad(dev_ctx, *output, filter, *input, strides, paddings, + dilations, output); + } +}; + +template +class DepthwiseConvTransposeGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("Input"); + const Tensor* output_grad = + context.Input(framework::GradVarName("Output")); + Tensor* input_grad = + context.Output(framework::GradVarName("Input")); + Tensor* filter_grad = + context.Output(framework::GradVarName("Filter")); + Tensor filter = *context.Input("Filter"); + + if (!input_grad && !filter_grad) return; + + auto& dev_ctx = context.template device_context(); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + std::vector dilations = context.Attr>("dilations"); + + if (input_grad) { + math::DepthwiseConvFunctor depthwiseConv; + depthwiseConv(dev_ctx, *output_grad, filter, strides, paddings, dilations, + input_grad); + } + + if (filter_grad) { + math::SetConstant set_zero; + filter_grad->mutable_data(context.GetPlace()); + set_zero(dev_ctx, filter_grad, static_cast(0)); + + math::DepthwiseConvFilterGradFunctor + depthwiseConvFilterGrad; + depthwiseConvFilterGrad(dev_ctx, *output_grad, *input, strides, paddings, + dilations, filter_grad); + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/cos_sim_op.cc b/paddle/fluid/operators/cos_sim_op.cc new file mode 100644 index 00000000..93304ec6 --- /dev/null +++ b/paddle/fluid/operators/cos_sim_op.cc @@ -0,0 +1,175 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/cos_sim_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class CosSimOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + // notnull check + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of CosSimOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), + "Input(Y) of CosSimOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of CosSimOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("XNorm"), + "Output(XNorm) of CosSimOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("YNorm"), + "Output(YNorm) of CosSimOp should not be null."); + + // shape check + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + + bool check = true; + if ((!ctx->IsRuntime()) && + (framework::product(x_dims) <= 0 || framework::product(y_dims) <= 0)) { + check = false; + } + + if (check) { + PADDLE_ENFORCE_EQ(x_dims.size(), y_dims.size(), + "Ranks of Input(X) and Input(Y) must be equal."); + PADDLE_ENFORCE_GE(x_dims.size(), 2, + "Rank of Input(X) must not be less than 2."); + PADDLE_ENFORCE_EQ( + framework::slice_ddim(x_dims, 1, x_dims.size()), + framework::slice_ddim(y_dims, 1, y_dims.size()), + "All dimensions except the 1st of Input(X) and Input(Y) " + "must be equal."); + PADDLE_ENFORCE( + x_dims[0] == y_dims[0] || y_dims[0] == 1, + "The 1st dimension of Input(Y) must be equal to Input(X) or" + " just 1 (which will be broadcasted to match Input(X))."); + } + + // resize tensor + ctx->SetOutputDim("Out", {x_dims[0], 1}); + ctx->SetOutputDim("XNorm", {x_dims[0], 1}); + ctx->SetOutputDim("YNorm", {y_dims[0], 1}); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class CosSimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The 1st input of cos_sim op."); + AddInput("Y", "The 2nd input of cos_sim op."); + AddOutput("Out", "The output of cos_sim op."); + AddOutput("XNorm", + "Norm of the first input, reduced along the 1st " + "dimension.") + .AsIntermediate(); + AddOutput("YNorm", + "Norm of the second input, reduced along the 1st " + "dimension.") + .AsIntermediate(); + AddAttr(framework::kAllKernelsMustComputeRuntimeShape, + "Skip calling InferShape() function in the runtime.") + .SetDefault(true); + + AddComment(R"DOC( +**Cosine Similarity Operator** + +$Out = \frac{X^T * Y}{(\sqrt{X^T * X} * \sqrt{Y^T * Y})}$ + +The input X and Y must have the same shape, except that the 1st dimension +of input Y could be just 1 (different from input X), which will be +broadcasted to match the shape of input X before computing their cosine +similarity. + +Both the input X and Y can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD information with input X. + +)DOC"); + } +}; + +class CosSimOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + // notnull check + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) must not be null."); + PADDLE_ENFORCE(ctx->HasInput("XNorm"), "Input(XNorm) must not be null."); + PADDLE_ENFORCE(ctx->HasInput("YNorm"), "Input(YNorm) must not be null."); + PADDLE_ENFORCE(ctx->HasInput("Out"), "Input(Out) must not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) must not be null."); + + // shape check + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + auto xnorm_dims = ctx->GetInputDim("XNorm"); + auto ynorm_dims = ctx->GetInputDim("YNorm"); + auto out_dims = ctx->GetInputDim("Out"); + auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out")); + + PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), + "Ranks of Input(X) and Input(Y) must be equal."); + PADDLE_ENFORCE_GE(x_dims.size(), 2, + "Rank of Input(X) must not be less than 2."); + PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 1, x_dims.size()), + framework::slice_ddim(y_dims, 1, y_dims.size()), + "All dimensions except the 1st of Input(X) and Input(Y) " + "must be equal."); + PADDLE_ENFORCE(x_dims[0] == y_dims[0] || y_dims[0] == 1, + "The 1st dimension of Input(Y) must be equal to Input(X) or" + " just 1 (which will be broadcasted to match Input(X))."); + auto target_xnorm_dims = framework::make_ddim({x_dims[0], 1}); + auto target_ynorm_dims = framework::make_ddim({y_dims[0], 1}); + PADDLE_ENFORCE_EQ(xnorm_dims, target_xnorm_dims, + "Shape of Input(XNorm) must be [X.Dim(0), 1]."); + PADDLE_ENFORCE_EQ(ynorm_dims, target_ynorm_dims, + "Shape of Input(YNorm) must be [Y.Dim(0), 1]."); + PADDLE_ENFORCE_EQ(out_dims, target_xnorm_dims, + "Shape of Input(Out) must be [X.Dim(0), 1]."); + PADDLE_ENFORCE_EQ(out_grad_dims, target_xnorm_dims, + "Shape of Input(Out@Grad) must be [X.Dim(0), 1]."); + + // resize tensor + auto x_grad_name = framework::GradVarName("X"); + auto y_grad_name = framework::GradVarName("Y"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + if (ctx->HasOutput(y_grad_name)) { + ctx->SetOutputDim(y_grad_name, y_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(cos_sim, ops::CosSimOp, ops::CosSimOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(cos_sim_grad, ops::CosSimOpGrad); +REGISTER_OP_CPU_KERNEL( + cos_sim, ops::CosSimKernel); +REGISTER_OP_CPU_KERNEL( + cos_sim_grad, + ops::CosSimGradKernel); diff --git a/paddle/fluid/operators/cos_sim_op.cu b/paddle/fluid/operators/cos_sim_op.cu new file mode 100644 index 00000000..3d144ca2 --- /dev/null +++ b/paddle/fluid/operators/cos_sim_op.cu @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/cos_sim_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + cos_sim, ops::CosSimKernel); +REGISTER_OP_CUDA_KERNEL( + cos_sim_grad, + ops::CosSimGradKernel); diff --git a/paddle/fluid/operators/cos_sim_op.h b/paddle/fluid/operators/cos_sim_op.h new file mode 100644 index 00000000..0b4e3f77 --- /dev/null +++ b/paddle/fluid/operators/cos_sim_op.h @@ -0,0 +1,139 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/cos_sim_functor.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class CosSimKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + // get Tensor + auto* in_x = context.Input("X"); + auto* in_y = context.Input("Y"); + auto* out_z = context.Output("Out"); + auto* out_x_norm = context.Output("XNorm"); + auto* out_y_norm = context.Output("YNorm"); + + int rows_x = in_x->dims()[0]; + int rows_y = in_y->dims()[0]; + out_z->Resize({rows_x, 1}); + out_x_norm->Resize({rows_x, 1}); + out_y_norm->Resize({rows_y, 1}); + out_z->mutable_data(context.GetPlace()); + out_x_norm->mutable_data(context.GetPlace()); + out_y_norm->mutable_data(context.GetPlace()); + out_z->set_lod(in_x->lod()); + + int cols = framework::product(in_x->dims()) / rows_x; + + if (rows_x == rows_y) { + math::CosSimFunctor functor( + in_x->data(), in_y->data(), out_x_norm->data(), + out_y_norm->data(), out_z->data(), cols); + platform::ForRange for_range( + static_cast(context.device_context()), rows_x); + for_range(functor); + } else { + math::CosSimFunctor functor( + in_x->data(), in_y->data(), out_x_norm->data(), + out_y_norm->data(), out_z->data(), cols); + platform::ForRange for_range( + static_cast(context.device_context()), rows_x); + for_range(functor); + } + } +}; + +template +class CosSimGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + // get Tensor + auto* in_x = context.Input("X"); + auto* in_y = context.Input("Y"); + auto* in_z = context.Input("Out"); + auto* in_x_norm = context.Input("XNorm"); + auto* in_y_norm = context.Input("YNorm"); + auto* out_grad_x = context.Output(framework::GradVarName("X")); + auto* out_grad_y = context.Output(framework::GradVarName("Y")); + auto* in_grad_z = context.Input(framework::GradVarName("Out")); + + // compute gradident + int rows_x = in_x->dims()[0]; + int rows_y = in_y->dims()[0]; + int cols = framework::product(in_x->dims()) / rows_x; + + if (rows_x == rows_y) { + if (out_grad_x) { + out_grad_x->Resize(in_x->dims()); + math::CosSimGradFunctor functor( + in_x_norm->data(), in_y_norm->data(), in_x->data(), + in_y->data(), in_z->data(), in_grad_z->data(), + out_grad_x->mutable_data(context.GetPlace()), cols); + platform::ForRange for_range( + static_cast(context.device_context()), + rows_x); + for_range(functor); + } + if (out_grad_y) { + out_grad_y->Resize(in_y->dims()); + math::CosSimGradFunctor functor( + in_y_norm->data(), in_x_norm->data(), in_y->data(), + in_x->data(), in_z->data(), in_grad_z->data(), + out_grad_y->mutable_data(context.GetPlace()), cols); + platform::ForRange for_range( + static_cast(context.device_context()), + rows_x); + for_range(functor); + } + } else { + if (out_grad_x) { + out_grad_x->Resize(in_x->dims()); + math::CosSimDxFunctor functor( + in_x_norm->data(), in_y_norm->data(), in_x->data(), + in_y->data(), in_z->data(), in_grad_z->data(), + out_grad_x->mutable_data(context.GetPlace()), cols); + platform::ForRange for_range( + static_cast(context.device_context()), + rows_x); + for_range(functor); + } + if (out_grad_y) { + out_grad_y->Resize(in_y->dims()); + out_grad_y->mutable_data(context.GetPlace()); + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + set_zero(dev_ctx, out_grad_y, static_cast(0)); + + math::CosSimDyFunctor functor; + functor(dev_ctx, in_x_norm->data(), in_y_norm->data(), + in_x->data(), in_y->data(), in_z->data(), + in_grad_z->data(), static_cast(rows_x), + static_cast(cols), out_grad_y->data()); + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/crf_decoding_op.cc b/paddle/fluid/operators/crf_decoding_op.cc new file mode 100644 index 00000000..c701e895 --- /dev/null +++ b/paddle/fluid/operators/crf_decoding_op.cc @@ -0,0 +1,137 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/crf_decoding_op.h" + +namespace paddle { +namespace operators { +class CRFDecodingOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Emission", + "(LoDTensor, default: LoDTensor). A LoDTensor with shape " + "[N x D] where N is the size of the mini-batch and D is the total " + "tag number. This input is the unscaled emission weight matrix of " + "the linear_chain_crf operator."); + AddInput( + "Transition", + "(Tensor, default: Tensor). A Tensor with shape [(D + 2) x D]. " + "This input is the transition weights learned by the linear_chain_crf " + "operator, denoted as w. The 1st row of w are transition weights for " + "the start mask. The 2nd row of w are transition weights for the end " + "mask. Transition weights between other tags begin from the 3rd row of " + "w. See more details in comments of the linear_chain_crf operator."); + AddInput( + "Label", + "(LoDTensor, LoDTensor). The ground truth with shape " + "[N x 1]. This input is optional. See more details in the operator's " + "comments.") + .AsDispensable(); + AddOutput( + "ViterbiPath", + "(LoDTensor, LoDTensor). The decoding results. What to " + "return changes depending on whether the Input(Label) (the ground " + "truth) is given. See more details in the operator's comment."); + AddComment(R"DOC( +The crf_decoding operator reads the emission feature weights and the transition +feature weights learned by the linear_chain_crf operator. It implements the +Viterbi algorithm which is a dynamic programming algorithm for finding the most +likely sequence of hidden states, called the Viterbi path, that results in a +sequence of observed tags. + +The output of this operator changes according to whether Input(Label) is given: + +1. Input(Label) is given: + This happens in training. This operator is used to co-work with the chunk_eval + operator. + When Input(Label) is given, the crf_decoding operator returns a row vector + with shape [N x 1] whose values are fixed to be 0, indicating an incorrect + prediction, or 1 indicating a tag is correctly predicted. Such an output is the + input to chunk_eval operator. + +2. Input(Label) is not given: + This is the standard decoding process. + +The crf_decoding operator returns a row vector with shape [N x 1] whose values +range from 0 to maximum tag number - 1, Each element indicates an index of a +predicted tag. +)DOC"); + } +}; + +class CRFDecodingOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Emission"), + "Input(Emission) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Transition"), + "Input(Transition) should be not null."); + + PADDLE_ENFORCE(ctx->HasOutput("ViterbiPath"), + "Output(ViterbiPath) should be not null."); + + auto emission_dims = ctx->GetInputDim("Emission"); + PADDLE_ENFORCE_EQ(emission_dims.size(), 2, + "The Input(Emission) should be a 2-D tensor."); + PADDLE_ENFORCE(emission_dims[0], "An empty mini-batch is not allowed."); + + auto transition_dims = ctx->GetInputDim("Transition"); + PADDLE_ENFORCE_EQ(transition_dims.size(), 2, + "The Input(Transition) should be a 2-D tensor."); + PADDLE_ENFORCE_EQ( + transition_dims[0] - 2, transition_dims[1], + "An invalid dimension for the Input(Transition), which should " + "be a 2-D tensor with shape [(D + 2) x D]."); + if (ctx->IsRuntime() || (emission_dims[1] > 0 && transition_dims[1] > 0)) { + PADDLE_ENFORCE_EQ( + emission_dims[1], transition_dims[1], + "The 2nd dimension of the Input(Emission) and the Input(Transition) " + "should be equal to the tag number."); + } + if (ctx->HasInput("Label")) { + auto label_dims = ctx->GetInputDim("Label"); + PADDLE_ENFORCE(label_dims.size() == 2UL && label_dims[1] == 1UL, + "The Input(Label) should be a 2-D tensor with the 2nd " + "dimensions fixed to 1."); + if (ctx->IsRuntime() || (emission_dims[0] > 0 && label_dims[0] > 0)) { + PADDLE_ENFORCE_EQ( + emission_dims[0], label_dims[0], + "The height of Input(Emission) and the height of Input(Label) " + "should be the same."); + } + } + + ctx->ShareLoD("Emission", /*->*/ "ViterbiPath"); + ctx->SetOutputDim("ViterbiPath", {emission_dims[0], 1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("Emission")->type(), + platform::CPUPlace()); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(crf_decoding, ops::CRFDecodingOp, + ops::CRFDecodingOpMaker); +REGISTER_OP_CPU_KERNEL( + crf_decoding, + ops::CRFDecodingOpKernel, + ops::CRFDecodingOpKernel); diff --git a/paddle/fluid/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h new file mode 100644 index 00000000..13a587dc --- /dev/null +++ b/paddle/fluid/operators/crf_decoding_op.h @@ -0,0 +1,107 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/jit/kernels.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using framework::LoDTensor; +using framework::LoD; +using framework::Tensor; + +template +class CRFDecodingOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* emission_weights = ctx.Input("Emission"); + auto* transition_weights = ctx.Input("Transition"); + auto* label = ctx.Input("Label"); + auto* decoded_path = ctx.Output("ViterbiPath"); + + PADDLE_ENFORCE_EQ(emission_weights->NumLevels(), 1UL, + "The Input(Emission) should be a sequence."); + auto lod = emission_weights->lod(); + PADDLE_ENFORCE(lod.size(), "Input(Emission) must be a sequence."); + const size_t level = 0; + const size_t seq_num = lod[level].size() - 1; + + int64_t* path = decoded_path->mutable_data(platform::CPUPlace()); + math::SetConstant()( + ctx.template device_context(), decoded_path, 0); + for (size_t i = 0; i < seq_num; ++i) { + if (lod[level][i] == lod[level][i + 1]) continue; + int start_pos = static_cast(lod[level][i]); + int end_pos = static_cast(lod[level][i + 1]); + Tensor decoded_path_one_seq = decoded_path->Slice(start_pos, end_pos); + Decode(emission_weights->Slice(start_pos, end_pos), *transition_weights, + &decoded_path_one_seq); + } + + if (label) { + PADDLE_ENFORCE_EQ(label->NumLevels(), 1UL, + "The Input(Label) should be a sequence."); + const int64_t* label_value = label->data(); + size_t batch_size = emission_weights->dims()[0]; + for (size_t i = 0; i < batch_size; ++i) { + path[i] = label_value[i] == path[i] ? 1 : 0; + } + } + } + + private: + void Decode(const Tensor& emission_weights, const Tensor& transition_weights, + Tensor* decoded_path) const { + auto emission_dims = emission_weights.dims(); + const size_t seq_len = emission_dims[0]; + const size_t tag_num = emission_dims[1]; + const T* x = emission_weights.data(); + const T* w = transition_weights.data(); + int64_t* path = decoded_path->data(); + + // alpha is a memo table. An element alpha(k, v) records the score of the + // best sequence of tags from position 1 to position k with v being the end + // tag. + Tensor alpha; + T* alpha_value = alpha.mutable_data(emission_dims, platform::CPUPlace()); + Tensor track; + int* track_value = + track.mutable_data(emission_dims, platform::CPUPlace()); + auto ker = + jit::KernelFuncs, platform::CPUPlace>::Cache() + .At(tag_num); + ker(static_cast(seq_len), x, w, alpha_value, track_value, tag_num); + T max_score = -std::numeric_limits::max(); + int max_i = 0; + for (size_t i = 0; i < tag_num; ++i) { + T score = alpha_value[(seq_len - 1) * tag_num + i] + w[tag_num + i]; + if (score > max_score) { + max_score = score; + max_i = i; + } + } + path[seq_len - 1] = max_i; + for (int k = seq_len - 1; k >= 1; --k) { + path[k - 1] = max_i = track_value[k * tag_num + max_i]; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc new file mode 100644 index 00000000..78fcd07e --- /dev/null +++ b/paddle/fluid/operators/crop_op.cc @@ -0,0 +1,212 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/crop_op.h" +#include +#include +#include + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class CropOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of CropOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of CropOp should not be null."); + auto x_dim = ctx->GetInputDim("X"); + if (!ctx->HasInput("Y")) { + auto shape = ctx->Attrs().Get>("shape"); + PADDLE_ENFORCE_EQ( + int64_t(shape.size()), x_dim.size(), + "Shape size should be equal to dimention size of input tensor."); + std::vector tensor_shape(shape.size()); + for (size_t i = 0; i < shape.size(); ++i) { + tensor_shape[i] = static_cast(shape[i]); + } + ctx->SetOutputDim("Out", framework::make_ddim(tensor_shape)); + } else { + auto y_dim = ctx->GetInputDim("Y"); + PADDLE_ENFORCE_EQ(framework::arity(x_dim), framework::arity(y_dim), + "Tensor rank of both CropOp's " + "inputs must be same."); + ctx->SetOutputDim("Out", y_dim); + } + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); + } +}; + +class CropOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "The input of pad op. " + "The input should be a k-D tensor(k > 0 and k < 7)."); + AddInput("Y", + "The input used as reference for cropping, " + "which is of the same dimensions as X.") + .AsDispensable(); + AddInput("Offsets", + "The input used to describe offsets in runtime, which is a " + "1-D vector whose size equals to the rank of input 'X'. The " + "elements data type must be int.") + .AsDispensable(); + AddOutput("Out", + "The output of crop op, " + "which is of the same dimensions as X."); + AddAttr>("offsets", + "A list describing offsets to be cropped. " + "The size of offsets list should be the same as " + "the dimension size of input X.") + .SetDefault(std::vector()); + AddAttr>("shape", + "A list describing the shape of output. " + "The size of shape list should be the same as " + "the dimension size of input X.") + .SetDefault(std::vector()); + AddComment(R"DOC( +Crop Operator. + +Crop input into output, as specified by offsets and shape. + +There are two ways to set the offsets: +1. In runtime: Using the input 'Offsets', which is a Vairbale and can be + output of other operators. This way is suitable for + dynamic offsets. +2. In network configuration: Using the attribute 'offsets', which will be + set in Python configure script. This way is + suitable for fixed offsets. +You CANNOT use these two ways at the same time. An exception will be raised +if input 'Offset' is configured and meanwhile the attribute 'offsets' is +not empty. + +There are two ways to set shape: +1. reference input: crop input X into the same shape as reference input. + The dimension of reference input should + be the same as the dimension of input X. +2. shape list: crop input X into the shape described by a list. + The size of shape list should be the same as + the dimension size of input X. + +The input should be a k-D tensor(k > 0 and k < 7). As an example: + +Case 1: +Given + + X = [[0, 1, 2, 0, 0] + [0, 3, 4, 0, 0] + [0, 0, 0, 0, 0]], + +and + + offsets = [0, 1], + +and + + shape = [2, 2], + +we get: + + Out = [[1, 2], + [3, 4]]. + + +Case 2: +Given + + X = [[0, 1, 2, 5, 0] + [0, 3, 4, 6, 0] + [0, 0, 0, 0, 0]], + +and + + offsets = [0, 1], + +and + + Y = [[0, 0, 0] + [0, 0, 0]], + +we get: + + Out = [[1, 2, 5], + [3, 4, 6]]. +)DOC"); + } +}; + +class CropOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto x_dims = ctx->GetInputDim("X"); + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.device_context()); + } +}; + +class CropGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("crop_grad"); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetInput("X", Input("X")); + if (ForwardOp().Inputs().count("Offsets") > 0) { + op->SetInput("Offsets", Input("Offsets")); + } + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(crop, ops::CropOp, ops::CropOpMaker, + ops::CropGradOpDescMaker); +REGISTER_OPERATOR(crop_grad, ops::CropOpGrad); +REGISTER_OP_CPU_KERNEL( + crop, ops::CropKernel); +REGISTER_OP_CPU_KERNEL( + crop_grad, ops::CropGradKernel); diff --git a/paddle/fluid/operators/crop_op.cu b/paddle/fluid/operators/crop_op.cu new file mode 100644 index 00000000..66cb5c45 --- /dev/null +++ b/paddle/fluid/operators/crop_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/crop_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + crop, ops::CropKernel); +REGISTER_OP_CUDA_KERNEL( + crop_grad, ops::CropGradKernel); diff --git a/paddle/fluid/operators/crop_op.h b/paddle/fluid/operators/crop_op.h new file mode 100644 index 00000000..cfc2cac7 --- /dev/null +++ b/paddle/fluid/operators/crop_op.h @@ -0,0 +1,175 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/strided_memcpy.h" + +namespace paddle { +namespace operators { // Internal + +template +using EigenTensor = framework::EigenTensor; +using framework::Tensor; + +static std::vector GetOffsets(const framework::ExecutionContext& ctx) { + std::vector res; + int rank = ctx.Input("X")->dims().size(); + if (ctx.HasInput("Offsets")) { + PADDLE_ENFORCE(ctx.Attr>("offsets").empty(), + "Input 'Offsets' and attribute 'offsets' should not be used " + "at the same time."); + const auto* offsets_tensor = ctx.Input("Offsets"); + PADDLE_ENFORCE_EQ(offsets_tensor->dims().size(), 1); + PADDLE_ENFORCE_EQ( + rank, offsets_tensor->dims()[0], + "Offsets size should be equal to dimension size of input tensor."); + const int* offsets_data; + framework::Tensor cpu_tmp_tensor; + if (platform::is_cpu_place(offsets_tensor->place())) { + offsets_data = offsets_tensor->data(); + } else { + framework::TensorCopySync(*offsets_tensor, platform::CPUPlace(), + &cpu_tmp_tensor); + offsets_data = cpu_tmp_tensor.data(); + } + res = std::vector(offsets_data, offsets_data + rank); + } else { + res = ctx.Attr>("offsets"); + PADDLE_ENFORCE_EQ( + rank, static_cast(res.size()), + "Offsets size should be equal to dimension size of input tensor."); + } + return res; +} + +template +void CropFunction(const framework::ExecutionContext& context) { + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + auto out_dims = out->dims(); + if (out_dims[0] == -1) { + out_dims[0] = x->dims()[0]; + } + out->mutable_data(out_dims, context.GetPlace()); + auto x_stride = framework::stride(x->dims()); + auto offsets = GetOffsets(context); + int64_t offset = 0; + for (size_t i = 0; i < offsets.size(); ++i) { + offset += (x_stride[i] * offsets[i]); + } + + auto x_tensor = EigenTensor::From(*x); + auto out_tensor = EigenTensor::From(*out); + Eigen::array e_offsets; + Eigen::array e_shape; + for (size_t i = 0; i < D; ++i) { + e_offsets[i] = offsets[i]; + e_shape[i] = out->dims()[i]; + } + auto& place = + *context.template device_context().eigen_device(); + out_tensor.device(place) = x_tensor.slice(e_offsets, e_shape); +} + +template +class CropKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + int rank = context.Input("X")->dims().size(); + switch (rank) { + case 1: + CropFunction(context); + break; + case 2: + CropFunction(context); + break; + case 3: + CropFunction(context); + break; + case 4: + CropFunction(context); + break; + case 5: + CropFunction(context); + break; + case 6: + CropFunction(context); + break; + default: + PADDLE_THROW( + "CropOp only support tensors with no more than 6 dimensions."); + } + } +}; + +template +void CropGradFunction(const framework::ExecutionContext& context) { + auto* d_x = context.Output(framework::GradVarName("X")); + auto* x = context.Input("X"); + if (d_x != nullptr) { + auto* d_out = context.Input(framework::GradVarName("Out")); + d_x->mutable_data(x->dims(), context.GetPlace()); + auto offsets = GetOffsets(context); + Eigen::array, D> paddings; + for (size_t i = 0; i < D; ++i) { + paddings[i].first = offsets[i]; + paddings[i].second = d_x->dims()[i] - d_out->dims()[i] - offsets[i]; + } + auto d_x_tensor = EigenTensor::From(*d_x); + auto d_out_tensor = EigenTensor::From(*d_out); + d_x_tensor.device( + *context.template device_context().eigen_device()) = + d_out_tensor.pad(paddings, 0); + } +} + +template +class CropGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + size_t rank = + context.Input(framework::GradVarName("Out"))->dims().size(); + switch (rank) { + case 1: + CropGradFunction(context); + break; + case 2: + CropGradFunction(context); + break; + case 3: + CropGradFunction(context); + break; + case 4: + CropGradFunction(context); + break; + case 5: + CropGradFunction(context); + break; + case 6: + CropGradFunction(context); + break; + default: + PADDLE_THROW( + "CropOp only support tensors with no more than 6 dimensions."); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc new file mode 100644 index 00000000..da2c74b0 --- /dev/null +++ b/paddle/fluid/operators/cross_entropy_op.cc @@ -0,0 +1,389 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/cross_entropy_op.h" +#include +#include +#include + +namespace paddle { +namespace operators { + +class CrossEntropyOpBase : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); + + PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null."); + + auto x_dims = ctx->GetInputDim("X"); + auto label_dims = ctx->GetInputDim("Label"); + int rank = x_dims.size(); + PADDLE_ENFORCE_EQ(rank, label_dims.size(), + "Input(X) and Input(Label) shall have the same rank."); + bool contain_unknown_dim = framework::contain_unknown_dim(x_dims) || + framework::contain_unknown_dim(label_dims); + bool check = ctx->IsRuntime() || !contain_unknown_dim; + if (check) { + PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), + framework::slice_ddim(label_dims, 0, rank - 1), + "Input(X) and Input(Label) shall have the same shape " + "except the last dimension."); + } + + if (IsSoftLabel(ctx)) { + if (check) { + PADDLE_ENFORCE_EQ(x_dims[rank - 1], label_dims[rank - 1], + "If Attr(soft_label) == true, the last dimension of " + "Input(X) and Input(Label) should be equal."); + } + } else { + PADDLE_ENFORCE_EQ(label_dims[rank - 1], 1UL, + "If Attr(softLabel) == false, the last dimension of " + "Input(Label) should be 1."); + } + + auto y_dims = x_dims; + y_dims[rank - 1] = 1; + ctx->SetOutputDim("Y", y_dims); + ctx->ShareLoD("X", /*->*/ "Y"); + } + + protected: + // Explicitly set that the data type of computation kernel of cross_entropy + // is determined by its input "X". + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); + } + + virtual bool IsSoftLabel(framework::InferShapeContext* ctx) const { + return ctx->Attrs().Get("soft_label"); + } +}; + +class CrossEntropyGradientOpBase : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), + "Input(Y@GRAD) shoudl be not null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@GRAD) should be not null."); + + auto x_dims = GetXDim(ctx); + auto label_dims = ctx->GetInputDim("Label"); + auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y")); + int rank = x_dims.size(); + PADDLE_ENFORCE_EQ(dy_dims.size(), rank, + "Input(Y@Grad) and Input(X) should have the same rank."); + PADDLE_ENFORCE_EQ(label_dims.size(), rank, + "Input(Label) and Input(X) should have the same rank."); + + bool check = true; + if ((!ctx->IsRuntime()) && (framework::product(x_dims) <= 0 || + framework::product(label_dims) <= 0)) { + check = false; + } + + if (check) { + PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), + framework::slice_ddim(label_dims, 0, rank - 1), + "The Input(X) and Input(Label) should have the same " + "shape except the last dimension."); + PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), + framework::slice_ddim(dy_dims, 0, rank - 1), + "The Input(X) and Input(Y@Grad) should have the same " + "shape except the last dimension."); + } + if (IsSoftLabel(ctx)) { + if (check) { + PADDLE_ENFORCE_EQ( + x_dims[rank - 1], label_dims[rank - 1], + "When Attr(soft_label) == true, the last dimension of " + "Input(X) and Input(Label) should be equal."); + } + } else { + PADDLE_ENFORCE_EQ(label_dims[rank - 1], 1, + "When Attr(soft_label) == false, the last dimension of " + "Input(Label) should be 1."); + } + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + PADDLE_ENFORCE_EQ(dy_dims[rank - 1], 1, + "The last dimension of Input(Y@Grad) should be 1."); + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + ctx->ShareLoD(VarNameWithXLoD(), framework::GradVarName("X")); + } + + protected: + // Explicitly set that the data type of computation kernel of cross_entropy + // is determined by its input "X". + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Y"))->type(), + ctx.device_context()); + } + + virtual framework::DDim GetXDim(framework::InferShapeContext* ctx) const { + return ctx->GetInputDim("X"); + } + + virtual const char* VarNameWithXLoD() const { return "X"; } + + virtual bool IsSoftLabel(framework::InferShapeContext* ctx) const { + return ctx->Attrs().Get("soft_label"); + } +}; + +class CrossEntropyOpInferVarType + : public framework::PassInDtypeAndVarTypeToOutput { + protected: + std::unordered_map GetInputOutputWithSameType() + const override { + return std::unordered_map{{"X", /*->*/ "Y"}}; + } +}; + +class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor, default Tensor), a tensor whose last dimension " + "size is equal to the number of classes. This input is a " + "probability computed by the previous operator, which is almost " + "always the result of a softmax operator."); + AddInput( + "Label", + "(Tensor), the tensor which represents the ground truth. It has the " + "same shape with 'X' except the last dimension. When soft_label is set " + "to false, the last dimension size is 1; when soft_label is set to " + "true, the last dimension size is equal to the number of classes."); + AddOutput("Y", + "(Tensor, default Tensor), a tensor whose shape is same " + "with 'X' except that the last dimension size is 1. It " + "represents the cross entropy loss."); + AddAttr("soft_label", + "(bool, default false), a flag indicating whether to " + "interpretate the given labels as soft labels.") + .SetDefault(false); + AddAttr("ignore_index", + "(int, default -100), Specifies a target value that is" + "ignored and does not contribute to the input gradient." + "Only valid if soft_label is set to False") + .SetDefault(-100); + AddComment(R"DOC( +CrossEntropy Operator. + +The input 'X' and 'Label' will first be logically flattened to 2-D matrixs. +The matrix's second dimension(row length) is as same as the original last +dimension, and the first dimension(column length) is the product of all other +original dimensions. Then the softmax computation will take palce on each raw +of flattened matrixs. + +It supports both standard cross-entropy and soft-label cross-entropy loss +computation. +1) One-hot cross-entropy: + soft_label = false, Label[i, 0] indicates the class index for sample i: + + $Y[i] = -\log(X[i, Label[i]])$ + +2) Soft-label cross-entropy: + soft_label = true, Label[i, j] indicates the soft label of class j + for sample i: + + $Y[i] = \sum_j{-Label[i, j] * log(X[i, j])}$ + + Please make sure that in this case the summuation of each row of Label + equals one. + +3) One-hot cross-entropy with vecterized Input(Label): + As a special case of 2), when each row of Input(Label) has only one + non-zero element (equals 1), soft-label cross-entropy degenerates to a + one-hot cross-entropy with one-hot label representation. + +Both the input X and Label can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD information with input X. + +)DOC"); + } +}; + +class CrossEntropyGradientOp : public CrossEntropyGradientOpBase { + public: + using CrossEntropyGradientOpBase::CrossEntropyGradientOpBase; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + CrossEntropyGradientOpBase::InferShape(ctx); + } +}; + +class CrossEntropyGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("cross_entropy_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Label", Input("Label")); + op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +class CrossEntropyOp2 : public CrossEntropyOpBase { + public: + using CrossEntropyOpBase::CrossEntropyOpBase; + + void InferShape(framework::InferShapeContext* ctx) const override { + CrossEntropyOpBase::InferShape(ctx); + + PADDLE_ENFORCE(ctx->HasOutput("XShape"), + "Output(XShape) should be not null."); + + PADDLE_ENFORCE(ctx->HasOutput("MatchX"), + "Output(MatchX) should be not null."); + auto x_dims = ctx->GetInputDim("X"); + auto x_dims_vec = framework::vectorize(x_dims); + x_dims_vec.push_back(0); + ctx->SetOutputDim("XShape", framework::make_ddim(x_dims_vec)); + x_dims[x_dims.size() - 1] = 1; + ctx->SetOutputDim("MatchX", x_dims); + ctx->ShareLoD("X", /*->*/ "XShape"); + } + + protected: + bool IsSoftLabel(framework::InferShapeContext* ctx) const override { + return false; + } +}; + +class CrossEntropyGradientOp2 : public CrossEntropyGradientOpBase { + public: + using CrossEntropyGradientOpBase::CrossEntropyGradientOpBase; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("MatchX"), "Input(MatchX) must exist"); + CrossEntropyGradientOpBase::InferShape(ctx); + } + + protected: + virtual framework::DDim GetXDim(framework::InferShapeContext* ctx) const { + auto x_shape = ctx->GetInputDim("XShape"); + return framework::DDim(x_shape.Get(), x_shape.size() - 1); + } + + virtual const char* VarNameWithXLoD() const { return "XShape"; } + + virtual bool IsSoftLabel(framework::InferShapeContext* ctx) const { + return false; + } +}; + +class CrossEntropyOpMaker2 : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor, default Tensor), a tensor whose last dimension " + "size is equal to the number of classes. This input is a " + "probability computed by the previous operator, which is almost " + "always the result of a softmax operator."); + AddInput( + "Label", + "(Tensor), the tensor which represents the ground truth. It has the " + "same shape with 'X' except the last dimension. One hot Tensor."); + AddOutput("Y", + "(Tensor, default Tensor), a tensor whose shape is same " + "with 'X' except that the last dimension size is 1. It " + "represents the cross entropy loss."); + AddOutput("XShape", "Temporaily variable to save shape and LoD of X."); + AddOutput("MatchX", + "X value that matches label, used for gradient computation."); + AddAttr("ignore_index", + "(int, default -100), Specifies a target value that is" + "ignored and does not contribute to the input gradient." + "Only valid if soft_label is set to False") + .SetDefault(-100); + AddComment(R"DOC( +Hard-label CrossEntropy Operator. + +The input 'X' and 'Label' will first be logically flattened to 2-D matrixs. +The matrix's second dimension(row length) is as same as the original last +dimension, and the first dimension(column length) is the product of all other +original dimensions. Then the softmax computation will take palce on each raw +of flattened matrixs. + +Only support hard label. + +Both the input X and Label can carry the LoD (Level of Details) information, +or not. But the output only shares the LoD information with input X. + +)DOC"); + } +}; + +class CrossEntropyGradOpDescMaker2 : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("cross_entropy_grad2"); + op->SetInput("Label", Input("Label")); + op->SetInput("MatchX", Output("MatchX")); + op->SetInput("XShape", Output("XShape")); + op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CPUCtx = paddle::platform::CPUDeviceContext; + +REGISTER_OPERATOR(cross_entropy, ops::CrossEntropyOpBase, + ops::CrossEntropyOpMaker, ops::CrossEntropyOpInferVarType, + ops::CrossEntropyGradOpDescMaker); +REGISTER_OPERATOR(cross_entropy_grad, ops::CrossEntropyGradientOp); +REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel, + ops::CrossEntropyOpKernel); +REGISTER_OP_CPU_KERNEL(cross_entropy_grad, + ops::CrossEntropyGradientOpKernel, + ops::CrossEntropyGradientOpKernel); + +REGISTER_OPERATOR(cross_entropy2, ops::CrossEntropyOp2, + ops::CrossEntropyOpMaker2, ops::CrossEntropyOpInferVarType, + ops::CrossEntropyGradOpDescMaker2); +REGISTER_OPERATOR(cross_entropy_grad2, ops::CrossEntropyGradientOp2); +REGISTER_OP_CPU_KERNEL(cross_entropy2, + ops::CrossEntropyOpKernel2, + ops::CrossEntropyOpKernel2); +REGISTER_OP_CPU_KERNEL(cross_entropy_grad2, + ops::CrossEntropyGradientOpKernel2, + ops::CrossEntropyGradientOpKernel2); diff --git a/paddle/fluid/operators/cross_entropy_op.cu b/paddle/fluid/operators/cross_entropy_op.cu new file mode 100644 index 00000000..243e7f52 --- /dev/null +++ b/paddle/fluid/operators/cross_entropy_op.cu @@ -0,0 +1,39 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/cross_entropy_op.h" +#include "paddle/fluid/platform/float16.h" + +namespace plat = paddle::platform; +namespace ops = paddle::operators; +using CUDACtx = paddle::platform::CUDADeviceContext; +REGISTER_OP_CUDA_KERNEL(cross_entropy, + ops::CrossEntropyOpKernel, + ops::CrossEntropyOpKernel, + ops::CrossEntropyOpKernel); + +REGISTER_OP_CUDA_KERNEL( + cross_entropy_grad, ops::CrossEntropyGradientOpKernel, + ops::CrossEntropyGradientOpKernel, + ops::CrossEntropyGradientOpKernel); + +REGISTER_OP_CUDA_KERNEL(cross_entropy2, + ops::CrossEntropyOpKernel2, + ops::CrossEntropyOpKernel2, + ops::CrossEntropyOpKernel2); + +REGISTER_OP_CUDA_KERNEL( + cross_entropy_grad2, ops::CrossEntropyGradientOpKernel2, + ops::CrossEntropyGradientOpKernel2, + ops::CrossEntropyGradientOpKernel2); diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h new file mode 100644 index 00000000..309ba46c --- /dev/null +++ b/paddle/fluid/operators/cross_entropy_op.h @@ -0,0 +1,264 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math.h" +#include "paddle/fluid/operators/math/cross_entropy.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class CrossEntropyOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* labels = ctx.Input("Label"); + auto* y = ctx.Output("Y"); + y->mutable_data(ctx.GetPlace()); + + int rank = x->dims().size(); + Tensor x_2d = framework::ReshapeToMatrix(*x, rank - 1); + Tensor labels_2d = framework::ReshapeToMatrix(*labels, rank - 1); + Tensor y_2d = framework::ReshapeToMatrix(*y, rank - 1); + + int axis_dim = x->dims()[rank - 1]; + math::CrossEntropyFunctor()( + ctx.template device_context(), &y_2d, &x_2d, &labels_2d, + ctx.Attr("soft_label"), ctx.Attr("ignore_index"), axis_dim); + } +}; + +template +class XeSoftlabelGradFunctor { + public: + XeSoftlabelGradFunctor(T* dx, + const T* dy, // NOLINT + const T* x, // NOLINT + const T* label, // NOLINT + size_t num_classes) + : dx_(dx), dy_(dy), x_(x), label_(label), num_classes_(num_classes) {} + + HOSTDEVICE void operator()(size_t i) { + auto row_ids = i / num_classes_; + dx_[i] = -label_[i] * dy_[row_ids] / x_[i]; + } + + private: + T* dx_; + const T* dy_; + const T* x_; + const T* label_; + size_t num_classes_; +}; + +template +class XeGradFunctor { + public: + XeGradFunctor(T* dx, + const T* dy, // NOLINT + const T* x, // NOLINT + const int64_t* label, // NOLINT + size_t num_classes, size_t ignore_index) + : dx_(dx), + dy_(dy), + x_(x), + label_(label), + num_classes_(num_classes), + ignore_index_(ignore_index) {} + + HOSTDEVICE void operator()(size_t sample_id) { + auto x_is_true_offset = sample_id * num_classes_ + label_[sample_id]; + for (size_t x_offset = sample_id * num_classes_; + x_offset < (sample_id + 1) * num_classes_; ++x_offset) { + dx_[x_offset] = (x_offset != x_is_true_offset || + label_[sample_id] == static_cast(ignore_index_)) + ? static_cast(0) + : -dy_[sample_id] / x_[x_offset]; + } + } + + private: + T* dx_; + const T* dy_; + const T* x_; + const int64_t* label_; + size_t num_classes_; + size_t ignore_index_; +}; + +template +class CrossEntropyGradientOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* dy = ctx.Input(framework::GradVarName("Y")); + auto* label = ctx.Input("Label"); + auto* dx = ctx.Output(framework::GradVarName("X")); + T* dx_data = dx->mutable_data(ctx.GetPlace()); + + // Following computation only depends on the last dimension size. So it's + // unnecessary to convert tensors to 2-D views. + int rank = x->dims().size(); + int64_t class_num = x->dims()[rank - 1]; + int64_t ignore_index = ctx.Attr("ignore_index"); + if (ctx.Attr("soft_label")) { + XeSoftlabelGradFunctor functor(dx_data, dy->data(), x->data(), + label->data(), + static_cast(class_num)); + platform::ForRange for_range( + ctx.template device_context(), + static_cast(dx->numel())); + for_range(functor); + } else { + XeGradFunctor functor( + dx_data, dy->data(), x->data(), label->data(), + static_cast(class_num), static_cast(ignore_index)); + platform::ForRange for_range( + ctx.template device_context(), + static_cast(dy->numel())); + for_range(functor); + } + } +}; + +template +struct HardLabelCrossEntropyForwardFunctor { + HardLabelCrossEntropyForwardFunctor(const T* x, T* y, T* match_x, + const int64_t* label, + int64_t ignore_index, + int64_t feature_size) + : x_(x), + y_(y), + match_x_(match_x), + label_(label), + ignore_index_(ignore_index), + feature_size_(feature_size) {} + + HOSTDEVICE void operator()(int64_t idx) const { + auto label = label_[idx]; + if (label != ignore_index_) { + PADDLE_ASSERT_MSG(label >= 0 && label < feature_size_, + "The label is out of the range.", label); + auto match_x = x_[idx * feature_size_ + label]; + y_[idx] = -math::TolerableValue()(real_log(match_x)); + match_x_[idx] = match_x; + } else { + y_[idx] = 0; + match_x_[idx] = 0; // any value is ok + } + } + + const T* x_; + T* y_; + T* match_x_; + const int64_t* label_; + int64_t ignore_index_; + int64_t feature_size_; +}; + +template +struct HardLabelCrossEntropyBackwardFunctor { + HardLabelCrossEntropyBackwardFunctor(T* dx, const T* dy, const T* match_x, + const int64_t* label, + int64_t ignore_index, + int64_t feature_size) + : dx_(dx), + dy_(dy), + match_x_(match_x), + label_(label), + ignore_index_(ignore_index), + feature_size_(feature_size) {} + + HOSTDEVICE void operator()(int64_t idx) const { + auto row_idx = idx / feature_size_; + auto col_idx = idx % feature_size_; + auto label = label_[row_idx]; + if (label == col_idx && label != ignore_index_) { + dx_[idx] = -dy_[row_idx] / match_x_[row_idx]; + } else { + dx_[idx] = 0; + } + } + + T* dx_; + const T* dy_; + const T* match_x_; + const int64_t* label_; + int64_t ignore_index_; + int64_t feature_size_; +}; + +template +class CrossEntropyOpKernel2 : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* label = ctx.Input("Label"); + auto* y = ctx.Output("Y"); + auto* match_x = ctx.Output("MatchX"); + + auto& x_dims = x->dims(); + auto feature_size = x_dims[x_dims.size() - 1]; + auto batch_size = framework::product(x->dims()) / feature_size; + + auto* p_x = x->data(); + auto* p_label = label->data(); + auto* p_y = y->mutable_data(ctx.GetPlace()); + auto* p_match_x = match_x->mutable_data(ctx.GetPlace()); + + auto ignore_index = ctx.Attr("ignore_index"); + + platform::ForRange for_range( + ctx.template device_context(), batch_size); + for_range(HardLabelCrossEntropyForwardFunctor( + p_x, p_y, p_match_x, p_label, ignore_index, feature_size)); + } +}; + +template +class CrossEntropyGradientOpKernel2 : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Input(framework::GradVarName("Y")); + auto* match_x = ctx.Input("MatchX"); + auto* label = ctx.Input("Label"); + + auto* p_dx = dx->mutable_data(ctx.GetPlace()); + auto* p_dy = dy->data(); + auto* p_match_x = match_x->data(); + auto* p_label = label->data(); + + int64_t ignore_index = ctx.Attr("ignore_index"); + int rank = dx->dims().size(); + int64_t feature_size = dx->dims()[rank - 1]; + int64_t batch_size = framework::product(dx->dims()) / feature_size; + + platform::ForRange for_range( + ctx.template device_context(), + batch_size * feature_size); + for_range(HardLabelCrossEntropyBackwardFunctor( + p_dx, p_dy, p_match_x, p_label, ignore_index, feature_size)); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/ctc_align_op.cc b/paddle/fluid/operators/ctc_align_op.cc new file mode 100644 index 00000000..e7c472f8 --- /dev/null +++ b/paddle/fluid/operators/ctc_align_op.cc @@ -0,0 +1,91 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/ctc_align_op.h" + +namespace paddle { +namespace operators { + +class CTCAlignOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input of CTCAlignOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Output"), + "Output of CTCAlignOp should not be null."); + + auto input_dims = ctx->GetInputDim("Input"); + + // TODO(wanghaoshuang): it is tricky to set the wrong dimension here. + ctx->SetOutputDim("Output", input_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("Input")->type(), + ctx.device_context()); + } +}; + +class CTCAlignOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Input", + "(LodTensor, default: LoDTensor), Its shape is " + "[Lp, 1], where Lp is the sum of all input sequences' length."); + AddOutput("Output", "(Tensor, default: Tensor), The align result."); + AddAttr("blank", + "(int, default: 0), the blank label setted in Connectionist " + "Temporal Classification (CTC) op.") + .SetDefault(0); + AddAttr("merge_repeated", + "(bool, default: true), whether to " + "merge repeated elements between two blanks. ") + .SetDefault(true); + AddComment(R"DOC( +CTCAlign op is used to merge repeated elements between two blanks +and then delete all blanks in sequence. + +Given: + Input.data = [0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, + 6, 0, 0, 7, 7, 7, 0] + Input.dims = {18, 1} + Input.LoD = [[0, 11, 18]] + +And: + blank = 0 + merge_repeated = True + +Then: + Output.data = [1, 2, 4, 4, 5, 6, + 6, 7] + Output.dims = {8, 1} + Output.LoD = [[0, 6, 8]] + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(ctc_align, ops::CTCAlignOp, ops::CTCAlignOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL( + ctc_align, ops::CTCAlignKernel, + ops::CTCAlignKernel); diff --git a/paddle/fluid/operators/ctc_align_op.cu b/paddle/fluid/operators/ctc_align_op.cu new file mode 100644 index 00000000..bbad74e9 --- /dev/null +++ b/paddle/fluid/operators/ctc_align_op.cu @@ -0,0 +1,100 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include "paddle/fluid/operators/ctc_align_op.h" + +namespace paddle { +namespace operators { + +template +__global__ void MergeAndDelCudaKernel(const int64_t num_token, const T* tokens, + const size_t num_seq, size_t* lod0, + const int blank, const int merge_repeated, + size_t* out_lod0, T* output) { + int ouput_idx = 0; + out_lod0[0] = 0; + + for (int i = 0; i < num_seq; ++i) { + T pre_token = -1; + for (int j = lod0[i]; j < lod0[i + 1]; ++j) { + if (tokens[j] != blank && !(merge_repeated && tokens[j] == pre_token)) { + output[ouput_idx] = tokens[j]; + ++ouput_idx; + } + pre_token = tokens[j]; + } + out_lod0[i + 1] = ouput_idx; + } +} + +template +class CTCAlignOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + const size_t level = 0; + auto* input = ctx.Input("Input"); + auto* output = ctx.Output("Output"); + auto input_lod = framework::ToAbsOffset(input->lod()); + + const T* tokens = input->data(); + const int64_t num_tokens = input->dims()[0]; + const size_t num_seq = input_lod[level].size() - 1; + + const int blank = ctx.Attr("blank"); + const int merge_repeated = + static_cast(ctx.Attr("merge_repeated")); + + // prepare a lod to record lod information while merging elements + thrust::device_vector dev_out_lod0(input_lod[level].size()); + size_t* dev_out_lod0_ptr = thrust::raw_pointer_cast(dev_out_lod0.data()); + + // merge elements and delete blank + T* output_data = output->mutable_data({num_tokens, 1}, ctx.GetPlace()); + + auto stream = ctx.cuda_device_context().stream(); + MergeAndDelCudaKernel<<<1, 1, 0, stream>>>( + num_tokens, tokens, num_seq, + input_lod[level].CUDAMutableData(ctx.GetPlace()), blank, merge_repeated, + dev_out_lod0_ptr, output_data); + + // set output lod + std::vector host_out_lod0(dev_out_lod0.begin(), dev_out_lod0.end()); + framework::LoD out_lod; + out_lod.push_back(host_out_lod0); + output->set_lod(out_lod); + + // resize output dims + output->Resize({static_cast(host_out_lod0.back()), 1}); + + if (host_out_lod0.back() == 0) { + output->Resize({1, 1}); + output->mutable_data(ctx.GetPlace()); + math::SetConstant set_constant; + set_constant(ctx.template device_context(), + output, -1); + } + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL(ctc_align, paddle::operators::CTCAlignOpCUDAKernel, + paddle::operators::CTCAlignOpCUDAKernel); diff --git a/paddle/fluid/operators/ctc_align_op.h b/paddle/fluid/operators/ctc_align_op.h new file mode 100644 index 00000000..9c5c6f5a --- /dev/null +++ b/paddle/fluid/operators/ctc_align_op.h @@ -0,0 +1,83 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class CTCAlignKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* output = ctx.Output("Output"); + const size_t level = 0; + auto input_lod = framework::ToAbsOffset(input->lod()); + + // check input dims and lod + auto input_dims = input->dims(); + PADDLE_ENFORCE_EQ(input_dims[0], + static_cast(input_lod[level].back()), + "The first dimension of Input(Input) should be equal to " + "the sum of all sequences' lengths."); + + const size_t num_sequences = input_lod[level].size() - 1; + size_t blank = static_cast(ctx.Attr("blank")); + bool merge_repeated = ctx.Attr("merge_repeated"); + + // merge repeated tokens and delete blank + T* output_data = output->mutable_data(ctx.GetPlace()); + size_t output_idx = 0; + std::vector output_lod0(1, 0); + const T* input_data = input->data(); + for (size_t seq_idx = 0; seq_idx < num_sequences; ++seq_idx) { + T prev_token = -1; + for (size_t i = input_lod[level][seq_idx]; + i < input_lod[level][seq_idx + 1]; ++i) { + if ((unsigned)input_data[i] != blank && + !(merge_repeated && input_data[i] == prev_token)) { + output_data[output_idx] = input_data[i]; + ++output_idx; + } + prev_token = input_data[i]; + } + output_lod0.push_back(output_idx); + } + + // set output lod + framework::LoD output_lod; + output_lod.push_back(output_lod0); + output->set_lod(output_lod); + // resize output dims + output->Resize({static_cast(output_lod0.back()), 1}); + // for empty sequence + if (output_lod0.back() == 0) { + output->Resize({1, 1}); + output_data = output->mutable_data(ctx.GetPlace()); + output_data[0] = -1; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/cudnn_lstm_op.cc b/paddle/fluid/operators/cudnn_lstm_op.cc new file mode 100644 index 00000000..73e04da3 --- /dev/null +++ b/paddle/fluid/operators/cudnn_lstm_op.cc @@ -0,0 +1,247 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class CudnnLSTMOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("W"), + "Input(Weight) of LSTM should not be null."); + + PADDLE_ENFORCE(ctx->HasInput("InitH"), + "Input(init_h) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("InitC"), + "Input(init_c) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Cache"), + "Input(Cache) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("last_h"), + "Output(last_h) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("last_c"), + "Output(last_c) of LSTM should not be null."); + + auto in_dims = ctx->GetInputDim("Input"); + PADDLE_ENFORCE_EQ(in_dims.size(), 3, "Input(X)'s rank must be 3."); + + auto out_dims = in_dims; + auto hidden_size = ctx->Attrs().Get("hidden_size"); + out_dims[2] = hidden_size; + + ctx->SetOutputDim("Out", out_dims); + ctx->SetOutputDim("last_h", ctx->GetInputDim("InitH")); + ctx->SetOutputDim("last_c", ctx->GetInputDim("InitC")); + } +}; + +class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "Input", + "(Tensor) RNN input tensor, which support variable-time length input " + "sequence." + "The shape of the Tensor MUST be ( seq_len * batch_size * input_size)" + "seq_len is the total time step in this mini-batch (CAN be change in " + "different batch)" + "batch_size is the instance number of this batch" + "input_size is the hidden size of the input." + "input_hidden_size and the hidden_size in the next may not be same"); + AddInput("InitH", + "(Tensor) the initial hidden state of the LSTM" + "input. This is a tensor with shape (num_layers x batch_size x " + "hidden_size)" + "and When is_bidirec is True, the shape will be (num_layers*2 x " + "batch_size x hidden_size)"); + AddInput("InitC", + "(Tensor) the initial cell state of the LSTm " + "input. This is a tensor with shape (num_layers x batch_size x " + "hidden_size)" + "and When is_bidirec is True, the shape will be (num_layers*2 x " + "batch_size x hidden_size)"); + AddInput("W", + "(Tensor) the learnable hidden-hidden weights." + " The shape is (N), where N is total weight size of the LSTM. " + " cudnn concatenate all the weight to one Tensor"); + AddInput("Cache", + "The cache of dropout op, a RAW type variable including random " + "number generator states and some descriptors, which is used in " + "cudnn kernel.") + .AsDispensable(); + AddOutput("Out", + "(Tensor) the hidden state of LSTM operator. " + "The shape is ( seq_len x batch_size x hidden_size) if " + "is_bidirec is False" + "and When is_bidirec is True, the shape will be ( seq_len x " + "batch_size x hidden_size * 2) "); + AddOutput("last_h", + "(Tensor) the hidden state of the last step. " + "The shape is ( num_layers x batch_size x hidden_size) if " + "is_bidirec is False" + "and When is_bidirec is True, the shape will be (num_layers*2 x " + "batch_size x hidden_size)"); + AddOutput("last_c", + "(Tensor) the cell state of the last step" + "The shape is ( num_layers x batch_size x hidden_size) if " + "is_bidirec is False" + "and When is_bidirect is True, the shape will be (num_layers*2 x " + "batch_size x hidden_size*2)"); + AddAttr("max_len", + "max length of the LSTM op" + "the first dim of the Input can NOT be greater than max_len") + .SetDefault(20); + AddAttr( + "dropout_prob", + "dropout prob of the dropout op" + "the dropout ONLY work between lstm layers, not between time steps" + "There is no dropout work on the Out tensor") + .SetDefault(0.0); + AddAttr("is_bidirec", + "is_bidirec" + "if it is bidirection rnn" + "The will affect the shape of the Out, last_h, and last_c") + .SetDefault(false); + AddAttr("input_size", "input size ot the Input Tensor").SetDefault(10); + AddAttr("hidden_size", "hidden size of the LSTM").SetDefault(100); + AddAttr("num_layers", "the total layer number of the LSTM") + .SetDefault(1); + AddAttr("is_test", "True if in test phase.").SetDefault(false); + AddAttr("seed", "seed to used if fix_seed is True").SetDefault(-1); + AddComment(R"DOC( +CUDNN LSTM implementation + +A four-gate Long Short-Term Memory network with no peephole connections. +In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1, +the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations: + +$$ i_t = sigmoid(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i) $$ + +$$ f_t = sigmoid(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f) $$ + +$$ o_t = sigmoid(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o) $$ + +$$ \\tilde{c_t} = tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c) $$ + +$$ c_t = f_t \\odot c_{t-1} + i_t \\odot \\tilde{c_t} $$ + +$$ h_t = o_t \\odot tanh(c_t) $$ + +- W terms denote weight matrices (e.g. $W_{ix}$ is the matrix + of weights from the input gate to the input) +- The b terms denote bias vectors ($bx_i$ and $bh_i$ are the input gate bias vector). +- sigmoid is the logistic sigmoid function. +- $i, f, o$ and $c$ are the input gate, forget gate, output gate, + and cell activation vectors, respectively, all of which have the same size as + the cell output activation vector $h$. +- The $\odot$ is the element-wise product of the vectors. +- `tanh` is the activation functions. +- $\tilde{c_t}$ is also called candidate hidden state, + which is computed based on the current input and the previous hidden state. + +Where sigmoid is the sigmoid operator: sigmoid(x) = 1 / (1 + e^-x), * represents a point-wise multiplication, +X represensts a matrix multiplication + + +)DOC"); + } +}; + +class CudnnLSTMGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Cache"), + "Input(last_c) of LSTM should not be null."); + PADDLE_ENFORCE(ctx->HasInput("InitH"), + "Input(init_h) of LSTM should not be null."); + + PADDLE_ENFORCE(ctx->HasInput("InitC"), + "Input(init_c) of LSTM should not be null."); + + auto SetOutGradDim = [&ctx](const std::string& name) { + auto g_name = framework::GradVarName(name); + if (ctx->HasOutput(g_name)) { + ctx->SetOutputDim(g_name, ctx->GetInputDim(name)); + } + }; + + SetOutGradDim("Input"); + SetOutGradDim("W"); + SetOutGradDim("InitH"); + SetOutGradDim("InitC"); + } +}; + +class CudnnLSTMGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("cudnn_lstm_grad"); + op->SetInput("Input", Input("Input")); + op->SetInput("InitH", Input("InitH")); + op->SetInput("InitC", Input("InitC")); + op->SetInput("W", Input("W")); + if (ForwardOp().Inputs().count("Cache") > 0) { + op->SetInput("Cache", Input("Cache")); + } + op->SetInput("Out", Output("Out")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetInput(framework::GradVarName("last_c"), OutputGrad("last_c")); + op->SetInput(framework::GradVarName("last_h"), OutputGrad("last_h")); + + op->SetOutput(framework::GradVarName("Input"), InputGrad("Input")); + op->SetOutput(framework::GradVarName("W"), InputGrad("W")); + op->SetOutput(framework::GradVarName("InitH"), InputGrad("InitH")); + op->SetOutput(framework::GradVarName("InitC"), InputGrad("InitC")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +template +class NotImpleKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW( + "CPU is not support for this kernel now. Will be add in the future"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(cudnn_lstm, ops::CudnnLSTMOp, ops::CudnnLSTMOpMaker, + ops::CudnnLSTMGradOpDescMaker); +REGISTER_OPERATOR(cudnn_lstm_grad, ops::CudnnLSTMGradOp); + +REGISTER_OP_CPU_KERNEL(cudnn_lstm, ops::NotImpleKernel); +REGISTER_OP_CPU_KERNEL(cudnn_lstm_grad, ops::NotImpleKernel); diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc new file mode 100644 index 00000000..1bf41ed9 --- /dev/null +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -0,0 +1,261 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/cudnn_rnn_cache.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +template +class CudnnLSTMGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const Tensor *x = ctx.Input("Input"); + const Tensor *init_h = ctx.Input("InitH"); + const Tensor *init_c = ctx.Input("InitC"); + + auto w = ctx.Input("W"); + + Tensor *out = ctx.Output("Out"); + Tensor *last_h = ctx.Output("last_h"); + Tensor *last_c = ctx.Output("last_c"); + + const T *x_data = x->data(); + const T *init_h_data = init_h->data(); + const T *init_c_data = init_c->data(); + + const T *w_data = w->data(); + + T *out_data = out->mutable_data(ctx.GetPlace()); + T *last_h_data = last_h->mutable_data(ctx.GetPlace()); + T *last_c_data = last_c->mutable_data(ctx.GetPlace()); + + size_t max_len = ctx.Attr("max_len"); + float dropout_prob = ctx.Attr("dropout_prob"); + bool is_bidirec = ctx.Attr("is_bidirec"); + int input_size = ctx.Attr("input_size"); + int hidden_size = ctx.Attr("hidden_size"); + int num_layers = ctx.Attr("num_layers"); + bool is_test = ctx.Attr("is_test"); + + auto &dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); + auto *cache_var = ctx.InputVar("Cache"); + if (!cache_var) { + // The RAW type cache variable wouldn't be created and broadcasted on + // multi-devices before the first running. + // use parent scope to make cache persistable + auto *scope = const_cast(ctx.scope().parent()); + auto cache_var_name = ctx.Inputs("Cache")[0]; + cache_var = scope->Var(cache_var_name); + } + CudnnRNNCache *cudnn_rnn_cache = nullptr; + if (cache_var->IsInitialized()) { + // const_cast is usually bad. + cudnn_rnn_cache = const_cast(cache_var) + ->GetMutable(); + } else { + // const_cast is usually bad. + cudnn_rnn_cache = const_cast(cache_var) + ->GetMutable(); + std::random_device rnd; + int seed = ctx.Attr("seed"); + if (seed == -1) { + seed = rnd(); + } + + auto input_w_numel = w->numel(); + auto batch_size = x->dims()[1]; + cudnn_rnn_cache->init(handle, ctx.GetPlace(), max_len, batch_size, + input_size, hidden_size, num_layers, dropout_prob, + is_bidirec, seed, input_w_numel); + } + + auto run_seq_len = x->dims()[0]; + + if (is_test) { + // for inference + CUDNN_ENFORCE(platform::dynload::cudnnRNNForwardInference( + handle, cudnn_rnn_cache->rnn_desc_, run_seq_len, + cudnn_rnn_cache->x_desc_, x_data, cudnn_rnn_cache->hx_desc_, + init_h_data, cudnn_rnn_cache->cx_desc_, init_c_data, + cudnn_rnn_cache->w_desc_, w_data, cudnn_rnn_cache->y_desc_, out_data, + cudnn_rnn_cache->hy_desc_, last_h_data, cudnn_rnn_cache->cy_desc_, + last_c_data, cudnn_rnn_cache->workspace_data_.data(), + cudnn_rnn_cache->workspace_size_)); + } else { + // for train + CUDNN_ENFORCE(platform::dynload::cudnnRNNForwardTraining( + handle, cudnn_rnn_cache->rnn_desc_, run_seq_len, + cudnn_rnn_cache->x_desc_, x_data, cudnn_rnn_cache->hx_desc_, + init_h_data, cudnn_rnn_cache->cx_desc_, init_c_data, + cudnn_rnn_cache->w_desc_, w_data, cudnn_rnn_cache->y_desc_, out_data, + cudnn_rnn_cache->hy_desc_, last_h_data, cudnn_rnn_cache->cy_desc_, + last_c_data, cudnn_rnn_cache->workspace_data_.data(), + cudnn_rnn_cache->workspace_size_, + cudnn_rnn_cache->reserve_data_.data(), + cudnn_rnn_cache->reserve_size_)); + } + } +}; + +template +class CudnnLSTMGPUGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *input = ctx.Input("Input"); + auto *weight = ctx.Input("W"); + auto *init_h = ctx.Input("InitH"); + auto *init_c = ctx.Input("InitC"); + // auto * last_h = ctx.Input("last_h"); + // auto * last_c = ctx.Input("last_c"); + auto *out = ctx.Input("Out"); + auto *out_grad = ctx.Input(framework::GradVarName("Out")); + auto *last_h_grad = ctx.Input(framework::GradVarName("last_h")); + auto *last_c_grad = ctx.Input(framework::GradVarName("last_c")); + + // auto* init_h = ctx.Input("init_h"); + // auto* init_c = ctx.Input("init_c"); + + auto *in_grad = ctx.Output(framework::GradVarName("Input")); + auto *weight_grad = ctx.Output(framework::GradVarName("W")); + auto *init_h_grad = ctx.Output(framework::GradVarName("InitH")); + auto *init_c_grad = ctx.Output(framework::GradVarName("InitC")); + + auto &dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); + auto *cache_var = ctx.InputVar("Cache"); + PADDLE_ENFORCE(cache_var->IsInitialized()); + CudnnRNNCache *cudnn_rnn_cache = + const_cast(cache_var) + ->GetMutable(); + + auto input_dims = input->dims(); + auto init_h_dims = init_h->dims(); + auto init_c_dims = init_c->dims(); + in_grad->mutable_data(ctx.GetPlace()); + weight_grad->mutable_data(ctx.GetPlace()); + math::SetConstant zero; + zero(dev_ctx, in_grad, static_cast(0.0)); + zero(dev_ctx, weight_grad, static_cast(0.0)); + + T *init_h_grad_data = NULL; + if (init_h_grad == nullptr) { + Tensor init_h_grad_temp; + init_h_grad_temp.mutable_data(init_h_dims, ctx.GetPlace()); + zero(dev_ctx, &init_h_grad_temp, static_cast(0.0)); + + init_h_grad_data = init_h_grad_temp.data(); + } else { + init_h_grad->mutable_data(init_h_dims, ctx.GetPlace()); + zero(dev_ctx, init_h_grad, static_cast(0.0)); + init_h_grad_data = init_h_grad->data(); + } + + T *init_c_grad_data = NULL; + if (init_c_grad == nullptr) { + Tensor init_c_grad_temp; + init_c_grad_temp.mutable_data(init_c_dims, ctx.GetPlace()); + zero(dev_ctx, &init_c_grad_temp, static_cast(0.0)); + + init_c_grad_data = init_c_grad_temp.data(); + } else { + init_c_grad->mutable_data(init_c_dims, ctx.GetPlace()); + zero(dev_ctx, init_c_grad, static_cast(0.0)); + init_c_grad_data = init_c_grad->data(); + } + + const T *last_h_grad_data = NULL; + if (last_h_grad == nullptr) { + Tensor last_h_grad_temp; + last_h_grad_temp.mutable_data(init_h_dims, ctx.GetPlace()); + zero(dev_ctx, &last_h_grad_temp, static_cast(0.0)); + + last_h_grad_data = (const T *)last_h_grad_temp.data(); + } else { + last_h_grad_data = last_h_grad->data(); + } + + const T *last_c_grad_data = NULL; + if (last_c_grad == nullptr) { + Tensor last_c_grad_temp; + last_c_grad_temp.mutable_data(init_c_dims, ctx.GetPlace()); + zero(dev_ctx, &last_c_grad_temp, static_cast(0.0)); + + last_c_grad_data = (const T *)last_c_grad_temp.data(); + } else { + last_c_grad_data = last_c_grad->data(); + } + + const T *out_grad_data = NULL; + if (out_grad == nullptr) { + Tensor out_grad_temp; + out_grad_temp.mutable_data(out->dims(), ctx.GetPlace()); + zero(dev_ctx, &out_grad_temp, static_cast(0.0)); + + out_grad_data = (const T *)out_grad_temp.data(); + } else { + out_grad_data = out_grad->data(); + } + + // zero( dev_ctx, last_h_grad, static_cast(0.0)); + // zero( dev_ctx, last_c_grad, static_cast(0.0)); + + auto out_data = out->data(); + // auto out_grad_data = out_grad->data(); + auto weight_data = weight->data(); + auto init_h_data = init_h->data(); + auto init_c_data = init_c->data(); + auto in_grad_data = in_grad->data(); + + auto work_data = cudnn_rnn_cache->workspace_data_.data(); + auto reserve_data = cudnn_rnn_cache->reserve_data_.data(); + + auto run_seq_len = input_dims[0]; + PADDLE_ENFORCE_LE((size_t)run_seq_len, cudnn_rnn_cache->max_length_, + "cudnn running seq_len CAN not greater max_lengh"); + CUDNN_ENFORCE(platform::dynload::cudnnRNNBackwardData( + handle, cudnn_rnn_cache->rnn_desc_, run_seq_len, + cudnn_rnn_cache->y_desc_, out_data, cudnn_rnn_cache->dy_desc_, + out_grad_data, cudnn_rnn_cache->dhy_desc_, last_h_grad_data, + cudnn_rnn_cache->dcy_desc_, last_c_grad_data, cudnn_rnn_cache->w_desc_, + weight_data, cudnn_rnn_cache->hx_desc_, init_h_data, + cudnn_rnn_cache->cx_desc_, init_c_data, cudnn_rnn_cache->dx_desc_, + in_grad_data, cudnn_rnn_cache->dhx_desc_, init_h_grad_data, + cudnn_rnn_cache->dcx_desc_, init_c_grad_data, work_data, + cudnn_rnn_cache->workspace_size_, reserve_data, + cudnn_rnn_cache->reserve_size_)); + + CUDNN_ENFORCE(platform::dynload::cudnnRNNBackwardWeights( + handle, cudnn_rnn_cache->rnn_desc_, run_seq_len, + cudnn_rnn_cache->x_desc_, input->data(), cudnn_rnn_cache->hx_desc_, + init_h->data(), cudnn_rnn_cache->y_desc_, out->data(), + cudnn_rnn_cache->workspace_data_.data(), + cudnn_rnn_cache->workspace_size_, cudnn_rnn_cache->dw_desc_, + weight_grad->data(), cudnn_rnn_cache->reserve_data_.data(), + cudnn_rnn_cache->reserve_size_)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(cudnn_lstm, ops::CudnnLSTMGPUKernel); +REGISTER_OP_CUDA_KERNEL(cudnn_lstm_grad, ops::CudnnLSTMGPUGradKernel); diff --git a/paddle/fluid/operators/cudnn_rnn_cache.h b/paddle/fluid/operators/cudnn_rnn_cache.h new file mode 100644 index 00000000..7f18b839 --- /dev/null +++ b/paddle/fluid/operators/cudnn_rnn_cache.h @@ -0,0 +1,255 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +struct CudnnRNNCache { + CudnnRNNCache() { + x_desc_ = NULL; + y_desc_ = NULL; + dx_desc_ = NULL; + dy_desc_ = NULL; + } + ~CudnnRNNCache() { release(); } + + cudnnRNNDescriptor_t rnn_desc_; + cudnnTensorDescriptor_t *x_desc_; + cudnnTensorDescriptor_t *y_desc_; + cudnnTensorDescriptor_t *dx_desc_; + cudnnTensorDescriptor_t *dy_desc_; + + cudnnTensorDescriptor_t hx_desc_; + cudnnTensorDescriptor_t cx_desc_; + cudnnTensorDescriptor_t hy_desc_; + cudnnTensorDescriptor_t cy_desc_; + + cudnnTensorDescriptor_t dhx_desc_; + cudnnTensorDescriptor_t dcx_desc_; + cudnnTensorDescriptor_t dhy_desc_; + cudnnTensorDescriptor_t dcy_desc_; + + cudnnTensorDescriptor_t output_x_desc_; + cudnnTensorDescriptor_t output_y_desc_; + + cudnnDropoutDescriptor_t dropout_desc_; + + size_t weights_size_; + cudnnFilterDescriptor_t w_desc_; + cudnnFilterDescriptor_t dw_desc_; + + size_t workspace_size_; + size_t reserve_size_; + framework::Tensor reserve_data_; + framework::Tensor workspace_data_; + + framework::Tensor dropout_state_; + + size_t max_length_; + + float dropout_prob_; + bool is_bidirec_; + + int batch_size_; + int input_size_; + int hidden_size_; + int num_layers_; + int seed_; + + void init(cudnnHandle_t handle, const platform::Place &place, size_t max_len, + int batch_size, int input_size, int hidden_size, int num_layers, + float dropout_prob, bool is_bidirec, int seed, int weight_numel) { + max_length_ = max_len; + batch_size_ = batch_size; + input_size_ = input_size; + hidden_size_ = hidden_size; + num_layers_ = num_layers; + dropout_prob_ = dropout_prob; + is_bidirec_ = is_bidirec; + seed_ = seed; + + x_desc_ = new cudnnTensorDescriptor_t[max_length_]; + y_desc_ = new cudnnTensorDescriptor_t[max_length_]; + dx_desc_ = new cudnnTensorDescriptor_t[max_length_]; + dy_desc_ = new cudnnTensorDescriptor_t[max_length_]; + int dim_a[3]; + int stride_a[3]; + + for (size_t i = 0; i < max_length_; ++i) { + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&x_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&y_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&dx_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&dy_desc_[i])); + dim_a[0] = batch_size_; + dim_a[1] = input_size_; + dim_a[2] = 1; + + stride_a[0] = dim_a[2] * dim_a[1]; + stride_a[1] = dim_a[2]; + stride_a[2] = 1; + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + x_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dx_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + + dim_a[0] = batch_size_; + dim_a[1] = is_bidirec_ ? hidden_size_ * 2 : hidden_size_; + dim_a[2] = 1; + + stride_a[0] = dim_a[2] * dim_a[1]; + stride_a[1] = dim_a[2]; + stride_a[2] = 1; + + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + y_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dy_desc_[i], CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + } + + dim_a[0] = num_layers_ * (is_bidirec_ ? 2 : 1); + dim_a[1] = batch_size_; + dim_a[2] = hidden_size_; + + stride_a[0] = dim_a[2] * dim_a[1]; + stride_a[1] = dim_a[2]; + stride_a[2] = 1; + + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&hy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&cy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dhy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&dcy_desc_)); + + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + hx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + cx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + hy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + cy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dhx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dcx_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dhy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + dcy_desc_, CUDNN_DATA_FLOAT, 3, dim_a, stride_a)); + + CUDNN_ENFORCE( + platform::dynload::cudnnCreateDropoutDescriptor(&dropout_desc_)); + + size_t state_size; + CUDNN_ENFORCE( + platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size); + dropout_state_.Resize({static_cast(state_size)})); + auto *dropout_state_data = dropout_state_.mutable_data(place); + CUDNN_ENFORCE(platform::dynload::cudnnSetDropoutDescriptor( + dropout_desc_, handle, dropout_prob_, dropout_state_data, state_size, + seed_)); + + CUDNN_ENFORCE(platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_)); + +#if CUDNN_VERSION >= 6000 + CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6( + handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, + CUDNN_LINEAR_INPUT, + is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, + CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT)); +#else + CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor( + rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, + is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, + CUDNN_DATA_FLOAT)); +#endif + + CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&w_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_)); + + CUDNN_ENFORCE(platform::dynload::cudnnGetRNNParamsSize( + handle, rnn_desc_, x_desc_[0], &weights_size_, CUDNN_DATA_FLOAT)); + + PADDLE_ENFORCE_EQ(weights_size_, sizeof(float) * weight_numel, + "cudnn lstm weight size should be SAME"); + int dim_w[3]; + dim_w[0] = weights_size_ / sizeof(float); + dim_w[1] = 1; + dim_w[2] = 1; + CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( + w_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); + CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( + dw_desc_, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, 3, dim_w)); + + CUDNN_ENFORCE(platform::dynload::cudnnGetRNNWorkspaceSize( + handle, rnn_desc_, max_length_, x_desc_, &workspace_size_)); + CUDNN_ENFORCE(platform::dynload::cudnnGetRNNTrainingReserveSize( + handle, rnn_desc_, max_length_, x_desc_, &reserve_size_)); + + reserve_data_.Resize({static_cast(reserve_size_)}); + reserve_data_.mutable_data(place); + + workspace_data_.Resize({static_cast(workspace_size_)}); + workspace_data_.mutable_data(place); + } + + void release() { + for (size_t i = 0; i < max_length_; ++i) { + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(x_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(y_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(dx_desc_[i])); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(dy_desc_[i])); + } + + delete[] x_desc_; + delete[] y_desc_; + delete[] dx_desc_; + delete[] dy_desc_; + + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(hy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(cy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcx_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dhy_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(dcy_desc_)); + + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyDropoutDescriptor(dropout_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyRNNDescriptor(rnn_desc_)); + + CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(w_desc_)); + CUDNN_ENFORCE(platform::dynload::cudnnDestroyFilterDescriptor(dw_desc_)); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/cum_op.h b/paddle/fluid/operators/cum_op.h new file mode 100644 index 00000000..7c0fda41 --- /dev/null +++ b/paddle/fluid/operators/cum_op.h @@ -0,0 +1,113 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/detail/safe_ref.h" + +namespace paddle { +namespace operators { + +template +class CumKernel : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + + void Compute(const framework::ExecutionContext& context) const override { + auto& X = detail::Ref(context.Input("X"), + "Cannot get input tensor X, variable name = %s", + context.op().Input("X")); + + auto& Out = detail::Ref(context.Output("Out"), + "Cannot get output tensor Out, variable name = %s", + context.op().Output("Out")); + int axis = context.Attr("axis"); + bool exclusive = context.Attr("exclusive"); + bool reverse = context.Attr("reverse"); + auto x_dims = X.dims(); + if (axis == -1) { + axis = x_dims.size() - 1; + } + PADDLE_ENFORCE_LT( + axis, x_dims.size(), + "axis should be less than the dimensiotn of the input tensor"); + Out.mutable_data(context.GetPlace()); + + int pre = 1; + int post = 1; + int mid = x_dims[axis]; + for (int i = 0; i < axis; ++i) { + pre *= x_dims[i]; + } + for (int i = axis + 1; i < x_dims.size(); ++i) { + post *= x_dims[i]; + } + + auto x = framework::EigenVector::Flatten(X); + auto out = framework::EigenVector::Flatten(Out); + auto* place = + context.template device_context().eigen_device(); + + using IndexT = Eigen::DenseIndex; + if (pre == 1) { + if (post == 1) { + ComputeImp(*place, Eigen::DSizes(mid), x, out, + /* axis= */ 0, reverse, exclusive); + } else { + ComputeImp(*place, Eigen::DSizes(mid, post), x, out, + /* axis= */ 0, reverse, exclusive); + } + } else { + if (post == 1) { + ComputeImp(*place, Eigen::DSizes(pre, mid), x, out, + /* axis= */ 1, reverse, exclusive); + } else { + ComputeImp(*place, Eigen::DSizes(pre, mid, post), x, out, + /* axis= */ 1, reverse, exclusive); + } + } + } + + private: + template + void ComputeImp(Device d, const Dim& dims, X x, Out out, int axis, + bool reverse, bool exclusive) const { + if (!reverse) { + out.reshape(dims).device(d) = Functor()(x.reshape(dims), axis, exclusive); + } else { + std::array rev; + rev.fill(false); + rev[axis] = reverse; + out.reshape(dims).device(d) = + Functor()(x.reshape(dims).reverse(rev), axis, exclusive).reverse(rev); + } + } +}; + +template +struct CumsumFunctor { + using ELEMENT_TYPE = T; + template + const typename X::TensorScanSumOp operator()(X x, int axis, + bool exclusive) const { + return x.cumsum(axis, exclusive); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/cumsum_op.cc b/paddle/fluid/operators/cumsum_op.cc new file mode 100644 index 00000000..5302b822 --- /dev/null +++ b/paddle/fluid/operators/cumsum_op.cc @@ -0,0 +1,81 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/cum_op.h" + +namespace paddle { +namespace operators { + +class CumOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class CumsumOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Input of cumsum operator"); + AddOutput("Out", "Output of cumsum operator"); + AddAttr("axis", + "The dimenstion to accumulate along. -1 means the last " + "dimenstion [default -1].") + .SetDefault(-1) + .EqualGreaterThan(-1); + AddAttr("exclusive", + "Whether to perform exclusive cumsum. [default false].") + .SetDefault(false); + AddAttr("reverse", + "If true, the cumsum is performed in the reversed direction. " + "[default false].") + .SetDefault(false); + AddComment(R"DOC( +The cumulative sum of the elements along a given axis. +By default, the first element of the result is the same of the first element of +the input. If exlusive is true, the first element of the result is 0. +)DOC"); + } +}; + +class CumsumGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *grad_op = new framework::OpDesc(); + grad_op->SetType("cumsum"); + grad_op->SetInput("X", OutputGrad("Out")); + grad_op->SetOutput("Out", InputGrad("X")); + grad_op->SetAttr("axis", Attr("axis")); + grad_op->SetAttr("reverse", !Attr("reverse")); + grad_op->SetAttr("exclusive", Attr("exclusive")); + return std::unique_ptr(grad_op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CPU = paddle::platform::CPUDeviceContext; + +REGISTER_OPERATOR(cumsum, ops::CumOp, ops::CumsumOpMaker, ops::CumsumGradMaker); +REGISTER_OP_CPU_KERNEL(cumsum, ops::CumKernel>, + ops::CumKernel>, + ops::CumKernel>); diff --git a/paddle/fluid/operators/cumsum_op.cu b/paddle/fluid/operators/cumsum_op.cu new file mode 100644 index 00000000..eb5fd99c --- /dev/null +++ b/paddle/fluid/operators/cumsum_op.cu @@ -0,0 +1,22 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/cum_op.h" + +namespace ops = paddle::operators; +using CUDA = paddle::platform::CUDADeviceContext; + +REGISTER_OP_CUDA_KERNEL(cumsum, ops::CumKernel>, + ops::CumKernel>, + ops::CumKernel>); diff --git a/paddle/fluid/operators/cvm_op.cc b/paddle/fluid/operators/cvm_op.cc new file mode 100644 index 00000000..53ed86ad --- /dev/null +++ b/paddle/fluid/operators/cvm_op.cc @@ -0,0 +1,154 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/cvm_op.h" +#include +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +class CVMOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("CVM"), "Input(CVM) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null."); + + auto x_dims = ctx->GetInputDim("X"); + auto cvm_dims = ctx->GetInputDim("CVM"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "Input(X)'s rank should be 2."); + PADDLE_ENFORCE_EQ(cvm_dims.size(), 2UL, "Input(CVM)'s rank should be 2."); + PADDLE_ENFORCE_EQ(cvm_dims[1], 2UL, + "The 2nd dimension of " + "Input(CVM) should be 2."); + + if (ctx->Attrs().Get("use_cvm")) { + ctx->SetOutputDim("Y", {x_dims[0], x_dims[1]}); + } else { + ctx->SetOutputDim("Y", {x_dims[0], x_dims[1] - 2}); + } + ctx->ShareLoD("X", /*->*/ "Y"); + } + + protected: + // Explicitly set that the data type of computation kernel of + // cvm + // is determined by its input "X". + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + platform::CPUPlace()); + } +}; + +class CVMGradientOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("CVM"), "Input(CVM) should be not null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), + "Input(Y@GRAD) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@GRAD) should be not null."); + + auto x_dims = ctx->GetInputDim("X"); + auto cvm_dims = ctx->GetInputDim("CVM"); + auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y")); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); + PADDLE_ENFORCE_EQ(dy_dims.size(), 2, "Input(Y@Grad)'s rank should be 2."); + PADDLE_ENFORCE_EQ(cvm_dims.size(), 2, "Input(CVM)'s rank should be 2."); + + PADDLE_ENFORCE_EQ(x_dims[0], dy_dims[0], + "The 1st dimension of Input(X) and Input(Y@Grad) should " + "be equal."); + + PADDLE_ENFORCE_EQ(cvm_dims[1], 2, + "When Attr(soft_label) == false, the 2nd dimension of " + "Input(CVM) should be 2."); + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + ctx->ShareLoD("X", framework::GradVarName("X")); + } + + protected: + // Explicitly set that the data type of computation kernel of + // cvm + // is determined by its input "X". + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + platform::CPUPlace()); + } +}; + +class CVMOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(LodTensor, default LodTensor), a 2-D tensor with shape " + "[N x D]," + " where N is the batch size and D is the emebdding dim. "); + AddInput("CVM", + "(Tensor), a 2-D Tensor with shape [N x 2], where N is the batch " + "size, 2 is show and click."); + AddOutput("Y", + "(LodTensor, default LodTensor), a 2-D tensor with shape " + "[N x K]."); + AddAttr("use_cvm", "bool, use cvm or not").SetDefault(true); + AddComment(R"DOC( +CVM Operator. + + We assume that input X is a embedding vector with cvm_feature(show and click), which shape is [N * D] (D is 2(cvm_feature) + embedding dim, N is batch_size) + if use_cvm is True, we will log(cvm_feature), and output shape is [N * D]. + if use_cvm is False, we will remove cvm_feature from input, and output shape is [N * (D - 2)]. + +)DOC"); + } +}; + +class CVMGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("cvm_grad"); + op->SetInput("X", Input("X")); + op->SetInput("CVM", Input("CVM")); + op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(cvm, ops::CVMOp, ops::CVMOpMaker, ops::CVMGradOpDescMaker); + +REGISTER_OPERATOR(cvm_grad, ops::CVMGradientOp); + +REGISTER_OP_CPU_KERNEL(cvm, ops::CVMOpKernel, ops::CVMOpKernel); + +REGISTER_OP_CPU_KERNEL(cvm_grad, ops::CVMGradOpKernel, + ops::CVMGradOpKernel); diff --git a/paddle/fluid/operators/cvm_op.h b/paddle/fluid/operators/cvm_op.h new file mode 100644 index 00000000..c6140483 --- /dev/null +++ b/paddle/fluid/operators/cvm_op.h @@ -0,0 +1,126 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +void CvmComputeKernel(const bool use_cvm, const int64_t item_width, const T** X, + T** Y) { + const auto cvm_offset = use_cvm ? 0 : 2; + + std::memcpy(*Y, *X + cvm_offset, (item_width - cvm_offset) * sizeof(T)); + + if (use_cvm) { + (*Y)[0] = log((*Y)[0] + 1); + (*Y)[1] = log((*Y)[1] + 1) - (*Y)[0]; + } + + (*X) += item_width; + (*Y) += item_width - cvm_offset; +} + +template +void CvmGradComputeKernel(const bool use_cvm, const int64_t item_width, + const T& CVM, const T** DY, T** DX) { + const auto cvm_offset = use_cvm ? 0 : 2; + + std::memcpy(*DX + cvm_offset, *DY, (item_width - cvm_offset) * sizeof(T)); + + (*DX)[0] = (&CVM)[0]; + (*DX)[1] = (&CVM)[1]; + + (*DX) += item_width; + (*DY) += item_width - cvm_offset; +} + +template +class CVMOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const auto* x = context.Input("X"); + const T* x_data = x->data(); + + auto batch_size = x->dims()[0]; + auto item_size = x->numel() / batch_size; + auto use_cvm = context.Attr("use_cvm"); + + auto* y = context.Output("Y"); + T* y_data = y->mutable_data(context.GetPlace()); + + // for Input X do not have Lod Information. + if (x->NumLevels() == 0) { + for (int i = 0; i < batch_size; i++) { + CvmComputeKernel(use_cvm, item_size, &x_data, &y_data); + } + } else { + auto lod = x->lod()[0]; + for (size_t i = 0; i < lod.size() - 1; ++i) { + for (size_t j = 0; j < lod[i + 1] - lod[i]; ++j) { + CvmComputeKernel(use_cvm, item_size, &x_data, &y_data); + } + } + } + } +}; + +template +class CVMGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* dx = context.Output(framework::GradVarName("X")); + T* dx_data = dx->mutable_data(context.GetPlace()); + + const Tensor* cvm = context.Input("CVM"); + const T* cvm_data = cvm->data(); + + const auto* dOut = + context.Input(framework::GradVarName("Y")); + const T* dout_data = dOut->data(); + + auto use_cvm = context.Attr("use_cvm"); + + auto offset = 2; + auto batch_size = dx->dims()[0]; + auto item_size = dx->numel() / batch_size; + + // for Input X do not have Lod Information. + if (dx->NumLevels() == 0) { + for (int x = 0; x < batch_size; ++x) { + CvmGradComputeKernel(use_cvm, item_size, *cvm_data, &dout_data, + &dx_data); + cvm_data += offset; + } + } else { + auto lod = dx->lod()[0]; + int seq_num = static_cast(lod.size()) - 1; + for (int i = 0; i < seq_num; ++i) { + for (size_t j = 0; j < lod[i + 1] - lod[i]; ++j) { + CvmGradComputeKernel(use_cvm, item_size, *cvm_data, &dout_data, + &dx_data); + } + cvm_data += offset; + } + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/data_norm_op.cc b/paddle/fluid/operators/data_norm_op.cc new file mode 100644 index 00000000..a5c76db6 --- /dev/null +++ b/paddle/fluid/operators/data_norm_op.cc @@ -0,0 +1,409 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/data_norm_op.h" +#include +#include +#include "paddle/fluid/framework/data_layout.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using DataLayout = framework::DataLayout; + +template +using EigenArrayMap = + Eigen::Map>; +template +using ConstEigenArrayMap = + Eigen::Map>; +template +using EigenVectorArrayMap = Eigen::Map>; +template +using ConstEigenVectorArrayMap = + Eigen::Map>; + +class DataNormOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), ""); + PADDLE_ENFORCE(ctx->HasInput("BatchSize"), ""); + PADDLE_ENFORCE(ctx->HasInput("BatchSum"), ""); + PADDLE_ENFORCE(ctx->HasInput("BatchSquareSum"), ""); + PADDLE_ENFORCE(ctx->HasOutput("Means"), ""); + PADDLE_ENFORCE(ctx->HasOutput("Scales"), ""); + PADDLE_ENFORCE(ctx->HasOutput("Y"), ""); + + const auto x_dims = ctx->GetInputDim("X"); + const DataLayout data_layout = framework::StringToDataLayout( + ctx->Attrs().Get("data_layout")); + + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "Input X must have 2 to 5 dimensions."); + + const int64_t C = + (data_layout == DataLayout::kNCHW ? x_dims[1] + : x_dims[x_dims.size() - 1]); + + PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSize").size(), 1UL); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSum").size(), 1UL); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSquareSum").size(), 1UL); + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSize")[0], C); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSum")[0], C); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSquareSum")[0], C); + } + + ctx->SetOutputDim("Y", x_dims); + ctx->SetOutputDim("Means", {C}); + ctx->SetOutputDim("Scales", {C}); + ctx->ShareLoD("X", "Y"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input_data_type = ctx.Input("X")->type(); + // By default, the type of the scale, bias, mean, + // and var tensors should both be float. (For float or float16 input tensor) + // or double (For double input tensor). + auto dn_param_type = framework::proto::VarType::FP32; + if (input_data_type == framework::proto::VarType::FP64) { + dn_param_type = framework::proto::VarType::FP64; + } + PADDLE_ENFORCE_EQ(dn_param_type, ctx.Input("BatchSize")->type(), + "BatchSize input should be of float type"); + PADDLE_ENFORCE_EQ(dn_param_type, ctx.Input("BatchSum")->type(), + "BatchSum input should be of float type"); + PADDLE_ENFORCE_EQ(dn_param_type, + ctx.Input("BatchSquareSum")->type(), + "BatchSquareSum input should be of float type"); + + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; +#ifdef PADDLE_WITH_MKLDNN + if (library == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; + } +#endif + + return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, + library); + } +}; + +class DataNormOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + // AddAttr("is_test", "").SetDefault(false); + AddAttr("epsilon", "") + .SetDefault(1e-4) + .AddCustomChecker([](const float &epsilon) { + PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f, + "'epsilon' should be between 0.0 and 0.001."); + }); + AddAttr("data_layout", "").SetDefault("NCHW"); + AddInput("X", "The input tensor"); + AddInput("BatchSize", + "BatchSize is a 1-dimensional tensor of size C " + "that is applied to the output"); + AddInput("BatchSum", + "BatchSum is a 1-dimensional tensor of size C " + "that is applied to the output"); + AddInput("BatchSquareSum", + "The global BatchSquareSum (for training) or " + "estimated BatchSquareSum (for testing)"); + AddOutput("Y", "result after normalization"); + AddOutput("Means", + "Mean of the history data batch, " + "will apply to output when training") + .AsIntermediate(); + AddOutput("Scales", + "Scales of the history data batch, " + "will apply to output when training") + .AsIntermediate(); + AddComment(R"DOC( +Data Normalization. + +Can be used as a normalizer function for data +The required data format for this layer is one of the following: +1. NHWC `[batch, in_height, in_width, in_channels]` +2. NCHW `[batch, in_channels, in_height, in_width]` + +)DOC"); + } +}; + +template +class DataNormKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + // const bool is_test = ctx.Attr("is_test"); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + + const auto *x = ctx.Input("X"); + const auto &x_dims = x->dims(); + PADDLE_ENFORCE(x_dims.size() == 2, "The Input dim size should be 2"); + const int N = x_dims[0]; + const int C = + (data_layout == DataLayout::kNCHW ? x_dims[1] + : x_dims[x_dims.size() - 1]); + auto *y = ctx.Output("Y"); + auto *mean_out = ctx.Output("Means"); + auto *scales = ctx.Output("Scales"); + + // alloc memory + y->mutable_data(ctx.GetPlace()); + + Eigen::Array inv_std(C); + ConstEigenVectorArrayMap b_size_arr( + ctx.Input("BatchSize")->data(), C); + ConstEigenVectorArrayMap b_sum_arr( + ctx.Input("BatchSum")->data(), C); + ConstEigenVectorArrayMap b_square_sum_arr( + ctx.Input("BatchSquareSum")->data(), C); + EigenVectorArrayMap means_arr(mean_out->mutable_data(ctx.GetPlace()), + C); + EigenVectorArrayMap scales_arr(scales->mutable_data(ctx.GetPlace()), + C); + means_arr = b_sum_arr / b_size_arr; + scales_arr = (b_size_arr / b_square_sum_arr).sqrt(); + + switch (data_layout) { + case DataLayout::kNCHW: // because it's two dimensions, so make no + // difference + case DataLayout::kNHWC: { + EigenArrayMap(y->mutable_data(ctx.GetPlace()), C, N) = + (ConstEigenArrayMap(x->data(), C, N).colwise() - means_arr) + .colwise() * + scales_arr; + break; + } + default: + PADDLE_THROW("Unknown storage order: %d", data_layout); + } + } +}; + +class DataNormGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + // check input + PADDLE_ENFORCE(ctx->HasInput("X")); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), ""); + PADDLE_ENFORCE(ctx->HasInput("BatchSize"), ""); + PADDLE_ENFORCE(ctx->HasInput("BatchSum"), ""); + PADDLE_ENFORCE(ctx->HasInput("BatchSquareSum"), ""); + PADDLE_ENFORCE(ctx->HasInput("Means"), ""); + PADDLE_ENFORCE(ctx->HasInput("Scales"), ""); + + // check output + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), ""); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("BatchSize")), ""); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("BatchSum")), ""); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("BatchSquareSum")), + ""); + + const auto x_dims = ctx->GetInputDim("X"); + const DataLayout data_layout = framework::StringToDataLayout( + ctx->Attrs().Get("data_layout")); + const int C = + (data_layout == DataLayout::kNCHW ? x_dims[1] + : x_dims[x_dims.size() - 1]); + + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + ctx->SetOutputDim(framework::GradVarName("BatchSize"), {C}); + ctx->SetOutputDim(framework::GradVarName("BatchSum"), {C}); + ctx->SetOutputDim(framework::GradVarName("BatchSquareSum"), {C}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + const auto *var = ctx.InputVar(framework::GradVarName("Y")); + if (var == nullptr) { + PADDLE_THROW("can't find Y@GRAD"); + } + const Tensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); + } else if (var->IsType()) { + t = &var->Get(); + } + if (t == nullptr) { + PADDLE_THROW("can't find Y@GRAD"); + } + + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + +#ifdef PADDLE_WITH_MKLDNN + if (library == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; + } +#endif + + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace(), layout, library); + } +}; + +template +class DataNormGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const auto *x = ctx.Input("X"); + const auto *d_y = ctx.Input(framework::GradVarName("Y")); + const auto *batch_size = ctx.Input("BatchSize"); + const auto *batch_sum = ctx.Input("BatchSum"); + const auto *batch_square_sum = ctx.Input("BatchSquareSum"); + const auto *scales = ctx.Input("Scales"); + const auto *means = ctx.Input("Means"); + + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + + // Get the size for each dimension. + // NCHW [batch_size, in_channels, in_height, in_width] + const auto &x_dims = x->dims(); + PADDLE_ENFORCE(x_dims.size() == 2, "The Input dim size should be 2"); + const int N = x_dims[0]; + const int C = + (data_layout == DataLayout::kNCHW ? x_dims[1] + : x_dims[x_dims.size() - 1]); + + // init output + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_batch_size = + ctx.Output(framework::GradVarName("BatchSize")); + auto *d_batch_sum = ctx.Output(framework::GradVarName("BatchSum")); + auto *d_batch_square_sum = + ctx.Output(framework::GradVarName("BatchSquareSum")); + + EigenVectorArrayMap d_batch_size_arr( + d_batch_size->mutable_data(ctx.GetPlace()), C); + EigenVectorArrayMap d_batch_sum_arr( + d_batch_sum->mutable_data(ctx.GetPlace()), C); + EigenVectorArrayMap d_batch_square_sum_arr( + d_batch_square_sum->mutable_data(ctx.GetPlace()), C); + + d_batch_size_arr.setZero(); + d_batch_sum_arr.setZero(); + d_batch_square_sum_arr.setZero(); + + const float epsilon = ctx.Attr("epsilon"); + switch ( + data_layout) { // because it's two dimensions, so make no difference + case DataLayout::kNCHW: + case DataLayout::kNHWC: { + ConstEigenVectorArrayMap scales_arr(scales->data(), C); + ConstEigenVectorArrayMap means_arr(means->data(), C); + ConstEigenArrayMap x_arr(x->data(), C, N); + ConstEigenArrayMap d_y_arr(d_y->data(), C, N); + EigenArrayMap d_x_arr(d_x->mutable_data(ctx.GetPlace()), C, N); + d_x_arr.setZero(); + for (int nc = 0; nc < N; ++nc) { + d_x_arr.col(nc) = d_y_arr.col(nc) * scales_arr; + } + + // calculate data sum and squre sum + ConstEigenVectorArrayMap batch_size_arr(batch_size->data(), C); + ConstEigenVectorArrayMap batch_sum_arr(batch_sum->data(), C); + ConstEigenVectorArrayMap batch_square_sum_arr( + batch_square_sum->data(), C); + Eigen::Array sample_sum(C); + Eigen::Array sample_square_sum(C); + // calculate data sample sum and square sum + sample_sum.setZero(); + sample_square_sum.setZero(); + for (int nc = 0; nc < N; ++nc) { + sample_sum += x_arr.col(nc); + sample_square_sum += (x_arr.col(nc) - means_arr).square(); + } + // calculate gradient + d_batch_size_arr.setConstant(N); + d_batch_sum_arr = sample_sum; + d_batch_square_sum_arr = sample_square_sum + d_batch_size_arr * epsilon; + break; + } + default: + PADDLE_THROW("Unknown storage order: %s", data_layout_str); + } + } +}; + +class DataNormGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *op = new framework::OpDesc(); + op->SetType("data_norm_grad"); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); + + op->SetInput("BatchSize", Input("BatchSize")); + op->SetInput("BatchSum", Input("BatchSum")); + op->SetInput("BatchSquareSum", Input("BatchSquareSum")); + op->SetInput("Scales", Output("Scales")); + op->SetInput("Means", Output("Means")); + + op->SetAttrMap(Attrs()); + + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("BatchSize"), InputGrad("BatchSize")); + op->SetOutput(framework::GradVarName("BatchSum"), InputGrad("BatchSum")); + op->SetOutput(framework::GradVarName("BatchSquareSum"), + InputGrad("BatchSquareSum")); + + return std::unique_ptr(op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(data_norm, ops::DataNormOp, ops::DataNormOpMaker, + ops::DataNormGradMaker); +REGISTER_OPERATOR(data_norm_grad, ops::DataNormGradOp); + +REGISTER_OP_CPU_KERNEL( + data_norm, ops::DataNormKernel, + ops::DataNormKernel); +REGISTER_OP_CPU_KERNEL( + data_norm_grad, + ops::DataNormGradKernel, + ops::DataNormGradKernel); diff --git a/paddle/fluid/operators/data_norm_op.h b/paddle/fluid/operators/data_norm_op.h new file mode 100644 index 00000000..63451214 --- /dev/null +++ b/paddle/fluid/operators/data_norm_op.h @@ -0,0 +1,35 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class DataNormKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override; +}; + +template +class DataNormGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/deformable_conv_op.cc b/paddle/fluid/operators/deformable_conv_op.cc new file mode 100644 index 00000000..92a93dc7 --- /dev/null +++ b/paddle/fluid/operators/deformable_conv_op.cc @@ -0,0 +1,278 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/conv_op.h" + +namespace paddle { +namespace operators { +class DeformableConvOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Input", + "(Tensor) The input of deformable conv op. " + "The shape of input is " + "[N, channel_in, H, W]"); + AddInput("Offset", + "(Tensor) The input offset. " + "The shape of the offset is " + "[N, deformable_groups * kernel_w * kernel_h * 2, H, W"); + AddInput("Mask", + "(Tensor) The input mask. " + "The shape of the mask is " + "[N, deformable_groups * kernel_w * kernel_h, H, W]."); + AddInput("Filter", + "(Tensor) The Input Filter " + "The shape of the wight is " + "[num_filters, channel_in, kernel_h, kernel_w."); + AddOutput("Output", + "(Tensor) The output. " + "The shape of the output tensor is " + "[N, num_filters, out_height, out_width]]."); + AddAttr>("strides", + "(vector default:{1, 1}), the " + "strides(h_stride, w_stride) of " + "convolution operator.") + .SetDefault({1, 1}); + AddAttr>("paddings", + "(vector default:{0,0}), the " + "paddings(h_pad, w_pad) of " + "convolution operator. ") + .SetDefault({0, 0}); + AddAttr>("dilations", + "(vector default:{1, 1}), the " + "dilations(h_dilation, w_dilation) of " + "convolution operator.") + .SetDefault({1, 1}); + AddAttr( + "groups", + "(int default:1), the groups number of the convolution operator. " + "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: " + "when group=2, the first half of the filters is only connected to the " + "first half of the input channels, while the second half of the " + "filters " + "is only connected to the second half of the input channels.") + .SetDefault(1); + AddAttr("deformable_groups", + "(int default:1), the number of the deformable groups.") + .SetDefault(1); + AddAttr("im2col_step", + "im2col maximum number of image per computation") + .SetDefault(64); + AddComment(R"DOC( +**Deformable Convolution Operator** + +Compute 2-D deformable convolution on 4-D input. + +Given input image x, output feature map y, the deformable convolution operation can be expressed as follow: + +$$ +y(p) = \\sum_{k=1}^{K}{w_k * x(p + p_k + \\Delta p_k) * \\Delta m_k} +$$ + +Where $$\\Delta p_k$$ and $$\Delta m_k$$ are the learnable offset and modulation scalar for the k-th location, respectively. + +Refer to 'Deformable ConvNets v2: More Deformable, Better Results +' + +Example: + Input: + Input shape: $(N, C_{in}, H_{in}, W_{in})$ + Filter shape: $(C_{out}, C_{in}, H_f, W_f)$ + Offset shape: $(N, 2 * deformable_groups, * H_f * W_f, H_{out}, W_{out})$ + Mask shape: $(N, deformable_groups * H_f * W_f, H_{out}, W_{out})$ + Output: + Output shape: $(N, C_{out}, H_{out}, W_{out})$ + where $H_{out}, W_{out}$ must be equal to $H_{in}, W_{in}$ respectively. + Where +$$ + H_{out}= \frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]}+ 1 \\ + W_{out}= \frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]}+ 1 +$$ +)DOC"); + } +}; + +class DeformableConvOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of DeformableConvOp " + "should not be null"); + PADDLE_ENFORCE(ctx->HasInput("Offset"), + "Input(Offset) of DeformableConvOp " + "should not be null"); + PADDLE_ENFORCE(ctx->HasInput("Mask"), + "Input(Mask) of DeformableConvOp " + "should not be null"); + PADDLE_ENFORCE(ctx->HasInput("Filter"), + "Input(Filter) of DeformableConvOp " + "should not be null"); + PADDLE_ENFORCE(ctx->HasOutput("Output"), + "Output(Output) of DeformableConvOp " + "should not be null."); + + auto in_dims = ctx->GetInputDim("Input"); + auto filter_dims = ctx->GetInputDim("Filter"); + auto offset_dims = ctx->GetInputDim("Offset"); + auto mask_dims = ctx->GetInputDim("Mask"); + + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); + std::vector dilations = + ctx->Attrs().Get>("dilations"); + int groups = ctx->Attrs().Get("groups"); + int deformable_groups = ctx->Attrs().Get("deformable_groups"); + int im2col_step = ctx->Attrs().Get("im2col_step"); + + PADDLE_ENFORCE(in_dims.size() == 4, + "Conv input should be 4-D tensor, get %u", in_dims.size()); + PADDLE_ENFORCE_EQ( + in_dims.size(), filter_dims.size(), + "Conv input dimension and filter dimension should be the same."); + PADDLE_ENFORCE_EQ( + in_dims.size() - strides.size(), 2U, + "Conv input dimension and strides dimension should be consistent."); + PADDLE_ENFORCE_EQ(paddings.size(), strides.size(), + "Conv paddings dimension and Conv strides dimension " + "should be the same."); + + PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[1] * groups, + "The number of input channels should be equal to filter " + "channels * groups."); + PADDLE_ENFORCE_EQ( + filter_dims[0] % groups, 0, + "The number of output channels should be divided by groups."); + PADDLE_ENFORCE_EQ(filter_dims[0] % deformable_groups, 0, + "The number of output channels should be " + "divided by deformable groups."); + + if (in_dims[0] > im2col_step) { + PADDLE_ENFORCE_EQ( + in_dims[0] % im2col_step, 0U, + "Input batchsize must be smaller than or divide im2col_step"); + } + + for (size_t i = 0; i < strides.size(); ++i) { + PADDLE_ENFORCE_GT(strides[i], 0U, "stride %d size incorrect", i); + } + for (size_t i = 0; i < dilations.size(); ++i) { + PADDLE_ENFORCE_GT(dilations[i], 0U, "dilation %d size incorrect", i); + } + + std::vector output_shape({in_dims[0], filter_dims[0]}); + for (size_t i = 0; i < strides.size(); ++i) { + output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], + dilations[i], paddings[i], + strides[i])); + } + PADDLE_ENFORCE_EQ(output_shape[1] % deformable_groups, 0U, + "output num_filter must divide deformable group size."); + PADDLE_ENFORCE_EQ(output_shape[2], offset_dims[2], + "output height must equal to offset map height."); + PADDLE_ENFORCE_EQ(output_shape[3], offset_dims[3], + "output width must equal to offset map width."); + PADDLE_ENFORCE_EQ(offset_dims[1] % (filter_dims[2] * filter_dims[3]), 0U, + "offset filter must divide deformable group size."); + PADDLE_ENFORCE_EQ(offset_dims[1] / (2 * filter_dims[2] * filter_dims[3]), + deformable_groups, + "offset filter must divide deformable group size."); + PADDLE_ENFORCE_EQ(output_shape[2], mask_dims[2], + "output height must equal to mask map height."); + PADDLE_ENFORCE_EQ(output_shape[3], mask_dims[3], + "output width must equal to mask map width."); + PADDLE_ENFORCE_EQ(mask_dims[1] % (filter_dims[2] * filter_dims[3]), 0U, + "mask filter must divide deformable group size."); + PADDLE_ENFORCE_EQ(mask_dims[1] / (filter_dims[2] * filter_dims[3]), + deformable_groups, + "mask filter must divide deformable group size."); + + ctx->SetOutputDim("Output", framework::make_ddim(output_shape)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType(ctx.Input("Input")->type(), + ctx.device_context()); + } +}; + +class DeformableConvGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + + op->SetType("deformable_conv_grad"); + op->SetInput("Input", Input("Input")); + op->SetInput("Filter", Input("Filter")); + op->SetInput("Offset", Input("Offset")); + op->SetInput("Mask", Input("Mask")); + op->SetInput(framework::GradVarName("Output"), OutputGrad("Output")); + + op->SetOutput(framework::GradVarName("Input"), InputGrad("Input")); + op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter")); + op->SetOutput(framework::GradVarName("Offset"), InputGrad("Offset")); + op->SetOutput(framework::GradVarName("Mask"), InputGrad("Mask")); + + op->SetAttrMap(Attrs()); + return op; + } +}; + +class DeformableConvGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + auto in_dims = ctx->GetInputDim("Input"); + auto filter_dims = ctx->GetInputDim("Filter"); + auto offset_dims = ctx->GetInputDim("Offset"); + auto mask_dims = ctx->GetInputDim("Mask"); + + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Output")), + "the gradient of output(Out) must not be null"); + if (ctx->HasOutput(framework::GradVarName("Input"))) { + ctx->SetOutputDim(framework::GradVarName("Input"), in_dims); + } + if (ctx->HasOutput(framework::GradVarName("Filter"))) { + ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims); + } + if (ctx->HasOutput(framework::GradVarName("Offset"))) { + ctx->SetOutputDim(framework::GradVarName("Offset"), offset_dims); + } + if (ctx->HasOutput(framework::GradVarName("Mask"))) { + ctx->SetOutputDim(framework::GradVarName("Mask"), mask_dims); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType(ctx.Input("Input")->type(), + ctx.device_context()); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(deformable_conv, ops::DeformableConvOp, + ops::DeformableConvOpMaker, + ops::DeformableConvGradOpDescMaker); + +REGISTER_OPERATOR(deformable_conv_grad, ops::DeformableConvGradOp); diff --git a/paddle/fluid/operators/deformable_conv_op.cu b/paddle/fluid/operators/deformable_conv_op.cu new file mode 100644 index 00000000..afcd418f --- /dev/null +++ b/paddle/fluid/operators/deformable_conv_op.cu @@ -0,0 +1,752 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Part of the following code in this file refs to +// https://github.com/msracver/Deformable-ConvNets/blob/master/DCNv2_op/nn/modulated_deformable_im2col.cuh +// +// Copyright (c) 2018 Microsoft +// Licensed under The MIT License [see LICENSE for details] +// \file modulated_deformable_im2col.cuh +// \brief +// \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu + +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaximumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaximumNumBlocks); +} + +template +__device__ T DmcnGetGradientWeight(T argmax_h, T argmax_w, const int h, + const int w, const int height, + const int width) { + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || + argmax_w >= width) { + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + T weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} + +template +__global__ void ModulatedDeformableCol2imGpuKernel( + const int nthreads, const T* data_col, const T* data_offset, + const T* data_mask, const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int dilation_h, + const int dilation_w, const int channel_per_deformable_group, + const int batch_size, const int deformable_group, const int height_col, + const int width_col, T* grad_im) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (size_t thread = index; thread < nthreads; thread += offset) { + const int j = (thread / width_col / height_col / batch_size) % kernel_w; + const int i = + (thread / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = + thread / width_col / height_col / batch_size / kernel_w / kernel_h; + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = thread % width_col; + int h_out = (thread / width_col) % height_col; + int b = (thread / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const T* data_offset_ptr = data_offset + + (b * deformable_group + deformable_group_index) * + 2 * kernel_h * kernel_w * height_col * + width_col; + const T* data_mask_ptr = data_mask + + (b * deformable_group + deformable_group_index) * + kernel_h * kernel_w * height_col * width_col; + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const int data_mask_hw_ptr = + ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + const T mask = data_mask_ptr[data_mask_hw_ptr]; + const T cur_inv_h_data = h_in + i * dilation_h + offset_h; + const T cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const T cur_top_grad = data_col[thread] * mask; + const int cur_h = static_cast(cur_inv_h_data); + const int cur_w = static_cast(cur_inv_w_data); + for (int dy = -2; dy <= 2; dy++) { + for (int dx = -2; dx <= 2; dx++) { + if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 && + cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) { + int cur_bottom_grad_pos = + ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + T weight = + DmcnGetGradientWeight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, + cur_w + dx, height, width); + + atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + } + } + } + } +} + +template +inline void ModulatedDeformableCol2im( + const platform::DeviceContext& ctx, const T* data_col, const T* data_offset, + const T* data_mask, const std::vector im_shape, + const std::vector col_shape, + const std::vector kernel_shape, const std::vector pad, + const std::vector stride, const std::vector dilation, + const int deformable_group, T* grad_im) { + int channel_per_deformable_group = im_shape[0] / deformable_group; + int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3]; + int blocks = NumBlocks(num_kernels); + int threads = kNumCUDAThreads; + + ModulatedDeformableCol2imGpuKernel<<< + blocks, threads, 0, + reinterpret_cast(ctx).stream()>>>( + num_kernels, data_col, data_offset, data_mask, im_shape[0], im_shape[1], + im_shape[2], kernel_shape[2], kernel_shape[3], pad[0], pad[1], stride[0], + stride[1], dilation[0], dilation[1], channel_per_deformable_group, + col_shape[1], deformable_group, col_shape[2], col_shape[3], grad_im); +} + +template +__device__ T DmcnGetCoordinateWeight(T argmax_h, T argmax_w, const int height, + const int width, const T* im_data, + const int data_width, const int bp_dir) { + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || + argmax_w >= width) { + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + T weight = 0; + + if (bp_dir == 0) { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * + im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * + im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * + im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_w - argmax_w_low) * + im_data[argmax_h_high * data_width + argmax_w_high]; + } else if (bp_dir == 1) { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * + im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * + im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) * + im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * + im_data[argmax_h_high * data_width + argmax_w_high]; + } + return weight; +} + +template +__global__ void ModulatedDeformableCol2imCoordGpuKernel( + const int nthreads, const T* data_col, const T* data_im, + const T* data_offset, const T* data_mask, const int channels, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, const int batch_size, + const int offset_channels, const int deformable_group, const int height_col, + const int width_col, T* grad_offset, T* grad_mask) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (size_t i = index; i < nthreads; i += offset) { + T val = 0, mval = 0; + const int w = i % width_col; + const int h = (i / width_col) % height_col; + const int c = (i / width_col / height_col) % offset_channels; + const int b = (i / width_col / height_col) / offset_channels; + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const T* data_col_ptr = data_col + + deformable_group_index * + channel_per_deformable_group * batch_size * + width_col * height_col; + const T* data_im_ptr = data_im + + (b * deformable_group + deformable_group_index) * + channel_per_deformable_group / kernel_h / + kernel_w * height * width; + const T* data_offset_ptr = data_offset + + (b * deformable_group + deformable_group_index) * + 2 * kernel_h * kernel_w * height_col * + width_col; + const T* data_mask_ptr = data_mask + + (b * deformable_group + deformable_group_index) * + kernel_h * kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = offset_c / 2; col_c < channel_per_deformable_group; + col_c += col_step) { + const int col_pos = + (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = + (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = + (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = + (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + + w_out); + const int data_mask_hw_ptr = + (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + const T mask = data_mask_ptr[data_mask_hw_ptr]; + T inv_h = h_in + i * dilation_h + offset_h; + T inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) { + inv_h = inv_w = -2; + } else { + mval += data_col_ptr[col_pos] * + DmcnIm2colBilinear(data_im_ptr + cnt * height * width, width, + height, width, inv_h, inv_w); + } + const T weight = DmcnGetCoordinateWeight( + inv_h, inv_w, height, width, data_im_ptr + cnt * height * width, + width, bp_dir); + val += weight * data_col_ptr[col_pos] * mask; + cnt += 1; + } + grad_offset[i] = val; + if (offset_c % 2 == 0) + grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * + kernel_w + + offset_c / 2) * + height_col + + h) * + width_col + + w] = mval; + } +} + +template +inline void ModulatedDeformableCol2imCoord( + const platform::DeviceContext& ctx, const T* data_col, const T* data_im, + const T* data_offset, const T* data_mask, + const std::vector im_shape, const std::vector col_shape, + const std::vector kernel_shape, const std::vector paddings, + const std::vector strides, const std::vector dilations, + const int deformable_groups, T* grad_offset, T* grad_mask) { + int num_kernels = 2 * kernel_shape[2] * kernel_shape[3] * col_shape[1] * + col_shape[2] * col_shape[3] * deformable_groups; + int channel_per_deformable_group = col_shape[0] / deformable_groups; + int blocks = NumBlocks(num_kernels); + int threads = kNumCUDAThreads; + + ModulatedDeformableCol2imCoordGpuKernel<<< + blocks, threads, 0, + reinterpret_cast(ctx).stream()>>>( + num_kernels, data_col, data_im, data_offset, data_mask, im_shape[0], + im_shape[1], im_shape[2], kernel_shape[2], kernel_shape[3], paddings[0], + paddings[1], strides[0], strides[1], dilations[0], dilations[1], + channel_per_deformable_group, col_shape[1], + 2 * kernel_shape[2] * kernel_shape[3] * deformable_groups, + deformable_groups, col_shape[2], col_shape[3], grad_offset, grad_mask); +} + +template +__device__ T DmcnIm2colBilinear(const T* bottom_data, const int data_width, + const int height, const int width, T h, T w) { + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + T lh = h - h_low; + T lw = w - w_low; + T hh = 1 - lh, hw = 1 - lw; + + T v1 = 0; + if (h_low >= 0 && w_low >= 0) v1 = bottom_data[h_low * data_width + w_low]; + T v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = bottom_data[h_low * data_width + w_high]; + T v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = bottom_data[h_high * data_width + w_low]; + T v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = bottom_data[h_high * data_width + w_high]; + + T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +template +__global__ void ModulatedDeformableIm2colGpuKernel( + const int nthreads, const T* data_im, const T* data_offset, + const T* data_mask, const int height, const int width, const int kernel_h, + const int kernel_w, const int pad_h, const int pad_w, const int stride_h, + const int stride_w, const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, const int batch_size, + const int num_channels, const int deformable_group, const int height_col, + const int width_col, T* data_col) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (size_t i = index; i < nthreads; i += offset) { + const int w_col = i % width_col; + const int h_col = (i / width_col) % height_col; + const int b_col = (i / width_col) / height_col % batch_size; + const int c_im = (i / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + + T* data_col_ptr = + data_col + + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + const T* data_im_ptr = + data_im + (b_col * num_channels + c_im) * height * width; + const T* data_offset_ptr = + data_offset + + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * + kernel_w * height_col * width_col; + const T* data_mask_ptr = + data_mask + + (b_col * deformable_group + deformable_group_index) * kernel_h * + kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + + w_col; + const int data_mask_hw_ptr = + ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; + + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + const T mask = data_mask_ptr[data_mask_hw_ptr]; + T val = static_cast(0); + const T h_im = h_in + i * dilation_h + offset_h; + const T w_im = w_in + j * dilation_w + offset_w; + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { + val = + DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val * mask; + data_col_ptr += batch_size * height_col * width_col; + } + } + } +} + +template +inline void ModulatedDeformableIm2col( + const platform::DeviceContext& ctx, const T* data_im, const T* data_offset, + const T* data_mask, const std::vector im_shape, + const std::vector col_shape, + const std::vector filter_shape, const std::vector paddings, + const std::vector strides, const std::vector dilations, + const int deformable_groups, T* data_col) { + int channel_per_deformable_group = im_shape[0] / deformable_groups; + int num_kernels = im_shape[0] * col_shape[1] * col_shape[2] * col_shape[3]; + + int blocks = NumBlocks(num_kernels); + int threads = kNumCUDAThreads; + + ModulatedDeformableIm2colGpuKernel<<< + blocks, threads, 0, + reinterpret_cast(ctx).stream()>>>( + num_kernels, data_im, data_offset, data_mask, im_shape[1], im_shape[2], + filter_shape[2], filter_shape[3], paddings[0], paddings[1], strides[0], + strides[1], dilations[0], dilations[1], channel_per_deformable_group, + col_shape[1], im_shape[0], deformable_groups, col_shape[2], col_shape[3], + data_col); +} + +template +__global__ void FilterGradAddupGpuKernel(const int nthreads, const int n, + const int height, const int width, + const T* dweight_3d, T* filter_grad) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (size_t i = index; i < nthreads; i += offset) { + filter_grad[i] = filter_grad[i] + dweight_3d[i]; + } +} + +template +class DeformableConvCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor* input = ctx.Input("Input"); + const Tensor offset = *ctx.Input("Offset"); + const Tensor mask = *ctx.Input("Mask"); + Tensor filter = *ctx.Input("Filter"); + Tensor* output = ctx.Output("Output"); + output->mutable_data(ctx.GetPlace()); + + auto& dev_ctx = ctx.cuda_device_context(); + + const int groups = ctx.Attr("groups"); + const int deformable_groups = ctx.Attr("deformable_groups"); + const int im2col_step = ctx.Attr("im2col_step"); + const std::vector strides = ctx.Attr>("strides"); + const std::vector paddings = ctx.Attr>("paddings"); + const std::vector dilations = ctx.Attr>("dilations"); + + const int batch_size = static_cast(input->dims()[0]); + + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + std::vector output_shape_vec(framework::vectorize(output->dims())); + + // col_shape_vec: {c_i * k_h * k_w, im2col_step, o_h, o_w} + std::vector col_buffer_shape_vec(filter_shape_vec.size()); + col_buffer_shape_vec[0] = + input->dims()[1] * filter.dims()[2] * filter.dims()[3]; + col_buffer_shape_vec[1] = im2col_step; + for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) { + col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_buffer_shape_vec)); + std::vector output_buffer_shape_vec(1); + output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] * + output_shape_vec[2] * output_shape_vec[3]; + framework::DDim output_shape(framework::make_ddim(output_buffer_shape_vec)); + Tensor col_buffer; + Tensor output_buffer; + col_buffer = ctx.AllocateTmpTensor(col_shape, dev_ctx); + output_buffer = + ctx.AllocateTmpTensor(output_shape, dev_ctx); + + int64_t M = output_shape_vec[1] / groups; + int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3]; + int64_t K = + input->dims()[1] * filter_shape_vec[2] * filter_shape_vec[3] / groups; + + Tensor weight_3d; + weight_3d.ShareDataWith(filter).Resize( + framework::make_ddim({groups, M, K})); + Tensor col_buffer_3d; + col_buffer_3d.ShareDataWith(col_buffer) + .Resize(framework::make_ddim({groups, K, N})); + Tensor output_4d; + output_4d.ShareDataWith(output_buffer) + .Resize(framework::make_ddim({batch_size / im2col_step, groups, M, N})); + output_4d.mutable_data(ctx.GetPlace()); + framework::DDim input_shape = + framework::slice_ddim(input->dims(), 1, input->dims().size()); + std::vector input_shape_vec = framework::vectorize(input_shape); + + int input_dim = input->numel() / input->dims()[0]; + int input_offset_dim = offset.numel() / offset.dims()[0]; + int input_mask_dim = mask.numel() / mask.dims()[0]; + + auto blas = math::GetBlas(dev_ctx); + + const T* input_ptr = input->data(); + const T* offset_ptr = offset.data(); + const T* mask_ptr = mask.data(); + col_buffer.mutable_data(ctx.GetPlace()); + T* col_buffer_ptr = col_buffer.data(); + + for (int i = 0; i < batch_size / im2col_step; ++i) { + ModulatedDeformableIm2col( + ctx.device_context(), input_ptr + i * im2col_step * input_dim, + offset_ptr + i * im2col_step * input_offset_dim, + mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec, + col_buffer_shape_vec, filter_shape_vec, paddings, strides, dilations, + deformable_groups, col_buffer_ptr); + + Tensor output_3d = output_4d.Slice(i, i + 1).Resize( + framework::slice_ddim(output_4d.dims(), 1, output_4d.dims().size())); + for (int g = 0; g < groups; ++g) { + Tensor weight_3d_slice = + weight_3d.Slice(g, g + 1).Resize(framework::slice_ddim( + weight_3d.dims(), 1, weight_3d.dims().size())); + Tensor col_buffer_3d_slice = + col_buffer_3d.Slice(g, g + 1).Resize(framework::slice_ddim( + col_buffer_3d.dims(), 1, col_buffer_3d.dims().size())); + Tensor output_3d_slice = + output_3d.Slice(g, g + 1).Resize(framework::slice_ddim( + output_3d.dims(), 1, output_3d.dims().size())); + + blas.MatMul(weight_3d_slice, false, col_buffer_3d_slice, false, T(1.0), + &output_3d_slice, T(0.0)); + } + } + output->ShareDataWith(output_buffer) + .Resize(framework::make_ddim(output_shape_vec)); + } +}; + +template +class DeformableConvGradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor* output_grad = + ctx.Input(framework::GradVarName("Output")); + Tensor* input_grad = ctx.Output(framework::GradVarName("Input")); + Tensor* filter_grad = ctx.Output(framework::GradVarName("Filter")); + Tensor* offset_grad = ctx.Output(framework::GradVarName("Offset")); + Tensor* mask_grad = ctx.Output(framework::GradVarName("Mask")); + + const Tensor* input = ctx.Input("Input"); + Tensor offset = *ctx.Input("Offset"); + Tensor mask = *ctx.Input("Mask"); + Tensor filter = *ctx.Input("Filter"); + if (!input_grad && !filter_grad && !offset_grad && !mask_grad) return; + + int groups = ctx.Attr("groups"); + int deformable_groups = ctx.Attr("deformable_groups"); + int im2col_step = ctx.Attr("im2col_step"); + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + + auto& dev_ctx = ctx.cuda_device_context(); + const int batch_size = static_cast(input->dims()[0]); + + framework::DDim input_shape = + framework::slice_ddim(input->dims(), 1, input->dims().size()); + std::vector input_shape_vec = framework::vectorize(input_shape); + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + std::vector output_shape_vec( + framework::vectorize(output_grad->dims())); + + std::vector col_buffer_shape_vec(filter_shape_vec.size()); + col_buffer_shape_vec[0] = + input->dims()[1] * filter.dims()[2] * filter.dims()[3]; + col_buffer_shape_vec[1] = im2col_step; + for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) { + col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_buffer_shape_vec)); + std::vector output_buffer_shape_vec(1); + output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] * + output_shape_vec[2] * output_shape_vec[3]; + framework::DDim output_shape(framework::make_ddim(output_buffer_shape_vec)); + Tensor col_buffer; + Tensor output_buffer; + col_buffer = ctx.AllocateTmpTensor(col_shape, dev_ctx); + output_buffer = + ctx.AllocateTmpTensor(output_shape, dev_ctx); + + output_buffer.ShareDataWith(*output_grad); + + int64_t M = + input_shape_vec[0] / groups * filter_shape_vec[2] * filter_shape_vec[3]; + int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3]; + int64_t K = output_shape_vec[1] / groups; + + framework::DDim weight_3d_shape = {groups, K, M}; + framework::DDim out_grad_4d_shape = {batch_size / im2col_step, groups, K, + N}; + framework::DDim col_buffer_3d_shape = {groups, M, N}; + framework::DDim filter_grad_shape = {groups, K, M}; + + Tensor weight_3d; + weight_3d.ShareDataWith(filter).Resize(weight_3d_shape); + Tensor out_grad_4d; + out_grad_4d.ShareDataWith(output_buffer).Resize(out_grad_4d_shape); + Tensor col_buffer_3d; + col_buffer_3d.ShareDataWith(col_buffer).Resize(col_buffer_3d_shape); + + math::SetConstant set_zero; + auto blas = math::GetBlas(dev_ctx); + + col_buffer.mutable_data(ctx.GetPlace()); + col_buffer_3d.mutable_data(ctx.GetPlace()); + out_grad_4d.mutable_data(ctx.GetPlace()); + + int input_dim = input->numel() / input->dims()[0]; + int input_offset_dim = offset.numel() / offset.dims()[0]; + int input_mask_dim = mask.numel() / mask.dims()[0]; + + if (filter_grad) { + filter_grad->mutable_data(ctx.GetPlace()); + filter_grad->Resize(filter_grad_shape); + set_zero(dev_ctx, filter_grad, static_cast(0)); + } + + if (input_grad) { + input_grad->mutable_data(ctx.GetPlace()); + set_zero(dev_ctx, input_grad, static_cast(0)); + } + + if (offset_grad && mask_grad) { + offset_grad->mutable_data(ctx.GetPlace()); + mask_grad->mutable_data(ctx.GetPlace()); + set_zero(dev_ctx, offset_grad, static_cast(0)); + set_zero(dev_ctx, mask_grad, static_cast(0)); + } + + for (int i = 0; i < batch_size / im2col_step; ++i) { + Tensor out_grad_3d = + out_grad_4d.Slice(i, i + 1).Resize(framework::slice_ddim( + out_grad_4d.dims(), 1, out_grad_4d.dims().size())); + for (int g = 0; g < groups; ++g) { + Tensor weight_3d_slice = + weight_3d.Slice(g, g + 1).Resize(framework::slice_ddim( + weight_3d.dims(), 1, weight_3d.dims().size())); + Tensor out_grad_3d_slice = + out_grad_3d.Slice(g, g + 1).Resize(framework::slice_ddim( + out_grad_3d.dims(), 1, out_grad_3d.dims().size())); + Tensor col_buffer_3d_slice = + col_buffer_3d.Slice(g, g + 1).Resize(framework::slice_ddim( + col_buffer_3d.dims(), 1, col_buffer_3d.dims().size())); + + blas.MatMul(weight_3d_slice, true, out_grad_3d_slice, false, T(1.0), + &col_buffer_3d_slice, T(0.0)); + } + col_buffer.Resize(col_shape); + + T* col_buffer_ptr = col_buffer.data(); + const T* input_ptr = input->data(); + const T* offset_ptr = offset.data(); + const T* mask_ptr = mask.data(); + + if (mask_grad && offset_grad) { + T* offset_grad_ptr = offset_grad->data(); + T* mask_grad_ptr = mask_grad->data(); + ModulatedDeformableCol2imCoord( + ctx.device_context(), col_buffer_ptr, + input_ptr + i * im2col_step * input_dim, + offset_ptr + i * im2col_step * input_offset_dim, + mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec, + col_buffer_shape_vec, filter_shape_vec, paddings, strides, + dilations, deformable_groups, + offset_grad_ptr + i * im2col_step * input_offset_dim, + mask_grad_ptr + i * im2col_step * input_mask_dim); + } + if (input_grad) { + T* input_grad_ptr = input_grad->data(); + ModulatedDeformableCol2im( + ctx.device_context(), col_buffer_ptr, + offset_ptr + i * im2col_step * input_offset_dim, + mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec, + col_buffer_shape_vec, filter_shape_vec, paddings, strides, + dilations, deformable_groups, + input_grad_ptr + i * im2col_step * input_dim); + input_grad->Resize(input->dims()); + } + + ModulatedDeformableIm2col( + ctx.device_context(), input_ptr + i * im2col_step * input_dim, + offset_ptr + i * im2col_step * input_offset_dim, + mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec, + col_buffer_shape_vec, filter_shape_vec, paddings, strides, dilations, + deformable_groups, col_buffer_ptr); + + col_buffer_3d.Resize(col_buffer_3d_shape); + + if (filter_grad) { + Tensor dweight_3d; + dweight_3d = + ctx.AllocateTmpTensor(filter_grad_shape, dev_ctx); + for (int g = 0; g < groups; ++g) { + Tensor out_grad_3d_slice = + out_grad_3d.Slice(g, g + 1).Resize(framework::slice_ddim( + out_grad_3d.dims(), 1, out_grad_3d.dims().size())); + Tensor col_buffer_3d_slice = + col_buffer_3d.Slice(g, g + 1).Resize(framework::slice_ddim( + col_buffer_3d.dims(), 1, col_buffer_3d.dims().size())); + Tensor dweight_3d_slice = + dweight_3d.Slice(g, g + 1).Resize(framework::slice_ddim( + dweight_3d.dims(), 1, dweight_3d.dims().size())); + + blas.MatMul(out_grad_3d_slice, false, col_buffer_3d_slice, true, + T(1.0), &dweight_3d_slice, T(0.0)); + } + FilterGradAddupGpuKernel< + T><<>>( + dweight_3d.numel(), groups, K, M, dweight_3d.data(), + filter_grad->data()); + } + } + if (filter_grad) { + filter_grad->Resize(filter.dims()); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CUDA = paddle::platform::CUDADeviceContext; + +REGISTER_OP_CUDA_KERNEL(deformable_conv, + ops::DeformableConvCUDAKernel); +REGISTER_OP_CUDA_KERNEL(deformable_conv_grad, + ops::DeformableConvGradCUDAKernel); diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cc b/paddle/fluid/operators/deformable_psroi_pooling_op.cc new file mode 100644 index 00000000..d17f22b9 --- /dev/null +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cc @@ -0,0 +1,270 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/deformable_psroi_pooling_op.h" +#include +#include +#include +#include "paddle/fluid/operators/math/blas.h" + +namespace paddle { +namespace operators { +class DeformablePSROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Input", + "(Tensor), " + "the input of Deformable PSROIPooling. " + "The shape of input tensor is [N,C,H,W]. Where N is batch size, " + "C is number of input channels, " + "H is height of the feature, and " + "W is the width of the feature."); + AddInput("ROIs", + "(LoDTensor), " + "ROIs (Regions of Interest) to pool over. " + "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) " + "given as [[x1, y1, x2, y2], ...]. " + "(x1, y1) is the top left coordinates, and " + "(x2, y2) is the bottom right coordinates."); + AddInput("Trans", + "(Tensor)," + "offset of features on ROIs while pooling. " + "The format is NCHW, where N is number of ROIs, " + "C is number of channels, which indicate the offset distance " + "in the x and y directions, " + "H is pooled height, and " + "W is pooled width."); + AddAttr("no_trans", + "(bool), " + "whether add offset to get new value or not while roi " + "pooling, which value is True or False"); + AddAttr("spatial_scale", + "(float), " + "ratio of input feature map height (or width) to " + "raw image height (or width). Equals the reciprocal " + "of total stride in convolutional layers."); + AddAttr("output_dim", + "(int), " + "the number of output channels, which should be less than " + "input channels. Deformable roi_pooling requires " + "output_channels = input_channels, while deformable " + "psroi_pooling requires output_channels = input_channels " + "* pooled_height * pooled_width"); + AddAttr>( + "group_size", + "(vector), " + "the number of groups which input channels are divided." + "(eg.number of input channels is k1*k2*(C+1), which k1 and k2 " + "are group width and height and C+1 is number of output " + "chanels. eg.(4, 6), which 4 is height of group and 6 is " + "width of group"); + AddAttr("pooled_height", + "(int), " + "the pooled output height."); + AddAttr("pooled_width", + "(int), " + "the pooled output width."); + AddAttr>( + "part_size", + "(vector), " + "the height and width of offset, eg.(4, 6), which height is 4 " + " and width is 6"); + AddAttr("sample_per_part", + "(int), " + "the number of samples in each bin"); + AddAttr("trans_std", + "(float), " + "Coefficient of offset"); + AddOutput("TopCount", + "(Tensor), " + "record the number of pixel in average pooling to in each bin. " + "The format is NCHW, where N is the number of ROIs, " + "C is the number of output channels, " + "H is the height of output, and " + "W is the width of output."); + AddOutput("Output", + "(Tensor), " + "the output of Deformable PSROIPooling. " + "The format is NCHW, where N is the number of ROIs, " + "C is the number of output channels, " + "H is the height of output, and " + "W is thewidth of output. "); + AddComment(R"DOC( +**DeformablePSROIPooling Operator** +DeformablePSROIPooling is a new method based Region of interest pooling +(also known as RoI pooling). +The operator has four steps: + +1. Dividing each region proposal into equal-sized sections with + the pooled_width and pooled_height. + +2. Add offset to pixel in ROI to get new location and the new value which are + computed directly through bilinear interpolation with four nearest pixel. + +3. Sample several points to get average values in each bin. + +4. Copying these average values to the output buffer. + +DeformablePSROIPooling is part of Deformable Convolutional Networks, +please refer to https://arxiv.org/abs/1703.06211 for more details. + )DOC"); + } +}; + +class DeformablePSROIPoolOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of DeformablePSROIPoolOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasInput("ROIs"), + "Input(ROIs) of DeformablePSROIPoolOp " + "should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Trans"), + "Input(Trans) of DeformablePSROIPoolOp " + "should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Output"), + "Output(Output) of DeformablePSROIPoolOp " + "should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("TopCount"), + "Output(TopCount) of DeformablePSROIPoolOp " + "should not be null."); + auto input_dims = ctx->GetInputDim("Input"); + auto rois_dims = ctx->GetInputDim("ROIs"); + auto trans_dims = ctx->GetInputDim("Trans"); + PADDLE_ENFORCE(rois_dims.size() == 2, + "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)" + "given as [[ x1, y1, x2, y2], ...]."); + PADDLE_ENFORCE(trans_dims.size() == 4, + "The format of Input Trans is (N, 2, H, W)."); + auto pooled_height = ctx->Attrs().Get("pooled_height"); + auto pooled_width = ctx->Attrs().Get("pooled_width"); + auto spatial_scale = ctx->Attrs().Get("spatial_scale"); + auto output_channels = ctx->Attrs().Get("output_dim"); + auto group_size = ctx->Attrs().Get>("group_size"); + auto group_height = group_size[0]; + auto group_width = group_size[1]; + auto part_size = ctx->Attrs().Get>("part_size"); + auto part_height = part_size[0]; + auto part_width = part_size[1]; + auto sample_per_part = ctx->Attrs().Get("sample_per_part"); + auto trans_std = ctx->Attrs().Get("trans_std"); + PADDLE_ENFORCE(trans_std >= 0.0f, "trans_std must greater than 0.0"); + PADDLE_ENFORCE(input_dims[1] >= output_channels, + "input channels must greater than out_channels"); + PADDLE_ENFORCE_GT(pooled_height, 0, + "The pooled height must greater than 0"); + PADDLE_ENFORCE_GT(pooled_width, 0, "The pooled width must greater than 0"); + PADDLE_ENFORCE_GT(spatial_scale, 0.0f, + "The spatial scale must greater than 0"); + PADDLE_ENFORCE_EQ(group_size.size(), 2, + "The size of group_size should be 2."); + PADDLE_ENFORCE_GT(group_height, 0, + "The group_height in group_size must greater than 0"); + PADDLE_ENFORCE_GT(group_width, 0, + "The group_width in group_size must greater than 0"); + PADDLE_ENFORCE_EQ(part_size.size(), 2, + "The size of part_size should be 2."); + PADDLE_ENFORCE_GT(part_height, 0, + "The part_height in part_size must greater than 0"); + PADDLE_ENFORCE_GT(part_width, 0, + "The part_width in part_size must greater than 0"); + PADDLE_ENFORCE(part_height <= trans_dims[2], + "The height of trans must greater than part_height"); + PADDLE_ENFORCE(part_width <= trans_dims[3], + "The width of trans must greater than part_width"); + PADDLE_ENFORCE_GT(sample_per_part, 0, + "The sample_per_part must greater than 0"); + auto out_dims = input_dims; + out_dims[0] = rois_dims[0]; + out_dims[1] = output_channels; + out_dims[2] = pooled_height; + out_dims[3] = pooled_width; + ctx->SetOutputDim("Output", out_dims); + ctx->SetOutputDim("TopCount", out_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType(ctx.Input("Input")->type(), + ctx.device_context()); + } +}; + +class DeformablePSROIPoolGradOpDescMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + + op->SetType("deformable_psroi_pooling_grad"); + op->SetInput("Input", Input("Input")); + op->SetInput("Trans", Input("Trans")); + op->SetInput("ROIs", Input("ROIs")); + op->SetInput("TopCount", Output("TopCount")); + op->SetInput(framework::GradVarName("Output"), OutputGrad("Output")); + + op->SetOutput(framework::GradVarName("Input"), InputGrad("Input")); + op->SetOutput(framework::GradVarName("Trans"), InputGrad("Trans")); + + op->SetAttrMap(Attrs()); + return op; + } +}; + +class DeformablePSROIPoolGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Output")), + "The gradient of Output should not be null."); + if (ctx->HasOutput(framework::GradVarName("Input"))) { + ctx->SetOutputDim(framework::GradVarName("Input"), + ctx->GetInputDim("Input")); + } + if (ctx->HasOutput(framework::GradVarName("Trans"))) { + ctx->SetOutputDim(framework::GradVarName("Trans"), + ctx->GetInputDim("Trans")); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType(ctx.Input("Trans")->type(), + ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CPU = paddle::platform::CPUDeviceContext; +REGISTER_OPERATOR(deformable_psroi_pooling, ops::DeformablePSROIPoolOp, + ops::DeformablePSROIPoolOpMaker, + ops::DeformablePSROIPoolGradOpDescMaker); +REGISTER_OPERATOR(deformable_psroi_pooling_grad, + ops::DeformablePSROIPoolGradOp); +REGISTER_OP_CPU_KERNEL(deformable_psroi_pooling, + ops::DeformablePSROIPoolCPUKernel, + ops::DeformablePSROIPoolCPUKernel); +REGISTER_OP_CPU_KERNEL(deformable_psroi_pooling_grad, + ops::DeformablePSROIPoolGradCPUKernel, + ops::DeformablePSROIPoolGradCPUKernel); diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cu b/paddle/fluid/operators/deformable_psroi_pooling_op.cu new file mode 100644 index 00000000..c38e9553 --- /dev/null +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cu @@ -0,0 +1,532 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Part of the following code in this file refs to +// https://github.com/msracver/Deformable-ConvNets/blob/master/faster_rcnn/operator_cxx/deformable_psroi_pooling.cu +// +// Copyright (c) 2017 Microsoft +// Licensed under The Apache-2.0 License [see LICENSE for details] +// \file deformable_psroi_pooling.cu +// \brief +// \author Yi Li, Guodong Zhang, Jifeng Dai + +#pragma once +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/deformable_psroi_pooling_op.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +static inline int GET_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +template +__device__ T bilinear_interpolation(const T* data, const T x, const T y, + const int width, const int height) { + int x1 = floor(x); + int x2 = ceil(x); + int y1 = floor(y); + int y2 = ceil(y); + T dist_x = static_cast(x - x1); + T dist_y = static_cast(y - y1); + T value11 = data[y1 * width + x1]; + T value12 = data[y2 * width + x1]; + T value21 = data[y1 * width + x2]; + T value22 = data[y2 * width + x2]; + T value = (1 - dist_x) * (1 - dist_y) * value11 + + (1 - dist_x) * dist_y * value12 + dist_x * (1 - dist_y) * value21 + + dist_x * dist_y * value22; + return value; +} + +template +__global__ void DeformablePSROIPoolForwardKernel( + const int count, const T* bottom_data, const T spatial_scale, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const T* bottom_rois, + const T* bottom_trans, const bool no_trans, const T trans_std, + const int sample_per_part, const int output_dim, const int group_height, + const int group_width, const int part_height, const int part_width, + const int num_classes, const int channels_each_class, T* top_data, + T* top_count, int* roi_batch_id_data) { + CUDA_KERNEL_LOOP(index, count) { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + const T* offset_bottom_rois = bottom_rois + n * 4; + int roi_batch_ind = roi_batch_id_data[n]; + + // location of roi on feature map + T roi_start_w = + static_cast(round(offset_bottom_rois[0])) * spatial_scale - 0.5; + T roi_start_h = + static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_end_w = + static_cast(round(offset_bottom_rois[2]) + 1.) * spatial_scale - 0.5; + T roi_end_h = + static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + + // width and height of roi + T roi_width = max(roi_end_w - roi_start_w, 0.1); // avoid 0 + T roi_height = max(roi_end_h - roi_start_h, 0.1); + + // width and height of each bin + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + // sampling interval ineach bin + T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + // obtain offset of roi + int part_h = floor(static_cast(ph) / pooled_height * part_height); + int part_w = floor(static_cast(pw) / pooled_width * part_width); + int class_id = ctop / channels_each_class; + + T trans_x = + no_trans + ? static_cast(0) + : bottom_trans[(((n * num_classes + class_id) * 2) * part_height + + part_h) * + part_width + + part_w] * + static_cast(trans_std); + T trans_y = no_trans + ? static_cast(0) + : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * + part_height + + part_h) * + part_width + + part_w] * + static_cast(trans_std); + + // location of start after adding offset + T wstart = static_cast(pw) * bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = static_cast(ph) * bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + T sum = 0; + int count = 0; + int gw = floor(static_cast(pw) * group_width / pooled_width); + int gh = floor(static_cast(ph) * group_height / pooled_height); + gw = min(max(gw, 0), group_width - 1); + gh = min(max(gh, 0), group_height - 1); + const T* offset_bottom_data = + bottom_data + (roi_batch_ind * channels) * height * width; + + // sampling in each bin + for (int ih = 0; ih < sample_per_part; ih++) { + for (int iw = 0; iw < sample_per_part; iw++) { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop * group_height + gh) * group_width + gw; + // bilinear interpolation + T val = bilinear_interpolation(offset_bottom_data + c * height * width, + w, h, width, height); + sum += val; + count++; + } + } + top_data[index] = count == 0 ? static_cast(0) : sum / count; + top_count[index] = count; + } +} + +template +class DeformablePSROIPoolCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor* input = ctx.Input("Input"); + const LoDTensor* rois = ctx.Input("ROIs"); + const Tensor* trans = ctx.Input("Trans"); + Tensor* out = ctx.Output("Output"); + out->mutable_data(ctx.GetPlace()); + Tensor* top_count = ctx.Output("TopCount"); + top_count->mutable_data(ctx.GetPlace()); + + auto no_trans = ctx.Attr("no_trans"); + auto spatial_scale = ctx.Attr("spatial_scale"); + auto output_dim = ctx.Attr("output_dim"); + auto group_size = ctx.Attr>("group_size"); + auto group_height = group_size[0]; + auto group_width = group_size[1]; + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto part_size = ctx.Attr>("part_size"); + auto part_height = part_size[0]; + auto part_width = part_size[1]; + auto sample_per_part = ctx.Attr("sample_per_part"); + auto trans_std = ctx.Attr("trans_std"); + + const int batch = static_cast(input->dims()[0]); + const int channels = static_cast(input->dims()[1]); + const int height = static_cast(input->dims()[2]); + const int width = static_cast(input->dims()[3]); + const int channels_trans = no_trans ? 2 : trans->dims()[1]; + const int num_rois = rois->dims()[0]; + PADDLE_ENFORCE_EQ(num_rois, out->dims()[0], + "number of rois should be same with number of output"); + const int count = num_rois * output_dim * pooled_height * pooled_width; + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = + no_trans ? output_dim : output_dim / num_classes; + PADDLE_ENFORCE(channels_each_class >= 1, + "channels_each must greater than 1"); + + const T* bottom_data = input->data(); + const T* bottom_rois = rois->data(); + const T* bottom_trans = no_trans ? NULL : trans->data(); + + framework::Tensor roi_batch_id_list; + roi_batch_id_list.Resize({num_rois}); + auto cplace = platform::CPUPlace(); + int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); + auto rois_lod = rois->lod().back(); + int rois_batch_size = rois_lod.size() - 1; + PADDLE_ENFORCE_EQ( + rois_batch_size, batch, + "The rois_batch_size and imgs batch_size must be the same."); + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_ENFORCE_EQ(num_rois, rois_num_with_lod, + "The rois_num from input and lod must be the same."); + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + roi_batch_id_data[i] = n; + } + } + + auto& dev_ctx = ctx.cuda_device_context(); + auto& allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + int bytes = roi_batch_id_list.numel() * sizeof(int); + auto roi_ptr = allocator.Allocate(bytes); + int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); + const auto gplace = boost::get(ctx.GetPlace()); + memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, + dev_ctx.stream()); + + T* top_data = out->mutable_data(ctx.GetPlace()); + T* top_count_data = top_count->mutable_data(ctx.GetPlace()); + + DeformablePSROIPoolForwardKernel<<>>( + count, bottom_data, (T)spatial_scale, channels, height, width, + pooled_height, pooled_width, bottom_rois, bottom_trans, no_trans, + (T)trans_std, sample_per_part, output_dim, group_height, group_width, + part_height, part_width, num_classes, channels_each_class, top_data, + top_count_data, roi_id_data); + } +}; + +template +__global__ void DeformablePSROIPoolBackwardAccKernel( + const int count, const T* top_diff, const T* top_count, const int num_rois, + const T spatial_scale, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int output_dim, T* bottom_data_diff, T* bottom_trans_diff, + const T* bottom_data, const T* bottom_rois, const T* bottom_trans, + const bool no_trans, const T trans_std, const int sample_per_part, + const int group_height, const int group_width, const int part_height, + const int part_width, const int num_classes, const int channels_each_class, + int* roi_batch_id_data) { + CUDA_KERNEL_LOOP(index, count) { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + int num_box = count / pooled_height / pooled_width / output_dim; + const T* offset_bottom_rois = bottom_rois + n * 4; + int roi_batch_ind = roi_batch_id_data[n]; + + // location of roi on feature map + T roi_start_w = + static_cast(round(offset_bottom_rois[0])) * spatial_scale - 0.5; + T roi_start_h = + static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_end_w = + static_cast(round(offset_bottom_rois[2]) + 1.) * spatial_scale - 0.5; + T roi_end_h = + static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + + // width and height of roi + T roi_width = max(roi_end_w - roi_start_w, 0.1); + T roi_height = max(roi_end_h - roi_start_h, 0.1); + + // width and height of each bin + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + // sampling interval in each bin + T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + // obtain offset of roi + int part_h = floor(static_cast(ph) / pooled_height * part_height); + int part_w = floor(static_cast(pw) / pooled_width * part_width); + int class_id = ctop / channels_each_class; + + T trans_x = + no_trans + ? static_cast(0) + : bottom_trans[(((n * num_classes + class_id) * 2) * part_height + + part_h) * + part_width + + part_w] * + static_cast(trans_std); + T trans_y = no_trans + ? static_cast(0) + : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * + part_height + + part_h) * + part_width + + part_w] * + static_cast(trans_std); + // location of start after adding offset + T wstart = static_cast(pw) * bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = static_cast(ph) * bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + if (top_count[index] <= 0) { + continue; + } + + T diff_val = top_diff[index] / top_count[index]; + const T* offset_bottom_data = + bottom_data + roi_batch_ind * channels * height * width; + int gw = floor(static_cast(pw) * group_width / pooled_width); + int gh = floor(static_cast(ph) * group_height / pooled_height); + gw = min(max(gw, 0), group_width - 1); + gh = min(max(gh, 0), group_height - 1); + + // sampling in each bin + for (int ih = 0; ih < sample_per_part; ih++) { + for (int iw = 0; iw < sample_per_part; iw++) { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop * group_height + gh) * group_width + gw; + int x0 = floor(w); + int x1 = ceil(w); + int y0 = floor(h); + int y1 = ceil(h); + + // compute coefficient of gradient + T dist_x = w - x0, dist_y = h - y0; + T q00 = (1 - dist_x) * (1 - dist_y); + T q01 = (1 - dist_x) * dist_y; + T q10 = dist_x * (1 - dist_y); + T q11 = dist_x * dist_y; + int bottom_index_base = c * height * width; + + // compute gradient of input + if (bottom_data_diff) { + platform::CudaAtomicAdd( + bottom_data_diff + roi_batch_ind * channels * height * width + + bottom_index_base + y0 * width + x0, + q00 * diff_val); + platform::CudaAtomicAdd( + bottom_data_diff + roi_batch_ind * channels * height * width + + bottom_index_base + y1 * width + x0, + q01 * diff_val); + platform::CudaAtomicAdd( + bottom_data_diff + roi_batch_ind * channels * height * width + + bottom_index_base + y0 * width + x1, + q10 * diff_val); + platform::CudaAtomicAdd( + bottom_data_diff + roi_batch_ind * channels * height * width + + bottom_index_base + y1 * width + x1, + q11 * diff_val); + } + + // compute gradient of trans + if (no_trans || bottom_trans_diff == NULL) { + continue; + } + + T u00 = offset_bottom_data[bottom_index_base + y0 * width + x0]; + T u01 = offset_bottom_data[bottom_index_base + y1 * width + x0]; + T u10 = offset_bottom_data[bottom_index_base + y0 * width + x1]; + T u11 = offset_bottom_data[bottom_index_base + y1 * width + x1]; + T diff_x = (u11 * dist_y + u10 * (1 - dist_y) - u01 * dist_y - + u00 * (1 - dist_y)) * + trans_std * diff_val; + diff_x *= roi_width; + T diff_y = (u11 * dist_x + u01 * (1 - dist_x) - u10 * dist_x - + u00 * (1 - dist_x)) * + trans_std * diff_val; + diff_y *= roi_height; + platform::CudaAtomicAdd( + bottom_trans_diff + + (((n * num_classes + class_id) * 2) * part_height + part_h) * + part_width + + part_w, + diff_x); + platform::CudaAtomicAdd( + bottom_trans_diff + + (((n * num_classes + class_id) * 2 + 1) * part_height + + part_h) * + part_width + + part_w, + diff_y); + } + } + } +} + +template +class DeformablePSROIPoolGradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor* input = ctx.Input("Input"); + const LoDTensor* rois = ctx.Input("ROIs"); + const Tensor* trans = ctx.Input("Trans"); + const Tensor* top_count = ctx.Input("TopCount"); + const Tensor* output_grad = + ctx.Input(framework::GradVarName("Output")); + Tensor* input_grad = ctx.Output(framework::GradVarName("Input")); + Tensor* trans_grad = ctx.Output(framework::GradVarName("Trans")); + + math::SetConstant set_zero; + auto& dev_ctx = ctx.cuda_device_context(); + if (input_grad) { + input_grad->mutable_data(ctx.GetPlace()); + set_zero(dev_ctx, input_grad, static_cast(0)); + } + if (trans_grad) { + trans_grad->mutable_data(ctx.GetPlace()); + set_zero(dev_ctx, trans_grad, static_cast(0)); + } + + auto no_trans = ctx.Attr("no_trans"); + auto spatial_scale = ctx.Attr("spatial_scale"); + auto output_dim = ctx.Attr("output_dim"); + auto group_size = ctx.Attr>("group_size"); + auto group_height = group_size[0]; + auto group_width = group_size[1]; + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto part_size = ctx.Attr>("part_size"); + auto part_height = part_size[0]; + auto part_width = part_size[1]; + auto sample_per_part = ctx.Attr("sample_per_part"); + auto trans_std = ctx.Attr("trans_std"); + + const int batch = static_cast(input->dims()[0]); + const int channels = static_cast(input->dims()[1]); + const int height = static_cast(input->dims()[2]); + const int width = static_cast(input->dims()[3]); + const int channels_trans = no_trans ? 2 : trans->dims()[1]; + const int num_rois = rois->dims()[0]; + const int count = num_rois * output_dim * pooled_height * pooled_width; + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = + no_trans ? output_dim : output_dim / num_classes; + + const T* top_diff = output_grad->data(); + const T* bottom_data = input->data(); + const T* bottom_rois = rois->data(); + const T* bottom_trans = no_trans ? NULL : trans->data(); + + T* bottom_data_diff = NULL; + T* bottom_trans_diff = NULL; + if (input_grad) { + bottom_data_diff = input_grad->mutable_data(ctx.GetPlace()); + } + if (trans_grad) { + bottom_trans_diff = + no_trans ? NULL : trans_grad->mutable_data(ctx.GetPlace()); + } + + const T* top_count_data = top_count->data(); + framework::Tensor roi_batch_id_list; + roi_batch_id_list.Resize({num_rois}); + auto cplace = platform::CPUPlace(); + int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); + auto rois_lod = rois->lod().back(); + int rois_batch_size = rois_lod.size() - 1; + PADDLE_ENFORCE_EQ( + rois_batch_size, batch, + "The rois_batch_size and imgs batch_size must be the same."); + + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_ENFORCE_EQ(num_rois, rois_num_with_lod, + "The rois_num from input and lod must be the same."); + + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + roi_batch_id_data[i] = n; + } + } + + auto& allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + int bytes = roi_batch_id_list.numel() * sizeof(int); + auto roi_ptr = allocator.Allocate(bytes); + int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); + const auto gplace = boost::get(ctx.GetPlace()); + memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, + dev_ctx.stream()); + + DeformablePSROIPoolBackwardAccKernel<<>>( + count, top_diff, top_count_data, num_rois, (T)spatial_scale, channels, + height, width, pooled_height, pooled_width, output_dim, + bottom_data_diff, bottom_trans_diff, bottom_data, bottom_rois, + bottom_trans, no_trans, (T)trans_std, sample_per_part, group_height, + group_width, part_height, part_width, num_classes, channels_each_class, + roi_id_data); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CUDA = paddle::platform::CUDADeviceContext; +REGISTER_OP_CUDA_KERNEL(deformable_psroi_pooling, + ops::DeformablePSROIPoolCUDAKernel, + ops::DeformablePSROIPoolCUDAKernel); +REGISTER_OP_CUDA_KERNEL(deformable_psroi_pooling_grad, + ops::DeformablePSROIPoolGradCUDAKernel, + ops::DeformablePSROIPoolGradCUDAKernel); diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.h b/paddle/fluid/operators/deformable_psroi_pooling_op.h new file mode 100644 index 00000000..22df51c6 --- /dev/null +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.h @@ -0,0 +1,488 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Part of the following code in this file refs to +// https://github.com/msracver/Deformable-ConvNets/blob/master/faster_rcnn/operator_cxx/deformable_psroi_pooling.cu +// +// Copyright (c) 2017 Microsoft +// Licensed under The Apache-2.0 License [see LICENSE for details] +// \file deformable_psroi_pooling.cu +// \brief +// \author Yi Li, Guodong Zhang, Jifeng Dai + +#pragma once +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +T bilinear_interp(const T* data, const T x, const T y, const int width, + const int height) { + int x1 = floor(x); + int x2 = ceil(x); + int y1 = floor(y); + int y2 = ceil(y); + T dist_x = static_cast(x - x1); + T dist_y = static_cast(y - y1); + T value11 = data[y1 * width + x1]; + T value12 = data[y2 * width + x1]; + T value21 = data[y1 * width + x2]; + T value22 = data[y2 * width + x2]; + T value = (1 - dist_x) * (1 - dist_y) * value11 + + (1 - dist_x) * dist_y * value12 + dist_x * (1 - dist_y) * value21 + + dist_x * dist_y * value22; + return value; +} + +template +void DeformablePSROIPoolForwardCPUKernel( + const int count, const T* bottom_data, const T spatial_scale, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const T* bottom_rois, + const T* bottom_trans, const bool no_trans, const float trans_std, + const int sample_per_part, const int output_dim, const int group_height, + const int group_width, const int part_height, const int part_width, + const int num_classes, const int channels_each_class, T* top_data, + T* top_count, const int batch_size, int* roi_batch_id_data, + const LoDTensor* rois) { + for (int ix = 0; ix < count; ix++) { + int pw = ix % pooled_width; + int ph = (ix / pooled_width) % pooled_height; + int ctop = (ix / pooled_width / pooled_height) % output_dim; + int n = ix / pooled_width / pooled_height / output_dim; + const T* offset_bottom_rois = bottom_rois + n * 4; + + int roi_batch_ind = roi_batch_id_data[n]; + T roi_start_w = + static_cast(round(offset_bottom_rois[0])) * spatial_scale - 0.5; + T roi_start_h = + static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_end_w = + static_cast(round(offset_bottom_rois[2]) + 1.) * spatial_scale - 0.5; + T roi_end_h = + static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + + // width and height of roi + T roi_width = std::max(roi_end_w - roi_start_w, T(0.1)); + T roi_height = std::max(roi_end_h - roi_start_h, T(0.1)); + + // width and height of each bin + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + // sampling interval in each bin + T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + // obtain offset of roi + int part_h = floor(static_cast(ph) / pooled_height * part_height); + int part_w = floor(static_cast(pw) / pooled_width * part_width); + int class_id = ctop / channels_each_class; + + T trans_x = + no_trans + ? static_cast(0) + : bottom_trans[(((n * num_classes + class_id) * 2) * part_height + + part_h) * + part_width + + part_w] * + static_cast(trans_std); + T trans_y = no_trans + ? static_cast(0) + : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * + part_height + + part_h) * + part_width + + part_w] * + static_cast(trans_std); + + // location of start after adding offset + T wstart = static_cast(pw) * bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = static_cast(ph) * bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + T sum = 0; + int num_sample = 0; + int gw = floor(static_cast(pw) * group_width / pooled_width); + int gh = floor(static_cast(ph) * group_height / pooled_height); + gw = std::min(std::max(gw, 0), group_width - 1); + gh = std::min(std::max(gh, 0), group_height - 1); + const T* offset_bottom_data = + bottom_data + (roi_batch_ind * channels) * height * width; + + // sampling in each bin + for (int ih = 0; ih < sample_per_part; ih++) { + for (int iw = 0; iw < sample_per_part; iw++) { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) { + continue; + } + w = std::min(std::max(w, T(0.)), T(width - 1.)); + h = std::min(std::max(h, T(0.)), height - T(1.)); + int c = (ctop * group_height + gh) * group_width + gw; + // bilinear interpolation to get value + T val = bilinear_interp(offset_bottom_data + c * height * width, w, h, + width, height); + sum += val; + num_sample++; + } + } + top_data[ix] = num_sample == 0 ? static_cast(0) : sum / num_sample; + top_count[ix] = num_sample; + } +} + +template +class DeformablePSROIPoolCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* rois = ctx.Input("ROIs"); + auto* trans = ctx.Input("Trans"); + auto* out = ctx.Output("Output"); + out->mutable_data(ctx.GetPlace()); + auto* top_count = ctx.Output("TopCount"); + top_count->mutable_data(ctx.GetPlace()); + + math::SetConstant set_zero; + auto& dev_ctx = ctx.template device_context(); + set_zero(dev_ctx, out, static_cast(0)); + set_zero(dev_ctx, top_count, static_cast(0)); + + const int num_rois = rois->dims()[0]; + PADDLE_ENFORCE_EQ(num_rois, out->dims()[0], + "number of rois should be same with number of output"); + + framework::Tensor roi_batch_id_list; + roi_batch_id_list.Resize({num_rois}); + int* roi_batch_id_data = + roi_batch_id_list.mutable_data(ctx.GetPlace()); + auto no_trans = ctx.Attr("no_trans"); + auto spatial_scale = ctx.Attr("spatial_scale"); + auto output_dim = ctx.Attr("output_dim"); + auto group_size = ctx.Attr>("group_size"); + auto group_height = group_size[0]; + auto group_width = group_size[1]; + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto part_size = ctx.Attr>("part_size"); + auto part_height = part_size[0]; + auto part_width = part_size[1]; + auto sample_per_part = ctx.Attr("sample_per_part"); + auto trans_std = ctx.Attr("trans_std"); + + int batch = static_cast(input->dims()[0]); + int channels = static_cast(input->dims()[1]); + int height = static_cast(input->dims()[2]); + int width = static_cast(input->dims()[3]); + int channels_trans = no_trans ? 2 : trans->dims()[1]; + auto count = num_rois * output_dim * pooled_height * pooled_width; + auto num_classes = no_trans ? 1 : channels_trans / 2; + auto channels_each_class = no_trans ? output_dim : output_dim / num_classes; + PADDLE_ENFORCE(channels_each_class >= 1, + "channels_each must greater than 1"); + + const T* bottom_data = input->data(); + const T* bottom_rois = rois->data(); + const T* bottom_trans = no_trans ? NULL : trans->data(); + + T* top_data = out->mutable_data(ctx.GetPlace()); + T* top_count_data = top_count->mutable_data(ctx.GetPlace()); + + auto rois_lod = rois->lod().back(); + int rois_batch_size = rois_lod.size() - 1; + PADDLE_ENFORCE_EQ(rois_batch_size, batch, + "The rois_batch_size must equal to batch_size of img."); + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_ENFORCE_EQ(num_rois, rois_num_with_lod, + "The rois_num from input and lod must be the same."); + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + roi_batch_id_data[i] = n; + } + } + + DeformablePSROIPoolForwardCPUKernel( + count, bottom_data, (T)spatial_scale, channels, height, width, + pooled_height, pooled_width, bottom_rois, bottom_trans, no_trans, + trans_std, sample_per_part, output_dim, group_height, group_width, + part_height, part_width, num_classes, channels_each_class, top_data, + top_count_data, batch, roi_batch_id_data, rois); + } +}; + +template +void DeformablePSROIPoolBackwardAccCPUKernel( + const int count, const T* top_diff, const T* top_count, const int num_rois, + const T spatial_scale, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int output_dim, T* bottom_data_diff, T* bottom_trans_diff, + const T* bottom_data, const T* bottom_rois, const T* bottom_trans, + const bool no_trans, const float trans_std, const int sample_per_part, + const int group_height, const int group_width, const int part_height, + const int part_width, const int num_classes, const int channels_each_class, + const int batch_size, int* roi_batch_id_data, const LoDTensor* rois) { + for (int index = 0; index < count; index++) { + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // location of roi on feature map + const T* offset_bottom_rois = bottom_rois + n * 4; + int roi_batch_ind = roi_batch_id_data[n]; + T roi_start_w = + static_cast(round(offset_bottom_rois[0])) * spatial_scale - 0.5; + T roi_start_h = + static_cast(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_end_w = + static_cast(round(offset_bottom_rois[2]) + 1.) * spatial_scale - 0.5; + T roi_end_h = + static_cast(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + + // width and height of roi + T roi_width = std::max(roi_end_w - roi_start_w, T(0.1)); + T roi_height = std::max(roi_end_h - roi_start_h, T(0.1)); + + // width and height of each bin + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + // sampling interval in each bin + T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + // obtain offset of roi + int part_h = floor(static_cast(ph) / pooled_height * part_height); + int part_w = floor(static_cast(pw) / pooled_width * part_height); + int class_id = ctop / channels_each_class; + + T trans_x = + no_trans + ? static_cast(0) + : bottom_trans[(((n * num_classes + class_id) * 2) * part_height + + part_h) * + part_width + + part_w] * + static_cast(trans_std); + T trans_y = no_trans + ? static_cast(0) + : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * + part_height + + part_h) * + part_width + + part_w] * + static_cast(trans_std); + + // location of start after adding offset + T wstart = static_cast(pw) * bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = static_cast(ph) * bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + if (top_count[index] <= 0) { + continue; + } + + T diff_val = top_diff[index] / top_count[index]; + const T* offset_bottom_data = + bottom_data + roi_batch_ind * channels * height * width; + int gw = floor(static_cast(pw) * group_width / pooled_width); + int gh = floor(static_cast(ph) * group_height / pooled_height); + gw = std::min(std::max(gw, 0), group_width - 1); + gh = std::min(std::max(gh, 0), group_height - 1); + + // sampling in each bin + for (int ih = 0; ih < sample_per_part; ih++) { + for (int iw = 0; iw < sample_per_part; iw++) { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) { + continue; + } + w = std::min(std::max(w, T(0.)), T(width - 1.)); + h = std::min(std::max(h, T(0.)), T(height - 1.)); + int c = (ctop * group_height + gh) * group_width + gw; + int x0 = floor(w); + int x1 = ceil(w); + int y0 = floor(h); + int y1 = ceil(h); + + // compute coefficient of gradient + T dist_x = w - x0, dist_y = h - y0; + T q00 = (1 - dist_x) * (1 - dist_y); + T q01 = (1 - dist_x) * dist_y; + T q10 = dist_x * (1 - dist_y); + T q11 = dist_x * dist_y; + int bottom_index_base = c * height * width; + + // compute gradient of input + if (bottom_data_diff != NULL) { + T* offset_bottom_data_diff_addr00 = + bottom_data_diff + roi_batch_ind * channels * height * width + + bottom_index_base + y0 * width + x0; + T* offset_bottom_data_diff_addr01 = + bottom_data_diff + roi_batch_ind * channels * height * width + + bottom_index_base + y1 * width + x0; + T* offset_bottom_data_diff_addr10 = + bottom_data_diff + roi_batch_ind * channels * height * width + + bottom_index_base + y0 * width + x1; + T* offset_bottom_data_diff_addr11 = + bottom_data_diff + roi_batch_ind * channels * height * width + + bottom_index_base + y1 * width + x1; + *offset_bottom_data_diff_addr00 = + *offset_bottom_data_diff_addr00 + q00 * diff_val; + *offset_bottom_data_diff_addr01 = + *offset_bottom_data_diff_addr01 + q01 * diff_val; + *offset_bottom_data_diff_addr10 = + *offset_bottom_data_diff_addr10 + q10 * diff_val; + *offset_bottom_data_diff_addr11 = + *offset_bottom_data_diff_addr11 + q11 * diff_val; + } + + // compute gradient of trans + if (no_trans || bottom_trans_diff == NULL) { + continue; + } + + T u00 = offset_bottom_data[bottom_index_base + y0 * width + x0]; + T u01 = offset_bottom_data[bottom_index_base + y1 * width + x0]; + T u10 = offset_bottom_data[bottom_index_base + y0 * width + x1]; + T u11 = offset_bottom_data[bottom_index_base + y1 * width + x1]; + + T diff_x = (u11 * dist_y + u10 * (1 - dist_y) - u01 * dist_y - + u00 * (1 - dist_y)) * + trans_std * diff_val; + diff_x *= roi_width; + T diff_y = (u11 * dist_x + u01 * (1 - dist_x) - u10 * dist_x - + u00 * (1 - dist_x)) * + trans_std * diff_val; + diff_y *= roi_height; + T* offset_bottom_trans_diff_x = + bottom_trans_diff + + (((n * num_classes + class_id) * 2) * part_height + part_h) * + part_width + + part_w; + T* offset_bottom_trans_diff_y = + bottom_trans_diff + + (((n * num_classes + class_id) * 2 + 1) * part_height + part_h) * + part_width + + part_w; + + *offset_bottom_trans_diff_x = *offset_bottom_trans_diff_x + diff_x; + *offset_bottom_trans_diff_y = *offset_bottom_trans_diff_y + diff_y; + } + } + } +} + +template +class DeformablePSROIPoolGradCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* rois = ctx.Input("ROIs"); + auto* trans = ctx.Input("Trans"); + auto* top_count = ctx.Input("TopCount"); + auto* output_grad = ctx.Input(framework::GradVarName("Output")); + auto* input_grad = ctx.Output(framework::GradVarName("Input")); + math::SetConstant set_zero; + auto& dev_ctx = ctx.template device_context(); + if (input_grad) { + input_grad->mutable_data(ctx.GetPlace()); + set_zero(dev_ctx, input_grad, static_cast(.0)); + } + auto* trans_grad = ctx.Output(framework::GradVarName("Trans")); + if (trans_grad) { + trans_grad->mutable_data(ctx.GetPlace()); + set_zero(dev_ctx, trans_grad, static_cast(.0)); + } + auto no_trans = ctx.Attr("no_trans"); + auto spatial_scale = ctx.Attr("spatial_scale"); + auto output_dim = ctx.Attr("output_dim"); + auto group_size = ctx.Attr>("group_size"); + auto group_height = group_size[0]; + auto group_width = group_size[1]; + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto part_size = ctx.Attr>("part_size"); + auto part_height = part_size[0]; + auto part_width = part_size[1]; + auto sample_per_part = ctx.Attr("sample_per_part"); + auto trans_std = ctx.Attr("trans_std"); + + const int batch = static_cast(input->dims()[0]); + const int channels = static_cast(input->dims()[1]); + const int height = static_cast(input->dims()[2]); + const int width = static_cast(input->dims()[3]); + const int channels_trans = no_trans ? 2 : trans->dims()[1]; + const int num_rois = rois->dims()[0]; + const int count = num_rois * output_dim * pooled_height * pooled_width; + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = + no_trans ? output_dim : output_dim / num_classes; + Tensor roi_batch_id_list; + roi_batch_id_list.Resize({num_rois}); + int* roi_batch_id_data = + roi_batch_id_list.mutable_data(ctx.GetPlace()); + + const T* top_diff = output_grad->data(); + const T* bottom_data = input->data(); + const T* bottom_rois = rois->data(); + const T* bottom_trans = no_trans ? NULL : trans->data(); + + T* bottom_data_diff = NULL; + T* bottom_trans_diff = NULL; + if (input_grad) { + bottom_data_diff = input_grad->mutable_data(ctx.GetPlace()); + } + if (trans_grad) { + bottom_trans_diff = + no_trans ? NULL : trans_grad->mutable_data(ctx.GetPlace()); + } + + const T* top_count_data = top_count->data(); + auto rois_lod = rois->lod().back(); + int rois_batch_size = rois_lod.size() - 1; + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_ENFORCE_EQ(num_rois, rois_num_with_lod, + "The rois_num from input and lod must be the same."); + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + roi_batch_id_data[i] = n; + } + } + + DeformablePSROIPoolBackwardAccCPUKernel( + count, top_diff, top_count_data, num_rois, (T)spatial_scale, channels, + height, width, pooled_height, pooled_width, output_dim, + bottom_data_diff, bottom_trans_diff, bottom_data, bottom_rois, + bottom_trans, no_trans, (T)trans_std, sample_per_part, group_height, + group_width, part_height, part_width, num_classes, channels_each_class, + batch, roi_batch_id_data, rois); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/delete_var_op.cc b/paddle/fluid/operators/delete_var_op.cc new file mode 100644 index 00000000..89416f7a --- /dev/null +++ b/paddle/fluid/operators/delete_var_op.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { +class DeleteVarOp : public framework::OperatorBase { + public: + DeleteVarOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + dev_ctx.Wait(); + + auto delete_var_names = Inputs("X"); + const_cast(scope).EraseVars(delete_var_names); + } +}; + +class DeleteVarOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override {} +}; + +class DeleteVarOpInfoMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input of delete op").AsDuplicable(); + AddComment(R"DOC( +Delete Operator. +It should not be configured by users directly. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(delete_var, paddle::operators::DeleteVarOp, + paddle::framework::EmptyGradOpMaker, + paddle::operators::DeleteVarOpInfoMaker, + paddle::operators::DeleteVarOpShapeInference); diff --git a/paddle/fluid/operators/dequantize_op.cc b/paddle/fluid/operators/dequantize_op.cc new file mode 100644 index 00000000..38159f84 --- /dev/null +++ b/paddle/fluid/operators/dequantize_op.cc @@ -0,0 +1,45 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/dequantize_op.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + +namespace paddle { +namespace operators { + +framework::OpKernelType DeQuantOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + framework::LibraryType library_ = framework::LibraryType::kMKLDNN; + framework::DataLayout layout_ = framework::DataLayout::kMKLDNN; + + return framework::OpKernelType(ctx.Input("Input")->type(), + ctx.GetPlace(), layout_, library_); +} + +void DeQuantOpMaker::Make() { + AddInput("Input", "input data"); + AddOutput("Output", "output data"); + AddAttr("Scale", "scale data").SetDefault({1.0f}); + AddComment(R"DOC(This op will dequantize data from INT8 to FP32)DOC"); +} + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(dequantize, ops::DeQuantOp, ops::DeQuantOpMaker, + paddle::framework::DefaultGradOpDescMaker); diff --git a/paddle/fluid/operators/dequantize_op.h b/paddle/fluid/operators/dequantize_op.h new file mode 100644 index 00000000..75c27a06 --- /dev/null +++ b/paddle/fluid/operators/dequantize_op.h @@ -0,0 +1,54 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using framework::OpKernelType; +using framework::Tensor; + +class DeQuantOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + ctx->SetOutputDim("Output", ctx->GetInputDim("Input")); + ctx->ShareLoD("Input", /*->*/ "Output"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class DeQuantOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; + +class DeQuantGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override {} +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detail/safe_ref.h b/paddle/fluid/operators/detail/safe_ref.h new file mode 100644 index 00000000..8660bc21 --- /dev/null +++ b/paddle/fluid/operators/detail/safe_ref.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace detail { +/** + * Get Reference From Pointer with check. The error message is printf format, + * and passed by `args` + */ +template +inline T& Ref(T* ptr, ARGS&&... args) { + PADDLE_ENFORCE(ptr != nullptr, ::paddle::string::Sprintf(args...)); + return *ptr; +} + +template +inline std::vector> VectorRef( + const std::vector& vec, ARGS&&... args) { + std::vector> result; + result.reserve(vec.size()); + for (auto* ptr : vec) { + result.emplace_back(Ref(ptr, args...)); + } + return result; +} + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detail/strided_memcpy.h b/paddle/fluid/operators/detail/strided_memcpy.h new file mode 100644 index 00000000..94419d1f --- /dev/null +++ b/paddle/fluid/operators/detail/strided_memcpy.h @@ -0,0 +1,114 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace detail { + +template +struct StridedMemcpyFunctor; + +template +struct StridedMemcpyFunctor { + void operator()(const platform::DeviceContext& dev_ctx, const T* src, + const int64_t* src_stride, const int64_t* dst_dim, + const int64_t* dst_stride, T* dst) const { + auto place = dev_ctx.GetPlace(); + if (platform::is_cpu_place(place)) { + auto& cpu_place = boost::get(place); + memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T)); + } else { +#ifdef PADDLE_WITH_CUDA + auto& gpu_place = boost::get(place); + auto& cuda_ctx = + reinterpret_cast(dev_ctx); + memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T), + cuda_ctx.stream()); +#else + PADDLE_THROW("Paddle is not compiled with GPU"); +#endif + } + } +}; + +template +struct StridedMemcpyFunctor { + void operator()(const platform::DeviceContext& dev_ctx, const T* src, + const int64_t* src_stride, const int64_t* dst_dim, + const int64_t* dst_stride, T* dst) const { + auto place = dev_ctx.GetPlace(); + if (platform::is_cpu_place(place)) { + auto& cpu_place = boost::get(place); + memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim[0]); + } else { +#ifdef PADDLE_WITH_CUDA + auto& gpu_place = boost::get(place); + auto& cuda_ctx = + reinterpret_cast(dev_ctx); + memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim[0], + cuda_ctx.stream()); +#else + PADDLE_THROW("Paddle is not compiled with GPU"); +#endif + } + } +}; + +template +struct StridedMemcpyFunctor { + void operator()(const platform::DeviceContext& dev_ctx, const T* src, + const int64_t* src_stride, const int64_t* dst_dim, + const int64_t* dst_stride, T* dst) const { + for (int64_t i = 0; i < dst_dim[0]; ++i) { + StridedMemcpyFunctor func; + func(dev_ctx, src, src_stride + 1, dst_dim + 1, dst_stride + 1, dst); + src += src_stride[0]; + dst += dst_stride[0]; + } + } +}; + +template +struct StridedCopyDimVisitor { + StridedCopyDimVisitor(const platform::DeviceContext& dev_ctx, const T* src, + const framework::DDim& src_stride, + const framework::DDim& dst_stride, T* dst) + : dev_ctx_(dev_ctx), + src_(src), + src_stride_(src_stride), + dst_stride_(dst_stride), + dst_(dst) {} + + template + void operator()(const framework::Dim& dst_dim) const { + StridedMemcpyFunctor functor; + functor(dev_ctx_, src_, src_stride_.Get(), dst_dim.Get(), dst_stride_.Get(), + dst_); + } + + const platform::DeviceContext& dev_ctx_; + const T* src_; + const framework::DDim& src_stride_; + const framework::DDim& dst_stride_; + T* dst_; +}; + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt new file mode 100644 index 00000000..f1c504d6 --- /dev/null +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -0,0 +1,61 @@ +set(LOCAL_DETECTION_LIBS) + +function(detection_library TARGET_NAME) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + set(options "") + set(common_deps op_registry) + set(pybind_flag 0) + cmake_parse_arguments(detection_library "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + op_library(${TARGET_NAME} SRCS ${detection_library_SRCS} DEPS ${common_deps} ${detection_library_DEPS}) + set(LOCAL_DETECTION_LIBS + ${TARGET_NAME} + ${LOCAL_DETECTION_LIBS} + PARENT_SCOPE) +endfunction() + +detection_library(bipartite_match_op SRCS bipartite_match_op.cc) +detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu) +detection_library(iou_similarity_op SRCS iou_similarity_op.cc +iou_similarity_op.cu) +detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc) +detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc poly_util.cc gpc.cc) +detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu) +detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu) +detection_library(anchor_generator_op SRCS anchor_generator_op.cc +anchor_generator_op.cu) +detection_library(target_assign_op SRCS target_assign_op.cc +target_assign_op.cu) +detection_library(polygon_box_transform_op SRCS polygon_box_transform_op.cc +polygon_box_transform_op.cu) +detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc) +detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc) +detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu) +detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc) +detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op.cu) +detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc box_decoder_and_assign_op.cu) +detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc sigmoid_focal_loss_op.cu) +detection_library(retinanet_detection_output_op SRCS retinanet_detection_output_op.cc) + +if(WITH_GPU) + detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS memory cub) + detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc distribute_fpn_proposals_op.cu DEPS memory cub) + detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc collect_fpn_proposals_op.cu DEPS memory cub) +else() + detection_library(generate_proposals_op SRCS generate_proposals_op.cc) + detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc) + detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc) +endif() + +detection_library(roi_perspective_transform_op SRCS roi_perspective_transform_op.cc roi_perspective_transform_op.cu) +#Export local libraries to parent +# set(DETECTION_LIBRARY ${LOCAL_DETECTION_LIBS} PARENT_SCOPE) + +foreach(src ${LOCAL_DETECTION_LIBS}) + set(OP_LIBRARY ${src} ${OP_LIBRARY} CACHE INTERNAL "op libs") +endforeach() + +cc_library(mask_util SRCS mask_util.cc DEPS memory) +cc_test(mask_util_test SRCS mask_util_test.cc DEPS memory mask_util) +detection_library(generate_mask_labels_op SRCS generate_mask_labels_op.cc DEPS mask_util) diff --git a/paddle/fluid/operators/detection/anchor_generator_op.cc b/paddle/fluid/operators/detection/anchor_generator_op.cc new file mode 100644 index 00000000..4a333b55 --- /dev/null +++ b/paddle/fluid/operators/detection/anchor_generator_op.cc @@ -0,0 +1,153 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/anchor_generator_op.h" + +namespace paddle { +namespace operators { + +class AnchorGeneratorOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of AnchorGeneratorOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Anchors"), + "Output(Anchors) of AnchorGeneratorOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("Variances"), + "Output(Variances) of AnchorGeneratorOp should not be null."); + + auto input_dims = ctx->GetInputDim("Input"); + PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW."); + + auto anchor_sizes = ctx->Attrs().Get>("anchor_sizes"); + auto aspect_ratios = ctx->Attrs().Get>("aspect_ratios"); + auto stride = ctx->Attrs().Get>("stride"); + auto variances = ctx->Attrs().Get>("variances"); + + size_t num_anchors = aspect_ratios.size() * anchor_sizes.size(); + + std::vector dim_vec(4); + dim_vec[0] = input_dims[2]; + dim_vec[1] = input_dims[3]; + dim_vec[2] = num_anchors; + dim_vec[3] = 4; + ctx->SetOutputDim("Anchors", framework::make_ddim(dim_vec)); + ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + ctx.Input("Input")->type(), ctx.device_context()); + } +}; + +class AnchorGeneratorOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Input", + "(Tensor, default Tensor), " + "the input feature is a tensor with a rank of 4. " + "The layout is NCHW."); + AddOutput("Anchors", + "(Tensor, default Tensor), the output is a " + "tensor with a rank of 4. The layout is [H, W, num_anchors, 4]. " + "H is the height of input, W is the width of input, num_anchors " + "is the box count of each position. " + "Each anchor is in (xmin, ymin, xmax, ymax) format"); + AddOutput("Variances", + "(Tensor, default Tensor), the expanded variances for " + "normalizing bbox regression targets. The layout is [H, W, " + "num_anchors, 4]. " + "H is the height of input, W is the width of input, num_anchors " + "is the box count of each position. " + "Each variance is in (xcenter, ycenter, w, h) format"); + + AddAttr>( + "anchor_sizes", + "(vector) List of Region Proposal Network(RPN) anchor sizes " + " given in absolute pixels e.g. (64, 128, 256, 512)." + " For instance, the anchor size of 64 means the area of this anchor " + "equals to 64**2.") + .AddCustomChecker([](const std::vector& anchor_sizes) { + PADDLE_ENFORCE_GT(anchor_sizes.size(), 0UL, + "Size of anchor_sizes must be at least 1."); + for (size_t i = 0; i < anchor_sizes.size(); ++i) { + PADDLE_ENFORCE_GT(anchor_sizes[i], 0.0, + "anchor_sizes[%d] must be positive.", i); + } + }); + AddAttr>( + "aspect_ratios", + "(vector) List of Region Proposal Network(RPN) anchor aspect " + "ratios, e.g. (0.5, 1, 2)." + "For instacne, the aspect ratio of 0.5 means the height / width of " + "this anchor equals 0.5."); + + AddAttr>("variances", + "(vector) List of variances to be used " + "in box regression deltas") + .AddCustomChecker([](const std::vector& variances) { + PADDLE_ENFORCE_EQ(variances.size(), 4UL, + "Must and only provide 4 variance."); + for (size_t i = 0; i < variances.size(); ++i) { + PADDLE_ENFORCE_GT(variances[i], 0.0, + "variance[%d] must be greater than 0.", i); + } + }); + + AddAttr>("stride", + "Anchors stride across width and height, " + "with a default of (16, 16)") + .SetDefault(std::vector(2, 16.0)) + .AddCustomChecker([](const std::vector& stride) { + PADDLE_ENFORCE_EQ( + stride.size(), 2UL, + "Must and only provide 2 stride for width and height."); + for (size_t i = 0; i < stride.size(); ++i) { + PADDLE_ENFORCE_GT(stride[i], 0.0, + "stride[%d] should be larger than 0.", i); + } + }); + + AddAttr("offset", + "(float) " + "Anchor center offset, with a default of 0.5") + .SetDefault(0.5); + AddComment(R"DOC( +AnchorGenerator operator +Generates anchors for Faster RCNN, FPN etc. algorithm. +Each position of the input produce N anchors, N = + size(anchor_sizes) * size(aspect_ratios). + +Please get more information from the following papers: +https://arxiv.org/abs/1506.01497. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(anchor_generator, ops::AnchorGeneratorOp, + ops::AnchorGeneratorOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL(anchor_generator, ops::AnchorGeneratorOpKernel, + ops::AnchorGeneratorOpKernel); diff --git a/paddle/fluid/operators/detection/anchor_generator_op.cu b/paddle/fluid/operators/detection/anchor_generator_op.cu new file mode 100644 index 00000000..3cc9bbee --- /dev/null +++ b/paddle/fluid/operators/detection/anchor_generator_op.cu @@ -0,0 +1,132 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/anchor_generator_op.h" + +namespace paddle { +namespace operators { + +template +__global__ void GenAnchors(T* out, const T* aspect_ratios, const int ar_num, + const T* anchor_sizes, const int as_num, + const T* stride, const int sd_num, const int height, + const int width, const T offset) { + int num_anchors = as_num * ar_num; + int box_num = height * width * num_anchors; + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < box_num; + i += blockDim.x * gridDim.x) { + int h_idx = i / (num_anchors * width); + int w_idx = (i / num_anchors) % width; + T stride_width = stride[0]; + T stride_height = stride[1]; + T x_ctr = (w_idx * stride_width) + offset * (stride_width - 1); + T y_ctr = (h_idx * stride_height) + offset * (stride_height - 1); + T area, area_ratios; + T base_w, base_h; + T scale_w, scale_h; + T anchor_width, anchor_height; + int anch_idx = i % num_anchors; + int ar_idx = anch_idx / as_num; + int as_idx = anch_idx % as_num; + T aspect_ratio = aspect_ratios[ar_idx]; + T anchor_size = anchor_sizes[as_idx]; + area = stride_width * stride_height; + area_ratios = area / aspect_ratio; + base_w = round(sqrt(area_ratios)); + base_h = round(base_w * aspect_ratio); + scale_w = anchor_size / stride_width; + scale_h = anchor_size / stride_height; + anchor_width = scale_w * base_w; + anchor_height = scale_h * base_h; + + T xmin = (x_ctr - 0.5 * (anchor_width - 1)); + T ymin = (y_ctr - 0.5 * (anchor_height - 1)); + T xmax = (x_ctr + 0.5 * (anchor_width - 1)); + T ymax = (y_ctr + 0.5 * (anchor_height - 1)); + out[i * 4] = xmin; + out[i * 4 + 1] = ymin; + out[i * 4 + 2] = xmax; + out[i * 4 + 3] = ymax; + } +} + +template +__global__ void SetVariance(T* out, const T* var, const int vnum, + const int num) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; + i += blockDim.x * gridDim.x) { + out[i] = var[i % vnum]; + } +} + +template +class AnchorGeneratorOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* anchors = ctx.Output("Anchors"); + auto* vars = ctx.Output("Variances"); + + auto anchor_sizes = ctx.Attr>("anchor_sizes"); + auto aspect_ratios = ctx.Attr>("aspect_ratios"); + auto stride = ctx.Attr>("stride"); + auto variances = ctx.Attr>("variances"); + + T offset = static_cast(ctx.Attr("offset")); + + auto width = input->dims()[3]; + auto height = input->dims()[2]; + + int num_anchors = aspect_ratios.size() * anchor_sizes.size(); + + int box_num = width * height * num_anchors; + + int block = 512; + int grid = (box_num + block - 1) / block; + + auto stream = + ctx.template device_context().stream(); + + anchors->mutable_data(ctx.GetPlace()); + vars->mutable_data(ctx.GetPlace()); + + framework::Tensor ar; + framework::TensorFromVector(aspect_ratios, ctx.device_context(), &ar); + + framework::Tensor as; + framework::TensorFromVector(anchor_sizes, ctx.device_context(), &as); + + framework::Tensor sd; + framework::TensorFromVector(stride, ctx.device_context(), &sd); + + GenAnchors<<>>( + anchors->data(), ar.data(), aspect_ratios.size(), as.data(), + anchor_sizes.size(), sd.data(), stride.size(), height, width, + offset); + + framework::Tensor v; + framework::TensorFromVector(variances, ctx.device_context(), &v); + grid = (box_num * 4 + block - 1) / block; + SetVariance<<>>(vars->data(), v.data(), + variances.size(), box_num * 4); + } +}; // namespace operators + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(anchor_generator, + ops::AnchorGeneratorOpCUDAKernel, + ops::AnchorGeneratorOpCUDAKernel); diff --git a/paddle/fluid/operators/detection/anchor_generator_op.h b/paddle/fluid/operators/detection/anchor_generator_op.h new file mode 100644 index 00000000..e0e499d7 --- /dev/null +++ b/paddle/fluid/operators/detection/anchor_generator_op.h @@ -0,0 +1,109 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace operators { + +template +class AnchorGeneratorOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* anchors = ctx.Output("Anchors"); + auto* vars = ctx.Output("Variances"); + + auto anchor_sizes = ctx.Attr>("anchor_sizes"); + auto aspect_ratios = ctx.Attr>("aspect_ratios"); + auto stride = ctx.Attr>("stride"); + auto variances = ctx.Attr>("variances"); + + T offset = static_cast(ctx.Attr("offset")); + + auto feature_width = input->dims()[3]; + auto feature_height = input->dims()[2]; + + T stride_width, stride_height; + stride_width = stride[0]; + stride_height = stride[1]; + + int num_anchors = aspect_ratios.size() * anchor_sizes.size(); + + anchors->mutable_data(ctx.GetPlace()); + vars->mutable_data(ctx.GetPlace()); + + auto e_anchors = framework::EigenTensor::From(*anchors); + for (int h_idx = 0; h_idx < feature_height; ++h_idx) { + for (int w_idx = 0; w_idx < feature_width; ++w_idx) { + T x_ctr = (w_idx * stride_width) + offset * (stride_width - 1); + T y_ctr = (h_idx * stride_height) + offset * (stride_height - 1); + T area, area_ratios; + T base_w, base_h; + T scale_w, scale_h; + T anchor_width, anchor_height; + int idx = 0; + for (size_t r = 0; r < aspect_ratios.size(); ++r) { + auto ar = aspect_ratios[r]; + for (size_t s = 0; s < anchor_sizes.size(); ++s) { + auto anchor_size = anchor_sizes[s]; + area = stride_width * stride_height; + area_ratios = area / ar; + base_w = round(sqrt(area_ratios)); + base_h = round(base_w * ar); + scale_w = anchor_size / stride_width; + scale_h = anchor_size / stride_height; + anchor_width = scale_w * base_w; + anchor_height = scale_h * base_h; + e_anchors(h_idx, w_idx, idx, 0) = + (x_ctr - 0.5 * (anchor_width - 1)); + e_anchors(h_idx, w_idx, idx, 1) = + (y_ctr - 0.5 * (anchor_height - 1)); + e_anchors(h_idx, w_idx, idx, 2) = + (x_ctr + 0.5 * (anchor_width - 1)); + e_anchors(h_idx, w_idx, idx, 3) = + (y_ctr + 0.5 * (anchor_height - 1)); + idx++; + } + } + } + } + + framework::Tensor var_t; + var_t.mutable_data( + framework::make_ddim({1, static_cast(variances.size())}), + ctx.GetPlace()); + auto var_et = framework::EigenTensor::From(var_t); + for (size_t i = 0; i < variances.size(); ++i) { + var_et(0, i) = variances[i]; + } + + int anchor_num = feature_height * feature_width * num_anchors; + auto var_dim = vars->dims(); + vars->Resize({anchor_num, static_cast(variances.size())}); + + auto e_vars = framework::EigenMatrix::From(*vars); + e_vars = var_et.broadcast(Eigen::DSizes(anchor_num, 1)); + + vars->Resize(var_dim); + } +}; // namespace operators + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/bbox_util.h b/paddle/fluid/operators/detection/bbox_util.h new file mode 100644 index 00000000..afc39c1d --- /dev/null +++ b/paddle/fluid/operators/detection/bbox_util.h @@ -0,0 +1,153 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace operators { + +struct RangeInitFunctor { + int start; + int delta; + int* out; + HOSTDEVICE void operator()(size_t i) { out[i] = start + i * delta; } +}; + +template +inline HOSTDEVICE T RoIArea(const T* box, bool normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + // If coordinate values are is invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return static_cast(0.); + } else { + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. + return (w + 1) * (h + 1); + } + } +} + +/* + * transform that computes target bounding-box regression deltas + * given proposal boxes and ground-truth boxes. + */ +template +inline void BoxToDelta(const int box_num, const framework::Tensor& ex_boxes, + const framework::Tensor& gt_boxes, const float* weights, + const bool normalized, framework::Tensor* box_delta) { + auto ex_boxes_et = framework::EigenTensor::From(ex_boxes); + auto gt_boxes_et = framework::EigenTensor::From(gt_boxes); + auto trg = framework::EigenTensor::From(*box_delta); + T ex_w, ex_h, ex_ctr_x, ex_ctr_y, gt_w, gt_h, gt_ctr_x, gt_ctr_y; + for (int64_t i = 0; i < box_num; ++i) { + ex_w = ex_boxes_et(i, 2) - ex_boxes_et(i, 0) + (normalized == false); + ex_h = ex_boxes_et(i, 3) - ex_boxes_et(i, 1) + (normalized == false); + ex_ctr_x = ex_boxes_et(i, 0) + 0.5 * ex_w; + ex_ctr_y = ex_boxes_et(i, 1) + 0.5 * ex_h; + + gt_w = gt_boxes_et(i, 2) - gt_boxes_et(i, 0) + (normalized == false); + gt_h = gt_boxes_et(i, 3) - gt_boxes_et(i, 1) + (normalized == false); + gt_ctr_x = gt_boxes_et(i, 0) + 0.5 * gt_w; + gt_ctr_y = gt_boxes_et(i, 1) + 0.5 * gt_h; + + trg(i, 0) = (gt_ctr_x - ex_ctr_x) / ex_w; + trg(i, 1) = (gt_ctr_y - ex_ctr_y) / ex_h; + trg(i, 2) = std::log(gt_w / ex_w); + trg(i, 3) = std::log(gt_h / ex_h); + + if (weights) { + trg(i, 0) = trg(i, 0) / weights[0]; + trg(i, 1) = trg(i, 1) / weights[1]; + trg(i, 2) = trg(i, 2) / weights[2]; + trg(i, 3) = trg(i, 3) / weights[3]; + } + } +} + +template +void Gather(const T* in, const int in_stride, const int* index, const int num, + T* out) { + const int stride_bytes = in_stride * sizeof(T); + for (int i = 0; i < num; ++i) { + int id = index[i]; + memcpy(out + i * in_stride, in + id * in_stride, stride_bytes); + } +} + +template +void BboxOverlaps(const framework::Tensor& r_boxes, + const framework::Tensor& c_boxes, + framework::Tensor* overlaps) { + auto r_boxes_et = framework::EigenTensor::From(r_boxes); + auto c_boxes_et = framework::EigenTensor::From(c_boxes); + auto overlaps_et = framework::EigenTensor::From(*overlaps); + int r_num = r_boxes.dims()[0]; + int c_num = c_boxes.dims()[0]; + auto zero = static_cast(0.0); + T r_box_area, c_box_area, x_min, y_min, x_max, y_max, inter_w, inter_h, + inter_area; + for (int i = 0; i < r_num; ++i) { + r_box_area = (r_boxes_et(i, 2) - r_boxes_et(i, 0) + 1) * + (r_boxes_et(i, 3) - r_boxes_et(i, 1) + 1); + for (int j = 0; j < c_num; ++j) { + c_box_area = (c_boxes_et(j, 2) - c_boxes_et(j, 0) + 1) * + (c_boxes_et(j, 3) - c_boxes_et(j, 1) + 1); + x_min = std::max(r_boxes_et(i, 0), c_boxes_et(j, 0)); + y_min = std::max(r_boxes_et(i, 1), c_boxes_et(j, 1)); + x_max = std::min(r_boxes_et(i, 2), c_boxes_et(j, 2)); + y_max = std::min(r_boxes_et(i, 3), c_boxes_et(j, 3)); + inter_w = std::max(x_max - x_min + 1, zero); + inter_h = std::max(y_max - y_min + 1, zero); + inter_area = inter_w * inter_h; + overlaps_et(i, j) = + (inter_area == 0.) ? 0 : inter_area / + (r_box_area + c_box_area - inter_area); + } + } +} + +template +void ClipTiledBoxes(const platform::DeviceContext& ctx, + const framework::Tensor& im_info, + const framework::Tensor& input_boxes, + framework::Tensor* out) { + T* out_data = out->mutable_data(ctx.GetPlace()); + const T* im_info_data = im_info.data(); + const T* input_boxes_data = input_boxes.data(); + T zero(0); + T im_w = round(im_info_data[1] / im_info_data[2]); + T im_h = round(im_info_data[0] / im_info_data[2]); + for (int64_t i = 0; i < input_boxes.numel(); ++i) { + if (i % 4 == 0) { + out_data[i] = std::max(std::min(input_boxes_data[i], im_w - 1), zero); + } else if (i % 4 == 1) { + out_data[i] = std::max(std::min(input_boxes_data[i], im_h - 1), zero); + } else if (i % 4 == 2) { + out_data[i] = std::max(std::min(input_boxes_data[i], im_w - 1), zero); + } else { + out_data[i] = std::max(std::min(input_boxes_data[i], im_h - 1), zero); + } + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/bipartite_match_op.cc b/paddle/fluid/operators/detection/bipartite_match_op.cc new file mode 100644 index 00000000..af7797a6 --- /dev/null +++ b/paddle/fluid/operators/detection/bipartite_match_op.cc @@ -0,0 +1,290 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +class BipartiteMatchOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("DistMat"), + "Input(DistMat) of BipartiteMatch should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("ColToRowMatchIndices"), + "Output(ColToRowMatchIndices) of BipartiteMatch should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("ColToRowMatchDist"), + "Output(ColToRowMatchDist) of BipartiteMatch should not be null."); + + auto dims = ctx->GetInputDim("DistMat"); + PADDLE_ENFORCE_EQ(dims.size(), 2, "The rank of Input(DistMat) must be 2."); + + ctx->SetOutputDim("ColToRowMatchIndices", dims); + ctx->SetOutputDim("ColToRowMatchDist", dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("DistMat")->type(), + platform::CPUPlace()); + } +}; + +template +bool DistPairDescend(std::tuple pair1, + std::tuple pair2) { + return std::get<2>(pair1) > std::get<2>(pair2); +} + +template +class BipartiteMatchKernel : public framework::OpKernel { + public: + // The match_indices must be initialized to -1 at first. + // The match_dist must be initialized to 0 at first. + void BipartiteMatch(const Tensor& dist, int* match_indices, + T* match_dist) const { + PADDLE_ENFORCE_EQ(dist.dims().size(), 2, "The rank of dist must be 2."); + int64_t row = dist.dims()[0]; + int64_t col = dist.dims()[1]; + auto* dist_data = dist.data(); + // Test result: When row==130 the speed of these two methods almost the same + if (row >= 130) { + std::vector> match_pair; + + for (int64_t i = 0; i < row; ++i) { + for (int64_t j = 0; j < col; ++j) { + match_pair.push_back(std::make_tuple(i, j, dist_data[i * col + j])); + } + } + std::sort(match_pair.begin(), match_pair.end(), DistPairDescend); + std::vector row_indices(row, -1); + + int64_t idx = 0; + for (int64_t k = 0; k < row * col; ++k) { + int64_t i = std::get<0>(match_pair[k]); + int64_t j = std::get<1>(match_pair[k]); + T dist = std::get<2>(match_pair[k]); + + if (idx >= row) { + break; + } + if (match_indices[j] == -1 && row_indices[i] == -1 && dist > 0) { + match_indices[j] = i; + row_indices[i] = j; + match_dist[j] = dist; + idx += 1; + } + } + } else { + constexpr T kEPS = static_cast(1e-6); + std::vector row_pool; + for (int i = 0; i < row; ++i) { + row_pool.push_back(i); + } + while (row_pool.size() > 0) { + int max_idx = -1; + int max_row_idx = -1; + T max_dist = -1; + for (int64_t j = 0; j < col; ++j) { + if (match_indices[j] != -1) { + continue; + } + for (size_t k = 0; k < row_pool.size(); ++k) { + int m = row_pool[k]; + // distance is 0 between m-th row and j-th column + if (dist_data[m * col + j] < kEPS) { + continue; + } + if (dist_data[m * col + j] > max_dist) { + max_idx = j; + max_row_idx = m; + max_dist = dist_data[m * col + j]; + } + } + } + if (max_idx == -1) { + // Cannot find good match. + break; + } else { + PADDLE_ENFORCE_EQ(match_indices[max_idx], -1); + match_indices[max_idx] = max_row_idx; + match_dist[max_idx] = max_dist; + // Erase the row index. + row_pool.erase( + std::find(row_pool.begin(), row_pool.end(), max_row_idx)); + } + } + } + } + + void ArgMaxMatch(const Tensor& dist, int* match_indices, T* match_dist, + T overlap_threshold) const { + constexpr T kEPS = static_cast(1e-6); + int64_t row = dist.dims()[0]; + int64_t col = dist.dims()[1]; + auto* dist_data = dist.data(); + for (int64_t j = 0; j < col; ++j) { + if (match_indices[j] != -1) { + // the j-th column has been matched to one entity. + continue; + } + int max_row_idx = -1; + T max_dist = -1; + for (int i = 0; i < row; ++i) { + T dist = dist_data[i * col + j]; + if (dist < kEPS) { + // distance is 0 between m-th row and j-th column + continue; + } + if (dist >= overlap_threshold && dist > max_dist) { + max_row_idx = i; + max_dist = dist; + } + } + if (max_row_idx != -1) { + PADDLE_ENFORCE_EQ(match_indices[j], -1); + match_indices[j] = max_row_idx; + match_dist[j] = max_dist; + } + } + } + + void Compute(const framework::ExecutionContext& context) const override { + auto* dist_mat = context.Input("DistMat"); + auto* match_indices = context.Output("ColToRowMatchIndices"); + auto* match_dist = context.Output("ColToRowMatchDist"); + + auto& dev_ctx = context.device_context(); + + auto col = dist_mat->dims()[1]; + + int64_t n = dist_mat->lod().size() == 0UL + ? 1 + : static_cast(dist_mat->lod().back().size() - 1); + if (dist_mat->lod().size()) { + PADDLE_ENFORCE_EQ(dist_mat->lod().size(), 1UL, + "Only support 1 level of LoD."); + } + match_indices->mutable_data({n, col}, context.GetPlace()); + match_dist->mutable_data({n, col}, context.GetPlace()); + + math::SetConstant iset; + iset(dev_ctx, match_indices, static_cast(-1)); + math::SetConstant tset; + tset(dev_ctx, match_dist, static_cast(0)); + + int* indices = match_indices->data(); + T* dist = match_dist->data(); + auto type = context.Attr("match_type"); + auto threshold = context.Attr("dist_threshold"); + if (n == 1) { + BipartiteMatch(*dist_mat, indices, dist); + if (type == "per_prediction") { + ArgMaxMatch(*dist_mat, indices, dist, threshold); + } + } else { + auto lod = dist_mat->lod().back(); + for (size_t i = 0; i < lod.size() - 1; ++i) { + Tensor one_ins = dist_mat->Slice(lod[i], lod[i + 1]); + BipartiteMatch(one_ins, indices + i * col, dist + i * col); + if (type == "per_prediction") { + ArgMaxMatch(one_ins, indices + i * col, dist + i * col, threshold); + } + } + } + } +}; + +class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "DistMat", + "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape " + "[K, M]. It is pair-wise distance matrix between the entities " + "represented by each row and each column. For example, assumed one " + "entity is A with shape [K], another entity is B with shape [M]. The " + "DistMat[i][j] is the distance between A[i] and B[j]. The bigger " + "the distance is, the better macthing the pairs are. Please note, " + "This tensor can contain LoD information to represent a batch of " + "inputs. One instance of this batch can contain different numbers of " + "entities."); + AddAttr( + "match_type", + "(string, default: per_prediction) " + "The type of matching method, should be 'bipartite' or " + "'per_prediction', 'bipartite' by default.") + .SetDefault("bipartite") + .InEnum({"bipartite", "per_prediction"}); + AddAttr( + "dist_threshold", + "(float, default: 0.5) " + "If `match_type` is 'per_prediction', this threshold is to determine " + "the extra matching bboxes based on the maximum distance.") + .SetDefault(0.5); + AddOutput("ColToRowMatchIndices", + "(Tensor) A 2-D Tensor with shape [N, M] in int type. " + "N is the batch size. If ColToRowMatchIndices[i][j] is -1, it " + "means B[j] does not match any entity in i-th instance. " + "Otherwise, it means B[j] is matched to row " + "ColToRowMatchIndices[i][j] in i-th instance. The row number of " + "i-th instance is saved in ColToRowMatchIndices[i][j]."); + AddOutput("ColToRowMatchDist", + "(Tensor) A 2-D Tensor with shape [N, M] in float type. " + "N is batch size. If ColToRowMatchIndices[i][j] is -1, " + "ColToRowMatchDist[i][j] is also -1.0. Otherwise, assumed " + "ColToRowMatchIndices[i][j] = d, and the row offsets of each " + "instance are called LoD. Then " + "ColToRowMatchDist[i][j] = DistMat[d+LoD[i]][j]"); + AddComment(R"DOC( +This operator is a greedy bipartite matching algorithm, which is used to +obtain the matching with the maximum distance based on the input +distance matrix. For input 2D matrix, the bipartite matching algorithm can +find the matched column for each row, also can find the matched row for +each column. And this operator only calculate matched indices from column +to row. For each instance, the number of matched indices is the number of +of columns of the input distance matrix. + +There are two outputs to save matched indices and distance. +A simple description, this algorithm matched the best (maximum distance) +row entity to the column entity and the matched indices are not duplicated +in each row of ColToRowMatchIndices. If the column entity is not matched +any row entity, set -1 in ColToRowMatchIndices. + +Please note that the input DistMat can be LoDTensor (with LoD) or Tensor. +If LoDTensor with LoD, the height of ColToRowMatchIndices is batch size. +If Tensor, the height of ColToRowMatchIndices is 1. + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(bipartite_match, ops::BipartiteMatchOp, + ops::BipartiteMatchOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(bipartite_match, ops::BipartiteMatchKernel, + ops::BipartiteMatchKernel); diff --git a/paddle/fluid/operators/detection/box_clip_op.cc b/paddle/fluid/operators/detection/box_clip_op.cc new file mode 100644 index 00000000..3aa76655 --- /dev/null +++ b/paddle/fluid/operators/detection/box_clip_op.cc @@ -0,0 +1,86 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/box_clip_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class BoxClipOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of BoxClipOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("ImInfo"), + "Input(ImInfo) of BoxClipOp should not be null."); + + auto input_box_dims = ctx->GetInputDim("Input"); + auto im_info_dims = ctx->GetInputDim("ImInfo"); + + if (ctx->IsRuntime()) { + auto input_box_size = input_box_dims.size(); + PADDLE_ENFORCE_EQ(input_box_dims[input_box_size - 1], 4, + "The last dimension of Input must be 4"); + PADDLE_ENFORCE_EQ(im_info_dims.size(), 2, + "The rank of Input(Input) in BoxClipOp must be 2"); + PADDLE_ENFORCE_EQ(im_info_dims[1], 3, + "The last dimension of ImInfo must be 3"); + } + ctx->ShareDim("Input", /*->*/ "Output"); + ctx->ShareLoD("Input", /*->*/ "Output"); + } +}; + +class BoxClipOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Input", + "(LoDTensor) " + "Input is a LoDTensor with shape [..., 4] holds 4 points" + "in last dimension in format [xmin, ymin, xmax, ymax]"); + AddInput("ImInfo", + "(Tensor) Information for image reshape is in shape (N, 3), " + "in format (height, width, im_scale)"); + AddOutput("Output", + "(LoDTensor) " + "Output is a LoDTensor with the same shape as Input" + "and it is the result after clip"); + AddComment(R"DOC( +This operator clips input boxes to original input images. + +For each input box, The formula is given as follows: + + $$xmin = \max(\min(xmin, im_w - 1), 0)$$ + $$ymin = \max(\min(ymin, im_h - 1), 0)$$ + $$xmax = \max(\min(xmax, im_w - 1), 0)$$ + $$ymax = \max(\min(ymax, im_h - 1), 0)$$ + +where im_w and im_h are computed from ImInfo, the formula is given as follows: + + $$im_w = \round(width / im_scale)$$ + $$im_h = \round(height / im_scale)$$ +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(box_clip, ops::BoxClipOp, ops::BoxClipOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL( + box_clip, ops::BoxClipKernel, + ops::BoxClipKernel); diff --git a/paddle/fluid/operators/detection/box_clip_op.cu b/paddle/fluid/operators/detection/box_clip_op.cu new file mode 100644 index 00000000..b727da5f --- /dev/null +++ b/paddle/fluid/operators/detection/box_clip_op.cu @@ -0,0 +1,74 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detection/box_clip_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTenso = framework::LoDTensor; + +static constexpr int ImInfoSize = 3; + +template +static __global__ void GPUBoxClip(const T *input, const size_t *lod, + const size_t width, const T *im_info, + T *output) { + T im_w = round(im_info[blockIdx.x * ImInfoSize + 1] / + im_info[blockIdx.x * ImInfoSize + 2]); + T im_h = round(im_info[blockIdx.x * ImInfoSize] / + im_info[blockIdx.x * ImInfoSize + 2]); + for (int i = threadIdx.x; i < (lod[blockIdx.x + 1] - lod[blockIdx.x]) * width; + i += BlockSize) { + int idx = lod[blockIdx.x] * width + i; + T im_size = (idx % 2 == 0) ? im_w : im_h; + output[idx] = max(min(input[idx], im_size - 1), T(0.)); + } +} + +template +class GPUBoxClipKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()), + "This kernel only runs on GPU device."); + auto *input = context.Input("Input"); + auto *im_info = context.Input("ImInfo"); + auto *output = context.Output("Output"); + const int64_t num = input->dims()[0]; + const int64_t bbox_width = input->numel() / num; + auto lod = input->lod(); + framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); + auto &dev_ctx = context.template device_context(); + auto stream = dev_ctx.stream(); + const size_t batch_size = lod.back().size() - 1; + T *output_data = output->mutable_data(dev_ctx.GetPlace()); + GPUBoxClip<<>>( + input->data(), abs_offset_lod[0].CUDAMutableData(dev_ctx.GetPlace()), + bbox_width, im_info->data(), output_data); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + box_clip, ops::GPUBoxClipKernel, + ops::GPUBoxClipKernel); diff --git a/paddle/fluid/operators/detection/box_clip_op.h b/paddle/fluid/operators/detection/box_clip_op.h new file mode 100644 index 00000000..74e1f88f --- /dev/null +++ b/paddle/fluid/operators/detection/box_clip_op.h @@ -0,0 +1,50 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detection/bbox_util.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class BoxClipKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input_box = context.Input("Input"); + auto* im_info = context.Input("ImInfo"); + auto* output_box = context.Output("Output"); + auto& dev_ctx = + context.template device_context(); + output_box->mutable_data(context.GetPlace()); + if (input_box->lod().size()) { + PADDLE_ENFORCE_EQ(input_box->lod().size(), 1UL, + "Only support 1 level of LoD."); + } + auto box_lod = input_box->lod().back(); + int64_t n = static_cast(box_lod.size() - 1); + for (int i = 0; i < n; ++i) { + Tensor im_info_slice = im_info->Slice(i, i + 1); + Tensor box_slice = input_box->Slice(box_lod[i], box_lod[i + 1]); + Tensor output_slice = output_box->Slice(box_lod[i], box_lod[i + 1]); + ClipTiledBoxes(dev_ctx, im_info_slice, box_slice, &output_slice); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/box_coder_op.cc b/paddle/fluid/operators/detection/box_coder_op.cc new file mode 100644 index 00000000..de361267 --- /dev/null +++ b/paddle/fluid/operators/detection/box_coder_op.cc @@ -0,0 +1,192 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/box_coder_op.h" +#include + +namespace paddle { +namespace operators { + +class BoxCoderOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("PriorBox"), + "Input(PriorBox) of BoxCoderOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("TargetBox"), + "Input(TargetBox) of BoxCoderOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("OutputBox"), + "Output(OutputBox) of BoxCoderOp should not be null."); + + auto prior_box_dims = ctx->GetInputDim("PriorBox"); + auto target_box_dims = ctx->GetInputDim("TargetBox"); + + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2, + "The rank of Input PriorBox must be 2"); + PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, + "The shape of PriorBox is [N, 4]"); + if (ctx->HasInput("PriorBoxVar")) { + auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar"); + PADDLE_ENFORCE(prior_box_var_dims.size() == 2, + "Input(PriorBoxVar) of BoxCoderOp should be 2."); + PADDLE_ENFORCE_EQ( + prior_box_dims, prior_box_var_dims, + "The dimension of Input(PriorBoxVar) should be equal to" + "the dimension of Input(PriorBox) when the rank is 2."); + } + } + + auto code_type = GetBoxCodeType(ctx->Attrs().Get("code_type")); + int axis = ctx->Attrs().Get("axis"); + if (code_type == BoxCodeType::kEncodeCenterSize) { + PADDLE_ENFORCE_EQ(target_box_dims.size(), 2, + "The rank of Input TargetBox must be 2"); + PADDLE_ENFORCE_EQ(target_box_dims[1], 4, + "The shape of TargetBox is [M, 4]"); + ctx->SetOutputDim( + "OutputBox", + framework::make_ddim({target_box_dims[0], prior_box_dims[0], 4})); + } else if (code_type == BoxCodeType::kDecodeCenterSize) { + PADDLE_ENFORCE_EQ(target_box_dims.size(), 3, + "The rank of Input TargetBox must be 3"); + PADDLE_ENFORCE(axis == 0 || axis == 1, "axis must be 0 or 1"); + if (ctx->IsRuntime()) { + if (axis == 0) { + PADDLE_ENFORCE_EQ(target_box_dims[1], prior_box_dims[0]); + } else if (axis == 1) { + PADDLE_ENFORCE_EQ(target_box_dims[0], prior_box_dims[0]); + } + PADDLE_ENFORCE_EQ(target_box_dims[2], prior_box_dims[1]); + } + ctx->ShareDim("TargetBox", /*->*/ "OutputBox"); + } + + if (code_type == BoxCodeType::kDecodeCenterSize && axis == 1) { + ctx->ShareLoD("PriorBox", /*->*/ "OutputBox"); + } else { + ctx->ShareLoD("TargetBox", /*->*/ "OutputBox"); + } + } +}; + +class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "PriorBox", + "(Tensor, default Tensor) " + "Box list PriorBox is a 2-D Tensor with shape [M, 4] holds M boxes, " + "each box is represented as [xmin, ymin, xmax, ymax], " + "[xmin, ymin] is the left top coordinate of the anchor box, " + "if the input is image feature map, they are close to the origin " + "of the coordinate system. [xmax, ymax] is the right bottom " + "coordinate of the anchor box."); + AddInput("PriorBoxVar", + "(Tensor, default Tensor, optional) " + "PriorBoxVar is a 2-D Tensor with shape [M, 4] holds M group " + "of variance. PriorBoxVar will set all elements to 1 by " + "default.") + .AsDispensable(); + AddInput( + "TargetBox", + "(LoDTensor or Tensor) This input can be a 2-D LoDTensor with shape " + "[N, 4] when code_type is 'encode_center_size'. This input also can " + "be a 3-D Tensor with shape [N, M, 4] when code_type is " + "'decode_center_size'. [N, 4], each box is represented as " + "[xmin, ymin, xmax, ymax], [xmin, ymin] is the left top coordinate " + "of the box if the input is image feature map, they are close to " + "the origin of the coordinate system. [xmax, ymax] is the right " + "bottom coordinate of the box. This tensor can contain LoD " + "information to represent a batch of inputs. One instance of this " + "batch can contain different numbers of entities."); + AddAttr("code_type", + "(string, default encode_center_size) " + "the code type used with the target box") + .SetDefault("encode_center_size") + .InEnum({"encode_center_size", "decode_center_size"}); + AddAttr("box_normalized", + "(bool, default true) " + "whether treat the priorbox as a noramlized box") + .SetDefault(true); + AddAttr("axis", + "(int, default 0)" + "which axis in PriorBox to broadcast for box decode," + "for example, if axis is 0 and TargetBox has shape" + "[N, M, 4] and PriorBox has shape [M, 4], then PriorBox " + "will broadcast to [N, M, 4] for decoding. It is only valid" + "when code type is decode_center_size") + .SetDefault(0) + .InEnum({0, 1}); + AddAttr>( + "variance", + "(vector, default {})," + "variance of prior box with shape [4]. PriorBoxVar and variance can" + "not be provided at the same time.") + .SetDefault(std::vector{}); + AddOutput("OutputBox", + "(LoDTensor or Tensor) " + "When code_type is 'encode_center_size', the output tensor of " + "box_coder_op with shape [N, M, 4] representing the result of N " + "target boxes encoded with M Prior boxes and variances. When " + "code_type is 'decode_center_size', N represents the batch size " + "and M represents the number of deocded boxes."); + + AddComment(R"DOC( + +Bounding Box Coder. + +Encode/Decode the target bounding box with the priorbox information. + +The Encoding schema described below: + + ox = (tx - px) / pw / pxv + + oy = (ty - py) / ph / pyv + + ow = log(abs(tw / pw)) / pwv + + oh = log(abs(th / ph)) / phv + +The Decoding schema described below: + + ox = (pw * pxv * tx * + px) - tw / 2 + + oy = (ph * pyv * ty * + py) - th / 2 + + ow = exp(pwv * tw) * pw + tw / 2 + + oh = exp(phv * th) * ph + th / 2 + +where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, width +and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the +priorbox's (anchor) center coordinates, width and height. `pxv`, `pyv`, `pwv`, +`phv` denote the variance of the priorbox and `ox`, `oy`, `ow`, `oh` denote the +encoded/decoded coordinates, width and height. + +During Box Decoding, two modes for broadcast are supported. Say target box has +shape [N, M, 4], and the shape of prior box can be [N, 4] or [M, 4]. Then prior +box will broadcast to target box along the assigned axis. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(box_coder, ops::BoxCoderOp, ops::BoxCoderOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL( + box_coder, ops::BoxCoderKernel, + ops::BoxCoderKernel); diff --git a/paddle/fluid/operators/detection/box_coder_op.cu b/paddle/fluid/operators/detection/box_coder_op.cu new file mode 100644 index 00000000..19a5bb90 --- /dev/null +++ b/paddle/fluid/operators/detection/box_coder_op.cu @@ -0,0 +1,209 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/operators/detection/box_coder_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace paddle { +namespace operators { + +template +__global__ void EncodeCenterSizeKernel( + const T* prior_box_data, const T* prior_box_var_data, + const T* target_box_data, const int row, const int col, const int len, + const bool normalized, const T prior_box_var_size, const float* variance, + const int var_size, T* output) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < row * col) { + const int row_idx = idx / col; + const int col_idx = idx % col; + T prior_box_width = prior_box_data[col_idx * len + 2] - + prior_box_data[col_idx * len] + (normalized == false); + T prior_box_height = prior_box_data[col_idx * len + 3] - + prior_box_data[col_idx * len + 1] + + (normalized == false); + T prior_box_center_x = prior_box_data[col_idx * len] + prior_box_width / 2; + T prior_box_center_y = + prior_box_data[col_idx * len + 1] + prior_box_height / 2; + + T target_box_center_x = + (target_box_data[row_idx * len + 2] + target_box_data[row_idx * len]) / + 2; + T target_box_center_y = (target_box_data[row_idx * len + 3] + + target_box_data[row_idx * len + 1]) / + 2; + T target_box_width = target_box_data[row_idx * len + 2] - + target_box_data[row_idx * len] + (normalized == false); + T target_box_height = target_box_data[row_idx * len + 3] - + target_box_data[row_idx * len + 1] + + (normalized == false); + + output[idx * len] = + (target_box_center_x - prior_box_center_x) / prior_box_width; + output[idx * len + 1] = + (target_box_center_y - prior_box_center_y) / prior_box_height; + output[idx * len + 2] = log(fabs(target_box_width / prior_box_width)); + output[idx * len + 3] = log(fabs(target_box_height / prior_box_height)); + if (prior_box_var_data) { + int prior_var_offset = col_idx * len; + output[idx * len] /= prior_box_var_data[prior_var_offset]; + output[idx * len + 1] /= prior_box_var_data[prior_var_offset + 1]; + output[idx * len + 2] /= prior_box_var_data[prior_var_offset + 2]; + output[idx * len + 3] /= prior_box_var_data[prior_var_offset + 3]; + } else if (var_size == 4) { + for (int k = 0; k < 4; ++k) { + output[idx * len + k] /= static_cast(variance[k]); + } + } + } +} + +template +__global__ void DecodeCenterSizeKernel( + const T* prior_box_data, const T* prior_box_var_data, + const T* target_box_data, const int row, const int col, const int len, + const bool normalized, const T prior_box_var_size, const float* variance, + const int var_size, const int axis, T* output) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + int prior_box_offset = 0; + if (idx < row * col) { + const int col_idx = idx % col; + const int row_idx = idx / col; + prior_box_offset = axis == 0 ? col_idx * len : row_idx * len; + T prior_box_width = prior_box_data[prior_box_offset + 2] - + prior_box_data[prior_box_offset] + + (normalized == false); + T prior_box_height = prior_box_data[prior_box_offset + 3] - + prior_box_data[prior_box_offset + 1] + + (normalized == false); + T prior_box_center_x = + prior_box_data[prior_box_offset] + prior_box_width / 2; + T prior_box_center_y = + prior_box_data[prior_box_offset + 1] + prior_box_height / 2; + T target_box_width, target_box_height; + T target_box_center_x, target_box_center_y; + T box_var_x = T(1), box_var_y = T(1); + T box_var_w = T(1), box_var_h = T(1); + if (prior_box_var_data) { + int prior_var_offset = axis == 0 ? col_idx * len : row_idx * len; + box_var_x = prior_box_var_data[prior_var_offset]; + box_var_y = prior_box_var_data[prior_var_offset + 1]; + box_var_w = prior_box_var_data[prior_var_offset + 2]; + box_var_h = prior_box_var_data[prior_var_offset + 3]; + } else if (var_size == 4) { + box_var_x = static_cast(variance[0]); + box_var_y = static_cast(variance[1]); + box_var_w = static_cast(variance[2]); + box_var_h = static_cast(variance[3]); + } + target_box_width = + exp(box_var_w * target_box_data[idx * len + 2]) * prior_box_width; + target_box_height = + exp(box_var_h * target_box_data[idx * len + 3]) * prior_box_height; + target_box_center_x = + box_var_x * target_box_data[idx * len] * prior_box_width + + prior_box_center_x; + target_box_center_y = + box_var_y * target_box_data[idx * len + 1] * prior_box_height + + prior_box_center_y; + + output[idx * len] = target_box_center_x - target_box_width / 2; + output[idx * len + 1] = target_box_center_y - target_box_height / 2; + output[idx * len + 2] = + target_box_center_x + target_box_width / 2 - (normalized == false); + output[idx * len + 3] = + target_box_center_y + target_box_height / 2 - (normalized == false); + } +} + +template +class BoxCoderCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()), + "This kernel only runs on GPU device."); + auto* prior_box = context.Input("PriorBox"); + auto* prior_box_var = context.Input("PriorBoxVar"); + auto* target_box = context.Input("TargetBox"); + auto* output_box = context.Output("OutputBox"); + std::vector variance = context.Attr>("variance"); + const T* prior_box_data = prior_box->data(); + const T* target_box_data = target_box->data(); + const T* prior_box_var_data = nullptr; + auto prior_box_var_size = 0; + if (prior_box_var) { + PADDLE_ENFORCE(variance.empty(), + "Input 'PriorBoxVar' and attribute 'variance' should not" + "be used at the same time."); + prior_box_var_data = prior_box_var->data(); + prior_box_var_size = prior_box_var->dims().size(); + } + if (!(variance.empty())) { + PADDLE_ENFORCE(static_cast(variance.size()) == 4, + "Size of attribute 'variance' should be 4"); + } + + if (target_box->lod().size()) { + PADDLE_ENFORCE_EQ(target_box->lod().size(), 1, + "Only support 1 level of LoD."); + } + const int var_size = static_cast(variance.size()); + + auto code_type = GetBoxCodeType(context.Attr("code_type")); + bool normalized = context.Attr("box_normalized"); + int axis = context.Attr("axis"); + + auto row = target_box->dims()[0]; + auto col = prior_box->dims()[0]; + if (code_type == BoxCodeType::kDecodeCenterSize) { + col = target_box->dims()[1]; + } + auto len = prior_box->dims()[1]; + int block = 512; + int grid = (row * col + block - 1) / block; + auto& device_ctx = context.cuda_device_context(); + + auto& allocator = + platform::DeviceTemporaryAllocator::Instance().Get(device_ctx); + int bytes = var_size * sizeof(float); + auto dev_var = allocator.Allocate(bytes); + float* dev_var_data = reinterpret_cast(dev_var->ptr()); + auto cplace = platform::CPUPlace(); + const auto gplace = boost::get(context.GetPlace()); + memory::Copy(gplace, dev_var_data, cplace, &variance[0], bytes, + device_ctx.stream()); + + output_box->mutable_data({row, col, len}, context.GetPlace()); + T* output = output_box->data(); + + if (code_type == BoxCodeType::kEncodeCenterSize) { + EncodeCenterSizeKernel<<>>( + prior_box_data, prior_box_var_data, target_box_data, row, col, len, + normalized, prior_box_var_size, dev_var_data, var_size, output); + } else if (code_type == BoxCodeType::kDecodeCenterSize) { + DecodeCenterSizeKernel<<>>( + prior_box_data, prior_box_var_data, target_box_data, row, col, len, + normalized, prior_box_var_size, dev_var_data, var_size, axis, output); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + box_coder, + ops::BoxCoderCUDAKernel, + ops::BoxCoderCUDAKernel); diff --git a/paddle/fluid/operators/detection/box_coder_op.h b/paddle/fluid/operators/detection/box_coder_op.h new file mode 100644 index 00000000..d4c7e8cf --- /dev/null +++ b/paddle/fluid/operators/detection/box_coder_op.h @@ -0,0 +1,247 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +enum class BoxCodeType { kEncodeCenterSize = 0, kDecodeCenterSize = 1 }; + +inline BoxCodeType GetBoxCodeType(const std::string &type) { + if (type == "encode_center_size") { + return BoxCodeType::kEncodeCenterSize; + } else if (type == "decode_center_size") { + return BoxCodeType::kDecodeCenterSize; + } + PADDLE_THROW("Not support type %s.", type); +} + +template +class BoxCoderKernel : public framework::OpKernel { + public: + void EncodeCenterSize(const framework::Tensor *target_box, + const framework::Tensor *prior_box, + const framework::Tensor *prior_box_var, + const bool normalized, + const std::vector variance, T *output) const { + int64_t row = target_box->dims()[0]; + int64_t col = prior_box->dims()[0]; + int64_t len = prior_box->dims()[1]; + +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for collapse(2) +#endif + for (int64_t i = 0; i < row; ++i) { + for (int64_t j = 0; j < col; ++j) { + auto *target_box_data = target_box->data(); + auto *prior_box_data = prior_box->data(); + size_t offset = i * col * len + j * len; + T prior_box_width = prior_box_data[j * len + 2] - + prior_box_data[j * len] + (normalized == false); + T prior_box_height = prior_box_data[j * len + 3] - + prior_box_data[j * len + 1] + + (normalized == false); + T prior_box_center_x = prior_box_data[j * len] + prior_box_width / 2; + T prior_box_center_y = + prior_box_data[j * len + 1] + prior_box_height / 2; + + T target_box_center_x = + (target_box_data[i * len + 2] + target_box_data[i * len]) / 2; + T target_box_center_y = + (target_box_data[i * len + 3] + target_box_data[i * len + 1]) / 2; + T target_box_width = target_box_data[i * len + 2] - + target_box_data[i * len] + (normalized == false); + T target_box_height = target_box_data[i * len + 3] - + target_box_data[i * len + 1] + + (normalized == false); + + output[offset] = + (target_box_center_x - prior_box_center_x) / prior_box_width; + output[offset + 1] = + (target_box_center_y - prior_box_center_y) / prior_box_height; + output[offset + 2] = + std::log(std::fabs(target_box_width / prior_box_width)); + output[offset + 3] = + std::log(std::fabs(target_box_height / prior_box_height)); + } + } + + if (prior_box_var) { + const T *prior_box_var_data = prior_box_var->data(); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for collapse(3) +#endif + for (int64_t i = 0; i < row; ++i) { + for (int64_t j = 0; j < col; ++j) { + for (int k = 0; k < 4; ++k) { + size_t offset = i * col * len + j * len; + int prior_var_offset = j * len; + output[offset + k] /= prior_box_var_data[prior_var_offset + k]; + } + } + } + } else if (!(variance.empty())) { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for collapse(3) +#endif + for (int64_t i = 0; i < row; ++i) { + for (int64_t j = 0; j < col; ++j) { + for (int k = 0; k < 4; ++k) { + size_t offset = i * col * len + j * len; + output[offset + k] /= static_cast(variance[k]); + } + } + } + } + } + + template + void DecodeCenterSize(const framework::Tensor *target_box, + const framework::Tensor *prior_box, + const framework::Tensor *prior_box_var, + const bool normalized, std::vector variance, + T *output) const { + int64_t row = target_box->dims()[0]; + int64_t col = target_box->dims()[1]; + int64_t len = target_box->dims()[2]; + +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for collapse(2) +#endif + for (int64_t i = 0; i < row; ++i) { + for (int64_t j = 0; j < col; ++j) { + auto *target_box_data = target_box->data(); + auto *prior_box_data = prior_box->data(); + + T var_data[4] = {1., 1., 1., 1.}; + T *var_ptr = var_data; + size_t offset = i * col * len + j * len; + int prior_box_offset = axis == 0 ? j * len : i * len; + + T prior_box_width = prior_box_data[prior_box_offset + 2] - + prior_box_data[prior_box_offset] + + (normalized == false); + T prior_box_height = prior_box_data[prior_box_offset + 3] - + prior_box_data[prior_box_offset + 1] + + (normalized == false); + T prior_box_center_x = + prior_box_data[prior_box_offset] + prior_box_width / 2; + T prior_box_center_y = + prior_box_data[prior_box_offset + 1] + prior_box_height / 2; + + T target_box_center_x = 0, target_box_center_y = 0; + T target_box_width = 0, target_box_height = 0; + int prior_var_offset = axis == 0 ? j * len : i * len; + if (var_size == 2) { + std::memcpy(var_ptr, prior_box_var->data() + prior_var_offset, + 4 * sizeof(T)); + } else if (var_size == 1) { + var_ptr = reinterpret_cast(variance.data()); + } + T box_var_x = *var_ptr; + T box_var_y = *(var_ptr + 1); + T box_var_w = *(var_ptr + 2); + T box_var_h = *(var_ptr + 3); + + target_box_center_x = + box_var_x * target_box_data[offset] * prior_box_width + + prior_box_center_x; + target_box_center_y = + box_var_y * target_box_data[offset + 1] * prior_box_height + + prior_box_center_y; + target_box_width = + std::exp(box_var_w * target_box_data[offset + 2]) * prior_box_width; + target_box_height = std::exp(box_var_h * target_box_data[offset + 3]) * + prior_box_height; + + output[offset] = target_box_center_x - target_box_width / 2; + output[offset + 1] = target_box_center_y - target_box_height / 2; + output[offset + 2] = + target_box_center_x + target_box_width / 2 - (normalized == false); + output[offset + 3] = + target_box_center_y + target_box_height / 2 - (normalized == false); + } + } + } + + void Compute(const framework::ExecutionContext &context) const override { + auto *prior_box = context.Input("PriorBox"); + auto *prior_box_var = context.Input("PriorBoxVar"); + auto *target_box = context.Input("TargetBox"); + auto *output_box = context.Output("OutputBox"); + std::vector variance = context.Attr>("variance"); + const int axis = context.Attr("axis"); + if (target_box->lod().size()) { + PADDLE_ENFORCE_EQ(target_box->lod().size(), 1UL, + "Only support 1 level of LoD."); + } + if (prior_box_var) { + PADDLE_ENFORCE(variance.empty(), + "Input 'PriorBoxVar' and attribute 'variance' should not" + "be used at the same time."); + } + if (!(variance.empty())) { + PADDLE_ENFORCE(static_cast(variance.size()) == 4, + "Size of attribute 'variance' should be 4"); + } + auto code_type = GetBoxCodeType(context.Attr("code_type")); + bool normalized = context.Attr("box_normalized"); + + auto row = target_box->dims()[0]; + auto col = prior_box->dims()[0]; + if (code_type == BoxCodeType::kDecodeCenterSize) { + col = target_box->dims()[1]; + } + auto len = prior_box->dims()[1]; + + output_box->mutable_data({row, col, len}, context.GetPlace()); + + T *output = output_box->data(); + if (code_type == BoxCodeType::kEncodeCenterSize) { + EncodeCenterSize(target_box, prior_box, prior_box_var, normalized, + variance, output); + } else if (code_type == BoxCodeType::kDecodeCenterSize) { + if (prior_box_var) { + if (axis == 0) { + DecodeCenterSize<0, 2>(target_box, prior_box, prior_box_var, + normalized, variance, output); + } else { + DecodeCenterSize<1, 2>(target_box, prior_box, prior_box_var, + normalized, variance, output); + } + } else if (!(variance.empty())) { + if (axis == 0) { + DecodeCenterSize<0, 1>(target_box, prior_box, prior_box_var, + normalized, variance, output); + } else { + DecodeCenterSize<1, 1>(target_box, prior_box, prior_box_var, + normalized, variance, output); + } + } else { + if (axis == 0) { + DecodeCenterSize<0, 0>(target_box, prior_box, prior_box_var, + normalized, variance, output); + } else { + DecodeCenterSize<1, 0>(target_box, prior_box, prior_box_var, + normalized, variance, output); + } + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc new file mode 100644 index 00000000..945d575a --- /dev/null +++ b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc @@ -0,0 +1,169 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/box_decoder_and_assign_op.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; + +class BoxDecoderAndAssignOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE( + ctx->HasInput("PriorBox"), + "Input(PriorBox) of BoxDecoderAndAssignOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("PriorBoxVar"), + "Input(PriorBoxVar) of BoxDecoderAndAssignOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("TargetBox"), + "Input(TargetBox) of BoxDecoderAndAssignOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("BoxScore"), + "Input(BoxScore) of BoxDecoderAndAssignOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("DecodeBox"), + "Output(DecodeBox) of BoxDecoderAndAssignOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("OutputAssignBox"), + "Output(OutputAssignBox) of BoxDecoderAndAssignOp should not be null."); + + auto prior_box_dims = ctx->GetInputDim("PriorBox"); + auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar"); + auto target_box_dims = ctx->GetInputDim("TargetBox"); + auto box_score_dims = ctx->GetInputDim("BoxScore"); + + PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2, + "The rank of Input of PriorBox must be 2"); + PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]"); + PADDLE_ENFORCE_EQ(prior_box_var_dims.size(), 1, + "The rank of Input of PriorBoxVar must be 1"); + PADDLE_ENFORCE_EQ(prior_box_var_dims[0], 4, + "The shape of PriorBoxVar is [4]"); + PADDLE_ENFORCE_EQ(target_box_dims.size(), 2, + "The rank of Input of TargetBox must be 2"); + PADDLE_ENFORCE_EQ(box_score_dims.size(), 2, + "The rank of Input of BoxScore must be 2"); + PADDLE_ENFORCE_EQ(prior_box_dims[0], target_box_dims[0], + "The first dim of prior_box and target_box is roi nums " + "and should be same!"); + PADDLE_ENFORCE_EQ(prior_box_dims[0], box_score_dims[0], + "The first dim of prior_box and box_score is roi nums " + "and should be same!"); + PADDLE_ENFORCE_EQ(target_box_dims[1], box_score_dims[1] * prior_box_dims[1], + "The shape of target_box is [N, classnum * 4], The shape " + "of box_score is [N, classnum], The shape of prior_box " + "is [N, 4]"); + + ctx->SetOutputDim("DecodeBox", framework::make_ddim({target_box_dims[0], + target_box_dims[1]})); + ctx->ShareLoD("TargetBox", /*->*/ "DecodeBox"); + ctx->SetOutputDim( + "OutputAssignBox", + framework::make_ddim({prior_box_dims[0], prior_box_dims[1]})); + ctx->ShareLoD("PriorBox", /*->*/ "OutputAssignBox"); + } +}; + +class BoxDecoderAndAssignOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "PriorBox", + "(Tensor, default Tensor) " + "Box list PriorBox is a 2-D Tensor with shape [N, 4] which holds N " + "boxes and each box is represented as [xmin, ymin, xmax, ymax], " + "[xmin, ymin] is the left top coordinate of the anchor box, " + "if the input is image feature map, they are close to the origin " + "of the coordinate system. [xmax, ymax] is the right bottom " + "coordinate of the anchor box."); + AddInput("PriorBoxVar", + "(Tensor, default Tensor, optional) " + "PriorBoxVar is a 2-D Tensor with shape [N, 4] which holds N " + "group of variance. PriorBoxVar will set all elements to 1 by " + "default.") + .AsDispensable(); + AddInput("TargetBox", + "(LoDTensor or Tensor) " + "This input can be a 2-D LoDTensor with shape " + "[N, classnum*4]. It holds N targets for N boxes."); + AddInput("BoxScore", + "(LoDTensor or Tensor) " + "This input can be a 2-D LoDTensor with shape " + "[N, classnum], each box is represented as [classnum] which is " + "the classification probabilities."); + AddAttr("box_clip", + "(float, default 4.135, np.log(1000. / 16.)) " + "clip box to prevent overflowing") + .SetDefault(4.135f); + AddOutput("DecodeBox", + "(LoDTensor or Tensor) " + "the output tensor of op with shape [N, classnum * 4] " + "representing the result of N target boxes decoded with " + "M Prior boxes and variances for each class."); + AddOutput("OutputAssignBox", + "(LoDTensor or Tensor) " + "the output tensor of op with shape [N, 4] " + "representing the result of N target boxes decoded with " + "M Prior boxes and variances with the best non-background class " + "by BoxScore."); + AddComment(R"DOC( + +Bounding Box Coder. + +Decode the target bounding box with the prior_box information. + +The Decoding schema is described below: + + $$ + ox = (pw \\times pxv \\times tx + px) - \\frac{tw}{2} + $$ + $$ + oy = (ph \\times pyv \\times ty + py) - \\frac{th}{2} + $$ + $$ + ow = \\exp (pwv \\times tw) \\times pw + \\frac{tw}{2} + $$ + $$ + oh = \\exp (phv \\times th) \\times ph + \\frac{th}{2} + $$ + +where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, width +and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the +prior_box's (anchor) center coordinates, width and height. `pxv`, `pyv`, `pwv`, +`phv` denote the variance of the prior_box and `ox`, `oy`, `ow`, `oh` denote the +decoded coordinates, width and height in decode_box. + +decode_box is obtained after box decode, then assigning schema is described below: + +For each prior_box, use the best non-background class's decoded values to +update the prior_box locations and get output_assign_box. So, the shape of +output_assign_box is the same as PriorBox. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(box_decoder_and_assign, ops::BoxDecoderAndAssignOp, + ops::BoxDecoderAndAssignOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL( + box_decoder_and_assign, + ops::BoxDecoderAndAssignKernel, + ops::BoxDecoderAndAssignKernel); diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu new file mode 100644 index 00000000..25e6545e --- /dev/null +++ b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu @@ -0,0 +1,147 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/operators/detection/box_decoder_and_assign_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace paddle { +namespace operators { + +template +__global__ void DecodeBoxKernel(const T* prior_box_data, + const T* prior_box_var_data, + const T* target_box_data, const int roi_num, + const int class_num, const T box_clip, + T* output_box_data) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < roi_num * class_num) { + int i = idx / class_num; + int j = idx % class_num; + T prior_box_width = prior_box_data[i * 4 + 2] - prior_box_data[i * 4] + 1; + T prior_box_height = + prior_box_data[i * 4 + 3] - prior_box_data[i * 4 + 1] + 1; + T prior_box_center_x = prior_box_data[i * 4] + prior_box_width / 2; + T prior_box_center_y = prior_box_data[i * 4 + 1] + prior_box_height / 2; + + int offset = i * class_num * 4 + j * 4; + T dw = prior_box_var_data[2] * target_box_data[offset + 2]; + T dh = prior_box_var_data[3] * target_box_data[offset + 3]; + if (dw > box_clip) { + dw = box_clip; + } + if (dh > box_clip) { + dh = box_clip; + } + T target_box_center_x = 0, target_box_center_y = 0; + T target_box_width = 0, target_box_height = 0; + target_box_center_x = + prior_box_var_data[0] * target_box_data[offset] * prior_box_width + + prior_box_center_x; + target_box_center_y = + prior_box_var_data[1] * target_box_data[offset + 1] * prior_box_height + + prior_box_center_y; + target_box_width = expf(dw) * prior_box_width; + target_box_height = expf(dh) * prior_box_height; + + output_box_data[offset] = target_box_center_x - target_box_width / 2; + output_box_data[offset + 1] = target_box_center_y - target_box_height / 2; + output_box_data[offset + 2] = + target_box_center_x + target_box_width / 2 - 1; + output_box_data[offset + 3] = + target_box_center_y + target_box_height / 2 - 1; + } +} + +template +__global__ void AssignBoxKernel(const T* prior_box_data, + const T* box_score_data, T* output_box_data, + const int roi_num, const int class_num, + T* output_assign_box_data) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < roi_num) { + int i = idx; + T max_score = -1; + int max_j = -1; + for (int j = 0; j < class_num; ++j) { + T score = box_score_data[i * class_num + j]; + if (score > max_score && j > 0) { + max_score = score; + max_j = j; + } + } + if (max_j > 0) { + for (int pno = 0; pno < 4; pno++) { + output_assign_box_data[i * 4 + pno] = + output_box_data[i * class_num * 4 + max_j * 4 + pno]; + } + } else { + for (int pno = 0; pno < 4; pno++) { + output_assign_box_data[i * 4 + pno] = prior_box_data[i * 4 + pno]; + } + } + } +} + +template +class BoxDecoderAndAssignCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()), + "This kernel only runs on GPU device."); + auto* prior_box = context.Input("PriorBox"); + auto* prior_box_var = context.Input("PriorBoxVar"); + auto* target_box = context.Input("TargetBox"); + auto* box_score = context.Input("BoxScore"); + auto* output_box = context.Output("DecodeBox"); + auto* output_assign_box = + context.Output("OutputAssignBox"); + + auto roi_num = target_box->dims()[0]; + auto class_num = box_score->dims()[1]; + auto* target_box_data = target_box->data(); + auto* prior_box_data = prior_box->data(); + auto* prior_box_var_data = prior_box_var->data(); + auto* box_score_data = box_score->data(); + output_box->mutable_data({roi_num, class_num * 4}, context.GetPlace()); + output_assign_box->mutable_data({roi_num, 4}, context.GetPlace()); + T* output_box_data = output_box->data(); + T* output_assign_box_data = output_assign_box->data(); + + int block = 512; + int grid = (roi_num * class_num + block - 1) / block; + auto& device_ctx = context.cuda_device_context(); + + const T box_clip = context.Attr("box_clip"); + + DecodeBoxKernel<<>>( + prior_box_data, prior_box_var_data, target_box_data, roi_num, class_num, + box_clip, output_box_data); + + context.device_context().Wait(); + int assign_grid = (roi_num + block - 1) / block; + AssignBoxKernel<<>>( + prior_box_data, box_score_data, output_box_data, roi_num, class_num, + output_assign_box_data); + context.device_context().Wait(); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + box_decoder_and_assign, + ops::BoxDecoderAndAssignCUDAKernel, + ops::BoxDecoderAndAssignCUDAKernel); diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.h b/paddle/fluid/operators/detection/box_decoder_and_assign_op.h new file mode 100644 index 00000000..e66a8351 --- /dev/null +++ b/paddle/fluid/operators/detection/box_decoder_and_assign_op.h @@ -0,0 +1,103 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +class BoxDecoderAndAssignKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* prior_box = context.Input("PriorBox"); + auto* prior_box_var = context.Input("PriorBoxVar"); + auto* target_box = context.Input("TargetBox"); + auto* box_score = context.Input("BoxScore"); + auto* output_box = context.Output("DecodeBox"); + auto* output_assign_box = + context.Output("OutputAssignBox"); + int roi_num = target_box->dims()[0]; + int class_num = box_score->dims()[1]; + auto* target_box_data = target_box->data(); + auto* prior_box_data = prior_box->data(); + auto* prior_box_var_data = prior_box_var->data(); + auto* box_score_data = box_score->data(); + output_box->mutable_data({roi_num, class_num * 4}, context.GetPlace()); + output_assign_box->mutable_data({roi_num, 4}, context.GetPlace()); + T* output_box_data = output_box->data(); + T* output_assign_box_data = output_assign_box->data(); + const T bbox_clip = context.Attr("box_clip"); + + for (int i = 0; i < roi_num; ++i) { + T prior_box_width = prior_box_data[i * 4 + 2] - prior_box_data[i * 4] + 1; + T prior_box_height = + prior_box_data[i * 4 + 3] - prior_box_data[i * 4 + 1] + 1; + T prior_box_center_x = prior_box_data[i * 4] + prior_box_width / 2; + T prior_box_center_y = prior_box_data[i * 4 + 1] + prior_box_height / 2; + for (int j = 0; j < class_num; ++j) { + int64_t offset = i * class_num * 4 + j * 4; + T dw = std::min(prior_box_var_data[2] * target_box_data[offset + 2], + bbox_clip); + T dh = std::min(prior_box_var_data[3] * target_box_data[offset + 3], + bbox_clip); + T target_box_center_x = 0, target_box_center_y = 0; + T target_box_width = 0, target_box_height = 0; + target_box_center_x = + prior_box_var_data[0] * target_box_data[offset] * prior_box_width + + prior_box_center_x; + target_box_center_y = prior_box_var_data[1] * + target_box_data[offset + 1] * + prior_box_height + + prior_box_center_y; + target_box_width = std::exp(dw) * prior_box_width; + target_box_height = std::exp(dh) * prior_box_height; + + output_box_data[offset] = target_box_center_x - target_box_width / 2; + output_box_data[offset + 1] = + target_box_center_y - target_box_height / 2; + output_box_data[offset + 2] = + target_box_center_x + target_box_width / 2 - 1; + output_box_data[offset + 3] = + target_box_center_y + target_box_height / 2 - 1; + } + + T max_score = -1; + int max_j = -1; + for (int j = 0; j < class_num; ++j) { + T score = box_score_data[i * class_num + j]; + if (score > max_score && j > 0) { + max_score = score; + max_j = j; + } + } + + if (max_j > 0) { + for (int pno = 0; pno < 4; pno++) { + output_assign_box_data[i * 4 + pno] = + output_box_data[i * class_num * 4 + max_j * 4 + pno]; + } + } else { + for (int pno = 0; pno < 4; pno++) { + output_assign_box_data[i * 4 + pno] = prior_box_data[i * 4 + pno]; + } + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc new file mode 100644 index 00000000..06030728 --- /dev/null +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cc @@ -0,0 +1,108 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.*/ + +#include "paddle/fluid/operators/detection/collect_fpn_proposals_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +class CollectFpnProposalsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInputs("MultiLevelRois"), + "Inputs(MultiLevelRois) shouldn't be null"); + PADDLE_ENFORCE(context->HasInputs("MultiLevelScores"), + "Inputs(MultiLevelScores) shouldn't be null"); + PADDLE_ENFORCE(context->HasOutput("FpnRois"), + "Outputs(MultiFpnRois) of DistributeOp should not be null"); + auto roi_dims = context->GetInputsDim("MultiLevelRois"); + auto score_dims = context->GetInputsDim("MultiLevelScores"); + auto post_nms_topN = context->Attrs().Get("post_nms_topN"); + std::vector out_dims; + for (auto &roi_dim : roi_dims) { + PADDLE_ENFORCE_EQ(roi_dim[1], 4, + "Second dimension of Input(MultiLevelRois) must be 4"); + } + for (auto &score_dim : score_dims) { + PADDLE_ENFORCE_EQ( + score_dim[1], 1, + "Second dimension of Input(MultiLevelScores) must be 1"); + } + context->SetOutputDim("FpnRois", {post_nms_topN, 4}); + if (!context->IsRuntime()) { // Runtime LoD infershape will be computed + // in Kernel. + context->ShareLoD("MultiLevelRois", "FpnRois"); + } + if (context->IsRuntime()) { + std::vector roi_inputs = + context->GetInputVarPtrs("MultiLevelRois"); + std::vector score_inputs = + context->GetInputVarPtrs("MultiLevelScores"); + for (size_t i = 0; i < roi_inputs.size(); ++i) { + framework::Variable *roi_var = + boost::get(roi_inputs[i]); + framework::Variable *score_var = + boost::get(score_inputs[i]); + auto &roi_lod = roi_var->Get().lod(); + auto &score_lod = score_var->Get().lod(); + PADDLE_ENFORCE_EQ(roi_lod, score_lod, + "Inputs(MultiLevelRois) and Inputs(MultiLevelScores) " + "should have same lod."); + } + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto data_type = + framework::GetDataTypeOfVar(ctx.MultiInputVar("MultiLevelRois")[0]); + return framework::OpKernelType(data_type, ctx.GetPlace()); + } +}; + +class CollectFpnProposalsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("MultiLevelRois", + "(LoDTensor) Multiple roi LoDTensors from each level in shape " + "(N, 4), N is the number of RoIs") + .AsDuplicable(); + AddInput("MultiLevelScores", + "(LoDTensor) Multiple score LoDTensors from each level in shape" + " (N, 1), N is the number of RoIs.") + .AsDuplicable(); + AddOutput("FpnRois", "(LoDTensor) All selected RoIs with highest scores"); + AddAttr("post_nms_topN", + "Select post_nms_topN RoIs from" + " all images and all fpn layers"); + AddComment(R"DOC( +This operator concats all proposals from different images + and different FPN levels. Then sort all of those proposals +by objectness confidence. Select the post_nms_topN RoIs in + total. Finally, re-sort the RoIs in the order of batch index. +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(collect_fpn_proposals, ops::CollectFpnProposalsOp, + ops::CollectFpnProposalsOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(collect_fpn_proposals, + ops::CollectFpnProposalsOpKernel, + ops::CollectFpnProposalsOpKernel); diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu new file mode 100644 index 00000000..ba0b4ac0 --- /dev/null +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -0,0 +1,209 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "cub/cub.cuh" +#include "paddle/fluid/framework/mixed_vector.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/operators/detection/bbox_util.h" +#include "paddle/fluid/operators/detection/collect_fpn_proposals_op.h" +#include "paddle/fluid/operators/gather.cu.h" +#include "paddle/fluid/operators/math/concat_and_split.h" +#include "paddle/fluid/operators/strided_memcpy.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +static constexpr int kNumCUDAThreads = 64; +static constexpr int kNumMaxinumNumBlocks = 4096; + +const int kBBoxSize = 4; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +static __global__ void GetLengthLoD(const int nthreads, const int* batch_ids, + int* length_lod) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (nthreads); + i += blockDim.x * gridDim.x) { + platform::CudaAtomicAdd(length_lod + batch_ids[i], 1); + } +} + +template +class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const auto roi_ins = ctx.MultiInput("MultiLevelRois"); + const auto score_ins = ctx.MultiInput("MultiLevelScores"); + auto fpn_rois = ctx.Output("FpnRois"); + auto& dev_ctx = ctx.template device_context(); + + const int post_nms_topN = ctx.Attr("post_nms_topN"); + + // concat inputs along axis = 0 + int roi_offset = 0; + int score_offset = 0; + int total_roi_num = 0; + for (size_t i = 0; i < roi_ins.size(); ++i) { + total_roi_num += roi_ins[i]->dims()[0]; + } + + int real_post_num = min(post_nms_topN, total_roi_num); + fpn_rois->mutable_data({real_post_num, kBBoxSize}, dev_ctx.GetPlace()); + Tensor concat_rois; + Tensor concat_scores; + T* concat_rois_data = concat_rois.mutable_data( + {total_roi_num, kBBoxSize}, dev_ctx.GetPlace()); + T* concat_scores_data = + concat_scores.mutable_data({total_roi_num, 1}, dev_ctx.GetPlace()); + Tensor roi_batch_id_list; + roi_batch_id_list.Resize({total_roi_num}); + int* roi_batch_id_data = + roi_batch_id_list.mutable_data(platform::CPUPlace()); + int index = 0; + int lod_size; + auto place = boost::get(dev_ctx.GetPlace()); + + for (size_t i = 0; i < roi_ins.size(); ++i) { + auto roi_in = roi_ins[i]; + auto score_in = score_ins[i]; + auto roi_lod = roi_in->lod().back(); + lod_size = roi_lod.size() - 1; + for (size_t n = 0; n < lod_size; ++n) { + for (size_t j = roi_lod[n]; j < roi_lod[n + 1]; ++j) { + roi_batch_id_data[index++] = n; + } + } + + memory::Copy(place, concat_rois_data + roi_offset, place, + roi_in->data(), roi_in->numel() * sizeof(T), + dev_ctx.stream()); + memory::Copy(place, concat_scores_data + score_offset, place, + score_in->data(), score_in->numel() * sizeof(T), + dev_ctx.stream()); + roi_offset += roi_in->numel(); + score_offset += score_in->numel(); + } + + // copy batch id list to GPU + Tensor roi_batch_id_list_gpu; + framework::TensorCopy(roi_batch_id_list, dev_ctx.GetPlace(), + &roi_batch_id_list_gpu); + + Tensor index_in_t; + int* idx_in = + index_in_t.mutable_data({total_roi_num}, dev_ctx.GetPlace()); + platform::ForRange for_range_total( + dev_ctx, total_roi_num); + for_range_total(RangeInitFunctor{0, 1, idx_in}); + + Tensor keys_out_t; + T* keys_out = + keys_out_t.mutable_data({total_roi_num}, dev_ctx.GetPlace()); + Tensor index_out_t; + int* idx_out = + index_out_t.mutable_data({total_roi_num}, dev_ctx.GetPlace()); + + // Determine temporary device storage requirements + size_t temp_storage_bytes = 0; + cub::DeviceRadixSort::SortPairsDescending( + nullptr, temp_storage_bytes, concat_scores.data(), keys_out, idx_in, + idx_out, total_roi_num); + // Allocate temporary storage + auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); + + // Run sorting operation + // sort score to get corresponding index + cub::DeviceRadixSort::SortPairsDescending( + d_temp_storage->ptr(), temp_storage_bytes, concat_scores.data(), + keys_out, idx_in, idx_out, total_roi_num); + index_out_t.Resize({real_post_num}); + Tensor sorted_rois; + sorted_rois.mutable_data({real_post_num, kBBoxSize}, dev_ctx.GetPlace()); + Tensor sorted_batch_id; + sorted_batch_id.mutable_data({real_post_num}, dev_ctx.GetPlace()); + GPUGather(dev_ctx, concat_rois, index_out_t, &sorted_rois); + GPUGather(dev_ctx, roi_batch_id_list_gpu, index_out_t, + &sorted_batch_id); + + Tensor batch_index_t; + int* batch_idx_in = + batch_index_t.mutable_data({real_post_num}, dev_ctx.GetPlace()); + platform::ForRange for_range_post( + dev_ctx, real_post_num); + for_range_post(RangeInitFunctor{0, 1, batch_idx_in}); + + Tensor out_id_t; + int* out_id_data = + out_id_t.mutable_data({real_post_num}, dev_ctx.GetPlace()); + // Determine temporary device storage requirements + temp_storage_bytes = 0; + cub::DeviceRadixSort::SortPairs( + nullptr, temp_storage_bytes, sorted_batch_id.data(), out_id_data, + batch_idx_in, index_out_t.data(), real_post_num); + // Allocate temporary storage + d_temp_storage = memory::Alloc(place, temp_storage_bytes); + + // Run sorting operation + // sort batch_id to get corresponding index + cub::DeviceRadixSort::SortPairs( + d_temp_storage->ptr(), temp_storage_bytes, sorted_batch_id.data(), + out_id_data, batch_idx_in, index_out_t.data(), real_post_num); + + GPUGather(dev_ctx, sorted_rois, index_out_t, fpn_rois); + + Tensor length_lod; + int* length_lod_data = + length_lod.mutable_data({lod_size}, dev_ctx.GetPlace()); + math::SetConstant set_zero; + set_zero(dev_ctx, &length_lod, static_cast(0)); + + int blocks = NumBlocks(real_post_num); + int threads = kNumCUDAThreads; + + // get length-based lod by batch ids + GetLengthLoD<<>>(real_post_num, out_id_data, + length_lod_data); + std::vector length_lod_cpu(lod_size); + memory::Copy(platform::CPUPlace(), length_lod_cpu.data(), place, + length_lod_data, sizeof(int) * lod_size, dev_ctx.stream()); + dev_ctx.Wait(); + + std::vector offset(1, 0); + for (int i = 0; i < lod_size; ++i) { + offset.emplace_back(offset.back() + length_lod_cpu[i]); + } + + framework::LoD lod; + lod.emplace_back(offset); + fpn_rois->set_lod(lod); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + collect_fpn_proposals, + ops::GPUCollectFpnProposalsOpKernel, + ops::GPUCollectFpnProposalsOpKernel); diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h new file mode 100644 index 00000000..268f7e21 --- /dev/null +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h @@ -0,0 +1,149 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.*/ + +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/operators/gather.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +const int kBoxDim = 4; + +template +struct ScoreWithID { + T score; + int batch_id; + int index; + int level; + ScoreWithID() { + batch_id = -1; + index = -1; + level = -1; + } + ScoreWithID(T score_, int batch_id_, int index_, int level_) { + score = score_; + batch_id = batch_id_; + index = index_; + level = level_; + } +}; +template +static inline bool CompareByScore(ScoreWithID a, ScoreWithID b) { + return a.score >= b.score; +} + +template +static inline bool CompareByBatchid(ScoreWithID a, ScoreWithID b) { + return a.batch_id < b.batch_id; +} + +template +class CollectFpnProposalsOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto multi_layer_rois = + context.MultiInput("MultiLevelRois"); + + auto multi_layer_scores = + context.MultiInput("MultiLevelScores"); + + auto* fpn_rois = context.Output("FpnRois"); + + int post_nms_topN = context.Attr("post_nms_topN"); + + PADDLE_ENFORCE_GE(post_nms_topN, 0UL, + "The parameter post_nms_topN must be a positive integer"); + + // assert that the length of Rois and scores are same + PADDLE_ENFORCE(multi_layer_rois.size() == multi_layer_scores.size(), + "DistributeFpnProposalsOp need 1 level of LoD"); + // Check if the lod information of two LoDTensor is same + const int num_fpn_level = multi_layer_rois.size(); + std::vector integral_of_all_rois(num_fpn_level + 1, 0); + for (int i = 0; i < num_fpn_level; ++i) { + auto cur_rois_lod = multi_layer_rois[i]->lod().back(); + integral_of_all_rois[i + 1] = + integral_of_all_rois[i] + cur_rois_lod[cur_rois_lod.size() - 1]; + } + + // concatenate all fpn rois scores into a list + // create a vector to store all scores + std::vector> scores_of_all_rois( + integral_of_all_rois[num_fpn_level], ScoreWithID()); + for (int i = 0; i < num_fpn_level; ++i) { + const T* cur_level_scores = multi_layer_scores[i]->data(); + int cur_level_num = integral_of_all_rois[i + 1] - integral_of_all_rois[i]; + auto cur_scores_lod = multi_layer_scores[i]->lod().back(); + int cur_batch_id = 0; + for (int j = 0; j < cur_level_num; ++j) { + if (j >= cur_scores_lod[cur_batch_id + 1]) { + cur_batch_id++; + } + int cur_index = j + integral_of_all_rois[i]; + scores_of_all_rois[cur_index].score = cur_level_scores[j]; + scores_of_all_rois[cur_index].index = j; + scores_of_all_rois[cur_index].level = i; + scores_of_all_rois[cur_index].batch_id = cur_batch_id; + } + } + // keep top post_nms_topN rois + // sort the rois by the score + if (post_nms_topN > integral_of_all_rois[num_fpn_level]) { + post_nms_topN = integral_of_all_rois[num_fpn_level]; + } + std::stable_sort(scores_of_all_rois.begin(), scores_of_all_rois.end(), + CompareByScore); + scores_of_all_rois.resize(post_nms_topN); + // sort by batch id + std::stable_sort(scores_of_all_rois.begin(), scores_of_all_rois.end(), + CompareByBatchid); + // create a pointer array + std::vector multi_fpn_rois_data(num_fpn_level); + for (int i = 0; i < num_fpn_level; ++i) { + multi_fpn_rois_data[i] = multi_layer_rois[i]->data(); + } + // initialize the outputs + fpn_rois->mutable_data({post_nms_topN, kBoxDim}, context.GetPlace()); + T* fpn_rois_data = fpn_rois->data(); + std::vector lod0(1, 0); + int cur_batch_id = 0; + for (int i = 0; i < post_nms_topN; ++i) { + int cur_fpn_level = scores_of_all_rois[i].level; + int cur_level_index = scores_of_all_rois[i].index; + memcpy(fpn_rois_data, + multi_fpn_rois_data[cur_fpn_level] + cur_level_index * kBoxDim, + kBoxDim * sizeof(T)); + fpn_rois_data += kBoxDim; + if (scores_of_all_rois[i].batch_id != cur_batch_id) { + cur_batch_id = scores_of_all_rois[i].batch_id; + lod0.emplace_back(i); + } + } + lod0.emplace_back(post_nms_topN); + framework::LoD lod; + lod.emplace_back(lod0); + fpn_rois->set_lod(lod); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cc b/paddle/fluid/operators/detection/density_prior_box_op.cc new file mode 100644 index 00000000..cacd47ed --- /dev/null +++ b/paddle/fluid/operators/detection/density_prior_box_op.cc @@ -0,0 +1,180 @@ +/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/density_prior_box_op.h" + +namespace paddle { +namespace operators { + +class DensityPriorBoxOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of DensityPriorBoxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Image"), + "Input(Image) of DensityPriorBoxOp should not be null."); + + auto image_dims = ctx->GetInputDim("Image"); + auto input_dims = ctx->GetInputDim("Input"); + PADDLE_ENFORCE(image_dims.size() == 4, "The layout of image is NCHW."); + PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW."); + + PADDLE_ENFORCE_LT(input_dims[2], image_dims[2], + "The height of input must smaller than image."); + + PADDLE_ENFORCE_LT(input_dims[3], image_dims[3], + "The width of input must smaller than image."); + auto variances = ctx->Attrs().Get>("variances"); + + auto fixed_sizes = ctx->Attrs().Get>("fixed_sizes"); + auto fixed_ratios = ctx->Attrs().Get>("fixed_ratios"); + auto densities = ctx->Attrs().Get>("densities"); + bool flatten = ctx->Attrs().Get("flatten_to_2d"); + + PADDLE_ENFORCE_EQ(fixed_sizes.size(), densities.size(), + "The number of fixed_sizes and densities must be equal."); + size_t num_priors = 0; + for (size_t i = 0; i < densities.size(); ++i) { + num_priors += (fixed_ratios.size()) * (pow(densities[i], 2)); + } + if (!flatten) { + std::vector dim_vec(4); + dim_vec[0] = input_dims[2]; + dim_vec[1] = input_dims[3]; + dim_vec[2] = num_priors; + dim_vec[3] = 4; + ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec)); + ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec)); + } else { + int64_t dim0 = input_dims[2] * input_dims[3] * num_priors; + ctx->SetOutputDim("Boxes", {dim0, 4}); + ctx->SetOutputDim("Variances", {dim0, 4}); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + ctx.Input("Input")->type(), ctx.GetPlace()); + } +}; + +class DensityPriorBoxOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "Input", + "(Tensor, default Tensor), " + "the input feature data of DensityPriorBoxOp, the layout is NCHW."); + AddInput("Image", + "(Tensor, default Tensor), " + "the input image data of DensityPriorBoxOp, the layout is NCHW."); + AddOutput("Boxes", + "(Tensor, default Tensor), the output prior boxes of " + "DensityPriorBoxOp. The layout is [H, W, num_priors, 4]. " + "H is the height of input, W is the width of input, num_priors " + "is the box count of each position."); + AddOutput("Variances", + "(Tensor, default Tensor), the expanded variances of " + "DensityPriorBoxOp. The layout is [H, W, num_priors, 4]. " + "H is the height of input, W is the width of input, num_priors " + "is the box count of each position."); + AddAttr>("variances", + "(vector) List of variances to be " + "encoded in density prior boxes.") + .AddCustomChecker([](const std::vector& variances) { + PADDLE_ENFORCE_EQ(variances.size(), 4, + "Must and only provide 4 variance."); + for (size_t i = 0; i < variances.size(); ++i) { + PADDLE_ENFORCE_GT(variances[i], 0.0, + "variance[%d] must be greater than 0.", i); + } + }); + AddAttr("clip", "(bool) Whether to clip out-of-boundary boxes.") + .SetDefault(true); + AddAttr("flatten_to_2d", + "(bool) Whether to flatten to 2D and " + "the second dim is 4.") + .SetDefault(false); + AddAttr( + "step_w", + "Density prior boxes step across width, 0.0 for auto calculation.") + .SetDefault(0.0) + .AddCustomChecker([](const float& step_w) { + PADDLE_ENFORCE_GE(step_w, 0.0, "step_w should be larger than 0."); + }); + AddAttr( + "step_h", + "Density prior boxes step across height, 0.0 for auto calculation.") + .SetDefault(0.0) + .AddCustomChecker([](const float& step_h) { + PADDLE_ENFORCE_GE(step_h, 0.0, "step_h should be larger than 0."); + }); + + AddAttr("offset", + "(float) " + "Density prior boxes center offset.") + .SetDefault(0.5); + AddAttr>("fixed_sizes", + "(vector) List of fixed sizes " + "of generated density prior boxes.") + .SetDefault(std::vector{}) + .AddCustomChecker([](const std::vector& fixed_sizes) { + for (size_t i = 0; i < fixed_sizes.size(); ++i) { + PADDLE_ENFORCE_GT(fixed_sizes[i], 0.0, + "fixed_sizes[%d] should be larger than 0.", i); + } + }); + + AddAttr>("fixed_ratios", + "(vector) List of fixed ratios " + "of generated density prior boxes.") + .SetDefault(std::vector{}) + .AddCustomChecker([](const std::vector& fixed_ratios) { + for (size_t i = 0; i < fixed_ratios.size(); ++i) { + PADDLE_ENFORCE_GT(fixed_ratios[i], 0.0, + "fixed_ratios[%d] should be larger than 0.", i); + } + }); + + AddAttr>("densities", + "(vector) List of densities " + "of generated density prior boxes.") + .SetDefault(std::vector{}) + .AddCustomChecker([](const std::vector& densities) { + for (size_t i = 0; i < densities.size(); ++i) { + PADDLE_ENFORCE_GT(densities[i], 0, + "densities[%d] should be larger than 0.", i); + } + }); + AddComment(R"DOC( + Density Prior box operator + Each position of the input produce N density prior boxes, N is determined by + the count of fixed_ratios, densities, the calculation of N is as follows: + for density in densities: + N += size(fixed_ratios)*density^2 + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(density_prior_box, ops::DensityPriorBoxOp, + ops::DensityPriorBoxOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL(density_prior_box, ops::DensityPriorBoxOpKernel, + ops::DensityPriorBoxOpKernel); diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cu b/paddle/fluid/operators/detection/density_prior_box_op.cu new file mode 100644 index 00000000..6337a483 --- /dev/null +++ b/paddle/fluid/operators/detection/density_prior_box_op.cu @@ -0,0 +1,172 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/density_prior_box_op.h" + +namespace paddle { +namespace operators { + +template +static __device__ inline T Clip(T in) { + return min(max(in, 0.), 1.); +} + +template +static __global__ void GenDensityPriorBox( + const int height, const int width, const int im_height, const int im_width, + const T offset, const T step_width, const T step_height, + const int num_priors, const T* ratios_shift, bool is_clip, const T var_xmin, + const T var_ymin, const T var_xmax, const T var_ymax, T* out, T* var) { + int gidx = blockIdx.x * blockDim.x + threadIdx.x; + int gidy = blockIdx.y * blockDim.y + threadIdx.y; + int step_x = blockDim.x * gridDim.x; + int step_y = blockDim.y * gridDim.y; + + const T* width_ratio = ratios_shift; + const T* height_ratio = ratios_shift + num_priors; + const T* width_shift = ratios_shift + 2 * num_priors; + const T* height_shift = ratios_shift + 3 * num_priors; + + for (int j = gidy; j < height; j += step_y) { + for (int i = gidx; i < width * num_priors; i += step_x) { + int h = j; + int w = i / num_priors; + int k = i % num_priors; + + T center_x = (w + offset) * step_width; + T center_y = (h + offset) * step_height; + + T center_x_temp = center_x + width_shift[k]; + T center_y_temp = center_y + height_shift[k]; + + T box_width_ratio = width_ratio[k] / 2.; + T box_height_ratio = height_ratio[k] / 2.; + + T xmin = max((center_x_temp - box_width_ratio) / im_width, 0.); + T ymin = max((center_y_temp - box_height_ratio) / im_height, 0.); + T xmax = min((center_x_temp + box_width_ratio) / im_width, 1.); + T ymax = min((center_y_temp + box_height_ratio) / im_height, 1.); + + int out_offset = (j * width * num_priors + i) * 4; + out[out_offset] = is_clip ? Clip(xmin) : xmin; + out[out_offset + 1] = is_clip ? Clip(ymin) : ymin; + out[out_offset + 2] = is_clip ? Clip(xmax) : xmax; + out[out_offset + 3] = is_clip ? Clip(ymax) : ymax; + + var[out_offset] = var_xmin; + var[out_offset + 1] = var_ymin; + var[out_offset + 2] = var_xmax; + var[out_offset + 3] = var_ymax; + } + } +} + +template +class DensityPriorBoxOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* image = ctx.Input("Image"); + auto* boxes = ctx.Output("Boxes"); + auto* vars = ctx.Output("Variances"); + + auto variances = ctx.Attr>("variances"); + auto is_clip = ctx.Attr("clip"); + + auto fixed_sizes = ctx.Attr>("fixed_sizes"); + auto fixed_ratios = ctx.Attr>("fixed_ratios"); + auto densities = ctx.Attr>("densities"); + + T step_w = static_cast(ctx.Attr("step_w")); + T step_h = static_cast(ctx.Attr("step_h")); + T offset = static_cast(ctx.Attr("offset")); + + auto img_width = image->dims()[3]; + auto img_height = image->dims()[2]; + + auto feature_width = input->dims()[3]; + auto feature_height = input->dims()[2]; + + T step_width, step_height; + if (step_w == 0 || step_h == 0) { + step_width = static_cast(img_width) / feature_width; + step_height = static_cast(img_height) / feature_height; + } else { + step_width = step_w; + step_height = step_h; + } + + int num_priors = 0; + for (size_t i = 0; i < densities.size(); ++i) { + num_priors += (fixed_ratios.size()) * (pow(densities[i], 2)); + } + int step_average = static_cast((step_width + step_height) * 0.5); + + framework::Tensor h_temp; + T* tdata = h_temp.mutable_data({num_priors * 4}, platform::CPUPlace()); + int idx = 0; + for (size_t s = 0; s < fixed_sizes.size(); ++s) { + auto fixed_size = fixed_sizes[s]; + int density = densities[s]; + for (size_t r = 0; r < fixed_ratios.size(); ++r) { + float ar = fixed_ratios[r]; + int shift = step_average / density; + float box_width_ratio = fixed_size * sqrt(ar); + float box_height_ratio = fixed_size / sqrt(ar); + for (int di = 0; di < density; ++di) { + for (int dj = 0; dj < density; ++dj) { + float center_x_temp = shift / 2. + dj * shift - step_average / 2.; + float center_y_temp = shift / 2. + di * shift - step_average / 2.; + tdata[idx] = box_width_ratio; + tdata[num_priors + idx] = box_height_ratio; + tdata[2 * num_priors + idx] = center_x_temp; + tdata[3 * num_priors + idx] = center_y_temp; + idx++; + } + } + } + } + + boxes->mutable_data(ctx.GetPlace()); + vars->mutable_data(ctx.GetPlace()); + + framework::Tensor d_temp; + framework::TensorCopy(h_temp, ctx.GetPlace(), &d_temp); + + // At least use 32 threads, at most 512 threads. + // blockx is multiple of 32. + int blockx = std::min( + static_cast(((feature_width * num_priors + 31) >> 5) << 5), + static_cast(512L)); + int gridx = (feature_width * num_priors + blockx - 1) / blockx; + dim3 threads(blockx, 1); + dim3 grids(gridx, feature_height); + + auto stream = + ctx.template device_context().stream(); + GenDensityPriorBox<<>>( + feature_height, feature_width, img_height, img_width, offset, + step_width, step_height, num_priors, d_temp.data(), is_clip, + variances[0], variances[1], variances[2], variances[3], + boxes->data(), vars->data()); + } +}; // namespace operators + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(density_prior_box, + ops::DensityPriorBoxOpCUDAKernel, + ops::DensityPriorBoxOpCUDAKernel); diff --git a/paddle/fluid/operators/detection/density_prior_box_op.h b/paddle/fluid/operators/detection/density_prior_box_op.h new file mode 100644 index 00000000..42137215 --- /dev/null +++ b/paddle/fluid/operators/detection/density_prior_box_op.h @@ -0,0 +1,154 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/operators/detection/prior_box_op.h" + +namespace paddle { +namespace operators { + +template +class DensityPriorBoxOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* image = ctx.Input("Image"); + auto* boxes = ctx.Output("Boxes"); + auto* vars = ctx.Output("Variances"); + + auto variances = ctx.Attr>("variances"); + auto clip = ctx.Attr("clip"); + + auto fixed_sizes = ctx.Attr>("fixed_sizes"); + auto fixed_ratios = ctx.Attr>("fixed_ratios"); + auto densities = ctx.Attr>("densities"); + + T step_w = static_cast(ctx.Attr("step_w")); + T step_h = static_cast(ctx.Attr("step_h")); + T offset = static_cast(ctx.Attr("offset")); + + auto img_width = image->dims()[3]; + auto img_height = image->dims()[2]; + + auto feature_width = input->dims()[3]; + auto feature_height = input->dims()[2]; + + T step_width, step_height; + if (step_w == 0 || step_h == 0) { + step_width = static_cast(img_width) / feature_width; + step_height = static_cast(img_height) / feature_height; + } else { + step_width = step_w; + step_height = step_h; + } + int num_priors = 0; + +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for reduction(+ : num_priors) +#endif + for (size_t i = 0; i < densities.size(); ++i) { + num_priors += (fixed_ratios.size()) * (pow(densities[i], 2)); + } + + boxes->mutable_data(ctx.GetPlace()); + vars->mutable_data(ctx.GetPlace()); + + auto box_dim = vars->dims(); + boxes->Resize({feature_height, feature_width, num_priors, 4}); + auto e_boxes = framework::EigenTensor::From(*boxes).setConstant(0.0); + int step_average = static_cast((step_width + step_height) * 0.5); + + std::vector sqrt_fixed_ratios; +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (size_t i = 0; i < fixed_ratios.size(); i++) { + sqrt_fixed_ratios.push_back(sqrt(fixed_ratios[i])); + } + +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for collapse(2) +#endif + for (int h = 0; h < feature_height; ++h) { + for (int w = 0; w < feature_width; ++w) { + T center_x = (w + offset) * step_width; + T center_y = (h + offset) * step_height; + int idx = 0; + // Generate density prior boxes with fixed sizes. + for (size_t s = 0; s < fixed_sizes.size(); ++s) { + auto fixed_size = fixed_sizes[s]; + int density = densities[s]; + int shift = step_average / density; + // Generate density prior boxes with fixed ratios. + for (size_t r = 0; r < fixed_ratios.size(); ++r) { + float box_width_ratio = fixed_size * sqrt_fixed_ratios[r]; + float box_height_ratio = fixed_size / sqrt_fixed_ratios[r]; + float density_center_x = center_x - step_average / 2. + shift / 2.; + float density_center_y = center_y - step_average / 2. + shift / 2.; + for (int di = 0; di < density; ++di) { + for (int dj = 0; dj < density; ++dj) { + float center_x_temp = density_center_x + dj * shift; + float center_y_temp = density_center_y + di * shift; + e_boxes(h, w, idx, 0) = std::max( + (center_x_temp - box_width_ratio / 2.) / img_width, 0.); + e_boxes(h, w, idx, 1) = std::max( + (center_y_temp - box_height_ratio / 2.) / img_height, 0.); + e_boxes(h, w, idx, 2) = std::min( + (center_x_temp + box_width_ratio / 2.) / img_width, 1.); + e_boxes(h, w, idx, 3) = std::min( + (center_y_temp + box_height_ratio / 2.) / img_height, 1.); + idx++; + } + } + } + } + } + } + if (clip) { + T* dt = boxes->data(); + std::transform(dt, dt + boxes->numel(), dt, [](T v) -> T { + return std::min(std::max(v, 0.), 1.); + }); + } + framework::Tensor var_t; + var_t.mutable_data( + framework::make_ddim({1, static_cast(variances.size())}), + ctx.GetPlace()); + + auto var_et = framework::EigenTensor::From(var_t); + + for (size_t i = 0; i < variances.size(); ++i) { + var_et(0, i) = variances[i]; + } + + int box_num = feature_height * feature_width * num_priors; + auto var_dim = vars->dims(); + vars->Resize({box_num, static_cast(variances.size())}); + + auto e_vars = framework::EigenMatrix::From(*vars); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for collapse(2) +#endif + for (int i = 0; i < box_num; ++i) { + for (size_t j = 0; j < variances.size(); ++j) { + e_vars(i, j) = variances[j]; + } + } + + vars->Resize(var_dim); + boxes->Resize(box_dim); + } +}; // namespace operators + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc new file mode 100644 index 00000000..4cc989b6 --- /dev/null +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc @@ -0,0 +1,93 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h" + +namespace paddle { +namespace operators { + +class DistributeFpnProposalsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("FpnRois"), + "Input(FpnRois) shouldn't be null"); + PADDLE_ENFORCE_GE( + ctx->Outputs("MultiFpnRois").size(), 1UL, + "Outputs(MultiFpnRois) of DistributeOp should not be empty"); + size_t min_level = static_cast(ctx->Attrs().Get("min_level")); + size_t max_level = static_cast(ctx->Attrs().Get("max_level")); + PADDLE_ENFORCE_GE(max_level, min_level, + "max_level must not lower than min_level"); + // Set the output shape + size_t num_out_rois = max_level - min_level + 1; + std::vector outs_dims; + outs_dims.reserve(num_out_rois); + for (size_t i = 0; i < num_out_rois; ++i) { + framework::DDim out_dim = {-1, 4}; + outs_dims.push_back(out_dim); + } + ctx->SetOutputsDim("MultiFpnRois", outs_dims); + ctx->SetOutputDim("RestoreIndex", {-1, 1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("FpnRois")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class DistributeFpnProposalsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("FpnRois", "(LoDTensor) The rois at all levels in shape (-1, 4)"); + AddOutput("MultiFpnRois", "(LoDTensor) Output with distribute operator") + .AsDuplicable(); + AddOutput("RestoreIndex", + "(Tensor) An array of positive number which is " + "used to restore the order of FpnRois"); + AddAttr("min_level", + "The lowest level of FPN layer where the" + " proposals come from"); + AddAttr("max_level", + "The highest level of FPN layer where the" + " proposals come from"); + AddAttr("refer_level", + "The referring level of FPN layer with" + " specified scale"); + AddAttr("refer_scale", + "The referring scale of FPN layer with" + " specified level"); + AddComment(R"DOC( +This operator distribute all proposals into different fpn level, + with respect to scale of the proposals, the referring scale and + the referring level. Besides, to restore the order of proposals, +we return an array which indicate the original index of rois in + current proposals. +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(distribute_fpn_proposals, ops::DistributeFpnProposalsOp, + ops::DistributeFpnProposalsOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(distribute_fpn_proposals, + ops::DistributeFpnProposalsOpKernel, + ops::DistributeFpnProposalsOpKernel); diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu new file mode 100644 index 00000000..f3486636 --- /dev/null +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu @@ -0,0 +1,201 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "cub/cub.cuh" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/operators/detection/bbox_util.h" +#include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h" +#include "paddle/fluid/operators/gather.cu.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +static constexpr int kNumCUDAThreads = 64; +static constexpr int kNumMaxinumNumBlocks = 4096; + +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +int const BBoxSize = 4; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +template +__global__ void GPUDistFpnProposalsHelper( + const int nthreads, const T* rois, const int lod_size, + const int refer_level, const int refer_scale, const int max_level, + const int min_level, int* roi_batch_id_data, int* sub_lod_list, + int* target_lvls) { + CUDA_1D_KERNEL_LOOP(i, nthreads) { + const T* offset_roi = rois + i * BBoxSize; + int roi_batch_ind = roi_batch_id_data[i]; + // get the target level of current rois + T roi_area = RoIArea(offset_roi, false); + T roi_scale = sqrt(roi_area); + int tgt_lvl = floor( + log2(roi_scale / static_cast(refer_scale) + (T)1e-6) + refer_level); + tgt_lvl = min(max_level, max(tgt_lvl, min_level)); + target_lvls[i] = tgt_lvl; + // compute number of rois in the same batch and same target level + platform::CudaAtomicAdd( + sub_lod_list + (tgt_lvl - min_level) * lod_size + roi_batch_ind, 1); + } +} + +template +class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* fpn_rois = ctx.Input("FpnRois"); + + auto multi_fpn_rois = ctx.MultiOutput("MultiFpnRois"); + auto* restore_index = ctx.Output("RestoreIndex"); + + const int min_level = ctx.Attr("min_level"); + const int max_level = ctx.Attr("max_level"); + const int refer_level = ctx.Attr("refer_level"); + const int refer_scale = ctx.Attr("refer_scale"); + int num_level = max_level - min_level + 1; + + // check that the fpn_rois is not empty + PADDLE_ENFORCE_EQ(fpn_rois->lod().size(), 1UL, + "DistributeFpnProposalsOp need 1 level of LoD"); + + auto fpn_rois_lod = fpn_rois->lod().back(); + int lod_size = fpn_rois_lod.size() - 1; + int roi_num = fpn_rois_lod[lod_size]; + + auto& dev_ctx = ctx.template device_context(); + + // get batch id by lod in CPU + Tensor roi_batch_id_list; + roi_batch_id_list.Resize({roi_num}); + int* roi_batch_id_data = + roi_batch_id_list.mutable_data(platform::CPUPlace()); + for (int n = 0; n < lod_size; ++n) { + for (size_t i = fpn_rois_lod[n]; i < fpn_rois_lod[n + 1]; ++i) { + roi_batch_id_data[i] = n; + } + } + // copy batch id list to GPU + Tensor roi_batch_id_list_gpu; + framework::TensorCopySync(roi_batch_id_list, dev_ctx.GetPlace(), + &roi_batch_id_list_gpu); + + Tensor sub_lod_list; + sub_lod_list.Resize({num_level, lod_size}); + int* sub_lod_list_data = sub_lod_list.mutable_data(dev_ctx.GetPlace()); + math::SetConstant set_zero; + set_zero(dev_ctx, &sub_lod_list, static_cast(0)); + + Tensor target_lvls; + target_lvls.Resize({roi_num}); + int* target_lvls_data = target_lvls.mutable_data(dev_ctx.GetPlace()); + + int dist_blocks = NumBlocks(roi_num); + int threads = kNumCUDAThreads; + // get target levels and sub_lod list + GPUDistFpnProposalsHelper<<>>( + roi_num, fpn_rois->data(), lod_size, refer_level, refer_scale, + max_level, min_level, roi_batch_id_list_gpu.data(), + sub_lod_list_data, target_lvls_data); + dev_ctx.Wait(); + auto place = boost::get(dev_ctx.GetPlace()); + + Tensor index_in_t; + int* idx_in = index_in_t.mutable_data({roi_num}, dev_ctx.GetPlace()); + platform::ForRange for_range(dev_ctx, roi_num); + for_range(RangeInitFunctor{0, 1, idx_in}); + + Tensor keys_out_t; + int* keys_out = keys_out_t.mutable_data({roi_num}, dev_ctx.GetPlace()); + Tensor index_out_t; + int* idx_out = index_out_t.mutable_data({roi_num}, dev_ctx.GetPlace()); + + // Determine temporary device storage requirements + size_t temp_storage_bytes = 0; + cub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, + target_lvls_data, keys_out, + idx_in, idx_out, roi_num); + // Allocate temporary storage + auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); + + // Run sorting operation + // sort target level to get corresponding index + cub::DeviceRadixSort::SortPairs( + d_temp_storage->ptr(), temp_storage_bytes, target_lvls_data, keys_out, + idx_in, idx_out, roi_num); + + int* restore_idx_data = + restore_index->mutable_data({roi_num, 1}, dev_ctx.GetPlace()); + // sort current index to get restore index + cub::DeviceRadixSort::SortPairs( + d_temp_storage->ptr(), temp_storage_bytes, idx_out, keys_out, idx_in, + restore_idx_data, roi_num); + + int start = 0; + for (int i = 0; i < num_level; ++i) { + Tensor sub_lod = sub_lod_list.Slice(i, i + 1); + int* sub_lod_data = sub_lod.data(); + // transfer length-based lod to offset-based lod + std::vector offset(1, 0); + std::vector sub_lod_cpu(lod_size); + memory::Copy(platform::CPUPlace(), sub_lod_cpu.data(), place, + sub_lod_data, sizeof(int) * lod_size, dev_ctx.stream()); + dev_ctx.Wait(); + for (int j = 0; j < lod_size; ++j) { + offset.emplace_back(offset.back() + sub_lod_cpu[j]); + } + + int sub_rois_num = offset.back(); + + int end = start + sub_rois_num; + if (end > start) { + Tensor sub_idx = index_out_t.Slice(start, end); + start = end; + multi_fpn_rois[i]->mutable_data({sub_rois_num, kBoxDim}, + dev_ctx.GetPlace()); + GPUGather(dev_ctx, *fpn_rois, sub_idx, multi_fpn_rois[i]); + } else { + multi_fpn_rois[i]->mutable_data({sub_rois_num, kBoxDim}, + dev_ctx.GetPlace()); + } + framework::LoD lod; + lod.emplace_back(offset); + multi_fpn_rois[i]->set_lod(lod); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + distribute_fpn_proposals, + ops::GPUDistributeFpnProposalsOpKernel, + ops::GPUDistributeFpnProposalsOpKernel); diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h new file mode 100644 index 00000000..a3196ea5 --- /dev/null +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h @@ -0,0 +1,147 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/operators/gather.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +const int kBoxDim = 4; + +template +static inline T BBoxArea(const T* box, bool normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + // If coordinate values are is invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return static_cast(0.); + } else { + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. + return (w + 1) * (h + 1); + } + } +} + +template +class DistributeFpnProposalsOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* fpn_rois = context.Input("FpnRois"); + + auto multi_fpn_rois = + context.MultiOutput("MultiFpnRois"); + + auto* restore_index = + context.Output("RestoreIndex"); + + const int min_level = context.Attr("min_level"); + const int max_level = context.Attr("max_level"); + const int refer_level = context.Attr("refer_level"); + const int refer_scale = context.Attr("refer_scale"); + const int num_level = max_level - min_level + 1; + + // check that the fpn_rois is not empty + PADDLE_ENFORCE_EQ(fpn_rois->lod().size(), 1UL, + "DistributeFpnProposalsOp need 1 level of LoD"); + + auto fpn_rois_lod = fpn_rois->lod().back(); + int fpn_rois_num = fpn_rois_lod[fpn_rois_lod.size() - 1]; + std::vector target_level; + // std::vector target_level(fpn_rois_num, -1); + // record the number of rois in each level + std::vector num_rois_level(num_level, 0); + std::vector num_rois_level_integral(num_level + 1, 0); + for (int i = 0; i < fpn_rois_lod.size() - 1; ++i) { + Tensor fpn_rois_slice = + fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]); + const T* rois_data = fpn_rois_slice.data(); + for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) { + // get the target level of current rois + T roi_scale = std::sqrt(BBoxArea(rois_data, false)); + int tgt_lvl = std::floor(std::log2(roi_scale / refer_scale + (T)1e-6) + + refer_level); + tgt_lvl = std::min(max_level, std::max(tgt_lvl, min_level)); + target_level.push_back(tgt_lvl); + num_rois_level[tgt_lvl - min_level]++; + rois_data += kBoxDim; + } + } + // define the output rois + // pointer which point to each level fpn rois + std::vector multi_fpn_rois_data(num_level); + // lod0 which will record the offset information of each level rois + std::vector> multi_fpn_rois_lod0; + for (int i = 0; i < num_level; ++i) { + // allocate memory for each level rois + multi_fpn_rois[i]->mutable_data({num_rois_level[i], kBoxDim}, + context.GetPlace()); + multi_fpn_rois_data[i] = multi_fpn_rois[i]->data(); + std::vector lod0(1, 0); + multi_fpn_rois_lod0.push_back(lod0); + // statistic start point for each level rois + num_rois_level_integral[i + 1] = + num_rois_level_integral[i] + num_rois_level[i]; + } + restore_index->mutable_data({fpn_rois_num, 1}, context.GetPlace()); + int* restore_index_data = restore_index->data(); + std::vector restore_index_inter(fpn_rois_num, -1); + // distribute the rois into different fpn level by target level + for (int i = 0; i < fpn_rois_lod.size() - 1; ++i) { + Tensor fpn_rois_slice = + fpn_rois->Slice(fpn_rois_lod[i], fpn_rois_lod[i + 1]); + const T* rois_data = fpn_rois_slice.data(); + size_t cur_offset = fpn_rois_lod[i]; + // std::vector lod_offset[num_level]; + for (int j = 0; j < num_level; j++) { + multi_fpn_rois_lod0[j].push_back(multi_fpn_rois_lod0[j][i]); + } + for (int j = 0; j < fpn_rois_slice.dims()[0]; ++j) { + int lvl = target_level[cur_offset + j]; + memcpy(multi_fpn_rois_data[lvl - min_level], rois_data, + kBoxDim * sizeof(T)); + multi_fpn_rois_data[lvl - min_level] += kBoxDim; + int index_in_shuffle = num_rois_level_integral[lvl - min_level] + + multi_fpn_rois_lod0[lvl - min_level][i + 1]; + restore_index_inter[index_in_shuffle] = cur_offset + j; + multi_fpn_rois_lod0[lvl - min_level][i + 1]++; + rois_data += kBoxDim; + } + } + for (int i = 0; i < fpn_rois_num; ++i) { + restore_index_data[restore_index_inter[i]] = i; + } + // merge lod information into LoDTensor + for (int i = 0; i < num_level; ++i) { + framework::LoD lod; + lod.emplace_back(multi_fpn_rois_lod0[i]); + multi_fpn_rois[i]->set_lod(lod); + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/generate_mask_labels_op.cc b/paddle/fluid/operators/detection/generate_mask_labels_op.cc new file mode 100644 index 00000000..38eafa5f --- /dev/null +++ b/paddle/fluid/operators/detection/generate_mask_labels_op.cc @@ -0,0 +1,441 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detection/bbox_util.h" +#include "paddle/fluid/operators/detection/mask_util.h" +#include "paddle/fluid/operators/gather.h" +#include "paddle/fluid/operators/math/concat_and_split.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +const int kBoxDim = 4; + +template +void AppendMask(LoDTensor* out, int64_t offset, Tensor* to_add) { + auto* out_data = out->data(); + auto* to_add_data = to_add->data(); + memcpy(out_data + offset, to_add_data, to_add->numel() * sizeof(T)); +} + +class GenerateMaskLabelsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("ImInfo"), "Input(ImInfo) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("GtClasses"), + "Input(GtClasses) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("IsCrowd"), + "Input(IsCrowd) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("GtSegms"), + "Input(GtSegms) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("Rois"), "Input(Rois) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("LabelsInt32"), + "Input(LabelsInt32) shouldn't be null."); + + PADDLE_ENFORCE( + ctx->HasOutput("MaskRois"), + "Output(MaskRois) of GenerateMaskLabelsOp should not be null"); + PADDLE_ENFORCE( + ctx->HasOutput("RoiHasMaskInt32"), + "Output(RoiHasMaskInt32) of GenerateMaskLabelsOp should not be null"); + PADDLE_ENFORCE( + ctx->HasOutput("MaskInt32"), + "Output(MaskInt32) of GenerateMaskLabelsOp should not be null"); + + auto im_info_dims = ctx->GetInputDim("ImInfo"); + auto gt_segms_dims = ctx->GetInputDim("GtSegms"); + PADDLE_ENFORCE_EQ(im_info_dims.size(), 2, + "The rank of Input(ImInfo) must be 2."); + PADDLE_ENFORCE_EQ(gt_segms_dims.size(), 2, + "The rank of Input(GtSegms) must be 2."); + PADDLE_ENFORCE_EQ(gt_segms_dims[1], 2, + "The second dim of Input(GtSegms) must be 2."); + int num_classes = ctx->Attrs().Get("num_classes"); + int resolution = ctx->Attrs().Get("resolution"); + + ctx->SetOutputDim("MaskRois", {-1, 4}); + ctx->SetOutputDim("RoiHasMaskInt32", {-1, 1}); + ctx->SetOutputDim("MaskInt32", {-1, num_classes * resolution * resolution}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Rois")); + return framework::OpKernelType(data_type, platform::CPUPlace()); + } +}; + +/* + * Expand masks from shape (#masks, M ** 2) to (#masks, #classes * M ** 2) + * to encode class specific mask targets. + */ +template +static inline void ExpandMaskTarget(const platform::CPUDeviceContext& ctx, + const Tensor& masks, + const Tensor& mask_class_labels, + const int resolution, const int num_classes, + Tensor* mask_targets) { + const uint8_t* masks_data = masks.data(); + int64_t num_mask = masks.dims()[0]; + const int* mask_class_labels_data = mask_class_labels.data(); + const int M = resolution * resolution; + const int mask_dim = M * num_classes; + + int* mask_targets_data = + mask_targets->mutable_data({num_mask, mask_dim}, ctx.GetPlace()); + math::set_constant(ctx, mask_targets, -1); + for (int64_t mask_id = 0; mask_id < num_mask; ++mask_id) { + int cls = mask_class_labels_data[mask_id]; + int start = M * cls; + if (cls > 0) { + for (int i = 0; i < M; ++i) { + mask_targets_data[mask_id * mask_dim + start + i] = + static_cast(masks_data[mask_id * M + i]); + } + } + } +} + +template +std::vector SampleMaskForOneImage( + const platform::CPUDeviceContext& ctx, const Tensor& im_info, + const Tensor& gt_classes, const Tensor& is_crowd, const Tensor& gt_segms, + const Tensor& rois, const Tensor& label_int32, const int num_classes, + const int resolution, const framework::LoD& segm_length) { + // Prepare the mask targets by associating one gt mask to each training roi + // that has a fg (non-bg) class label. + const int64_t gt_size = static_cast(gt_classes.dims()[0]); + const int64_t roi_size = static_cast(rois.dims()[0]); + const int* gt_classes_data = gt_classes.data(); + const int* is_crowd_data = is_crowd.data(); + const int* label_int32_data = label_int32.data(); + PADDLE_ENFORCE_EQ(roi_size, label_int32.dims()[0]); + + std::vector mask_gt_inds, fg_inds; + std::vector>> gt_polys; + + auto polys_num = segm_length[1]; + auto segm_lod_offset = framework::ConvertToOffsetBasedLoD(segm_length); + auto lod1 = segm_lod_offset[1]; + auto lod2 = segm_lod_offset[2]; + const T* polys_data = gt_segms.data(); + for (int64_t i = 0; i < gt_size; ++i) { + if ((gt_classes_data[i] > 0) && (is_crowd_data[i] == 0)) { + mask_gt_inds.emplace_back(i); + + // slice fg segmentation polys + int poly_num = polys_num[i]; + std::vector> polys; + int s_idx = lod1[i]; + for (int j = 0; j < poly_num; ++j) { + int s = lod2[s_idx + j]; + int e = lod2[s_idx + j + 1]; + PADDLE_ENFORCE_NE(s, e); + std::vector plts(polys_data + s * 2, polys_data + e * 2); + polys.push_back(plts); + } + gt_polys.push_back(polys); + } + } + for (int64_t i = 0; i < roi_size; ++i) { + if (label_int32_data[i] > 0) { + fg_inds.emplace_back(i); + } + } + int gt_num = mask_gt_inds.size(); + int fg_num = fg_inds.size(); + + Tensor boxes_from_polys; + boxes_from_polys.mutable_data({gt_num, 4}, platform::CPUPlace()); + Poly2Boxes(gt_polys, boxes_from_polys.data()); + + std::vector roi_has_mask = + std::vector(fg_inds.begin(), fg_inds.end()); + Tensor mask_class_labels; + Tensor masks; + Tensor rois_fg; + + auto im_scale = im_info.data()[2]; + if (fg_num > 0) { + // Class labels for the foreground rois + mask_class_labels.mutable_data({fg_num, 1}, ctx.GetPlace()); + Gather(label_int32_data, 1, fg_inds.data(), fg_inds.size(), + mask_class_labels.data()); + + uint8_t* masks_data = masks.mutable_data( + {fg_num, resolution * resolution}, ctx.GetPlace()); + + // Find overlap between all foreground rois and the bounding boxes + // enclosing each segmentation + T* rois_fg_data = rois_fg.mutable_data({fg_num, 4}, ctx.GetPlace()); + Gather(rois.data(), 4, fg_inds.data(), fg_inds.size(), + rois_fg.data()); + + for (int k = 0; k < rois_fg.numel(); ++k) { + rois_fg_data[k] = rois_fg_data[k] / im_scale; + } + + Tensor overlaps_bbfg_bbpolys; + overlaps_bbfg_bbpolys.mutable_data({fg_num, gt_num}, ctx.GetPlace()); + BboxOverlaps(rois_fg, boxes_from_polys, &overlaps_bbfg_bbpolys); + + // Map from each fg rois to the index of the mask with highest overlap + // (measured by bbox overlap) + T* overlaps_bbfg_bbpolys_data = overlaps_bbfg_bbpolys.data(); + std::vector fg_masks_inds; + for (int64_t i = 0; i < fg_num; ++i) { + const T* v = overlaps_bbfg_bbpolys_data + i * gt_num; + T max_overlap = std::numeric_limits::min(); + int id = 0; + for (int64_t j = 0; j < gt_num; ++j) { + if (v[j] > max_overlap) { + max_overlap = v[j]; + id = j; + } + } + fg_masks_inds.push_back(id); + } + + // add fg targets + for (int64_t i = 0; i < fg_num; ++i) { + int fg_polys_ind = fg_masks_inds[i]; + T* roi_fg = rois_fg_data + i * 4; + uint8_t* mask = masks_data + i * resolution * resolution; + Polys2MaskWrtBox(gt_polys[fg_polys_ind], roi_fg, resolution, mask); + } + } else { + // The network cannot handle empty blobs, so we must provide a mask + // We simply take the first bg roi, given it an all -1's mask (ignore + // label), and label it with class zero (bg). + int bg_num = 1; + T* rois_fg_data = rois_fg.mutable_data({bg_num, 4}, ctx.GetPlace()); + const T* rois_data = rois.data(); + std::vector bg_inds; + for (int64_t i = 0; i < roi_size; ++i) { + if (label_int32_data[i] == 0) { + bg_inds.emplace_back(i); + rois_fg_data[0] = rois_data[0] / im_scale; + rois_fg_data[1] = rois_data[1] / im_scale; + rois_fg_data[2] = rois_data[2] / im_scale; + rois_fg_data[3] = rois_data[3] / im_scale; + break; + } + } + masks.mutable_data({bg_num, resolution * resolution}, + ctx.GetPlace()); + math::set_constant(ctx, &masks, -1); + int* mask_class_labels_data = + mask_class_labels.mutable_data({bg_num, 1}, ctx.GetPlace()); + mask_class_labels_data[0] = 0; + roi_has_mask = std::vector(bg_inds.begin(), bg_inds.end()); + } + + Tensor masks_expand; + ExpandMaskTarget(ctx, masks, mask_class_labels, resolution, num_classes, + &masks_expand); + + T* rois_fg_data = rois_fg.data(); + for (int k = 0; k < rois_fg.numel(); ++k) { + rois_fg_data[k] = rois_fg_data[k] * im_scale; + } + + Tensor roi_has_mask_t; + int roi_has_mask_size = roi_has_mask.size(); + int* roi_has_mask_data = + roi_has_mask_t.mutable_data({roi_has_mask_size, 1}, ctx.GetPlace()); + std::copy(roi_has_mask.begin(), roi_has_mask.end(), roi_has_mask_data); + + std::vector res; + res.emplace_back(rois_fg); + res.emplace_back(roi_has_mask_t); + res.emplace_back(masks_expand); + return res; +} + +template +class GenerateMaskLabelsKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* im_info = ctx.Input("ImInfo"); + auto* gt_classes = ctx.Input("GtClasses"); + auto* is_crowd = ctx.Input("IsCrowd"); + auto* gt_segms = ctx.Input("GtSegms"); + auto* rois = ctx.Input("Rois"); + auto* label_int32 = ctx.Input("LabelsInt32"); + + auto* mask_rois = ctx.Output("MaskRois"); + auto* roi_has_mask_int32 = ctx.Output("RoiHasMaskInt32"); + auto* mask_int32 = ctx.Output("MaskInt32"); + + int num_classes = ctx.Attr("num_classes"); + int resolution = ctx.Attr("resolution"); + + PADDLE_ENFORCE_EQ(gt_classes->lod().size(), 1UL, + "GenerateMaskLabelsOp gt_classes needs 1 level of LoD"); + PADDLE_ENFORCE_EQ(is_crowd->lod().size(), 1UL, + "GenerateMaskLabelsOp is_crowd needs 1 level of LoD"); + PADDLE_ENFORCE_EQ(rois->lod().size(), 1UL, + "GenerateMaskLabelsOp rois needs 1 level of LoD"); + PADDLE_ENFORCE_EQ(label_int32->lod().size(), 1UL, + "GenerateMaskLabelsOp label_int32 needs 1 level of LoD"); + + PADDLE_ENFORCE_EQ(gt_segms->lod().size(), 3UL); + + int64_t n = static_cast(gt_classes->lod().back().size() - 1); + PADDLE_ENFORCE_EQ(gt_segms->lod()[0].size() - 1, n); + + int mask_dim = num_classes * resolution * resolution; + + mask_rois->mutable_data({rois->numel(), kBoxDim}, ctx.GetPlace()); + roi_has_mask_int32->mutable_data({rois->numel(), 1}, ctx.GetPlace()); + mask_int32->mutable_data({rois->numel(), mask_dim}, ctx.GetPlace()); + + framework::LoD lod; + std::vector lod0(1, 0); + + int64_t num_mask = 0; + auto& dev_ctx = ctx.device_context(); + + auto gt_classes_lod = gt_classes->lod().back(); + auto is_crowd_lod = is_crowd->lod().back(); + auto rois_lod = rois->lod().back(); + auto label_int32_lod = label_int32->lod().back(); + auto gt_segms_lod = gt_segms->lod(); + + for (int i = 0; i < n; ++i) { + if (rois_lod[i] == rois_lod[i + 1]) { + lod0.emplace_back(num_mask); + continue; + } + Tensor im_info_slice = im_info->Slice(i, i + 1); + Tensor gt_classes_slice = + gt_classes->Slice(gt_classes_lod[i], gt_classes_lod[i + 1]); + Tensor is_crowd_slice = + is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]); + Tensor label_int32_slice = + label_int32->Slice(label_int32_lod[i], label_int32_lod[i + 1]); + Tensor rois_slice = rois->Slice(rois_lod[i], rois_lod[i + 1]); + + auto sub_lod_and_offset = + framework::GetSubLoDAndAbsoluteOffset(gt_segms_lod, i, i + 1, 0); + auto lod_length = sub_lod_and_offset.first; + size_t s = sub_lod_and_offset.second.first; + size_t e = sub_lod_and_offset.second.second; + Tensor gt_segms_slice = gt_segms->Slice(s, e); + + std::vector tensor_output = SampleMaskForOneImage( + dev_ctx, im_info_slice, gt_classes_slice, is_crowd_slice, + gt_segms_slice, rois_slice, label_int32_slice, num_classes, + resolution, lod_length); + + Tensor sampled_mask_rois = tensor_output[0]; + Tensor sampled_roi_has_mask_int32 = tensor_output[1]; + Tensor sampled_mask_int32 = tensor_output[2]; + + AppendMask(mask_rois, kBoxDim * num_mask, &sampled_mask_rois); + AppendMask(roi_has_mask_int32, num_mask, + &sampled_roi_has_mask_int32); + AppendMask(mask_int32, mask_dim * num_mask, &sampled_mask_int32); + + num_mask += sampled_mask_rois.dims()[0]; + lod0.emplace_back(num_mask); + } + + lod.emplace_back(lod0); + mask_rois->set_lod(lod); + roi_has_mask_int32->set_lod(lod); + mask_int32->set_lod(lod); + mask_rois->Resize({num_mask, kBoxDim}); + roi_has_mask_int32->Resize({num_mask, 1}); + mask_int32->Resize({num_mask, mask_dim}); + } +}; + +class GenerateMaskLabelsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("ImInfo", + "(Tensor), This input is a 2D Tensor with shape [B, 3]. " + "B is the number of input images, " + "each element consists of im_height, im_width, im_scale."); + AddInput("GtClasses", + "(LoDTensor), This input is a 2D LoDTensor with shape [M, 1]. " + "M is the number of groundtruth, " + "each element is a class label of groundtruth."); + AddInput( + "IsCrowd", + "(LoDTensor), This input is a 2D LoDTensor with shape [M, 1]. " + "M is the number of groundtruth, " + "each element is a flag indicates whether a groundtruth is crowd."); + AddInput( + "GtSegms", + "(LoDTensor), This input is a 2D LoDTensor with shape [S, 2], it's LoD " + "level is 3. The LoD[0] represents the gt objects number of each " + "instance. LoD[1] represents the segmentation counts of each objects. " + "LoD[2] represents the polygons number of each segmentation. S the " + "total number of polygons coordinate points. Each element is (x, y) " + "coordinate points."); + AddInput( + "Rois", + "(LoDTensor), This input is a 2D LoDTensor with shape [R, 4]. " + "R is the number of rois which is the output of " + "generate_proposal_labels, " + "each element is a bounding box with (xmin, ymin, xmax, ymax) format."); + AddInput("LabelsInt32", + "(LoDTensor), This intput is a 2D LoDTensor with shape [R, 1], " + "each element repersents a class label of a roi"); + AddOutput( + "MaskRois", + "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4]. " + "P is the number of mask, " + "each element is a bounding box with [xmin, ymin, xmax, ymax] format."); + AddOutput("RoiHasMaskInt32", + "(LoDTensor), This output is a 2D LoDTensor with shape [P, 1], " + "each element repersents the output mask rois index with regard " + "to input rois"); + AddOutput("MaskInt32", + "(LoDTensor), This output is a 4D LoDTensor with shape [P, Q], " + "Q equal to num_classes * resolution * resolution"); + + AddAttr("num_classes", "Class number."); + AddAttr("resolution", "Resolution of mask."); + + AddComment(R"DOC( +This operator can be, for given the RoIs and corresponding labels, +to sample foreground RoIs. This mask branch also has +a :math: `K \\times M^{2}` dimensional output targets for each foreground +RoI, which encodes K binary masks of resolution M x M, one for each of the +K classes. This mask targets are used to compute loss of mask branch. + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(generate_mask_labels, ops::GenerateMaskLabelsOp, + ops::GenerateMaskLabelsOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(generate_mask_labels, + ops::GenerateMaskLabelsKernel); diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc new file mode 100644 index 00000000..451e0ca8 --- /dev/null +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -0,0 +1,591 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detection/bbox_util.h" +#include "paddle/fluid/operators/gather.h" +#include "paddle/fluid/operators/math/concat_and_split.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +const int kBoxDim = 4; + +template +void AppendRois(LoDTensor* out, int64_t offset, Tensor* to_add) { + auto* out_data = out->data(); + auto* to_add_data = to_add->data(); + memcpy(out_data + offset, to_add_data, to_add->numel() * sizeof(T)); +} + +class GenerateProposalLabelsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("RpnRois"), + "Input(RpnRois) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("GtClasses"), + "Input(GtClasses) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("IsCrowd"), + "Input(IsCrowd) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("GtBoxes"), + "Input(GtBoxes) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("ImInfo"), "Input(ImInfo) shouldn't be null."); + + PADDLE_ENFORCE( + ctx->HasOutput("Rois"), + "Output(Rois) of GenerateProposalLabelsOp should not be null"); + PADDLE_ENFORCE( + ctx->HasOutput("LabelsInt32"), + "Output(LabelsInt32) of GenerateProposalLabelsOp should not be null"); + PADDLE_ENFORCE( + ctx->HasOutput("BboxTargets"), + "Output(BboxTargets) of GenerateProposalLabelsOp should not be null"); + PADDLE_ENFORCE(ctx->HasOutput("BboxInsideWeights"), + "Output(BboxInsideWeights) of GenerateProposalLabelsOp " + "should not be null"); + PADDLE_ENFORCE(ctx->HasOutput("BboxOutsideWeights"), + "Output(BboxOutsideWeights) of GenerateProposalLabelsOp " + "should not be null"); + + auto rpn_rois_dims = ctx->GetInputDim("RpnRois"); + auto gt_boxes_dims = ctx->GetInputDim("GtBoxes"); + auto im_info_dims = ctx->GetInputDim("ImInfo"); + + PADDLE_ENFORCE_EQ(rpn_rois_dims.size(), 2, + "The rank of Input(RpnRois) must be 2."); + PADDLE_ENFORCE_EQ(gt_boxes_dims.size(), 2, + "The rank of Input(GtBoxes) must be 2."); + PADDLE_ENFORCE_EQ(im_info_dims.size(), 2, + "The rank of Input(ImInfo) must be 2."); + + int class_nums = ctx->Attrs().Get("class_nums"); + + ctx->SetOutputDim("Rois", {-1, 4}); + ctx->SetOutputDim("LabelsInt32", {-1, 1}); + ctx->SetOutputDim("BboxTargets", {-1, 4 * class_nums}); + ctx->SetOutputDim("BboxInsideWeights", {-1, 4 * class_nums}); + ctx->SetOutputDim("BboxOutsideWeights", {-1, 4 * class_nums}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("RpnRois")); + return framework::OpKernelType(data_type, platform::CPUPlace()); + } +}; + +template +void Concat(const platform::CPUDeviceContext& context, + const Tensor& in_tensor_a, const Tensor& in_tensor_b, + Tensor* out_tensor) { + int axis = 0; + std::vector inputs; + inputs.emplace_back(in_tensor_a); + inputs.emplace_back(in_tensor_b); + math::ConcatFunctor concat_functor; + concat_functor(context, inputs, axis, out_tensor); +} + +template +std::vector> SampleFgBgGt( + const platform::CPUDeviceContext& context, Tensor* iou, + const Tensor& is_crowd, const int batch_size_per_im, + const float fg_fraction, const float fg_thresh, const float bg_thresh_hi, + const float bg_thresh_lo, std::minstd_rand engine, const bool use_random, + const bool is_cascade_rcnn, const Tensor& rpn_rois) { + std::vector fg_inds; + std::vector bg_inds; + std::vector mapped_gt_inds; + int64_t gt_num = is_crowd.numel(); + const int* crowd_data = is_crowd.data(); + T* proposal_to_gt_overlaps = iou->data(); + int64_t row = iou->dims()[0]; + int64_t col = iou->dims()[1]; + float epsilon = 0.00001; + const T* rpn_rois_dt = rpn_rois.data(); + // Follow the Faster RCNN's implementation + for (int64_t i = 0; i < row; ++i) { + const T* v = proposal_to_gt_overlaps + i * col; + T max_overlap = *std::max_element(v, v + col); + if ((i < gt_num) && (crowd_data[i])) { + max_overlap = -1.0; + } + if (is_cascade_rcnn && + ((rpn_rois_dt[i * 4 + 2] - rpn_rois_dt[i * 4 + 0] + 1) <= 0 || + (rpn_rois_dt[i * 4 + 3] - rpn_rois_dt[i * 4 + 1] + 1) <= 0)) { + continue; + } + if (max_overlap >= fg_thresh) { + // fg mapped gt label index + for (int64_t j = 0; j < col; ++j) { + T val = proposal_to_gt_overlaps[i * col + j]; + auto diff = std::abs(max_overlap - val); + if (diff < epsilon) { + fg_inds.emplace_back(i); + mapped_gt_inds.emplace_back(j); + break; + } + } + } else if ((max_overlap >= bg_thresh_lo) && (max_overlap < bg_thresh_hi)) { + bg_inds.emplace_back(i); + } else { + continue; + } + } + + std::vector> res; + if (is_cascade_rcnn) { + res.emplace_back(fg_inds); + res.emplace_back(bg_inds); + res.emplace_back(mapped_gt_inds); + } else { + // Reservoir Sampling + // sampling fg + std::uniform_real_distribution uniform(0, 1); + int fg_rois_per_im = std::floor(batch_size_per_im * fg_fraction); + int fg_rois_this_image = fg_inds.size(); + int fg_rois_per_this_image = std::min(fg_rois_per_im, fg_rois_this_image); + if (use_random) { + const int64_t fg_size = static_cast(fg_inds.size()); + if (fg_size > fg_rois_per_this_image) { + for (int64_t i = fg_rois_per_this_image; i < fg_size; ++i) { + int rng_ind = std::floor(uniform(engine) * i); + if (rng_ind < fg_rois_per_this_image) { + std::iter_swap(fg_inds.begin() + rng_ind, fg_inds.begin() + i); + std::iter_swap(mapped_gt_inds.begin() + rng_ind, + mapped_gt_inds.begin() + i); + } + } + } + } + std::vector new_fg_inds(fg_inds.begin(), + fg_inds.begin() + fg_rois_per_this_image); + std::vector new_gt_inds( + mapped_gt_inds.begin(), + mapped_gt_inds.begin() + fg_rois_per_this_image); + // sampling bg + int bg_rois_per_image = batch_size_per_im - fg_rois_per_this_image; + int bg_rois_this_image = bg_inds.size(); + int bg_rois_per_this_image = + std::min(bg_rois_per_image, bg_rois_this_image); + if (use_random) { + const int64_t bg_size = static_cast(bg_inds.size()); + if (bg_size > bg_rois_per_this_image) { + for (int64_t i = bg_rois_per_this_image; i < bg_size; ++i) { + int rng_ind = std::floor(uniform(engine) * i); + if (rng_ind < fg_rois_per_this_image) + std::iter_swap(bg_inds.begin() + rng_ind, bg_inds.begin() + i); + } + } + } + std::vector new_bg_inds(bg_inds.begin(), + bg_inds.begin() + bg_rois_per_this_image); + // + res.emplace_back(new_fg_inds); + res.emplace_back(new_bg_inds); + res.emplace_back(new_gt_inds); + } + + return res; +} + +template +void GatherBoxesLabels(const platform::CPUDeviceContext& context, + const Tensor& boxes, const Tensor& gt_boxes, + const Tensor& gt_classes, + const std::vector& fg_inds, + const std::vector& bg_inds, + const std::vector& gt_inds, Tensor* sampled_boxes, + Tensor* sampled_labels, Tensor* sampled_gts) { + int fg_num = fg_inds.size(); + int bg_num = bg_inds.size(); + Tensor fg_inds_t, bg_inds_t, gt_box_inds_t, gt_label_inds_t; + int* fg_inds_data = fg_inds_t.mutable_data({fg_num}, context.GetPlace()); + int* bg_inds_data = bg_inds_t.mutable_data({bg_num}, context.GetPlace()); + int* gt_box_inds_data = + gt_box_inds_t.mutable_data({fg_num}, context.GetPlace()); + int* gt_label_inds_data = + gt_label_inds_t.mutable_data({fg_num}, context.GetPlace()); + std::copy(fg_inds.begin(), fg_inds.end(), fg_inds_data); + std::copy(bg_inds.begin(), bg_inds.end(), bg_inds_data); + std::copy(gt_inds.begin(), gt_inds.end(), gt_box_inds_data); + std::copy(gt_inds.begin(), gt_inds.end(), gt_label_inds_data); + + Tensor fg_boxes, bg_boxes, fg_labels, bg_labels; + fg_boxes.mutable_data({fg_num, kBoxDim}, context.GetPlace()); + CPUGather(context, boxes, fg_inds_t, &fg_boxes); + bg_boxes.mutable_data({bg_num, kBoxDim}, context.GetPlace()); + CPUGather(context, boxes, bg_inds_t, &bg_boxes); + Concat(context, fg_boxes, bg_boxes, sampled_boxes); + CPUGather(context, gt_boxes, gt_box_inds_t, sampled_gts); + fg_labels.mutable_data({fg_num}, context.GetPlace()); + CPUGather(context, gt_classes, gt_label_inds_t, &fg_labels); + bg_labels.mutable_data({bg_num}, context.GetPlace()); + math::set_constant(context, &bg_labels, 0); + Concat(context, fg_labels, bg_labels, sampled_labels); +} + +template +std::vector SampleRoisForOneImage( + const platform::CPUDeviceContext& context, const Tensor& rpn_rois_in, + const Tensor& gt_classes, const Tensor& is_crowd, const Tensor& gt_boxes, + const Tensor& im_info, const int batch_size_per_im, const float fg_fraction, + const float fg_thresh, const float bg_thresh_hi, const float bg_thresh_lo, + const std::vector& bbox_reg_weights, const int class_nums, + std::minstd_rand engine, bool use_random, bool is_cascade_rcnn, + bool is_cls_agnostic) { + // 1.1 map to original image + auto im_scale = im_info.data()[2]; + Tensor rpn_rois_slice; + Tensor rpn_rois; + + if (is_cascade_rcnn) { + // slice rpn_rois from gt_box_num refer to detectron + rpn_rois_slice = + rpn_rois_in.Slice(gt_boxes.dims()[0], rpn_rois_in.dims()[0]); + rpn_rois.mutable_data(rpn_rois_slice.dims(), context.GetPlace()); + const T* rpn_rois_in_dt = rpn_rois_slice.data(); + T* rpn_rois_dt = rpn_rois.data(); + for (int i = 0; i < rpn_rois.numel(); ++i) { + rpn_rois_dt[i] = rpn_rois_in_dt[i] / im_scale; + } + } else { + rpn_rois.mutable_data(rpn_rois_in.dims(), context.GetPlace()); + const T* rpn_rois_in_dt = rpn_rois_in.data(); + T* rpn_rois_dt = rpn_rois.data(); + for (int i = 0; i < rpn_rois.numel(); ++i) { + rpn_rois_dt[i] = rpn_rois_in_dt[i] / im_scale; + } + } + + // 1.2 compute overlaps + int proposals_num = gt_boxes.dims()[0] + rpn_rois.dims()[0]; + Tensor boxes; + boxes.mutable_data({proposals_num, kBoxDim}, context.GetPlace()); + Concat(context, gt_boxes, rpn_rois, &boxes); + Tensor proposal_to_gt_overlaps; + proposal_to_gt_overlaps.mutable_data({proposals_num, gt_boxes.dims()[0]}, + context.GetPlace()); + BboxOverlaps(boxes, gt_boxes, &proposal_to_gt_overlaps); + + // Generate proposal index + std::vector> fg_bg_gt = + SampleFgBgGt(context, &proposal_to_gt_overlaps, is_crowd, + batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi, + bg_thresh_lo, engine, use_random, is_cascade_rcnn, boxes); + std::vector fg_inds = fg_bg_gt[0]; + std::vector bg_inds = fg_bg_gt[1]; + std::vector mapped_gt_inds = fg_bg_gt[2]; // mapped_gt_labels + + // Gather boxes and labels + Tensor sampled_boxes, sampled_labels, sampled_gts; + int fg_num = fg_inds.size(); + int bg_num = bg_inds.size(); + int boxes_num = fg_num + bg_num; + framework::DDim bbox_dim({boxes_num, kBoxDim}); + sampled_boxes.mutable_data(bbox_dim, context.GetPlace()); + sampled_labels.mutable_data({boxes_num}, context.GetPlace()); + sampled_gts.mutable_data({fg_num, kBoxDim}, context.GetPlace()); + GatherBoxesLabels(context, boxes, gt_boxes, gt_classes, fg_inds, bg_inds, + mapped_gt_inds, &sampled_boxes, &sampled_labels, + &sampled_gts); + + // Compute targets + Tensor bbox_targets_single; + bbox_targets_single.mutable_data(bbox_dim, context.GetPlace()); + BoxToDelta(fg_num, sampled_boxes, sampled_gts, bbox_reg_weights.data(), + false, &bbox_targets_single); + + // Scale rois + Tensor sampled_rois; + sampled_rois.mutable_data(sampled_boxes.dims(), context.GetPlace()); + auto sampled_rois_et = framework::EigenTensor::From(sampled_rois); + auto sampled_boxes_et = framework::EigenTensor::From(sampled_boxes); + sampled_rois_et = sampled_boxes_et * im_scale; + + // Expand box targets + Tensor bbox_targets, bbox_inside_weights, bbox_outside_weights; + framework::DDim bbox_expand_dim({boxes_num, kBoxDim * class_nums}); + bbox_targets.mutable_data(bbox_expand_dim, context.GetPlace()); + bbox_inside_weights.mutable_data(bbox_expand_dim, context.GetPlace()); + bbox_outside_weights.mutable_data(bbox_expand_dim, context.GetPlace()); + math::set_constant(context, &bbox_targets, 0.0); + math::set_constant(context, &bbox_inside_weights, 0.0); + math::set_constant(context, &bbox_outside_weights, 0.0); + + auto* bbox_targets_single_data = bbox_targets_single.data(); + auto* sampled_labels_data = sampled_labels.data(); + auto* bbox_targets_data = bbox_targets.data(); + auto* bbox_inside_weights_data = bbox_inside_weights.data(); + auto* bbox_outside_weights_data = bbox_outside_weights.data(); + int width = kBoxDim * class_nums; + for (int64_t i = 0; i < boxes_num; ++i) { + int label = sampled_labels_data[i]; + if (label > 0) { + if (is_cls_agnostic) { + label = 1; + } + int dst_idx = i * width + kBoxDim * label; + int src_idx = kBoxDim * i; + bbox_targets_data[dst_idx] = bbox_targets_single_data[src_idx]; + bbox_targets_data[dst_idx + 1] = bbox_targets_single_data[src_idx + 1]; + bbox_targets_data[dst_idx + 2] = bbox_targets_single_data[src_idx + 2]; + bbox_targets_data[dst_idx + 3] = bbox_targets_single_data[src_idx + 3]; + bbox_inside_weights_data[dst_idx] = 1; + bbox_inside_weights_data[dst_idx + 1] = 1; + bbox_inside_weights_data[dst_idx + 2] = 1; + bbox_inside_weights_data[dst_idx + 3] = 1; + bbox_outside_weights_data[dst_idx] = 1; + bbox_outside_weights_data[dst_idx + 1] = 1; + bbox_outside_weights_data[dst_idx + 2] = 1; + bbox_outside_weights_data[dst_idx + 3] = 1; + } + } + std::vector res; + res.emplace_back(sampled_rois); + res.emplace_back(sampled_labels); + res.emplace_back(bbox_targets); + res.emplace_back(bbox_inside_weights); + res.emplace_back(bbox_outside_weights); + return res; +} + +template +class GenerateProposalLabelsKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* rpn_rois = context.Input("RpnRois"); + auto* gt_classes = context.Input("GtClasses"); + auto* is_crowd = context.Input("IsCrowd"); + auto* gt_boxes = context.Input("GtBoxes"); + auto* im_info = context.Input("ImInfo"); + + auto* rois = context.Output("Rois"); + auto* labels_int32 = context.Output("LabelsInt32"); + auto* bbox_targets = context.Output("BboxTargets"); + auto* bbox_inside_weights = context.Output("BboxInsideWeights"); + auto* bbox_outside_weights = + context.Output("BboxOutsideWeights"); + + int batch_size_per_im = context.Attr("batch_size_per_im"); + float fg_fraction = context.Attr("fg_fraction"); + float fg_thresh = context.Attr("fg_thresh"); + float bg_thresh_hi = context.Attr("bg_thresh_hi"); + float bg_thresh_lo = context.Attr("bg_thresh_lo"); + std::vector bbox_reg_weights = + context.Attr>("bbox_reg_weights"); + int class_nums = context.Attr("class_nums"); + bool use_random = context.Attr("use_random"); + bool is_cascade_rcnn = context.Attr("is_cascade_rcnn"); + bool is_cls_agnostic = context.Attr("is_cls_agnostic"); + PADDLE_ENFORCE_EQ(rpn_rois->lod().size(), 1UL, + "GenerateProposalLabelsOp rpn_rois needs 1 level of LoD"); + PADDLE_ENFORCE_EQ( + gt_classes->lod().size(), 1UL, + "GenerateProposalLabelsOp gt_classes needs 1 level of LoD"); + PADDLE_ENFORCE_EQ(is_crowd->lod().size(), 1UL, + "GenerateProposalLabelsOp is_crowd needs 1 level of LoD"); + PADDLE_ENFORCE_EQ(gt_boxes->lod().size(), 1UL, + "GenerateProposalLabelsOp gt_boxes needs 1 level of LoD"); + int64_t n = static_cast(rpn_rois->lod().back().size() - 1); + + rois->mutable_data({n * batch_size_per_im, kBoxDim}, context.GetPlace()); + labels_int32->mutable_data({n * batch_size_per_im, 1}, + context.GetPlace()); + bbox_targets->mutable_data({n * batch_size_per_im, kBoxDim * class_nums}, + context.GetPlace()); + bbox_inside_weights->mutable_data( + {n * batch_size_per_im, kBoxDim * class_nums}, context.GetPlace()); + bbox_outside_weights->mutable_data( + {n * batch_size_per_im, kBoxDim * class_nums}, context.GetPlace()); + + std::random_device rnd; + std::minstd_rand engine; + int seed = rnd(); + engine.seed(seed); + + framework::LoD lod; + std::vector lod0(1, 0); + + int64_t num_rois = 0; + auto& dev_ctx = context.device_context(); + + auto rpn_rois_lod = rpn_rois->lod().back(); + auto gt_classes_lod = gt_classes->lod().back(); + auto is_crowd_lod = is_crowd->lod().back(); + auto gt_boxes_lod = gt_boxes->lod().back(); + for (int i = 0; i < n; ++i) { + if (rpn_rois_lod[i] == rpn_rois_lod[i + 1]) { + lod0.emplace_back(num_rois); + continue; + } + Tensor rpn_rois_slice = + rpn_rois->Slice(rpn_rois_lod[i], rpn_rois_lod[i + 1]); + Tensor gt_classes_slice = + gt_classes->Slice(gt_classes_lod[i], gt_classes_lod[i + 1]); + Tensor is_crowd_slice = + is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]); + Tensor gt_boxes_slice = + gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]); + Tensor im_info_slice = im_info->Slice(i, i + 1); + std::vector tensor_output = SampleRoisForOneImage( + dev_ctx, rpn_rois_slice, gt_classes_slice, is_crowd_slice, + gt_boxes_slice, im_info_slice, batch_size_per_im, fg_fraction, + fg_thresh, bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums, + engine, use_random, is_cascade_rcnn, is_cls_agnostic); + Tensor sampled_rois = tensor_output[0]; + Tensor sampled_labels_int32 = tensor_output[1]; + Tensor sampled_bbox_targets = tensor_output[2]; + Tensor sampled_bbox_inside_weights = tensor_output[3]; + Tensor sampled_bbox_outside_weights = tensor_output[4]; + + AppendRois(rois, kBoxDim * num_rois, &sampled_rois); + AppendRois(labels_int32, num_rois, &sampled_labels_int32); + AppendRois(bbox_targets, kBoxDim * num_rois * class_nums, + &sampled_bbox_targets); + AppendRois(bbox_inside_weights, kBoxDim * num_rois * class_nums, + &sampled_bbox_inside_weights); + AppendRois(bbox_outside_weights, kBoxDim * num_rois * class_nums, + &sampled_bbox_outside_weights); + + num_rois += sampled_rois.dims()[0]; + lod0.emplace_back(num_rois); + } + + lod.emplace_back(lod0); + rois->set_lod(lod); + labels_int32->set_lod(lod); + bbox_targets->set_lod(lod); + bbox_inside_weights->set_lod(lod); + bbox_outside_weights->set_lod(lod); + rois->Resize({num_rois, kBoxDim}); + labels_int32->Resize({num_rois, 1}); + bbox_targets->Resize({num_rois, kBoxDim * class_nums}); + bbox_inside_weights->Resize({num_rois, kBoxDim * class_nums}); + bbox_outside_weights->Resize({num_rois, kBoxDim * class_nums}); + } +}; + +class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "RpnRois", + "(LoDTensor), This input is a 2D LoDTensor with shape [N, 4]. " + "N is the number of the GenerateProposalOp's output, " + "each element is a bounding box with [xmin, ymin, xmax, ymax] format."); + AddInput("GtClasses", + "(LoDTensor), This input is a 2D LoDTensor with shape [M, 1]. " + "M is the number of groundtruth, " + "each element is a class label of groundtruth."); + AddInput( + "IsCrowd", + "(LoDTensor), This input is a 2D LoDTensor with shape [M, 1]. " + "M is the number of groundtruth, " + "each element is a flag indicates whether a groundtruth is crowd."); + AddInput( + "GtBoxes", + "(LoDTensor), This input is a 2D LoDTensor with shape [M, 4]. " + "M is the number of groundtruth, " + "each element is a bounding box with [xmin, ymin, xmax, ymax] format."); + AddInput("ImInfo", + "(Tensor), This input is a 2D Tensor with shape [B, 3]. " + "B is the number of input images, " + "each element consists of im_height, im_width, im_scale."); + + AddOutput( + "Rois", + "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4]. " + "P usuall equal to batch_size_per_im * batch_size, " + "each element is a bounding box with [xmin, ymin, xmax, ymax] format."); + AddOutput("LabelsInt32", + "(LoDTensor), This output is a 2D LoDTensor with shape [P, 1], " + "each element repersents a class label of a roi"); + AddOutput("BboxTargets", + "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4 * " + "class_nums], " + "each element repersents a box label of a roi"); + AddOutput( + "BboxInsideWeights", + "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4 * " + "class_nums], " + "each element indicates whether a box should contribute to loss."); + AddOutput( + "BboxOutsideWeights", + "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4 * " + "class_nums], " + "each element indicates whether a box should contribute to loss."); + + AddAttr("batch_size_per_im", "Batch size of rois per images."); + AddAttr("fg_fraction", + "Foreground fraction in total batch_size_per_im."); + AddAttr( + "fg_thresh", + "Overlap threshold which is used to chose foreground sample."); + AddAttr("bg_thresh_hi", + "Overlap threshold upper bound which is used to chose " + "background sample."); + AddAttr("bg_thresh_lo", + "Overlap threshold lower bound which is used to chose " + "background sample."); + AddAttr>("bbox_reg_weights", "Box regression weights."); + AddAttr("class_nums", "Class number."); + AddAttr( + "use_random", + "Use random sampling to choose foreground and background boxes.") + .SetDefault(true); + AddAttr("is_cascade_rcnn", + "cascade rcnn sampling policy changed from stage 2.") + .SetDefault(false); + AddAttr( + "is_cls_agnostic", + "the box regress will only include fg and bg locations if set true ") + .SetDefault(false); + + AddComment(R"DOC( +This operator can be, for given the GenerateProposalOp output bounding boxes and groundtruth, +to sample foreground boxes and background boxes, and compute loss target. + +RpnRois is the output boxes of RPN and was processed by generate_proposal_op, these boxes +were combined with groundtruth boxes and sampled according to batch_size_per_im and fg_fraction, +If an instance with a groundtruth overlap greater than fg_thresh, then it was considered as a foreground sample. +If an instance with a groundtruth overlap greater than bg_thresh_lo and lower than bg_thresh_hi, +then it was considered as a background sample. +After all foreground and background boxes are chosen (so called Rois), +then we apply random sampling to make sure +the number of foreground boxes is no more than batch_size_per_im * fg_fraction. + +For each box in Rois, we assign the classification (class label) and regression targets (box label) to it. +Finally BboxInsideWeights and BboxOutsideWeights are used to specify whether it would contribute to training loss. + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(generate_proposal_labels, ops::GenerateProposalLabelsOp, + ops::GenerateProposalLabelsOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(generate_proposal_labels, + ops::GenerateProposalLabelsKernel, + ops::GenerateProposalLabelsKernel); diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc new file mode 100644 index 00000000..06e48f12 --- /dev/null +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -0,0 +1,500 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/operators/gather.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +static const double kBBoxClipDefault = std::log(1000.0 / 16.0); + +static void AppendProposals(Tensor *dst, int64_t offset, const Tensor &src) { + auto *out_data = dst->data(); + auto *to_add_data = src.data(); + size_t size_of_t = framework::SizeOfType(src.type()); + offset *= size_of_t; + std::memcpy( + reinterpret_cast(reinterpret_cast(out_data) + offset), + to_add_data, src.numel() * size_of_t); +} + +class GenerateProposalsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Scores"), "Input(Scores) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("BboxDeltas"), + "Input(BboxDeltas) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("ImInfo"), "Input(ImInfo) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("Anchors"), + "Input(Anchors) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput("Variances"), + "Input(Variances) shouldn't be null."); + + ctx->SetOutputDim("RpnRois", {-1, 4}); + ctx->SetOutputDim("RpnRoiProbs", {-1, 1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType(ctx.Input("Anchors")->type(), + ctx.device_context()); + } +}; + +template +static inline void BoxCoder(const platform::DeviceContext &ctx, + Tensor *all_anchors, Tensor *bbox_deltas, + Tensor *variances, Tensor *proposals) { + T *proposals_data = proposals->mutable_data(ctx.GetPlace()); + + int64_t row = all_anchors->dims()[0]; + int64_t len = all_anchors->dims()[1]; + + auto *bbox_deltas_data = bbox_deltas->data(); + auto *anchor_data = all_anchors->data(); + const T *variances_data = nullptr; + if (variances) { + variances_data = variances->data(); + } + + for (int64_t i = 0; i < row; ++i) { + T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len] + 1.0; + T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1] + 1.0; + + T anchor_center_x = anchor_data[i * len] + 0.5 * anchor_width; + T anchor_center_y = anchor_data[i * len + 1] + 0.5 * anchor_height; + + T bbox_center_x = 0, bbox_center_y = 0; + T bbox_width = 0, bbox_height = 0; + + if (variances) { + bbox_center_x = + variances_data[i * len] * bbox_deltas_data[i * len] * anchor_width + + anchor_center_x; + bbox_center_y = variances_data[i * len + 1] * + bbox_deltas_data[i * len + 1] * anchor_height + + anchor_center_y; + bbox_width = std::exp(std::min(variances_data[i * len + 2] * + bbox_deltas_data[i * len + 2], + kBBoxClipDefault)) * + anchor_width; + bbox_height = std::exp(std::min(variances_data[i * len + 3] * + bbox_deltas_data[i * len + 3], + kBBoxClipDefault)) * + anchor_height; + } else { + bbox_center_x = + bbox_deltas_data[i * len] * anchor_width + anchor_center_x; + bbox_center_y = + bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y; + bbox_width = std::exp(std::min(bbox_deltas_data[i * len + 2], + kBBoxClipDefault)) * + anchor_width; + bbox_height = std::exp(std::min(bbox_deltas_data[i * len + 3], + kBBoxClipDefault)) * + anchor_height; + } + + proposals_data[i * len] = bbox_center_x - bbox_width / 2; + proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2; + proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1; + proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1; + } + // return proposals; +} + +template +static inline void ClipTiledBoxes(const platform::DeviceContext &ctx, + const Tensor &im_info, Tensor *boxes) { + T *boxes_data = boxes->mutable_data(ctx.GetPlace()); + const T *im_info_data = im_info.data(); + T zero(0); + for (int64_t i = 0; i < boxes->numel(); ++i) { + if (i % 4 == 0) { + boxes_data[i] = + std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero); + } else if (i % 4 == 1) { + boxes_data[i] = + std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero); + } else if (i % 4 == 2) { + boxes_data[i] = + std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero); + } else { + boxes_data[i] = + std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero); + } + } +} + +template +static inline void FilterBoxes(const platform::DeviceContext &ctx, + Tensor *boxes, float min_size, + const Tensor &im_info, Tensor *keep) { + const T *im_info_data = im_info.data(); + T *boxes_data = boxes->mutable_data(ctx.GetPlace()); + T im_scale = im_info_data[2]; + keep->Resize({boxes->dims()[0]}); + min_size = std::max(min_size, 1.0f); + int *keep_data = keep->mutable_data(ctx.GetPlace()); + + int keep_len = 0; + for (int i = 0; i < boxes->dims()[0]; ++i) { + T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + 1; + T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + 1; + T ws_origin_scale = + (boxes_data[4 * i + 2] - boxes_data[4 * i]) / im_scale + 1; + T hs_origin_scale = + (boxes_data[4 * i + 3] - boxes_data[4 * i + 1]) / im_scale + 1; + T x_ctr = boxes_data[4 * i] + ws / 2; + T y_ctr = boxes_data[4 * i + 1] + hs / 2; + if (ws_origin_scale >= min_size && hs_origin_scale >= min_size && + x_ctr <= im_info_data[1] && y_ctr <= im_info_data[0]) { + keep_data[keep_len++] = i; + } + } + keep->Resize({keep_len}); +} + +template +static inline std::vector> GetSortedScoreIndex( + const std::vector &scores) { + std::vector> sorted_indices; + sorted_indices.reserve(scores.size()); + for (size_t i = 0; i < scores.size(); ++i) { + sorted_indices.emplace_back(scores[i], i); + } + // Sort the score pair according to the scores in descending order + std::stable_sort(sorted_indices.begin(), sorted_indices.end(), + [](const std::pair &a, const std::pair &b) { + return a.first < b.first; + }); + return sorted_indices; +} + +template +static inline T BBoxArea(const T *box, bool normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + // If coordinate values are is invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return static_cast(0.); + } else { + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. + return (w + 1) * (h + 1); + } + } +} + +template +static inline T JaccardOverlap(const T *box1, const T *box2, bool normalized) { + if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || + box2[3] < box1[1]) { + return static_cast(0.); + } else { + const T inter_xmin = std::max(box1[0], box2[0]); + const T inter_ymin = std::max(box1[1], box2[1]); + const T inter_xmax = std::min(box1[2], box2[2]); + const T inter_ymax = std::min(box1[3], box2[3]); + const T inter_w = std::max(T(0), inter_xmax - inter_xmin + 1); + const T inter_h = std::max(T(0), inter_ymax - inter_ymin + 1); + const T inter_area = inter_w * inter_h; + const T bbox1_area = BBoxArea(box1, normalized); + const T bbox2_area = BBoxArea(box2, normalized); + return inter_area / (bbox1_area + bbox2_area - inter_area); + } +} + +template +static inline Tensor VectorToTensor(const std::vector &selected_indices, + int selected_num) { + Tensor keep_nms; + keep_nms.Resize({selected_num}); + auto *keep_data = keep_nms.mutable_data(platform::CPUPlace()); + for (int i = 0; i < selected_num; ++i) { + keep_data[i] = selected_indices[i]; + } + return keep_nms; +} + +template +static inline Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, + Tensor *scores, T nms_threshold, float eta) { + PADDLE_ENFORCE_NOT_NULL(bbox); + int64_t num_boxes = bbox->dims()[0]; + // 4: [xmin ymin xmax ymax] + int64_t box_size = bbox->dims()[1]; + + std::vector scores_data(num_boxes); + std::copy_n(scores->data(), num_boxes, scores_data.begin()); + std::vector> sorted_indices = + GetSortedScoreIndex(scores_data); + + std::vector selected_indices; + int selected_num = 0; + T adaptive_threshold = nms_threshold; + const T *bbox_data = bbox->data(); + while (sorted_indices.size() != 0) { + int idx = sorted_indices.back().second; + bool flag = true; + for (int kept_idx : selected_indices) { + if (flag) { + T overlap = JaccardOverlap(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, false); + flag = (overlap <= adaptive_threshold); + } else { + break; + } + } + if (flag) { + selected_indices.push_back(idx); + ++selected_num; + } + sorted_indices.erase(sorted_indices.end() - 1); + if (flag && eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= eta; + } + } + return VectorToTensor(selected_indices, selected_num); +} + +template +class GenerateProposalsKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *scores = context.Input("Scores"); + auto *bbox_deltas = context.Input("BboxDeltas"); + auto *im_info = context.Input("ImInfo"); + auto anchors = detail::Ref(context.Input("Anchors"), + "Cannot find input Anchors(%s) in scope", + context.Inputs("Anchors")[0]); + auto variances = detail::Ref(context.Input("Variances"), + "Cannot find input Variances(%s) in scope", + context.Inputs("Variances")[0]); + + auto *rpn_rois = context.Output("RpnRois"); + auto *rpn_roi_probs = context.Output("RpnRoiProbs"); + + int pre_nms_top_n = context.Attr("pre_nms_topN"); + int post_nms_top_n = context.Attr("post_nms_topN"); + float nms_thresh = context.Attr("nms_thresh"); + float min_size = context.Attr("min_size"); + float eta = context.Attr("eta"); + + auto &dev_ctx = + context.template device_context(); + + auto &scores_dim = scores->dims(); + int64_t num = scores_dim[0]; + int64_t c_score = scores_dim[1]; + int64_t h_score = scores_dim[2]; + int64_t w_score = scores_dim[3]; + + auto &bbox_dim = bbox_deltas->dims(); + int64_t c_bbox = bbox_dim[1]; + int64_t h_bbox = bbox_dim[2]; + int64_t w_bbox = bbox_dim[3]; + + rpn_rois->mutable_data({bbox_deltas->numel() / 4, 4}, + context.GetPlace()); + rpn_roi_probs->mutable_data({scores->numel(), 1}, context.GetPlace()); + + Tensor bbox_deltas_swap, scores_swap; + bbox_deltas_swap.mutable_data({num, h_bbox, w_bbox, c_bbox}, + dev_ctx.GetPlace()); + scores_swap.mutable_data({num, h_score, w_score, c_score}, + dev_ctx.GetPlace()); + + math::Transpose trans; + std::vector axis = {0, 2, 3, 1}; + trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis); + trans(dev_ctx, *scores, &scores_swap, axis); + + framework::LoD lod; + lod.resize(1); + auto &lod0 = lod[0]; + lod0.push_back(0); + anchors.Resize({anchors.numel() / 4, 4}); + variances.Resize({variances.numel() / 4, 4}); + + int64_t num_proposals = 0; + for (int64_t i = 0; i < num; ++i) { + Tensor im_info_slice = im_info->Slice(i, i + 1); + Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1); + Tensor scores_slice = scores_swap.Slice(i, i + 1); + + bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4}); + scores_slice.Resize({h_score * w_score * c_score, 1}); + + std::pair tensor_pair = + ProposalForOneImage(dev_ctx, im_info_slice, anchors, variances, + bbox_deltas_slice, scores_slice, pre_nms_top_n, + post_nms_top_n, nms_thresh, min_size, eta); + Tensor &proposals = tensor_pair.first; + Tensor &scores = tensor_pair.second; + + AppendProposals(rpn_rois, 4 * num_proposals, proposals); + AppendProposals(rpn_roi_probs, num_proposals, scores); + num_proposals += proposals.dims()[0]; + lod0.push_back(num_proposals); + } + rpn_rois->set_lod(lod); + rpn_roi_probs->set_lod(lod); + rpn_rois->Resize({num_proposals, 4}); + rpn_roi_probs->Resize({num_proposals, 1}); + } + + std::pair ProposalForOneImage( + const platform::CPUDeviceContext &ctx, const Tensor &im_info_slice, + const Tensor &anchors, const Tensor &variances, + const Tensor &bbox_deltas_slice, // [M, 4] + const Tensor &scores_slice, // [N, 1] + int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size, + float eta) const { + auto *scores_data = scores_slice.data(); + + // Sort index + Tensor index_t; + index_t.Resize({scores_slice.numel()}); + int *index = index_t.mutable_data(ctx.GetPlace()); + for (int i = 0; i < scores_slice.numel(); ++i) { + index[i] = i; + } + auto compare = [scores_data](const int64_t &i, const int64_t &j) { + return scores_data[i] > scores_data[j]; + }; + + if (pre_nms_top_n <= 0 || pre_nms_top_n >= scores_slice.numel()) { + std::sort(index, index + scores_slice.numel(), compare); + } else { + std::nth_element(index, index + pre_nms_top_n, + index + scores_slice.numel(), compare); + index_t.Resize({pre_nms_top_n}); + } + + Tensor scores_sel, bbox_sel, anchor_sel, var_sel; + scores_sel.mutable_data({index_t.numel(), 1}, ctx.GetPlace()); + bbox_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); + anchor_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); + var_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); + + CPUGather(ctx, scores_slice, index_t, &scores_sel); + CPUGather(ctx, bbox_deltas_slice, index_t, &bbox_sel); + CPUGather(ctx, anchors, index_t, &anchor_sel); + CPUGather(ctx, variances, index_t, &var_sel); + + Tensor proposals; + proposals.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); + BoxCoder(ctx, &anchor_sel, &bbox_sel, &var_sel, &proposals); + + ClipTiledBoxes(ctx, im_info_slice, &proposals); + + Tensor keep; + FilterBoxes(ctx, &proposals, min_size, im_info_slice, &keep); + + Tensor scores_filter; + bbox_sel.mutable_data({keep.numel(), 4}, ctx.GetPlace()); + scores_filter.mutable_data({keep.numel(), 1}, ctx.GetPlace()); + CPUGather(ctx, proposals, keep, &bbox_sel); + CPUGather(ctx, scores_sel, keep, &scores_filter); + if (nms_thresh <= 0) { + return std::make_pair(bbox_sel, scores_filter); + } + + Tensor keep_nms = NMS(ctx, &bbox_sel, &scores_filter, nms_thresh, eta); + + if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) { + keep_nms.Resize({post_nms_top_n}); + } + + proposals.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); + scores_sel.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); + CPUGather(ctx, bbox_sel, keep_nms, &proposals); + CPUGather(ctx, scores_filter, keep_nms, &scores_sel); + + return std::make_pair(proposals, scores_sel); + } +}; + +class GenerateProposalsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Scores", + "(Tensor) The scores from conv is in shape (N, A, H, W), " + "N is batch size, A is number of anchors, " + "H and W are height and width of the feature map"); + AddInput("BboxDeltas", + "(Tensor) Bounding box deltas from conv is in " + "shape (N, 4*A, H, W)."); + AddInput("ImInfo", + "(Tensor) Information for image reshape is in shape (N, 3), " + "in format (height, width, scale)"); + AddInput("Anchors", + "(Tensor) Bounding box anchors from anchor_generator_op " + "is in shape (A, H, W, 4)."); + AddInput("Variances", + "(Tensor) Bounding box variances with same shape as `Anchors`."); + + AddOutput("RpnRois", + "(LoDTensor), Output proposals with shape (rois_num, 4)."); + AddOutput("RpnRoiProbs", + "(LoDTensor) Scores of proposals with shape (rois_num, 1)."); + AddAttr("pre_nms_topN", + "Number of top scoring RPN proposals to keep before " + "applying NMS."); + AddAttr("post_nms_topN", + "Number of top scoring RPN proposals to keep after " + "applying NMS"); + AddAttr("nms_thresh", "NMS threshold used on RPN proposals."); + AddAttr("min_size", + "Proposal height and width both need to be greater " + "than this min_size."); + AddAttr("eta", "The parameter for adaptive NMS."); + AddComment(R"DOC( +This operator Generate bounding box proposals for Faster RCNN. +The propoasls are generated for a list of images based on image +score 'Scores', bounding box regression result 'BboxDeltas' as +well as predefined bounding box shapes 'anchors'. Greedy +non-maximum suppression is applied to generate the final bounding +boxes. + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(generate_proposals, ops::GenerateProposalsOp, + ops::GenerateProposalsOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(generate_proposals, ops::GenerateProposalsKernel, + ops::GenerateProposalsKernel); diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu new file mode 100644 index 00000000..43deb5f9 --- /dev/null +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -0,0 +1,466 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include "cub/cub.cuh" +#include "paddle/fluid/framework/mixed_vector.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/operators/gather.cu.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +namespace { + +#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +int const kThreadsPerBlock = sizeof(uint64_t) * 8; + +static const double kBBoxClipDefault = std::log(1000.0 / 16.0); + +struct RangeInitFunctor { + int start_; + int delta_; + int *out_; + __device__ void operator()(size_t i) { out_[i] = start_ + i * delta_; } +}; + +template +static void SortDescending(const platform::CUDADeviceContext &ctx, + const Tensor &value, Tensor *value_out, + Tensor *index_out) { + int num = static_cast(value.numel()); + Tensor index_in_t; + int *idx_in = index_in_t.mutable_data({num}, ctx.GetPlace()); + platform::ForRange for_range(ctx, num); + for_range(RangeInitFunctor{0, 1, idx_in}); + + int *idx_out = index_out->mutable_data({num}, ctx.GetPlace()); + + const T *keys_in = value.data(); + T *keys_out = value_out->mutable_data({num}, ctx.GetPlace()); + + // Determine temporary device storage requirements + size_t temp_storage_bytes = 0; + cub::DeviceRadixSort::SortPairsDescending( + nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num); + // Allocate temporary storage + auto place = boost::get(ctx.GetPlace()); + auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); + + // Run sorting operation + cub::DeviceRadixSort::SortPairsDescending( + d_temp_storage->ptr(), temp_storage_bytes, keys_in, keys_out, idx_in, + idx_out, num); +} + +template +struct BoxDecodeAndClipFunctor { + const T *anchor; + const T *deltas; + const T *var; + const int *index; + const T *im_info; + + T *proposals; + + BoxDecodeAndClipFunctor(const T *anchor, const T *deltas, const T *var, + const int *index, const T *im_info, T *proposals) + : anchor(anchor), + deltas(deltas), + var(var), + index(index), + im_info(im_info), + proposals(proposals) {} + + T bbox_clip_default{static_cast(kBBoxClipDefault)}; + + __device__ void operator()(size_t i) { + int k = index[i] * 4; + T axmin = anchor[k]; + T aymin = anchor[k + 1]; + T axmax = anchor[k + 2]; + T aymax = anchor[k + 3]; + + T w = axmax - axmin + 1.0; + T h = aymax - aymin + 1.0; + T cx = axmin + 0.5 * w; + T cy = aymin + 0.5 * h; + + T dxmin = deltas[k]; + T dymin = deltas[k + 1]; + T dxmax = deltas[k + 2]; + T dymax = deltas[k + 3]; + + T d_cx, d_cy, d_w, d_h; + if (var) { + d_cx = cx + dxmin * w * var[k]; + d_cy = cy + dymin * h * var[k + 1]; + d_w = exp(Min(dxmax * var[k + 2], bbox_clip_default)) * w; + d_h = exp(Min(dymax * var[k + 3], bbox_clip_default)) * h; + } else { + d_cx = cx + dxmin * w; + d_cy = cy + dymin * h; + d_w = exp(Min(dxmax, bbox_clip_default)) * w; + d_h = exp(Min(dymax, bbox_clip_default)) * h; + } + + T oxmin = d_cx - d_w * 0.5; + T oymin = d_cy - d_h * 0.5; + T oxmax = d_cx + d_w * 0.5 - 1.; + T oymax = d_cy + d_h * 0.5 - 1.; + + proposals[i * 4] = Max(Min(oxmin, im_info[1] - 1.), 0.); + proposals[i * 4 + 1] = Max(Min(oymin, im_info[0] - 1.), 0.); + proposals[i * 4 + 2] = Max(Min(oxmax, im_info[1] - 1.), 0.); + proposals[i * 4 + 3] = Max(Min(oymax, im_info[0] - 1.), 0.); + } + + __device__ __forceinline__ T Min(T a, T b) const { return a > b ? b : a; } + + __device__ __forceinline__ T Max(T a, T b) const { return a > b ? a : b; } +}; + +template +static __global__ void FilterBBoxes(const T *bboxes, const T *im_info, + const T min_size, const int num, + int *keep_num, int *keep) { + T im_h = im_info[0]; + T im_w = im_info[1]; + T im_scale = im_info[2]; + + int cnt = 0; + __shared__ int keep_index[BlockSize]; + + CUDA_1D_KERNEL_LOOP(i, num) { + keep_index[threadIdx.x] = -1; + __syncthreads(); + + int k = i * 4; + T xmin = bboxes[k]; + T ymin = bboxes[k + 1]; + T xmax = bboxes[k + 2]; + T ymax = bboxes[k + 3]; + + T w = xmax - xmin + 1.0; + T h = ymax - ymin + 1.0; + T cx = xmin + w / 2.; + T cy = ymin + h / 2.; + + T w_s = (xmax - xmin) / im_scale + 1.; + T h_s = (ymax - ymin) / im_scale + 1.; + + if (w_s >= min_size && h_s >= min_size && cx <= im_w && cy <= im_h) { + keep_index[threadIdx.x] = i; + } + __syncthreads(); + if (threadIdx.x == 0) { + int size = (num - i) < BlockSize ? num - i : BlockSize; + for (int j = 0; j < size; ++j) { + if (keep_index[j] > -1) { + keep[cnt++] = keep_index[j]; + } + } + } + __syncthreads(); + } + if (threadIdx.x == 0) { + keep_num[0] = cnt; + } +} + +static __device__ inline float IoU(const float *a, const float *b) { + float left = max(a[0], b[0]), right = min(a[2], b[2]); + float top = max(a[1], b[1]), bottom = min(a[3], b[3]); + float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); + float inter_s = width * height; + float s_a = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); + float s_b = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); + return inter_s / (s_a + s_b - inter_s); +} + +static __global__ void NMSKernel(const int n_boxes, + const float nms_overlap_thresh, + const float *dev_boxes, uint64_t *dev_mask) { + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + const int row_size = + min(n_boxes - row_start * kThreadsPerBlock, kThreadsPerBlock); + const int col_size = + min(n_boxes - col_start * kThreadsPerBlock, kThreadsPerBlock); + + __shared__ float block_boxes[kThreadsPerBlock * 4]; + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 4 + 0] = + dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 0]; + block_boxes[threadIdx.x * 4 + 1] = + dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 1]; + block_boxes[threadIdx.x * 4 + 2] = + dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 2]; + block_boxes[threadIdx.x * 4 + 3] = + dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 3]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = kThreadsPerBlock * row_start + threadIdx.x; + const float *cur_box = dev_boxes + cur_box_idx * 4; + int i = 0; + uint64_t t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (IoU(cur_box, block_boxes + i * 4) > nms_overlap_thresh) { + t |= 1ULL << i; + } + } + const int col_blocks = DIVUP(n_boxes, kThreadsPerBlock); + dev_mask[cur_box_idx * col_blocks + col_start] = t; + } +} + +template +static void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals, + const Tensor &sorted_indices, const T nms_threshold, + Tensor *keep_out) { + int boxes_num = proposals.dims()[0]; + PADDLE_ENFORCE_EQ(boxes_num, sorted_indices.dims()[0]); + + const int col_blocks = DIVUP(boxes_num, kThreadsPerBlock); + dim3 blocks(DIVUP(boxes_num, kThreadsPerBlock), + DIVUP(boxes_num, kThreadsPerBlock)); + dim3 threads(kThreadsPerBlock); + + const T *boxes = proposals.data(); + auto place = boost::get(ctx.GetPlace()); + framework::Vector mask(boxes_num * col_blocks); + NMSKernel<<>>( + boxes_num, nms_threshold, boxes, + mask.CUDAMutableData(boost::get(ctx.GetPlace()))); + + std::vector remv(col_blocks); + memset(&remv[0], 0, sizeof(uint64_t) * col_blocks); + + std::vector keep_vec; + int num_to_keep = 0; + for (int i = 0; i < boxes_num; i++) { + int nblock = i / kThreadsPerBlock; + int inblock = i % kThreadsPerBlock; + + if (!(remv[nblock] & (1ULL << inblock))) { + ++num_to_keep; + keep_vec.push_back(i); + uint64_t *p = &mask[0] + i * col_blocks; + for (int j = nblock; j < col_blocks; j++) { + remv[j] |= p[j]; + } + } + } + int *keep = keep_out->mutable_data({num_to_keep}, ctx.GetPlace()); + memory::Copy(place, keep, platform::CPUPlace(), keep_vec.data(), + sizeof(int) * num_to_keep, ctx.stream()); + ctx.Wait(); +} + +template +static std::pair ProposalForOneImage( + const platform::CUDADeviceContext &ctx, const Tensor &im_info, + const Tensor &anchors, const Tensor &variances, + const Tensor &bbox_deltas, // [M, 4] + const Tensor &scores, // [N, 1] + int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size, + float eta) { + // 1. pre nms + Tensor scores_sort, index_sort; + SortDescending(ctx, scores, &scores_sort, &index_sort); + int num = scores.numel(); + int pre_nms_num = (pre_nms_top_n <= 0 || pre_nms_top_n > num) ? scores.numel() + : pre_nms_top_n; + scores_sort.Resize({pre_nms_num, 1}); + index_sort.Resize({pre_nms_num, 1}); + + // 2. box decode and clipping + Tensor proposals; + proposals.mutable_data({pre_nms_num, 4}, ctx.GetPlace()); + + { + platform::ForRange for_range(ctx, pre_nms_num); + for_range(BoxDecodeAndClipFunctor{ + anchors.data(), bbox_deltas.data(), variances.data(), + index_sort.data(), im_info.data(), proposals.data()}); + } + + // 3. filter + Tensor keep_index, keep_num_t; + keep_index.mutable_data({pre_nms_num}, ctx.GetPlace()); + keep_num_t.mutable_data({1}, ctx.GetPlace()); + min_size = std::max(min_size, 1.0f); + auto stream = ctx.stream(); + FilterBBoxes<<<1, 512, 0, stream>>>( + proposals.data(), im_info.data(), min_size, pre_nms_num, + keep_num_t.data(), keep_index.data()); + int keep_num; + const auto gpu_place = boost::get(ctx.GetPlace()); + memory::Copy(platform::CPUPlace(), &keep_num, gpu_place, + keep_num_t.data(), sizeof(int), ctx.stream()); + ctx.Wait(); + keep_index.Resize({keep_num}); + + Tensor scores_filter, proposals_filter; + proposals_filter.mutable_data({keep_num, 4}, ctx.GetPlace()); + scores_filter.mutable_data({keep_num, 1}, ctx.GetPlace()); + GPUGather(ctx, proposals, keep_index, &proposals_filter); + GPUGather(ctx, scores_sort, keep_index, &scores_filter); + + if (nms_thresh <= 0) { + return std::make_pair(proposals_filter, scores_filter); + } + + // 4. nms + Tensor keep_nms; + NMS(ctx, proposals_filter, keep_index, nms_thresh, &keep_nms); + if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) { + keep_nms.Resize({post_nms_top_n}); + } + + Tensor scores_nms, proposals_nms; + proposals_nms.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); + scores_nms.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); + GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); + GPUGather(ctx, scores_filter, keep_nms, &scores_nms); + + return std::make_pair(proposals_nms, scores_nms); +} +} // namespace + +template +class CUDAGenerateProposalsKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *scores = context.Input("Scores"); + auto *bbox_deltas = context.Input("BboxDeltas"); + auto *im_info = context.Input("ImInfo"); + auto anchors = detail::Ref(context.Input("Anchors"), + "Cannot find input Anchors(%s) in scope", + context.Inputs("Anchors")[0]); + auto variances = detail::Ref(context.Input("Variances"), + "Cannot find input Variances(%s) in scope", + context.Inputs("Variances")[0]); + + auto *rpn_rois = context.Output("RpnRois"); + auto *rpn_roi_probs = context.Output("RpnRoiProbs"); + + int pre_nms_top_n = context.Attr("pre_nms_topN"); + int post_nms_top_n = context.Attr("post_nms_topN"); + float nms_thresh = context.Attr("nms_thresh"); + float min_size = context.Attr("min_size"); + float eta = context.Attr("eta"); + PADDLE_ENFORCE_GE(eta, 1., "Not support adaptive NMS."); + + auto &dev_ctx = context.template device_context(); + + auto scores_dim = scores->dims(); + int64_t num = scores_dim[0]; + int64_t c_score = scores_dim[1]; + int64_t h_score = scores_dim[2]; + int64_t w_score = scores_dim[3]; + + auto bbox_dim = bbox_deltas->dims(); + int64_t c_bbox = bbox_dim[1]; + int64_t h_bbox = bbox_dim[2]; + int64_t w_bbox = bbox_dim[3]; + + Tensor bbox_deltas_swap, scores_swap; + bbox_deltas_swap.mutable_data({num, h_bbox, w_bbox, c_bbox}, + dev_ctx.GetPlace()); + scores_swap.mutable_data({num, h_score, w_score, c_score}, + dev_ctx.GetPlace()); + + math::Transpose trans; + std::vector axis = {0, 2, 3, 1}; + trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis); + trans(dev_ctx, *scores, &scores_swap, axis); + + anchors.Resize({anchors.numel() / 4, 4}); + variances.Resize({variances.numel() / 4, 4}); + + rpn_rois->mutable_data({bbox_deltas->numel() / 4, 4}, + context.GetPlace()); + rpn_roi_probs->mutable_data({scores->numel(), 1}, context.GetPlace()); + + T *rpn_rois_data = rpn_rois->data(); + T *rpn_roi_probs_data = rpn_roi_probs->data(); + + auto place = boost::get(dev_ctx.GetPlace()); + + int64_t num_proposals = 0; + std::vector offset(1, 0); + for (int64_t i = 0; i < num; ++i) { + Tensor im_info_slice = im_info->Slice(i, i + 1); + Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1); + Tensor scores_slice = scores_swap.Slice(i, i + 1); + + bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4}); + scores_slice.Resize({h_score * w_score * c_score, 1}); + + std::pair box_score_pair = + ProposalForOneImage(dev_ctx, im_info_slice, anchors, variances, + bbox_deltas_slice, scores_slice, pre_nms_top_n, + post_nms_top_n, nms_thresh, min_size, eta); + + Tensor &proposals = box_score_pair.first; + Tensor &scores = box_score_pair.second; + + memory::Copy(place, rpn_rois_data + num_proposals * 4, place, + proposals.data(), sizeof(T) * proposals.numel(), + dev_ctx.stream()); + memory::Copy(place, rpn_roi_probs_data + num_proposals, place, + scores.data(), sizeof(T) * scores.numel(), + dev_ctx.stream()); + dev_ctx.Wait(); + num_proposals += proposals.dims()[0]; + offset.emplace_back(num_proposals); + } + framework::LoD lod; + lod.emplace_back(offset); + rpn_rois->set_lod(lod); + rpn_roi_probs->set_lod(lod); + rpn_rois->Resize({num_proposals, 4}); + rpn_roi_probs->Resize({num_proposals, 1}); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(generate_proposals, + ops::CUDAGenerateProposalsKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/fluid/operators/detection/gpc.cc b/paddle/fluid/operators/detection/gpc.cc new file mode 100644 index 00000000..f46aaf7d --- /dev/null +++ b/paddle/fluid/operators/detection/gpc.cc @@ -0,0 +1,2206 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/** + * @file src/gpc.cpp + * @author huhan02(com@baidu.com) + * @date 2015/12/18 14:17:30 + * @brief + * + * @modified by sunyipeng + * @email sunyipeng@baidu.com + * @date 2018/6/12 + **/ + +#include "paddle/fluid/operators/detection/gpc.h" +#include "paddle/fluid/platform/enforce.h" + +namespace gpc { + +typedef struct lmt_shape { /* Local minima table */ + double y; /* Y coordinate at local minimum */ + edge_node *first_bound; /* Pointer to bound list */ + struct lmt_shape *next; /* Pointer to next local minimum */ +} lmt_node; + +typedef struct sbt_t_shape { /* Scanbeam tree */ + double y; /* Scanbeam node y value */ + struct sbt_t_shape *less; /* Pointer to nodes with lower y */ + struct sbt_t_shape *more; /* Pointer to nodes with higher y */ +} sb_tree; + +typedef struct it_shape { /* Intersection table */ + edge_node *ie[2]; /* Intersecting edge (bundle) pair */ + gpc_vertex point; /* Point of intersection */ + struct it_shape *next; /* The next intersection table node */ +} it_node; + +typedef struct st_shape { /* Sorted edge table */ + edge_node *edge; /* Pointer to AET edge */ + double xb; /* Scanbeam bottom x coordinate */ + double xt; /* Scanbeam top x coordinate */ + double dx; /* Change in x for a unit y increase */ + struct st_shape *prev; /* Previous edge in sorted list */ +} st_node; + +typedef struct bbox_shape { /* Contour axis-aligned bounding box */ + double xmin; /* Minimum x coordinate */ + double ymin; /* Minimum y coordinate */ + double xmax; /* Maximum x coordinate */ + double ymax; /* Maximum y coordinate */ +} bbox; + +/* +=========================================================================== + Global Data +=========================================================================== +*/ + +/* Horizontal edge state transitions within scanbeam boundary */ +const h_state next_h_state[3][6] = { + /* ABOVE BELOW CROSS */ + /* L R L R L R */ + /* NH */ + {BH, TH, TH, BH, NH, NH}, + /* BH */ + {NH, NH, NH, NH, TH, TH}, + /* TH */ + {NH, NH, NH, NH, BH, BH}}; + +/* +=========================================================================== + Private Functions +=========================================================================== +*/ + +static void reset_it(it_node **it) { + it_node *itn; + + while (*it) { + itn = (*it)->next; + gpc_free(*it); + *it = itn; + } +} + +static void reset_lmt(lmt_node **lmt) { + lmt_node *lmtn; + + while (*lmt) { + lmtn = (*lmt)->next; + gpc_free(*lmt); + *lmt = lmtn; + } +} + +static void insert_bound(edge_node **b, edge_node *e) { + edge_node *existing_bound = NULL; + + if (!*b) { + /* Link node e to the tail of the list */ + *b = e; + } else { + /* Do primary sort on the x field */ + if (e[0].bot.x < (*b)[0].bot.x) { + /* Insert a new node mid-list */ + existing_bound = *b; + *b = e; + (*b)->next_bound = existing_bound; + } else { + if (e[0].bot.x == (*b)[0].bot.x) { + /* Do secondary sort on the dx field */ + if (e[0].dx < (*b)[0].dx) { + /* Insert a new node mid-list */ + existing_bound = *b; + *b = e; + (*b)->next_bound = existing_bound; + } else { + /* Head further down the list */ + insert_bound(&((*b)->next_bound), e); + } + } else { + /* Head further down the list */ + insert_bound(&((*b)->next_bound), e); + } + } + } +} + +static edge_node **bound_list(lmt_node **lmt, double y) { + lmt_node *existing_node; + + if (!*lmt) { + /* Add node onto the tail end of the LMT */ + gpc_malloc(*lmt, sizeof(lmt_node), + const_cast("LMT insertion")); + (*lmt)->y = y; + (*lmt)->first_bound = NULL; + (*lmt)->next = NULL; + return &((*lmt)->first_bound); + } else if (y < (*lmt)->y) { + /* Insert a new LMT node before the current node */ + existing_node = *lmt; + gpc_malloc(*lmt, sizeof(lmt_node), + const_cast("LMT insertion")); + (*lmt)->y = y; + (*lmt)->first_bound = NULL; + (*lmt)->next = existing_node; + return &((*lmt)->first_bound); + } else { + if (y > (*lmt)->y) { + /* Head further up the LMT */ + return bound_list(&((*lmt)->next), y); + } else { + /* Use this existing LMT node */ + return &((*lmt)->first_bound); + } + } +} + +static void add_to_sbtree(int *entries, sb_tree **sbtree, double y) { + if (!*sbtree) { + /* Add a new tree node here */ + gpc_malloc(*sbtree, sizeof(sb_tree), + const_cast("scanbeam tree insertion")); + (*sbtree)->y = y; + (*sbtree)->less = NULL; + (*sbtree)->more = NULL; + (*entries)++; + } else { + if ((*sbtree)->y > y) { + /* Head into the 'less' sub-tree */ + add_to_sbtree(entries, &((*sbtree)->less), y); + } else { + if ((*sbtree)->y < y) { + /* Head into the 'more' sub-tree */ + add_to_sbtree(entries, &((*sbtree)->more), y); + } + } + } +} + +static void build_sbt(int *entries, double *sbt, sb_tree *sbtree) { + if (sbtree->less) { + build_sbt(entries, sbt, sbtree->less); + } + sbt[*entries] = sbtree->y; + (*entries)++; + if (sbtree->more) { + build_sbt(entries, sbt, sbtree->more); + } +} + +static void free_sbtree(sb_tree **sbtree) { + if (*sbtree) { + free_sbtree(&((*sbtree)->less)); + free_sbtree(&((*sbtree)->more)); + gpc_free(*sbtree); + } +} + +static int count_optimal_vertices(gpc_vertex_list c) { + int result = 0; + int i = 0; + + /* Ignore non-contributing contours */ + if (c.num_vertices > 0) { + for (i = 0; i < c.num_vertices; i++) { + /* Ignore superfluous vertices embedded in horizontal edges */ + if (gpc_optimal(c.vertex, i, c.num_vertices)) { + result++; + } + } + } + return result; +} + +static edge_node *build_lmt(lmt_node **lmt, sb_tree **sbtree, int *sbt_entries, + gpc_polygon *p, int type, gpc_op op) { + int c = 0; + int i = 0; + int min = 0; + int max = 0; + int num_edges = 0; + int v = 0; + int num_vertices = 0; + int total_vertices = 0; + int e_index = 0; + edge_node *e = NULL; + edge_node *edge_table = NULL; + + for (c = 0; c < p->num_contours; c++) { + total_vertices += count_optimal_vertices(p->contour[c]); + } + + /* Create the entire input polygon edge table in one go */ + gpc_malloc(edge_table, total_vertices * sizeof(edge_node), + const_cast("edge table creation")); + + for (c = 0; c < p->num_contours; c++) { + if (p->contour[c].num_vertices < 0) { + /* Ignore the non-contributing contour and repair the vertex count */ + p->contour[c].num_vertices = -p->contour[c].num_vertices; + } else { + /* Perform contour optimisation */ + num_vertices = 0; + for (i = 0; i < p->contour[c].num_vertices; i++) { + if (gpc_optimal(p->contour[c].vertex, i, p->contour[c].num_vertices)) { + edge_table[num_vertices].vertex.x = p->contour[c].vertex[i].x; + edge_table[num_vertices].vertex.y = p->contour[c].vertex[i].y; + + /* Record vertex in the scanbeam table */ + add_to_sbtree(sbt_entries, sbtree, edge_table[num_vertices].vertex.y); + + num_vertices++; + } + } + + /* Do the contour forward pass */ + for (min = 0; min < num_vertices; min++) { + /* If a forward local minimum... */ + if (gpc_fwd_min(edge_table, min, num_vertices)) { + /* Search for the next local maximum... */ + num_edges = 1; + max = gpc_next_index(min, num_vertices); + while (gpc_not_fmax(edge_table, max, num_vertices)) { + num_edges++; + max = gpc_next_index(max, num_vertices); + } + + /* Build the next edge list */ + e = &edge_table[e_index]; + e_index += num_edges; + v = min; + e[0].bstate[BELOW] = UNBUNDLED; + e[0].bundle[BELOW][CLIP] = 0; + e[0].bundle[BELOW][SUBJ] = 0; + for (i = 0; i < num_edges; i++) { + e[i].xb = edge_table[v].vertex.x; + e[i].bot.x = edge_table[v].vertex.x; + e[i].bot.y = edge_table[v].vertex.y; + + v = gpc_next_index(v, num_vertices); + + e[i].top.x = edge_table[v].vertex.x; + e[i].top.y = edge_table[v].vertex.y; + e[i].dx = (edge_table[v].vertex.x - e[i].bot.x) / + (e[i].top.y - e[i].bot.y); + e[i].type = type; + e[i].outp[ABOVE] = NULL; + e[i].outp[BELOW] = NULL; + e[i].next = NULL; + e[i].prev = NULL; + e[i].succ = + ((num_edges > 1) && (i < (num_edges - 1))) ? &(e[i + 1]) : NULL; + e[i].pred = ((num_edges > 1) && (i > 0)) ? &(e[i - 1]) : NULL; + e[i].next_bound = NULL; + e[i].bside[CLIP] = (op == GPC_DIFF) ? RIGHT : LEFT; + e[i].bside[SUBJ] = LEFT; + } + insert_bound(bound_list(lmt, edge_table[min].vertex.y), e); + } + } + + /* Do the contour reverse pass */ + for (min = 0; min < num_vertices; min++) { + /* If a reverse local minimum... */ + if (gpc_rev_min(edge_table, min, num_vertices)) { + /* Search for the previous local maximum... */ + num_edges = 1; + max = gpc_prev_index(min, num_vertices); + while (gpc_not_rmax(edge_table, max, num_vertices)) { + num_edges++; + max = gpc_prev_index(max, num_vertices); + } + + /* Build the previous edge list */ + e = &edge_table[e_index]; + e_index += num_edges; + v = min; + e[0].bstate[BELOW] = UNBUNDLED; + e[0].bundle[BELOW][CLIP] = 0; + e[0].bundle[BELOW][SUBJ] = 0; + for (i = 0; i < num_edges; i++) { + e[i].xb = edge_table[v].vertex.x; + e[i].bot.x = edge_table[v].vertex.x; + e[i].bot.y = edge_table[v].vertex.y; + + v = gpc_prev_index(v, num_vertices); + + e[i].top.x = edge_table[v].vertex.x; + e[i].top.y = edge_table[v].vertex.y; + e[i].dx = (edge_table[v].vertex.x - e[i].bot.x) / + (e[i].top.y - e[i].bot.y); + e[i].type = type; + e[i].outp[ABOVE] = NULL; + e[i].outp[BELOW] = NULL; + e[i].next = NULL; + e[i].prev = NULL; + e[i].succ = + ((num_edges > 1) && (i < (num_edges - 1))) ? &(e[i + 1]) : NULL; + e[i].pred = ((num_edges > 1) && (i > 0)) ? &(e[i - 1]) : NULL; + e[i].next_bound = NULL; + e[i].bside[CLIP] = (op == GPC_DIFF) ? RIGHT : LEFT; + e[i].bside[SUBJ] = LEFT; + } + insert_bound(bound_list(lmt, edge_table[min].vertex.y), e); + } + } + } + } + return edge_table; +} // NOLINT + +static void add_edge_to_aet(edge_node **aet, edge_node *edge, edge_node *prev) { + if (!*aet) { + /* Append edge onto the tail end of the AET */ + *aet = edge; + edge->prev = prev; + edge->next = NULL; + } else { + /* Do primary sort on the xb field */ + if (edge->xb < (*aet)->xb) { + /* Insert edge here (before the AET edge) */ + edge->prev = prev; + edge->next = *aet; + (*aet)->prev = edge; + *aet = edge; + } else { + if (edge->xb == (*aet)->xb) { + /* Do secondary sort on the dx field */ + if (edge->dx < (*aet)->dx) { + /* Insert edge here (before the AET edge) */ + edge->prev = prev; + edge->next = *aet; + (*aet)->prev = edge; + *aet = edge; + } else { + /* Head further into the AET */ + add_edge_to_aet(&((*aet)->next), edge, *aet); + } + } else { + /* Head further into the AET */ + add_edge_to_aet(&((*aet)->next), edge, *aet); + } + } + } +} + +static void add_intersection(it_node **it, edge_node *edge0, edge_node *edge1, + double x, double y) { + it_node *existing_node; + + if (!*it) { + /* Append a new node to the tail of the list */ + gpc_malloc(*it, sizeof(it_node), + const_cast("IT insertion")); + (*it)->ie[0] = edge0; + (*it)->ie[1] = edge1; + (*it)->point.x = x; + (*it)->point.y = y; + (*it)->next = NULL; + } else { + if ((*it)->point.y > y) { + /* Insert a new node mid-list */ + existing_node = *it; + gpc_malloc(*it, sizeof(it_node), + const_cast("IT insertion")); + (*it)->ie[0] = edge0; + (*it)->ie[1] = edge1; + (*it)->point.x = x; + (*it)->point.y = y; + (*it)->next = existing_node; + } else { + /* Head further down the list */ + add_intersection(&((*it)->next), edge0, edge1, x, y); + } + } +} + +static void add_st_edge(st_node **st, it_node **it, edge_node *edge, + double dy) { + st_node *existing_node; + double den = 0.0; + double r = 0.0; + double x = 0.0; + double y = 0.0; + + if (!*st) { + /* Append edge onto the tail end of the ST */ + gpc_malloc(*st, sizeof(st_node), + const_cast("ST insertion")); + (*st)->edge = edge; + (*st)->xb = edge->xb; + (*st)->xt = edge->xt; + (*st)->dx = edge->dx; + (*st)->prev = NULL; + } else { + den = ((*st)->xt - (*st)->xb) - (edge->xt - edge->xb); + + /* If new edge and ST edge don't cross */ + if ((edge->xt >= (*st)->xt) || (edge->dx == (*st)->dx) || + (fabs(den) <= DBL_EPSILON)) { + /* No intersection - insert edge here (before the ST edge) */ + existing_node = *st; + gpc_malloc(*st, sizeof(st_node), + const_cast("ST insertion")); + (*st)->edge = edge; + (*st)->xb = edge->xb; + (*st)->xt = edge->xt; + (*st)->dx = edge->dx; + (*st)->prev = existing_node; + } else { + /* Compute intersection between new edge and ST edge */ + r = (edge->xb - (*st)->xb) / den; + x = (*st)->xb + r * ((*st)->xt - (*st)->xb); + y = r * dy; + + /* Insert the edge pointers and the intersection point in the IT */ + add_intersection(it, (*st)->edge, edge, x, y); + + /* Head further into the ST */ + add_st_edge(&((*st)->prev), it, edge, dy); + } + } +} + +static void build_intersection_table(it_node **it, edge_node *aet, double dy) { + st_node *st; + st_node *stp; + edge_node *edge = NULL; + + /* Build intersection table for the current scanbeam */ + reset_it(it); + st = NULL; + + /* Process each AET edge */ + for (edge = aet; edge; edge = edge->next) { + if ((edge->bstate[ABOVE] == BUNDLE_HEAD) || edge->bundle[ABOVE][CLIP] || + edge->bundle[ABOVE][SUBJ]) { + add_st_edge(&st, it, edge, dy); + } + } + + /* Free the sorted edge table */ + while (st) { + stp = st->prev; + gpc_free(st); + st = stp; + } +} + +static int count_contours(polygon_node *polygon) { + int nc = 0; + int nv = 0; + vertex_node *v = NULL; + vertex_node *nextv = NULL; + + for (nc = 0; polygon; polygon = polygon->next) { + if (polygon->active) { + /* Count the vertices in the current contour */ + nv = 0; + for (v = polygon->proxy->v[LEFT]; v; v = v->next) { + nv++; + } + + /* Record valid vertex counts in the active field */ + if (nv > 2) { + polygon->active = nv; + nc++; + } else { + /* Invalid contour: just free the heap */ + for (v = polygon->proxy->v[LEFT]; v; v = nextv) { + nextv = v->next; + gpc_free(v); + } + polygon->active = 0; + } + } + } + return nc; +} + +static void add_left(polygon_node *p, double x, double y) { + vertex_node *nv = NULL; + + /* Create a new vertex node and set its fields */ + gpc_malloc(nv, sizeof(vertex_node), + const_cast("vertex node creation")); + nv->x = x; + nv->y = y; + + /* Add vertex nv to the left end of the polygon's vertex list */ + nv->next = p->proxy->v[LEFT]; + + /* Update proxy->[LEFT] to point to nv */ + p->proxy->v[LEFT] = nv; +} + +static void merge_left(polygon_node *p, polygon_node *q, polygon_node *list) { + polygon_node *target = NULL; + + /* Label contour as a hole */ + q->proxy->hole = 1; + + if (p->proxy != q->proxy) { + /* Assign p's vertex list to the left end of q's list */ + p->proxy->v[RIGHT]->next = q->proxy->v[LEFT]; + q->proxy->v[LEFT] = p->proxy->v[LEFT]; + + /* Redirect any p->proxy references to q->proxy */ + + for (target = p->proxy; list; list = list->next) { + if (list->proxy == target) { + list->active = 0; + list->proxy = q->proxy; + } + } + } +} + +static void add_right(polygon_node *p, double x, double y) { + vertex_node *nv = NULL; + + /* Create a new vertex node and set its fields */ + gpc_malloc(nv, sizeof(vertex_node), + const_cast("vertex node creation")); + nv->x = x; + nv->y = y; + nv->next = NULL; + + /* Add vertex nv to the right end of the polygon's vertex list */ + p->proxy->v[RIGHT]->next = nv; + + /* Update proxy->v[RIGHT] to point to nv */ + p->proxy->v[RIGHT] = nv; +} + +static void merge_right(polygon_node *p, polygon_node *q, polygon_node *list) { + polygon_node *target = NULL; + + /* Label contour as external */ + q->proxy->hole = 0; + + if (p->proxy != q->proxy) { + /* Assign p's vertex list to the right end of q's list */ + q->proxy->v[RIGHT]->next = p->proxy->v[LEFT]; + q->proxy->v[RIGHT] = p->proxy->v[RIGHT]; + + /* Redirect any p->proxy references to q->proxy */ + for (target = p->proxy; list; list = list->next) { + if (list->proxy == target) { + list->active = 0; + list->proxy = q->proxy; + } + } + } +} + +static void add_local_min(polygon_node **p, edge_node *edge, double x, + double y) { + polygon_node *existing_min = NULL; + vertex_node *nv = NULL; + + existing_min = *p; + + gpc_malloc(*p, sizeof(polygon_node), + const_cast("polygon node creation")); + + /* Create a new vertex node and set its fields */ + gpc_malloc(nv, sizeof(vertex_node), + const_cast("vertex node creation")); + nv->x = x; + nv->y = y; + nv->next = NULL; + + /* Initialise proxy to point to p itself */ + (*p)->proxy = (*p); + (*p)->active = 1; + (*p)->next = existing_min; + + /* Make v[LEFT] and v[RIGHT] point to new vertex nv */ + (*p)->v[LEFT] = nv; + (*p)->v[RIGHT] = nv; + + /* Assign polygon p to the edge */ + edge->outp[ABOVE] = *p; +} + +static int count_tristrips(polygon_node *tn) { + int total = 0; + + for (total = 0; tn; tn = tn->next) { + if (tn->active > 2) { + total++; + } + } + return total; +} + +void add_vertex(vertex_node **t, double x, double y) { + if (!(*t)) { + gpc_malloc(*t, sizeof(vertex_node), + const_cast("tristrip vertex creation")); + (*t)->x = x; + (*t)->y = y; + (*t)->next = NULL; + } else { + /* Head further down the list */ + add_vertex(&((*t)->next), x, y); + } +} + +void gpc_vertex_create(edge_node *e, int p, int s, double x, double y) { + add_vertex(&(e->outp[p]->v[s]), x, y); + e->outp[p]->active++; +} + +static void new_tristrip(polygon_node **tn, edge_node *edge, double x, + double y) { + if (!(*tn)) { + gpc_malloc(*tn, sizeof(polygon_node), + const_cast("tristrip node creation")); + (*tn)->next = NULL; + (*tn)->v[LEFT] = NULL; + (*tn)->v[RIGHT] = NULL; + (*tn)->active = 1; + add_vertex(&((*tn)->v[LEFT]), x, y); + edge->outp[ABOVE] = *tn; + } else { + /* Head further down the list */ + new_tristrip(&((*tn)->next), edge, x, y); + } +} + +static bbox *create_contour_bboxes(gpc_polygon *p) { + bbox *box; + int c = 0; + int v = 0; + + gpc_malloc(box, p->num_contours * sizeof(bbox), + const_cast("Bounding box creation")); + PADDLE_ENFORCE_NOT_NULL(box); + + /* Construct contour bounding boxes */ + for (c = 0; c < p->num_contours; c++) { + /* Initialise bounding box extent */ + box[c].xmin = DBL_MAX; + box[c].ymin = DBL_MAX; + box[c].xmax = -DBL_MAX; + box[c].ymax = -DBL_MAX; + + for (v = 0; v < p->contour[c].num_vertices; v++) { + /* Adjust bounding box */ + if (p->contour[c].vertex[v].x < box[c].xmin) { + box[c].xmin = p->contour[c].vertex[v].x; + } + if (p->contour[c].vertex[v].y < box[c].ymin) { + box[c].ymin = p->contour[c].vertex[v].y; + } + if (p->contour[c].vertex[v].x > box[c].xmax) { + box[c].xmax = p->contour[c].vertex[v].x; + } + if (p->contour[c].vertex[v].y > box[c].ymax) { + box[c].ymax = p->contour[c].vertex[v].y; + } + } + } + return box; +} + +static void minimax_test(gpc_polygon *subj, gpc_polygon *clip, gpc_op op) { + bbox *s_bbox; + bbox *c_bbox; + int s = 0; + int c = 0; + int *o_table = NULL; + int overlap = 0; + + s_bbox = create_contour_bboxes(subj); + c_bbox = create_contour_bboxes(clip); + + gpc_malloc(o_table, + subj->num_contours * clip->num_contours * sizeof(int), + const_cast("overlap table creation")); + + /* Check all subject contour bounding boxes against clip boxes */ + for (s = 0; s < subj->num_contours; s++) { + for (c = 0; c < clip->num_contours; c++) { + o_table[c * subj->num_contours + s] = + (!((s_bbox[s].xmax < c_bbox[c].xmin) || + (s_bbox[s].xmin > c_bbox[c].xmax))) && + (!((s_bbox[s].ymax < c_bbox[c].ymin) || + (s_bbox[s].ymin > c_bbox[c].ymax))); + } + } + + /* For each clip contour, search for any subject contour overlaps */ + for (c = 0; c < clip->num_contours; c++) { + overlap = 0; + for (s = 0; (!overlap) && (s < subj->num_contours); s++) { + overlap = o_table[c * subj->num_contours + s]; + } + + if (!overlap) { + /* Flag non contributing status by negating vertex count */ + clip->contour[c].num_vertices = -clip->contour[c].num_vertices; + } + } + + if (op == GPC_INT) { + /* For each subject contour, search for any clip contour overlaps */ + for (s = 0; s < subj->num_contours; s++) { + overlap = 0; + for (c = 0; (!overlap) && (c < clip->num_contours); c++) { + overlap = o_table[c * subj->num_contours + s]; + } + + if (!overlap) { + /* Flag non contributing status by negating vertex count */ + subj->contour[s].num_vertices = -subj->contour[s].num_vertices; + } + } + } + + gpc_free(s_bbox); + gpc_free(c_bbox); + gpc_free(o_table); +} + +/* +=========================================================================== + Public Functions +=========================================================================== +*/ + +void gpc_free_polygon(gpc_polygon *p) { + int c = 0; + + for (c = 0; c < p->num_contours; c++) { + gpc_free(p->contour[c].vertex); + } + gpc_free(p->hole); + gpc_free(p->contour); + p->num_contours = 0; +} + +/* +void gpc_read_polygon(FILE *fp, int read_hole_flags, gpc_polygon *p) { + int c = 0; + int v = 0; + + fscanf(fp, "%d", &(p->num_contours)); + gpc_malloc(p->hole, p->num_contours * sizeof(int), + (char *)"hole flag array creation"); + gpc_malloc(p->contour, + p->num_contours * sizeof(gpc_vertex_list), + (char *)"contour creation"); + for (c = 0; c < p->num_contours; c++) { + fscanf(fp, "%d", &(p->contour[c].num_vertices)); + + if (read_hole_flags) { + fscanf(fp, "%d", &(p->hole[c])); + } else { + p->hole[c] = 0; // Assume all contours to be external + } + + gpc_malloc(p->contour[c].vertex, + p->contour[c].num_vertices * sizeof(gpc_vertex), + (char *)"vertex creation"); + for (v = 0; v < p->contour[c].num_vertices; v++) { + fscanf(fp, "%lf %lf", &(p->contour[c].vertex[v].x), + &(p->contour[c].vertex[v].y)); + } + } +} + +void gpc_write_polygon(FILE *fp, int write_hole_flags, gpc_polygon *p) { + int c = 0; + int v = 0; + + fprintf(fp, "%d\n", p->num_contours); + for (c = 0; c < p->num_contours; c++) { + fprintf(fp, "%d\n", p->contour[c].num_vertices); + + if (write_hole_flags) { + fprintf(fp, "%d\n", p->hole[c]); + } + + for (v = 0; v < p->contour[c].num_vertices; v++) { + fprintf(fp, "% .*lf % .*lf\n", DBL_DIG, p->contour[c].vertex[v].x, + DBL_DIG, p->contour[c].vertex[v].y); + } + } +} +*/ + +void gpc_add_contour(gpc_polygon *p, gpc_vertex_list *new_contour, int hole) { + int *extended_hole = NULL; + int c = 0; + int v = 0; + gpc_vertex_list *extended_contour = NULL; + + /* Create an extended hole array */ + gpc_malloc(extended_hole, (p->num_contours + 1) * sizeof(int), + const_cast("contour hole addition")); + PADDLE_ENFORCE_NOT_NULL(extended_hole); + + /* Create an extended contour array */ + gpc_malloc(extended_contour, + (p->num_contours + 1) * sizeof(gpc_vertex_list), + const_cast("contour addition")); + + /* Copy the old contour and hole data into the extended arrays */ + for (c = 0; c < p->num_contours; c++) { + extended_hole[c] = p->hole[c]; + extended_contour[c] = p->contour[c]; + } + + /* Copy the new contour and hole onto the end of the extended arrays */ + c = p->num_contours; + extended_hole[c] = hole; + extended_contour[c].num_vertices = new_contour->num_vertices; + gpc_malloc(extended_contour[c].vertex, + new_contour->num_vertices * sizeof(gpc_vertex), + const_cast("contour addition")); + for (v = 0; v < new_contour->num_vertices; v++) { + extended_contour[c].vertex[v] = new_contour->vertex[v]; + } + + /* Dispose of the old contour */ + gpc_free(p->contour); + gpc_free(p->hole); + + /* Update the polygon information */ + p->num_contours++; + p->hole = extended_hole; + p->contour = extended_contour; +} + +// gpc_polygon_clip +void gpc_polygon_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip, + gpc_polygon *result) { + sb_tree *sbtree = NULL; + it_node *it = NULL; + it_node *intersect = NULL; + edge_node *edge = NULL; + edge_node *prev_edge = NULL; + edge_node *next_edge = NULL; + edge_node *succ_edge = NULL; + edge_node *e0 = NULL; + edge_node *e1 = NULL; + edge_node *aet = NULL; + edge_node *c_heap = NULL; + edge_node *s_heap = NULL; + lmt_node *lmt = NULL; + lmt_node *local_min = NULL; + polygon_node *out_poly = NULL; + polygon_node *p = NULL; + polygon_node *q = NULL; + polygon_node *poly = NULL; + polygon_node *npoly = NULL; + polygon_node *cf = NULL; + vertex_node *vtx = NULL; + vertex_node *nv = NULL; + h_state horiz[2]; + int in[2]; + int exists[2]; + int parity[2] = {LEFT, LEFT}; + int c = 0; + int v = 0; + int contributing = 0; + int search = 0; + int scanbeam = 0; + int sbt_entries = 0; + int vclass = 0; + int bl = 0; + int br = 0; + int tl = 0; + int tr = 0; + double *sbt = NULL; + double xb = 0.0; + double px = 0.0; + double yb = 0.0; + double yt = 0.0; + double dy = 0.0; + double ix = 0.0; + double iy = 0.0; + + /* Test for trivial NULL result cases */ + if (((subj->num_contours == 0) && (clip->num_contours == 0)) || + ((subj->num_contours == 0) && ((op == GPC_INT) || (op == GPC_DIFF))) || + ((clip->num_contours == 0) && (op == GPC_INT))) { + result->num_contours = 0; + result->hole = NULL; + result->contour = NULL; + return; + } + /* Identify potentialy contributing contours */ + if (((op == GPC_INT) || (op == GPC_DIFF)) && (subj->num_contours > 0) && + (clip->num_contours > 0)) { + minimax_test(subj, clip, op); + } + /* Build LMT */ + if (subj->num_contours > 0) { + s_heap = build_lmt(&lmt, &sbtree, &sbt_entries, subj, SUBJ, op); + } + if (clip->num_contours > 0) { + c_heap = build_lmt(&lmt, &sbtree, &sbt_entries, clip, CLIP, op); + } + /* Return a NULL result if no contours contribute */ + if (lmt == NULL) { + result->num_contours = 0; + result->hole = NULL; + result->contour = NULL; + reset_lmt(&lmt); + gpc_free(s_heap); + gpc_free(c_heap); + return; + } + + /* Build scanbeam table from scanbeam tree */ + gpc_malloc(sbt, sbt_entries * sizeof(double), + const_cast("sbt creation")); + PADDLE_ENFORCE_NOT_NULL(sbt); + build_sbt(&scanbeam, sbt, sbtree); + scanbeam = 0; + free_sbtree(&sbtree); + /* Allow pointer re-use without causing memory leak */ + if (subj == result) { + gpc_free_polygon(subj); + } + if (clip == result) { + gpc_free_polygon(clip); + } + /* Invert clip polygon for difference operation */ + if (op == GPC_DIFF) { + parity[CLIP] = RIGHT; + } + local_min = lmt; + + // Process each scanbeam + while (scanbeam < sbt_entries) { + /* Set yb and yt to the bottom and top of the scanbeam */ + yb = sbt[scanbeam++]; + if (scanbeam < sbt_entries) { + yt = sbt[scanbeam]; + dy = yt - yb; + } + /* === SCANBEAM BOUNDARY PROCESSING ================================ */ + /* If LMT node corresponding to yb exists */ + if (local_min) { + if (local_min->y == yb) { + /* Add edges starting at this local minimum to the AET */ + for (edge = local_min->first_bound; edge; edge = edge->next_bound) { + add_edge_to_aet(&aet, edge, NULL); + } + local_min = local_min->next; + } + } + /* Set dummy previous x value */ + px = -DBL_MAX; + /* Create bundles within AET */ + e0 = aet; + e1 = aet; + /* Set up bundle fields of first edge */ + aet->bundle[ABOVE][aet->type] = (aet->top.y != yb); + aet->bundle[ABOVE][!aet->type] = 0; + aet->bstate[ABOVE] = UNBUNDLED; + + for (next_edge = aet->next; next_edge; next_edge = next_edge->next) { + /* Set up bundle fields of next edge */ + next_edge->bundle[ABOVE][next_edge->type] = (next_edge->top.y != yb); + next_edge->bundle[ABOVE][!next_edge->type] = 0; + next_edge->bstate[ABOVE] = UNBUNDLED; + /* Bundle edges above the scanbeam boundary if they coincide */ + if (next_edge->bundle[ABOVE][next_edge->type]) { + if (gpc_eq(e0->xb, next_edge->xb) && gpc_eq(e0->dx, next_edge->dx) && + (e0->top.y != yb)) { + next_edge->bundle[ABOVE][next_edge->type] ^= + e0->bundle[ABOVE][next_edge->type]; + next_edge->bundle[ABOVE][!next_edge->type] = + e0->bundle[ABOVE][!next_edge->type]; + next_edge->bstate[ABOVE] = BUNDLE_HEAD; + e0->bundle[ABOVE][CLIP] = 0; + e0->bundle[ABOVE][SUBJ] = 0; + e0->bstate[ABOVE] = BUNDLE_TAIL; + } + e0 = next_edge; + } + } + horiz[CLIP] = NH; + horiz[SUBJ] = NH; + + // Process each edge at this scanbeam boundary + for (edge = aet; edge; edge = edge->next) { + exists[CLIP] = + edge->bundle[ABOVE][CLIP] + (edge->bundle[BELOW][CLIP] << 1); + exists[SUBJ] = + edge->bundle[ABOVE][SUBJ] + (edge->bundle[BELOW][SUBJ] << 1); + if (exists[CLIP] || exists[SUBJ]) { + /* Set bundle side */ + edge->bside[CLIP] = parity[CLIP]; + edge->bside[SUBJ] = parity[SUBJ]; + /* Determine contributing status and quadrant occupancies */ + switch (op) { + case GPC_DIFF: + case GPC_INT: + contributing = (exists[CLIP] && (parity[SUBJ] || horiz[SUBJ])) || + (exists[SUBJ] && (parity[CLIP] || horiz[CLIP])) || + (exists[CLIP] && exists[SUBJ] && + (parity[CLIP] == parity[SUBJ])); + br = (parity[CLIP]) && (parity[SUBJ]); + bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) && + (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]); + tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) && + (parity[SUBJ] ^ (horiz[SUBJ] != NH)); + tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^ + edge->bundle[BELOW][CLIP]) && + (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^ + edge->bundle[BELOW][SUBJ]); + break; + case GPC_XOR: + contributing = exists[CLIP] || exists[SUBJ]; + br = (parity[CLIP]) ^ (parity[SUBJ]); + bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) ^ + (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]); + tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) ^ + (parity[SUBJ] ^ (horiz[SUBJ] != NH)); + tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^ + edge->bundle[BELOW][CLIP]) ^ + (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^ + edge->bundle[BELOW][SUBJ]); + break; + case GPC_UNION: + contributing = (exists[CLIP] && (!parity[SUBJ] || horiz[SUBJ])) || + (exists[SUBJ] && (!parity[CLIP] || horiz[CLIP])) || + (exists[CLIP] && exists[SUBJ] && + (parity[CLIP] == parity[SUBJ])); + br = (parity[CLIP]) || (parity[SUBJ]); + bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) || + (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]); + tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) || + (parity[SUBJ] ^ (horiz[SUBJ] != NH)); + tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^ + edge->bundle[BELOW][CLIP]) || + (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^ + edge->bundle[BELOW][SUBJ]); + break; + } + // Update parity + parity[CLIP] ^= edge->bundle[ABOVE][CLIP]; + parity[SUBJ] ^= edge->bundle[ABOVE][SUBJ]; + /* Update horizontal state */ + if (exists[CLIP]) { + horiz[CLIP] = next_h_state[horiz[CLIP]] + [((exists[CLIP] - 1) << 1) + parity[CLIP]]; + } + if (exists[SUBJ]) { + horiz[SUBJ] = next_h_state[horiz[SUBJ]] + [((exists[SUBJ] - 1) << 1) + parity[SUBJ]]; + } + vclass = tr + (tl << 1) + (br << 2) + (bl << 3); + if (contributing) { + xb = edge->xb; + switch (vclass) { + case EMN: + case IMN: + add_local_min(&out_poly, edge, xb, yb); + px = xb; + cf = edge->outp[ABOVE]; + break; + case ERI: + if (xb != px) { + add_right(cf, xb, yb); + px = xb; + } + edge->outp[ABOVE] = cf; + cf = NULL; + break; + case ELI: + add_left(edge->outp[BELOW], xb, yb); + px = xb; + cf = edge->outp[BELOW]; + break; + case EMX: + if (xb != px) { + add_left(cf, xb, yb); + px = xb; + } + merge_right(cf, edge->outp[BELOW], out_poly); + cf = NULL; + break; + case ILI: + if (xb != px) { + add_left(cf, xb, yb); + px = xb; + } + edge->outp[ABOVE] = cf; + cf = NULL; + break; + case IRI: + add_right(edge->outp[BELOW], xb, yb); + px = xb; + cf = edge->outp[BELOW]; + edge->outp[BELOW] = NULL; + break; + case IMX: + if (xb != px) { + add_right(cf, xb, yb); + px = xb; + } + merge_left(cf, edge->outp[BELOW], out_poly); + cf = NULL; + edge->outp[BELOW] = NULL; + break; + case IMM: + if (xb != px) { + add_right(cf, xb, yb); + px = xb; + } + merge_left(cf, edge->outp[BELOW], out_poly); + edge->outp[BELOW] = NULL; + add_local_min(&out_poly, edge, xb, yb); + cf = edge->outp[ABOVE]; + break; + case EMM: + if (xb != px) { + add_left(cf, xb, yb); + px = xb; + } + merge_right(cf, edge->outp[BELOW], out_poly); + edge->outp[BELOW] = NULL; + add_local_min(&out_poly, edge, xb, yb); + cf = edge->outp[ABOVE]; + break; + case LED: + if (edge->bot.y == yb) { + add_left(edge->outp[BELOW], xb, yb); + } + edge->outp[ABOVE] = edge->outp[BELOW]; + px = xb; + break; + case RED: + if (edge->bot.y == yb) { + add_right(edge->outp[BELOW], xb, yb); + } + edge->outp[ABOVE] = edge->outp[BELOW]; + px = xb; + break; + default: + break; + } /* End of switch */ + } /* End of contributing conditional */ + } /* End of edge exists conditional */ + } // End of AET loop + + /* Delete terminating edges from the AET, otherwise compute xt */ + for (edge = aet; edge; edge = edge->next) { + if (edge->top.y == yb) { + prev_edge = edge->prev; + next_edge = edge->next; + if (prev_edge) { + prev_edge->next = next_edge; + } else { + aet = next_edge; + } + if (next_edge) { + next_edge->prev = prev_edge; + } + /* Copy bundle head state to the adjacent tail edge if required */ + if ((edge->bstate[BELOW] == BUNDLE_HEAD) && prev_edge) { + if (prev_edge->bstate[BELOW] == BUNDLE_TAIL) { + prev_edge->outp[BELOW] = edge->outp[BELOW]; + prev_edge->bstate[BELOW] = UNBUNDLED; + if (prev_edge->prev) { + if (prev_edge->prev->bstate[BELOW] == BUNDLE_TAIL) { + prev_edge->bstate[BELOW] = BUNDLE_HEAD; + } + } + } + } + } else { + if (edge->top.y == yt) { + edge->xt = edge->top.x; + } else { + edge->xt = edge->bot.x + edge->dx * (yt - edge->bot.y); + } + } + } + + if (scanbeam < sbt_entries) { + /* === SCANBEAM INTERIOR PROCESSING ============================== */ + build_intersection_table(&it, aet, dy); + /* Process each node in the intersection table */ + for (intersect = it; intersect; intersect = intersect->next) { + e0 = intersect->ie[0]; + e1 = intersect->ie[1]; + /* Only generate output for contributing intersections */ + if ((e0->bundle[ABOVE][CLIP] || e0->bundle[ABOVE][SUBJ]) && + (e1->bundle[ABOVE][CLIP] || e1->bundle[ABOVE][SUBJ])) { + p = e0->outp[ABOVE]; + q = e1->outp[ABOVE]; + ix = intersect->point.x; + iy = intersect->point.y + yb; + + in[CLIP] = (e0->bundle[ABOVE][CLIP] && !e0->bside[CLIP]) || + (e1->bundle[ABOVE][CLIP] && e1->bside[CLIP]) || + (!e0->bundle[ABOVE][CLIP] && !e1->bundle[ABOVE][CLIP] && + e0->bside[CLIP] && e1->bside[CLIP]); + in[SUBJ] = (e0->bundle[ABOVE][SUBJ] && !e0->bside[SUBJ]) || + (e1->bundle[ABOVE][SUBJ] && e1->bside[SUBJ]) || + (!e0->bundle[ABOVE][SUBJ] && !e1->bundle[ABOVE][SUBJ] && + e0->bside[SUBJ] && e1->bside[SUBJ]); + + // Determine quadrant occupancies + switch (op) { + case GPC_DIFF: + case GPC_INT: + tr = (in[CLIP]) && (in[SUBJ]); + tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) && + (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]); + br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) && + (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]); + bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^ + e0->bundle[ABOVE][CLIP]) && + (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^ + e0->bundle[ABOVE][SUBJ]); + break; + case GPC_XOR: + tr = (in[CLIP]) ^ (in[SUBJ]); + tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) ^ + (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]); + br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) ^ + (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]); + bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^ + e0->bundle[ABOVE][CLIP]) ^ + (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^ + e0->bundle[ABOVE][SUBJ]); + break; + case GPC_UNION: + tr = (in[CLIP]) || (in[SUBJ]); + tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) || + (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]); + br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) || + (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]); + bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^ + e0->bundle[ABOVE][CLIP]) || + (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^ + e0->bundle[ABOVE][SUBJ]); + break; + } + vclass = tr + (tl << 1) + (br << 2) + (bl << 3); + switch (vclass) { + case EMN: + add_local_min(&out_poly, e0, ix, iy); + e1->outp[ABOVE] = e0->outp[ABOVE]; + break; + case ERI: + if (p) { + add_right(p, ix, iy); + e1->outp[ABOVE] = p; + e0->outp[ABOVE] = NULL; + } + break; + case ELI: + if (q) { + add_left(q, ix, iy); + e0->outp[ABOVE] = q; + e1->outp[ABOVE] = NULL; + } + break; + case EMX: + if (p && q) { + add_left(p, ix, iy); + merge_right(p, q, out_poly); + e0->outp[ABOVE] = NULL; + e1->outp[ABOVE] = NULL; + } + break; + case IMN: + add_local_min(&out_poly, e0, ix, iy); + e1->outp[ABOVE] = e0->outp[ABOVE]; + break; + case ILI: + if (p) { + add_left(p, ix, iy); + e1->outp[ABOVE] = p; + e0->outp[ABOVE] = NULL; + } + break; + case IRI: + if (q) { + add_right(q, ix, iy); + e0->outp[ABOVE] = q; + e1->outp[ABOVE] = NULL; + } + break; + case IMX: + if (p && q) { + add_right(p, ix, iy); + merge_left(p, q, out_poly); + e0->outp[ABOVE] = NULL; + e1->outp[ABOVE] = NULL; + } + break; + case IMM: + if (p && q) { + add_right(p, ix, iy); + merge_left(p, q, out_poly); + add_local_min(&out_poly, e0, ix, iy); + e1->outp[ABOVE] = e0->outp[ABOVE]; + } + break; + case EMM: + if (p && q) { + add_left(p, ix, iy); + merge_right(p, q, out_poly); + add_local_min(&out_poly, e0, ix, iy); + e1->outp[ABOVE] = e0->outp[ABOVE]; + } + break; + default: + break; + } // End of switch + } /* End of contributing intersection conditional */ + + /* Swap bundle sides in response to edge crossing */ + if (e0->bundle[ABOVE][CLIP]) { + e1->bside[CLIP] = !e1->bside[CLIP]; + } + if (e1->bundle[ABOVE][CLIP]) { + e0->bside[CLIP] = !e0->bside[CLIP]; + } + if (e0->bundle[ABOVE][SUBJ]) { + e1->bside[SUBJ] = !e1->bside[SUBJ]; + } + if (e1->bundle[ABOVE][SUBJ]) { + e0->bside[SUBJ] = !e0->bside[SUBJ]; + } + + /* Swap e0 and e1 bundles in the AET */ + prev_edge = e0->prev; + next_edge = e1->next; + if (next_edge) { + next_edge->prev = e0; + } + if (e0->bstate[ABOVE] == BUNDLE_HEAD) { + search = 1; + while (search) { + prev_edge = prev_edge->prev; + if (prev_edge) { + if (prev_edge->bstate[ABOVE] != BUNDLE_TAIL) { + search = 0; + } + } else { + search = 0; + } + } + } + if (!prev_edge) { + aet->prev = e1; + e1->next = aet; + aet = e0->next; + } else { + prev_edge->next->prev = e1; + e1->next = prev_edge->next; + prev_edge->next = e0->next; + } + e0->next->prev = prev_edge; + e1->next->prev = e1; + e0->next = next_edge; + } /* End of IT loop*/ + + // Prepare for next scanbeam + for (edge = aet; edge; edge = next_edge) { + next_edge = edge->next; + succ_edge = edge->succ; + if ((edge->top.y == yt) && succ_edge) { + /* Replace AET edge by its successor */ + succ_edge->outp[BELOW] = edge->outp[ABOVE]; + succ_edge->bstate[BELOW] = edge->bstate[ABOVE]; + succ_edge->bundle[BELOW][CLIP] = edge->bundle[ABOVE][CLIP]; + succ_edge->bundle[BELOW][SUBJ] = edge->bundle[ABOVE][SUBJ]; + prev_edge = edge->prev; + if (prev_edge) { + prev_edge->next = succ_edge; + } else { + aet = succ_edge; + } + if (next_edge) { + next_edge->prev = succ_edge; + } + succ_edge->prev = prev_edge; + succ_edge->next = next_edge; + } else { + /* Update this edge */ + edge->outp[BELOW] = edge->outp[ABOVE]; + edge->bstate[BELOW] = edge->bstate[ABOVE]; + edge->bundle[BELOW][CLIP] = edge->bundle[ABOVE][CLIP]; + edge->bundle[BELOW][SUBJ] = edge->bundle[ABOVE][SUBJ]; + edge->xb = edge->xt; + } + edge->outp[ABOVE] = NULL; + } + } + } /* === END OF SCANBEAM PROCESSING ================================== */ + // Generate result polygon from out_poly + result->contour = NULL; + result->hole = NULL; + result->num_contours = count_contours(out_poly); + if (result->num_contours > 0) { + gpc_malloc(result->hole, result->num_contours * sizeof(int), + const_cast("hole flag table creation")); + gpc_malloc(result->contour, + result->num_contours * sizeof(gpc_vertex_list), + const_cast("contour creation")); + + c = 0; + for (poly = out_poly; poly; poly = npoly) { + npoly = poly->next; + if (poly->active) { + result->hole[c] = poly->proxy->hole; + result->contour[c].num_vertices = poly->active; + gpc_malloc( + result->contour[c].vertex, + result->contour[c].num_vertices * sizeof(gpc_vertex), + const_cast("vertex creation")); + + v = result->contour[c].num_vertices - 1; + for (vtx = poly->proxy->v[LEFT]; vtx; vtx = nv) { + nv = vtx->next; + result->contour[c].vertex[v].x = vtx->x; + result->contour[c].vertex[v].y = vtx->y; + gpc_free(vtx); + v--; + } + c++; + } + gpc_free(poly); + } + } else { + for (poly = out_poly; poly; poly = npoly) { + npoly = poly->next; + gpc_free(poly); + } + } + + // Tidy up + reset_it(&it); + reset_lmt(&lmt); + gpc_free(c_heap); + gpc_free(s_heap); + gpc_free(sbt); +} // NOLINT + +void gpc_free_tristrip(gpc_tristrip *t) { + int s = 0; + for (s = 0; s < t->num_strips; s++) { + gpc_free(t->strip[s].vertex); + } + gpc_free(t->strip); + t->num_strips = 0; +} + +void gpc_polygon_to_tristrip(gpc_polygon *s, gpc_tristrip *t) { + gpc_polygon c; + c.num_contours = 0; + c.hole = NULL; + c.contour = NULL; + gpc_tristrip_clip(GPC_DIFF, s, &c, t); +} + +// gpc_tristrip_clip +void gpc_tristrip_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip, + gpc_tristrip *result) { + sb_tree *sbtree = NULL; + it_node *it = NULL; + it_node *intersect = NULL; + edge_node *edge = NULL; + edge_node *prev_edge = NULL; + edge_node *next_edge = NULL; + edge_node *succ_edge = NULL; + edge_node *e0 = NULL; + edge_node *e1 = NULL; + edge_node *aet = NULL; + edge_node *c_heap = NULL; + edge_node *s_heap = NULL; + edge_node *cf = NULL; + lmt_node *lmt = NULL; + lmt_node *local_min = NULL; + polygon_node *tlist = NULL; + polygon_node *tn = NULL; + polygon_node *tnn = NULL; + polygon_node *p = NULL; + polygon_node *q = NULL; + vertex_node *lt = NULL; + vertex_node *ltn = NULL; + vertex_node *rt = NULL; + vertex_node *rtn = NULL; + h_state horiz[2]; + vertex_type cft = NUL; + int in[2]; + int exists[2]; + int parity[2] = {LEFT, LEFT}; + int s = 0; + int v = 0; + int contributing = 0; + int search = 0; + int scanbeam = 0; + int sbt_entries = 0; + int vclass = 0; + int bl = 0; + int br = 0; + int tl = 0; + int tr = 0; + double *sbt = NULL; + double xb = 0.0; + double px = 0.0; + double nx = 0.0; + double yb = 0.0; + double yt = 0.0; + double dy = 0.0; + double ix = 0.0; + double iy = 0.0; + + /* Test for trivial NULL result cases */ + if (((subj->num_contours == 0) && (clip->num_contours == 0)) || + ((subj->num_contours == 0) && ((op == GPC_INT) || (op == GPC_DIFF))) || + ((clip->num_contours == 0) && (op == GPC_INT))) { + result->num_strips = 0; + result->strip = NULL; + return; + } + + /* Identify potentialy contributing contours */ + if (((op == GPC_INT) || (op == GPC_DIFF)) && (subj->num_contours > 0) && + (clip->num_contours > 0)) { + minimax_test(subj, clip, op); + } + /* Build LMT */ + if (subj->num_contours > 0) { + s_heap = build_lmt(&lmt, &sbtree, &sbt_entries, subj, SUBJ, op); + } + if (clip->num_contours > 0) { + c_heap = build_lmt(&lmt, &sbtree, &sbt_entries, clip, CLIP, op); + } + /* Return a NULL result if no contours contribute */ + if (lmt == NULL) { + result->num_strips = 0; + result->strip = NULL; + reset_lmt(&lmt); + gpc_free(s_heap); + gpc_free(c_heap); + return; + } + + /* Build scanbeam table from scanbeam tree */ + gpc_malloc(sbt, sbt_entries * sizeof(double), + const_cast("sbt creation")); + PADDLE_ENFORCE_NOT_NULL(sbt); + build_sbt(&scanbeam, sbt, sbtree); + scanbeam = 0; + free_sbtree(&sbtree); + + /* Invert clip polygon for difference operation */ + if (op == GPC_DIFF) { + parity[CLIP] = RIGHT; + } + local_min = lmt; + + // Process each scanbeam + while (scanbeam < sbt_entries) { + /* Set yb and yt to the bottom and top of the scanbeam */ + yb = sbt[scanbeam++]; + if (scanbeam < sbt_entries) { + yt = sbt[scanbeam]; + dy = yt - yb; + } + + /* === SCANBEAM BOUNDARY PROCESSING ================================ */ + /* If LMT node corresponding to yb exists */ + if (local_min) { + if (local_min->y == yb) { + /* Add edges starting at this local minimum to the AET */ + for (edge = local_min->first_bound; edge; edge = edge->next_bound) { + add_edge_to_aet(&aet, edge, NULL); + } + local_min = local_min->next; + } + } + /* Set dummy previous x value */ + /* Create bundles within AET */ + px = -DBL_MAX; + e0 = aet; + e1 = aet; + + /* Set up bundle fields of first edge */ + aet->bundle[ABOVE][aet->type] = (aet->top.y != yb); + aet->bundle[ABOVE][!aet->type] = 0; + aet->bstate[ABOVE] = UNBUNDLED; + + for (next_edge = aet->next; next_edge; next_edge = next_edge->next) { + /* Set up bundle fields of next edge */ + next_edge->bundle[ABOVE][next_edge->type] = (next_edge->top.y != yb); + next_edge->bundle[ABOVE][!next_edge->type] = 0; + next_edge->bstate[ABOVE] = UNBUNDLED; + + /* Bundle edges above the scanbeam boundary if they coincide */ + if (next_edge->bundle[ABOVE][next_edge->type]) { + if (gpc_eq(e0->xb, next_edge->xb) && gpc_eq(e0->dx, next_edge->dx) && + (e0->top.y != yb)) { + next_edge->bundle[ABOVE][next_edge->type] ^= + e0->bundle[ABOVE][next_edge->type]; + next_edge->bundle[ABOVE][!next_edge->type] = + e0->bundle[ABOVE][!next_edge->type]; + next_edge->bstate[ABOVE] = BUNDLE_HEAD; + e0->bundle[ABOVE][CLIP] = 0; + e0->bundle[ABOVE][SUBJ] = 0; + e0->bstate[ABOVE] = BUNDLE_TAIL; + } + e0 = next_edge; + } + } + horiz[CLIP] = NH; + horiz[SUBJ] = NH; + + /* Process each edge at this scanbeam boundary */ + for (edge = aet; edge; edge = edge->next) { + exists[CLIP] = + edge->bundle[ABOVE][CLIP] + (edge->bundle[BELOW][CLIP] << 1); + exists[SUBJ] = + edge->bundle[ABOVE][SUBJ] + (edge->bundle[BELOW][SUBJ] << 1); + + if (exists[CLIP] || exists[SUBJ]) { + /* Set bundle side */ + edge->bside[CLIP] = parity[CLIP]; + edge->bside[SUBJ] = parity[SUBJ]; + + /* Determine contributing status and quadrant occupancies */ + switch (op) { + case GPC_DIFF: + case GPC_INT: + contributing = (exists[CLIP] && (parity[SUBJ] || horiz[SUBJ])) || + (exists[SUBJ] && (parity[CLIP] || horiz[CLIP])) || + (exists[CLIP] && exists[SUBJ] && + (parity[CLIP] == parity[SUBJ])); + br = (parity[CLIP]) && (parity[SUBJ]); + bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) && + (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]); + tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) && + (parity[SUBJ] ^ (horiz[SUBJ] != NH)); + tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^ + edge->bundle[BELOW][CLIP]) && + (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^ + edge->bundle[BELOW][SUBJ]); + break; + case GPC_XOR: + contributing = exists[CLIP] || exists[SUBJ]; + br = (parity[CLIP]) ^ (parity[SUBJ]); + bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) ^ + (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]); + tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) ^ + (parity[SUBJ] ^ (horiz[SUBJ] != NH)); + tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^ + edge->bundle[BELOW][CLIP]) ^ + (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^ + edge->bundle[BELOW][SUBJ]); + break; + case GPC_UNION: + contributing = (exists[CLIP] && (!parity[SUBJ] || horiz[SUBJ])) || + (exists[SUBJ] && (!parity[CLIP] || horiz[CLIP])) || + (exists[CLIP] && exists[SUBJ] && + (parity[CLIP] == parity[SUBJ])); + br = (parity[CLIP]) || (parity[SUBJ]); + bl = (parity[CLIP] ^ edge->bundle[ABOVE][CLIP]) || + (parity[SUBJ] ^ edge->bundle[ABOVE][SUBJ]); + tr = (parity[CLIP] ^ (horiz[CLIP] != NH)) || + (parity[SUBJ] ^ (horiz[SUBJ] != NH)); + tl = (parity[CLIP] ^ (horiz[CLIP] != NH) ^ + edge->bundle[BELOW][CLIP]) || + (parity[SUBJ] ^ (horiz[SUBJ] != NH) ^ + edge->bundle[BELOW][SUBJ]); + break; + } + + // Update parity + parity[CLIP] ^= edge->bundle[ABOVE][CLIP]; + parity[SUBJ] ^= edge->bundle[ABOVE][SUBJ]; + + /* Update horizontal state */ + if (exists[CLIP]) { + horiz[CLIP] = next_h_state[horiz[CLIP]] + [((exists[CLIP] - 1) << 1) + parity[CLIP]]; + } + if (exists[SUBJ]) { + horiz[SUBJ] = next_h_state[horiz[SUBJ]] + [((exists[SUBJ] - 1) << 1) + parity[SUBJ]]; + } + vclass = tr + (tl << 1) + (br << 2) + (bl << 3); + + if (contributing) { + xb = edge->xb; + switch (vclass) { + case EMN: + new_tristrip(&tlist, edge, xb, yb); + cf = edge; + break; + case ERI: + edge->outp[ABOVE] = cf->outp[ABOVE]; + if (xb != cf->xb) { + gpc_vertex_create(edge, ABOVE, RIGHT, xb, yb); + } + cf = NULL; + break; + case ELI: + gpc_vertex_create(edge, BELOW, LEFT, xb, yb); + edge->outp[ABOVE] = NULL; + cf = edge; + break; + case EMX: + if (xb != cf->xb) { + gpc_vertex_create(edge, BELOW, RIGHT, xb, yb); + } + edge->outp[ABOVE] = NULL; + cf = NULL; + break; + case IMN: + if (cft == LED) { + if (cf->bot.y != yb) { + gpc_vertex_create(cf, BELOW, LEFT, cf->xb, yb); + } + new_tristrip(&tlist, cf, cf->xb, yb); + } + edge->outp[ABOVE] = cf->outp[ABOVE]; + gpc_vertex_create(edge, ABOVE, RIGHT, xb, yb); + break; + case ILI: + new_tristrip(&tlist, edge, xb, yb); + cf = edge; + cft = ILI; + break; + case IRI: + if (cft == LED) { + if (cf->bot.y != yb) { + gpc_vertex_create(cf, BELOW, LEFT, cf->xb, yb); + } + new_tristrip(&tlist, cf, cf->xb, yb); + } + gpc_vertex_create(edge, BELOW, RIGHT, xb, yb); + edge->outp[ABOVE] = NULL; + break; + case IMX: + gpc_vertex_create(edge, BELOW, LEFT, xb, yb); + edge->outp[ABOVE] = NULL; + cft = IMX; + break; + case IMM: + gpc_vertex_create(edge, BELOW, LEFT, xb, yb); + edge->outp[ABOVE] = cf->outp[ABOVE]; + if (xb != cf->xb) { + gpc_vertex_create(cf, ABOVE, RIGHT, xb, yb); + } + cf = edge; + break; + case EMM: + gpc_vertex_create(edge, BELOW, RIGHT, xb, yb); + edge->outp[ABOVE] = NULL; + new_tristrip(&tlist, edge, xb, yb); + cf = edge; + break; + case LED: + if (edge->bot.y == yb) { + gpc_vertex_create(edge, BELOW, LEFT, xb, yb); + } + edge->outp[ABOVE] = edge->outp[BELOW]; + cf = edge; + cft = LED; + break; + case RED: + edge->outp[ABOVE] = cf->outp[ABOVE]; + if (cft == LED) { + if (cf->bot.y == yb) { + gpc_vertex_create(edge, BELOW, RIGHT, xb, yb); + } else { + if (edge->bot.y == yb) { + gpc_vertex_create(cf, BELOW, LEFT, cf->xb, yb); + gpc_vertex_create(edge, BELOW, RIGHT, xb, yb); + } + } + } else { + gpc_vertex_create(edge, BELOW, RIGHT, xb, yb); + gpc_vertex_create(edge, ABOVE, RIGHT, xb, yb); + } + cf = NULL; + break; + default: + break; + } /* End of switch */ + } /* End of contributing conditional */ + } /* End of edge exists conditional */ + } // End of AET loop + + /* Delete terminating edges from the AET, otherwise compute xt */ + for (edge = aet; edge; edge = edge->next) { + if (edge->top.y == yb) { + prev_edge = edge->prev; + next_edge = edge->next; + if (prev_edge) { + prev_edge->next = next_edge; + } else { + aet = next_edge; + } + if (next_edge) { + next_edge->prev = prev_edge; + } + + /* Copy bundle head state to the adjacent tail edge if required */ + if ((edge->bstate[BELOW] == BUNDLE_HEAD) && prev_edge) { + if (prev_edge->bstate[BELOW] == BUNDLE_TAIL) { + prev_edge->outp[BELOW] = edge->outp[BELOW]; + prev_edge->bstate[BELOW] = UNBUNDLED; + if (prev_edge->prev) { + if (prev_edge->prev->bstate[BELOW] == BUNDLE_TAIL) { + prev_edge->bstate[BELOW] = BUNDLE_HEAD; + } + } + } + } + } else { + if (edge->top.y == yt) { + edge->xt = edge->top.x; + } else { + edge->xt = edge->bot.x + edge->dx * (yt - edge->bot.y); + } + } + } + + if (scanbeam < sbt_entries) { + /* === SCANBEAM INTERIOR PROCESSING ============================== */ + build_intersection_table(&it, aet, dy); + /* Process each node in the intersection table */ + for (intersect = it; intersect; intersect = intersect->next) { + e0 = intersect->ie[0]; + e1 = intersect->ie[1]; + + /* Only generate output for contributing intersections */ + if ((e0->bundle[ABOVE][CLIP] || e0->bundle[ABOVE][SUBJ]) && + (e1->bundle[ABOVE][CLIP] || e1->bundle[ABOVE][SUBJ])) { + p = e0->outp[ABOVE]; + q = e1->outp[ABOVE]; + ix = intersect->point.x; + iy = intersect->point.y + yb; + + in[CLIP] = (e0->bundle[ABOVE][CLIP] && !e0->bside[CLIP]) || + (e1->bundle[ABOVE][CLIP] && e1->bside[CLIP]) || + (!e0->bundle[ABOVE][CLIP] && !e1->bundle[ABOVE][CLIP] && + e0->bside[CLIP] && e1->bside[CLIP]); + in[SUBJ] = (e0->bundle[ABOVE][SUBJ] && !e0->bside[SUBJ]) || + (e1->bundle[ABOVE][SUBJ] && e1->bside[SUBJ]) || + (!e0->bundle[ABOVE][SUBJ] && !e1->bundle[ABOVE][SUBJ] && + e0->bside[SUBJ] && e1->bside[SUBJ]); + + switch (op) { // Determine quadrant occupancies + case GPC_DIFF: + case GPC_INT: + tr = (in[CLIP]) && (in[SUBJ]); + tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) && + (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]); + br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) && + (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]); + bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^ + e0->bundle[ABOVE][CLIP]) && + (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^ + e0->bundle[ABOVE][SUBJ]); + break; + case GPC_XOR: + tr = (in[CLIP]) ^ (in[SUBJ]); + tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) ^ + (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]); + br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) ^ + (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]); + bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^ + e0->bundle[ABOVE][CLIP]) ^ + (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^ + e0->bundle[ABOVE][SUBJ]); + break; + case GPC_UNION: + tr = (in[CLIP]) || (in[SUBJ]); + tl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP]) || + (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ]); + br = (in[CLIP] ^ e0->bundle[ABOVE][CLIP]) || + (in[SUBJ] ^ e0->bundle[ABOVE][SUBJ]); + bl = (in[CLIP] ^ e1->bundle[ABOVE][CLIP] ^ + e0->bundle[ABOVE][CLIP]) || + (in[SUBJ] ^ e1->bundle[ABOVE][SUBJ] ^ + e0->bundle[ABOVE][SUBJ]); + break; + } + + vclass = tr + (tl << 1) + (br << 2) + (bl << 3); + switch (vclass) { + case EMN: + new_tristrip(&tlist, e1, ix, iy); + e0->outp[ABOVE] = e1->outp[ABOVE]; + break; + case ERI: + if (p) { + gpc_p_edge(prev_edge, e0, ABOVE); + gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy); + gpc_vertex_create(e0, ABOVE, RIGHT, ix, iy); + e1->outp[ABOVE] = e0->outp[ABOVE]; + e0->outp[ABOVE] = NULL; + } + break; + case ELI: + if (q) { + gpc_n_edge(next_edge, e1, ABOVE); + gpc_vertex_create(e1, ABOVE, LEFT, ix, iy); + gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy); + e0->outp[ABOVE] = e1->outp[ABOVE]; + e1->outp[ABOVE] = NULL; + } + break; + case EMX: + if (p && q) { + gpc_vertex_create(e0, ABOVE, LEFT, ix, iy); + e0->outp[ABOVE] = NULL; + e1->outp[ABOVE] = NULL; + } + break; + case IMN: + gpc_p_edge(prev_edge, e0, ABOVE); + gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy); + gpc_n_edge(next_edge, e1, ABOVE); + gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy); + new_tristrip(&tlist, prev_edge, px, iy); + e1->outp[ABOVE] = prev_edge->outp[ABOVE]; + gpc_vertex_create(e1, ABOVE, RIGHT, ix, iy); + new_tristrip(&tlist, e0, ix, iy); + next_edge->outp[ABOVE] = e0->outp[ABOVE]; + gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy); + break; + case ILI: + if (p) { + gpc_vertex_create(e0, ABOVE, LEFT, ix, iy); + gpc_n_edge(next_edge, e1, ABOVE); + gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy); + e1->outp[ABOVE] = e0->outp[ABOVE]; + e0->outp[ABOVE] = NULL; + } + break; + case IRI: + if (q) { + gpc_vertex_create(e1, ABOVE, RIGHT, ix, iy); + gpc_p_edge(prev_edge, e0, ABOVE); + gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy); + e0->outp[ABOVE] = e1->outp[ABOVE]; + e1->outp[ABOVE] = NULL; + } + break; + case IMX: + if (p && q) { + gpc_vertex_create(e0, ABOVE, RIGHT, ix, iy); + gpc_vertex_create(e1, ABOVE, LEFT, ix, iy); + e0->outp[ABOVE] = NULL; + e1->outp[ABOVE] = NULL; + gpc_p_edge(prev_edge, e0, ABOVE); + gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy); + new_tristrip(&tlist, prev_edge, px, iy); + gpc_n_edge(next_edge, e1, ABOVE); + gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy); + next_edge->outp[ABOVE] = prev_edge->outp[ABOVE]; + gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy); + } + break; + case IMM: + if (p && q) { + gpc_vertex_create(e0, ABOVE, RIGHT, ix, iy); + gpc_vertex_create(e1, ABOVE, LEFT, ix, iy); + gpc_p_edge(prev_edge, e0, ABOVE); + gpc_vertex_create(prev_edge, ABOVE, LEFT, px, iy); + new_tristrip(&tlist, prev_edge, px, iy); + gpc_n_edge(next_edge, e1, ABOVE); + gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy); + e1->outp[ABOVE] = prev_edge->outp[ABOVE]; + gpc_vertex_create(e1, ABOVE, RIGHT, ix, iy); + new_tristrip(&tlist, e0, ix, iy); + next_edge->outp[ABOVE] = e0->outp[ABOVE]; + gpc_vertex_create(next_edge, ABOVE, RIGHT, nx, iy); + } + break; + case EMM: + if (p && q) { + gpc_vertex_create(e0, ABOVE, LEFT, ix, iy); + new_tristrip(&tlist, e1, ix, iy); + e0->outp[ABOVE] = e1->outp[ABOVE]; + } + break; + default: + break; + } /* End of switch */ + } /* End of contributing intersection conditional */ + + // Swap bundle sides in response to edge crossing + if (e0->bundle[ABOVE][CLIP]) { + e1->bside[CLIP] = !e1->bside[CLIP]; + } + if (e1->bundle[ABOVE][CLIP]) { + e0->bside[CLIP] = !e0->bside[CLIP]; + } + if (e0->bundle[ABOVE][SUBJ]) { + e1->bside[SUBJ] = !e1->bside[SUBJ]; + } + if (e1->bundle[ABOVE][SUBJ]) { + e0->bside[SUBJ] = !e0->bside[SUBJ]; + } + + /* Swap e0 and e1 bundles in the AET */ + prev_edge = e0->prev; + next_edge = e1->next; + if (e1->next) { + e1->next->prev = e0; + } + + if (e0->bstate[ABOVE] == BUNDLE_HEAD) { + search = 1; + while (search) { + prev_edge = prev_edge->prev; + if (prev_edge) { + if (prev_edge->bundle[ABOVE][CLIP] || + prev_edge->bundle[ABOVE][SUBJ] || + (prev_edge->bstate[ABOVE] == BUNDLE_HEAD)) { + search = 0; + } + } else { + search = 0; + } + } + } + if (!prev_edge) { + e1->next = aet; + aet = e0->next; + } else { + e1->next = prev_edge->next; + prev_edge->next = e0->next; + } + e0->next->prev = prev_edge; + e1->next->prev = e1; + e0->next = next_edge; + } /* End of IT loop*/ + + /* Prepare for next scanbeam */ + for (edge = aet; edge; edge = next_edge) { + next_edge = edge->next; + succ_edge = edge->succ; + + if ((edge->top.y == yt) && succ_edge) { + /* Replace AET edge by its successor */ + succ_edge->outp[BELOW] = edge->outp[ABOVE]; + succ_edge->bstate[BELOW] = edge->bstate[ABOVE]; + succ_edge->bundle[BELOW][CLIP] = edge->bundle[ABOVE][CLIP]; + succ_edge->bundle[BELOW][SUBJ] = edge->bundle[ABOVE][SUBJ]; + prev_edge = edge->prev; + if (prev_edge) { + prev_edge->next = succ_edge; + } else { + aet = succ_edge; + } + if (next_edge) { + next_edge->prev = succ_edge; + } + succ_edge->prev = prev_edge; + succ_edge->next = next_edge; + } else { + /* Update this edge */ + edge->outp[BELOW] = edge->outp[ABOVE]; + edge->bstate[BELOW] = edge->bstate[ABOVE]; + edge->bundle[BELOW][CLIP] = edge->bundle[ABOVE][CLIP]; + edge->bundle[BELOW][SUBJ] = edge->bundle[ABOVE][SUBJ]; + edge->xb = edge->xt; + } + edge->outp[ABOVE] = NULL; + } + } + } /* === END OF SCANBEAM PROCESSING ================================== */ + + // Generate result tristrip from tlist + result->strip = NULL; + result->num_strips = count_tristrips(tlist); + if (result->num_strips > 0) { + gpc_malloc(result->strip, + result->num_strips * sizeof(gpc_vertex_list), + const_cast("tristrip list creation")); + + s = 0; + for (tn = tlist; tn; tn = tnn) { + tnn = tn->next; + if (tn->active > 2) { + /* Valid tristrip: copy the vertices and free the heap */ + result->strip[s].num_vertices = tn->active; + gpc_malloc(result->strip[s].vertex, + tn->active * sizeof(gpc_vertex), + const_cast("tristrip creation")); + v = 0; + if (0) { + lt = tn->v[RIGHT]; + rt = tn->v[LEFT]; + } else { + lt = tn->v[LEFT]; + rt = tn->v[RIGHT]; + } + while (lt || rt) { + if (lt) { + ltn = lt->next; + result->strip[s].vertex[v].x = lt->x; + result->strip[s].vertex[v].y = lt->y; + v++; + gpc_free(lt); + lt = ltn; + } + if (rt) { + rtn = rt->next; + result->strip[s].vertex[v].x = rt->x; + result->strip[s].vertex[v].y = rt->y; + v++; + gpc_free(rt); + rt = rtn; + } + } + s++; + } else { + /* Invalid tristrip: just free the heap */ + for (lt = tn->v[LEFT]; lt; lt = ltn) { + ltn = lt->next; + gpc_free(lt); + } + for (rt = tn->v[RIGHT]; rt; rt = rtn) { + rtn = rt->next; + gpc_free(rt); + } + } + gpc_free(tn); + } + } + // Tidy up + reset_it(&it); + reset_lmt(&lmt); + gpc_free(c_heap); + gpc_free(s_heap); + gpc_free(sbt); +} // NOLINT + +} // namespace gpc + +/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ diff --git a/paddle/fluid/operators/detection/gpc.h b/paddle/fluid/operators/detection/gpc.h new file mode 100644 index 00000000..ee86262e --- /dev/null +++ b/paddle/fluid/operators/detection/gpc.h @@ -0,0 +1,246 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/*************************************************************************** + * + * Copyright (c) 2015 Baidu.com, Inc. All Rights Reserved + * + **************************************************************************/ + +/** + * @file include/gpc.h + * @author huhan02(com@baidu.com) + * @date 2015/12/18 13:52:10 + * @brief + * + * @modified by sunyipeng + * @email sunyipeng@baidu.com + * @date 2018/6/12 + **/ + +#ifndef PADDLE_FLUID_OPERATORS_DETECTION_GPC_H_ // GPC_H_ +#define PADDLE_FLUID_OPERATORS_DETECTION_GPC_H_ // GPC_H_ + +#include +#include +#include +#include + +namespace gpc { + +typedef enum { // Set operation type + GPC_DIFF, // Difference + GPC_INT, // Intersection + GPC_XOR, // Exclusive or + GPC_UNION // Union +} gpc_op; + +typedef struct { // Polygon vertex structure + double x; // Vertex x component + double y; // vertex y component +} gpc_vertex; + +typedef struct { // Vertex list structure + int num_vertices; // Number of vertices in list + gpc_vertex *vertex; // Vertex array pointer +} gpc_vertex_list; + +typedef struct { // Polygon set structure + int num_contours; // Number of contours in polygon + int *hole; // Hole external contour flags + gpc_vertex_list *contour; // Contour array pointer +} gpc_polygon; + +typedef struct { // Tristrip set structure + int num_strips; // Number of tristrips + gpc_vertex_list *strip; // Tristrip array pointer +} gpc_tristrip; + +typedef enum { LEFT, RIGHT } gpc_left_right; + +typedef enum { ABOVE, BELOW } gpc_above_below; + +typedef enum { CLIP, SUBJ } gpc_clip_subj; + +typedef enum { /* Edge intersection classes */ + NUL, /* Empty non-intersection */ + EMX, /* External maximum */ + ELI, /* External left intermediate */ + TED, /* Top edge */ + ERI, /* External right intermediate */ + RED, /* Right edge */ + IMM, /* Internal maximum and minimum */ + IMN, /* Internal minimum */ + EMN, /* External minimum */ + EMM, /* External maximum and minimum */ + LED, /* Left edge */ + ILI, /* Internal left intermediate */ + BED, /* Bottom edge */ + IRI, /* Internal right intermediate */ + IMX, /* Internal maximum */ + FUL /* Full non-intersection */ +} vertex_type; + +typedef enum { /* Horizontal edge states */ + NH, /* No horizontal edge */ + BH, /* Bottom horizontal edge */ + TH /* Top horizontal edge */ +} h_state; + +typedef enum { /* Edge bundle state */ + UNBUNDLED, /* Isolated edge not within a bundle */ + BUNDLE_HEAD, /* Bundle head node */ + BUNDLE_TAIL /* Passive bundle tail node */ +} bundle_state; + +typedef struct v_shape { /* Internal vertex list datatype */ + double x; /* X coordinate component */ + double y; /* Y coordinate component */ + struct v_shape *next; /* Pointer to next vertex in list */ +} vertex_node; + +typedef struct p_shape { /* Internal contour / tristrip type */ + int active; /* Active flag / vertex count */ + int hole; /* Hole / external contour flag */ + vertex_node *v[2]; /* Left and right vertex list ptrs */ + struct p_shape *next; /* Pointer to next polygon contour */ + struct p_shape *proxy; /* Pointer to actual structure used */ +} polygon_node; + +typedef struct edge_shape { + gpc_vertex vertex; /* Piggy-backed contour vertex data */ + gpc_vertex bot; /* Edge lower (x, y) coordinate */ + gpc_vertex top; /* Edge upper (x, y) coordinate */ + double xb; /* Scanbeam bottom x coordinate */ + double xt; /* Scanbeam top x coordinate */ + double dx; /* Change in x for a unit y increase */ + int type; /* Clip / subject edge flag */ + int bundle[2][2]; /* Bundle edge flags */ + int bside[2]; /* Bundle left / right indicators */ + bundle_state bstate[2]; /* Edge bundle state */ + polygon_node *outp[2]; /* Output polygon / tristrip pointer */ + struct edge_shape *prev; /* Previous edge in the AET */ + struct edge_shape *next; /* Next edge in the AET */ + struct edge_shape *pred; /* Edge connected at the lower end */ + struct edge_shape *succ; /* Edge connected at the upper end */ + struct edge_shape *next_bound; /* Pointer to next bound in LMT */ +} edge_node; + +inline bool gpc_eq(float a, float b) { return (fabs(a - b) <= 1e-6); } + +inline bool gpc_prev_index(float a, float b) { return (fabs(a - b) <= 1e-6); } + +inline int gpc_prev_index(int i, int n) { return ((i - 1 + n) % n); } + +inline int gpc_next_index(int i, int n) { return ((i + 1) % n); } + +inline int gpc_optimal(gpc_vertex *v, int i, int n) { + return (v[(i + 1) % n].y != v[i].y || v[(i - 1 + n) % n].y != v[i].y); +} + +inline int gpc_fwd_min(edge_node *v, int i, int n) { + return (v[(i + 1) % n].vertex.y > v[i].vertex.y && + v[(i - 1 + n) % n].vertex.y >= v[i].vertex.y); +} + +inline int gpc_not_fmax(edge_node *v, int i, int n) { + return (v[(i + 1) % n].vertex.y > v[i].vertex.y); +} + +inline int gpc_rev_min(edge_node *v, int i, int n) { + return (v[(i + 1) % n].vertex.y >= v[i].vertex.y && + v[(i - 1 + n) % n].vertex.y > v[i].vertex.y); +} + +inline int gpc_not_rmax(edge_node *v, int i, int n) { + return (v[(i - 1 + n) % n].vertex.y > v[i].vertex.y); +} + +// inline void gpc_p_edge(edge_node *d, edge_node *e, int p, double i, double j) +// { +inline void gpc_p_edge(edge_node *d, edge_node *e, int p) { + d = e; + do { + d = d->prev; + } while (!d->outp[p]); + // i = d->bot.x + d->dx * (j - d->bot.y); +} + +// inline void gpc_n_edge(edge_node *d, edge_node *e, int p, double i, double j) +// { +inline void gpc_n_edge(edge_node *d, edge_node *e, int p) { + d = e; + do { + d = d->next; + } while (!d->outp[p]); + // i = d->bot.x + d->dx * (j - d->bot.y); +} + +template +void gpc_malloc(T *&p, int b, char *s) { + if (b > 0) { + p = (T *)malloc(b); + + if (!p) { + fprintf(stderr, "gpc malloc failure: %s\n", s); + exit(0); + } + } else { + p = NULL; + } +} +template +void gpc_free(T *&p) { + if (p) { + free(p); + p = NULL; + } +} + +/* +=========================================================================== + Public Function Prototypes +=========================================================================== +*/ + +void add_vertex(vertex_node **t, double x, double y); + +void gpc_vertex_create(edge_node *e, int p, int s, double x, double y); + +/* +void gpc_read_polygon(FILE *infile_ptr, int read_hole_flags, + gpc_polygon *polygon); + +void gpc_write_polygon(FILE *outfile_ptr, int write_hole_flags, + gpc_polygon *polygon); +*/ +void gpc_add_contour(gpc_polygon *polygon, gpc_vertex_list *contour, int hole); + +void gpc_polygon_clip(gpc_op set_operation, gpc_polygon *subject_polygon, + gpc_polygon *clip_polygon, gpc_polygon *result_polygon); + +void gpc_tristrip_clip(gpc_op set_operation, gpc_polygon *subject_polygon, + gpc_polygon *clip_polygon, + gpc_tristrip *result_tristrip); + +void gpc_polygon_to_tristrip(gpc_polygon *polygon, gpc_tristrip *tristrip); + +void gpc_free_polygon(gpc_polygon *polygon); + +void gpc_free_tristrip(gpc_tristrip *tristrip); + +} // namespace gpc + +#endif // PADDLE_FLUID_OPERATORS_DETECTION_GPC_H_ +/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ diff --git a/paddle/fluid/operators/detection/iou_similarity_op.cc b/paddle/fluid/operators/detection/iou_similarity_op.cc new file mode 100644 index 00000000..9c89b7ca --- /dev/null +++ b/paddle/fluid/operators/detection/iou_similarity_op.cc @@ -0,0 +1,97 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/iou_similarity_op.h" + +namespace paddle { +namespace operators { + +class IOUSimilarityOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of IOUSimilarityOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), + "Input(Y) of IOUSimilarityOp should not be null."); + auto x_dims = ctx->GetInputDim("X"); + auto y_dims = ctx->GetInputDim("Y"); + + PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "The rank of Input(X) must be 2."); + PADDLE_ENFORCE_EQ(x_dims[1], 4UL, "The shape of X is [N, 4]"); + PADDLE_ENFORCE_EQ(y_dims.size(), 2UL, "The rank of Input(Y) must be 2."); + PADDLE_ENFORCE_EQ(y_dims[1], 4UL, "The shape of Y is [M, 4]"); + + ctx->ShareLoD("X", /*->*/ "Out"); + ctx->SetOutputDim("Out", framework::make_ddim({x_dims[0], y_dims[0]})); + } +}; + +class IOUSimilarityOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(LoDTensor, default LoDTensor) " + "Box list X is a 2-D LoDTensor with shape [N, 4] holds N boxes, " + "each box is represented as [xmin, ymin, xmax, ymax], " + "the shape of X is [N, 4]. [xmin, ymin] is the left top " + "coordinate of the box if the input is image feature map, they " + "are close to the origin of the coordinate system. " + "[xmax, ymax] is the right bottom coordinate of the box. " + "This tensor can contain LoD information to represent a batch " + "of inputs. One instance of this batch can contain different " + "numbers of entities."); + AddInput("Y", + "(Tensor, default Tensor) " + "Box list Y holds M boxes, each box is represented as " + "[xmin, ymin, xmax, ymax], the shape of X is [N, 4]. " + "[xmin, ymin] is the left top coordinate of the box if the " + "input is image feature map, and [xmax, ymax] is the right " + "bottom coordinate of the box."); + + AddOutput("Out", + "(LoDTensor, the lod is same as input X) The output of " + "iou_similarity op, a tensor with shape [N, M] " + "representing pairwise iou scores."); + + AddComment(R"DOC( +**IOU Similarity Operator** + +Computes intersection-over-union (IOU) between two box lists. +Box list 'X' should be a LoDTensor and 'Y' is a common Tensor, +boxes in 'Y' are shared by all instance of the batched inputs of X. +Given two boxes A and B, the calculation of IOU is as follows: + +$$ +IOU(A, B) = +\\frac{area(A\\cap B)}{area(A)+area(B)-area(A\\cap B)} +$$ + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(iou_similarity, ops::IOUSimilarityOp, + ops::IOUSimilarityOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL( + iou_similarity, + ops::IOUSimilarityKernel, + ops::IOUSimilarityKernel); diff --git a/paddle/fluid/operators/detection/iou_similarity_op.cu b/paddle/fluid/operators/detection/iou_similarity_op.cu new file mode 100644 index 00000000..8342b413 --- /dev/null +++ b/paddle/fluid/operators/detection/iou_similarity_op.cu @@ -0,0 +1,21 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/iou_similarity_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + iou_similarity, + ops::IOUSimilarityKernel, + ops::IOUSimilarityKernel); diff --git a/paddle/fluid/operators/detection/iou_similarity_op.h b/paddle/fluid/operators/detection/iou_similarity_op.h new file mode 100644 index 00000000..9f193ebc --- /dev/null +++ b/paddle/fluid/operators/detection/iou_similarity_op.h @@ -0,0 +1,92 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/for_range.h" + +template +inline HOSTDEVICE T IOUSimilarity(T xmin1, T ymin1, T xmax1, T ymax1, T xmin2, + T ymin2, T xmax2, T ymax2) { + constexpr T zero = static_cast(0); + T area1 = (ymax1 - ymin1) * (xmax1 - xmin1); + T area2 = (ymax2 - ymin2) * (xmax2 - xmin2); + T inter_xmax = xmax1 > xmax2 ? xmax2 : xmax1; + T inter_ymax = ymax1 > ymax2 ? ymax2 : ymax1; + T inter_xmin = xmin1 > xmin2 ? xmin1 : xmin2; + T inter_ymin = ymin1 > ymin2 ? ymin1 : ymin2; + T inter_height = inter_ymax - inter_ymin; + T inter_width = inter_xmax - inter_xmin; + inter_height = inter_height > zero ? inter_height : zero; + inter_width = inter_width > zero ? inter_width : zero; + T inter_area = inter_width * inter_height; + T union_area = area1 + area2 - inter_area; + T sim_score = inter_area / union_area; + return sim_score; +} + +template +struct IOUSimilarityFunctor { + IOUSimilarityFunctor(const T* x, const T* y, T* z, int cols) + : x_(x), y_(y), z_(z), cols_(static_cast(cols)) {} + + inline HOSTDEVICE void operator()(size_t tid) const { + size_t row_id = tid / cols_; + size_t col_id = tid % cols_; + + T x_min1 = x_[row_id * 4]; + T y_min1 = x_[row_id * 4 + 1]; + T x_max1 = x_[row_id * 4 + 2]; + T y_max1 = x_[row_id * 4 + 3]; + + T x_min2 = y_[col_id * 4]; + T y_min2 = y_[col_id * 4 + 1]; + T x_max2 = y_[col_id * 4 + 2]; + T y_max2 = y_[col_id * 4 + 3]; + + T sim = IOUSimilarity(x_min1, y_min1, x_max1, y_max1, x_min2, y_min2, + x_max2, y_max2); + + z_[row_id * cols_ + col_id] = sim; + } + const T* x_; + const T* y_; + T* z_; + const size_t cols_; +}; + +namespace paddle { +namespace operators { + +template +class IOUSimilarityKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const framework::LoDTensor* in_x = ctx.Input("X"); + const framework::Tensor* in_y = ctx.Input("Y"); + framework::LoDTensor* out = ctx.Output("Out"); + + int x_n = in_x->dims()[0]; + int y_n = in_y->dims()[0]; + IOUSimilarityFunctor functor(in_x->data(), in_y->data(), + out->mutable_data(ctx.GetPlace()), y_n); + + platform::ForRange for_range( + static_cast(ctx.device_context()), x_n * y_n); + for_range(functor); + } +}; // namespace operators + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/mask_util.cc b/paddle/fluid/operators/detection/mask_util.cc new file mode 100644 index 00000000..bd6fee71 --- /dev/null +++ b/paddle/fluid/operators/detection/mask_util.cc @@ -0,0 +1,229 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/mask_util.h" +#include +#include +#include +#include +#include +#include "paddle/fluid/memory/memory.h" + +namespace paddle { +namespace operators { + +uint32_t UMax(uint32_t a, uint32_t b) { return (a > b) ? a : b; } + +static inline int Compare(const void* a, const void* b) { + uint32_t c = *(reinterpret_cast(a)); + uint32_t d = *(reinterpret_cast(b)); + return c > d ? 1 : c < d ? -1 : 0; +} + +void Decode(const uint32_t* cnts, int m, uint8_t* mask) { + uint8_t v = 0; + for (int j = 0; j < m; j++) { + for (uint32_t k = 0; k < cnts[j]; k++) { + *(mask++) = v; + } + v = !v; + } +} + +typedef uint32_t uint; +void Poly2Mask(const float* xy, int k, int h, int w, uint8_t* mask) { + int j, m = 0; + double scale = 5; + int *x, *y, *u, *v; + uint *a, *b; + platform::CPUPlace cpu; + auto xptr = memory::Alloc(cpu, sizeof(int) * (k + 1) * 2); + x = reinterpret_cast(xptr->ptr()); + y = x + (k + 1); + + for (j = 0; j < k; j++) x[j] = static_cast(scale * xy[j * 2 + 0] + .5); + x[k] = x[0]; + for (j = 0; j < k; j++) y[j] = static_cast(scale * xy[j * 2 + 1] + .5); + y[k] = y[0]; + for (j = 0; j < k; j++) { + m += UMax(abs(x[j] - x[j + 1]), abs(y[j] - y[j + 1])) + 1; + } + auto vptr = memory::Alloc(cpu, sizeof(int) * m * 2); + u = reinterpret_cast(vptr->ptr()); + v = u + m; + m = 0; + for (j = 0; j < k; j++) { + int xs = x[j], xe = x[j + 1], ys = y[j], ye = y[j + 1], dx, dy, t, d; + int flip; + double s; + dx = abs(xe - xs); + dy = abs(ys - ye); + flip = (dx >= dy && xs > xe) || (dx < dy && ys > ye); + if (flip) { + t = xs; + xs = xe; + xe = t; + t = ys; + ys = ye; + ye = t; + } + if (dx >= dy) { + s = dx == 0 ? 0 : static_cast(ye - ys) / dx; + for (d = 0; d <= dx; d++) { + t = flip ? dx - d : d; + u[m] = t + xs; + v[m] = static_cast(ys + s * t + .5); + m++; + } + } else { + s = dy == 0 ? 0 : static_cast(xe - xs) / dy; + for (d = 0; d <= dy; d++) { + t = flip ? dy - d : d; + v[m] = t + ys; + u[m] = static_cast(xs + s * t + .5); + m++; + } + } + } + /* get points along y-boundary and downsample */ + k = m; + m = 0; + double xd, yd; + auto xyptr = memory::Alloc(cpu, sizeof(int) * k * 2); + x = reinterpret_cast(xyptr->ptr()); + y = x + k; + for (j = 1; j < k; j++) { + if (u[j] != u[j - 1]) { + xd = static_cast(u[j] < u[j - 1] ? u[j] : u[j] - 1); + xd = (xd + .5) / scale - .5; + if (floor(xd) != xd || xd < 0 || xd > w - 1) continue; + yd = static_cast(v[j] < v[j - 1] ? v[j] : v[j - 1]); + yd = (yd + .5) / scale - .5; + if (yd < 0) + yd = 0; + else if (yd > h) + yd = h; + yd = ceil(yd); + x[m] = static_cast(xd); + y[m] = static_cast(yd); + m++; + } + } + /* compute rle encoding given y-boundary points */ + k = m; + auto aptr = memory::Alloc(cpu, sizeof(uint) * (k + 1)); + a = reinterpret_cast(aptr->ptr()); + for (j = 0; j < k; j++) a[j] = static_cast(x[j] * h + y[j]); + a[k++] = static_cast(h * w); + + qsort(a, k, sizeof(uint), Compare); + uint p = 0; + for (j = 0; j < k; j++) { + uint t = a[j]; + a[j] -= p; + p = t; + } + auto bptr = memory::Alloc(cpu, sizeof(uint32_t) * k); + b = reinterpret_cast(bptr->ptr()); + j = m = 0; + b[m++] = a[j++]; + while (j < k) { + if (a[j] > 0) { + b[m++] = a[j++]; + } else { + j++; + if (j < k) b[m - 1] += a[j++]; + } + } + + // convert to mask + auto mskptr = memory::Alloc(cpu, sizeof(uint8_t) * h * w); + uint8_t* msk = reinterpret_cast(mskptr->ptr()); + Decode(b, m, msk); + + for (int ii = 0; ii < h; ++ii) { + for (int jj = 0; jj < w; ++jj) { + mask[ii * w + jj] = msk[jj * h + ii]; + } + } +} + +void Poly2Boxes(const std::vector>>& polys, + float* boxes) { + // lists + for (size_t i = 0; i < polys.size(); ++i) { + float x0 = std::numeric_limits::max(); + float x1 = std::numeric_limits::min(); + float y0 = std::numeric_limits::max(); + float y1 = std::numeric_limits::min(); + // each list may have more than one polys + for (size_t j = 0; j < polys[i].size(); ++j) { + for (size_t k = 0; k < polys[i][j].size() / 2; ++k) { + x0 = std::min(x0, polys[i][j][2 * k]); + x1 = std::max(x1, polys[i][j][2 * k]); + y0 = std::min(y0, polys[i][j][2 * k + 1]); + y1 = std::max(y1, polys[i][j][2 * k + 1]); + } + } + boxes[i * 4] = x0; + boxes[i * 4 + 1] = y0; + boxes[i * 4 + 2] = x1; + boxes[i * 4 + 3] = y1; + } +} + +void Polys2MaskWrtBox(const std::vector>& polygons, + const float* box, int M, uint8_t* mask) { + float w = box[2] - box[0]; + float h = box[3] - box[1]; + w = std::max(w, static_cast(1.)); + h = std::max(h, static_cast(1.)); + + uint8_t* msk = nullptr; + if (polygons.size() == 1UL) { + msk = mask; + } else { + msk = reinterpret_cast( + malloc(M * M * polygons.size() * sizeof(uint8_t))); + } + for (size_t i = 0; i < polygons.size(); ++i) { + int k = polygons[i].size() / 2; + std::vector p; + for (int j = 0; j < k; ++j) { + float pw = (polygons[i][2 * j] - box[0]) * M / w; + float ph = (polygons[i][2 * j + 1] - box[1]) * M / h; + p.push_back(pw); + p.push_back(ph); + } + uint8_t* msk_i = msk + i * M * M; + Poly2Mask(p.data(), k, M, M, msk_i); + } + + if (polygons.size() > 1UL) { + for (size_t i = 0; i < polygons.size(); ++i) { + uint8_t* msk_i = msk + i * M * M; + for (int j = 0; j < M * M; ++j) { + if (i == 0) { + mask[j] = msk_i[j]; + } else { + mask[j] = (mask[j] + msk_i[j]) > 0 ? 1 : 0; + } + } + } + free(msk); + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/mask_util.h b/paddle/fluid/operators/detection/mask_util.h new file mode 100644 index 00000000..4e0ea54f --- /dev/null +++ b/paddle/fluid/operators/detection/mask_util.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include + +namespace paddle { +namespace operators { + +void Poly2Mask(const float* ploy, int k, int h, int w, uint8_t* mask); + +void Poly2Boxes(const std::vector>>& polys, + float* boxes); + +void Polys2MaskWrtBox(const std::vector>& polygons, + const float* box, int M, uint8_t* mask); +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/mask_util_test.cc b/paddle/fluid/operators/detection/mask_util_test.cc new file mode 100644 index 00000000..de904e94 --- /dev/null +++ b/paddle/fluid/operators/detection/mask_util_test.cc @@ -0,0 +1,115 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/mask_util.h" +#include +#include "paddle/fluid/memory/memory.h" + +namespace paddle { +namespace operators { + +template +void Compare(const T* a, const T* b, const int n) { + for (int i = 0; i < n; i++) { + EXPECT_EQ(a[i], b[i]); + } +} + +TEST(MaskUtil, Poly2MaskTest) { + float polys[] = {1.97f, 1.88f, 5.81f, 1.88f, 1.69f, + 6.53f, 5.94f, 6.38f, 1.97f, 1.88f}; + int h = 8, w = 8; + int k = 5; // length(polys) / 2 + // clang-format off + uint8_t expect_mask[] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 1, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 0, 0, 0, 0, + 0, 0, 1, 1, 1, 0, 0, 0, + 0, 0, 1, 1, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0 + }; + // clang-format on + + // the groud-truth mask is computed by coco API: + // + // import pycocotools.mask as mask_util + // import numpy as np + // segm = [1.97, 1.88, 5.81, 1.88, 1.69, 6.53, 5.94, 6.38, 1.97, 1.88] + // rles = mask_util.frPyObjects([segm], im_h, im_w) + // mask = mask_util.decode(rles) + // print mask + platform::CPUPlace cpu; + auto allocation = memory::Alloc(cpu, sizeof(expect_mask)); + uint8_t* mask = reinterpret_cast(allocation->ptr()); + Poly2Mask(polys, k, h, w, mask); + Compare(expect_mask, mask, h * w); +} + +TEST(MaskUtil, Poly2BoxesTest) { + // clang-format off + std::vector>> polys = { + {{1.97f, 1.88f, 5.81f, 1.88f, 1.69f, 6.53f, 5.94f, 6.38f, 1.97f, 1.88f}}, + {{2.97f, 1.88f, 3.81f, 1.68f, 1.69f, 6.63f, 6.94f, 6.58f, 2.97f, 0.88f}} + }; + float expect_boxes[] = { + 1.69f, 1.88f, 5.94f, 6.53f, + 1.69f, 0.88f, 6.94f, 6.63f + }; + // clang-format on + + platform::CPUPlace cpu; + auto allocation = memory::Alloc(cpu, sizeof(expect_boxes)); + float* boxes = reinterpret_cast(allocation->ptr()); + Poly2Boxes(polys, boxes); + Compare(expect_boxes, boxes, 8); +} + +TEST(MaskUtil, Polys2MaskWrtBoxTest) { + // clang-format off + std::vector>> polys = {{ + {1.97f, 1.88f, 5.81f, 1.88f, 1.69f, 6.53f, 5.94f, 6.38f, 1.97f, 1.88f}, + {2.97f, 1.88f, 3.81f, 1.68f, 1.69f, 6.63f, 6.94f, 6.58f, 2.97f, 0.88f}}}; + float expect_boxes[] = { + 1.69f, 0.88f, 6.94f, 6.63f + }; + uint8_t expect_mask[] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 0, 0, + 0, 0, 1, 1, 1, 0, 0, 0, + 0, 0, 1, 1, 1, 0, 0, 0, + 0, 0, 1, 1, 1, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 0, + 1, 1, 1, 1, 1, 1, 1, 1 + }; + // clang-format on + + platform::CPUPlace cpu; + auto allocation = memory::Alloc(cpu, sizeof(expect_boxes)); + float* boxes = reinterpret_cast(allocation->ptr()); + Poly2Boxes(polys, boxes); + Compare(expect_boxes, boxes, 4); + + auto allocat_mask = memory::Alloc(cpu, sizeof(expect_mask)); + uint8_t* mask = reinterpret_cast(allocat_mask->ptr()); + int M = 8; + Polys2MaskWrtBox(polys[0], expect_boxes, M, mask); + Compare(expect_mask, mask, M * M); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/mine_hard_examples_op.cc b/paddle/fluid/operators/detection/mine_hard_examples_op.cc new file mode 100644 index 00000000..c68fe243 --- /dev/null +++ b/paddle/fluid/operators/detection/mine_hard_examples_op.cc @@ -0,0 +1,341 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +enum MiningType { kNone = 0, kMaxNegative, kHardExample }; + +template +bool SortScoreDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} + +inline bool IsEligibleMining(const MiningType mining_type, const int match_idx, + const float match_dist, + const float neg_dist_threshold) { + if (mining_type == MiningType::kMaxNegative) { + return match_idx == -1 && match_dist < neg_dist_threshold; + } else if (mining_type == MiningType::kHardExample) { + return true; + } else { + return false; + } +} + +inline MiningType GetMiningType(std::string str) { + if (str == "max_negative") { + return MiningType::kMaxNegative; + } else if (str == "hard_example") { + return MiningType::kHardExample; + } else { + return MiningType::kNone; + } +} + +template +class MineHardExamplesKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in_cls_loss = ctx.Input("ClsLoss"); + auto* in_loc_loss = ctx.Input("LocLoss"); + auto* in_matched_indices = ctx.Input("MatchIndices"); + auto* in_match_dist = ctx.Input("MatchDist"); + float neg_pos_ratio = ctx.Attr("neg_pos_ratio"); + T neg_dist_threshold = + static_cast(ctx.Attr("neg_dist_threshold")); + int sample_size = ctx.Attr("sample_size"); + MiningType mining_type = + GetMiningType(ctx.Attr("mining_type")); + + auto out_neg_indices = ctx.Output("NegIndices"); + auto out_match_indices = + ctx.Output("UpdatedMatchIndices"); + + framework::TensorCopy(*in_matched_indices, ctx.GetPlace(), + out_match_indices); + + int batch_size = in_matched_indices->dims()[0]; + int prior_num = in_matched_indices->dims()[1]; + + auto match_indices = framework::EigenMatrix::From(*in_matched_indices); + + auto match_indices_et = + framework::EigenMatrix::From(*out_match_indices); + + auto match_dist = framework::EigenMatrix::From(*in_match_dist); + + const T* cls_loss = in_cls_loss->data(); + const T* loc_loss = nullptr; + if (in_loc_loss) { + loc_loss = in_loc_loss->data(); + } + + std::vector> all_neg_indices; + std::vector batch_starts = {0}; + for (int n = 0; n < batch_size; ++n) { + std::vector> loss_idx; + int neg_sel = 0; + for (int m = 0; m < prior_num; ++m) { + if (IsEligibleMining(mining_type, match_indices(n, m), match_dist(n, m), + neg_dist_threshold)) { + T loss = cls_loss[n * prior_num + m]; + if (mining_type == MiningType::kHardExample && loc_loss != nullptr) { + loss = cls_loss[n * prior_num + m] + loc_loss[n * prior_num + m]; + } + loss_idx.push_back(std::make_pair(loss, m)); + ++neg_sel; + } + } + + if (mining_type == MiningType::kMaxNegative) { + int num_pos = 0; + for (int m = 0; m < prior_num; ++m) { + if (match_indices(n, m) != -1) ++num_pos; + } + neg_sel = std::min(static_cast(num_pos * neg_pos_ratio), neg_sel); + } else if (mining_type == MiningType::kHardExample) { + neg_sel = std::min(sample_size, neg_sel); + } + + std::sort(loss_idx.begin(), loss_idx.end(), SortScoreDescend); + std::set sel_indices; + std::vector neg_indices; + std::transform(loss_idx.begin(), loss_idx.begin() + neg_sel, + std::inserter(sel_indices, sel_indices.begin()), + [](std::pair& l) -> int { + return static_cast(l.second); + }); + + if (mining_type == MiningType::kHardExample) { + for (int m = 0; m < prior_num; ++m) { + if (match_indices(n, m) > -1) { + if (sel_indices.find(m) == sel_indices.end()) { + match_indices_et(n, m) = -1; + } + } else { + if (sel_indices.find(m) != sel_indices.end()) { + neg_indices.push_back(m); + } + } + } + } else { + neg_indices.resize(sel_indices.size()); + std::copy(sel_indices.begin(), sel_indices.end(), neg_indices.begin()); + } + + all_neg_indices.push_back(neg_indices); + batch_starts.push_back(batch_starts.back() + neg_indices.size()); + } + + framework::LoD out_neg_indices_lod; + out_neg_indices_lod.emplace_back(batch_starts); + int neg_offset = 0; + auto neg_data = out_neg_indices->mutable_data( + framework::make_ddim({static_cast(batch_starts.back()), 1}), + ctx.GetPlace()); + + for (auto neg_indices : all_neg_indices) { + std::copy(neg_indices.begin(), neg_indices.end(), neg_data + neg_offset); + neg_offset += neg_indices.size(); + } + out_neg_indices->set_lod(out_neg_indices_lod); + return; + } +}; + +class MineHardExamplesOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("ClsLoss"), + "Input(ClsLoss) of MineHardExamplesOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("MatchIndices"), + "Input(MatchIndices) of MineHardExamplesOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("MatchDist"), + "Input(MatchDist) of MineHardExamplesOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("NegIndices"), + "Output(NegIndices) of MineHardExamplesOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("UpdatedMatchIndices"), + "Output(UpdatedMatchIndices) of MineHardExamplesOp should " + "not be null."); + + auto cls_loss_dims = ctx->GetInputDim("ClsLoss"); + auto idx_dims = ctx->GetInputDim("MatchIndices"); + auto dis_dims = ctx->GetInputDim("MatchDist"); + + PADDLE_ENFORCE_EQ(cls_loss_dims.size(), 2UL, + "The shape of ClsLoss is [N, Np]."); + PADDLE_ENFORCE_EQ(idx_dims.size(), 2UL, + "The shape of MatchIndices is [N, Np]."); + PADDLE_ENFORCE_EQ(dis_dims.size(), 2UL, + "The shape of MatchDist is [N, Np]."); + + if (ctx->HasInput("LocLoss")) { + auto loc_loss_dims = ctx->GetInputDim("LocLoss"); + PADDLE_ENFORCE_EQ(loc_loss_dims.size(), 2UL, + "The shape of LocLoss is [N, Np]."); + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ( + cls_loss_dims[0], loc_loss_dims[0], + "Batch size of ClsLoss and LocLoss must be the same."); + PADDLE_ENFORCE_EQ( + cls_loss_dims[1], loc_loss_dims[1], + "Prior box number of ClsLoss and LocLoss must be the same."); + } + } + + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ( + cls_loss_dims[0], idx_dims[0], + "Batch size of ClsLoss and MatchIndices must be the same."); + PADDLE_ENFORCE_EQ( + cls_loss_dims[1], idx_dims[1], + "Prior box number of ClsLoss and MatchIndices must be the same."); + + PADDLE_ENFORCE_EQ( + cls_loss_dims[0], dis_dims[0], + "Batch size of ClsLoss and MatchDist must be the same."); + PADDLE_ENFORCE_EQ( + cls_loss_dims[1], idx_dims[1], + "Prior box number of ClsLoss and MatchDist must be the same."); + } + + auto mining_type = + GetMiningType(ctx->Attrs().Get("mining_type")); + + PADDLE_ENFORCE_NE(mining_type, MiningType::kNone, + "mining_type must be hard_example or max_negative"); + + if (mining_type == MiningType::kMaxNegative) { + auto neg_pos_ratio = ctx->Attrs().Get("neg_pos_ratio"); + auto neg_dist_threshold = ctx->Attrs().Get("neg_dist_threshold"); + PADDLE_ENFORCE_GT( + neg_pos_ratio, 0.0f, + "neg_pos_ratio must greater than zero in max_negative mode"); + PADDLE_ENFORCE_LT( + neg_dist_threshold, 1.0f, + "neg_dist_threshold must less than one in max_negative mode"); + PADDLE_ENFORCE_GT( + neg_dist_threshold, 0.0f, + "neg_dist_threshold must greater than zero in max_negative mode"); + } else if (mining_type == MiningType::kHardExample) { + auto sample_size = ctx->Attrs().Get("sample_size"); + PADDLE_ENFORCE_GT( + sample_size, 0, + "sample_size must greater than zero in hard_example mode"); + } + + ctx->SetOutputDim("UpdatedMatchIndices", idx_dims); + // The first dimension of NegIndices will be set correcttly in Compute. + ctx->SetOutputDim("NegIndices", {-1, 1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + ctx.Input("ClsLoss")->type(), platform::CPUPlace()); + } +}; + +class MineHardExamplesOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "ClsLoss", + "(Tensor, default Tensor), The classification loss with shape " + "[N, Np], N is the batch size and Np is the number of prior box."); + AddInput("LocLoss", + "(Tensor, optional, default Tensor), The localization loss " + "with shape [N, Np], N is the batch size and Np is the number of " + "prior box.") + .AsDispensable(); + AddInput("MatchIndices", + "(Tensor, Tensor), Matched indices with shape [N, Np], N is " + "the batch size and Np is the number of prior box. " + "MatchIndices[i][j] equal -1 means the j-th prior box in i-th " + "instance does not match any entity, otherwise means it is " + "matched to row."); + AddInput("MatchDist", + "(Tensor, default Tensor) Matched indices with shape [N, " + "Np], N is the batch size and Np is the number of prior box."); + AddAttr("neg_pos_ratio", + "(float) The ratio of the negative box to the positive " + "box. Use only when mining_type is max_negative.") + .SetDefault(1.0); + AddAttr("neg_dist_threshold", + "(float) The negative overlap upper bound for the unmatched " + "predictions. Use only when mining_type is max_negative.") + .SetDefault(0.5); + AddAttr("sample_size", + "(float) The max sample size of negative box. Use only when " + "mining_type is hard_example.") + .SetDefault(0); + AddAttr("mining_type", + "(float) The mining algorithm name, the value is " + "hard_example or max_negative.") + .SetDefault("max_negative") + .InEnum({"hard_example", "max_negative"}); + + AddOutput( + "NegIndices", + "(LoDTensor) The output of negative example indices. a LoDTensor " + "with shape [Neg, 1]. The size of lod[0] minus 1 is batch size, " + "and each element is the prior box index. " + "For example, the batch size is 2, the lod is [[0, 1, 2]], " + "the sample 0's box 1(MatchIndices[0][1]) is selected, " + "and sample 1's box 0 is selected. The output NegIndices is " + "[[1], [0]]."); + + AddOutput("UpdatedMatchIndices", + "(Tensor) The output of updated MatchIndices, a tensor with " + "shape [N, Np]. Only update when mining_type is " + "hard_example. The input MatchIndices elements will be update to " + "-1 when it is not in the candidate high loss list of negative " + "examples."); + + AddComment(R"DOC( +Mine hard examples Operator. +This operator implements hard example mining to select a subset of negative box indices. +For each image, selects the box with highest losses. subject to the condition that the +box cannot have an Matcht > neg_dist_threshold when mining_type is max_negative. +The selected number is min(sample_size, max_negative_box_number) when mining_type is +hard_example, or min(neg_pos_ratio * positive_box_number, max_negative_box_number) +when mining_type is max_negative, where the max_negative_box_number is the count of +MatchIndices elements with value -1. +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(mine_hard_examples, ops::MineHardExamplesOp, + ops::MineHardExamplesOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL( + mine_hard_examples, + ops::MineHardExamplesKernel, + ops::MineHardExamplesKernel); diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc new file mode 100644 index 00000000..8abc8b89 --- /dev/null +++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc @@ -0,0 +1,537 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detection/poly_util.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +class MultiClassNMSOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("BBoxes"), + "Input(BBoxes) of MultiClassNMS should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Scores"), + "Input(Scores) of MultiClassNMS should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of MultiClassNMS should not be null."); + + auto box_dims = ctx->GetInputDim("BBoxes"); + auto score_dims = ctx->GetInputDim("Scores"); + auto score_size = score_dims.size(); + + if (ctx->IsRuntime()) { + PADDLE_ENFORCE(score_size == 2 || score_size == 3, + "The rank of Input(Scores) must be 2 or 3"); + PADDLE_ENFORCE_EQ(box_dims.size(), 3, + "The rank of Input(BBoxes) must be 3"); + if (score_size == 3) { + PADDLE_ENFORCE(box_dims[2] == 4 || box_dims[2] == 8 || + box_dims[2] == 16 || box_dims[2] == 24 || + box_dims[2] == 32, + "The last dimension of Input(BBoxes) must be 4 or 8, " + "represents the layout of coordinate " + "[xmin, ymin, xmax, ymax] or " + "4 points: [x1, y1, x2, y2, x3, y3, x4, y4] or " + "8 points: [xi, yi] i= 1,2,...,8 or " + "12 points: [xi, yi] i= 1,2,...,12 or " + "16 points: [xi, yi] i= 1,2,...,16"); + PADDLE_ENFORCE_EQ( + box_dims[1], score_dims[2], + "The 2nd dimension of Input(BBoxes) must be equal to " + "last dimension of Input(Scores), which represents the " + "predicted bboxes."); + } else { + PADDLE_ENFORCE(box_dims[2] == 4, + "The last dimension of Input(BBoxes) must be 4"); + PADDLE_ENFORCE_EQ(box_dims[1], score_dims[1], + "The 2nd dimension of Input(BBoxes)" + "must be equal to the 2nd dimension" + " of Input(Scores)"); + } + } + // Here the box_dims[0] is not the real dimension of output. + // It will be rewritten in the computing kernel. + if (score_size == 3) { + ctx->SetOutputDim("Out", {box_dims[1], box_dims[2] + 2}); + } else { + ctx->SetOutputDim("Out", {-1, box_dims[2] + 2}); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + ctx.Input("Scores")->type(), + platform::CPUPlace()); + } +}; + +template +bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} + +template +static inline void GetMaxScoreIndex( + const std::vector& scores, const T threshold, int top_k, + std::vector>* sorted_indices) { + for (size_t i = 0; i < scores.size(); ++i) { + if (scores[i] > threshold) { + sorted_indices->push_back(std::make_pair(scores[i], i)); + } + } + // Sort the score pair according to the scores in descending order + std::stable_sort(sorted_indices->begin(), sorted_indices->end(), + SortScorePairDescend); + // Keep top_k scores if needed. + if (top_k > -1 && top_k < static_cast(sorted_indices->size())) { + sorted_indices->resize(top_k); + } +} + +template +static inline T BBoxArea(const T* box, const bool normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + // If coordinate values are is invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return static_cast(0.); + } else { + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. + return (w + 1) * (h + 1); + } + } +} + +template +static inline T JaccardOverlap(const T* box1, const T* box2, + const bool normalized) { + if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || + box2[3] < box1[1]) { + return static_cast(0.); + } else { + const T inter_xmin = std::max(box1[0], box2[0]); + const T inter_ymin = std::max(box1[1], box2[1]); + const T inter_xmax = std::min(box1[2], box2[2]); + const T inter_ymax = std::min(box1[3], box2[3]); + T norm = normalized ? static_cast(0.) : static_cast(1.); + T inter_w = inter_xmax - inter_xmin + norm; + T inter_h = inter_ymax - inter_ymin + norm; + const T inter_area = inter_w * inter_h; + const T bbox1_area = BBoxArea(box1, normalized); + const T bbox2_area = BBoxArea(box2, normalized); + return inter_area / (bbox1_area + bbox2_area - inter_area); + } +} + +template +T PolyIoU(const T* box1, const T* box2, const size_t box_size, + const bool normalized) { + T bbox1_area = PolyArea(box1, box_size, normalized); + T bbox2_area = PolyArea(box2, box_size, normalized); + T inter_area = PolyOverlapArea(box1, box2, box_size, normalized); + if (bbox1_area == 0 || bbox2_area == 0 || inter_area == 0) { + // If coordinate values are invalid + // if area size <= 0, return 0. + return T(0.); + } else { + return inter_area / (bbox1_area + bbox2_area - inter_area); + } +} + +template +void SliceOneClass(const platform::DeviceContext& ctx, + const framework::Tensor& items, const int class_id, + framework::Tensor* one_class_item) { + T* item_data = one_class_item->mutable_data(ctx.GetPlace()); + const T* items_data = items.data(); + const int64_t num_item = items.dims()[0]; + const int class_num = items.dims()[1]; + if (items.dims().size() == 3) { + int item_size = items.dims()[2]; + for (int i = 0; i < num_item; ++i) { + std::memcpy(item_data + i * item_size, + items_data + i * class_num * item_size + class_id * item_size, + sizeof(T) * item_size); + } + } else { + for (int i = 0; i < num_item; ++i) { + item_data[i] = items_data[i * class_num + class_id]; + } + } +} + +template +class MultiClassNMSKernel : public framework::OpKernel { + public: + void NMSFast(const Tensor& bbox, const Tensor& scores, + const T score_threshold, const T nms_threshold, const T eta, + const int64_t top_k, std::vector* selected_indices, + const bool normalized) const { + // The total boxes for each instance. + int64_t num_boxes = bbox.dims()[0]; + // 4: [xmin ymin xmax ymax] + // 8: [x1 y1 x2 y2 x3 y3 x4 y4] + // 16, 24, or 32: [x1 y1 x2 y2 ... xn yn], n = 8, 12 or 16 + int64_t box_size = bbox.dims()[1]; + + std::vector scores_data(num_boxes); + std::copy_n(scores.data(), num_boxes, scores_data.begin()); + std::vector> sorted_indices; + GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices); + + selected_indices->clear(); + T adaptive_threshold = nms_threshold; + const T* bbox_data = bbox.data(); + + while (sorted_indices.size() != 0) { + const int idx = sorted_indices.front().second; + bool keep = true; + for (size_t k = 0; k < selected_indices->size(); ++k) { + if (keep) { + const int kept_idx = (*selected_indices)[k]; + T overlap = T(0.); + // 4: [xmin ymin xmax ymax] + if (box_size == 4) { + overlap = + JaccardOverlap(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, normalized); + } + // 8: [x1 y1 x2 y2 x3 y3 x4 y4] or 16, 24, 32 + if (box_size == 8 || box_size == 16 || box_size == 24 || + box_size == 32) { + overlap = PolyIoU(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, box_size, + normalized); + } + keep = overlap <= adaptive_threshold; + } else { + break; + } + } + if (keep) { + selected_indices->push_back(idx); + } + sorted_indices.erase(sorted_indices.begin()); + if (keep && eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= eta; + } + } + } + + void MultiClassNMS(const framework::ExecutionContext& ctx, + const Tensor& scores, const Tensor& bboxes, + const int scores_size, + std::map>* indices, + int* num_nmsed_out) const { + int64_t background_label = ctx.Attr("background_label"); + int64_t nms_top_k = ctx.Attr("nms_top_k"); + int64_t keep_top_k = ctx.Attr("keep_top_k"); + bool normalized = ctx.Attr("normalized"); + T nms_threshold = static_cast(ctx.Attr("nms_threshold")); + T nms_eta = static_cast(ctx.Attr("nms_eta")); + T score_threshold = static_cast(ctx.Attr("score_threshold")); + auto& dev_ctx = ctx.template device_context(); + + int num_det = 0; + + int64_t class_num = scores_size == 3 ? scores.dims()[0] : scores.dims()[1]; + Tensor bbox_slice, score_slice; + for (int64_t c = 0; c < class_num; ++c) { + if (c == background_label) continue; + if (scores_size == 3) { + score_slice = scores.Slice(c, c + 1); + bbox_slice = bboxes; + } else { + score_slice.Resize({scores.dims()[0], 1}); + bbox_slice.Resize({scores.dims()[0], 4}); + SliceOneClass(dev_ctx, scores, c, &score_slice); + SliceOneClass(dev_ctx, bboxes, c, &bbox_slice); + } + NMSFast(bbox_slice, score_slice, score_threshold, nms_threshold, nms_eta, + nms_top_k, &((*indices)[c]), normalized); + if (scores_size == 2) { + std::stable_sort((*indices)[c].begin(), (*indices)[c].end()); + } + num_det += (*indices)[c].size(); + } + + *num_nmsed_out = num_det; + const T* scores_data = scores.data(); + if (keep_top_k > -1 && num_det > keep_top_k) { + const T* sdata; + std::vector>> score_index_pairs; + for (const auto& it : *indices) { + int label = it.first; + if (scores_size == 3) { + sdata = scores_data + label * scores.dims()[1]; + } else { + score_slice.Resize({scores.dims()[0], 1}); + SliceOneClass(dev_ctx, scores, label, &score_slice); + sdata = score_slice.data(); + } + const std::vector& label_indices = it.second; + for (size_t j = 0; j < label_indices.size(); ++j) { + int idx = label_indices[j]; + score_index_pairs.push_back( + std::make_pair(sdata[idx], std::make_pair(label, idx))); + } + } + // Keep top k results per image. + std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(), + SortScorePairDescend>); + score_index_pairs.resize(keep_top_k); + + // Store the new indices. + std::map> new_indices; + for (size_t j = 0; j < score_index_pairs.size(); ++j) { + int label = score_index_pairs[j].second.first; + int idx = score_index_pairs[j].second.second; + new_indices[label].push_back(idx); + } + if (scores_size == 2) { + for (const auto& it : new_indices) { + int label = it.first; + std::stable_sort(new_indices[label].begin(), + new_indices[label].end()); + } + } + new_indices.swap(*indices); + *num_nmsed_out = keep_top_k; + } + } + + void MultiClassOutput(const platform::DeviceContext& ctx, + const Tensor& scores, const Tensor& bboxes, + const std::map>& selected_indices, + const int scores_size, Tensor* outs) const { + int64_t class_num = scores.dims()[1]; + int64_t predict_dim = scores.dims()[1]; + int64_t box_size = bboxes.dims()[1]; + if (scores_size == 2) { + box_size = bboxes.dims()[2]; + } + int64_t out_dim = box_size + 2; + auto* scores_data = scores.data(); + auto* bboxes_data = bboxes.data(); + auto* odata = outs->data(); + const T* sdata; + Tensor bbox; + bbox.Resize({scores.dims()[0], box_size}); + int count = 0; + for (const auto& it : selected_indices) { + int label = it.first; + const std::vector& indices = it.second; + if (scores_size == 2) { + SliceOneClass(ctx, bboxes, label, &bbox); + } else { + sdata = scores_data + label * predict_dim; + } + for (size_t j = 0; j < indices.size(); ++j) { + int idx = indices[j]; + odata[count * out_dim] = label; // label + const T* bdata; + if (scores_size == 3) { + bdata = bboxes_data + idx * box_size; + odata[count * out_dim + 1] = sdata[idx]; // score + } else { + bdata = bbox.data() + idx * box_size; + odata[count * out_dim + 1] = *(scores_data + idx * class_num + label); + } + // xmin, ymin, xmax, ymax or multi-points coordinates + std::memcpy(odata + count * out_dim + 2, bdata, box_size * sizeof(T)); + count++; + } + } + } + + void Compute(const framework::ExecutionContext& ctx) const override { + auto* boxes = ctx.Input("BBoxes"); + auto* scores = ctx.Input("Scores"); + auto* outs = ctx.Output("Out"); + + auto score_dims = scores->dims(); + auto score_size = score_dims.size(); + auto& dev_ctx = ctx.template device_context(); + + std::vector>> all_indices; + std::vector batch_starts = {0}; + int64_t batch_size = score_dims[0]; + int64_t box_dim = boxes->dims()[2]; + int64_t out_dim = box_dim + 2; + int num_nmsed_out = 0; + Tensor boxes_slice, scores_slice; + int n = score_size == 3 ? batch_size : boxes->lod().back().size() - 1; + for (int i = 0; i < n; ++i) { + if (score_size == 3) { + scores_slice = scores->Slice(i, i + 1); + scores_slice.Resize({score_dims[1], score_dims[2]}); + boxes_slice = boxes->Slice(i, i + 1); + boxes_slice.Resize({score_dims[2], box_dim}); + } else { + auto boxes_lod = boxes->lod().back(); + scores_slice = scores->Slice(boxes_lod[i], boxes_lod[i + 1]); + boxes_slice = boxes->Slice(boxes_lod[i], boxes_lod[i + 1]); + } + std::map> indices; + MultiClassNMS(ctx, scores_slice, boxes_slice, score_size, &indices, + &num_nmsed_out); + all_indices.push_back(indices); + batch_starts.push_back(batch_starts.back() + num_nmsed_out); + } + + int num_kept = batch_starts.back(); + if (num_kept == 0) { + T* od = outs->mutable_data({1, 1}, ctx.GetPlace()); + od[0] = -1; + batch_starts = {0, 1}; + } else { + outs->mutable_data({num_kept, out_dim}, ctx.GetPlace()); + for (int i = 0; i < n; ++i) { + if (score_size == 3) { + scores_slice = scores->Slice(i, i + 1); + boxes_slice = boxes->Slice(i, i + 1); + scores_slice.Resize({score_dims[1], score_dims[2]}); + boxes_slice.Resize({score_dims[2], box_dim}); + } else { + auto boxes_lod = boxes->lod().back(); + scores_slice = scores->Slice(boxes_lod[i], boxes_lod[i + 1]); + boxes_slice = boxes->Slice(boxes_lod[i], boxes_lod[i + 1]); + } + int64_t s = batch_starts[i]; + int64_t e = batch_starts[i + 1]; + if (e > s) { + Tensor out = outs->Slice(s, e); + MultiClassOutput(dev_ctx, scores_slice, boxes_slice, all_indices[i], + score_dims.size(), &out); + } + } + } + + framework::LoD lod; + lod.emplace_back(batch_starts); + + outs->set_lod(lod); + } +}; + +class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("BBoxes", + "Two types of bboxes are supported:" + "1. (Tensor) A 3-D Tensor with shape " + "[N, M, 4 or 8 16 24 32] represents the " + "predicted locations of M bounding bboxes, N is the batch size. " + "Each bounding box has four coordinate values and the layout is " + "[xmin, ymin, xmax, ymax], when box size equals to 4." + "2. (LoDTensor) A 3-D Tensor with shape [M, C, 4]" + "M is the number of bounding boxes, C is the class number"); + AddInput("Scores", + "Two types of scores are supported:" + "1. (Tensor) A 3-D Tensor with shape [N, C, M] represents the " + "predicted confidence predictions. N is the batch size, C is the " + "class number, M is number of bounding boxes. For each category " + "there are total M scores which corresponding M bounding boxes. " + " Please note, M is equal to the 2nd dimension of BBoxes. " + "2. (LoDTensor) A 2-D LoDTensor with shape [M, C]. " + "M is the number of bbox, C is the class number. In this case, " + "Input BBoxes should be the second case with shape [M, C, 4]."); + AddAttr( + "background_label", + "(int, default: 0) " + "The index of background label, the background label will be ignored. " + "If set to -1, then all categories will be considered.") + .SetDefault(0); + AddAttr("score_threshold", + "(float) " + "Threshold to filter out bounding boxes with low " + "confidence score. If not provided, consider all boxes."); + AddAttr("nms_top_k", + "(int64_t) " + "Maximum number of detections to be kept according to the " + "confidences aftern the filtering detections based on " + "score_threshold"); + AddAttr("nms_threshold", + "(float, default: 0.3) " + "The threshold to be used in NMS.") + .SetDefault(0.3); + AddAttr("nms_eta", + "(float) " + "The parameter for adaptive NMS.") + .SetDefault(1.0); + AddAttr("keep_top_k", + "(int64_t) " + "Number of total bboxes to be kept per image after NMS " + "step. -1 means keeping all bboxes after NMS step."); + AddAttr("normalized", + "(bool, default true) " + "Whether detections are normalized.") + .SetDefault(true); + AddOutput("Out", + "(LoDTensor) A 2-D LoDTensor with shape [No, 6] represents the " + "detections. Each row has 6 values: " + "[label, confidence, xmin, ymin, xmax, ymax] or " + "(LoDTensor) A 2-D LoDTensor with shape [No, 10] represents the " + "detections. Each row has 10 values: " + "[label, confidence, x1, y1, x2, y2, x3, y3, x4, y4]. No is the " + "total number of detections in this mini-batch." + "For each instance, " + "the offsets in first dimension are called LoD, the number of " + "offset is N + 1, if LoD[i + 1] - LoD[i] == 0, means there is " + "no detected bbox."); + AddComment(R"DOC( +This operator is to do multi-class non maximum suppression (NMS) on a batched +of boxes and scores. +In the NMS step, this operator greedily selects a subset of detection bounding +boxes that have high scores larger than score_threshold, if providing this +threshold, then selects the largest nms_top_k confidences scores if nms_top_k +is larger than -1. Then this operator pruns away boxes that have high IOU +(intersection over union) overlap with already selected boxes by adaptive +threshold NMS based on parameters of nms_threshold and nms_eta. +Aftern NMS step, at most keep_top_k number of total bboxes are to be kept +per image if keep_top_k is larger than -1. +This operator support multi-class and batched inputs. It applying NMS +independently for each class. The outputs is a 2-D LoDTenosr, for each +image, the offsets in first dimension of LoDTensor are called LoD, the number +of offset is N + 1, where N is the batch size. If LoD[i + 1] - LoD[i] == 0, +means there is no detected bbox for this image. If there is no detected boxes +for all images, all the elements in LoD are set to {1}, and the Out only +contains one value which is -1. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(multiclass_nms, ops::MultiClassNMSOp, + ops::MultiClassNMSOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(multiclass_nms, ops::MultiClassNMSKernel, + ops::MultiClassNMSKernel); diff --git a/paddle/fluid/operators/detection/poly_util.cc b/paddle/fluid/operators/detection/poly_util.cc new file mode 100644 index 00000000..1af2c95c --- /dev/null +++ b/paddle/fluid/operators/detection/poly_util.cc @@ -0,0 +1,132 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef POLY_UTIL_CC_ +#define POLY_UTIL_CC_ + +#include "paddle/fluid/operators/detection/poly_util.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using gpc::gpc_polygon_clip; +using gpc::gpc_free_polygon; + +template +void Array2PointVec(const T*& box, const size_t box_size, + std::vector>& vec) { + size_t pts_num = box_size / 2; + vec.resize(pts_num); + for (size_t i = 0; i < pts_num; i++) { + vec.at(i).x = box[2 * i]; + vec.at(i).y = box[2 * i + 1]; + } +} + +template +void Array2Poly(const T*& box, const size_t box_size, gpc::gpc_polygon& poly) { + size_t pts_num = box_size / 2; + poly.num_contours = 1; + poly.hole = (int*)malloc(sizeof(int)); + poly.hole[0] = 0; + poly.contour = (gpc::gpc_vertex_list*)malloc(sizeof(gpc::gpc_vertex_list)); + poly.contour->num_vertices = pts_num; + poly.contour->vertex = + (gpc::gpc_vertex*)malloc(sizeof(gpc::gpc_vertex) * pts_num); + for (size_t i = 0; i < pts_num; ++i) { + poly.contour->vertex[i].x = box[2 * i]; + poly.contour->vertex[i].y = box[2 * i + 1]; + } +} + +template +void PointVec2Poly(const std::vector>& vec, gpc::gpc_polygon& poly) { + int pts_num = vec.size(); + poly.num_contours = 1; + poly.hole = (int*)malloc(sizeof(int)); + poly.hole[0] = 0; + poly.contour = (gpc::gpc_vertex_list*)malloc(sizeof(gpc::gpc_vertex_list)); + poly.contour->num_vertices = pts_num; + poly.contour->vertex = + (gpc::gpc_vertex*)malloc(sizeof(gpc::gpc_vertex) * pts_num); + for (size_t i = 0; i < pts_num; ++i) { + poly.contour->vertex[i].x = vec[i].x; + poly.contour->vertex[i].y = vec[i].y; + } +} + +template +void Poly2PointVec(const gpc::gpc_vertex_list& contour, + std::vector>& vec) { + int pts_num = contour.num_vertices; + vec.resize(pts_num); + for (int i = 0; i < pts_num; i++) { + vec.at(i).x = contour.vertex[i].x; + vec.at(i).y = contour.vertex[i].y; + } +} + +template +T GetContourArea(std::vector>& vec) { + size_t pts_num = vec.size(); + if (pts_num < 3) return T(0.); + T area = T(0.); + for (size_t i = 0; i < pts_num; ++i) { + area += vec[i].x * vec[(i + 1) % pts_num].y - + vec[i].y * vec[(i + 1) % pts_num].x; + } + return std::fabs(area / 2.0); +} + +template +T PolyArea(const T* box, const size_t box_size, const bool normalized) { + // If coordinate values are is invalid + // if area size <= 0, return 0. + std::vector> vec; + Array2PointVec(box, box_size, vec); + return GetContourArea(vec); +} + +template +T PolyOverlapArea(const T* box1, const T* box2, const size_t box_size, + const bool normalized) { + gpc::gpc_polygon poly1; + gpc::gpc_polygon poly2; + Array2Poly(box1, box_size, poly1); + Array2Poly(box2, box_size, poly2); + gpc::gpc_polygon respoly; + gpc::gpc_op op = gpc::GPC_INT; + gpc::gpc_polygon_clip(op, &poly2, &poly1, &respoly); + + T inter_area = T(0.); + int contour_num = respoly.num_contours; + for (int i = 0; i < contour_num; ++i) { + std::vector> resvec; + Poly2PointVec(respoly.contour[i], resvec); + // inter_area += std::fabs(cv::contourArea(resvec)) + 0.5f * + // (cv::arcLength(resvec, true)); + inter_area += GetContourArea(resvec); + } + + gpc::gpc_free_polygon(&poly1); + gpc::gpc_free_polygon(&poly2); + gpc::gpc_free_polygon(&respoly); + return inter_area; +} + +} // namespace operators +} // namespace paddle + +#endif diff --git a/paddle/fluid/operators/detection/poly_util.h b/paddle/fluid/operators/detection/poly_util.h new file mode 100644 index 00000000..f07baf72 --- /dev/null +++ b/paddle/fluid/operators/detection/poly_util.h @@ -0,0 +1,73 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef POLY_UTIL_H_ +#define POLY_UTIL_H_ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detection/gpc.h" + +namespace paddle { +namespace operators { + +template +class Point_ { + public: + // default constructor + Point_() {} + Point_(T _x, T _y) {} + Point_(const Point_& pt) {} + + Point_& operator=(const Point_& pt); + // conversion to another data type + // template operator Point_<_T>() const; + // conversion to the old-style C structures + // operator Vec() const; + + // checks whether the point is inside the specified rectangle + // bool inside(const Rect_& r) const; + T x; //!< x coordinate of the point + T y; //!< y coordinate of the point +}; + +template +void Array2PointVec(const T*& box, const size_t box_size, + std::vector>& vec); + +template +void Array2Poly(const T*& box, const size_t box_size, gpc::gpc_polygon& poly); + +template +void PointVec2Poly(const std::vector>& vec, gpc::gpc_polygon& poly); + +template +void Poly2PointVec(const gpc::gpc_vertex_list& contour, + std::vector>& vec); + +template +T GetContourArea(std::vector>& vec); + +template +T PolyArea(const T* box, const size_t box_size, const bool normalized); + +template +T PolyOverlapArea(const T* box1, const T* box2, const size_t box_size, + const bool normalized); +} // namespace operators +} // namespace paddle + +#include "paddle/fluid/operators/detection/poly_util.cc" + +#endif // POLY_UTIL_H_ diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cc b/paddle/fluid/operators/detection/polygon_box_transform_op.cc new file mode 100644 index 00000000..4b3bc2ed --- /dev/null +++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cc @@ -0,0 +1,107 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class PolygonBoxTransformCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + auto* in = ctx.Input("Input"); + auto in_dims = in->dims(); + const T* in_data = in->data(); + auto* out = ctx.Output("Output"); + T* out_data = out->mutable_data(ctx.GetPlace()); + + int batch_size = in_dims[0]; + int geo_channel = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + int id = 0; + for (int id_n = 0; id_n < batch_size * geo_channel; ++id_n) { + for (int id_h = 0; id_h < height; ++id_h) { + for (int id_w = 0; id_w < width; ++id_w) { + id = id_n * height * width + width * id_h + id_w; + if (id_n % 2 == 0) { + out_data[id] = id_w * 4 - in_data[id]; + } else { + out_data[id] = id_h * 4 - in_data[id]; + } + } + } + } + } +}; + +class PolygonBoxTransformOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE( + ctx->HasInput("Input"), + "Input (Input) of polygon_box transform op should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("Output"), + "Output (Output) of polygon_box transform op should not be null."); + + auto in_dim = ctx->GetInputDim("Input"); + + PADDLE_ENFORCE_EQ(in_dim.size(), 4, "input's rank must be 4."); + PADDLE_ENFORCE_EQ(in_dim[1] % 2, 0, + "input's second dimension must be even."); + + ctx->SetOutputDim("Output", in_dim); + } +}; + +class PolygonBoxTransformOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "Input", + "The input with shape [batch_size, geometry_channels, height, width]"); + AddOutput("Output", "The output with the same shape as input"); + + AddComment(R"DOC( +PolygonBoxTransform Operator. + +PolygonBoxTransform Operator is used to transform the coordinate shift to the real coordinate. + +The input is the final geometry output in detection network. +We use 2*n numbers to denote the coordinate shift from n corner vertices of +the polygon_box to the pixel location. As each distance offset contains two numbers (xi, yi), +the geometry output contains 2*n channels. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(polygon_box_transform, ops::PolygonBoxTransformOp, + ops::PolygonBoxTransformOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL( + polygon_box_transform, + ops::PolygonBoxTransformCPUKernel, + ops::PolygonBoxTransformCPUKernel); diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cu b/paddle/fluid/operators/detection/polygon_box_transform_op.cu new file mode 100644 index 00000000..e1eaf084 --- /dev/null +++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cu @@ -0,0 +1,76 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using platform::PADDLE_CUDA_NUM_THREADS; +#define CUDA_BLOCK_SIZE 16 + +template +__global__ void PolygonBoxTransformKernel(const int n, const int h, const int w, + const T* input, T* output) { + int id_n = threadIdx.x + blockDim.x * blockIdx.x; + int id_h = threadIdx.y + blockDim.y * blockIdx.y; + int id_w = threadIdx.z + blockDim.z * blockIdx.z; + if (id_n < n && id_h < h && id_w < w) { + int id = id_n * h * w + w * id_h + id_w; + if (id_n % 2 == 0) { + output[id] = id_w * 4 - input[id]; + } else { + output[id] = id_h * 4 - input[id]; + } + } +} + +template +class PolygonBoxTransformOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + auto* in = ctx.Input("Input"); + auto in_dims = in->dims(); + const T* in_data = in->data(); + auto* out = ctx.Output("Output"); + T* out_data = out->mutable_data(ctx.GetPlace()); + + int batch_size = in_dims[0]; + int geo_channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + dim3 threadsPerBlock( + PADDLE_CUDA_NUM_THREADS / (CUDA_BLOCK_SIZE * CUDA_BLOCK_SIZE), + CUDA_BLOCK_SIZE, CUDA_BLOCK_SIZE); + dim3 numBlocks((batch_size * geo_channels) / threadsPerBlock.x, + (height + threadsPerBlock.y - 1) / threadsPerBlock.y, + (width + threadsPerBlock.z - 1) / threadsPerBlock.z); + auto stream = ctx.cuda_device_context().stream(); + PolygonBoxTransformKernel<<>>( + batch_size * geo_channels, height, width, in_data, out_data); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL( + polygon_box_transform, + paddle::operators::PolygonBoxTransformOpCUDAKernel, + paddle::operators::PolygonBoxTransformOpCUDAKernel); diff --git a/paddle/fluid/operators/detection/prior_box_op.cc b/paddle/fluid/operators/detection/prior_box_op.cc new file mode 100644 index 00000000..9709cbc0 --- /dev/null +++ b/paddle/fluid/operators/detection/prior_box_op.cc @@ -0,0 +1,245 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/prior_box_op.h" + +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + +namespace paddle { +namespace operators { + +class PriorBoxOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "Input(Input) of PriorBoxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Image"), + "Input(Image) of PriorBoxOp should not be null."); + + auto image_dims = ctx->GetInputDim("Image"); + auto input_dims = ctx->GetInputDim("Input"); + PADDLE_ENFORCE(image_dims.size() == 4, "The layout of image is NCHW."); + PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW."); + + PADDLE_ENFORCE_LT(input_dims[2], image_dims[2], + "The height of input must smaller than image."); + + PADDLE_ENFORCE_LT(input_dims[3], image_dims[3], + "The width of input must smaller than image."); + + auto min_sizes = ctx->Attrs().Get>("min_sizes"); + auto max_sizes = ctx->Attrs().Get>("max_sizes"); + auto variances = ctx->Attrs().Get>("variances"); + auto aspect_ratios = ctx->Attrs().Get>("aspect_ratios"); + bool flip = ctx->Attrs().Get("flip"); + + std::vector aspect_ratios_vec; + ExpandAspectRatios(aspect_ratios, flip, &aspect_ratios_vec); + + size_t num_priors = aspect_ratios_vec.size() * min_sizes.size(); + if (max_sizes.size() > 0) { + PADDLE_ENFORCE_EQ(max_sizes.size(), min_sizes.size(), + "The number of min_size and max_size must be equal."); + num_priors += max_sizes.size(); + for (size_t i = 0; i < max_sizes.size(); ++i) { + PADDLE_ENFORCE_GT(max_sizes[i], min_sizes[i], + "max_size[%d] must be greater than min_size[%d].", i, + i); + } + } + + std::vector dim_vec(4); + dim_vec[0] = input_dims[2]; + dim_vec[1] = input_dims[3]; + dim_vec[2] = num_priors; + dim_vec[3] = 4; + ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec)); + ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto input_input_type = ctx.Input("Input")->type(); + + framework::LibraryType library_{framework::LibraryType::kPlain}; + framework::DataLayout layout_ = framework::DataLayout::kAnyLayout; +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; + auto input_image_type = ctx.Input("Image")->type(); + int customized_type_value = + framework::OpKernelType::kDefaultCustomizedTypeValue; + if (input_image_type == framework::DataTypeTrait::DataType()) { + customized_type_value = kPriorBoxFLOAT; + } else if (input_image_type == + framework::DataTypeTrait::DataType()) { + customized_type_value = kPriorBoxDOUBLE; + } + return framework::OpKernelType(input_input_type, ctx.GetPlace(), layout_, + library_, customized_type_value); + } +#endif + return framework::OpKernelType(input_input_type, ctx.GetPlace(), layout_, + library_); + } +}; + +class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Input", + "(Tensor, default Tensor), " + "the input feature data of PriorBoxOp, The layout is NCHW."); + AddInput("Image", + "(Tensor, default Tensor), " + "the input image data of PriorBoxOp, The layout is NCHW."); + AddOutput("Boxes", + "(Tensor, default Tensor), the output prior boxes of " + "PriorBoxOp. The layout is [H, W, num_priors, 4]. " + "H is the height of input, W is the width of input, num_priors " + "is the box count of each position."); + AddOutput("Variances", + "(Tensor, default Tensor), the expanded variances of " + "PriorBoxOp. The layout is [H, W, num_priors, 4]. " + "H is the height of input, W is the width of input, num_priors " + "is the box count of each position."); + + AddAttr>("min_sizes", + "(vector) List of min sizes " + "of generated prior boxes.") + .AddCustomChecker([](const std::vector& min_sizes) { + PADDLE_ENFORCE_GT(min_sizes.size(), 0, + "Size of min_sizes must be at least 1."); + for (size_t i = 0; i < min_sizes.size(); ++i) { + PADDLE_ENFORCE_GT(min_sizes[i], 0.0, + "min_sizes[%d] must be positive.", i); + } + }); + AddAttr>( + "max_sizes", + "(vector) List of max sizes of generated prior boxes.") + .SetDefault(std::vector{}); + AddAttr>( + "aspect_ratios", + "(vector) List of aspect ratios of generated prior boxes."); + + AddAttr>( + "variances", + "(vector) List of variances to be encoded in prior boxes.") + .AddCustomChecker([](const std::vector& variances) { + PADDLE_ENFORCE_EQ(variances.size(), 4, + "Must and only provide 4 variance."); + for (size_t i = 0; i < variances.size(); ++i) { + PADDLE_ENFORCE_GT(variances[i], 0.0, + "variance[%d] must be greater than 0.", i); + } + }); + AddAttr("flip", "(bool) Whether to flip aspect ratios.") + .SetDefault(true); + AddAttr("clip", "(bool) Whether to clip out-of-boundary boxes.") + .SetDefault(true); + + AddAttr("step_w", + "Prior boxes step across width, 0.0 for auto calculation.") + .SetDefault(0.0) + .AddCustomChecker([](const float& step_w) { + PADDLE_ENFORCE_GE(step_w, 0.0, "step_w should be larger than 0."); + }); + AddAttr("step_h", + "Prior boxes step across height, 0.0 for auto calculation.") + .SetDefault(0.0) + .AddCustomChecker([](const float& step_h) { + PADDLE_ENFORCE_GE(step_h, 0.0, "step_h should be larger than 0."); + }); + + AddAttr("offset", + "(float) " + "Prior boxes center offset.") + .SetDefault(0.5); + AddAttr( + "min_max_aspect_ratios_order", + "(bool) If set True, the output prior box is in order of" + "[min, max, aspect_ratios], which is consistent with Caffe." + "Please note, this order affects the weights order of convolution layer" + "followed by and does not affect the final detection results.") + .SetDefault(false); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr("use_quantizer", + "(bool, default false) " + "Set to true for operators that should be quantized and use " + "int8 kernel. " + "Only used on CPU.") + .SetDefault(false); + AddComment(R"DOC( +Prior box operator +Generate prior boxes for SSD(Single Shot MultiBox Detector) algorithm. +Each position of the input produce N prior boxes, N is determined by + the count of min_sizes, max_sizes and aspect_ratios, The size of the + box is in range(min_size, max_size) interval, which is generated in + sequence according to the aspect_ratios. + +Please get more information from the following papers: +https://arxiv.org/abs/1512.02325. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(prior_box, ops::PriorBoxOp, ops::PriorBoxOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL(prior_box, ops::PriorBoxOpKernel, + ops::PriorBoxOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(prior_box, MKLDNN, + ::paddle::platform::CPUPlace, FF, + ops::kPriorBoxFLOAT, + ops::PriorBoxOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(prior_box, MKLDNN, + ::paddle::platform::CPUPlace, DD, + ops::kPriorBoxDOUBLE, + ops::PriorBoxOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(prior_box, MKLDNN, + ::paddle::platform::CPUPlace, U8F, + ops::kPriorBoxFLOAT, + ops::PriorBoxOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(prior_box, MKLDNN, + ::paddle::platform::CPUPlace, S8F, + ops::kPriorBoxFLOAT, + ops::PriorBoxOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(prior_box, MKLDNN, + ::paddle::platform::CPUPlace, U8D, + ops::kPriorBoxDOUBLE, + ops::PriorBoxOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(prior_box, MKLDNN, + ::paddle::platform::CPUPlace, S8D, + ops::kPriorBoxDOUBLE, + ops::PriorBoxOpKernel); diff --git a/paddle/fluid/operators/detection/prior_box_op.cu b/paddle/fluid/operators/detection/prior_box_op.cu new file mode 100644 index 00000000..1ea8cfc1 --- /dev/null +++ b/paddle/fluid/operators/detection/prior_box_op.cu @@ -0,0 +1,183 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/prior_box_op.h" + +namespace paddle { +namespace operators { + +template +__device__ inline T clip(T in) { + return min(max(in, 0.), 1.); +} + +template +__global__ void GenPriorBox(T* out, const T* aspect_ratios, const int height, + const int width, const int im_height, + const int im_width, const int as_num, + const T offset, const T step_width, + const T step_height, const T* min_sizes, + const T* max_sizes, const int min_num, bool is_clip, + bool min_max_aspect_ratios_order) { + int num_priors = max_sizes ? as_num * min_num + min_num : as_num * min_num; + int box_num = height * width * num_priors; + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < box_num; + i += blockDim.x * gridDim.x) { + int h = i / (num_priors * width); + int w = (i / num_priors) % width; + int p = i % num_priors; + int m = max_sizes ? p / (as_num + 1) : p / as_num; + T cx = (w + offset) * step_width; + T cy = (h + offset) * step_height; + T bw, bh; + T min_size = min_sizes[m]; + if (max_sizes) { + int s = p % (as_num + 1); + if (!min_max_aspect_ratios_order) { + if (s < as_num) { + T ar = aspect_ratios[s]; + bw = min_size * sqrt(ar) / 2.; + bh = min_size / sqrt(ar) / 2.; + } else { + T max_size = max_sizes[m]; + bw = sqrt(min_size * max_size) / 2.; + bh = bw; + } + } else { + if (s == 0) { + bw = bh = min_size / 2.; + } else if (s == 1) { + T max_size = max_sizes[m]; + bw = sqrt(min_size * max_size) / 2.; + bh = bw; + } else { + T ar = aspect_ratios[s - 1]; + bw = min_size * sqrt(ar) / 2.; + bh = min_size / sqrt(ar) / 2.; + } + } + } else { + int s = p % as_num; + T ar = aspect_ratios[s]; + bw = min_size * sqrt(ar) / 2.; + bh = min_size / sqrt(ar) / 2.; + } + T xmin = (cx - bw) / im_width; + T ymin = (cy - bh) / im_height; + T xmax = (cx + bw) / im_width; + T ymax = (cy + bh) / im_height; + out[i * 4] = is_clip ? clip(xmin) : xmin; + out[i * 4 + 1] = is_clip ? clip(ymin) : ymin; + out[i * 4 + 2] = is_clip ? clip(xmax) : xmax; + out[i * 4 + 3] = is_clip ? clip(ymax) : ymax; + } +} + +template +__global__ void SetVariance(T* out, const T* var, const int vnum, + const int num) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; + i += blockDim.x * gridDim.x) { + out[i] = var[i % vnum]; + } +} + +template +class PriorBoxOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* image = ctx.Input("Image"); + auto* boxes = ctx.Output("Boxes"); + auto* vars = ctx.Output("Variances"); + + auto min_sizes = ctx.Attr>("min_sizes"); + auto max_sizes = ctx.Attr>("max_sizes"); + auto input_aspect_ratio = ctx.Attr>("aspect_ratios"); + auto variances = ctx.Attr>("variances"); + auto flip = ctx.Attr("flip"); + auto clip = ctx.Attr("clip"); + auto min_max_aspect_ratios_order = + ctx.Attr("min_max_aspect_ratios_order"); + + std::vector aspect_ratios; + ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios); + + T step_w = static_cast(ctx.Attr("step_w")); + T step_h = static_cast(ctx.Attr("step_h")); + T offset = static_cast(ctx.Attr("offset")); + + auto im_width = image->dims()[3]; + auto im_height = image->dims()[2]; + + auto width = input->dims()[3]; + auto height = input->dims()[2]; + + T step_width, step_height; + if (step_w == 0 || step_h == 0) { + step_width = static_cast(im_width) / width; + step_height = static_cast(im_height) / height; + } else { + step_width = step_w; + step_height = step_h; + } + + int num_priors = aspect_ratios.size() * min_sizes.size(); + if (max_sizes.size() > 0) { + num_priors += max_sizes.size(); + } + int min_num = static_cast(min_sizes.size()); + int box_num = width * height * num_priors; + + int block = 512; + int grid = (box_num + block - 1) / block; + + auto stream = + ctx.template device_context().stream(); + + boxes->mutable_data(ctx.GetPlace()); + vars->mutable_data(ctx.GetPlace()); + + framework::Tensor r; + framework::TensorFromVector(aspect_ratios, ctx.device_context(), &r); + + framework::Tensor min; + framework::TensorFromVector(min_sizes, ctx.device_context(), &min); + + T* max_data = nullptr; + framework::Tensor max; + if (max_sizes.size() > 0) { + framework::TensorFromVector(max_sizes, ctx.device_context(), &max); + max_data = max.data(); + } + + GenPriorBox<<>>( + boxes->data(), r.data(), height, width, im_height, im_width, + aspect_ratios.size(), offset, step_width, step_height, min.data(), + max_data, min_num, clip, min_max_aspect_ratios_order); + + framework::Tensor v; + framework::TensorFromVector(variances, ctx.device_context(), &v); + grid = (box_num * 4 + block - 1) / block; + SetVariance<<>>(vars->data(), v.data(), + variances.size(), box_num * 4); + } +}; // namespace operators + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(prior_box, ops::PriorBoxOpCUDAKernel, + ops::PriorBoxOpCUDAKernel); diff --git a/paddle/fluid/operators/detection/prior_box_op.h b/paddle/fluid/operators/detection/prior_box_op.h new file mode 100644 index 00000000..71c67b44 --- /dev/null +++ b/paddle/fluid/operators/detection/prior_box_op.h @@ -0,0 +1,205 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/transform.h" + +namespace paddle { +namespace operators { + +constexpr int kPriorBoxFLOAT = 1; +constexpr int kPriorBoxDOUBLE = 2; + +inline void ExpandAspectRatios(const std::vector& input_aspect_ratior, + bool flip, + std::vector* output_aspect_ratior) { + constexpr float epsilon = 1e-6; + output_aspect_ratior->clear(); + output_aspect_ratior->push_back(1.0f); + for (size_t i = 0; i < input_aspect_ratior.size(); ++i) { + float ar = input_aspect_ratior[i]; + bool already_exist = false; + for (size_t j = 0; j < output_aspect_ratior->size(); ++j) { + if (fabs(ar - output_aspect_ratior->at(j)) < epsilon) { + already_exist = true; + break; + } + } + if (!already_exist) { + output_aspect_ratior->push_back(ar); + if (flip) { + output_aspect_ratior->push_back(1.0f / ar); + } + } + } +} + +template +class PriorBoxOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* image = ctx.Input("Image"); + auto* boxes = ctx.Output("Boxes"); + auto* vars = ctx.Output("Variances"); + + auto min_sizes = ctx.Attr>("min_sizes"); + auto max_sizes = ctx.Attr>("max_sizes"); + auto input_aspect_ratio = ctx.Attr>("aspect_ratios"); + auto variances = ctx.Attr>("variances"); + auto flip = ctx.Attr("flip"); + auto clip = ctx.Attr("clip"); + auto min_max_aspect_ratios_order = + ctx.Attr("min_max_aspect_ratios_order"); + + std::vector aspect_ratios; + ExpandAspectRatios(input_aspect_ratio, flip, &aspect_ratios); + + K step_w = static_cast(ctx.Attr("step_w")); + K step_h = static_cast(ctx.Attr("step_h")); + K offset = static_cast(ctx.Attr("offset")); + + auto img_width = image->dims()[3]; + auto img_height = image->dims()[2]; + + auto feature_width = input->dims()[3]; + auto feature_height = input->dims()[2]; + + K step_width, step_height; + if (step_w == 0 || step_h == 0) { + step_width = static_cast(img_width) / feature_width; + step_height = static_cast(img_height) / feature_height; + } else { + step_width = step_w; + step_height = step_h; + } + + int num_priors = aspect_ratios.size() * min_sizes.size(); + if (max_sizes.size() > 0) { + num_priors += max_sizes.size(); + } + + boxes->mutable_data(ctx.GetPlace()); + vars->mutable_data(ctx.GetPlace()); + + K* b_t = boxes->data(); + for (int h = 0; h < feature_height; ++h) { + for (int w = 0; w < feature_width; ++w) { + K center_x = (w + offset) * step_width; + K center_y = (h + offset) * step_height; + K box_width, box_height; + for (size_t s = 0; s < min_sizes.size(); ++s) { + auto min_size = min_sizes[s]; + if (min_max_aspect_ratios_order) { + box_width = box_height = min_size / 2.; + b_t[0] = (center_x - box_width) / img_width; + b_t[1] = (center_y - box_height) / img_height; + b_t[2] = (center_x + box_width) / img_width; + b_t[3] = (center_y + box_height) / img_height; + b_t += 4; + if (max_sizes.size() > 0) { + auto max_size = max_sizes[s]; + // square prior with size sqrt(minSize * maxSize) + box_width = box_height = sqrt(min_size * max_size) / 2.; + b_t[0] = (center_x - box_width) / img_width; + b_t[1] = (center_y - box_height) / img_height; + b_t[2] = (center_x + box_width) / img_width; + b_t[3] = (center_y + box_height) / img_height; + b_t += 4; + } + // priors with different aspect ratios + for (size_t r = 0; r < aspect_ratios.size(); ++r) { + float ar = aspect_ratios[r]; + if (fabs(ar - 1.) < 1e-6) { + continue; + } + box_width = min_size * sqrt(ar) / 2.; + box_height = min_size / sqrt(ar) / 2.; + b_t[0] = (center_x - box_width) / img_width; + b_t[1] = (center_y - box_height) / img_height; + b_t[2] = (center_x + box_width) / img_width; + b_t[3] = (center_y + box_height) / img_height; + b_t += 4; + } + } else { + // priors with different aspect ratios + for (size_t r = 0; r < aspect_ratios.size(); ++r) { + float ar = aspect_ratios[r]; + box_width = min_size * sqrt(ar) / 2.; + box_height = min_size / sqrt(ar) / 2.; + b_t[0] = (center_x - box_width) / img_width; + b_t[1] = (center_y - box_height) / img_height; + b_t[2] = (center_x + box_width) / img_width; + b_t[3] = (center_y + box_height) / img_height; + b_t += 4; + } + if (max_sizes.size() > 0) { + auto max_size = max_sizes[s]; + // square prior with size sqrt(minSize * maxSize) + box_width = box_height = sqrt(min_size * max_size) / 2.; + b_t[0] = (center_x - box_width) / img_width; + b_t[1] = (center_y - box_height) / img_height; + b_t[2] = (center_x + box_width) / img_width; + b_t[3] = (center_y + box_height) / img_height; + b_t += 4; + } + } + } + } + } + + if (clip) { + K* dt = boxes->data(); + std::transform(dt, dt + boxes->numel(), dt, [](K v) -> K { + return std::min(std::max(v, 0.), 1.); + }); + } + + framework::Tensor var_t; + var_t.mutable_data( + framework::make_ddim({1, static_cast(variances.size())}), + ctx.GetPlace()); + auto var_et = framework::EigenTensor::From(var_t); + +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (size_t i = 0; i < variances.size(); ++i) { + var_et(0, i) = variances[i]; + } + + int box_num = feature_height * feature_width * num_priors; + auto var_dim = vars->dims(); + vars->Resize({box_num, static_cast(variances.size())}); + + auto e_vars = framework::EigenMatrix::From(*vars); + +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for collapse(2) +#endif + for (int i = 0; i < box_num; ++i) { + for (int j = 0; j < variances.size(); ++j) { + e_vars(i, j) = variances[j]; + } + } + vars->Resize(var_dim); + } +}; // namespace operators + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/retinanet_detection_output_op.cc b/paddle/fluid/operators/detection/retinanet_detection_output_op.cc new file mode 100644 index 00000000..4a6dfec1 --- /dev/null +++ b/paddle/fluid/operators/detection/retinanet_detection_output_op.cc @@ -0,0 +1,566 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +class RetinanetDetectionOutputOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_GE( + ctx->Inputs("BBoxes").size(), 1UL, + "Input(BBoxes) of RetinanetDetectionOutput should not be null."); + PADDLE_ENFORCE_GE( + ctx->Inputs("Scores").size(), 1UL, + "Input(Scores) of RetinanetDetectionOutput should not be null."); + PADDLE_ENFORCE_GE( + ctx->Inputs("Anchors").size(), 1UL, + "Input(Anchors) of RetinanetDetectionOutput should not be null."); + PADDLE_ENFORCE_EQ( + ctx->Inputs("BBoxes").size(), ctx->Inputs("Scores").size(), + "Input tensors(BBoxes and Scores) should have the same size."); + PADDLE_ENFORCE_EQ( + ctx->Inputs("BBoxes").size(), ctx->Inputs("Anchors").size(), + "Input tensors(BBoxes and Anchors) should have the same size."); + PADDLE_ENFORCE( + ctx->HasInput("ImInfo"), + "Input(ImInfo) of RetinanetDetectionOutput should not be null"); + PADDLE_ENFORCE( + ctx->HasOutput("Out"), + "Output(Out) of RetinanetDetectionOutput should not be null."); + + auto bboxes_dims = ctx->GetInputsDim("BBoxes"); + auto scores_dims = ctx->GetInputsDim("Scores"); + auto anchors_dims = ctx->GetInputsDim("Anchors"); + auto im_info_dims = ctx->GetInputDim("ImInfo"); + + const size_t b_n = bboxes_dims.size(); + PADDLE_ENFORCE_GT(b_n, 0, "Input bbox tensors count should > 0."); + const size_t s_n = scores_dims.size(); + PADDLE_ENFORCE_GT(s_n, 0, "Input score tensors count should > 0."); + const size_t a_n = anchors_dims.size(); + PADDLE_ENFORCE_GT(a_n, 0, "Input anchor tensors count should > 0."); + + auto bbox_dims = bboxes_dims[0]; + auto score_dims = scores_dims[0]; + auto anchor_dims = anchors_dims[0]; + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ(score_dims.size(), 3, + "The rank of Input(Scores) must be 3"); + PADDLE_ENFORCE_EQ(bbox_dims.size(), 3, + "The rank of Input(BBoxes) must be 3"); + PADDLE_ENFORCE_EQ(anchor_dims.size(), 2, + "The rank of Input(Anchors) must be 2"); + PADDLE_ENFORCE(bbox_dims[2] == 4, + "The last dimension of Input(BBoxes) must be 4, " + "represents the layout of coordinate " + "[xmin, ymin, xmax, ymax]"); + PADDLE_ENFORCE_EQ(bbox_dims[1], score_dims[1], + "The 2nd dimension of Input(BBoxes) must be equal to " + "2nd dimension of Input(Scores), which represents the " + "number of the predicted boxes."); + + PADDLE_ENFORCE_EQ(anchor_dims[0], bbox_dims[1], + "The 1st dimension of Input(Anchors) must be equal to " + "2nd dimension of Input(BBoxes), which represents the " + "number of the predicted boxes."); + PADDLE_ENFORCE_EQ(im_info_dims.size(), 2, + "The rank of Input(ImInfo) must be 2."); + } + // Here the box_dims[0] is not the real dimension of output. + // It will be rewritten in the computing kernel. + ctx->SetOutputDim("Out", {bbox_dims[1], bbox_dims[2] + 2}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto input_data_type = + framework::GetDataTypeOfVar(ctx.MultiInputVar("Scores")[0]); + + return framework::OpKernelType(input_data_type, + platform::CPUPlace()); // ctx.GetPlace()); + } +}; + +template +bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} + +template +bool SortScoreTwoPairDescend(const std::pair>& pair1, + const std::pair>& pair2) { + return pair1.first > pair2.first; +} + +template +static inline void GetMaxScoreIndex( + const std::vector& scores, const T threshold, int top_k, + std::vector>* sorted_indices) { + for (size_t i = 0; i < scores.size(); ++i) { + if (scores[i] > threshold) { + sorted_indices->push_back(std::make_pair(scores[i], i)); + } + } + // Sort the score pair according to the scores in descending order + std::stable_sort(sorted_indices->begin(), sorted_indices->end(), + SortScorePairDescend); + // Keep top_k scores if needed. + if (top_k > -1 && top_k < static_cast(sorted_indices->size())) { + sorted_indices->resize(top_k); + } +} + +template +static inline T BBoxArea(const std::vector& box, const bool normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + // If coordinate values are is invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return static_cast(0.); + } else { + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. + return (w + 1) * (h + 1); + } + } +} + +template +static inline T JaccardOverlap(const std::vector& box1, + const std::vector& box2, + const bool normalized) { + if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || + box2[3] < box1[1]) { + return static_cast(0.); + } else { + const T inter_xmin = std::max(box1[0], box2[0]); + const T inter_ymin = std::max(box1[1], box2[1]); + const T inter_xmax = std::min(box1[2], box2[2]); + const T inter_ymax = std::min(box1[3], box2[3]); + T norm = normalized ? static_cast(0.) : static_cast(1.); + T inter_w = inter_xmax - inter_xmin + norm; + T inter_h = inter_ymax - inter_ymin + norm; + const T inter_area = inter_w * inter_h; + const T bbox1_area = BBoxArea(box1, normalized); + const T bbox2_area = BBoxArea(box2, normalized); + return inter_area / (bbox1_area + bbox2_area - inter_area); + } +} + +template +class RetinanetDetectionOutputKernel : public framework::OpKernel { + public: + void NMSFast(const std::vector>& cls_dets, + const T nms_threshold, const T eta, + std::vector* selected_indices) const { + int64_t num_boxes = cls_dets.size(); + std::vector> sorted_indices; + for (int64_t i = 0; i < num_boxes; ++i) { + sorted_indices.push_back(std::make_pair(cls_dets[i][4], i)); + } + // Sort the score pair according to the scores in descending order + std::stable_sort(sorted_indices.begin(), sorted_indices.end(), + SortScorePairDescend); + selected_indices->clear(); + T adaptive_threshold = nms_threshold; + + while (sorted_indices.size() != 0) { + const int idx = sorted_indices.front().second; + bool keep = true; + for (size_t k = 0; k < selected_indices->size(); ++k) { + if (keep) { + const int kept_idx = (*selected_indices)[k]; + T overlap = T(0.); + + overlap = JaccardOverlap(cls_dets[idx], cls_dets[kept_idx], false); + keep = overlap <= adaptive_threshold; + } else { + break; + } + } + if (keep) { + selected_indices->push_back(idx); + } + sorted_indices.erase(sorted_indices.begin()); + if (keep && eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= eta; + } + } + } + + void DeltaScoreToPrediction( + const std::vector& bboxes_data, const std::vector& anchors_data, + T im_height, T im_width, T im_scale, int class_num, + const std::vector>& sorted_indices, + std::map>>* preds) const { + im_height = static_cast(round(im_height / im_scale)); + im_width = static_cast(round(im_width / im_scale)); + T zero(0); + int i = 0; + for (const auto& it : sorted_indices) { + T score = it.first; + int idx = it.second; + int a = idx / class_num; + int c = idx % class_num; + + int box_offset = a * 4; + T anchor_box_width = + anchors_data[box_offset + 2] - anchors_data[box_offset] + 1; + T anchor_box_height = + anchors_data[box_offset + 3] - anchors_data[box_offset + 1] + 1; + T anchor_box_center_x = anchors_data[box_offset] + anchor_box_width / 2; + T anchor_box_center_y = + anchors_data[box_offset + 1] + anchor_box_height / 2; + T target_box_center_x = 0, target_box_center_y = 0; + T target_box_width = 0, target_box_height = 0; + target_box_center_x = + bboxes_data[box_offset] * anchor_box_width + anchor_box_center_x; + target_box_center_y = + bboxes_data[box_offset + 1] * anchor_box_height + anchor_box_center_y; + target_box_width = + std::exp(bboxes_data[box_offset + 2]) * anchor_box_width; + target_box_height = + std::exp(bboxes_data[box_offset + 3]) * anchor_box_height; + T pred_box_xmin = target_box_center_x - target_box_width / 2; + T pred_box_ymin = target_box_center_y - target_box_height / 2; + T pred_box_xmax = target_box_center_x + target_box_width / 2 - 1; + T pred_box_ymax = target_box_center_y + target_box_height / 2 - 1; + pred_box_xmin = pred_box_xmin / im_scale; + pred_box_ymin = pred_box_ymin / im_scale; + pred_box_xmax = pred_box_xmax / im_scale; + pred_box_ymax = pred_box_ymax / im_scale; + + pred_box_xmin = std::max(std::min(pred_box_xmin, im_width - 1), zero); + pred_box_ymin = std::max(std::min(pred_box_ymin, im_height - 1), zero); + pred_box_xmax = std::max(std::min(pred_box_xmax, im_width - 1), zero); + pred_box_ymax = std::max(std::min(pred_box_ymax, im_height - 1), zero); + + std::vector one_pred; + one_pred.push_back(pred_box_xmin); + one_pred.push_back(pred_box_ymin); + one_pred.push_back(pred_box_xmax); + one_pred.push_back(pred_box_ymax); + one_pred.push_back(score); + (*preds)[c].push_back(one_pred); + i++; + } + } + + void MultiClassNMS(const std::map>>& preds, + int class_num, const int keep_top_k, const T nms_threshold, + const T nms_eta, std::vector>* nmsed_out, + int* num_nmsed_out) const { + std::map> indices; + int num_det = 0; + for (int c = 0; c < class_num; ++c) { + if (static_cast(preds.count(c))) { + const std::vector> cls_dets = preds.at(c); + NMSFast(cls_dets, nms_threshold, nms_eta, &(indices[c])); + num_det += indices[c].size(); + } + } + + std::vector>> score_index_pairs; + for (const auto& it : indices) { + int label = it.first; + const std::vector& label_indices = it.second; + for (size_t j = 0; j < label_indices.size(); ++j) { + int idx = label_indices[j]; + score_index_pairs.push_back(std::make_pair(preds.at(label)[idx][4], + std::make_pair(label, idx))); + } + } + // Keep top k results per image. + std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(), + SortScoreTwoPairDescend); + if (num_det > keep_top_k) { + score_index_pairs.resize(keep_top_k); + } + + // Store the new indices. + std::map> new_indices; + for (const auto& it : score_index_pairs) { + int label = it.second.first; + int idx = it.second.second; + std::vector one_pred; + one_pred.push_back(label); + one_pred.push_back(preds.at(label)[idx][4]); + one_pred.push_back(preds.at(label)[idx][0]); + one_pred.push_back(preds.at(label)[idx][1]); + one_pred.push_back(preds.at(label)[idx][2]); + one_pred.push_back(preds.at(label)[idx][3]); + nmsed_out->push_back(one_pred); + } + + *num_nmsed_out = (num_det > keep_top_k ? keep_top_k : num_det); + } + + void RetinanetDetectionOutput(const framework::ExecutionContext& ctx, + const std::vector& scores, + const std::vector& bboxes, + const std::vector& anchors, + const Tensor& im_info, + std::vector>* nmsed_out, + int* num_nmsed_out) const { + int64_t nms_top_k = ctx.Attr("nms_top_k"); + int64_t keep_top_k = ctx.Attr("keep_top_k"); + T nms_threshold = static_cast(ctx.Attr("nms_threshold")); + T nms_eta = static_cast(ctx.Attr("nms_eta")); + T score_threshold = static_cast(ctx.Attr("score_threshold")); + + int64_t class_num = scores[0].dims()[1]; + std::map>> preds; + for (size_t l = 0; l < scores.size(); ++l) { + // Fetch per level score + Tensor scores_per_level = scores[l]; + // Fetch per level bbox + Tensor bboxes_per_level = bboxes[l]; + // Fetch per level anchor + Tensor anchors_per_level = anchors[l]; + + int64_t scores_num = scores_per_level.numel(); + int64_t bboxes_num = bboxes_per_level.numel(); + std::vector scores_data(scores_num); + std::vector bboxes_data(bboxes_num); + std::vector anchors_data(bboxes_num); + std::copy_n(scores_per_level.data(), scores_num, scores_data.begin()); + std::copy_n(bboxes_per_level.data(), bboxes_num, bboxes_data.begin()); + std::copy_n(anchors_per_level.data(), bboxes_num, + anchors_data.begin()); + std::vector> sorted_indices; + + // For the highest level, we take the threshold 0.0 + T threshold = (l < (scores.size() - 1) ? score_threshold : 0.0); + GetMaxScoreIndex(scores_data, threshold, nms_top_k, &sorted_indices); + auto* im_info_data = im_info.data(); + auto im_height = im_info_data[0]; + auto im_width = im_info_data[1]; + auto im_scale = im_info_data[2]; + DeltaScoreToPrediction(bboxes_data, anchors_data, im_height, im_width, + im_scale, class_num, sorted_indices, &preds); + } + + MultiClassNMS(preds, class_num, keep_top_k, nms_threshold, nms_eta, + nmsed_out, num_nmsed_out); + } + + void MultiClassOutput(const platform::DeviceContext& ctx, + const std::vector>& nmsed_out, + Tensor* outs) const { + auto* odata = outs->data(); + int count = 0; + int64_t out_dim = 6; + for (size_t i = 0; i < nmsed_out.size(); ++i) { + odata[count * out_dim] = nmsed_out[i][0] + 1; // label + odata[count * out_dim + 1] = nmsed_out[i][1]; // score + odata[count * out_dim + 2] = nmsed_out[i][2]; // xmin + odata[count * out_dim + 3] = nmsed_out[i][3]; // xmin + odata[count * out_dim + 4] = nmsed_out[i][4]; // xmin + odata[count * out_dim + 5] = nmsed_out[i][5]; // xmin + count++; + } + } + + void Compute(const framework::ExecutionContext& ctx) const override { + auto boxes = ctx.MultiInput("BBoxes"); + auto scores = ctx.MultiInput("Scores"); + auto anchors = ctx.MultiInput("Anchors"); + auto* im_info = ctx.Input("ImInfo"); + auto* outs = ctx.Output("Out"); + + std::vector boxes_list(boxes.size()); + std::vector scores_list(scores.size()); + std::vector anchors_list(anchors.size()); + for (size_t j = 0; j < boxes_list.size(); ++j) { + boxes_list[j] = *boxes[j]; + scores_list[j] = *scores[j]; + anchors_list[j] = *anchors[j]; + } + auto score_dims = scores_list[0].dims(); + int64_t batch_size = score_dims[0]; + auto box_dims = boxes_list[0].dims(); + int64_t box_dim = box_dims[2]; + int64_t out_dim = box_dim + 2; + + auto& dev_ctx = ctx.template device_context(); + + std::vector>> all_nmsed_out; + std::vector batch_starts = {0}; + for (int i = 0; i < batch_size; ++i) { + int num_nmsed_out = 0; + std::vector box_per_batch_list(boxes_list.size()); + std::vector score_per_batch_list(scores_list.size()); + for (size_t j = 0; j < boxes_list.size(); ++j) { + auto score_dims = scores_list[j].dims(); + score_per_batch_list[j] = scores_list[j].Slice(i, i + 1); + score_per_batch_list[j].Resize({score_dims[1], score_dims[2]}); + box_per_batch_list[j] = boxes_list[j].Slice(i, i + 1); + box_per_batch_list[j].Resize({score_dims[1], box_dim}); + } + Tensor im_info_slice = im_info->Slice(i, i + 1); + + std::vector> nmsed_out; + RetinanetDetectionOutput(ctx, score_per_batch_list, box_per_batch_list, + anchors_list, im_info_slice, &nmsed_out, + &num_nmsed_out); + all_nmsed_out.push_back(nmsed_out); + batch_starts.push_back(batch_starts.back() + num_nmsed_out); + } + + int num_kept = batch_starts.back(); + if (num_kept == 0) { + outs->Resize({0, out_dim}); + } else { + outs->mutable_data({num_kept, out_dim}, ctx.GetPlace()); + for (int i = 0; i < batch_size; ++i) { + int64_t s = batch_starts[i]; + int64_t e = batch_starts[i + 1]; + if (e > s) { + Tensor out = outs->Slice(s, e); + MultiClassOutput(dev_ctx, all_nmsed_out[i], &out); + } + } + } + + framework::LoD lod; + lod.emplace_back(batch_starts); + + outs->set_lod(lod); + } +}; + +class RetinanetDetectionOutputOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("BBoxes", + "(List) A list of tensors from multiple FPN levels. Each " + "element is a 3-D Tensor with shape [N, Mi, 4] represents the " + "predicted locations of Mi bounding boxes, N is the batch size. " + "Mi is the number of bounding boxes from i-th FPN level. Each " + "bounding box has four coordinate values and the layout is " + "[xmin, ymin, xmax, ymax].") + .AsDuplicable(); + AddInput("Scores", + "(List) A list of tensors from multiple FPN levels. Each " + "element is a 3-D Tensor with shape [N, Mi, C] represents the " + "predicted confidence from its FPN level. N is the batch size, " + "C is the class number (excluding background), Mi is the number " + "of bounding boxes from i-th FPN level. For each bounding box, " + "there are total C scores.") + .AsDuplicable(); + AddInput("Anchors", + "(List) A list of tensors from multiple FPN levels. Each" + "element is a 2-D Tensor with shape [Mi, 4] represents the " + "locations of Mi anchor boxes from i-th FPN level. Each " + "bounding box has four coordinate values and the layout is " + "[xmin, ymin, xmax, ymax].") + .AsDuplicable(); + AddInput("ImInfo", + "(LoDTensor) A 2-D LoDTensor with shape [N, 3] represents the " + "image information. N is the batch size, each image information " + "includes height, width and scale."); + AddAttr("score_threshold", + "(float) " + "Threshold to filter out bounding boxes with a confidence " + "score."); + AddAttr("nms_top_k", + "(int64_t) " + "Maximum number of detections per FPN layer to be kept " + "according to the confidence before NMS."); + AddAttr("nms_threshold", + "(float) " + "The threshold to be used in NMS."); + AddAttr("nms_eta", + "(float) " + "The parameter for adaptive NMS."); + AddAttr( + "keep_top_k", + "(int64_t) " + "Number of total bounding boxes to be kept per image after NMS " + "step."); + AddOutput("Out", + "(LoDTensor) A 2-D LoDTensor with shape [No, 6] represents the " + "detections. Each row has 6 values: " + "[label, confidence, xmin, ymin, xmax, ymax]" + "No is the total number of detections in this mini-batch." + "For each instance, " + "the offsets in first dimension are called LoD, the number of " + "offset is N + 1, if LoD[i + 1] - LoD[i] == 0, means there is " + "no detected bbox."); + AddComment(R"DOC( +This operator is to decode boxes and scores from each FPN layer and do +multi-class non maximum suppression (NMS) on merged predictions. + +Top-scoring predictions per FPN layer are decoded with the anchor +information. This operator greedily selects a subset of detection bounding +boxes from each FPN layer that have high scores larger than score_threshold, +if providing this threshold, then selects the largest nms_top_k confidences +scores per FPN layer, if nms_top_k is larger than -1. +The decoding schema is described below: + +ox = (pw * pxv * tx * + px) - tw / 2 + +oy = (ph * pyv * ty * + py) - th / 2 + +ow = exp(pwv * tw) * pw + tw / 2 + +oh = exp(phv * th) * ph + th / 2 + +where `tx`, `ty`, `tw`, `th` denote the predicted box's center coordinates, width +and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote the +anchor's center coordinates, width and height. `pxv`, `pyv`, `pwv`, +`phv` denote the variance of the anchor box and `ox`, `oy`, `ow`, `oh` denote the +decoded coordinates, width and height. + +Then the top decoded prediction from all levels are merged followed by NMS. +In the NMS step, this operator prunes away boxes that have high IOU +(intersection over union) overlap with already selected boxes by adaptive +threshold NMS based on parameters of nms_threshold and nms_eta. +After NMS step, at most keep_top_k number of total bounding boxes are to be kept +per image if keep_top_k is larger than -1. +This operator support multi-class and batched inputs. It applying NMS +independently for each class. The outputs is a 2-D LoDTenosr, for each +image, the offsets in first dimension of LoDTensor are called LoD, the number +of offset is N + 1, where N is the batch size. If LoD[i + 1] - LoD[i] == 0, +means there is no detected bounding box for this image. If there is no detected boxes +for all images, all the elements in LoD are set to 0, and the output tensor is +empty (None). +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(retinanet_detection_output, ops::RetinanetDetectionOutputOp, + ops::RetinanetDetectionOutputOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(retinanet_detection_output, + ops::RetinanetDetectionOutputKernel, + ops::RetinanetDetectionOutputKernel); diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc new file mode 100644 index 00000000..6628dde5 --- /dev/null +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc @@ -0,0 +1,653 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +bool GT_E(T a, T b) { + return (a > b) || fabs(a - b) < 1e-4; +} + +template +bool LT_E(T a, T b) { + return (a < b) || fabs(a - b) < 1e-4; +} + +template +bool GT(T a, T b) { + return (a - b) > 1e-4; +} + +/* +*check if (x, y) is in the boundary of roi +*/ +template +bool in_quad(T x, T y, T roi_x[], T roi_y[]) { + for (int i = 0; i < 4; i++) { + T xs = roi_x[i]; + T ys = roi_y[i]; + T xe = roi_x[(i + 1) % 4]; + T ye = roi_y[(i + 1) % 4]; + if (fabs(ys - ye) < 1e-4) { + if (fabs(y - ys) < 1e-4 && fabs(y - ye) < 1e-4 && + GT_E(x, std::min(xs, xe)) && LT_E(x, std::max(xs, xe))) { + return true; + } + } else { + T intersec_x = (y - ys) * (xe - xs) / (ye - ys) + xs; + if (fabs(intersec_x - x) < 1e-4 && GT_E(y, std::min(ys, ye)) && + LT_E(y, std::max(ys, ye))) { + return true; + } + } + } + + int n_cross = 0; + for (int i = 0; i < 4; i++) { + T xs = roi_x[i]; + T ys = roi_y[i]; + T xe = roi_x[(i + 1) % 4]; + T ye = roi_y[(i + 1) % 4]; + if (fabs(ys - ye) < 1e-4) { + continue; + } + if (LT_E(y, std::min(ys, ye)) || GT(y, std::max(ys, ye))) { + continue; + } + T intersec_x = (y - ys) * (xe - xs) / (ye - ys) + xs; + if (fabs(intersec_x - x) < 1e-4) { + return true; + } + if (GT(intersec_x, x)) { + n_cross++; + } + } + return (n_cross % 2 == 1); +} + +/** + * Get the matrix of perspective transform. + * + * dx1 = x1 - x2 + * dx2 = x3 - x2 + * dx3 = x0 - x1 + x2 - x3 + * dy1 = y1 - y2 + * dy2 = y3 - y2 + * dy3 = y0 - y1 + y2 - y3 + * + * a11 = (x1 - x0 + a31 * (w - 1) * x1) / (w - 1) + * a12 = (x3 - x0 + a32 * (h - 1) * x3) / (h - 1) + * a13 = x0 + * a21 = (y1 - y0 + a31 * (w - 1) * y1) / (w - 1) + * a22 = (y3 - y0 + a32 * (h - 1) * y3) / (h - 1) + * a23 = y0 + * a31 = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1) / (w - 1) + * a32 = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1) / (h - 1) + * a33 = 1 + */ +template +void get_transform_matrix(const int transformed_width, + const int transformed_height, T roi_x[], T roi_y[], + T matrix[]) { + T x0 = roi_x[0]; + T x1 = roi_x[1]; + T x2 = roi_x[2]; + T x3 = roi_x[3]; + T y0 = roi_y[0]; + T y1 = roi_y[1]; + T y2 = roi_y[2]; + T y3 = roi_y[3]; + + // Estimate the height and width of RoI + T len1 = sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1)); + T len2 = sqrt((x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2)); + T len3 = sqrt((x2 - x3) * (x2 - x3) + (y2 - y3) * (y2 - y3)); + T len4 = sqrt((x3 - x0) * (x3 - x0) + (y3 - y0) * (y3 - y0)); + T estimated_height = (len2 + len4) / 2.0; + T estimated_width = (len1 + len3) / 2.0; + + // Get the normalized height and normalized width + int normalized_height = transformed_height; + int normalized_width = + std::round(estimated_width * (normalized_height - 1) / estimated_height) + + 1; + normalized_width = std::min(normalized_width, transformed_width); + + T dx1 = x1 - x2; + T dx2 = x3 - x2; + T dx3 = x0 - x1 + x2 - x3; + T dy1 = y1 - y2; + T dy2 = y3 - y2; + T dy3 = y0 - y1 + y2 - y3; + + matrix[6] = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1) / + (normalized_width - 1); + matrix[7] = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1) / + (normalized_height - 1); + matrix[8] = 1; + + matrix[3] = (y1 - y0 + matrix[6] * (normalized_width - 1) * y1) / + (normalized_width - 1); + matrix[4] = (y3 - y0 + matrix[7] * (normalized_height - 1) * y3) / + (normalized_height - 1); + matrix[5] = y0; + + matrix[0] = (x1 - x0 + matrix[6] * (normalized_width - 1) * x1) / + (normalized_width - 1); + matrix[1] = (x3 - x0 + matrix[7] * (normalized_height - 1) * x3) / + (normalized_height - 1); + matrix[2] = x0; +} + +/** + * Get the source coordinates in the input feature map. + * + * (u, v, w)^matrix = matrix * (out_w, out_h, 1)^matrix + * + * in_w = u / w + * in_h = v / w + * + */ +template +void get_source_coords(T matrix[], int out_w, int out_h, T* in_w, T* in_h) { + T u = matrix[0] * out_w + matrix[1] * out_h + matrix[2]; + T v = matrix[3] * out_w + matrix[4] * out_h + matrix[5]; + T w = matrix[6] * out_w + matrix[7] * out_h + matrix[8]; + + in_w[0] = u / w; + in_h[0] = v / w; +} + +/** + * Perform bilinear interpolation in the input feature map. + */ +template +void bilinear_interpolate(const T* in_data, const int channels, const int width, + const int height, int in_n, int in_c, T in_w, T in_h, + T* val) { + // Deal with cases that source coords are out of feature map boundary + if (GT(-0.5, in_w) || GT(in_w, width - 0.5) || GT(-0.5, in_h) || + GT(in_h, height - 0.5)) { + // empty + val[0] = 0.0; + return; + } + + if (GT(0, in_w)) { + in_w = 0; + } + if (GT(0, in_h)) { + in_h = 0; + } + + int in_w_floor = floor(in_w); + int in_h_floor = floor(in_h); + int in_w_ceil; + int in_h_ceil; + + if (GT_E(in_w_floor, width - 1)) { + in_w_ceil = in_w_floor = width - 1; + in_w = static_cast(in_w_floor); + } else { + in_w_ceil = in_w_floor + 1; + } + + if (GT_E(in_h_floor, height - 1)) { + in_h_ceil = in_h_floor = height - 1; + in_h = static_cast(in_h_floor); + } else { + in_h_ceil = in_h_floor + 1; + } + T w_floor = in_w - in_w_floor; + T h_floor = in_h - in_h_floor; + T w_ceil = 1 - w_floor; + T h_ceil = 1 - h_floor; + const T* data = in_data + (in_n * channels + in_c) * height * width; + // Do bilinear interpolation + T v1 = data[in_h_floor * width + in_w_floor]; + T v2 = data[in_h_ceil * width + in_w_floor]; + T v3 = data[in_h_ceil * width + in_w_ceil]; + T v4 = data[in_h_floor * width + in_w_ceil]; + T w1 = w_ceil * h_ceil; + T w2 = w_ceil * h_floor; + T w3 = w_floor * h_floor; + T w4 = w_floor * h_ceil; + val[0] = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; +} + +template +class CPUROIPerspectiveTransformOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* out = ctx.Output("Out"); + auto* mask = ctx.Output("Mask"); + auto* out_transform_matrix = + ctx.Output("TransformMatrix"); + auto transformed_height = ctx.Attr("transformed_height"); + auto transformed_width = ctx.Attr("transformed_width"); + auto spatial_scale = ctx.Attr("spatial_scale"); + + auto in_dims = in->dims(); + int channels = in_dims[1]; + int in_height = in_dims[2]; + int in_width = in_dims[3]; + int rois_num = rois->dims()[0]; + + const T* input_data = in->data(); + int* mask_data = mask->mutable_data(ctx.GetPlace()); + + framework::Tensor roi2image; + roi2image.Resize({rois_num}); + int* roi2image_data = roi2image.mutable_data(ctx.GetPlace()); + auto lod = rois->lod().back(); + for (size_t i = 0; i < lod.size() - 1; ++i) { + for (size_t j = lod[i]; j < lod[i + 1]; ++j) { + roi2image_data[j] = i; + } + } + + T* output_data = out->mutable_data(ctx.GetPlace()); + const T* rois_data = rois->data(); + + T* transform_matrix = + out_transform_matrix->mutable_data({rois_num, 9}, ctx.GetPlace()); + + for (int n = 0; n < rois_num; ++n) { + const T* n_rois = rois_data + n * 8; + T roi_x[4]; + T roi_y[4]; + for (int k = 0; k < 4; ++k) { + roi_x[k] = n_rois[2 * k] * spatial_scale; + roi_y[k] = n_rois[2 * k + 1] * spatial_scale; + } + int image_id = roi2image_data[n]; + // Get transform matrix + T matrix[9]; + get_transform_matrix(transformed_width, transformed_height, roi_x, + roi_y, matrix); + for (int i = 0; i < 9; i++) { + transform_matrix[n * 9 + i] = matrix[i]; + } + for (int c = 0; c < channels; ++c) { + for (int out_h = 0; out_h < transformed_height; ++out_h) { + for (int out_w = 0; out_w < transformed_width; ++out_w) { + int out_index = + n * channels * transformed_height * transformed_width + + c * transformed_height * transformed_width + + out_h * transformed_width + out_w; + T in_w, in_h; + get_source_coords(matrix, out_w, out_h, &in_w, &in_h); + if (in_quad(in_w, in_h, roi_x, roi_y)) { + if (GT(-0.5, in_w) || + GT(in_w, static_cast(in_width - 0.5)) || + GT(-0.5, in_h) || + GT(in_h, static_cast(in_height - 0.5))) { + output_data[out_index] = 0.0; + mask_data[(n * transformed_height + out_h) * transformed_width + + out_w] = 0; + } else { + bilinear_interpolate(input_data, channels, in_width, in_height, + image_id, c, in_w, in_h, + output_data + out_index); + mask_data[(n * transformed_height + out_h) * transformed_width + + out_w] = 1; + } + } else { + output_data[out_index] = 0.0; + mask_data[(n * transformed_height + out_h) * transformed_width + + out_w] = 0; + } + } + } + } + } + } +}; + +template +T get_feature_gradient(T xs, T ys, int w, int h, const int width, + const int height) { + if (GT(-0.5, xs) || GT(xs, width - 0.5) || GT(-0.5, ys) || + GT(ys, height - 0.5)) { + return 0; + } + + if (GT(0, xs)) { + xs = 0; + } + if (GT(0, ys)) { + ys = 0; + } + + int xs_floor = floor(xs); + int ys_floor = floor(ys); + int xs_ceil; + int ys_ceil; + + if (GT_E(xs_floor, width - 1)) { + xs_ceil = xs_floor = width - 1; + xs = static_cast(xs_floor); + } else { + xs_ceil = xs_floor + 1; + } + + if (GT_E(ys_floor, height - 1)) { + ys_ceil = ys_floor = height - 1; + ys = static_cast(ys_floor); + } else { + ys_ceil = ys_floor + 1; + } + + T weight = 0; + if (w == xs_floor) { + if (h == ys_floor) { + weight = (w + 1 - xs) * (h + 1 - ys); + } else if (h == ys_ceil) { + weight = (w + 1 - xs) * (ys + 1 - h); + } + } else if (w == xs_ceil) { + if (h == ys_floor) { + weight = (xs + 1 - w) * (h + 1 - ys); + } else if (h == ys_ceil) { + weight = (xs + 1 - w) * (ys + 1 - h); + } + } + return weight; +} + +template +class CPUROIPerspectiveTransformGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto* in_grad = ctx.Output(framework::GradVarName("X")); + + auto transformed_height = ctx.Attr("transformed_height"); + auto transformed_width = ctx.Attr("transformed_width"); + auto spatial_scale = ctx.Attr("spatial_scale"); + + auto in_dims = in->dims(); + int batch_size = in_dims[0]; + int channels = in_dims[1]; + int in_height = in_dims[2]; + int in_width = in_dims[3]; + int rois_num = rois->dims()[0]; + + T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); + const T* out_grad_data = out_grad->data(); + const T* rois_data = rois->data(); + + framework::Tensor roi2image; + roi2image.Resize({rois_num}); + int* roi2image_data = roi2image.mutable_data(ctx.GetPlace()); + auto lod = rois->lod().back(); + for (size_t i = 0; i < lod.size() - 1; ++i) { + for (size_t j = lod[i]; j < lod[i + 1]; ++j) { + roi2image_data[j] = i; + } + } + + for (int n = 0; n < batch_size; ++n) { + for (int c = 0; c < channels; ++c) { + for (int in_h = 0; in_h < in_height; ++in_h) { + for (int in_w = 0; in_w < in_width; ++in_w) { + T gradient = 0.0; + for (size_t roi_idx = lod[n]; roi_idx < lod[n + 1]; ++roi_idx) { + const T* rois = rois_data + roi_idx * 8; + T roi_x[4]; + T roi_y[4]; + for (int k = 0; k < 4; ++k) { + roi_x[k] = rois[2 * k] * spatial_scale; + roi_y[k] = rois[2 * k + 1] * spatial_scale; + } + + // Get transform matrix + T matrix[9]; + get_transform_matrix(transformed_width, transformed_height, + roi_x, roi_y, matrix); + const T* out_grad_ptr = out_grad_data + + (roi_idx * channels + c) * + transformed_height * + transformed_width; + for (int out_h = 0; out_h < transformed_height; ++out_h) { + for (int out_w = 0; out_w < transformed_width; ++out_w) { + T src_w; + T src_h; + get_source_coords(matrix, out_w, out_h, &src_w, &src_h); + if (in_quad(src_w, src_h, roi_x, roi_y)) { + if (GT(-0.5, src_w) || + GT(src_w, static_cast(in_width - 0.5)) || + GT(-0.5, src_h) || + GT(src_h, static_cast(in_height - 0.5))) { + continue; + } + T weight = get_feature_gradient(src_w, src_h, in_w, in_h, + in_width, in_height); + gradient += + out_grad_ptr[out_h * transformed_width + out_w] * + weight; + } + } + } + } + int out_idx = (n * channels + c) * in_height * in_width + + in_h * in_width + in_w; + in_grad_data[out_idx] = gradient; + } + } + } + } + } +}; + +class ROIPerspectiveTransformOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ROIPerspectiveTransformOp should not be null."); + PADDLE_ENFORCE( + ctx->HasInput("ROIs"), + "Input(ROIs) of ROIPerspectiveTransformOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("Out"), + "Output(Out) of ROIPerspectiveTransformOp should not be null."); + auto input_dims = ctx->GetInputDim("X"); + auto rois_dims = ctx->GetInputDim("ROIs"); + PADDLE_ENFORCE(input_dims.size() == 4, + "The format of input tensor is NCHW."); + PADDLE_ENFORCE(rois_dims.size() == 2, + "ROIs should be a 2-D LoDTensor of shape (num_rois, 8)" + "given as [[x0, y0, x1, y1, x2, y2, x3, y3], ...]"); + PADDLE_ENFORCE(rois_dims[1] == 8, + "ROIs should be a 2-D LoDTensor of shape (num_rois, 8)" + "given as [[x0, y0, x1, y1, x2, y2, x3, y3], ...]."); + int transformed_height = ctx->Attrs().Get("transformed_height"); + int transformed_width = ctx->Attrs().Get("transformed_width"); + float spatial_scale = ctx->Attrs().Get("spatial_scale"); + + PADDLE_ENFORCE_GT(transformed_height, 0, + "The transformed output height must greater than 0"); + PADDLE_ENFORCE_GT(transformed_width, 0, + "The transformed output width must greater than 0"); + PADDLE_ENFORCE_GT(spatial_scale, 0.0f, + "The spatial scale must greater than 0"); + std::vector out_dims_v({rois_dims[0], // num_rois + input_dims[1], // channels + static_cast(transformed_height), + static_cast(transformed_width)}); + auto out_dims = framework::make_ddim(out_dims_v); + + std::vector mask_dims_v({rois_dims[0], // num_rois + 1, // channels + static_cast(transformed_height), + static_cast(transformed_width)}); + auto mask_dims = framework::make_ddim(mask_dims_v); + + std::vector matrix_dims_v({rois_dims[0], 9}); + auto matrix_dims = framework::make_ddim(matrix_dims_v); + + ctx->SetOutputDim("Out", out_dims); + ctx->SetOutputDim("Mask", mask_dims); + ctx->SetOutputDim("TransformMatrix", matrix_dims); + ctx->SetOutputDim("Out2InIdx", out_dims); + ctx->SetOutputDim("Out2InWeights", out_dims); + ctx->ShareLoD("ROIs", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); + } +}; + +class ROIPerspectiveTransformGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "The gradient of Out should not be null."); + PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName("X")), + "The gradient of X should not be null."); + ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); + } +}; + +class ROIPerspectiveTransformOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor), " + "the input of ROIPerspectiveTransformOp. " + "The format of input tensor is NCHW. Where N is batch size, " + "C is the number of input channels, " + "H is the height of the feature, and " + "W is the width of the feature."); + AddInput("ROIs", + "(LoDTensor), " + "ROIs (Regions of Interest) to be transformed. " + "should be a 2-D LoDTensor of shape (num_rois, 8)" + "given as [[x1, y1, x2, y2, x3, y3, x4, y4], ...]." + "(x1, y1) is the top left coordinates, and " + "(x2, y2) is the top right coordinates, and" + "(x3, y3) is the bottom right coordinates, and" + "(x4, y4) is the bottom left coordinates."); + AddOutput( + "Out", + "(Tensor), " + "The output of ROIPerspectiveTransformOp is a 4-D tensor with shape " + "(num_rois, channels, transformed_h, transformed_w)."); + AddOutput("Mask", + "(Tensor), " + "The output mask of ROIPerspectiveTransformOp is a 4-D tensor " + "with shape " + "(num_rois, 1, transformed_h, transformed_w)."); + AddOutput("TransformMatrix", + "(Tensor), " + "The output transform matrix of ROIPerspectiveTransformOp is a " + "1-D tensor with shape " + "(num_rois, 9)."); + AddOutput("Out2InIdx", + "(Tensor), " + "An intermediate tensor used to map indexes of input feature map " + "and indexes of output feature map." + "The shape of the tensor is [out_size, 4] and out_size is the " + "number of elements in output feature map.") + .AsIntermediate(); + AddOutput("Out2InWeights", + "(Tensor), " + "An intermediate tensor used to record the weights of bilinear " + "interpolatein for each element in output. The shape of the " + "tensor is [out_size, 4] and out_size is the number of elements " + "in output feature map.") + .AsIntermediate(); + AddAttr("spatial_scale", + "(float, default 1.0), " + "Spatial scale factor to scale ROI coords.") + .SetDefault(1.0); + AddAttr("transformed_height", + "(int, default 1), " + "The height of transformed output.") + .SetDefault(1); + AddAttr("transformed_width", + "(int, default 1), " + "The width of transformed output.") + .SetDefault(1); + AddComment(R"DOC( +**ROIPerspectiveTransform Operator** + + )DOC"); + } +}; + +class ROIPerspectiveTransformGradDescMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("roi_perspective_transform_grad"); + op->SetInput("X", Input("X")); + op->SetInput("ROIs", Input("ROIs")); + op->SetInput("Out2InIdx", Output("Out2InIdx")); + op->SetInput("Out2InWeights", Output("Out2InWeights")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(roi_perspective_transform, ops::ROIPerspectiveTransformOp, + ops::ROIPerspectiveTransformOpMaker, + ops::ROIPerspectiveTransformGradDescMaker); +REGISTER_OPERATOR(roi_perspective_transform_grad, + ops::ROIPerspectiveTransformGradOp); +REGISTER_OP_CPU_KERNEL(roi_perspective_transform, + ops::CPUROIPerspectiveTransformOpKernel); +REGISTER_OP_CPU_KERNEL(roi_perspective_transform_grad, + ops::CPUROIPerspectiveTransformGradOpKernel); diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu new file mode 100644 index 00000000..19df68fa --- /dev/null +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu @@ -0,0 +1,511 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/float16.h" + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using paddle::platform::float16; + +namespace paddle { +namespace operators { + +// CUDA: index helpers +#define idx4_4(index, d1, d2, d3, d4) (index % d4) +#define idx4_3(index, d1, d2, d3, d4) ((index / d4) % d3) +#define idx4_2(index, d1, d2, d3, d4) ((index / d4 / d3) % d2) +#define idx4_1(index, d1, d2, d3, d4) ((index / d4 / d3 / d2) % d1) + +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template +__device__ bool GT_E(T a, T b) { + return (a > b) || Eigen::numext::abs(a - b) < 1e-4; +} + +template +__device__ bool LT_E(T a, T b) { + return (a < b) || Eigen::numext::abs(a - b) < 1e-4; +} + +template +__device__ bool GT(T a, T b) { + return (a - b) > 1e-4; +} + +template +__device__ T max(T a, T b) { + return a > b ? a : b; +} + +template +__device__ T min(T a, T b) { + return a < b ? a : b; +} + +/* +* check if (x, y) is in the boundary of roi +*/ +template +__device__ bool in_quad(T x, T y, T roi_x[], T roi_y[]) { + for (int i = 0; i < 4; i++) { + T start_w = roi_x[i]; + T start_h = roi_y[i]; + T end_w = roi_x[(i + 1) % 4]; + T end_h = roi_y[(i + 1) % 4]; + if (fabs(start_h - end_h) < 1e-4) { + if (fabs(y - start_h) < 1e-4 && fabs(y - end_h) < 1e-4 && + GT_E(x, min(start_w, end_w)) && + LT_E(x, max(start_w, end_w))) { + return true; + } + } else { + T intersec_x = + (y - start_h) * (end_w - start_w) / (end_h - start_h) + start_w; + if (fabs(intersec_x - x) < 1e-4 && GT_E(y, min(start_h, end_h)) && + LT_E(y, max(start_h, end_h))) { + return true; + } + } + } + + int n_cross = 0; + for (int i = 0; i < 4; i++) { + T start_w = roi_x[i]; + T start_h = roi_y[i]; + T end_w = roi_x[(i + 1) % 4]; + T end_h = roi_y[(i + 1) % 4]; + if (fabs(start_h - end_h) < 1e-4) { + continue; + } + if (LT_E(y, min(start_h, end_h)) || + GT(y, max(start_h, end_h))) { + continue; + } + T intersec_x = + (y - start_h) * (end_w - start_w) / (end_h - start_h) + start_w; + if (fabs(intersec_x - x) < 1e-4) { + return true; + } + if (GT(intersec_x, x)) { + n_cross++; + } + } + return (n_cross % 2 == 1); +} + +/** + * Perform bilinear interpolation in the input feature map. + */ +template +__device__ void bilinear_interpolate(const T* in_data, const int channels, + const int width, const int height, + int in_n, int in_c, T in_w, T in_h, T* val, + int out_idx, int* out2in_idx, + T* out2in_w) { + // Deal with cases that source coords are out of feature map boundary + if (GT(-0.5, in_w) || GT(in_w, width - 0.5) || GT(-0.5, in_h) || + GT(in_h, height - 0.5)) { + val[0] = 0.0; + return; + } + + if (GT(0, in_w)) { + in_w = 0; + } + if (GT(0, in_h)) { + in_h = 0; + } + + int in_w_floor = floor(in_w); + int in_h_floor = floor(in_h); + int in_w_ceil; + int in_h_ceil; + + if (GT_E(in_w_floor, width - 1)) { + in_w_ceil = in_w_floor = width - 1; + in_w = static_cast(in_w_floor); + } else { + in_w_ceil = in_w_floor + 1; + } + + if (GT_E(in_h_floor, height - 1)) { + in_h_ceil = in_h_floor = height - 1; + in_h = static_cast(in_h_floor); + } else { + in_h_ceil = in_h_floor + 1; + } + + T w_floor = in_w - in_w_floor; + T h_floor = in_h - in_h_floor; + T w_ceil = 1 - w_floor; + T h_ceil = 1 - h_floor; + const T* data = in_data + (in_n * channels + in_c) * height * width; + // Do bilinear interpolation + T v1 = data[in_h_floor * width + in_w_floor]; + T v2 = data[in_h_ceil * width + in_w_floor]; + T v3 = data[in_h_ceil * width + in_w_ceil]; + T v4 = data[in_h_floor * width + in_w_ceil]; + T w1 = w_ceil * h_ceil; + T w2 = w_ceil * h_floor; + T w3 = w_floor * h_floor; + T w4 = w_floor * h_ceil; + val[0] = w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; + + int base_idx = (in_n * channels + in_c) * height * width; + out2in_idx[out_idx * 4] = base_idx + in_h_floor * width + in_w_floor; + out2in_idx[out_idx * 4 + 1] = base_idx + in_h_ceil * width + in_w_floor; + out2in_idx[out_idx * 4 + 2] = base_idx + in_h_ceil * width + in_w_ceil; + out2in_idx[out_idx * 4 + 3] = base_idx + in_h_floor * width + in_w_ceil; + out2in_w[out_idx * 4] = w1; + out2in_w[out_idx * 4 + 1] = w2; + out2in_w[out_idx * 4 + 2] = w3; + out2in_w[out_idx * 4 + 3] = w4; +} + +/** + * Get the source coordinates in the input feature map. + * + * (u, v, w)^matrix = T * (out_w, out_h, 1)^matrix + * + * in_w = u / w + * in_h = v / w + * + */ +template +__device__ void get_source_coords(T matrix[], int out_w, int out_h, T* in_w, + T* in_h) { + T u = matrix[0] * out_w + matrix[1] * out_h + matrix[2]; + T v = matrix[3] * out_w + matrix[4] * out_h + matrix[5]; + T w = matrix[6] * out_w + matrix[7] * out_h + matrix[8]; + + in_w[0] = u / w; + in_h[0] = v / w; +} + +/** + * Get the matrix of perspective transform. + * + * dx1 = x1 - x2 + * dx2 = x3 - x2 + * dx3 = x0 - x1 + x2 - x3 + * dy1 = y1 - y2 + * dy2 = y3 - y2 + * dy3 = y0 - y1 + y2 - y3 + * + * a11 = (x1 - x0 + a31 * (w - 1) * x1) / (w - 1) + * a12 = (x3 - x0 + a32 * (h - 1) * x3) / (h - 1) + * a13 = x0 + * a21 = (y1 - y0 + a31 * (w - 1) * y1) / (w - 1) + * a22 = (y3 - y0 + a32 * (h - 1) * y3) / (h - 1) + * a23 = y0 + * a31 = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1) / (w - 1) + * a32 = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1) / (h - 1) + * a33 = 1 + * + */ +template +__device__ void get_transform_matrix(const int transformed_width, + const int transformed_height, T roi_x[], + T roi_y[], T matrix[]) { + T x0 = roi_x[0]; + T x1 = roi_x[1]; + T x2 = roi_x[2]; + T x3 = roi_x[3]; + T y0 = roi_y[0]; + T y1 = roi_y[1]; + T y2 = roi_y[2]; + T y3 = roi_y[3]; + + // Estimate the height and width of RoI + T len1 = sqrt((x0 - x1) * (x0 - x1) + (y0 - y1) * (y0 - y1)); + T len2 = sqrt((x1 - x2) * (x1 - x2) + (y1 - y2) * (y1 - y2)); + T len3 = sqrt((x2 - x3) * (x2 - x3) + (y2 - y3) * (y2 - y3)); + T len4 = sqrt((x3 - x0) * (x3 - x0) + (y3 - y0) * (y3 - y0)); + T estimated_height = (len2 + len4) / 2.0; + T estimated_width = (len1 + len3) / 2.0; + + // Get the normalized height and normalized width + int normalized_height = transformed_height; + int normalized_width = + round(estimated_width * (normalized_height - 1) / estimated_height) + 1; + normalized_width = min(normalized_width, transformed_width); + + T dx1 = x1 - x2; + T dx2 = x3 - x2; + T dx3 = x0 - x1 + x2 - x3; + T dy1 = y1 - y2; + T dy2 = y3 - y2; + T dy3 = y0 - y1 + y2 - y3; + + matrix[6] = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1) / + (normalized_width - 1); + matrix[7] = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1) / + (normalized_height - 1); + matrix[8] = 1; + + matrix[3] = (y1 - y0 + matrix[6] * (normalized_width - 1) * y1) / + (normalized_width - 1); + matrix[4] = (y3 - y0 + matrix[7] * (normalized_height - 1) * y3) / + (normalized_height - 1); + matrix[5] = y0; + + matrix[0] = (x1 - x0 + matrix[6] * (normalized_width - 1) * x1) / + (normalized_width - 1); + matrix[1] = (x3 - x0 + matrix[7] * (normalized_height - 1) * x3) / + (normalized_height - 1); + matrix[2] = x0; +} + +template +__global__ void RoiTransformKernel(const float* input_data, + const float* rois_data, + const int* roi2image_data, int num_rois, + int in_height, int in_width, int channels, + int transformed_height, + int transformed_width, float spatial_scale, + T* output_data, int* out2in_idx, T* out2in_w, + int* mask, T* transform_matrix) { + int output_size = + num_rois * transformed_height * transformed_width * channels; + + CUDA_1D_KERNEL_LOOP(index, output_size) { + // (n, c, out_h, out_w) is an element in the transformed output + int out_w = idx4_4(index, num_rois, channels, transformed_height, + transformed_width); + int out_h = idx4_3(index, num_rois, channels, transformed_height, + transformed_width); + int c = idx4_2(index, num_rois, channels, transformed_height, + transformed_width); + int n = idx4_1(index, num_rois, channels, transformed_height, + transformed_width); + + auto bottom_rois = rois_data + n * 8; + int roi_batch_ind = bottom_rois[0]; + T roi_x[4]; + T roi_y[4]; + for (int k = 0; k < 4; ++k) { + roi_x[k] = bottom_rois[2 * k] * spatial_scale; + roi_y[k] = bottom_rois[2 * k + 1] * spatial_scale; + } + + // Get transform matrix + T matrix[9]; + get_transform_matrix(transformed_width, transformed_height, roi_x, roi_y, + matrix); + for (int i = 0; i < 9; i++) { + transform_matrix[n * 9 + i] = matrix[i]; + } + // Get source coords + T in_w; + T in_h; + get_source_coords(matrix, out_w, out_h, &in_w, &in_h); + + if (in_quad(in_w, in_h, roi_x, roi_y)) { + if (GT(-0.5, in_w) || GT(in_w, static_cast(in_width - 0.5)) || + GT(-0.5, in_h) || GT(in_h, static_cast(in_height - 0.5))) { + // Skip if source coords is not in input image + output_data[index] = 0.0; + mask[(n * transformed_height + out_h) * transformed_width + out_w] = 0; + } else { + // Perform bilinear interpolation + int in_n = roi2image_data[n]; + bilinear_interpolate(input_data, channels, in_width, in_height, in_n, + c, in_w, in_h, output_data + index, index, + out2in_idx, out2in_w); + mask[(n * transformed_height + out_h) * transformed_width + out_w] = 1; + } + + } else { + // Skip if source coords is not in quad + output_data[index] = 0.0; + mask[(n * transformed_height + out_h) * transformed_width + out_w] = 0; + } + } +} + +template +class CUDAROIPerspectiveTransformOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* out = ctx.Output("Out"); + auto* out2in_idx = ctx.Output("Out2InIdx"); + auto* out2in_w = ctx.Output("Out2InWeights"); + auto* mask = ctx.Output("Mask"); + auto* out_transform_matrix = + ctx.Output("TransformMatrix"); + + int* mask_data = mask->mutable_data(ctx.GetPlace()); + int* out2in_idx_data = + out2in_idx->mutable_data({out->numel(), 4}, ctx.GetPlace()); + T* out2in_w_data = + out2in_w->mutable_data({out->numel(), 4}, ctx.GetPlace()); + + math::SetConstant init; + init(ctx.cuda_device_context(), out2in_idx, static_cast(-1)); + + auto transformed_height = ctx.Attr("transformed_height"); + auto transformed_width = ctx.Attr("transformed_width"); + auto spatial_scale = ctx.Attr("spatial_scale"); + + auto in_dims = in->dims(); + int batch_size = in_dims[0]; + int channels = in_dims[1]; + int in_height = in_dims[2]; + int in_width = in_dims[3]; + int rois_num = rois->dims()[0]; + + const T* input_data = in->data(); + T* output_data = out->mutable_data(ctx.GetPlace()); + const T* rois_data = rois->data(); + + framework::Tensor roi2image; + framework::Tensor roi2image_dev; + roi2image.Resize({rois_num}); + int* roi2image_data = roi2image.mutable_data(platform::CPUPlace()); + auto lod = rois->lod().back(); + for (size_t i = 0; i < lod.size() - 1; ++i) { + for (size_t j = lod[i]; j < lod[i + 1]; ++j) { + roi2image_data[j] = i; + } + } + TensorCopySync(roi2image, ctx.GetPlace(), &roi2image_dev); + + int out_size = rois_num * transformed_height * transformed_width * channels; + auto stream = ctx.cuda_device_context().stream(); + int block = 512; + int grid = (out_size + block - 1) / block; + + // Get transform matrix + T* matrix = + out_transform_matrix->mutable_data({rois_num, 9}, ctx.GetPlace()); + + RoiTransformKernel<<>>( + input_data, rois_data, roi2image_dev.data(), rois_num, in_height, + in_width, channels, transformed_height, transformed_width, + spatial_scale, output_data, out2in_idx_data, out2in_w_data, mask_data, + matrix); + } +}; + +template +__device__ T get_feature_gradient(T xs, T ys, int w, int h, const int width, + const int height) { + if (GT(-0.5, xs) || GT(xs, width - 0.5) || GT(-0.5, ys) || + GT(ys, height - 0.5)) { + return 0; + } + + if (GT(0, xs)) { + xs = 0; + } + if (GT(0, ys)) { + ys = 0; + } + + int xs_floor = floor(xs); + int ys_floor = floor(ys); + int xs_ceil; + int ys_ceil; + + if (GT_E(xs_floor, width - 1)) { + xs_ceil = xs_floor = width - 1; + xs = static_cast(xs_floor); + } else { + xs_ceil = xs_floor + 1; + } + + if (GT_E(ys_floor, height - 1)) { + ys_ceil = ys_floor = height - 1; + ys = static_cast(ys_floor); + } else { + ys_ceil = ys_floor + 1; + } + + T weight = 0; + if (w == xs_floor) { + if (h == ys_floor) { + weight = (w + 1 - xs) * (h + 1 - ys); + } else if (h == ys_ceil) { + weight = (w + 1 - xs) * (ys + 1 - h); + } + } else if (w == xs_ceil) { + if (h == ys_floor) { + weight = (xs + 1 - w) * (h + 1 - ys); + } else if (h == ys_ceil) { + weight = (xs + 1 - w) * (ys + 1 - h); + } + } + return weight; +} + +template +__global__ void RoiTransformGradKernel(int out_size, const int* out2in_idx_data, + const T* out2in_w_data, + const T* out_grad_data, + T* in_grad_data) { + CUDA_1D_KERNEL_LOOP(index, out_size * 4) { + int in_idx = out2in_idx_data[index]; + if (in_idx >= 0) { + int out_idx = index / 4; + atomicAdd(in_grad_data + in_idx, + out_grad_data[out_idx] * out2in_w_data[index]); + } + } +} + +template +class CUDAROIPerspectiveTransformGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out2in_idx = ctx.Input("Out2InIdx"); + auto* out2in_w = ctx.Input("Out2InWeights"); + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto* in_grad = ctx.Output(framework::GradVarName("X")); + + T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); + + math::SetConstant set_zero; + set_zero(ctx.cuda_device_context(), in_grad, static_cast(0)); + + const T* out_grad_data = out_grad->data(); + const int* out2in_idx_data = out2in_idx->data(); + const T* out2in_w_data = out2in_w->data(); + + int out_size = out_grad->numel(); + auto stream = ctx.cuda_device_context().stream(); + int block = 512; + int grid = (out_size * 4 + block - 1) / block; + + RoiTransformGradKernel<<>>( + out_size, out2in_idx_data, out2in_w_data, out_grad_data, in_grad_data); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(roi_perspective_transform, + ops::CUDAROIPerspectiveTransformOpKernel); +REGISTER_OP_CUDA_KERNEL(roi_perspective_transform_grad, + ops::CUDAROIPerspectiveTransformGradOpKernel); diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc new file mode 100644 index 00000000..33895434 --- /dev/null +++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc @@ -0,0 +1,1035 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detection/bbox_util.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +template +using EigenMatrix = framework::EigenMatrix; + +class RpnTargetAssignOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Anchor"), + "Input(Anchor) of RpnTargetAssignOp should not be null"); + PADDLE_ENFORCE(ctx->HasInput("GtBoxes"), + "Input(GtBoxes) of RpnTargetAssignOp should not be null"); + PADDLE_ENFORCE(ctx->HasInput("IsCrowd"), + "Input(Anchor) of RpnTargetAssignOp should not be null"); + PADDLE_ENFORCE(ctx->HasInput("ImInfo"), + "Input(ImInfo) of RpnTargetAssignOp should not be null"); + + PADDLE_ENFORCE( + ctx->HasOutput("LocationIndex"), + "Output(LocationIndex) of RpnTargetAssignOp should not be null"); + PADDLE_ENFORCE( + ctx->HasOutput("ScoreIndex"), + "Output(ScoreIndex) of RpnTargetAssignOp should not be null"); + PADDLE_ENFORCE( + ctx->HasOutput("TargetLabel"), + "Output(TargetLabel) of RpnTargetAssignOp should not be null"); + PADDLE_ENFORCE( + ctx->HasOutput("TargetBBox"), + "Output(TargetBBox) of RpnTargetAssignOp should not be null"); + PADDLE_ENFORCE( + ctx->HasOutput("BBoxInsideWeight"), + "Output(BBoxInsideWeight) of RpnTargetAssignOp should not be null"); + + auto anchor_dims = ctx->GetInputDim("Anchor"); + auto gt_boxes_dims = ctx->GetInputDim("GtBoxes"); + auto im_info_dims = ctx->GetInputDim("ImInfo"); + PADDLE_ENFORCE_EQ(anchor_dims.size(), 2, + "The rank of Input(Anchor) must be 2."); + PADDLE_ENFORCE_EQ(gt_boxes_dims.size(), 2, + "The rank of Input(GtBoxes) must be 2."); + PADDLE_ENFORCE_EQ(im_info_dims.size(), 2, + "The rank of Input(ImInfo) must be 2."); + + ctx->SetOutputDim("LocationIndex", {-1}); + ctx->SetOutputDim("ScoreIndex", {-1}); + ctx->SetOutputDim("TargetLabel", {-1, 1}); + ctx->SetOutputDim("TargetBBox", {-1, 4}); + ctx->SetOutputDim("BBoxInsideWeight", {-1, 4}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + ctx.Input("Anchor")->type(), + platform::CPUPlace()); + } +}; + +template +void AppendRpns(LoDTensor* out, int64_t offset, Tensor* to_add) { + auto* out_data = out->data(); + auto* to_add_data = to_add->data(); + memcpy(out_data + offset, to_add_data, to_add->numel() * sizeof(T)); +} + +template +std::vector FilterStraddleAnchor( + const platform::CPUDeviceContext& context, const Tensor* anchor, + const float rpn_straddle_thresh, T im_height, T im_width) { + std::vector inds_inside; + int anchor_num = anchor->dims()[0]; + auto* anchor_data = anchor->data(); + if (rpn_straddle_thresh >= 0) { + int index; + for (int i = 0; i < anchor_num; ++i) { + index = i * 4; + if ((anchor_data[index + 0] >= -rpn_straddle_thresh) && + (anchor_data[index + 1] >= -rpn_straddle_thresh) && + (anchor_data[index + 2] < im_width + rpn_straddle_thresh) && + (anchor_data[index + 3] < im_height + rpn_straddle_thresh)) { + inds_inside.emplace_back(i); + } + } + } else { + for (int i = 0; i < anchor_num; ++i) { + inds_inside.emplace_back(i); + } + } + int inside_num = inds_inside.size(); + Tensor inds_inside_t; + int* inds_inside_data = + inds_inside_t.mutable_data({inside_num}, context.GetPlace()); + std::copy(inds_inside.begin(), inds_inside.end(), inds_inside_data); + Tensor inside_anchor_t; + T* inside_anchor_data = + inside_anchor_t.mutable_data({inside_num, 4}, context.GetPlace()); + Gather(anchor->data(), 4, inds_inside_data, inside_num, + inside_anchor_data); + std::vector res; + res.emplace_back(inds_inside_t); + res.emplace_back(inside_anchor_t); + return res; +} + +template +Tensor FilterCrowdGt(const platform::CPUDeviceContext& context, + Tensor* gt_boxes, Tensor* is_crowd) { + int gt_num = gt_boxes->dims()[0]; + std::vector not_crowd_inds; + auto* is_crowd_data = is_crowd->data(); + for (int i = 0; i < gt_num; ++i) { + if (is_crowd_data[i] == 0) { + not_crowd_inds.emplace_back(i); + } + } + int ncrowd_num = not_crowd_inds.size(); + Tensor ncrowd_gt_boxes; + T* ncrowd_gt_boxes_data = + ncrowd_gt_boxes.mutable_data({ncrowd_num, 4}, context.GetPlace()); + Gather(gt_boxes->data(), 4, not_crowd_inds.data(), ncrowd_num, + ncrowd_gt_boxes_data); + return ncrowd_gt_boxes; +} + +void ReservoirSampling(const int num, std::vector* inds, + std::minstd_rand engine, bool use_random) { + std::uniform_real_distribution uniform(0, 1); + size_t len = inds->size(); + if (len > static_cast(num)) { + if (use_random) { + for (size_t i = num; i < len; ++i) { + int rng_ind = std::floor(uniform(engine) * i); + if (rng_ind < num) + std::iter_swap(inds->begin() + rng_ind, inds->begin() + i); + } + } + inds->resize(num); + } +} + +template +void ScoreAssign(const T* anchor_by_gt_overlap_data, + const Tensor& anchor_to_gt_max, const Tensor& gt_to_anchor_max, + const int rpn_batch_size_per_im, const float rpn_fg_fraction, + const float rpn_positive_overlap, + const float rpn_negative_overlap, std::vector* fg_inds, + std::vector* bg_inds, std::vector* tgt_lbl, + std::vector* fg_fake, std::vector* bbox_inside_weight, + std::minstd_rand engine, bool use_random) { + float epsilon = 0.00001; + int anchor_num = anchor_to_gt_max.dims()[0]; + int gt_num = gt_to_anchor_max.dims()[0]; + std::vector target_label(anchor_num, -1); + std::vector fg_inds_fake; + std::vector bg_inds_fake; + const T* anchor_to_gt_max_data = anchor_to_gt_max.data(); + const T* gt_to_anchor_max_data = gt_to_anchor_max.data(); + // TODO(buxingyuan): Match with Detectron now + // but it seems here is a bug in two directions assignment + // in which the later one may overwrites the former one. + for (int64_t i = 0; i < anchor_num; ++i) { + bool is_anchors_with_max_overlap = false; + for (int64_t j = 0; j < gt_num; ++j) { + T value = anchor_by_gt_overlap_data[i * gt_num + j]; + T diff = std::abs(value - gt_to_anchor_max_data[j]); + if (diff < epsilon) { + is_anchors_with_max_overlap = true; + break; + } + } + bool is_anchor_great_than_thresh = + (anchor_to_gt_max_data[i] >= rpn_positive_overlap); + if (is_anchors_with_max_overlap || is_anchor_great_than_thresh) { + fg_inds_fake.push_back(i); + } + } + + // Reservoir Sampling + int fg_num = 0; + if (rpn_fg_fraction > 0 && rpn_batch_size_per_im > 0) { + fg_num = static_cast(rpn_fg_fraction * rpn_batch_size_per_im); + ReservoirSampling(fg_num, &fg_inds_fake, engine, use_random); + } else { + fg_num = static_cast(fg_inds_fake.size()); + } + int fg_fake_num = static_cast(fg_inds_fake.size()); + for (int64_t i = 0; i < fg_fake_num; ++i) { + target_label[fg_inds_fake[i]] = 1; + } + + for (int64_t i = 0; i < anchor_num; ++i) { + if (anchor_to_gt_max_data[i] < rpn_negative_overlap) { + bg_inds_fake.push_back(i); + } + } + int bg_num = 0; + if (rpn_fg_fraction > 0 && rpn_batch_size_per_im > 0) { + bg_num = rpn_batch_size_per_im - fg_fake_num; + ReservoirSampling(bg_num, &bg_inds_fake, engine, use_random); + bg_num = static_cast(bg_inds_fake.size()); + } else { + bg_num = static_cast(bg_inds_fake.size()); + } + + int fake_num = 0; + for (int64_t i = 0; i < bg_num; ++i) { + // fg fake found + if (target_label[bg_inds_fake[i]] == 1) { + fake_num++; + fg_fake->emplace_back(fg_inds_fake[0]); + for (int j = 0; j < 4; ++j) { + bbox_inside_weight->emplace_back(T(0.)); + } + } + target_label[bg_inds_fake[i]] = 0; + } + + for (int64_t i = 0; i < (fg_fake_num - fake_num) * 4; ++i) { + bbox_inside_weight->emplace_back(T(1.)); + } + + for (int64_t i = 0; i < anchor_num; ++i) { + if (target_label[i] == 1) { + fg_inds->emplace_back(i); + fg_fake->emplace_back(i); + } + if (target_label[i] == 0) bg_inds->emplace_back(i); + } + fg_num = fg_inds->size(); + bg_num = bg_inds->size(); + + tgt_lbl->resize(fg_num + bg_num, 0); + std::vector fg_lbl(fg_num, 1); + std::vector bg_lbl(bg_num, 0); + std::copy(fg_lbl.begin(), fg_lbl.end(), tgt_lbl->data()); + std::copy(bg_lbl.begin(), bg_lbl.end(), tgt_lbl->data() + fg_num); +} + +template +std::vector SampleRpnFgBgGt(const platform::CPUDeviceContext& ctx, + const Tensor& anchor_by_gt_overlap, + const int rpn_batch_size_per_im, + const float rpn_positive_overlap, + const float rpn_negative_overlap, + const float rpn_fg_fraction, + std::minstd_rand engine, bool use_random) { + auto* anchor_by_gt_overlap_data = anchor_by_gt_overlap.data(); + int anchor_num = anchor_by_gt_overlap.dims()[0]; + int gt_num = anchor_by_gt_overlap.dims()[1]; + + std::vector fg_inds; + std::vector bg_inds; + std::vector gt_inds; + std::vector tgt_lbl; + std::vector fg_fake; + std::vector bbox_inside_weight; + // Calculate the max IoU between anchors and gt boxes + // Map from anchor to gt box that has highest overlap + auto place = ctx.GetPlace(); + Tensor anchor_to_gt_max, anchor_to_gt_argmax, gt_to_anchor_max; + anchor_to_gt_max.mutable_data({anchor_num}, place); + int* argmax = anchor_to_gt_argmax.mutable_data({anchor_num}, place); + gt_to_anchor_max.mutable_data({gt_num}, place); + + auto anchor_by_gt_overlap_et = + framework::EigenMatrix::From(anchor_by_gt_overlap); + auto anchor_to_gt_max_et = + framework::EigenVector::Flatten(anchor_to_gt_max); + auto gt_to_anchor_max_et = + framework::EigenVector::Flatten(gt_to_anchor_max); + auto anchor_to_gt_argmax_et = + framework::EigenVector::Flatten(anchor_to_gt_argmax); + anchor_to_gt_max_et = + anchor_by_gt_overlap_et.maximum(Eigen::DSizes(1)); + anchor_to_gt_argmax_et = + anchor_by_gt_overlap_et.argmax(1).template cast(); + gt_to_anchor_max_et = + anchor_by_gt_overlap_et.maximum(Eigen::DSizes(0)); + + // Follow the Faster RCNN's implementation + ScoreAssign(anchor_by_gt_overlap_data, anchor_to_gt_max, gt_to_anchor_max, + rpn_batch_size_per_im, rpn_fg_fraction, rpn_positive_overlap, + rpn_negative_overlap, &fg_inds, &bg_inds, &tgt_lbl, &fg_fake, + &bbox_inside_weight, engine, use_random); + + int fg_num = fg_inds.size(); + int bg_num = bg_inds.size(); + int fg_fake_num = fg_fake.size(); + gt_inds.reserve(fg_fake_num); + for (int i = 0; i < fg_fake_num; ++i) { + gt_inds.emplace_back(argmax[fg_fake[i]]); + } + Tensor loc_index_t, score_index_t, tgt_lbl_t, gt_inds_t, bbox_inside_weight_t; + int* loc_index_data = loc_index_t.mutable_data({fg_fake_num}, place); + int* score_index_data = + score_index_t.mutable_data({fg_num + bg_num}, place); + int* tgt_lbl_data = tgt_lbl_t.mutable_data({fg_num + bg_num}, place); + int* gt_inds_data = gt_inds_t.mutable_data({fg_fake_num}, place); + T* bbox_inside_weight_data = + bbox_inside_weight_t.mutable_data({fg_fake_num, 4}, place); + std::copy(fg_fake.begin(), fg_fake.end(), loc_index_data); + std::copy(fg_inds.begin(), fg_inds.end(), score_index_data); + std::copy(bg_inds.begin(), bg_inds.end(), score_index_data + fg_num); + std::copy(tgt_lbl.begin(), tgt_lbl.end(), tgt_lbl_data); + std::copy(gt_inds.begin(), gt_inds.end(), gt_inds_data); + std::copy(bbox_inside_weight.begin(), bbox_inside_weight.end(), + bbox_inside_weight_data); + std::vector loc_score_tgtlbl_gt; + loc_score_tgtlbl_gt.emplace_back(loc_index_t); + loc_score_tgtlbl_gt.emplace_back(score_index_t); + loc_score_tgtlbl_gt.emplace_back(tgt_lbl_t); + loc_score_tgtlbl_gt.emplace_back(gt_inds_t); + loc_score_tgtlbl_gt.emplace_back(bbox_inside_weight_t); + + return loc_score_tgtlbl_gt; +} + +template +class RpnTargetAssignKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* anchor = context.Input("Anchor"); // (H*W*A) * 4 + auto* gt_boxes = context.Input("GtBoxes"); + auto* is_crowd = context.Input("IsCrowd"); + auto* im_info = context.Input("ImInfo"); + + auto* loc_index = context.Output("LocationIndex"); + auto* score_index = context.Output("ScoreIndex"); + auto* tgt_bbox = context.Output("TargetBBox"); + auto* tgt_lbl = context.Output("TargetLabel"); + auto* bbox_inside_weight = context.Output("BBoxInsideWeight"); + + PADDLE_ENFORCE_EQ(gt_boxes->lod().size(), 1UL, + "RpnTargetAssignOp gt_boxes needs 1 level of LoD"); + PADDLE_ENFORCE_EQ(is_crowd->lod().size(), 1UL, + "RpnTargetAssignOp is_crowd needs 1 level of LoD"); + int64_t anchor_num = static_cast(anchor->dims()[0]); + int64_t batch_num = static_cast(gt_boxes->lod().back().size() - 1); + + int rpn_batch_size_per_im = context.Attr("rpn_batch_size_per_im"); + float rpn_straddle_thresh = context.Attr("rpn_straddle_thresh"); + float rpn_positive_overlap = context.Attr("rpn_positive_overlap"); + float rpn_negative_overlap = context.Attr("rpn_negative_overlap"); + float rpn_fg_fraction = context.Attr("rpn_fg_fraction"); + bool use_random = context.Attr("use_random"); + + int64_t max_num = batch_num * rpn_batch_size_per_im; + auto place = context.GetPlace(); + + loc_index->mutable_data({max_num}, place); + score_index->mutable_data({max_num}, place); + tgt_bbox->mutable_data({max_num, 4}, place); + tgt_lbl->mutable_data({max_num, 1}, place); + bbox_inside_weight->mutable_data({max_num, 4}, place); + auto& dev_ctx = context.device_context(); + + std::random_device rnd; + std::minstd_rand engine; + int seed = rnd(); + engine.seed(seed); + + framework::LoD lod_loc, loc_score; + std::vector lod0_loc(1, 0); + std::vector lod0_score(1, 0); + + int total_loc_num = 0; + int total_score_num = 0; + auto gt_boxes_lod = gt_boxes->lod().back(); + auto is_crowd_lod = is_crowd->lod().back(); + for (int i = 0; i < batch_num; ++i) { + Tensor gt_boxes_slice = + gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]); + Tensor is_crowd_slice = + is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]); + Tensor im_info_slice = im_info->Slice(i, i + 1); + auto* im_info_data = im_info_slice.data(); + auto im_height = im_info_data[0]; + auto im_width = im_info_data[1]; + auto im_scale = im_info_data[2]; + + // Filter straddle anchor + std::vector filter_output = FilterStraddleAnchor( + dev_ctx, anchor, rpn_straddle_thresh, im_height, im_width); + Tensor inds_inside = filter_output[0]; + Tensor inside_anchor = filter_output[1]; + + // Filter crowd gt + Tensor ncrowd_gt_boxes = + FilterCrowdGt(dev_ctx, >_boxes_slice, &is_crowd_slice); + auto ncrowd_gt_boxes_et = + framework::EigenTensor::From(ncrowd_gt_boxes); + ncrowd_gt_boxes_et = ncrowd_gt_boxes_et * im_scale; + + Tensor anchor_by_gt_overlap; + anchor_by_gt_overlap.mutable_data( + {inside_anchor.dims()[0], ncrowd_gt_boxes.dims()[0]}, place); + BboxOverlaps(inside_anchor, ncrowd_gt_boxes, &anchor_by_gt_overlap); + + auto loc_score_tgtlbl_gt = SampleRpnFgBgGt( + dev_ctx, anchor_by_gt_overlap, rpn_batch_size_per_im, + rpn_positive_overlap, rpn_negative_overlap, rpn_fg_fraction, engine, + use_random); + + Tensor sampled_loc_index = loc_score_tgtlbl_gt[0]; + Tensor sampled_score_index = loc_score_tgtlbl_gt[1]; + Tensor sampled_tgtlbl = loc_score_tgtlbl_gt[2]; + Tensor sampled_gt_index = loc_score_tgtlbl_gt[3]; + Tensor sampled_bbox_inside_weight = loc_score_tgtlbl_gt[4]; + + int loc_num = sampled_loc_index.dims()[0]; + int score_num = sampled_score_index.dims()[0]; + // unmap to all anchor + Tensor sampled_loc_index_unmap, sampled_score_index_unmap; + sampled_loc_index_unmap.mutable_data({loc_num}, place); + sampled_score_index_unmap.mutable_data({score_num}, place); + Gather(inds_inside.data(), 1, sampled_loc_index.data(), + loc_num, sampled_loc_index_unmap.data()); + Gather(inds_inside.data(), 1, sampled_score_index.data(), + score_num, sampled_score_index_unmap.data()); + + // get target bbox deltas + Tensor sampled_anchor, sampled_gt, sampled_tgt_bbox; + auto* sampled_anchor_data = + sampled_anchor.mutable_data({loc_num, 4}, place); + auto* sampled_gt_data = sampled_gt.mutable_data({loc_num, 4}, place); + Gather(anchor->data(), 4, sampled_loc_index_unmap.data(), + loc_num, sampled_anchor_data); + Gather(ncrowd_gt_boxes.data(), 4, sampled_gt_index.data(), + loc_num, sampled_gt_data); + sampled_tgt_bbox.mutable_data({loc_num, 4}, place); + BoxToDelta(loc_num, sampled_anchor, sampled_gt, nullptr, false, + &sampled_tgt_bbox); + + // Add anchor offset + int anchor_offset = i * anchor_num; + auto sampled_loc_index_unmap_et = + framework::EigenTensor::From(sampled_loc_index_unmap); + sampled_loc_index_unmap_et = sampled_loc_index_unmap_et + anchor_offset; + auto sampled_score_index_unmap_et = + framework::EigenTensor::From(sampled_score_index_unmap); + sampled_score_index_unmap_et = + sampled_score_index_unmap_et + anchor_offset; + AppendRpns(loc_index, total_loc_num, &sampled_loc_index_unmap); + AppendRpns(score_index, total_score_num, &sampled_score_index_unmap); + AppendRpns(tgt_bbox, total_loc_num * 4, &sampled_tgt_bbox); + AppendRpns(tgt_lbl, total_score_num, &sampled_tgtlbl); + AppendRpns(bbox_inside_weight, total_loc_num * 4, + &sampled_bbox_inside_weight); + total_loc_num += loc_num; + + total_score_num += score_num; + lod0_loc.emplace_back(total_loc_num); + lod0_score.emplace_back(total_score_num); + } + + PADDLE_ENFORCE_LE(total_loc_num, max_num); + PADDLE_ENFORCE_LE(total_score_num, max_num); + + lod_loc.emplace_back(lod0_loc); + loc_score.emplace_back(lod0_score); + loc_index->set_lod(lod_loc); + score_index->set_lod(loc_score); + tgt_bbox->set_lod(lod_loc); + tgt_lbl->set_lod(loc_score); + bbox_inside_weight->set_lod(lod_loc); + loc_index->Resize({total_loc_num}); + score_index->Resize({total_score_num}); + tgt_bbox->Resize({total_loc_num, 4}); + tgt_lbl->Resize({total_score_num, 1}); + bbox_inside_weight->Resize({total_loc_num, 4}); + } +}; + +class RpnTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Anchor", + "(Tensor) input anchor is a 2-D Tensor with shape [H*W*A, 4]."); + AddInput("GtBoxes", + "(LoDTensor) input ground-truth bbox with shape [K, 4]."); + AddInput("IsCrowd", + "(LoDTensor) input which indicates ground-truth is crowd."); + AddInput("ImInfo", + "(LoDTensor) input image information with shape [N, 3]. " + "N is the batch size, each image information includes height, " + "width and scale."); + AddAttr("rpn_batch_size_per_im", + "Total number of RPN examples per image.") + .SetDefault(256); + AddAttr( + "rpn_straddle_thresh", + "Remove RPN anchors that go outside the image by straddle_thresh " + "pixels, " + "Set to -1 or a large value, e.g. 100000, to disable pruning anchors."); + AddAttr( + "rpn_positive_overlap", + "Minimum overlap required between an anchor and ground-truth " + "box for the (anchor, gt box) pair to be a positive example.") + .SetDefault(0.7); + AddAttr( + "rpn_negative_overlap", + "Maximum overlap allowed between an anchor and ground-truth " + "box for the (anchor, gt box) pair to be a negative examples.") + .SetDefault(0.3); + AddAttr( + "rpn_fg_fraction", + "Target fraction of RoI minibatch that " + "is labeled foreground (i.e. class > 0), 0-th class is background.") + .SetDefault(0.25); + AddAttr("use_random", + "A flag indicating whether to use a ReservoirSampling. " + "NOTE: DO NOT set this flag to false in training. " + "Setting this flag to false is only useful in unittest.") + .SetDefault(true); + AddOutput( + "LocationIndex", + "(Tensor), The indexes of foreground anchors in all RPN anchors, the " + "shape of the LocationIndex is [F], F depends on the value of input " + "tensor and attributes."); + AddOutput( + "ScoreIndex", + "(Tensor), The indexes of foreground and background anchors in all " + "RPN anchors(The rest anchors are ignored). The shape of the " + "ScoreIndex is [F + B], F and B are sampled foreground and background " + " number."); + AddOutput("TargetBBox", + "(Tensor), The target bbox deltas with shape " + "[F, 4], F is the sampled foreground number."); + AddOutput( + "TargetLabel", + "(Tensor), The target labels of each anchor with shape " + "[F + B, 1], F and B are sampled foreground and background number."); + AddOutput("BBoxInsideWeight", + "(Tensor), The bbox inside weight with shape " + "[F, 4], F is the sampled foreground number."); + AddComment(R"DOC( +This operator can be, for a given set of ground truth bboxes and the +anchors, to assign classification and regression targets to each prediction. +The ScoreIndex and LocationIndex will be generated according to the anchor-groundtruth IOU. +The rest anchors would not contibute to the RPN training loss + +ScoreIndex is composed of foreground anchor indexes(positive labels) and +background anchor indexes(negative labels). LocationIndex is exactly same +as the foreground anchor indexes since we can not assign regression target to +the background anchors. + +The classification targets(TargetLabel) is a binary class label (of being +an object or not). Following the paper of Faster-RCNN, the positive labels +are two kinds of anchors: (i) the anchor/anchors with the highest IoU +overlap with a ground-truth box, or (ii) an anchor that has an IoU overlap +higher than rpn_positive_overlap(0.7) with any ground-truth box. Note that +a single ground-truth box may assign positive labels to multiple anchors. +A non-positive anchor is when its IoU ratio is lower than rpn_negative_overlap +(0.3) for all ground-truth boxes. Anchors that are neither positive nor +negative do not contribute to the training objective. + +)DOC"); + } +}; + +class RetinanetTargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Anchor", + "(Tensor) input anchor is a 2-D Tensor with shape [H*W*A, 4]."); + AddInput("GtBoxes", + "(LoDTensor) input ground-truth bbox with shape [K, 4]."); + AddInput("GtLabels", + "(LoDTensor) input ground-truth label with shape [K, 1]."); + AddInput("IsCrowd", + "(LoDTensor) input which indicates ground-truth is crowd."); + AddInput("ImInfo", + "(LoDTensor) input image information with shape [N, 3]. " + "N is the batch size, each image information includes height, " + "width and scale."); + AddAttr( + "positive_overlap", + "Minimum overlap required between an anchor and ground-truth " + "box for the (anchor, gt box) pair to be a positive example.") + .SetDefault(0.5); + AddAttr( + "negative_overlap", + "Maximum overlap allowed between an anchor and ground-truth " + "box for the (anchor, gt box) pair to be a negative examples.") + .SetDefault(0.4); + AddOutput( + "LocationIndex", + "(Tensor), The indexes of foreground anchors in all anchors, the " + "shape of the LocationIndex is [F], F depends on the value of input " + "tensor and attributes."); + AddOutput( + "ScoreIndex", + "(Tensor), The indexes of foreground and background anchors in all " + "RPN anchors(The rest anchors are ignored). The shape of the " + "ScoreIndex is [F + B], F and B are foreground and background " + " number."); + AddOutput("TargetBBox", + "(Tensor), The target bbox deltas with shape " + "[F, 4], F is the foreground number."); + AddOutput("TargetLabel", + "(Tensor), The target labels of each anchor with shape " + "[F + B, 1], F and B are foreground and background number."); + AddOutput("BBoxInsideWeight", + "(Tensor), The bbox inside weight with shape " + "[F, 4], F is the foreground number."); + AddOutput("ForegroundNumber", + "(Tensor), The foreground number. " + "[1, 1]."); + AddComment(R"DOC( + This layer can be, for given the Intersection-over-Union (IoU) overlap + between anchors and ground truth boxes, to assign classification and + regression targets to each anchor, these target labels are used for + train retinanet. + + Every anchor is assigned with a length C one-hot vector of + classification targets, and a 4-vector of box regression targets, + where C is the class number. The assignment rules are as followed: + + 1. Anchors are assigned to ground-truth boxes when: (i) it has the highest + IoU overlap with a ground-truth box, or (ii) it has an IoU overlap higher + than positive_overlap(0.5) with any ground-truth box. + + 2. Anchors are assigned to background when its IoU ratio is lower than + negative_overlap (0.4) for all ground-truth boxes. + + When an anchor is assigned with a ground-truth box which is the i-th category, + the i-th entry in its C vector of targets is set to 1 and all other entries + are set to 0. When an anchor is assigned with background, all entries are set + to 0. Anchors that are not assigned do not contribute to the training + objective. The regression targets are the encoded ground-truth boxes + associated with the assigned anchors. + +)DOC"); + } +}; + +class RetinanetTargetAssignOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE( + ctx->HasInput("Anchor"), + "Input(Anchor) of RetinanetTargetAssignOp should not be null"); + PADDLE_ENFORCE( + ctx->HasInput("GtBoxes"), + "Input(GtBoxes) of RetinanetTargetAssignOp should not be null"); + PADDLE_ENFORCE( + ctx->HasInput("GtLabels"), + "Input(GtLabels) of RetinanetTargetAssignOp should not be null"); + PADDLE_ENFORCE( + ctx->HasInput("IsCrowd"), + "Input(Anchor) of RetinanetTargetAssignOp should not be null"); + PADDLE_ENFORCE( + ctx->HasInput("ImInfo"), + "Input(ImInfo) of RetinanetTargetAssignOp should not be null"); + + PADDLE_ENFORCE( + ctx->HasOutput("LocationIndex"), + "Output(LocationIndex) of RetinanetTargetAssignOp should not be null"); + PADDLE_ENFORCE( + ctx->HasOutput("ScoreIndex"), + "Output(ScoreIndex) of RetinanetTargetAssignOp should not be null"); + PADDLE_ENFORCE( + ctx->HasOutput("TargetLabel"), + "Output(TargetLabel) of RetinanetTargetAssignOp should not be null"); + PADDLE_ENFORCE( + ctx->HasOutput("TargetBBox"), + "Output(TargetBBox) of RetinanetTargetAssignOp should not be null"); + PADDLE_ENFORCE(ctx->HasOutput("BBoxInsideWeight"), + "Output(BBoxInsideWeight) of RetinanetTargetAssignOp should " + "not be null"); + PADDLE_ENFORCE(ctx->HasOutput("ForegroundNumber"), + "Output(ForegroundNumber) of RetinanetTargetAssignOp should " + "not be null"); + + auto anchor_dims = ctx->GetInputDim("Anchor"); + auto gt_boxes_dims = ctx->GetInputDim("GtBoxes"); + auto gt_labels_dims = ctx->GetInputDim("GtLabels"); + auto im_info_dims = ctx->GetInputDim("ImInfo"); + + PADDLE_ENFORCE_EQ(anchor_dims.size(), 2, + "The rank of Input(Anchor) must be 2."); + PADDLE_ENFORCE_EQ(gt_boxes_dims.size(), 2, + "The rank of Input(GtBoxes) must be 2."); + PADDLE_ENFORCE_EQ(gt_labels_dims.size(), 2, + "The rank of Input(GtLabels) must be 2."); + PADDLE_ENFORCE_EQ(im_info_dims.size(), 2, + "The rank of Input(ImInfo) must be 2."); + + ctx->SetOutputDim("LocationIndex", {gt_labels_dims[0]}); + ctx->SetOutputDim("ScoreIndex", {gt_labels_dims[0]}); + ctx->SetOutputDim("TargetBBox", {gt_labels_dims[0], 4}); + ctx->SetOutputDim("TargetLabel", {gt_labels_dims[0], 1}); + ctx->SetOutputDim("BBoxInsideWeight", {gt_labels_dims[0], 4}); + ctx->SetOutputDim("ForegroundNumber", {gt_labels_dims[0], 1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + ctx.Input("Anchor")->type(), + platform::CPUPlace()); + } +}; + +template +std::vector FilterCrowdGtBoxLabel( + const platform::CPUDeviceContext& context, Tensor* gt_boxes, + Tensor* gt_labels, Tensor* is_crowd) { + int gt_num = gt_boxes->dims()[0]; + std::vector not_crowd_inds; + auto* is_crowd_data = is_crowd->data(); + for (int i = 0; i < gt_num; ++i) { + if (is_crowd_data[i] == 0) { + not_crowd_inds.emplace_back(i); + } + } + int ncrowd_num = not_crowd_inds.size(); + Tensor ncrowd_gt_boxes, ncrowd_gt_labels; + T* ncrowd_gt_boxes_data = + ncrowd_gt_boxes.mutable_data({ncrowd_num, 4}, context.GetPlace()); + int* ncrowd_gt_labels_data = + ncrowd_gt_labels.mutable_data({ncrowd_num, 1}, context.GetPlace()); + Gather(gt_boxes->data(), 4, not_crowd_inds.data(), ncrowd_num, + ncrowd_gt_boxes_data); + Gather(gt_labels->data(), 1, not_crowd_inds.data(), ncrowd_num, + ncrowd_gt_labels_data); + std::vector res; + res.emplace_back(ncrowd_gt_boxes); + res.emplace_back(ncrowd_gt_labels); + return res; +} + +template +std::vector GetAllFgBgGt(const platform::CPUDeviceContext& ctx, + const Tensor& anchor_by_gt_overlap, + const Tensor& ncrowd_gt_labels, + const float positive_overlap, + const float negative_overlap, + std::minstd_rand engine) { + auto* anchor_by_gt_overlap_data = anchor_by_gt_overlap.data(); + int anchor_num = anchor_by_gt_overlap.dims()[0]; + int gt_num = anchor_by_gt_overlap.dims()[1]; + + std::vector fg_inds; + std::vector bg_inds; + std::vector gt_inds; + std::vector tgt_lbl; + std::vector fg_fake; + std::vector bbox_inside_weight; + // Calculate the max IoU between anchors and gt boxes + // Map from anchor to gt box that has highest overlap + auto place = ctx.GetPlace(); + Tensor anchor_to_gt_max, anchor_to_gt_argmax, gt_to_anchor_max; + anchor_to_gt_max.mutable_data({anchor_num}, place); + int* argmax = anchor_to_gt_argmax.mutable_data({anchor_num}, place); + gt_to_anchor_max.mutable_data({gt_num}, place); + + auto anchor_by_gt_overlap_et = + framework::EigenMatrix::From(anchor_by_gt_overlap); + auto anchor_to_gt_max_et = + framework::EigenVector::Flatten(anchor_to_gt_max); + auto gt_to_anchor_max_et = + framework::EigenVector::Flatten(gt_to_anchor_max); + auto anchor_to_gt_argmax_et = + framework::EigenVector::Flatten(anchor_to_gt_argmax); + anchor_to_gt_max_et = + anchor_by_gt_overlap_et.maximum(Eigen::DSizes(1)); + anchor_to_gt_argmax_et = + anchor_by_gt_overlap_et.argmax(1).template cast(); + gt_to_anchor_max_et = + anchor_by_gt_overlap_et.maximum(Eigen::DSizes(0)); + + ScoreAssign(anchor_by_gt_overlap_data, anchor_to_gt_max, gt_to_anchor_max, -1, + -1, positive_overlap, negative_overlap, &fg_inds, &bg_inds, + &tgt_lbl, &fg_fake, &bbox_inside_weight, engine, false); + const int* gt_labels_data = ncrowd_gt_labels.data(); + int64_t fg_num = fg_inds.size(); + for (int64_t i = 0; i < fg_num; ++i) { + int gt_idx = argmax[fg_inds[i]]; + tgt_lbl[i] = gt_labels_data[gt_idx]; + } + + int bg_num = bg_inds.size(); + int fg_fake_num = fg_fake.size(); + gt_inds.reserve(fg_fake_num); + for (int i = 0; i < fg_fake_num; ++i) { + gt_inds.emplace_back(argmax[fg_fake[i]]); + } + + Tensor loc_index_t, score_index_t, tgt_lbl_t, gt_inds_t, bbox_inside_weight_t; + Tensor fg_num_t; + int* loc_index_data = loc_index_t.mutable_data({fg_fake_num}, place); + int* score_index_data = + score_index_t.mutable_data({fg_num + bg_num}, place); + int* tgt_lbl_data = tgt_lbl_t.mutable_data({fg_num + bg_num}, place); + int* gt_inds_data = gt_inds_t.mutable_data({fg_fake_num}, place); + int* fg_num_data = fg_num_t.mutable_data({1}, place); + T* bbox_inside_weight_data = + bbox_inside_weight_t.mutable_data({fg_fake_num, 4}, place); + std::copy(fg_fake.begin(), fg_fake.end(), loc_index_data); + std::copy(fg_inds.begin(), fg_inds.end(), score_index_data); + std::copy(bg_inds.begin(), bg_inds.end(), score_index_data + fg_num); + std::copy(tgt_lbl.begin(), tgt_lbl.end(), tgt_lbl_data); + std::copy(gt_inds.begin(), gt_inds.end(), gt_inds_data); + std::copy(bbox_inside_weight.begin(), bbox_inside_weight.end(), + bbox_inside_weight_data); + fg_num_data[0] = fg_fake.size() + 1; + std::vector loc_score_tgtlbl_gt; + loc_score_tgtlbl_gt.emplace_back(loc_index_t); + loc_score_tgtlbl_gt.emplace_back(score_index_t); + loc_score_tgtlbl_gt.emplace_back(tgt_lbl_t); + loc_score_tgtlbl_gt.emplace_back(gt_inds_t); + loc_score_tgtlbl_gt.emplace_back(bbox_inside_weight_t); + loc_score_tgtlbl_gt.emplace_back(fg_num_t); + + return loc_score_tgtlbl_gt; +} + +template +class RetinanetTargetAssignKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* anchor = context.Input("Anchor"); // (H*W*A) * 4 + auto* gt_boxes = context.Input("GtBoxes"); + auto* gt_labels = context.Input("GtLabels"); + auto* is_crowd = context.Input("IsCrowd"); + auto* im_info = context.Input("ImInfo"); + + auto* loc_index = context.Output("LocationIndex"); + auto* score_index = context.Output("ScoreIndex"); + auto* tgt_bbox = context.Output("TargetBBox"); + auto* tgt_lbl = context.Output("TargetLabel"); + auto* bbox_inside_weight = context.Output("BBoxInsideWeight"); + auto* fg_num = context.Output("ForegroundNumber"); + + PADDLE_ENFORCE_EQ(gt_boxes->lod().size(), 1UL, + "RetinanetTargetAssignOp gt_boxes needs 1 level of LoD"); + PADDLE_ENFORCE_EQ(gt_labels->lod().size(), 1UL, + "RetinanetTargetAssignOp gt_boxes needs 1 level of LoD"); + PADDLE_ENFORCE_EQ(is_crowd->lod().size(), 1UL, + "RetinanetTargetAssignOp is_crowd needs 1 level of LoD"); + + int64_t anchor_num = static_cast(anchor->dims()[0]); + int64_t batch_num = static_cast(gt_boxes->lod().back().size() - 1); + + float positive_overlap = context.Attr("positive_overlap"); + float negative_overlap = context.Attr("negative_overlap"); + + int64_t max_num = batch_num * anchor_num; + auto place = context.GetPlace(); + + loc_index->mutable_data({max_num}, place); + score_index->mutable_data({max_num}, place); + tgt_bbox->mutable_data({max_num, 4}, place); + tgt_lbl->mutable_data({max_num, 1}, place); + bbox_inside_weight->mutable_data({max_num, 4}, place); + fg_num->mutable_data({batch_num, 1}, place); + auto& dev_ctx = context.device_context(); + + std::random_device rnd; + std::minstd_rand engine; + int seed = rnd(); + engine.seed(seed); + + framework::LoD lod_loc, loc_score, lod_fg; + std::vector lod0_loc(1, 0); + std::vector lod0_score(1, 0); + std::vector lod0_fg(1, 0); + + int total_loc_num = 0; + int total_score_num = 0; + int total_fg_num = 0; + auto gt_boxes_lod = gt_boxes->lod().back(); + auto gt_labels_lod = gt_labels->lod().back(); + auto is_crowd_lod = is_crowd->lod().back(); + for (int i = 0; i < batch_num; ++i) { + Tensor gt_boxes_slice = + gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]); + Tensor gt_labels_slice = + gt_labels->Slice(gt_labels_lod[i], gt_labels_lod[i + 1]); + Tensor is_crowd_slice = + is_crowd->Slice(is_crowd_lod[i], is_crowd_lod[i + 1]); + Tensor im_info_slice = im_info->Slice(i, i + 1); + auto* im_info_data = im_info_slice.data(); + auto im_height = im_info_data[0]; + auto im_width = im_info_data[1]; + auto im_scale = im_info_data[2]; + + // Filter straddle anchor + std::vector filter_output = + FilterStraddleAnchor(dev_ctx, anchor, -1, im_height, im_width); + Tensor inds_inside = filter_output[0]; + Tensor inside_anchor = filter_output[1]; + + // Filter crowd gt + std::vector ncrowd_output = FilterCrowdGtBoxLabel( + dev_ctx, >_boxes_slice, >_labels_slice, &is_crowd_slice); + Tensor ncrowd_gt_boxes = ncrowd_output[0]; + Tensor ncrowd_gt_labels = ncrowd_output[1]; + + auto ncrowd_gt_boxes_et = + framework::EigenTensor::From(ncrowd_gt_boxes); + ncrowd_gt_boxes_et = ncrowd_gt_boxes_et * im_scale; + + Tensor anchor_by_gt_overlap; + anchor_by_gt_overlap.mutable_data( + {inside_anchor.dims()[0], ncrowd_gt_boxes.dims()[0]}, place); + BboxOverlaps(inside_anchor, ncrowd_gt_boxes, &anchor_by_gt_overlap); + + auto loc_score_tgtlbl_gt = + GetAllFgBgGt(dev_ctx, anchor_by_gt_overlap, ncrowd_gt_labels, + positive_overlap, negative_overlap, engine); + + Tensor sampled_loc_index = loc_score_tgtlbl_gt[0]; + Tensor sampled_score_index = loc_score_tgtlbl_gt[1]; + Tensor sampled_tgtlbl = loc_score_tgtlbl_gt[2]; + Tensor sampled_gt_index = loc_score_tgtlbl_gt[3]; + Tensor sampled_bbox_inside_weight = loc_score_tgtlbl_gt[4]; + Tensor sampled_fg_num = loc_score_tgtlbl_gt[5]; + + int loc_num = sampled_loc_index.dims()[0]; + int score_num = sampled_score_index.dims()[0]; + // unmap to all anchor + Tensor sampled_loc_index_unmap, sampled_score_index_unmap; + sampled_loc_index_unmap.mutable_data({loc_num}, place); + sampled_score_index_unmap.mutable_data({score_num}, place); + Gather(inds_inside.data(), 1, sampled_loc_index.data(), + loc_num, sampled_loc_index_unmap.data()); + Gather(inds_inside.data(), 1, sampled_score_index.data(), + score_num, sampled_score_index_unmap.data()); + + // get target bbox deltas + Tensor sampled_anchor, sampled_gt, sampled_tgt_bbox; + auto* sampled_anchor_data = + sampled_anchor.mutable_data({loc_num, 4}, place); + auto* sampled_gt_data = sampled_gt.mutable_data({loc_num, 4}, place); + Gather(anchor->data(), 4, sampled_loc_index_unmap.data(), + loc_num, sampled_anchor_data); + Gather(ncrowd_gt_boxes.data(), 4, sampled_gt_index.data(), + loc_num, sampled_gt_data); + sampled_tgt_bbox.mutable_data({loc_num, 4}, place); + BoxToDelta(loc_num, sampled_anchor, sampled_gt, nullptr, false, + &sampled_tgt_bbox); + + // Add anchor offset + int anchor_offset = i * anchor_num; + auto sampled_loc_index_unmap_et = + framework::EigenTensor::From(sampled_loc_index_unmap); + sampled_loc_index_unmap_et = sampled_loc_index_unmap_et + anchor_offset; + auto sampled_score_index_unmap_et = + framework::EigenTensor::From(sampled_score_index_unmap); + sampled_score_index_unmap_et = + sampled_score_index_unmap_et + anchor_offset; + AppendRpns(loc_index, total_loc_num, &sampled_loc_index_unmap); + AppendRpns(score_index, total_score_num, &sampled_score_index_unmap); + AppendRpns(tgt_bbox, total_loc_num * 4, &sampled_tgt_bbox); + AppendRpns(tgt_lbl, total_score_num, &sampled_tgtlbl); + AppendRpns(bbox_inside_weight, total_loc_num * 4, + &sampled_bbox_inside_weight); + AppendRpns(fg_num, total_fg_num, &sampled_fg_num); + + total_loc_num += loc_num; + total_score_num += score_num; + total_fg_num += 1; + lod0_loc.emplace_back(total_loc_num); + lod0_score.emplace_back(total_score_num); + lod0_fg.emplace_back(total_fg_num); + } + + PADDLE_ENFORCE_LE(total_loc_num, max_num); + PADDLE_ENFORCE_LE(total_score_num, max_num); + PADDLE_ENFORCE_LE(total_fg_num, batch_num); + + lod_loc.emplace_back(lod0_loc); + loc_score.emplace_back(lod0_score); + lod_fg.emplace_back(lod0_fg); + loc_index->set_lod(lod_loc); + score_index->set_lod(loc_score); + tgt_bbox->set_lod(lod_loc); + tgt_lbl->set_lod(loc_score); + bbox_inside_weight->set_lod(lod_loc); + fg_num->set_lod(lod_fg); + loc_index->Resize({total_loc_num}); + score_index->Resize({total_score_num}); + tgt_bbox->Resize({total_loc_num, 4}); + tgt_lbl->Resize({total_score_num, 1}); + bbox_inside_weight->Resize({total_loc_num, 4}); + fg_num->Resize({total_fg_num, 1}); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(rpn_target_assign, ops::RpnTargetAssignOp, + ops::RpnTargetAssignOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(rpn_target_assign, ops::RpnTargetAssignKernel, + ops::RpnTargetAssignKernel); +REGISTER_OPERATOR(retinanet_target_assign, ops::RetinanetTargetAssignOp, + ops::RetinanetTargetAssignOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(retinanet_target_assign, + ops::RetinanetTargetAssignKernel, + ops::RetinanetTargetAssignKernel); diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc new file mode 100644 index 00000000..50ff3cb1 --- /dev/null +++ b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cc @@ -0,0 +1,208 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/sigmoid_focal_loss_op.h" +#include +#include +#include + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class SigmoidFocalLossOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("FgNum"), "Input(FgNum) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto labels_dims = ctx->GetInputDim("Label"); + auto fg_dims = ctx->GetInputDim("FgNum"); + + int rank = x_dims.size(); + PADDLE_ENFORCE_EQ(rank, labels_dims.size(), + "Input(X) and Input(Label) shall have the same rank."); + PADDLE_ENFORCE_EQ(fg_dims.size(), 1, "The rank of Input(FgNum) must be 1."); + bool check = true; + if ((!ctx->IsRuntime()) && (framework::product(x_dims) <= 0 || + framework::product(labels_dims) <= 0)) { + check = false; + } + + if (check) { + PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), + framework::slice_ddim(labels_dims, 0, rank - 1), + "Input(X) and Input(Label) shall have the same shape " + "except the last dimension."); + } + + PADDLE_ENFORCE_EQ(labels_dims[rank - 1], 1UL, + "The last dimension of input(Label) should be 1."); + + ctx->ShareDim("X", /*->*/ "Out"); + ctx->ShareLoD("X", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); + } +}; + +class SigmoidFocalLossGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("FgNum"), "Input(FgNum) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@GRAD) should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto labels_dims = ctx->GetInputDim("Label"); + auto fg_dims = ctx->GetInputDim("FgNum"); + auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); + + int rank = x_dims.size(); + PADDLE_ENFORCE_EQ(rank, labels_dims.size(), + "Input(X) and Input(Label) shall have the same rank."); + PADDLE_ENFORCE_EQ(fg_dims.size(), 1, "The rank of Input(FgNum) must be 1."); + bool check = true; + if ((!ctx->IsRuntime()) && (framework::product(x_dims) <= 0 || + framework::product(labels_dims) <= 0)) { + check = false; + } + + if (check) { + PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), + framework::slice_ddim(labels_dims, 0, rank - 1), + "Input(X) and Input(Label) shall have the same shape."); + + PADDLE_ENFORCE_EQ(labels_dims[rank - 1], 1UL, + "The last dimension of input(Label) should be 1."); + + PADDLE_ENFORCE_EQ( + framework::slice_ddim(x_dims, 0, rank), + framework::slice_ddim(dout_dims, 0, rank), + "Input(X) and Input(Out@Grad) shall have the same shape."); + } + + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); + } +}; + +class SigmoidFocalLossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor, default Tensor), a 2-D tensor with shape [N, D], " + "where N is the batch size and D is the number of classes " + "(excluding background). This input is a tensor of logits " + "computed by the previous operator."); + AddInput("Label", + "(Tensor, default Tensor), a 2-D tensor with shape [N, 1]. " + "This input is a tensor of probabilistic labels."); + AddInput("FgNum", + "(Tensor, default Tensor), a 1-D tensor with shape [1]. " + "This input is the number of foreground."); + AddOutput( + "Out", + "(Tensor, default Tensor), a 2-D tensor with shape [N, D]. " + "This output is the focal loss."); + AddAttr( + "gamma", + "Hyper-parameter of sigmoid focal loss op, which is to balance the " + "easy and hard examples. " + "A float scalar with default value 2.0.") + .SetDefault(2.0); + AddAttr( + "alpha", + "Hyper-parameter of sigmoid focal loss op, which is to balance the " + "positive and negative examples. " + "A float scalar with default value 0.5.") + .SetDefault(0.25); + AddComment(R"DOC( +Sigmoid Focal Loss Operator. + +Focal loss is used to address the foreground-background class imbalance existed +on the training phase of one-stage detectors. This operator computes the sigmoid +value for each element in the input tensor, after which focal loss is measured. + +The focal loss is given as follows: + +$$Loss_j = (-Label_j * alpha * \pow(1 - \sigma(X_j), gamma) * \log(\sigma(X_j)) - +(1 - Labels_j) * (1 - alpha) * \pow(\sigma(X_j), gamma) * \log(1 - \sigma(X_j))) +/ FgNum, j = 1,...,K$$ + +We know that $$\sigma(X_j) = \\frac{1}{1 + \exp(-X_j)}$$. + +)DOC"); + } +}; + +class SigmoidFocalLossGradOpDescMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sigmoid_focal_loss_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Label", Input("Label")); + op->SetInput("FgNum", Input("FgNum")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(sigmoid_focal_loss, ops::SigmoidFocalLossOp, + ops::SigmoidFocalLossOpMaker, + ops::SigmoidFocalLossGradOpDescMaker); +REGISTER_OPERATOR(sigmoid_focal_loss_grad, ops::SigmoidFocalLossGradOp); +REGISTER_OP_CPU_KERNEL( + sigmoid_focal_loss, + ops::SigmoidFocalLossKernel, + ops::SigmoidFocalLossKernel); +REGISTER_OP_CPU_KERNEL( + sigmoid_focal_loss_grad, + ops::SigmoidFocalLossGradKernel, + ops::SigmoidFocalLossGradKernel); diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu new file mode 100644 index 00000000..4031554a --- /dev/null +++ b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu @@ -0,0 +1,181 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "cub/cub.cuh" +#include "paddle/fluid/operators/detection/sigmoid_focal_loss_op.h" +#include "paddle/fluid/operators/math.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template +__global__ void GPUSigmoidFocalLossForward(const T *x_data, + const int *label_data, + const int *fg_num_data, + const T gamma, const T alpha, + const int num_classes, + const int limit, T *out_data) { + CUDA_1D_KERNEL_LOOP(i, limit) { + T x = x_data[i]; + int a = i / num_classes; // current sample + int d = i % num_classes; // current class + int g = label_data[a]; // target + + // check whether the input data is positive or negative + // the target classes are in range 1-81 + // and the d is in range 0-80 + T c_pos = static_cast(g == (d + 1)); + T c_neg = static_cast((g != -1) & (g != (d + 1))); + + T fg_num = static_cast((fg_num_data[0] > 1) ? fg_num_data[0] : 1); + T s_neg = (1.0 - alpha) / fg_num; + T s_pos = alpha / fg_num; + + // p = 1. / 1. + expf(-x) + T p = 1. / (1. + real_exp(-x)); + + // (1 - p)**gamma * log(p) + T term_pos = std::pow(static_cast(1. - p), gamma) * + real_log(p > FLT_MIN ? p : FLT_MIN); + // p**gamma * log(1 - p) + T term_neg = + std::pow(p, gamma) * + (-1. * x * (x >= 0) - real_log(1. + real_exp(x - 2. * x * (x >= 0)))); + + out_data[i] = 0.0; + out_data[i] += -c_pos * term_pos * s_pos; + out_data[i] += -c_neg * term_neg * s_neg; + } +} + +template +__global__ void GPUSigmoidFocalLossBackward( + const T *x_data, const int *label_data, const int *fg_num_data, + const T gamma, const T alpha, const int num_classes, const T *dout_data, + const int limit, T *dx_data) { + CUDA_1D_KERNEL_LOOP(i, limit) { + T x = x_data[i]; + T dout = dout_data[i]; + + int a = i / num_classes; // current sample + int d = i % num_classes; // current class + + T fg_num = static_cast((fg_num_data[0] > 1) ? fg_num_data[0] : 1); + T s_neg = (1.0 - alpha) / fg_num; + T s_pos = alpha / fg_num; + + int g = label_data[a]; + T c_pos = static_cast(g == (d + 1)); + T c_neg = static_cast((g != -1) & (g != (d + 1))); + + T p = 1. / (1. + real_exp(-x)); + + // (1-p)**g * (1 - p - g*p*log(p)) + T term_pos = std::pow(static_cast(1. - p), gamma) * + (1. - p - (p * gamma * real_log(p > FLT_MIN ? p : FLT_MIN))); + // (p**g) * (g*(1-p)*log(1-p) - p) + T term_neg = + std::pow(p, gamma) * + ((-1. * x * (x >= 0) - real_log(1. + real_exp(x - 2. * x * (x >= 0)))) * + (1. - p) * gamma - + p); + + dx_data[i] = 0.0; + dx_data[i] += -c_pos * s_pos * term_pos; + dx_data[i] += -c_neg * s_neg * term_neg; + dx_data[i] = dx_data[i] * dout; + } +} + +template +class GPUSigmoidFocalLossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const Tensor *X = context.Input("X"); + const Tensor *Labels = context.Input("Label"); + const Tensor *FgNum = context.Input("FgNum"); + Tensor *Out = context.Output("Out"); + T gamma = static_cast(context.Attr("gamma")); + T alpha = static_cast(context.Attr("alpha")); + auto x_dims = X->dims(); + int num_classes = static_cast(x_dims[1]); + auto out_data = Out->mutable_data(context.GetPlace()); + + auto &dev_ctx = context.cuda_device_context(); + + int limit = Out->numel(); + int blocks = NumBlocks(limit); + int threads = kNumCUDAThreads; + GPUSigmoidFocalLossForward<<>>( + X->data(), Labels->data(), FgNum->data(), gamma, alpha, + num_classes, limit, out_data); + } +}; + +template +class GPUSigmoidFocalLossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const Tensor *X = context.Input("X"); + const Tensor *Labels = context.Input("Label"); + const Tensor *FgNum = context.Input("FgNum"); + const Tensor *dOut = context.Input(framework::GradVarName("Out")); + Tensor *dX = context.Output(framework::GradVarName("X")); + auto dx_data = dX->mutable_data(context.GetPlace()); + T gamma = static_cast(context.Attr("gamma")); + T alpha = static_cast(context.Attr("alpha")); + auto x_dims = X->dims(); + int num_classes = static_cast(x_dims[1]); + + auto &dev_ctx = context.cuda_device_context(); + + int limit = dX->numel(); + int blocks = NumBlocks(limit); + int threads = kNumCUDAThreads; + GPUSigmoidFocalLossBackward<<>>( + X->data(), Labels->data(), FgNum->data(), gamma, alpha, + num_classes, dOut->data(), limit, dx_data); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + sigmoid_focal_loss, + ops::GPUSigmoidFocalLossKernel, + ops::GPUSigmoidFocalLossKernel); +REGISTER_OP_CUDA_KERNEL( + sigmoid_focal_loss_grad, + ops::GPUSigmoidFocalLossGradKernel, + ops::GPUSigmoidFocalLossGradKernel); diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.h b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.h new file mode 100644 index 00000000..51829595 --- /dev/null +++ b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.h @@ -0,0 +1,128 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class SigmoidFocalLossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const Tensor *X = context.Input("X"); + const Tensor *Labels = context.Input("Label"); + const Tensor *FgNum = context.Input("FgNum"); + Tensor *Out = context.Output("Out"); + T gamma = static_cast(context.Attr("gamma")); + T alpha = static_cast(context.Attr("alpha")); + auto out_data = Out->mutable_data(context.GetPlace()); + int limit = Out->numel(); + auto x_data = X->data(); + auto label_data = Labels->data(); + auto fg_num_data = FgNum->data(); + auto x_dims = X->dims(); + int num_classes = static_cast(x_dims[1]); + + for (int idx = 0; idx < limit; ++idx) { + T x = x_data[idx]; + int a = idx / num_classes; // current sample + int d = idx % num_classes; // current class + int g = label_data[a]; // target + + // Check whether the input data is positive or negative + // The target classes are in range 1-81 + // and the d is in range 0-80 + T c_pos = static_cast(g == (d + 1)); + T c_neg = static_cast((g != -1) & (g != (d + 1))); + T fg_num = static_cast((fg_num_data[0] > 1) ? fg_num_data[0] : 1); + T s_neg = (1.0 - alpha) / fg_num; + T s_pos = alpha / fg_num; + + // p = 1. / 1. + expf(-x) + T p = 1. / (1. + std::exp(-x)); + + // (1 - p)**gamma * log(p) where + T term_pos = std::pow(static_cast(1. - p), gamma) * + std::log(p > FLT_MIN ? p : FLT_MIN); + // p**gamma * log(1 - p) + T term_neg = + std::pow(p, gamma) * + (-1. * x * (x >= 0) - std::log(1. + std::exp(x - 2. * x * (x >= 0)))); + + out_data[idx] = 0.0; + out_data[idx] += -c_pos * term_pos * s_pos; + out_data[idx] += -c_neg * term_neg * s_neg; + } + } +}; + +template +class SigmoidFocalLossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const Tensor *X = context.Input("X"); + const Tensor *Labels = context.Input("Label"); + const Tensor *FgNum = context.Input("FgNum"); + const Tensor *dOut = context.Input(framework::GradVarName("Out")); + Tensor *dX = context.Output(framework::GradVarName("X")); + auto dx_data = dX->mutable_data(context.GetPlace()); + T gamma = static_cast(context.Attr("gamma")); + T alpha = static_cast(context.Attr("alpha")); + auto x_dims = X->dims(); + int num_classes = static_cast(x_dims[1]); + + int limit = dX->numel(); + auto x_data = X->data(); + auto label_data = Labels->data(); + auto fg_num_data = FgNum->data(); + auto dout_data = dOut->data(); + for (int idx = 0; idx < limit; ++idx) { + T x = x_data[idx]; + int a = idx / num_classes; // current sample + int d = idx % num_classes; // current class + + T fg_num = static_cast((fg_num_data[0] > 1) ? fg_num_data[0] : 1); + T s_neg = static_cast((1.0 - alpha) / fg_num); + T s_pos = alpha / fg_num; + int g = label_data[a]; + + T c_pos = static_cast(g == (d + 1)); + T c_neg = static_cast((g != -1) & (g != (d + 1))); + T p = 1. / (1. + std::exp(-x)); + + // (1-p)**g * (1 - p - g*p*log(p)) + T term_pos = std::pow(static_cast(1. - p), gamma) * + (1. - p - (p * gamma * std::log(p > FLT_MIN ? p : FLT_MIN))); + // (p**g) * (g*(1-p)*log(1-p) - p) + T term_neg = std::pow(p, gamma) * + ((-1. * x * (x >= 0) - + std::log(1. + std::exp(x - 2. * x * (x >= 0)))) * + (1. - p) * gamma - + p); + dx_data[idx] = 0.0; + dx_data[idx] += -c_pos * s_pos * term_pos; + dx_data[idx] += -c_neg * s_neg * term_neg; + dx_data[idx] = dx_data[idx] * dout_data[idx]; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/target_assign_op.cc b/paddle/fluid/operators/detection/target_assign_op.cc new file mode 100644 index 00000000..c057c82c --- /dev/null +++ b/paddle/fluid/operators/detection/target_assign_op.cc @@ -0,0 +1,159 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/target_assign_op.h" + +namespace paddle { +namespace operators { + +class TargetAssignOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of TargetAssignOp should not be null"); + PADDLE_ENFORCE(ctx->HasInput("MatchIndices"), + "Input(MatchIndices) of TargetAssignOp should not be null"); + + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of TargetAssignOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("OutWeight"), + "Output(OutWeight) of TargetAssignOp should not be null."); + + auto in_dims = ctx->GetInputDim("X"); + auto mi_dims = ctx->GetInputDim("MatchIndices"); + + PADDLE_ENFORCE_EQ(in_dims.size(), 3, "The rank of Input(X) must be 3."); + PADDLE_ENFORCE_EQ(mi_dims.size(), 2, + "The rank of Input(MatchIndices) must be 2."); + + if (ctx->HasInput("NegIndices")) { + auto neg_dims = ctx->GetInputDim("NegIndices"); + PADDLE_ENFORCE_EQ(neg_dims.size(), 2, + "The rank of Input(NegIndices) must be 2."); + PADDLE_ENFORCE_EQ(neg_dims[1], 1, + "The last dimenstion of Out(NegIndices) must be 1."); + } + + auto n = mi_dims[0]; + auto m = mi_dims[1]; + auto k = in_dims[in_dims.size() - 1]; + ctx->SetOutputDim("Out", {n, m, k}); + ctx->SetOutputDim("OutWeight", {n, m, 1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); + } +}; + +class TargetAssignOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(LoDTensor), This input is a 3D LoDTensor with shape [M, P, K]. " + "Some elements in X will be assigned to Out based on the " + "MatchIndices and NegIndices."); + AddInput("MatchIndices", + "(Tensor, default Tensor), The input matched indices " + "with shape [N, P], If MatchIndices[i][j] is -1, the j-th entity " + "of column is not matched to any entity of row in i-th instance."); + AddInput("NegIndices", + "(LoDTensor, default LoDTensor), The input negative example " + "indices are an optional input with shape [Neg, 1], where Neg is " + "the total number of negative example indices.") + .AsDispensable(); + AddAttr("mismatch_value", + "(int, default 0), Fill this value to the " + "mismatched location.") + .SetDefault(0); + AddOutput("Out", + "(Tensor), The output is a 3D Tensor with shape [N, P, K], " + "N and P is the same as they are in NegIndices, K is the " + "same as it in input of X. If MatchIndices[i][j] " + "is -1, the Out[i][j][0 : K] is the mismatch_value."); + AddOutput("OutWeight", + "(Tensor), The weight for output with the shape of [N, P, 1]"); + AddComment(R"DOC( +This operator can be, for given the target bounding boxes or labels, +to assign classification and regression targets to each prediction as well as +weights to prediction. The weights is used to specify which prediction would +not contribute to training loss. + +For each instance, the output `Out` and`OutWeight` are assigned based on +`MatchIndices` and `NegIndices`. +Assumed that the row offset for each instance in `X` is called lod, +this operator assigns classification/regression targets by performing the +following steps: + +1. Assigning all outpts based on `MatchIndices`: + +If id = MatchIndices[i][j] > 0, + + Out[i][j][0 : K] = X[lod[i] + id][j % P][0 : K] + OutWeight[i][j] = 1. + +Otherwise, + + Out[j][j][0 : K] = {mismatch_value, mismatch_value, ...} + OutWeight[i][j] = 0. + +2. Assigning OutWeight based on `NegIndices` if `NegIndices` is provided: + +Assumed that the row offset for each instance in `NegIndices` is called neg_lod, +for i-th instance and each `id` of NegIndices in this instance: + + Out[i][id][0 : K] = {mismatch_value, mismatch_value, ...} + OutWeight[i][id] = 1.0 + + )DOC"); + } +}; + +template +struct NegTargetAssignFunctor { + void operator()(const platform::CPUDeviceContext& ctx, const int* neg_indices, + const size_t* lod, const int N, const int M, const int K, + const int mismatch_value, T* out, WT* out_wt) { + for (int i = 0; i < N; ++i) { + for (size_t j = lod[i]; j < lod[i + 1]; ++j) { + int id = neg_indices[j]; + int off = (i * M + id) * K; + for (int k = 0; k < K; ++k) { + out[off + k] = mismatch_value; + out_wt[off + k] = static_cast(1.0); + } + } + } + } +}; + +template struct NegTargetAssignFunctor; +template struct NegTargetAssignFunctor; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(target_assign, ops::TargetAssignOp, ops::TargetAssignOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL( + target_assign, + ops::TargetAssignKernel, + ops::TargetAssignKernel); diff --git a/paddle/fluid/operators/detection/target_assign_op.cu b/paddle/fluid/operators/detection/target_assign_op.cu new file mode 100644 index 00000000..ddf68899 --- /dev/null +++ b/paddle/fluid/operators/detection/target_assign_op.cu @@ -0,0 +1,63 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/target_assign_op.h" + +namespace paddle { +namespace operators { + +template +__global__ void NegTargetAssignKernel(const int* neg_indices, const size_t* lod, + const int N, const int M, const int K, + const int mismatch_value, T* out, + WT* out_wt) { + int bidx = blockIdx.x; + int st = lod[bidx]; + int ed = lod[bidx + 1]; + + int row_start = bidx * M; + for (int i = st + threadIdx.x; i < ed; i += blockDim.x) { + int id = row_start + neg_indices[i]; + for (int k = 0; k < K; ++k) { + out[id * K + k] = T(mismatch_value); + out_wt[id * K + k] = WT(1.); + } + } +} + +template +struct NegTargetAssignFunctor { + void operator()(const platform::CUDADeviceContext& ctx, + const int* neg_indices, const size_t* lod, const int N, + const int M, const int K, const int mismatch_value, T* out, + WT* out_wt) { + const int block_size = 256; + const int grid_size = N; + NegTargetAssignKernel<<>>( + neg_indices, lod, N, M, K, mismatch_value, out, out_wt); + } +}; + +template struct NegTargetAssignFunctor; +template struct NegTargetAssignFunctor; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + target_assign, + ops::TargetAssignKernel, + ops::TargetAssignKernel); diff --git a/paddle/fluid/operators/detection/target_assign_op.h b/paddle/fluid/operators/detection/target_assign_op.h new file mode 100644 index 00000000..7f989dfc --- /dev/null +++ b/paddle/fluid/operators/detection/target_assign_op.h @@ -0,0 +1,141 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { +template +struct TargetAssignFunctor { + const T* in_; + const int* match_indices_; + const size_t* lod_; + const int mismatch_value_; + const int64_t N_; + const int64_t M_; + const int64_t P_; + const int64_t K_; + + T* out_; + WT* out_wt_; + + TargetAssignFunctor(const T* input, const int* match_indices, + const size_t* lod, const int mismatch_value, + const int64_t N, const int64_t M, const int64_t P, + const int64_t K, T* out, WT* out_wt) + : in_(input), + match_indices_(match_indices), + lod_(lod), + mismatch_value_(mismatch_value), + N_(N), + M_(M), + P_(P), + K_(K), + out_(out), + out_wt_(out_wt) {} + + HOSTDEVICE void operator()(size_t i) const { + int h = i / M_; + int w = i - h * M_; + + size_t off = lod_[h]; + int id = match_indices_[i]; + + T* out = out_ + i * K_; + WT* out_wt = out_wt_ + i; + + if (id > -1) { + int w_off = w % P_; + const T* in = in_ + ((off + id) * P_ + w_off) * K_; + for (int64_t k = 0; k < K_; ++k) { + out[k] = in[k]; + } + out_wt[0] = static_cast(1.); + } else { + for (int64_t k = 0; k < K_; ++k) { + out[k] = static_cast(mismatch_value_); + } + out_wt[0] = static_cast(0.); + } + } +}; + +template +struct NegTargetAssignFunctor { + void operator()(const platform::DeviceContext& ctx, const int* neg_indices, + const size_t* lod, const int N, const int M, const int K, + const int mismatch_value, T* out, WT* out_wt) const; +}; + +template +class TargetAssignKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* match_indices = ctx.Input("MatchIndices"); + + auto* out = ctx.Output("Out"); + auto* out_wt = ctx.Output("OutWeight"); + + PADDLE_ENFORCE_EQ(x->lod().size(), 1UL); + int mismatch_value = ctx.Attr("mismatch_value"); + + const T* x_data = x->data(); + const int* match_idx_data = match_indices->data(); + + T* out_data = out->mutable_data(ctx.GetPlace()); + WT* out_wt_data = out_wt->mutable_data(ctx.GetPlace()); + + int64_t n = match_indices->dims()[0]; + int64_t m = match_indices->dims()[1]; + int64_t p = x->dims()[1]; + int64_t k = x->dims()[2]; + + auto x_lod = x->lod().back(); +#if defined(PADDLE_WITH_CUDA) + size_t* x_lod_data = x_lod.MutableData(ctx.GetPlace()); +#else + size_t* x_lod_data = x_lod.data(); +#endif + + TargetAssignFunctor functor(x_data, match_idx_data, x_lod_data, + mismatch_value, n, m, p, k, out_data, + out_wt_data); + + auto& device_ctx = ctx.template device_context(); + platform::ForRange for_range(device_ctx, n * m); + for_range(functor); + + auto* neg_indices = ctx.Input("NegIndices"); + if (neg_indices) { + PADDLE_ENFORCE_EQ(neg_indices->lod().size(), 1UL); + const int* neg_idx_data = neg_indices->data(); + auto neg_lod = neg_indices->lod().back(); +#if defined(PADDLE_WITH_CUDA) + size_t* neg_lod_data = neg_lod.MutableData(ctx.GetPlace()); +#else + size_t* neg_lod_data = neg_lod.data(); +#endif + NegTargetAssignFunctor neg_trg_functor; + neg_trg_functor(device_ctx, neg_idx_data, neg_lod_data, n, m, k, + mismatch_value, out_data, out_wt_data); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc new file mode 100644 index 00000000..e0d7e25d --- /dev/null +++ b/paddle/fluid/operators/detection/yolo_box_op.cc @@ -0,0 +1,167 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/fluid/operators/detection/yolo_box_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class YoloBoxOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of YoloBoxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("ImgSize"), + "Input(ImgSize) of YoloBoxOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Boxes"), + "Output(Boxes) of YoloBoxOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Scores"), + "Output(Scores) of YoloBoxOp should not be null."); + + auto dim_x = ctx->GetInputDim("X"); + auto dim_imgsize = ctx->GetInputDim("ImgSize"); + auto anchors = ctx->Attrs().Get>("anchors"); + int anchor_num = anchors.size() / 2; + auto class_num = ctx->Attrs().Get("class_num"); + + PADDLE_ENFORCE_EQ(dim_x.size(), 4, "Input(X) should be a 4-D tensor."); + PADDLE_ENFORCE_EQ( + dim_x[1], anchor_num * (5 + class_num), + "Input(X) dim[1] should be equal to (anchor_mask_number * (5 " + "+ class_num))."); + PADDLE_ENFORCE_EQ(dim_imgsize.size(), 2, + "Input(ImgSize) should be a 2-D tensor."); + PADDLE_ENFORCE_EQ( + dim_imgsize[0], dim_x[0], + "Input(ImgSize) dim[0] and Input(X) dim[0] should be same."); + PADDLE_ENFORCE_EQ(dim_imgsize[1], 2, "Input(ImgSize) dim[1] should be 2."); + PADDLE_ENFORCE_GT(anchors.size(), 0, + "Attr(anchors) length should be greater than 0."); + PADDLE_ENFORCE_EQ(anchors.size() % 2, 0, + "Attr(anchors) length should be even integer."); + PADDLE_ENFORCE_GT(class_num, 0, + "Attr(class_num) should be an integer greater than 0."); + + int box_num = dim_x[2] * dim_x[3] * anchor_num; + std::vector dim_boxes({dim_x[0], box_num, 4}); + ctx->SetOutputDim("Boxes", framework::make_ddim(dim_boxes)); + + std::vector dim_scores({dim_x[0], box_num, class_num}); + ctx->SetOutputDim("Scores", framework::make_ddim(dim_scores)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); + } +}; + +class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "The input tensor of YoloBox operator is a 4-D tensor with " + "shape of [N, C, H, W]. The second dimension(C) stores " + "box locations, confidence score and classification one-hot " + "keys of each anchor box. Generally, X should be the output " + "of YOLOv3 network."); + AddInput("ImgSize", + "The image size tensor of YoloBox operator, " + "This is a 2-D tensor with shape of [N, 2]. This tensor holds " + "height and width of each input image used for resizing output " + "box in input image scale."); + AddOutput("Boxes", + "The output tensor of detection boxes of YoloBox operator, " + "This is a 3-D tensor with shape of [N, M, 4], N is the " + "batch num, M is output box number, and the 3rd dimension " + "stores [xmin, ymin, xmax, ymax] coordinates of boxes."); + AddOutput("Scores", + "The output tensor of detection boxes scores of YoloBox " + "operator, This is a 3-D tensor with shape of " + "[N, M, :attr:`class_num`], N is the batch num, M is " + "output box number."); + + AddAttr("class_num", "The number of classes to predict."); + AddAttr>("anchors", + "The anchor width and height, " + "it will be parsed pair by pair.") + .SetDefault(std::vector{}); + AddAttr("downsample_ratio", + "The downsample ratio from network input to YoloBox operator " + "input, so 32, 16, 8 should be set for the first, second, " + "and thrid YoloBox operators.") + .SetDefault(32); + AddAttr("conf_thresh", + "The confidence scores threshold of detection boxes. " + "Boxes with confidence scores under threshold should " + "be ignored.") + .SetDefault(0.01); + AddComment(R"DOC( + This operator generates YOLO detection boxes from output of YOLOv3 network. + + The output of previous network is in shape [N, C, H, W], while H and W + should be the same, H and W specify the grid size, each grid point predict + given number boxes, this given number, which following will be represented as S, + is specified by the number of anchors. In the second dimension(the channel + dimension), C should be equal to S * (5 + class_num), class_num is the object + category number of source dataset(such as 80 in coco dataset), so the + second(channel) dimension, apart from 4 box location coordinates x, y, w, h, + also includes confidence score of the box and class one-hot key of each anchor + box. + + Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box + predictions should be as follows: + + $$ + b_x = \\sigma(t_x) + c_x + $$ + $$ + b_y = \\sigma(t_y) + c_y + $$ + $$ + b_w = p_w e^{t_w} + $$ + $$ + b_h = p_h e^{t_h} + $$ + + in the equation above, :math:`c_x, c_y` is the left top corner of current grid + and :math:`p_w, p_h` is specified by anchors. + + The logistic regression value of the 5th channel of each anchor prediction boxes + represents the confidence score of each prediction box, and the logistic + regression value of the last :attr:`class_num` channels of each anchor prediction + boxes represents the classifcation scores. Boxes with confidence scores less than + :attr:`conf_thresh` should be ignored, and box final scores is the product of + confidence scores and classification scores. + + $$ + score_{pred} = score_{conf} * score_{class} + $$ + + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(yolo_box, ops::YoloBoxOp, ops::YoloBoxOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(yolo_box, ops::YoloBoxKernel, + ops::YoloBoxKernel); diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu new file mode 100644 index 00000000..5a882958 --- /dev/null +++ b/paddle/fluid/operators/detection/yolo_box_op.cu @@ -0,0 +1,120 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/yolo_box_op.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +__global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes, + T* scores, const float conf_thresh, + const int* anchors, const int n, const int h, + const int w, const int an_num, const int class_num, + const int box_num, int input_size) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + T box[4]; + for (; tid < n * box_num; tid += stride) { + int grid_num = h * w; + int i = tid / box_num; + int j = (tid % box_num) / grid_num; + int k = (tid % grid_num) / w; + int l = tid % w; + + int an_stride = (5 + class_num) * grid_num; + int img_height = imgsize[2 * i]; + int img_width = imgsize[2 * i + 1]; + + int obj_idx = + GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4); + T conf = sigmoid(input[obj_idx]); + if (conf < conf_thresh) { + continue; + } + + int box_idx = + GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0); + GetYoloBox(box, input, anchors, l, k, j, h, input_size, box_idx, + grid_num, img_height, img_width); + box_idx = (i * box_num + j * grid_num + k * w + l) * 4; + CalcDetectionBox(boxes, box, box_idx, img_height, img_width); + + int label_idx = + GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 5); + int score_idx = (i * box_num + j * grid_num + k * w + l) * class_num; + CalcLabelScore(scores, input, label_idx, score_idx, class_num, conf, + grid_num); + } +} + +template +class YoloBoxOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* img_size = ctx.Input("ImgSize"); + auto* boxes = ctx.Output("Boxes"); + auto* scores = ctx.Output("Scores"); + + auto anchors = ctx.Attr>("anchors"); + int class_num = ctx.Attr("class_num"); + float conf_thresh = ctx.Attr("conf_thresh"); + int downsample_ratio = ctx.Attr("downsample_ratio"); + + const int n = input->dims()[0]; + const int h = input->dims()[2]; + const int w = input->dims()[3]; + const int box_num = boxes->dims()[1]; + const int an_num = anchors.size() / 2; + int input_size = downsample_ratio * h; + + auto& dev_ctx = ctx.cuda_device_context(); + auto& allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + int bytes = sizeof(int) * anchors.size(); + auto anchors_ptr = allocator.Allocate(sizeof(int) * anchors.size()); + int* anchors_data = reinterpret_cast(anchors_ptr->ptr()); + const auto gplace = boost::get(ctx.GetPlace()); + const auto cplace = platform::CPUPlace(); + memory::Copy(gplace, anchors_data, cplace, anchors.data(), bytes, + dev_ctx.stream()); + + const T* input_data = input->data(); + const int* imgsize_data = img_size->data(); + T* boxes_data = boxes->mutable_data({n, box_num, 4}, ctx.GetPlace()); + T* scores_data = + scores->mutable_data({n, box_num, class_num}, ctx.GetPlace()); + math::SetConstant set_zero; + set_zero(dev_ctx, boxes, static_cast(0)); + set_zero(dev_ctx, scores, static_cast(0)); + + int grid_dim = (n * box_num + 512 - 1) / 512; + grid_dim = grid_dim > 8 ? 8 : grid_dim; + + KeYoloBoxFw<<>>( + input_data, imgsize_data, boxes_data, scores_data, conf_thresh, + anchors_data, n, h, w, an_num, class_num, box_num, input_size); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(yolo_box, ops::YoloBoxOpCUDAKernel, + ops::YoloBoxOpCUDAKernel); diff --git a/paddle/fluid/operators/detection/yolo_box_op.h b/paddle/fluid/operators/detection/yolo_box_op.h new file mode 100644 index 00000000..8b7c7df0 --- /dev/null +++ b/paddle/fluid/operators/detection/yolo_box_op.h @@ -0,0 +1,149 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +HOSTDEVICE inline T sigmoid(T x) { + return 1.0 / (1.0 + std::exp(-x)); +} + +template +HOSTDEVICE inline void GetYoloBox(T* box, const T* x, const int* anchors, int i, + int j, int an_idx, int grid_size, + int input_size, int index, int stride, + int img_height, int img_width) { + box[0] = (i + sigmoid(x[index])) * img_width / grid_size; + box[1] = (j + sigmoid(x[index + stride])) * img_height / grid_size; + box[2] = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] * img_width / + input_size; + box[3] = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] * + img_height / input_size; +} + +HOSTDEVICE inline int GetEntryIndex(int batch, int an_idx, int hw_idx, + int an_num, int an_stride, int stride, + int entry) { + return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; +} + +template +HOSTDEVICE inline void CalcDetectionBox(T* boxes, T* box, const int box_idx, + const int img_height, + const int img_width) { + boxes[box_idx] = box[0] - box[2] / 2; + boxes[box_idx + 1] = box[1] - box[3] / 2; + boxes[box_idx + 2] = box[0] + box[2] / 2; + boxes[box_idx + 3] = box[1] + box[3] / 2; + + boxes[box_idx] = boxes[box_idx] > 0 ? boxes[box_idx] : static_cast(0); + boxes[box_idx + 1] = + boxes[box_idx + 1] > 0 ? boxes[box_idx + 1] : static_cast(0); + boxes[box_idx + 2] = boxes[box_idx + 2] < img_width - 1 + ? boxes[box_idx + 2] + : static_cast(img_width - 1); + boxes[box_idx + 3] = boxes[box_idx + 3] < img_height - 1 + ? boxes[box_idx + 3] + : static_cast(img_height - 1); +} + +template +HOSTDEVICE inline void CalcLabelScore(T* scores, const T* input, + const int label_idx, const int score_idx, + const int class_num, const T conf, + const int stride) { + for (int i = 0; i < class_num; i++) { + scores[score_idx + i] = conf * sigmoid(input[label_idx + i * stride]); + } +} + +template +class YoloBoxKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* imgsize = ctx.Input("ImgSize"); + auto* boxes = ctx.Output("Boxes"); + auto* scores = ctx.Output("Scores"); + auto anchors = ctx.Attr>("anchors"); + int class_num = ctx.Attr("class_num"); + float conf_thresh = ctx.Attr("conf_thresh"); + int downsample_ratio = ctx.Attr("downsample_ratio"); + + const int n = input->dims()[0]; + const int h = input->dims()[2]; + const int w = input->dims()[3]; + const int box_num = boxes->dims()[1]; + const int an_num = anchors.size() / 2; + int input_size = downsample_ratio * h; + + const int stride = h * w; + const int an_stride = (class_num + 5) * stride; + + Tensor anchors_; + auto anchors_data = + anchors_.mutable_data({an_num * 2}, ctx.GetPlace()); + std::copy(anchors.begin(), anchors.end(), anchors_data); + + const T* input_data = input->data(); + const int* imgsize_data = imgsize->data(); + T* boxes_data = boxes->mutable_data({n, box_num, 4}, ctx.GetPlace()); + memset(boxes_data, 0, boxes->numel() * sizeof(T)); + T* scores_data = + scores->mutable_data({n, box_num, class_num}, ctx.GetPlace()); + memset(scores_data, 0, scores->numel() * sizeof(T)); + + T box[4]; + for (int i = 0; i < n; i++) { + int img_height = imgsize_data[2 * i]; + int img_width = imgsize_data[2 * i + 1]; + + for (int j = 0; j < an_num; j++) { + for (int k = 0; k < h; k++) { + for (int l = 0; l < w; l++) { + int obj_idx = + GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 4); + T conf = sigmoid(input_data[obj_idx]); + if (conf < conf_thresh) { + continue; + } + + int box_idx = + GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 0); + GetYoloBox(box, input_data, anchors_data, l, k, j, h, input_size, + box_idx, stride, img_height, img_width); + box_idx = (i * box_num + j * stride + k * w + l) * 4; + CalcDetectionBox(boxes_data, box, box_idx, img_height, + img_width); + + int label_idx = + GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 5); + int score_idx = (i * box_num + j * stride + k * w + l) * class_num; + CalcLabelScore(scores_data, input_data, label_idx, score_idx, + class_num, conf, stride); + } + } + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.cc b/paddle/fluid/operators/detection/yolov3_loss_op.cc new file mode 100644 index 00000000..5732b180 --- /dev/null +++ b/paddle/fluid/operators/detection/yolov3_loss_op.cc @@ -0,0 +1,299 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/fluid/operators/detection/yolov3_loss_op.h" +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class Yolov3LossOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of Yolov3LossOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("GTBox"), + "Input(GTBox) of Yolov3LossOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("GTLabel"), + "Input(GTLabel) of Yolov3LossOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Loss"), + "Output(Loss) of Yolov3LossOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("ObjectnessMask"), + "Output(ObjectnessMask) of Yolov3LossOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("GTMatchMask"), + "Output(GTMatchMask) of Yolov3LossOp should not be null."); + + auto dim_x = ctx->GetInputDim("X"); + auto dim_gtbox = ctx->GetInputDim("GTBox"); + auto dim_gtlabel = ctx->GetInputDim("GTLabel"); + auto anchors = ctx->Attrs().Get>("anchors"); + int anchor_num = anchors.size() / 2; + auto anchor_mask = ctx->Attrs().Get>("anchor_mask"); + int mask_num = anchor_mask.size(); + auto class_num = ctx->Attrs().Get("class_num"); + + PADDLE_ENFORCE_EQ(dim_x.size(), 4, "Input(X) should be a 4-D tensor."); + PADDLE_ENFORCE_EQ(dim_x[2], dim_x[3], + "Input(X) dim[3] and dim[4] should be euqal."); + PADDLE_ENFORCE_EQ( + dim_x[1], mask_num * (5 + class_num), + "Input(X) dim[1] should be equal to (anchor_mask_number * (5 " + "+ class_num))."); + PADDLE_ENFORCE_EQ(dim_gtbox.size(), 3, + "Input(GTBox) should be a 3-D tensor"); + PADDLE_ENFORCE_EQ(dim_gtbox[2], 4, "Input(GTBox) dim[2] should be 5"); + PADDLE_ENFORCE_EQ(dim_gtlabel.size(), 2, + "Input(GTLabel) should be a 2-D tensor"); + PADDLE_ENFORCE_EQ(dim_gtlabel[0], dim_gtbox[0], + "Input(GTBox) and Input(GTLabel) dim[0] should be same"); + PADDLE_ENFORCE_EQ(dim_gtlabel[1], dim_gtbox[1], + "Input(GTBox) and Input(GTLabel) dim[1] should be same"); + PADDLE_ENFORCE_GT(anchors.size(), 0, + "Attr(anchors) length should be greater then 0."); + PADDLE_ENFORCE_EQ(anchors.size() % 2, 0, + "Attr(anchors) length should be even integer."); + for (size_t i = 0; i < anchor_mask.size(); i++) { + PADDLE_ENFORCE_LT( + anchor_mask[i], anchor_num, + "Attr(anchor_mask) should not crossover Attr(anchors)."); + } + PADDLE_ENFORCE_GT(class_num, 0, + "Attr(class_num) should be an integer greater then 0."); + + if (ctx->HasInput("GTScore")) { + auto dim_gtscore = ctx->GetInputDim("GTScore"); + PADDLE_ENFORCE_EQ(dim_gtscore.size(), 2, + "Input(GTScore) should be a 2-D tensor"); + PADDLE_ENFORCE_EQ( + dim_gtscore[0], dim_gtbox[0], + "Input(GTBox) and Input(GTScore) dim[0] should be same"); + PADDLE_ENFORCE_EQ( + dim_gtscore[1], dim_gtbox[1], + "Input(GTBox) and Input(GTScore) dim[1] should be same"); + } + + std::vector dim_out({dim_x[0]}); + ctx->SetOutputDim("Loss", framework::make_ddim(dim_out)); + + std::vector dim_obj_mask({dim_x[0], mask_num, dim_x[2], dim_x[3]}); + ctx->SetOutputDim("ObjectnessMask", framework::make_ddim(dim_obj_mask)); + + std::vector dim_gt_match_mask({dim_gtbox[0], dim_gtbox[1]}); + ctx->SetOutputDim("GTMatchMask", framework::make_ddim(dim_gt_match_mask)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + platform::CPUPlace()); + } +}; + +class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "The input tensor of YOLOv3 loss operator, " + "This is a 4-D tensor with shape of [N, C, H, W]." + "H and W should be same, and the second dimention(C) stores" + "box locations, confidence score and classification one-hot" + "keys of each anchor box"); + AddInput("GTBox", + "The input tensor of ground truth boxes, " + "This is a 3-D tensor with shape of [N, max_box_num, 5], " + "max_box_num is the max number of boxes in each image, " + "In the third dimention, stores x, y, w, h coordinates, " + "x, y is the center cordinate of boxes and w, h is the " + "width and height and x, y, w, h should be divided by " + "input image height to scale to [0, 1]."); + AddInput("GTLabel", + "The input tensor of ground truth label, " + "This is a 2-D tensor with shape of [N, max_box_num], " + "and each element should be an integer to indicate the " + "box class id."); + AddInput("GTScore", + "The score of GTLabel, This is a 2-D tensor in same shape " + "GTLabel, and score values should in range (0, 1). This " + "input is for GTLabel score can be not 1.0 in image mixup " + "augmentation.") + .AsDispensable(); + AddOutput("Loss", + "The output yolov3 loss tensor, " + "This is a 1-D tensor with shape of [N]"); + AddOutput("ObjectnessMask", + "This is an intermediate tensor with shape of [N, M, H, W], " + "M is the number of anchor masks. This parameter caches the " + "mask for calculate objectness loss in gradient kernel.") + .AsIntermediate(); + AddOutput("GTMatchMask", + "This is an intermediate tensor with shape of [N, B], " + "B is the max box number of GT boxes. This parameter caches " + "matched mask index of each GT boxes for gradient calculate.") + .AsIntermediate(); + + AddAttr("class_num", "The number of classes to predict."); + AddAttr>("anchors", + "The anchor width and height, " + "it will be parsed pair by pair.") + .SetDefault(std::vector{}); + AddAttr>("anchor_mask", + "The mask index of anchors used in " + "current YOLOv3 loss calculation.") + .SetDefault(std::vector{}); + AddAttr("downsample_ratio", + "The downsample ratio from network input to YOLOv3 loss " + "input, so 32, 16, 8 should be set for the first, second, " + "and thrid YOLOv3 loss operators.") + .SetDefault(32); + AddAttr("ignore_thresh", + "The ignore threshold to ignore confidence loss.") + .SetDefault(0.7); + AddAttr("use_label_smooth", + "Whether to use label smooth. Default True.") + .SetDefault(true); + AddComment(R"DOC( + This operator generates yolov3 loss based on given predict result and ground + truth boxes. + + The output of previous network is in shape [N, C, H, W], while H and W + should be the same, H and W specify the grid size, each grid point predict + given number bounding boxes, this given number, which following will be represented as S, + is specified by the number of anchor clusters in each scale. In the second dimension(the channel + dimension), C should be equal to S * (class_num + 5), class_num is the object + category number of source dataset(such as 80 in coco dataset), so in the + second(channel) dimension, apart from 4 box location coordinates x, y, w, h, + also includes confidence score of the box and class one-hot key of each anchor box. + + Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box predictions + should be as follows: + + $$ + b_x = \\sigma(t_x) + c_x + $$ + $$ + b_y = \\sigma(t_y) + c_y + $$ + $$ + b_w = p_w e^{t_w} + $$ + $$ + b_h = p_h e^{t_h} + $$ + + In the equation above, :math:`c_x, c_y` is the left top corner of current grid + and :math:`p_w, p_h` is specified by anchors. + + As for confidence score, it is the logistic regression value of IoU between + anchor boxes and ground truth boxes, the score of the anchor box which has + the max IoU should be 1, and if the anchor box has IoU bigger than ignore + thresh, the confidence score loss of this anchor box will be ignored. + + Therefore, the yolov3 loss consists of three major parts: box location loss, + objectness loss and classification loss. The L1 loss is used for + box coordinates (w, h), sigmoid cross entropy loss is used for box + coordinates (x, y), objectness loss and classification loss. + + Each groud truth box finds a best matching anchor box in all anchors. + Prediction of this anchor box will incur all three parts of losses, and + prediction of anchor boxes with no GT box matched will only incur objectness + loss. + + In order to trade off box coordinate losses between big boxes and small + boxes, box coordinate losses will be mutiplied by scale weight, which is + calculated as follows. + + $$ + weight_{box} = 2.0 - t_w * t_h + $$ + + Final loss will be represented as follows. + + $$ + loss = (loss_{xy} + loss_{wh}) * weight_{box} + + loss_{conf} + loss_{class} + $$ + + While :attr:`use_label_smooth` is set to be :attr:`True`, the classification + target will be smoothed when calculating classification loss, target of + positive samples will be smoothed to :math:`1.0 - 1.0 / class\_num` and target of + negetive samples will be smoothed to :math:`1.0 / class\_num`. + + While :attr:`GTScore` is given, which means the mixup score of ground truth + boxes, all losses incured by a ground truth box will be multiplied by its + mixup score. + )DOC"); + } +}; + +class Yolov3LossOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")), + "Input(Loss@GRAD) should not be null"); + auto dim_x = ctx->GetInputDim("X"); + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), dim_x); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + platform::CPUPlace()); + } +}; + +class Yolov3LossGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType("yolov3_loss_grad"); + op->SetInput("X", Input("X")); + op->SetInput("GTBox", Input("GTBox")); + op->SetInput("GTLabel", Input("GTLabel")); + op->SetInput("GTScore", Input("GTScore")); + op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss")); + op->SetInput("ObjectnessMask", Output("ObjectnessMask")); + op->SetInput("GTMatchMask", Output("GTMatchMask")); + + op->SetAttrMap(Attrs()); + + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("GTBox"), {}); + op->SetOutput(framework::GradVarName("GTLabel"), {}); + op->SetOutput(framework::GradVarName("GTScore"), {}); + return std::unique_ptr(op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(yolov3_loss, ops::Yolov3LossOp, ops::Yolov3LossOpMaker, + ops::Yolov3LossGradMaker); +REGISTER_OPERATOR(yolov3_loss_grad, ops::Yolov3LossOpGrad); +REGISTER_OP_CPU_KERNEL(yolov3_loss, ops::Yolov3LossKernel, + ops::Yolov3LossKernel); +REGISTER_OP_CPU_KERNEL(yolov3_loss_grad, ops::Yolov3LossGradKernel, + ops::Yolov3LossGradKernel); diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.h b/paddle/fluid/operators/detection/yolov3_loss_op.h new file mode 100644 index 00000000..f8d49960 --- /dev/null +++ b/paddle/fluid/operators/detection/yolov3_loss_op.h @@ -0,0 +1,502 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenTensor = framework::EigenTensor; +template +using EigenVector = framework::EigenVector; + +template +static inline bool LessEqualZero(T x) { + return x < 1e-6; +} + +template +static T SigmoidCrossEntropy(T x, T label) { + return (x > 0 ? x : 0.0) - x * label + std::log(1.0 + std::exp(-std::abs(x))); +} + +template +static T L1Loss(T x, T y) { + return std::abs(y - x); +} + +template +static T SigmoidCrossEntropyGrad(T x, T label) { + return 1.0 / (1.0 + std::exp(-x)) - label; +} + +template +static T L1LossGrad(T x, T y) { + return x > y ? 1.0 : -1.0; +} + +static int GetMaskIndex(std::vector mask, int val) { + for (size_t i = 0; i < mask.size(); i++) { + if (mask[i] == val) { + return i; + } + } + return -1; +} + +template +struct Box { + T x, y, w, h; +}; + +template +static inline T sigmoid(T x) { + return 1.0 / (1.0 + std::exp(-x)); +} + +template +static inline Box GetYoloBox(const T* x, std::vector anchors, int i, + int j, int an_idx, int grid_size, + int input_size, int index, int stride) { + Box b; + b.x = (i + sigmoid(x[index])) / grid_size; + b.y = (j + sigmoid(x[index + stride])) / grid_size; + b.w = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] / input_size; + b.h = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] / input_size; + return b; +} + +template +static inline Box GetGtBox(const T* gt, int batch, int max_boxes, int idx) { + Box b; + b.x = gt[(batch * max_boxes + idx) * 4]; + b.y = gt[(batch * max_boxes + idx) * 4 + 1]; + b.w = gt[(batch * max_boxes + idx) * 4 + 2]; + b.h = gt[(batch * max_boxes + idx) * 4 + 3]; + return b; +} + +template +static inline T BoxOverlap(T c1, T w1, T c2, T w2) { + T l1 = c1 - w1 / 2.0; + T l2 = c2 - w2 / 2.0; + T left = l1 > l2 ? l1 : l2; + T r1 = c1 + w1 / 2.0; + T r2 = c2 + w2 / 2.0; + T right = r1 < r2 ? r1 : r2; + return right - left; +} + +template +static inline T CalcBoxIoU(Box b1, Box b2) { + T w = BoxOverlap(b1.x, b1.w, b2.x, b2.w); + T h = BoxOverlap(b1.y, b1.h, b2.y, b2.h); + T inter_area = (w < 0 || h < 0) ? 0.0 : w * h; + T union_area = b1.w * b1.h + b2.w * b2.h - inter_area; + return inter_area / union_area; +} + +static inline int GetEntryIndex(int batch, int an_idx, int hw_idx, int an_num, + int an_stride, int stride, int entry) { + return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; +} + +template +static void CalcBoxLocationLoss(T* loss, const T* input, Box gt, + std::vector anchors, int an_idx, + int box_idx, int gi, int gj, int grid_size, + int input_size, int stride, T score) { + T tx = gt.x * grid_size - gi; + T ty = gt.y * grid_size - gj; + T tw = std::log(gt.w * input_size / anchors[2 * an_idx]); + T th = std::log(gt.h * input_size / anchors[2 * an_idx + 1]); + + T scale = (2.0 - gt.w * gt.h) * score; + loss[0] += SigmoidCrossEntropy(input[box_idx], tx) * scale; + loss[0] += SigmoidCrossEntropy(input[box_idx + stride], ty) * scale; + loss[0] += L1Loss(input[box_idx + 2 * stride], tw) * scale; + loss[0] += L1Loss(input[box_idx + 3 * stride], th) * scale; +} + +template +static void CalcBoxLocationLossGrad(T* input_grad, const T loss, const T* input, + Box gt, std::vector anchors, + int an_idx, int box_idx, int gi, int gj, + int grid_size, int input_size, int stride, + T score) { + T tx = gt.x * grid_size - gi; + T ty = gt.y * grid_size - gj; + T tw = std::log(gt.w * input_size / anchors[2 * an_idx]); + T th = std::log(gt.h * input_size / anchors[2 * an_idx + 1]); + + T scale = (2.0 - gt.w * gt.h) * score; + input_grad[box_idx] = + SigmoidCrossEntropyGrad(input[box_idx], tx) * scale * loss; + input_grad[box_idx + stride] = + SigmoidCrossEntropyGrad(input[box_idx + stride], ty) * scale * loss; + input_grad[box_idx + 2 * stride] = + L1LossGrad(input[box_idx + 2 * stride], tw) * scale * loss; + input_grad[box_idx + 3 * stride] = + L1LossGrad(input[box_idx + 3 * stride], th) * scale * loss; +} + +template +static inline void CalcLabelLoss(T* loss, const T* input, const int index, + const int label, const int class_num, + const int stride, const T pos, const T neg, + T score) { + for (int i = 0; i < class_num; i++) { + T pred = input[index + i * stride]; + loss[0] += SigmoidCrossEntropy(pred, (i == label) ? pos : neg) * score; + } +} + +template +static inline void CalcLabelLossGrad(T* input_grad, const T loss, + const T* input, const int index, + const int label, const int class_num, + const int stride, const T pos, const T neg, + T score) { + for (int i = 0; i < class_num; i++) { + T pred = input[index + i * stride]; + input_grad[index + i * stride] = + SigmoidCrossEntropyGrad(pred, (i == label) ? pos : neg) * score * + loss; + } +} + +template +static inline void CalcObjnessLoss(T* loss, const T* input, const T* objness, + const int n, const int an_num, const int h, + const int w, const int stride, + const int an_stride) { + for (int i = 0; i < n; i++) { + for (int j = 0; j < an_num; j++) { + for (int k = 0; k < h; k++) { + for (int l = 0; l < w; l++) { + T obj = objness[k * w + l]; + if (obj > 1e-5) { + // positive sample: obj = mixup score + loss[i] += SigmoidCrossEntropy(input[k * w + l], 1.0) * obj; + } else if (obj > -0.5) { + // negetive sample: obj = 0 + loss[i] += SigmoidCrossEntropy(input[k * w + l], 0.0); + } + } + } + objness += stride; + input += an_stride; + } + } +} + +template +static inline void CalcObjnessLossGrad(T* input_grad, const T* loss, + const T* input, const T* objness, + const int n, const int an_num, + const int h, const int w, + const int stride, const int an_stride) { + for (int i = 0; i < n; i++) { + for (int j = 0; j < an_num; j++) { + for (int k = 0; k < h; k++) { + for (int l = 0; l < w; l++) { + T obj = objness[k * w + l]; + if (obj > 1e-5) { + input_grad[k * w + l] = + SigmoidCrossEntropyGrad(input[k * w + l], 1.0) * obj * + loss[i]; + } else if (obj > -0.5) { + input_grad[k * w + l] = + SigmoidCrossEntropyGrad(input[k * w + l], 0.0) * loss[i]; + } + } + } + objness += stride; + input += an_stride; + input_grad += an_stride; + } + } +} + +template +static void inline GtValid(bool* valid, const T* gtbox, const int n, + const int b) { + for (int i = 0; i < n; i++) { + for (int j = 0; j < b; j++) { + if (LessEqualZero(gtbox[j * 4 + 2]) || LessEqualZero(gtbox[j * 4 + 3])) { + valid[j] = false; + } else { + valid[j] = true; + } + } + valid += b; + gtbox += b * 4; + } +} + +template +class Yolov3LossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* gt_box = ctx.Input("GTBox"); + auto* gt_label = ctx.Input("GTLabel"); + auto* gt_score = ctx.Input("GTScore"); + auto* loss = ctx.Output("Loss"); + auto* objness_mask = ctx.Output("ObjectnessMask"); + auto* gt_match_mask = ctx.Output("GTMatchMask"); + auto anchors = ctx.Attr>("anchors"); + auto anchor_mask = ctx.Attr>("anchor_mask"); + int class_num = ctx.Attr("class_num"); + float ignore_thresh = ctx.Attr("ignore_thresh"); + int downsample_ratio = ctx.Attr("downsample_ratio"); + bool use_label_smooth = ctx.Attr("use_label_smooth"); + + const int n = input->dims()[0]; + const int h = input->dims()[2]; + const int w = input->dims()[3]; + const int an_num = anchors.size() / 2; + const int mask_num = anchor_mask.size(); + const int b = gt_box->dims()[1]; + int input_size = downsample_ratio * h; + + const int stride = h * w; + const int an_stride = (class_num + 5) * stride; + + T label_pos = 1.0; + T label_neg = 0.0; + if (use_label_smooth) { + T smooth_weight = std::min(1.0 / static_cast(class_num), 1.0 / 40); + label_pos = 1.0 - smooth_weight; + label_neg = smooth_weight; + } + + const T* input_data = input->data(); + const T* gt_box_data = gt_box->data(); + const int* gt_label_data = gt_label->data(); + T* loss_data = loss->mutable_data({n}, ctx.GetPlace()); + memset(loss_data, 0, loss->numel() * sizeof(T)); + T* obj_mask_data = + objness_mask->mutable_data({n, mask_num, h, w}, ctx.GetPlace()); + memset(obj_mask_data, 0, objness_mask->numel() * sizeof(T)); + int* gt_match_mask_data = + gt_match_mask->mutable_data({n, b}, ctx.GetPlace()); + + const T* gt_score_data; + if (!gt_score) { + Tensor gtscore; + gtscore.mutable_data({n, b}, ctx.GetPlace()); + math::SetConstant()( + ctx.template device_context(), >score, + static_cast(1.0)); + gt_score = >score; + gt_score_data = gtscore.data(); + } else { + gt_score_data = gt_score->data(); + } + + // calc valid gt box mask, avoid calc duplicately in following code + Tensor gt_valid_mask; + bool* gt_valid_mask_data = + gt_valid_mask.mutable_data({n, b}, ctx.GetPlace()); + GtValid(gt_valid_mask_data, gt_box_data, n, b); + + for (int i = 0; i < n; i++) { + for (int j = 0; j < mask_num; j++) { + for (int k = 0; k < h; k++) { + for (int l = 0; l < w; l++) { + // each predict box find a best match gt box, if overlap is bigger + // then ignore_thresh, ignore the objectness loss. + int box_idx = + GetEntryIndex(i, j, k * w + l, mask_num, an_stride, stride, 0); + Box pred = GetYoloBox(input_data, anchors, l, k, anchor_mask[j], + h, input_size, box_idx, stride); + T best_iou = 0; + for (int t = 0; t < b; t++) { + if (!gt_valid_mask_data[i * b + t]) { + continue; + } + Box gt = GetGtBox(gt_box_data, i, b, t); + T iou = CalcBoxIoU(pred, gt); + if (iou > best_iou) { + best_iou = iou; + } + } + + // If best IoU is bigger then ignore_thresh, + // ignore the objectness loss. + if (best_iou > ignore_thresh) { + int obj_idx = (i * mask_num + j) * stride + k * w + l; + obj_mask_data[obj_idx] = static_cast(-1); + } + // all losses should be calculated if best IoU + // is bigger then truth thresh, but currently, + // truth thresh is an unreachable value as 1.0. + } + } + } + for (int t = 0; t < b; t++) { + if (!gt_valid_mask_data[i * b + t]) { + gt_match_mask_data[i * b + t] = -1; + continue; + } + Box gt = GetGtBox(gt_box_data, i, b, t); + int gi = static_cast(gt.x * w); + int gj = static_cast(gt.y * h); + Box gt_shift = gt; + gt_shift.x = 0.0; + gt_shift.y = 0.0; + T best_iou = 0.0; + int best_n = 0; + // each gt box find a best match anchor box as positive sample, + // for positive sample, all losses should be calculated, and for + // other samples, only objectness loss is required. + for (int an_idx = 0; an_idx < an_num; an_idx++) { + Box an_box; + an_box.x = 0.0; + an_box.y = 0.0; + an_box.w = anchors[2 * an_idx] / static_cast(input_size); + an_box.h = anchors[2 * an_idx + 1] / static_cast(input_size); + float iou = CalcBoxIoU(an_box, gt_shift); + if (iou > best_iou) { + best_iou = iou; + best_n = an_idx; + } + } + + int mask_idx = GetMaskIndex(anchor_mask, best_n); + gt_match_mask_data[i * b + t] = mask_idx; + if (mask_idx >= 0) { + T score = gt_score_data[i * b + t]; + int box_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num, + an_stride, stride, 0); + CalcBoxLocationLoss(loss_data + i, input_data, gt, anchors, best_n, + box_idx, gi, gj, h, input_size, stride, score); + + int obj_idx = (i * mask_num + mask_idx) * stride + gj * w + gi; + obj_mask_data[obj_idx] = score; + + int label = gt_label_data[i * b + t]; + int label_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num, + an_stride, stride, 5); + CalcLabelLoss(loss_data + i, input_data, label_idx, label, + class_num, stride, label_pos, label_neg, score); + } + } + } + + CalcObjnessLoss(loss_data, input_data + 4 * stride, obj_mask_data, n, + mask_num, h, w, stride, an_stride); + } +}; + +template +class Yolov3LossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* gt_box = ctx.Input("GTBox"); + auto* gt_label = ctx.Input("GTLabel"); + auto* gt_score = ctx.Input("GTScore"); + auto* input_grad = ctx.Output(framework::GradVarName("X")); + auto* loss_grad = ctx.Input(framework::GradVarName("Loss")); + auto* objness_mask = ctx.Input("ObjectnessMask"); + auto* gt_match_mask = ctx.Input("GTMatchMask"); + auto anchors = ctx.Attr>("anchors"); + auto anchor_mask = ctx.Attr>("anchor_mask"); + int class_num = ctx.Attr("class_num"); + int downsample_ratio = ctx.Attr("downsample_ratio"); + bool use_label_smooth = ctx.Attr("use_label_smooth"); + + const int n = input_grad->dims()[0]; + const int c = input_grad->dims()[1]; + const int h = input_grad->dims()[2]; + const int w = input_grad->dims()[3]; + const int mask_num = anchor_mask.size(); + const int b = gt_match_mask->dims()[1]; + int input_size = downsample_ratio * h; + + const int stride = h * w; + const int an_stride = (class_num + 5) * stride; + + T label_pos = 1.0; + T label_neg = 0.0; + if (use_label_smooth) { + T smooth_weight = std::min(1.0 / static_cast(class_num), 1.0 / 40); + label_pos = 1.0 - smooth_weight; + label_neg = smooth_weight; + } + + const T* input_data = input->data(); + const T* gt_box_data = gt_box->data(); + const int* gt_label_data = gt_label->data(); + const T* loss_grad_data = loss_grad->data(); + const T* obj_mask_data = objness_mask->data(); + const int* gt_match_mask_data = gt_match_mask->data(); + T* input_grad_data = + input_grad->mutable_data({n, c, h, w}, ctx.GetPlace()); + memset(input_grad_data, 0, input_grad->numel() * sizeof(T)); + + const T* gt_score_data; + if (!gt_score) { + Tensor gtscore; + gtscore.mutable_data({n, b}, ctx.GetPlace()); + math::SetConstant()( + ctx.template device_context(), >score, + static_cast(1.0)); + gt_score = >score; + gt_score_data = gtscore.data(); + } else { + gt_score_data = gt_score->data(); + } + + for (int i = 0; i < n; i++) { + for (int t = 0; t < b; t++) { + int mask_idx = gt_match_mask_data[i * b + t]; + if (mask_idx >= 0) { + T score = gt_score_data[i * b + t]; + Box gt = GetGtBox(gt_box_data, i, b, t); + int gi = static_cast(gt.x * w); + int gj = static_cast(gt.y * h); + + int box_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num, + an_stride, stride, 0); + CalcBoxLocationLossGrad(input_grad_data, loss_grad_data[i], + input_data, gt, anchors, + anchor_mask[mask_idx], box_idx, gi, gj, h, + input_size, stride, score); + + int label = gt_label_data[i * b + t]; + int label_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num, + an_stride, stride, 5); + CalcLabelLossGrad(input_grad_data, loss_grad_data[i], input_data, + label_idx, label, class_num, stride, label_pos, + label_neg, score); + } + } + } + + CalcObjnessLossGrad(input_grad_data + 4 * stride, loss_grad_data, + input_data + 4 * stride, obj_mask_data, n, mask_num, + h, w, stride, an_stride); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection_map_op.cc b/paddle/fluid/operators/detection_map_op.cc new file mode 100644 index 00000000..dff97f7c --- /dev/null +++ b/paddle/fluid/operators/detection_map_op.cc @@ -0,0 +1,198 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection_map_op.h" +#include + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +class DetectionMAPOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("DetectRes"), + "Input(DetectRes) of DetectionMAPOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), + "Input(Label) of DetectionMAPOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("AccumPosCount"), + "Output(AccumPosCount) of DetectionMAPOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("AccumTruePos"), + "Output(AccumTruePos) of DetectionMAPOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("AccumFalsePos"), + "Output(AccumFalsePos) of DetectionMAPOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("MAP"), + "Output(MAP) of DetectionMAPOp should not be null."); + + auto det_dims = ctx->GetInputDim("DetectRes"); + PADDLE_ENFORCE_EQ(det_dims.size(), 2UL, + "The rank of Input(DetectRes) must be 2, " + "the shape is [N, 6]."); + PADDLE_ENFORCE_EQ(det_dims[1], 6UL, + "The shape is of Input(DetectRes) [N, 6]."); + auto label_dims = ctx->GetInputDim("Label"); + PADDLE_ENFORCE_EQ(label_dims.size(), 2, + "The rank of Input(Label) must be 2, " + "the shape is [N, 6]."); + if (ctx->IsRuntime() || label_dims[1] > 0) { + PADDLE_ENFORCE(label_dims[1] == 6 || label_dims[1] == 5, + "The shape of Input(Label) is [N, 6] or [N, 5]."); + } + + if (ctx->HasInput("PosCount")) { + PADDLE_ENFORCE(ctx->HasInput("TruePos"), + "Input(TruePos) of DetectionMAPOp should not be null when " + "Input(TruePos) is not null."); + PADDLE_ENFORCE( + ctx->HasInput("FalsePos"), + "Input(FalsePos) of DetectionMAPOp should not be null when " + "Input(FalsePos) is not null."); + } + + ctx->SetOutputDim("MAP", framework::make_ddim({1})); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + ctx.Input("DetectRes")->type(), + platform::CPUPlace()); + } +}; + +class DetectionMAPOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("DetectRes", + "(LoDTensor) A 2-D LoDTensor with shape [M, 6] represents the " + "detections. Each row has 6 values: " + "[label, confidence, xmin, ymin, xmax, ymax], M is the total " + "number of detect results in this mini-batch. For each instance, " + "the offsets in first dimension are called LoD, the number of " + "offset is N + 1, if LoD[i + 1] - LoD[i] == 0, means there is " + "no detected data."); + AddInput("Label", + "(LoDTensor) A 2-D LoDTensor represents the" + "Labeled ground-truth data. Each row has 6 values: " + "[label, xmin, ymin, xmax, ymax, is_difficult] or 5 values: " + "[label, xmin, ymin, xmax, ymax], where N is the total " + "number of ground-truth data in this mini-batch. For each " + "instance, the offsets in first dimension are called LoD, " + "the number of offset is N + 1, if LoD[i + 1] - LoD[i] == 0, " + "means there is no ground-truth data."); + AddInput("HasState", + "(Tensor) A tensor with shape [1], 0 means ignoring input " + "states, which including PosCount, TruePos, FalsePos.") + .AsDispensable(); + AddInput("PosCount", + "(Tensor) A tensor with shape [Ncls, 1], store the " + "input positive example count of each class, Ncls is the count of " + "input classification. " + "This input is used to pass the AccumPosCount generated by the " + "previous mini-batch when the multi mini-batches cumulative " + "calculation carried out. " + "When the input(PosCount) is empty, the cumulative " + "calculation is not carried out, and only the results of the " + "current mini-batch are calculated.") + .AsDispensable(); + AddInput("TruePos", + "(LoDTensor) A 2-D LoDTensor with shape [Ntp, 2], store the " + "input true positive example of each class." + "This input is used to pass the AccumTruePos generated by the " + "previous mini-batch when the multi mini-batches cumulative " + "calculation carried out. ") + .AsDispensable(); + AddInput("FalsePos", + "(LoDTensor) A 2-D LoDTensor with shape [Nfp, 2], store the " + "input false positive example of each class." + "This input is used to pass the AccumFalsePos generated by the " + "previous mini-batch when the multi mini-batches cumulative " + "calculation carried out. ") + .AsDispensable(); + AddOutput("AccumPosCount", + "(Tensor) A tensor with shape [Ncls, 1], store the " + "positive example count of each class. It combines the input " + "input(PosCount) and the positive example count computed from " + "input(Detection) and input(Label)."); + AddOutput("AccumTruePos", + "(LoDTensor) A LoDTensor with shape [Ntp', 2], store the " + "true positive example of each class. It combines the " + "input(TruePos) and the true positive examples computed from " + "input(Detection) and input(Label)."); + AddOutput("AccumFalsePos", + "(LoDTensor) A LoDTensor with shape [Nfp', 2], store the " + "false positive example of each class. It combines the " + "input(FalsePos) and the false positive examples computed from " + "input(Detection) and input(Label)."); + AddOutput("MAP", + "(Tensor) A tensor with shape [1], store the mAP evaluate " + "result of the detection."); + AddAttr("class_num", + "(int) " + "The class number."); + AddAttr( + "background_label", + "(int, default: 0) " + "The index of background label, the background label will be ignored. " + "If set to -1, then all categories will be considered.") + .SetDefault(0); + AddAttr( + "overlap_threshold", + "(float) " + "The lower bound jaccard overlap threshold of detection output and " + "ground-truth data.") + .SetDefault(.5f); + AddAttr("evaluate_difficult", + "(bool, default true) " + "Switch to control whether the difficult data is evaluated.") + .SetDefault(true); + AddAttr("ap_type", + "(string, default 'integral') " + "The AP algorithm type, 'integral' or '11point'.") + .SetDefault("integral") + .InEnum({"integral", "11point"}) + .AddCustomChecker([](const std::string& ap_type) { + PADDLE_ENFORCE_NE(GetAPType(ap_type), APType::kNone, + "The ap_type should be 'integral' or '11point."); + }); + AddComment(R"DOC( +Detection mAP evaluate operator. +The general steps are as follows. First, calculate the true positive and +false positive according to the input of detection and labels, then +calculate the mAP evaluate value. +Supporting '11 point' and 'integral' mAP algorithm. Please get more information +from the following articles: +https://sanchom.wordpress.com/tag/average-precision/ +https://arxiv.org/abs/1512.02325 + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(detection_map, ops::DetectionMAPOp, ops::DetectionMAPOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL( + detection_map, ops::DetectionMAPOpKernel, + ops::DetectionMAPOpKernel); diff --git a/paddle/fluid/operators/detection_map_op.h b/paddle/fluid/operators/detection_map_op.h new file mode 100644 index 00000000..dd5d138a --- /dev/null +++ b/paddle/fluid/operators/detection_map_op.h @@ -0,0 +1,480 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +enum APType { kNone = 0, kIntegral, k11point }; + +APType GetAPType(std::string str) { + if (str == "integral") { + return APType::kIntegral; + } else if (str == "11point") { + return APType::k11point; + } else { + return APType::kNone; + } +} + +template +inline bool SortScorePairDescend(const std::pair& pair1, + const std::pair& pair2) { + return pair1.first > pair2.first; +} + +template +inline void GetAccumulation(std::vector> in_pairs, + std::vector* accu_vec) { + std::stable_sort(in_pairs.begin(), in_pairs.end(), SortScorePairDescend); + accu_vec->clear(); + size_t sum = 0; + for (size_t i = 0; i < in_pairs.size(); ++i) { + auto count = in_pairs[i].second; + sum += count; + accu_vec->push_back(sum); + } +} + +template +class DetectionMAPOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in_detect = ctx.Input("DetectRes"); + auto* in_label = ctx.Input("Label"); + auto* out_map = ctx.Output("MAP"); + + auto* in_pos_count = ctx.Input("PosCount"); + auto* in_true_pos = ctx.Input("TruePos"); + auto* in_false_pos = ctx.Input("FalsePos"); + + auto* out_pos_count = ctx.Output("AccumPosCount"); + auto* out_true_pos = ctx.Output("AccumTruePos"); + auto* out_false_pos = ctx.Output("AccumFalsePos"); + + float overlap_threshold = ctx.Attr("overlap_threshold"); + bool evaluate_difficult = ctx.Attr("evaluate_difficult"); + auto ap_type = GetAPType(ctx.Attr("ap_type")); + int class_num = ctx.Attr("class_num"); + + auto& label_lod = in_label->lod(); + auto& detect_lod = in_detect->lod(); + PADDLE_ENFORCE_EQ(label_lod.size(), 1UL, + "Only support one level sequence now."); + PADDLE_ENFORCE_EQ(label_lod[0].size(), detect_lod[0].size(), + "The batch_size of input(Label) and input(Detection) " + "must be the same."); + + std::vector>> gt_boxes; + std::vector>>> detect_boxes; + + GetBoxes(*in_label, *in_detect, >_boxes, detect_boxes); + + std::map label_pos_count; + std::map>> true_pos; + std::map>> false_pos; + + auto* has_state = ctx.Input("HasState"); + int state = 0; + if (has_state) { + state = has_state->data()[0]; + } + + if (in_pos_count != nullptr && state) { + GetInputPos(*in_pos_count, *in_true_pos, *in_false_pos, &label_pos_count, + &true_pos, &false_pos, class_num); + } + + CalcTrueAndFalsePositive(gt_boxes, detect_boxes, evaluate_difficult, + overlap_threshold, &label_pos_count, &true_pos, + &false_pos); + + int background_label = ctx.Attr("background_label"); + T map = CalcMAP(ap_type, label_pos_count, true_pos, false_pos, + background_label); + + GetOutputPos(ctx, label_pos_count, true_pos, false_pos, out_pos_count, + out_true_pos, out_false_pos, class_num); + + T* map_data = out_map->mutable_data(ctx.GetPlace()); + map_data[0] = map; + } + + protected: + struct Box { + Box(T xmin, T ymin, T xmax, T ymax) + : xmin(xmin), ymin(ymin), xmax(xmax), ymax(ymax), is_difficult(false) {} + + T xmin, ymin, xmax, ymax; + bool is_difficult; + }; + + inline T JaccardOverlap(const Box& box1, const Box& box2) const { + if (box2.xmin > box1.xmax || box2.xmax < box1.xmin || + box2.ymin > box1.ymax || box2.ymax < box1.ymin) { + return 0.0; + } else { + T inter_xmin = std::max(box1.xmin, box2.xmin); + T inter_ymin = std::max(box1.ymin, box2.ymin); + T inter_xmax = std::min(box1.xmax, box2.xmax); + T inter_ymax = std::min(box1.ymax, box2.ymax); + + T inter_width = inter_xmax - inter_xmin; + T inter_height = inter_ymax - inter_ymin; + T inter_area = inter_width * inter_height; + + T bbox_area1 = (box1.xmax - box1.xmin) * (box1.ymax - box1.ymin); + T bbox_area2 = (box2.xmax - box2.xmin) * (box2.ymax - box2.ymin); + + return inter_area / (bbox_area1 + bbox_area2 - inter_area); + } + } + + inline void ClipBBox(const Box& bbox, Box* clipped_bbox) const { + T one = static_cast(1.0); + T zero = static_cast(0.0); + clipped_bbox->xmin = std::max(std::min(bbox.xmin, one), zero); + clipped_bbox->ymin = std::max(std::min(bbox.ymin, one), zero); + clipped_bbox->xmax = std::max(std::min(bbox.xmax, one), zero); + clipped_bbox->ymax = std::max(std::min(bbox.ymax, one), zero); + } + + void GetBoxes(const framework::LoDTensor& input_label, + const framework::LoDTensor& input_detect, + std::vector>>* gt_boxes, + std::vector>>>& + detect_boxes) const { + auto labels = framework::EigenTensor::From(input_label); + auto detect = framework::EigenTensor::From(input_detect); + + auto& label_lod = input_label.lod(); + auto& detect_lod = input_detect.lod(); + + int batch_size = label_lod[0].size() - 1; + auto& label_index = label_lod[0]; + + for (int n = 0; n < batch_size; ++n) { + std::map> boxes; + for (size_t i = label_index[n]; i < label_index[n + 1]; ++i) { + int label = labels(i, 0); + if (input_label.dims()[1] == 6) { + Box box(labels(i, 2), labels(i, 3), labels(i, 4), labels(i, 5)); + auto is_difficult = labels(i, 1); + if (std::abs(is_difficult - 0.0) < 1e-6) + box.is_difficult = false; + else + box.is_difficult = true; + boxes[label].push_back(box); + } else { + PADDLE_ENFORCE_EQ(input_label.dims()[1], 5); + Box box(labels(i, 1), labels(i, 2), labels(i, 3), labels(i, 4)); + boxes[label].push_back(box); + } + } + gt_boxes->push_back(boxes); + } + + auto detect_index = detect_lod[0]; + for (int n = 0; n < batch_size; ++n) { + std::map>> boxes; + for (size_t i = detect_index[n]; i < detect_index[n + 1]; ++i) { + Box box(detect(i, 2), detect(i, 3), detect(i, 4), detect(i, 5)); + int label = detect(i, 0); + auto score = detect(i, 1); + boxes[label].push_back(std::make_pair(score, box)); + } + detect_boxes.push_back(boxes); + } + } + + void GetOutputPos( + const framework::ExecutionContext& ctx, + const std::map& label_pos_count, + const std::map>>& true_pos, + const std::map>>& false_pos, + framework::Tensor* output_pos_count, + framework::LoDTensor* output_true_pos, + framework::LoDTensor* output_false_pos, const int class_num) const { + int true_pos_count = 0; + int false_pos_count = 0; + for (auto it = true_pos.begin(); it != true_pos.end(); ++it) { + auto tp = it->second; + true_pos_count += tp.size(); + } + for (auto it = false_pos.begin(); it != false_pos.end(); ++it) { + auto fp = it->second; + false_pos_count += fp.size(); + } + + int* pos_count_data = output_pos_count->mutable_data( + framework::make_ddim({class_num, 1}), ctx.GetPlace()); + + T* true_pos_data = output_true_pos->mutable_data( + framework::make_ddim({true_pos_count, 2}), ctx.GetPlace()); + T* false_pos_data = output_false_pos->mutable_data( + framework::make_ddim({false_pos_count, 2}), ctx.GetPlace()); + true_pos_count = 0; + false_pos_count = 0; + std::vector true_pos_starts = {0}; + std::vector false_pos_starts = {0}; + for (int i = 0; i < class_num; ++i) { + auto it_count = label_pos_count.find(i); + pos_count_data[i] = 0; + if (it_count != label_pos_count.end()) { + pos_count_data[i] = it_count->second; + } + auto it_true_pos = true_pos.find(i); + if (it_true_pos != true_pos.end()) { + const std::vector>& true_pos_vec = + it_true_pos->second; + for (const std::pair& tp : true_pos_vec) { + true_pos_data[true_pos_count * 2] = tp.first; + true_pos_data[true_pos_count * 2 + 1] = static_cast(tp.second); + true_pos_count++; + } + } + true_pos_starts.push_back(true_pos_count); + + auto it_false_pos = false_pos.find(i); + if (it_false_pos != false_pos.end()) { + const std::vector>& false_pos_vec = + it_false_pos->second; + for (const std::pair& fp : false_pos_vec) { + false_pos_data[false_pos_count * 2] = fp.first; + false_pos_data[false_pos_count * 2 + 1] = static_cast(fp.second); + false_pos_count++; + } + } + false_pos_starts.push_back(false_pos_count); + } + + framework::LoD true_pos_lod; + true_pos_lod.emplace_back(true_pos_starts); + framework::LoD false_pos_lod; + false_pos_lod.emplace_back(false_pos_starts); + + output_true_pos->set_lod(true_pos_lod); + output_false_pos->set_lod(false_pos_lod); + } + + void GetInputPos(const framework::Tensor& input_pos_count, + const framework::LoDTensor& input_true_pos, + const framework::LoDTensor& input_false_pos, + std::map* label_pos_count, + std::map>>* true_pos, + std::map>>* false_pos, + const int class_num) const { + const int* pos_count_data = input_pos_count.data(); + for (int i = 0; i < class_num; ++i) { + (*label_pos_count)[i] = pos_count_data[i]; + } + + auto SetData = [](const framework::LoDTensor& pos_tensor, + std::map>>& pos) { + const T* pos_data = pos_tensor.data(); + auto& pos_data_lod = pos_tensor.lod()[0]; + for (size_t i = 0; i < pos_data_lod.size() - 1; ++i) { + for (size_t j = pos_data_lod[i]; j < pos_data_lod[i + 1]; ++j) { + T score = pos_data[j * 2]; + int flag = pos_data[j * 2 + 1]; + pos[i].push_back(std::make_pair(score, flag)); + } + } + }; + + SetData(input_true_pos, *true_pos); + SetData(input_false_pos, *false_pos); + return; + } + + void CalcTrueAndFalsePositive( + const std::vector>>& gt_boxes, + const std::vector>>>& + detect_boxes, + bool evaluate_difficult, float overlap_threshold, + std::map* label_pos_count, + std::map>>* true_pos, + std::map>>* false_pos) const { + int batch_size = gt_boxes.size(); + for (int n = 0; n < batch_size; ++n) { + auto& image_gt_boxes = gt_boxes[n]; + for (auto& image_gt_box : image_gt_boxes) { + size_t count = 0; + auto& labeled_bboxes = image_gt_box.second; + if (evaluate_difficult) { + count = labeled_bboxes.size(); + } else { + for (auto& box : labeled_bboxes) { + if (!box.is_difficult) { + ++count; + } + } + } + if (count == 0) { + continue; + } + int label = image_gt_box.first; + if (label_pos_count->find(label) == label_pos_count->end()) { + (*label_pos_count)[label] = count; + } else { + (*label_pos_count)[label] += count; + } + } + } + + for (size_t n = 0; n < detect_boxes.size(); ++n) { + auto image_gt_boxes = gt_boxes[n]; + auto detections = detect_boxes[n]; + + if (image_gt_boxes.size() == 0) { + for (auto it = detections.begin(); it != detections.end(); ++it) { + auto pred_boxes = it->second; + int label = it->first; + for (size_t i = 0; i < pred_boxes.size(); ++i) { + auto score = pred_boxes[i].first; + (*true_pos)[label].push_back(std::make_pair(score, 0)); + (*false_pos)[label].push_back(std::make_pair(score, 1)); + } + } + continue; + } + + for (auto it = detections.begin(); it != detections.end(); ++it) { + int label = it->first; + auto pred_boxes = it->second; + if (image_gt_boxes.find(label) == image_gt_boxes.end()) { + for (size_t i = 0; i < pred_boxes.size(); ++i) { + auto score = pred_boxes[i].first; + (*true_pos)[label].push_back(std::make_pair(score, 0)); + (*false_pos)[label].push_back(std::make_pair(score, 1)); + } + continue; + } + + auto matched_bboxes = image_gt_boxes.find(label)->second; + std::vector visited(matched_bboxes.size(), false); + // Sort detections in descend order based on scores + std::sort(pred_boxes.begin(), pred_boxes.end(), + SortScorePairDescend); + for (size_t i = 0; i < pred_boxes.size(); ++i) { + T max_overlap = -1.0; + size_t max_idx = 0; + auto score = pred_boxes[i].first; + for (size_t j = 0; j < matched_bboxes.size(); ++j) { + Box& pred_box = pred_boxes[i].second; + ClipBBox(pred_box, &pred_box); + T overlap = JaccardOverlap(pred_box, matched_bboxes[j]); + if (overlap > max_overlap) { + max_overlap = overlap; + max_idx = j; + } + } + if (max_overlap > overlap_threshold) { + bool match_evaluate_difficult = + evaluate_difficult || + (!evaluate_difficult && !matched_bboxes[max_idx].is_difficult); + if (match_evaluate_difficult) { + if (!visited[max_idx]) { + (*true_pos)[label].push_back(std::make_pair(score, 1)); + (*false_pos)[label].push_back(std::make_pair(score, 0)); + visited[max_idx] = true; + } else { + (*true_pos)[label].push_back(std::make_pair(score, 0)); + (*false_pos)[label].push_back(std::make_pair(score, 1)); + } + } + } else { + (*true_pos)[label].push_back(std::make_pair(score, 0)); + (*false_pos)[label].push_back(std::make_pair(score, 1)); + } + } + } + } + } + + T CalcMAP(APType ap_type, const std::map& label_pos_count, + const std::map>>& true_pos, + const std::map>>& false_pos, + const int background_label) const { + T mAP = 0.0; + int count = 0; + for (auto it = label_pos_count.begin(); it != label_pos_count.end(); ++it) { + int label = it->first; + int label_num_pos = it->second; + if (label_num_pos == background_label || + true_pos.find(label) == true_pos.end()) { + continue; + } + auto label_true_pos = true_pos.find(label)->second; + auto label_false_pos = false_pos.find(label)->second; + // Compute average precision. + std::vector tp_sum; + GetAccumulation(label_true_pos, &tp_sum); + std::vector fp_sum; + GetAccumulation(label_false_pos, &fp_sum); + std::vector precision, recall; + size_t num = tp_sum.size(); + // Compute Precision. + for (size_t i = 0; i < num; ++i) { + precision.push_back(static_cast(tp_sum[i]) / + static_cast(tp_sum[i] + fp_sum[i])); + recall.push_back(static_cast(tp_sum[i]) / label_num_pos); + } + // VOC2007 style + if (ap_type == APType::k11point) { + std::vector max_precisions(11, 0.0); + int start_idx = num - 1; + for (int j = 10; j >= 0; --j) + for (int i = start_idx; i >= 0; --i) { + if (recall[i] < j / 10.) { + start_idx = i; + if (j > 0) max_precisions[j - 1] = max_precisions[j]; + break; + } else { + if (max_precisions[j] < precision[i]) + max_precisions[j] = precision[i]; + } + } + for (int j = 10; j >= 0; --j) mAP += max_precisions[j] / 11; + ++count; + } else if (ap_type == APType::kIntegral) { + // Nature integral + float average_precisions = 0.; + float prev_recall = 0.; + for (size_t i = 0; i < num; ++i) { + if (fabs(recall[i] - prev_recall) > 1e-6) + average_precisions += precision[i] * fabs(recall[i] - prev_recall); + prev_recall = recall[i]; + } + mAP += average_precisions; + ++count; + } else { + LOG(FATAL) << "Unkown ap version: " << ap_type; + } + } + if (count != 0) mAP /= count; + return mAP; + } +}; // namespace operators + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/dgc_clip_by_norm_op.cc b/paddle/fluid/operators/dgc_clip_by_norm_op.cc new file mode 100644 index 00000000..6ebad4de --- /dev/null +++ b/paddle/fluid/operators/dgc_clip_by_norm_op.cc @@ -0,0 +1,67 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/operators/dgc_clip_by_norm_op.h" + +namespace paddle { +namespace operators { + +class DGCClipByNormOp : public ClipByNormOp { + public: + using ClipByNormOp::ClipByNormOp; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("current_step"), + "current_step should be set."); + + return ClipByNormOp::InferShape(ctx); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + if (var_name == "current_step") { + VLOG(10) << "var_name:" << var_name << " need not to transform"; + return expected_kernel_type; + } + + return framework::OperatorWithKernel::GetKernelTypeForVar( + var_name, tensor, expected_kernel_type); + } +}; + +class DGCClipByNormOpMaker : public ClipByNormOpMaker { + public: + void Make() override { + AddInput("current_step", "(Tensor) Current step."); + AddAttr("rampup_begin_step", + "(float, -1.0)" + "The period when begin k_select.") + .SetDefault(-1.0); + + return ClipByNormOpMaker::Make(); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(dgc_clip_by_norm, ops::DGCClipByNormOp, + ops::DGCClipByNormOpMaker); + +REGISTER_OP_CPU_KERNEL( + dgc_clip_by_norm, + ops::DGCClipByNormKernel); diff --git a/paddle/fluid/operators/dgc_clip_by_norm_op.cu b/paddle/fluid/operators/dgc_clip_by_norm_op.cu new file mode 100644 index 00000000..e7f564b7 --- /dev/null +++ b/paddle/fluid/operators/dgc_clip_by_norm_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/dgc_clip_by_norm_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + dgc_clip_by_norm, + ops::DGCClipByNormKernel); diff --git a/paddle/fluid/operators/dgc_clip_by_norm_op.h b/paddle/fluid/operators/dgc_clip_by_norm_op.h new file mode 100644 index 00000000..197bf59b --- /dev/null +++ b/paddle/fluid/operators/dgc_clip_by_norm_op.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/operators/clip_by_norm_op.h" + +namespace paddle { +namespace operators { + +template +class DGCClipByNormKernel : public ClipByNormKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto rampup_begin_step = context.Attr("rampup_begin_step"); + if (static_cast(rampup_begin_step) < 0) { + return; + } + + auto current_step_tensor = context.Input("current_step"); + auto* current_step = current_step_tensor->data(); + + VLOG(10) << "current_step:" << *current_step + << ", rampup_begin_step:" << rampup_begin_step; + + if (static_cast(*current_step) < static_cast(rampup_begin_step)) { + VLOG(10) << "current_step:" << *current_step + << " < rampup_begin_step:" << rampup_begin_step + << " so does't use dgc_clip_by_norm"; + return; + } + + return ClipByNormKernel::Compute(context); + }; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/dgc_op.cc b/paddle/fluid/operators/dgc_op.cc new file mode 100644 index 00000000..ccdeea2d --- /dev/null +++ b/paddle/fluid/operators/dgc_op.cc @@ -0,0 +1,138 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/dgc_op.h" +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class DGCOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("U"), "Input(U) of DGCop should not be null."); + PADDLE_ENFORCE(ctx->HasInput("V"), "Input(V) of DGCop should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(Grad) of DGCop should not be null."); + PADDLE_ENFORCE(ctx->HasInput("current_step"), + "Input(current_step) of DGCop should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("U_out"), + "Output(U_out) of DGCop should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("V_out"), + "Output(V_out) of DGCop should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("k"), + "Output(k) of DGCop should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("EncodeGrad"), + "Output(EncodeGrad) of DGCop should not be null."); + } + + protected: + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + if (var_name == "current_step" || var_name == "rampup_step" || + var_name == "k") { + VLOG(10) << "var_name:" << var_name << " need not to transform"; + return expected_kernel_type; + } + + return framework::OperatorWithKernel::GetKernelTypeForVar( + var_name, tensor, expected_kernel_type); + } +}; + +class DGCOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("U", "(Tensor) Middle tensor of DGC"); + AddInput("V", "(Tensor) Middle tensor of DGC"); + AddInput("Grad", "(Tensor) Input gradient"); + AddInput("current_step", "(Tensor) Current step."); + + AddOutput("U_out", + "(Tensor) " + "Output encoded gradient"); + AddOutput("V_out", + "(Tensor) " + "Output encoded gradient"); + AddOutput("EncodeGrad", + "(Tensor) " + "Output encoded gradient"); + AddOutput("Grad_out", + "(Tensor) " + "Output grad gradient"); + AddOutput("k", + "(Tensor) " + "Output top-k value"); + + AddAttr("m", + "(float, 0.9) " + "The momentum of learning rate.") + .SetDefault(0.9); + + AddAttr("use_nesterov", + "(bool, true)" + "The momentum of learning rate.") + .SetDefault(true); + + AddAttr>("sparsity", + "(vecotr, float)" + "The period sparsity of k_select."); + + AddAttr("rampup_begin_step", + "(float, 0.0)" + "The period when begin k_select.") + .SetDefault(0.0); + + AddAttr("rampup_step", + "(float, 0.0)" + "The period when begin k_select."); + + AddComment(R"DOC( + Original paper is https://arxiv.org/abs/1712.01887 + + DGC reduce the communication bandwidth by sending only the important gradients (sparse update):\ + only gradients larger than a threshold are transmitted. + + To avoid losing information, DGC accumulate the rest of the gradients locally. + + Eventually, these gradients become large enough to be transmitted. + + Thus, DGC send the large gradients immediately but eventually send all of the gradients over time. + + To ensure no loss of accuracy, DGC employs momentum correc-tionandlocal gradient clipping on top of the gradient sparsification to maintain model performance. + + DGC also uses momentum factor masking and warmup training to overcome the staleness problem caused by reduced communication. + + This optimizer will do two things: + + 1. Compress the gradient by get TopK import value from tensor \ + and use it for allreduce to reduce network bandwidth. + + 2. Call momentum to optimize on the cost. + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(dgc, ops::DGCOp, ops::DGCOpMaker); diff --git a/paddle/fluid/operators/dgc_op.cu b/paddle/fluid/operators/dgc_op.cu new file mode 100644 index 00000000..0f0bf441 --- /dev/null +++ b/paddle/fluid/operators/dgc_op.cu @@ -0,0 +1,20 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/dgc_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + dgc, ops::DGCOpKernel); diff --git a/paddle/fluid/operators/dgc_op.h b/paddle/fluid/operators/dgc_op.h new file mode 100644 index 00000000..8d1683bd --- /dev/null +++ b/paddle/fluid/operators/dgc_op.h @@ -0,0 +1,132 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "dgc/dgc.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" + +namespace paddle { +namespace operators { + +inline float get_period_sparcity(const std::vector& sparsity, + float cur_step, float rampup_steps) { + PADDLE_ENFORCE(static_cast(cur_step) >= 0); + + size_t idx = static_cast(cur_step * sparsity.size() / rampup_steps); + if (idx >= sparsity.size()) { + return 0.999; + } + + PADDLE_ENFORCE(idx < sparsity.size()); + return sparsity[idx]; +} + +template +class DGCOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto u = ctx.Input("U"); + auto v = ctx.Input("V"); + auto g = ctx.Input("Grad"); + + // attrs + float m = ctx.Attr("m"); + bool use_nesterov = ctx.Attr("use_nesterov"); + auto sparsity = ctx.Attr>("sparsity"); + auto rampup_begin_step = ctx.Attr("rampup_begin_step"); + auto rampup_step = ctx.Attr("rampup_step"); + + // current step + auto current_step_tensor = ctx.Input("current_step"); + const float* current_step = current_step_tensor->data(); + + if (static_cast(*current_step) < static_cast(rampup_begin_step)) { + VLOG(10) << "current_step:" << *current_step + << " < rampup_begin_step:" << rampup_begin_step + << " so does't use dgc"; + return; + } + + float ratio = + 1 - get_period_sparcity(sparsity, static_cast(*current_step), + rampup_step); + PADDLE_ENFORCE(ratio > 0.0 && ratio < 1.0); + int k = static_cast(g->numel() * ratio); + + VLOG(10) << "m:" << m << ", use_nesterov:" << use_nesterov + << ", rampup_begin_step:" << rampup_begin_step + << ", rampup_step:" << rampup_step + << ", current_step:" << *current_step << ", ratio:" << ratio + << ", k:" << k; + + auto k_out = ctx.Output("k"); + T* k_out_data = k_out->data(); + *k_out_data = k; + + auto u_out = ctx.Output("U_out"); + auto v_out = ctx.Output("V_out"); + auto encode_grad_out = ctx.Output("EncodeGrad"); + + // FIXME(gongwb): use cublas. + auto u_out_e = framework::EigenVector::Flatten(*u_out); + auto u_e = framework::EigenVector::Flatten(*u); + auto g_e = framework::EigenVector::Flatten(*g); + auto& dev_ctx = ctx.template device_context(); + auto& eigen_ctx = *dev_ctx.eigen_device(); + if (use_nesterov) { + // u = m * (u + g) + u_out_e.device(eigen_ctx) = m * (u_e + g_e); + + // v = u + v + g + ElementwiseComputeEx, DeviceContext, T>( + ctx, u, v, 0, AddFunctor(), v_out); + + ElementwiseComputeEx, DeviceContext, T>( + ctx, g, v, 0, AddFunctor(), v_out); + } else { + // u = m * u + g + u_out_e.device(eigen_ctx) = m * u_e + g_e; + + // v = u + v + ElementwiseComputeEx, DeviceContext, T>( + ctx, u, v, 0, AddFunctor(), v_out); + } + + T* v_out_data = v_out->mutable_data(ctx.GetPlace()); + T* u_out_data = u_out->mutable_data(ctx.GetPlace()); + T* encode_grad_out_data = encode_grad_out->mutable_data( + framework::DDim{2 * k}, ctx.GetPlace()); + + int buf_size = paddle::communication::dgc::get_buffer_size(k); + auto& allocator = platform::DeviceTemporaryAllocator::Instance().Get( + ctx.GetPlace(), dev_ctx.stream()); + auto tmp_ious_data = allocator.Allocate(buf_size); + void* buf = reinterpret_cast(tmp_ious_data->ptr()); + + if (!paddle::communication::dgc::k_select( + static_cast(encode_grad_out_data), k, v_out_data, + static_cast(v_out->numel()), buf, dev_ctx.stream(), + u_out_data)) { + LOG(FATAL) << "v_out numel:" << v_out->numel(); + } + + auto grad_out = ctx.Output("Grad_out"); + math::SetConstant tset; + tset(dev_ctx, grad_out, static_cast(0)); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/diag_op.cc b/paddle/fluid/operators/diag_op.cc new file mode 100644 index 00000000..5fb18a1d --- /dev/null +++ b/paddle/fluid/operators/diag_op.cc @@ -0,0 +1,60 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/diag_op.h" + +namespace paddle { +namespace operators { + +class DiagOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Diagonal"), + "Input(Diagonal) of DiagOp should not be null."); + + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of DiagOp should not be null."); + + auto s_dims = ctx->GetInputDim("Diagonal"); + PADDLE_ENFORCE(s_dims.size() == 1, + "The rank of Input(Diagonal) should only be 1."); + + ctx->SetOutputDim("Out", {s_dims[0], s_dims[0]}); + } +}; + +class DiagOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Diagonal", + "Diagonal values of square matrix. It is a tensor with rank 1."); + AddOutput("Out", "A square matrix."); + AddComment(R"DOC( + Return a square matrix with specified diagonal values. +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(diag, ops::DiagOp, ops::DiagOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL( + diag, ops::DiagKernel, + ops::DiagKernel, + ops::DiagKernel, + ops::DiagKernel); diff --git a/paddle/fluid/operators/diag_op.cu b/paddle/fluid/operators/diag_op.cu new file mode 100644 index 00000000..9fe1b83b --- /dev/null +++ b/paddle/fluid/operators/diag_op.cu @@ -0,0 +1,23 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/diag_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + diag, ops::DiagKernel, + ops::DiagKernel, + ops::DiagKernel, + ops::DiagKernel); diff --git a/paddle/fluid/operators/diag_op.h b/paddle/fluid/operators/diag_op.h new file mode 100644 index 00000000..f89415ae --- /dev/null +++ b/paddle/fluid/operators/diag_op.h @@ -0,0 +1,59 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +template +struct DiagFunctor { + DiagFunctor(const T* diagonal, int64_t numel, T* output) + : diagonal_(diagonal), numel_(numel), output_(output) {} + + HOSTDEVICE void operator()(size_t idx) const { + output_[idx * numel_ + idx] = diagonal_[idx]; + } + + const T* diagonal_; + int64_t numel_; + T* output_; +}; + +template +class DiagKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* diagonal = context.Input("Diagonal"); + auto* diag_data = diagonal->data(); + auto numel = diagonal->numel(); + auto* out = context.Output("Out"); + T* out_data = out->mutable_data(context.GetPlace()); + + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + set_zero(dev_ctx, out, static_cast(0)); + + platform::ForRange for_range(dev_ctx, numel); + DiagFunctor functor(diag_data, numel, out_data); + for_range(functor); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt new file mode 100644 index 00000000..8909135d --- /dev/null +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -0,0 +1,66 @@ +if(NOT WITH_DISTRIBUTE) + return() +endif() + +if(WITH_GRPC) + set(cc_generic_services "false") +else() + set(cc_generic_services "true") +endif() +configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY) + +cc_library(async_sparse_param_update_recorder SRCS async_sparse_param_update_recorder.cc DEPS enforce simple_threadpool) +cc_test(async_sparse_param_update_recorder_test SRCS async_sparse_param_update_recorder_test.cc DEPS async_sparse_param_update_recorder) + +# FIXME(typhoonzero): use add_subdirectory once we clean the dependency of these files +set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") +if(WITH_GRPC) + set(GRPC_DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf) + set(GRPC_SRCS grpc/grpc_client.cc grpc/grpc_server.cc grpc/grpc_serde.cc grpc/grpc_bytebuffer_stream.cc grpc/grpc_variable_response.cc) + grpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc + request_handler_impl.cc rpc_client.cc rpc_server.cc + variable_response.cc + collective_client.cc collective_server.cc + ${GRPC_SRCS} + PROTO send_recv.proto + DEPS lod_tensor selected_rows_functor memory scope ${GRPC_DEPS} async_sparse_param_update_recorder) + + set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set(RPC_DEPS sendrecvop_rpc ${GRPC_DEPS}) + + cc_test(grpc_serde_test SRCS grpc/grpc_serde_test.cc + DEPS ${RPC_DEPS} scope profiler math_function) + +else() + set(BRPC_SRCS brpc/brpc_client.cc brpc/brpc_server.cc brpc/brpc_sendrecvop_utils.cc brpc/brpc_variable_response.cc brpc/brpc_rdma_pool.cc) + set_source_files_properties(${BRPC_SRCS} parameter_prefetch.cc parameter_send.cc parameter_recv.cc communicator.cc rpc_server_test.cc brpc/brpc_serde_test.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + + set(BRPC_DEPS brpc ssl crypto protobuf leveldb snappystream snappy zlib) + + brpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc + request_handler_impl.cc rpc_client.cc rpc_server.cc + variable_response.cc + collective_client.cc collective_server.cc + ${BRPC_SRCS} + PROTO send_recv.proto + DEPS lod_tensor selected_rows memory scope ${BRPC_DEPS}) + + set(RPC_DEPS sendrecvop_rpc ${BRPC_DEPS}) + cc_test(brpc_serde_test SRCS brpc/brpc_serde_test.cc + DEPS ${RPC_DEPS} gflags glog executor proto_desc lookup_sparse_table_op) +endif() + + +cc_test(rpc_server_test SRCS rpc_server_test.cc + DEPS ${RPC_DEPS} executor proto_desc lookup_sparse_table_op) +cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope) +cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) +cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory) +cc_library(parameter_recv SRCS parameter_recv.cc DEPS sendrecvop_rpc memory) +cc_library(communicator SRCS communicator.cc DEPS scope selected_rows tensor variable_helper selected_rows_functor simple_threadpool parameter_send parameter_recv) +cc_test(communicator_test SRCS communicator_test.cc DEPS communicator) +if(WITH_GPU) + cc_test(collective_server_test SRCS collective_server_test.cc + DEPS sendrecvop_rpc executor ${RPC_DEPS} + selected_rows_functor scope math_function) +endif() diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc new file mode 100644 index 00000000..3f3b6b95 --- /dev/null +++ b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h" + +namespace paddle { +namespace operators { +namespace distributed { + +std::once_flag AsyncSparseParamUpdateRecorder::init_flag_; +std::unique_ptr + AsyncSparseParamUpdateRecorder::recorder_(nullptr); + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h new file mode 100644 index 00000000..eadd842c --- /dev/null +++ b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h @@ -0,0 +1,183 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include // NOLINT +#include +#include +#include +#include +#include +#include + +#include + +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace distributed { + +class ConcurrentSet { + public: + ConcurrentSet() : pool_(new ::ThreadPool(1)) {} + ~ConcurrentSet() {} + + std::future Update(const std::vector& rows) { + auto task = [this, rows] { + if (VLOG_IS_ON(3)) { + std::ostringstream sstream; + sstream << "["; + for (auto& id : rows) { + sstream << id << ", "; + } + sstream << "]"; + VLOG(3) << "update ids -> " << sstream.str(); + } + for (auto row : rows) { + set_.insert(row); + } + }; + return pool_->enqueue(std::move(task)); + } + + std::future GetAndClear(std::vector* result) { + auto task = [this, &result] { + result->clear(); + for (auto& id : set_) { + result->push_back(id); + } + if (VLOG_IS_ON(3)) { + std::ostringstream sstream; + sstream << "["; + for (auto& id : *result) { + sstream << id << ", "; + } + sstream << "]"; + VLOG(3) << "result ids size: " << result->size() << " " + << sstream.str(); + } + set_.clear(); + }; + return pool_->enqueue(std::move(task)); + } + + private: + std::unordered_set set_; + std::unique_ptr<::ThreadPool> pool_{nullptr}; +}; + +class AsyncSparseParamUpdateRecorder { + using TrainerToRows = std::vector>; + + public: + AsyncSparseParamUpdateRecorder( + int trainer_num, + const std::unordered_map& grad_to_param) + : trainer_num_(trainer_num), grad_to_param_(grad_to_param) { + if (VLOG_IS_ON(3)) { + std::ostringstream sstream; + sstream << "["; + for (auto& item : grad_to_param) { + sstream << item.first << ":" << item.second << ", "; + } + sstream << "]"; + VLOG(3) << "trainer_num: " << trainer_num + << " grad_to_param_: " << sstream.str(); + } + for (auto& iter : grad_to_param) { + param_to_grad_[iter.second] = iter.first; + auto& param_name = iter.second; + param_to_updated_rows_[param_name] = TrainerToRows(); + auto& trainer_to_rows = param_to_updated_rows_[param_name]; + for (auto i = 0; i < trainer_num; ++i) { + trainer_to_rows.emplace_back(new ConcurrentSet()); + } + } + } + + ~AsyncSparseParamUpdateRecorder() = default; + + void Update(const std::string& grad_name, + const std::vector& update_rows) { + VLOG(3) << "update grad: " << grad_name + << " row size: " << update_rows.size(); + auto& param_name = grad_to_param_.at(grad_name); + auto& trainer_to_rows = param_to_updated_rows_.at(param_name); + + std::vector> fs; + for (auto& set : trainer_to_rows) { + fs.push_back(set->Update(update_rows)); + } + for (auto& f : fs) { + f.wait(); + } + } + + void GetAndClear(const std::string& param_name, int trainer_id, + std::vector* result) { + VLOG(3) << "GetAndClear param: " << param_name + << " for trainer: " << trainer_id; + PADDLE_ENFORCE_LT(trainer_id, trainer_num_); + param_to_updated_rows_.at(param_name)[trainer_id] + ->GetAndClear(result) + .wait(); + } + + bool HasParam(const std::string& param_name) { + return param_to_grad_.find(param_name) != param_to_grad_.end(); + } + + bool HasGrad(const std::string& grad_name) { + return grad_to_param_.find(grad_name) != grad_to_param_.end(); + } + + private: + const int trainer_num_; + std::unordered_map grad_to_param_; + std::unordered_map param_to_grad_; + std::unordered_map param_to_updated_rows_; + + // init recorder + public: + static void Init( + int trainer_num, + const std::unordered_map& grad_to_param) { + InitImpl(trainer_num, grad_to_param); + } + + static AsyncSparseParamUpdateRecorder* GetInstance() { + return recorder_.get(); + } + + private: + // Init is called by GetInstance. + static void InitImpl( + int trainer_num, + const std::unordered_map& grad_to_param) { + if (recorder_ == nullptr) { + recorder_.reset( + new AsyncSparseParamUpdateRecorder(trainer_num, grad_to_param)); + } + } + + static std::once_flag init_flag_; + static std::unique_ptr recorder_; +}; + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc new file mode 100644 index 00000000..67e8fd8a --- /dev/null +++ b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h" + +#include + +#include "gtest/gtest.h" + +namespace paddle { +namespace operators { +namespace distributed { + +TEST(ConcurrentSet, All) { + ConcurrentSet concurrent_set; + std::vector in1 = {1, 2, 3, 4}; + std::vector in2 = {2, 3, 5, 6}; + + std::vector> futures; + futures.push_back(concurrent_set.Update(in1)); + futures.push_back(concurrent_set.Update(in2)); + + for (auto &f : futures) { + f.wait(); + } + + std::unordered_set in; + std::copy(in1.begin(), in1.end(), std::inserter(in, in.begin())); + std::copy(in2.begin(), in2.end(), std::inserter(in, in.begin())); + + std::vector ret; + concurrent_set.GetAndClear(&ret).wait(); + + std::unordered_set out; + std::copy(ret.begin(), ret.end(), std::inserter(out, out.begin())); + + EXPECT_EQ(in, out); + + concurrent_set.GetAndClear(&ret).wait(); + EXPECT_EQ(ret.size(), 0); +} + +TEST(AsyncSparseParamUpdateRecorder, All) { + std::unordered_map grad_to_param; + grad_to_param["grad1"] = "param1"; + grad_to_param["grad2"] = "param2"; + + int trainer_num = 10; + + AsyncSparseParamUpdateRecorder recorder(trainer_num, grad_to_param); + std::vector in1 = {1, 2, 3, 4}; + std::vector in2 = {2, 3, 5, 6}; + + std::unordered_set in; + std::copy(in1.begin(), in1.end(), std::inserter(in, in.begin())); + std::copy(in2.begin(), in2.end(), std::inserter(in, in.begin())); + + recorder.Update("grad1", in1); + recorder.Update("grad1", in2); + + EXPECT_TRUE(recorder.HasParam("param1")); + EXPECT_TRUE(recorder.HasParam("param2")); + EXPECT_FALSE(recorder.HasParam("param3")); + + EXPECT_TRUE(recorder.HasGrad("grad1")); + EXPECT_TRUE(recorder.HasGrad("grad2")); + EXPECT_FALSE(recorder.HasGrad("grad3")); + + std::vector ret; + EXPECT_ANY_THROW(recorder.GetAndClear("param1", trainer_num, &ret)); + + for (int i = 0; i < trainer_num; ++i) { + std::vector ret; + std::unordered_set out; + + recorder.GetAndClear("param1", i, &ret); + std::copy(ret.begin(), ret.end(), std::inserter(out, out.begin())); + + EXPECT_EQ(in, out); + + recorder.GetAndClear("param1", i, &ret); + EXPECT_EQ(ret.size(), 0); + } +} + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.cc b/paddle/fluid/operators/distributed/brpc/brpc_client.cc new file mode 100644 index 00000000..4c22ad8e --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc/brpc_client.cc @@ -0,0 +1,456 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/distributed/brpc/brpc_client.h" +#include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace operators { +namespace distributed { + +DEFINE_int32(timeout_ms, 30000, "RPC timeout in milliseconds"); +DEFINE_int32(max_retry, 3, "Max retries(not including the first RPC)"); + +BRPCClient::~BRPCClient() { Wait(); } + +void HandleSendResponse(brpc::Controller* cntl, sendrecv::VoidMessage* response, + VarHandlePtr var_h, ChannelQueuePtr ch_ptr, + ChannelContextPtr ch_ctx, BRPCClient* cls) { + // std::unique_ptr makes sure cntl/response will be deleted before returning. + std::unique_ptr cntl_guard(cntl); + std::unique_ptr response_guard(response); + + // this channel can be used by other now. + ch_ptr->Push(ch_ctx); + + if (cntl->Failed()) { + LOG(FATAL) << "Fail to send SendVar: " << var_h->name() + << ", error text: " << cntl->ErrorText(); + var_h->Finish(false); + cls->DecreaseReqCount(); + return; + } + var_h->Finish(true); + cls->DecreaseReqCount(); + + VLOG(4) << "HandleSendResponse from: " << cntl->remote_side() + << ", varname: " << var_h->name() + << ", latency: " << cntl->latency_us() << "us"; + VLOG(4) << "Finish HandleSendResponse"; +} + +VarHandlePtr BRPCClient::AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out) { + const platform::DeviceContext* p_ctx = &ctx; + const std::string ep_val = ep; + const std::string var_name_val = var_name; + const framework::Scope* p_scope = &scope; + const auto ch_ptr = GetChannel(ep_val); + const std::string method = kSendRPC; + VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); + + framework::AsyncIO([=] { + auto ch_ctx = ch_ptr->Pop(); + brpc::Controller* cntl = new brpc::Controller(); + sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); + cntl->set_timeout_ms(time_out); + + auto* var = p_scope->FindVar(var_name_val); + sendrecv::VariableMessage request; + distributed::SerializeToIOBuf(var_name_val, var, *p_ctx, &request, + &cntl->request_attachment(), "", false, + trainer_id_); + + google::protobuf::Closure* done = brpc::NewCallback( + &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); + + platform::RecordRPCEvent record_event(method); + + ch_ctx->stub->SendVariable(cntl, &request, response, done); + + if (UNLIKELY(platform::IsProfileEnabled())) { + var_h->Wait(); + } + }); + req_count_++; + + return var_h; +} +void HandleFetchBarrierResponse(brpc::Controller* cntl, + sendrecv::VariableMessage* response, + VarHandlePtr var_h, ChannelQueuePtr ch_ptr, + ChannelContextPtr ch_ctx, BRPCClient* cls) { + // std::unique_ptr makes sure cntl/response will be deleted before returning. + std::unique_ptr cntl_guard(cntl); + std::unique_ptr response_guard(response); + + // this channel can be used other now. + ch_ptr->Push(ch_ctx); + + if (cntl->Failed()) { + LOG(FATAL) << "Fail to get HandleFetchBarrierResponse: " << var_h->name() + << ", error text: " << cntl->ErrorText(); + var_h->Finish(false); + cls->DecreaseReqCount(); + return; + } + + var_h->Finish(true); + cls->DecreaseReqCount(); + + VLOG(4) << "HandleFetchBarrierResponse from: " << cntl->remote_side() + << ", varname: " << var_h->name() + << ", latency: " << cntl->latency_us() << "us"; + VLOG(4) << "Finish HandleFetchBarrierResponse"; +} +void HandleGetResponse(brpc::Controller* cntl, + sendrecv::VariableMessage* response, VarHandlePtr var_h, + ChannelQueuePtr ch_ptr, ChannelContextPtr ch_ctx, + BRPCClient* cls) { + // std::unique_ptr makes sure cntl/response will be deleted before returning. + std::unique_ptr cntl_guard(cntl); + std::unique_ptr response_guard(response); + + // this channel can be used other now. + ch_ptr->Push(ch_ctx); + + if (cntl->Failed()) { + LOG(FATAL) << "Fail to GetVar: " << var_h->name() + << ", error text: " << cntl->ErrorText(); + cls->DecreaseReqCount(); + var_h->Finish(false); + return; + } + + VLOG(4) << "HandleGetResponse from: " << cntl->remote_side() + << ", varname: " << var_h->name() + << ", latency: " << cntl->latency_us() << "us"; + + framework::Variable* outvar = nullptr; + int trainer_id; + distributed::DeserializeFromIOBuf(*response, cntl->response_attachment(), + *var_h->ctx(), var_h->scope(), &outvar, + &trainer_id); + VLOG(4) << "Finish HandleGetResponse"; + cls->DecreaseReqCount(); + var_h->Finish(true); +} + +VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + const std::string& out_var_name, + const std::string& method_name, + int64_t time_out) { + const platform::DeviceContext* p_ctx = &ctx; + const std::string ep_val = ep; + const std::string var_name_val = var_name; + const std::string out_varname_val = out_var_name; + const framework::Scope* p_scope = &scope; + const auto ch_ptr = GetChannel(ep_val); + const std::string method = kGetRPC; + VarHandlePtr var_h( + new VarHandle(ep, method, out_varname_val, p_ctx, p_scope)); + + framework::AsyncIO([=] { + auto ch_ctx = ch_ptr->Pop(); + + brpc::Controller* cntl = new brpc::Controller(); + sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); + cntl->set_timeout_ms(time_out); + + sendrecv::VariableMessage req; + req.set_varname(var_name_val); + req.set_out_varname(out_varname_val); + req.set_trainer_id(trainer_id_); + + google::protobuf::Closure* done = brpc::NewCallback( + &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); + + platform::RecordRPCEvent record_event(method); + + if (method_name == kGetMonomerRPC) { + ch_ctx->stub->GetMonomerVariable(cntl, &req, response, done); + } else if (method_name == kGetNoBarrierRPC) { + ch_ctx->stub->GetVariableNoBarrier(cntl, &req, response, done); + } else { + ch_ctx->stub->GetVariable(cntl, &req, response, done); + } + + if (UNLIKELY(platform::IsProfileEnabled())) { + var_h->Wait(); + } + }); + + req_count_++; + + return var_h; +} + +VarHandlePtr BRPCClient::AsyncGetVarNoBarrier( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + const std::string& out_var_name, int64_t time_out) { + std::string var_name_no_barrier = + string::Sprintf("%s%s", var_name, WITHOUT_BARRIER_MESSAGE); + + return _AsyncGetVar(ep, ctx, scope, var_name_no_barrier, out_var_name, + kGetNoBarrierRPC, time_out); +} + +VarHandlePtr BRPCClient::AsyncGetMonomerVariable( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out) { + return _AsyncGetVar(ep, ctx, scope, var_name, var_name, kGetMonomerRPC, + time_out); +} + +VarHandlePtr BRPCClient::AsyncGetMonomerBarrier(const std::string& ep, + const std::string& var_name, + int64_t time_out) { + return AsyncSendMessage(ep, kSendMonomerFetchBarrierRPC, var_name, time_out); +} + +VarHandlePtr BRPCClient::AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + const std::string& out_var_name, + const std::string& table_name, + int64_t time_out) { + return _AsyncGetVar(ep, ctx, scope, var_name, out_var_name, kGetRPC, + time_out); +} + +VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + const std::string& table_name, + int64_t time_out) { + const platform::DeviceContext* p_ctx = &ctx; + const std::string ep_val = ep; + const std::string in_var_name_val = in_var_name; + const std::string out_var_name_val = out_var_name; + const std::string table_name_val = table_name; + const framework::Scope* p_scope = &scope; + const auto ch_ptr = GetChannel(ep_val); + + const std::string method = kPrefetchRPC; + + VarHandlePtr var_h( + new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); + + framework::AsyncIO([=] { + auto ch_ctx = ch_ptr->Pop(); + + brpc::Controller* cntl = new brpc::Controller(); + sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); + cntl->set_timeout_ms(time_out); + + auto* var = p_scope->FindVar(in_var_name_val); + sendrecv::VariableMessage req; + distributed::SerializeToIOBuf(in_var_name_val, var, *p_ctx, &req, + &cntl->request_attachment(), out_var_name_val, + false, 0, table_name_val); + + platform::RecordRPCEvent record_event(method); + + google::protobuf::Closure* done = brpc::NewCallback( + &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); + + ch_ctx->stub->PrefetchVariable(cntl, &req, response, done); + + if (UNLIKELY(platform::IsProfileEnabled())) { + var_h->Wait(); + } + }); + + req_count_++; + return var_h; +} + +VarHandlePtr BRPCClient::AsyncSendBatchBarrier(const std::string& ep, + int64_t time_out) { + return AsyncSendMessage(ep, kBatchBarrierRPC, BATCH_BARRIER_MESSAGE, + time_out); +} + +VarHandlePtr BRPCClient::AsyncSendFetchBarrier(const std::string& ep, + int64_t time_out) { + auto ch_ptr = GetChannel(ep); + auto ch_ctx = ch_ptr->Pop(); + + brpc::Controller* cntl = new brpc::Controller(); + sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); + cntl->set_timeout_ms(time_out); + + sendrecv::VariableMessage req; + req.set_varname(FETCH_BARRIER_MESSAGE); + + const std::string method = kFetchBarrierRPC; + // var handle + VarHandlePtr var_h( + new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); + + platform::RecordRPCEvent record_event(method); + + google::protobuf::Closure* done = brpc::NewCallback( + &HandleFetchBarrierResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); + + ch_ctx->stub->GetVariable(cntl, &req, response, done); + + req_count_++; + + if (UNLIKELY(platform::IsProfileEnabled())) { + var_h->Wait(); + } + + return var_h; +} + +bool BRPCClient::Wait() { + VLOG(9) << "begin to brpcclient wait"; + { + std::unique_lock lk(sync_mutex_); + sync_cond_.wait(lk, [this] { return req_count_ == 0; }); + } + VLOG(9) << "end to brpcclient wait"; + return true; +} + +ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { + VLOG(4) << "begin to GetChannel:" << ep; + { + std::lock_guard guard(chan_mutex_); + auto it = channels_.find(ep); + if (it != channels_.end()) { + VLOG(4) << "end to GetChannel:" << ep; + return it->second; + } + } + + ChannelQueuePtr q(new framework::BlockingQueue()); + + brpc::ChannelOptions options; +#ifdef PADDLE_WITH_BRPC_RDMA + options.use_rdma = true; +#endif + options.protocol = "baidu_std"; + // don't use pooled type. the server can't afford that. + options.connection_type = "single"; + options.connect_timeout_ms = 1000; + options.timeout_ms = FLAGS_timeout_ms /*milliseconds*/; + options.max_retry = FLAGS_max_retry; + + VLOG(1) << "create " << brpc_channel_num_per_server_ + << " brpc channels to pserver:" << ep; + + for (int i = 0; i < brpc_channel_num_per_server_; ++i) { + std::shared_ptr c(new ChannelContext()); + if (c->channel.Init(ep.c_str(), &options) != 0) { + LOG(FATAL) << "Fail to initialize channel"; + return nullptr; + } + + c->stub.reset(new sendrecv::SendRecvService_Stub( + static_cast(&c->channel))); + q->Push(c); + } + + { + std::lock_guard guard(chan_mutex_); + channels_[ep] = q; + } + + VLOG(4) << "end to GetChannel:" << ep; + return q; +} + +VarHandlePtr BRPCClient::AsyncSendComplete(const std::string& ep, + int64_t time_out) { + return AsyncSendMessage(ep, kSendCompleteRPC, COMPLETE_MESSAGE, time_out); +} + +void BRPCClient::SendComplete() { + for (auto& kv : channels_) { + AsyncSendComplete(kv.first); + } +} + +VarHandlePtr BRPCClient::AsyncSendVarMessage( + const std::string& ep, const std::string& method_name, + const sendrecv::VariableMessage& req, int64_t time_out) { + auto ch_ptr = GetChannel(ep); + auto ch_ctx = ch_ptr->Pop(); + + brpc::Controller* cntl = new brpc::Controller(); + sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); + cntl->set_timeout_ms(time_out); + + platform::RecordRPCEvent record_event(method_name); + + VarHandlePtr var_h( + new VarHandle(ep, method_name, req.varname(), nullptr, nullptr)); + + google::protobuf::Closure* done = brpc::NewCallback( + &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); + + if (method_name == kCheckPointNotifyRPC) { + ch_ctx->stub->CheckpointNotify(cntl, &req, response, done); + } else if (method_name == kSendMonomerFetchBarrierRPC) { + ch_ctx->stub->GetMonomerBarrier(cntl, &req, response, done); + } else { + ch_ctx->stub->SendVariable(cntl, &req, response, done); + } + req_count_++; + + if (UNLIKELY(platform::IsProfileEnabled())) { + var_h->Wait(); + } + + return var_h; +} + +VarHandlePtr BRPCClient::AsyncSendMessage(const std::string& ep, + const std::string& method_name, + const std::string& message, + int64_t time_out) { + sendrecv::VariableMessage req; + req.set_varname(message); + + return AsyncSendVarMessage(ep, method_name, req, time_out); +} + +VarHandlePtr BRPCClient::AsyncCheckpointNotify(const std::string& ep, + const std::string& dir, + int64_t time_out) { + sendrecv::VariableMessage req; + req.set_varname(CHECKPOINT_SAVE_MESSAGE); + req.set_out_varname(dir); + + return AsyncSendVarMessage(ep, "CheckPointNotifyRPC", req, time_out); +} + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.h b/paddle/fluid/operators/distributed/brpc/brpc_client.h new file mode 100644 index 00000000..51864dfd --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc/brpc_client.h @@ -0,0 +1,173 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include // NOLINT +#include +#include +#include +#include +#include +#include // NOLINT +#include +#include +#include + +#include "brpc/channel.h" +#include "paddle/fluid/framework/blocking_queue.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" +#include "paddle/fluid/operators/distributed/request_handler.h" +#include "paddle/fluid/operators/distributed/rpc_client.h" +#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN + +namespace paddle { +namespace operators { +namespace distributed { + +struct ChannelContext { + brpc::Channel channel; + std::shared_ptr stub; +}; + +typedef std::shared_ptr ChannelContextPtr; +typedef std::shared_ptr> + ChannelQueuePtr; + +class BRPCClient : public RPCClient { + public: + BRPCClient() {} + virtual ~BRPCClient(); + + VarHandlePtr AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + const std::string& out_var_name, + const std::string& table_name = "", + int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncGetMonomerBarrier( + const std::string& ep, const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncGetMonomerVariable( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncGetVarNoBarrier(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + const std::string& out_varname, + int64_t time_out = FLAGS_rpc_deadline); + + VarHandlePtr AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + const std::string& table_name = "", + int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncSendBatchBarrier( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncSendFetchBarrier( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncCheckpointNotify( + const std::string& ep, const std::string& dir, + int64_t time_out = FLAGS_rpc_deadline) override; + + bool Wait() override; + + void SendComplete() override; + + private: + VarHandlePtr _AsyncGetVar( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + const std::string& out_var_name, const std::string& method_name, + const std::string& table_name, int64_t time_out = FLAGS_rpc_deadline); + + void Proceed(); + ChannelQueuePtr GetChannel(const std::string& ep); + + VarHandlePtr AsyncSendComplete(const std::string& ep, + int64_t time_out = FLAGS_rpc_deadline); + + VarHandlePtr AsyncSendMessage(const std::string& ep, + const std::string& method_name, + const std::string& message, int64_t time_out); + + VarHandlePtr AsyncSendVarMessage(const std::string& ep, + const std::string& method_name, + const sendrecv::VariableMessage& req, + int64_t time_out); + + friend void HandleSendResponse(brpc::Controller* cntl, + sendrecv::VoidMessage* response, + VarHandlePtr var_h, ChannelQueuePtr ch_ptr, + ChannelContextPtr ch_ctx, BRPCClient* cls); + + friend void HandleGetResponse(brpc::Controller* cntl, + sendrecv::VariableMessage* response, + VarHandlePtr var_h, ChannelQueuePtr ch_ptr, + ChannelContextPtr ch_ctx, BRPCClient* cls); + + friend void HandleFetchBarrierResponse(brpc::Controller* cntl, + sendrecv::VariableMessage* response, + VarHandlePtr var_h, + ChannelQueuePtr ch_ptr, + ChannelContextPtr ch_ctx, + BRPCClient* cls); + void DecreaseReqCount() { + if (--req_count_ <= 0) { + sync_cond_.notify_all(); + } + } + + private: + std::unordered_map channels_; + + // mutex for Wait client sync + std::mutex sync_mutex_; + std::condition_variable sync_cond_; + std::atomic req_count_{0}; + + static constexpr int brpc_channel_num_per_server_ = 4; + + // mutex for GetChannel thread safety + std::mutex chan_mutex_; + DISABLE_COPY_AND_ASSIGN(BRPCClient); +}; + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc b/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc new file mode 100644 index 00000000..d5c61400 --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef PADDLE_WITH_BRPC_RDMA + +#include "paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h" +#include "brpc/channel.h" +#include "brpc/rdma/rdma_helper.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace distributed { + +RdmaMemPool& RdmaMemPool::Instance() { + static RdmaMemPool* g_rdma_mem_pool = new RdmaMemPool(); + return *g_rdma_mem_pool; +} + +void* RdmaMemPool::Find(const std::string& varname, int64_t size) { + pthread_rwlock_rdlock(&access_); + auto it = pool_.find(varname); + if (it == pool_.end()) { + pthread_rwlock_unlock(&access_); + return nullptr; + } + + auto info = it->second; + if (info.data_size != size) { + pthread_rwlock_unlock(&access_); + PADDLE_ENFORCE(false, "var:%s size:%ld != %ld", varname, size, + info.data_size); + return nullptr; + } + + pthread_rwlock_unlock(&access_); + return info.data; +} + +void RdmaMemPool::Register(const std::string& varname, void* data, + int64_t data_size) { + void* old = Find(varname, data_size); + if (old != nullptr) { + if (data != old) { + PADDLE_ENFORCE(false, "var:%s data:%ld != %ld", varname, data, old); + } + VLOG(7) << "Find on rdma:" << varname << " data:" << data + << " data_size:" << data_size; + return; + } + + VarInfo info; + info.data = data; + info.data_size = data_size; + + pthread_rwlock_wrlock(&access_); + pool_[varname] = info; + pthread_rwlock_unlock(&access_); + + if (brpc::rdma::RegisterMemoryForRdma(data, data_size)) { + LOG(FATAL) << "register " << varname << " data:" << data + << " data_size:" << data_size << " error"; + } + + VLOG(4) << "register on rdma:" << varname << " data:" << data + << " data_size:" << data_size; +} + +} // namespace distributed +} // namespace operators +} // namespace paddle + +#endif diff --git a/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h b/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h new file mode 100644 index 00000000..156a93ec --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h @@ -0,0 +1,56 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#ifdef PADDLE_WITH_BRPC_RDMA + +#include // NOLINT +#include +#include + +namespace paddle { +namespace operators { +namespace distributed { + +/* + * This class is used to avoid duplicated registion of brpc::rdma. + */ +class RdmaMemPool { + public: + static RdmaMemPool& Instance(); + RdmaMemPool() : access_(PTHREAD_RWLOCK_INITIALIZER) {} + + virtual ~RdmaMemPool() { pthread_rwlock_destroy(&access_); } + + void Register(const std::string& varname, void* data, int64_t size); + void* Find(const std::string& varname, int64_t size); + + private: + struct VarInfo { + void* data; + int64_t data_size; + + VarInfo() : data(nullptr), data_size(0) {} + }; + + private: + std::unordered_map pool_; + pthread_rwlock_t access_; +}; + +} // namespace distributed +} // namespace operators +} // namespace paddle + +#endif diff --git a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc new file mode 100644 index 00000000..49e048f0 --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc @@ -0,0 +1,207 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_CUDA +#include +#endif +#include +#include +#include // NOLINT + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace operators { +namespace distributed { + +class IOBufWriter { + public: + static void Append(const std::string& varname, butil::IOBuf* iobuf, int k, + const char* v, int64_t vlen) { + if (vlen >= std::numeric_limits::max() || vlen < 0) { + LOG(FATAL) << "AppendZeroCopy varname:" << varname << ", vlen:" << vlen; + } + + iobuf->append(reinterpret_cast(&k), 4); + iobuf->append(reinterpret_cast(&vlen), 8); + iobuf->append(v, vlen); + } + + static void AppendTCPZeroCopy(butil::IOBuf* iobuf, int k, const char* v, + int64_t vlen, bool in_cuda_pinned, + void (*destroy)(void*), void* user_data) { + VLOG(7) << "AppendTCPZeroCopy " + << " k:" << k + << " data:" << static_cast(const_cast(v)) + << " data_size:" << vlen << " in_cuda_pinned:" << in_cuda_pinned; + + iobuf->append(reinterpret_cast(&k), 4); + iobuf->append(reinterpret_cast(&vlen), 8); + + // FIXME(gongwb): use append_zerocopy + /* + if (in_cuda_pinned) { + iobuf->append_zerocopy(v, vlen, IOBufWriter::FreeMemory); + } else { + iobuf->append_zerocopy(v, vlen, nullptr); + } + */ + iobuf->append(v, vlen); + destroy(user_data); + } + +#ifdef PADDLE_WITH_BRPC_RDMA + static void AppendRdmaZeroCopy(const std::string varname, butil::IOBuf* iobuf, + int k, const char* v, int64_t vlen, + bool in_cuda_pinned, void (*destroy)(void*), + void* user_data) { + VLOG(7) << "AppendRdmaZeroCopy varname:" << varname << " k:" << k + << " data:" << static_cast(const_cast(v)) + << " data_size:" << vlen << " in_cuda_pinned:" << in_cuda_pinned; + + iobuf->append(reinterpret_cast(&k), 4); + iobuf->append(reinterpret_cast(&vlen), 8); + + RdmaMemPool::Instance().Register( + varname, static_cast(const_cast(v)), vlen); + + // FIXME(gongwb): use append_zerocopy + // iobuf->append_zerocopy(v, vlen, nullptr); + iobuf->append(v, vlen); + destroy(user_data); + return; + } +#endif + + static void AppendZeroCopy(const std::string varname, butil::IOBuf* iobuf, + int k, const char* v, int64_t vlen, + bool in_cuda_pinned, void (*destroy)(void*), + void* user_data) { + if (vlen >= std::numeric_limits::max() || vlen < 0) { + LOG(FATAL) << "AppendZeroCopy varname:" << varname << ", vlen:" << vlen; + } + +#ifdef PADDLE_WITH_BRPC_RDMA + IOBufWriter::AppendRdmaZeroCopy(varname, iobuf, k, v, vlen, in_cuda_pinned, + destroy, user_data); +#else + IOBufWriter::AppendTCPZeroCopy(iobuf, k, v, vlen, in_cuda_pinned, destroy, + user_data); +#endif + } +}; + +void SerializeToIOBuf(const std::string& name, framework::Variable* var, + const platform::DeviceContext& ctx, VarMsg* request, + butil::IOBuf* iobuf, const std::string& out_varname, + bool var_is_not_stable, int trainer_id, + const std::string& table_name) { + std::unique_ptr payload; + + request->set_varname(name); + request->set_trainer_id(trainer_id); + // Note: normally the profiler is enabled in 1 trainer, hence only + // 1 trainer returns true for ShouldSendProfileState(). It tells PS + // servers the trainer's profiling state so that PS can follow the + // trainer. + if (platform::ShouldSendProfileState()) { + if (platform::IsProfileEnabled()) { + request->set_profile(platform::kEnableProfiler); + } else { + request->set_profile(platform::kDisableProfiler); + } + } + if (!out_varname.empty()) { + request->set_out_varname(out_varname); + } + if (!table_name.empty()) { + request->set_table_name(table_name); + } + if (var->IsType()) { + request->set_type(::sendrecv::LOD_TENSOR); + payload.reset(new TensorPayload(GetTensorPayload(var, ctx, request))); + } else if (var->IsType()) { + request->set_type(::sendrecv::SELECTED_ROWS); + payload.reset(new TensorPayload(GetSelectedRowsPayload(var, ctx, request))); +#ifdef PADDLE_WITH_CUDA + } else if (var->IsType()) { + request->set_type(::sendrecv::NCCL_ID); + const ncclUniqueId& uid = var->Get(); + // TODO(gongwb): use append_zero to avoid data copy. + IOBufWriter::Append(name, iobuf, + sendrecv::VariableMessage::kSerializedFieldNumber, + uid.internal, NCCL_UNIQUE_ID_BYTES); + return; +#endif + } else { + PADDLE_THROW("Serialize does not support type: %s", + typeid(var->Type()).name()); + } + + PADDLE_ENFORCE_NOT_NULL(payload); + + // FIXME(gongwb): it seems that can use zero copy. + if (var_is_not_stable) { + IOBufWriter::Append( + name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, + static_cast(payload->ptr()), payload->memory_size()); + } else { + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef PADDLE_WITH_CUDA + IOBufWriter::AppendZeroCopy( + name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, + static_cast(payload->ptr()), payload->memory_size(), + true, SerializeDestroyCallback, static_cast(payload.get())); + payload.release(); +#endif + } else { + IOBufWriter::AppendZeroCopy( + name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, + static_cast(payload->ptr()), payload->memory_size(), + false, SerializeDestroyCallback, static_cast(payload.get())); + payload.release(); + } + } + + if (var->IsType()) { + auto* slr = var->GetMutable(); + PADDLE_ENFORCE(VectorElemName(slr->rows()) == typeid(int64_t).name()); + size_t rows_memory_size = slr->rows().size() * sizeof(int64_t); + + IOBufWriter::Append(name, iobuf, + ::sendrecv::VariableMessage::kRowsFieldNumber, + reinterpret_cast(slr->rows().data()), + static_cast(rows_memory_size)); + } +} + +void DeserializeFromIOBuf(const ::sendrecv::VariableMessage& meta, + const butil::IOBuf& iobuf, + const platform::DeviceContext& ctx, + const framework::Scope* scope, + framework::Variable** var, int* trainer_id) { + operators::distributed::BRPCVariableResponse resp(scope, &ctx); + PADDLE_ENFORCE(resp.Parse(iobuf, meta) == 0, "parse iobuf to tensor error!"); + *var = resp.GetVar(); + *trainer_id = resp.GetTrainerId(); +} + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h new file mode 100644 index 00000000..a5bdc331 --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include + +#include "brpc/channel.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" + +namespace paddle { +namespace operators { +namespace distributed { + +void SerializeToIOBuf(const std::string& name, framework::Variable* var, + const platform::DeviceContext& ctx, VarMsg* request, + butil::IOBuf* iobuf, const std::string& out_varname, + bool var_is_not_stable, const int trainer_id = 0, + const std::string& table_name = std::string()); + +void DeserializeFromIOBuf(const VarMsg& meta, const butil::IOBuf& iobuf, + const platform::DeviceContext& ctx, + const framework::Scope* scope, + framework::Variable** var, int* trainer_id); + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc b/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc new file mode 100644 index 00000000..b902d3db --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc @@ -0,0 +1,175 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include // NOLINT + +#include "brpc/channel.h" +#include "google/protobuf/text_format.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/variable_response.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/printf.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace operators = paddle::operators; +namespace math = paddle::operators::math; +namespace memory = paddle::memory; + +void RunSerdeTestSelectedRows(platform::Place place) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + butil::IOBuf iobuf; + sendrecv::VariableMessage msg; + int tensor_numel = 564 * 128; + + // serialize var to IOBuf + { + framework::Variable var; + auto* slr = var.GetMutable(); + slr->set_height(1000); + auto* tensor = slr->mutable_value(); + auto* rows = slr->mutable_rows(); + tensor->Resize(framework::make_ddim({564, 128})); + tensor->mutable_data(place); + math::set_constant(ctx, tensor, 32.7); + for (int i = 0; i < 564; ++i) rows->push_back(i); + + operators::distributed::SerializeToIOBuf("myvar", &var, ctx, &msg, &iobuf, + "", false); + } + + // desrialize + { + framework::Scope scope; + scope.Var("myvar"); + operators::distributed::BRPCVariableResponse resp(&scope, &ctx); + EXPECT_EQ(resp.Parse(iobuf, msg), 0); + + framework::Variable* var2 = resp.GetVar(); + + auto* slr2 = var2->GetMutable(); + auto* tensor2 = slr2->mutable_value(); + auto* rows2 = slr2->mutable_rows(); + float* tensor_data2 = nullptr; + framework::Tensor tmp_tensor; + + if (platform::is_gpu_place(ctx.GetPlace())) { + platform::CPUPlace cpu; + framework::TensorCopy(*tensor2, cpu, &tmp_tensor); + tensor_data2 = tmp_tensor.data(); + } else { + tensor_data2 = const_cast(tensor2->data()); + } + const int64_t* rows_data2 = rows2->data(); + + for (int i = 0; i < tensor_numel; ++i) { + EXPECT_FLOAT_EQ(tensor_data2[i], 32.7); + } + for (size_t i = 0; i < rows2->size(); ++i) { + EXPECT_EQ(rows_data2[i], static_cast(i)); + } + EXPECT_EQ(slr2->height(), 1000); + } +} + +void RunTestLodTensor(platform::Place place) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + // serialize var to ByteBuffer + butil::IOBuf iobuf; + sendrecv::VariableMessage msg; + int tensor_numel = 512 * 8 * 4 * 2; + { + framework::Variable var; + auto* tensor = var.GetMutable(); + tensor->Resize(framework::make_ddim({512, 8, 4, 2})); + framework::LoD lod; + lod.push_back(framework::Vector({1, 3, 8})); + tensor->set_lod(lod); + tensor->mutable_data(place); + math::set_constant(ctx, tensor, 31.9); + + operators::distributed::SerializeToIOBuf("myvar", &var, ctx, &msg, &iobuf, + "", false); + } + + // check sendrecv::VariableMessage meta data + { + EXPECT_EQ(msg.varname(), "myvar"); + EXPECT_EQ(msg.type(), 0); + EXPECT_EQ(msg.dims()[0], 512); + EXPECT_EQ(msg.dims()[1], 8); + EXPECT_EQ(msg.dims()[2], 4); + EXPECT_EQ(msg.dims()[3], 2); + EXPECT_EQ(msg.lod_level(), 1); + EXPECT_EQ(msg.lod(0).lod_data(0), 1); + EXPECT_EQ(msg.lod(0).lod_data(1), 3); + EXPECT_EQ(msg.lod(0).lod_data(2), 8); + } + + // deserialize + { + framework::Scope scope; + scope.Var("myvar"); + operators::distributed::BRPCVariableResponse resp(&scope, &ctx); + EXPECT_EQ(resp.Parse(iobuf, msg), 0); + + framework::Variable* var2 = resp.GetVar(); + + auto tensor2 = var2->Get(); + float* tensor_data2 = nullptr; + framework::Tensor tmp_tensor; + + if (platform::is_gpu_place(ctx.GetPlace())) { + platform::CPUPlace cpu; + framework::TensorCopy(tensor2, cpu, &tmp_tensor); + tensor_data2 = tmp_tensor.data(); + } else { + tensor_data2 = const_cast(tensor2.data()); + } + + for (int i = 0; i < tensor_numel; ++i) + EXPECT_FLOAT_EQ(tensor_data2[i], 31.9); + } +} + +TEST(LodTensor, Run) { + platform::CPUPlace place; + RunTestLodTensor(place); +#ifdef PADDLE_WITH_CUDA + platform::CUDAPlace gpu(0); + RunTestLodTensor(gpu); +#endif +} + +TEST(SelectedRows, Run) { + platform::CPUPlace place; + RunSerdeTestSelectedRows(place); +#ifdef PADDLE_WITH_CUDA + platform::CUDAPlace gpu; + RunSerdeTestSelectedRows(gpu); +#endif +} diff --git a/paddle/fluid/operators/distributed/brpc/brpc_server.cc b/paddle/fluid/operators/distributed/brpc/brpc_server.cc new file mode 100644 index 00000000..fea9b094 --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc/brpc_server.cc @@ -0,0 +1,403 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/distributed/brpc/brpc_server.h" +#include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" +#include "paddle/fluid/operators/distributed/request_handler.h" + +namespace sendrecv { + +namespace distributed = paddle::operators::distributed; + +typedef std::unordered_map + HandlerMap; + +class BRPCServiceImpl : public SendRecvService { + public: + explicit BRPCServiceImpl(const HandlerMap& rpc_call_map, + distributed::RPCServer* rpc_server) + : rpc_server_(rpc_server) { + VLOG(3) << "BRPCServiceImpl size: " << rpc_call_map.size(); + auto it = rpc_call_map.find(distributed::kRequestSend); + if (it != rpc_call_map.end()) { + request_send_h_ = it->second; + send_threads_.reset(new paddle::framework::ThreadPool( + rpc_server_->GetThreadNum(distributed::kRequestSend))); + } + + it = rpc_call_map.find(distributed::kRequestGet); + if (it != rpc_call_map.end()) { + request_get_h_ = it->second; + get_threads_.reset(new paddle::framework::ThreadPool( + rpc_server_->GetThreadNum(distributed::kRequestGet))); + } + + it = rpc_call_map.find(distributed::kRequestGetNoBarrier); + if (it != rpc_call_map.end()) { + request_getnobarrier_h_ = it->second; + getnobarrier_threads_.reset(new paddle::framework::ThreadPool( + rpc_server_->GetThreadNum(distributed::kRequestGetNoBarrier))); + } + + it = rpc_call_map.find(distributed::kRequestPrefetch); + if (it != rpc_call_map.end()) { + request_prefetch_h_ = it->second; + prefetch_threads_.reset(new paddle::framework::ThreadPool( + rpc_server_->GetThreadNum(distributed::kRequestPrefetch))); + } + + it = rpc_call_map.find(distributed::kRequestCheckpoint); + if (it != rpc_call_map.end()) { + request_checkpoint_h_ = it->second; + checkpoint_notify_threads_.reset(new paddle::framework::ThreadPool( + rpc_server_->GetThreadNum(distributed::kRequestPrefetch))); + } + + it = rpc_call_map.find(distributed::kRequestGetMonomerVariable); + if (it != rpc_call_map.end()) { + request_get_monomer_handler_h_ = it->second; + } + + it = rpc_call_map.find(distributed::kRequestGetMonomerBarrier); + if (it != rpc_call_map.end()) { + request_get_monomer_barrier_handler_h_ = it->second; + } + } + + virtual ~BRPCServiceImpl() {} + void SendVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VoidMessage* response, + google::protobuf::Closure* done) override { + send_threads_->Run( + [=] { _SendVariable(cntl_butil, request, response, done); }); + } + + void _SendVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VoidMessage* response, + google::protobuf::Closure* done) { + PADDLE_ENFORCE(request_send_h_ != nullptr, + "RequestSend handler should be registed first!"); + brpc::ClosureGuard done_guard(done); + brpc::Controller* cntl = static_cast(cntl_butil); + + std::string varname = request->varname(); + VLOG(3) << "RequestSend var_name:" << varname + << ", trainer_id:" << request->trainer_id() + << ", from:" << cntl->remote_side(); + + distributed::BRPCVariableResponse resp(request_send_h_->scope(), + request_send_h_->dev_ctx(), + !request_send_h_->sync_mode()); + PADDLE_ENFORCE(resp.Parse(cntl->request_attachment(), *request) == 0, + "parse iobuf to tensor error!"); + + auto scope = resp.GetMutableLocalScope(); + auto invar = resp.GetVar(); + int trainer_id = request->trainer_id(); + paddle::framework::Variable* outvar = nullptr; + + request_send_h_->Handle(varname, scope, invar, &outvar, trainer_id); + } + + void GetVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VariableMessage* response, + google::protobuf::Closure* done) override { + get_threads_->Run( + [=] { _GetVariable(cntl_butil, request, response, done); }); + } + + void GetVariableNoBarrier(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, + VariableMessage* response, + google::protobuf::Closure* done) override { + getnobarrier_threads_->Run( + [=] { _GetVariableNoBarrier(cntl_butil, request, response, done); }); + } + + void _GetVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VariableMessage* response, + google::protobuf::Closure* done) { + PADDLE_ENFORCE(request_get_h_ != nullptr, + "RequestGet handler should be registed first!"); + + brpc::ClosureGuard done_guard(done); + brpc::Controller* cntl = static_cast(cntl_butil); + + std::string varname = request->varname(); + std::string out_varname = request->out_varname(); + VLOG(3) << "RequestGet varname:" << varname + << ", out_varname:" << out_varname + << ", trainer_id:" << request->trainer_id() + << ", from:" << cntl->remote_side(); + + auto scope = request_get_h_->scope(); + paddle::framework::Variable* invar = nullptr; + int trainer_id = request->trainer_id(); + paddle::framework::Variable* outvar = nullptr; + + request_get_h_->Handle(varname, scope, invar, &outvar, trainer_id, + out_varname); + + if (outvar) { + distributed::SerializeToIOBuf(out_varname, outvar, + *request_get_h_->dev_ctx(), response, + &cntl->response_attachment(), "", false); + } + } + + void _GetVariableNoBarrier(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, + VariableMessage* response, + google::protobuf::Closure* done) { + PADDLE_ENFORCE(request_getnobarrier_h_ != nullptr, + "RequestGetNoBarrier handler should be registed first!"); + + brpc::ClosureGuard done_guard(done); + brpc::Controller* cntl = static_cast(cntl_butil); + + std::string varname = request->varname(); + std::string out_varname = request->out_varname(); + int trainer_id = request->trainer_id(); + + VLOG(3) << "RequestGetNoBarrier varname:" << varname + << ", out_varname:" << out_varname << ", trainer_id:" << trainer_id + << ", from:" << cntl->remote_side(); + + auto scope = request_getnobarrier_h_->scope(); + paddle::framework::Variable* invar = nullptr; + paddle::framework::Variable* outvar = nullptr; + + request_getnobarrier_h_->Handle(varname, scope, invar, &outvar, trainer_id, + out_varname); + + if (outvar) { + distributed::SerializeToIOBuf( + out_varname, outvar, *request_getnobarrier_h_->dev_ctx(), response, + &cntl->response_attachment(), "", false); + } + } + + void PrefetchVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, + VariableMessage* response, + google::protobuf::Closure* done) override { + prefetch_threads_->Run( + [=] { _PrefetchVariable(cntl_butil, request, response, done); }); + } + + void _PrefetchVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, + VariableMessage* response, + google::protobuf::Closure* done) { + PADDLE_ENFORCE(request_prefetch_h_ != nullptr, + "kRequestPrefetch handler should be registed first!"); + + brpc::ClosureGuard done_guard(done); + brpc::Controller* cntl = static_cast(cntl_butil); + + // prefetch process... + std::string in_var_name = request->varname(); + std::string out_var_name = request->out_varname(); + VLOG(3) << "RequestPrefetch, in_var_name: " << in_var_name + << ", out_var_name: " << out_var_name + << ", trainer_id:" << request->trainer_id() + << ", from:" << cntl->remote_side(); + + distributed::BRPCVariableResponse resp( + request_prefetch_h_->scope(), request_prefetch_h_->dev_ctx(), true); + + PADDLE_ENFORCE(resp.Parse(cntl->request_attachment(), *request) == 0, + "parse iobuf to tensor error!"); + + auto scope = resp.GetMutableLocalScope(); + auto invar = scope->FindVar(in_var_name); + std::string table_name = request->table_name(); + int trainer_id = request->trainer_id(); + paddle::framework::Variable* outvar = scope->Var(out_var_name); + + request_prefetch_h_->Handle(in_var_name, scope, invar, &outvar, trainer_id, + out_var_name, table_name); + + distributed::SerializeToIOBuf(out_var_name, outvar, + *request_prefetch_h_->dev_ctx(), response, + &cntl->response_attachment(), "", true); + } + + void CheckpointNotify(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VoidMessage* response, + google::protobuf::Closure* done) override { + checkpoint_notify_threads_->Run( + [=] { _CheckpointNotify(cntl_butil, request, response, done); }); + } + + void _CheckpointNotify(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VoidMessage* response, + google::protobuf::Closure* done) { + PADDLE_ENFORCE( + request_checkpoint_h_ != nullptr, + "kRequestCheckpointNotify handler should be registed first!"); + + brpc::ClosureGuard done_guard(done); + brpc::Controller* cntl = static_cast(cntl_butil); + + distributed::BRPCVariableResponse resp(request_checkpoint_h_->scope(), + request_checkpoint_h_->dev_ctx()); + + auto scope = resp.GetMutableLocalScope(); + + std::string checkpoint_notify = request->varname(); + std::string checkpoint_dir = request->out_varname(); + int trainer_id = request->trainer_id(); + + VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify + << ", dir: " << checkpoint_dir + << ", trainer_id:" << request->trainer_id() + << ", from:" << cntl->remote_side(); + + request_checkpoint_h_->Handle(checkpoint_notify, scope, nullptr, nullptr, + trainer_id, checkpoint_dir); + } + + void GetMonomerVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, + VariableMessage* response, + google::protobuf::Closure* done) override { + PADDLE_ENFORCE( + request_get_monomer_handler_h_ != nullptr, + "kRequestGetMonomerVariable handler should be registed first!"); + + brpc::ClosureGuard done_guard(done); + brpc::Controller* cntl = static_cast(cntl_butil); + + // proc request. + std::string varname = request->varname(); + VLOG(3) << "GetMonomerVariable " << varname + << ", trainer_id:" << request->trainer_id() + << ", from:" << cntl->remote_side(); + + rpc_server_->WaitVarCond(varname); + distributed::MonomerHandle h = rpc_server_->GetMonomer(varname); + + auto scope = h.scope_; + auto invar = scope->FindVar(varname); + paddle::framework::Variable* outvar = nullptr; + + request_get_monomer_handler_h_->Handle(varname, scope, invar, &outvar, + request->trainer_id()); + + if (outvar) { + distributed::SerializeToIOBuf(varname, outvar, *h.dev_ctx_, response, + &cntl->response_attachment(), "", false); + } + } + + void GetMonomerBarrier(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VoidMessage* response, + google::protobuf::Closure* done) override { + PADDLE_ENFORCE( + request_get_monomer_barrier_handler_h_ != nullptr, + "RequestGetMonomerBarrier handler should be registed first!"); + + brpc::ClosureGuard done_guard(done); + brpc::Controller* cntl = static_cast(cntl_butil); + + std::string varname = request->varname(); + VLOG(3) << "RequestGetMonomerBarrier var_name:" << varname + << ", trainer_id:" << request->trainer_id() + << ", from:" << cntl->remote_side(); + + rpc_server_->WaitVarCond(varname); + distributed::MonomerHandle h = rpc_server_->GetMonomer(varname); + + paddle::framework::Scope* scope = nullptr; + paddle::framework::Variable* invar = nullptr; + paddle::framework::Variable* outvar = nullptr; + + request_get_monomer_barrier_handler_h_->Handle( + varname, scope, invar, &outvar, request->trainer_id()); + } + + private: + distributed::RequestHandler* request_send_h_{nullptr}; + distributed::RequestHandler* request_get_h_{nullptr}; + distributed::RequestHandler* request_getnobarrier_h_{nullptr}; + distributed::RequestHandler* request_prefetch_h_{nullptr}; + distributed::RequestHandler* request_checkpoint_h_{nullptr}; + distributed::RequestHandler* request_get_monomer_handler_h_{nullptr}; + distributed::RequestHandler* request_get_monomer_barrier_handler_h_{nullptr}; + + distributed::RPCServer* rpc_server_{nullptr}; + + // FIXME(gongwb): brpc should support process one rpc use one threadpool. + std::unique_ptr send_threads_; + std::unique_ptr get_threads_; + std::unique_ptr getnobarrier_threads_; + std::unique_ptr prefetch_threads_; + std::unique_ptr checkpoint_notify_threads_; +}; +} // namespace sendrecv + +namespace paddle { +namespace operators { +namespace distributed { + +void AsyncBRPCServer::StartServer() { + // Instance of your service. + sendrecv::BRPCServiceImpl service_impl(rpc_call_map_, this); + + // Add the service into server. Notice the second parameter, because the + // service is put on stack, we don't want server to delete it, otherwise + // use brpc::SERVER_OWNS_SERVICE. + if (server_.AddService(&service_impl, brpc::SERVER_DOESNT_OWN_SERVICE) != 0) { + LOG(FATAL) << "Fail to add service"; + return; + } + + brpc::ServerOptions options; +#ifdef PADDLE_WITH_BRPC_RDMA + options.use_rdma = true; +#endif + options.idle_timeout_sec = idle_timeout_s_; + options.max_concurrency = max_concurrency_; + if (server_.Start(bind_address_.c_str(), &options) != 0) { + LOG(FATAL) << "Fail to start EchoServer" << bind_address_; + return; + } + + butil::EndPoint ep = server_.listen_address(); + selected_port_ = ep.port; + + { + std::lock_guard lock(this->mutex_ready_); + ready_ = 1; + } + condition_ready_.notify_all(); + + server_.Join(); +} + +void AsyncBRPCServer::ShutDownImpl() { server_.Stop(1000); } + +void AsyncBRPCServer::WaitServerReady() { + VLOG(3) << "AsyncGRPCServer is wait server ready"; + std::unique_lock lock(this->mutex_ready_); + condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); + VLOG(3) << "AsyncGRPCServer WaitSeverReady"; +} + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_server.h b/paddle/fluid/operators/distributed/brpc/brpc_server.h new file mode 100644 index 00000000..78bbe5ad --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc/brpc_server.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include // NOLINT +#include // NOLINT +#include + +#include "brpc/server.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" +#include "paddle/fluid/operators/distributed/rpc_server.h" + +namespace paddle { +namespace operators { +namespace distributed { + +class AsyncBRPCServer final : public RPCServer { + public: + explicit AsyncBRPCServer(const std::string& address, int client_num) + : RPCServer(address, client_num), ready_(0) {} + + virtual ~AsyncBRPCServer() {} + void StartServer() override; + void WaitServerReady() override; + + private: + void ShutDownImpl() override; + + brpc::Server server_; + + static constexpr int idle_timeout_s_ = -1; + static constexpr int max_concurrency_ = 0; + + std::mutex mutex_ready_; + std::condition_variable condition_ready_; + int ready_; +}; + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc b/paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc new file mode 100644 index 00000000..eb78917a --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" + +namespace paddle { +namespace operators { +namespace distributed { + +namespace pb = ::google::protobuf; +using vr = ::sendrecv::VariableMessage; + +int BRPCVariableResponse::Parse(Source* source) { + pb::io::ZeroCopyInputStream* input_stream = source->contents(); + pb::io::CodedInputStream input(input_stream); + input.SetTotalBytesLimit(INT_MAX, INT_MAX); + + while (1) { + unsigned int tag = 0; + if (!input.ReadLittleEndian32(&tag)) { + break; + } + + uint64_t num_bytes = 0; + if (!input.ReadLittleEndian64(&num_bytes)) { + break; + } + + int field = static_cast(tag); + int ret = field == 0 ? -1 : field; + switch (field) { + case vr::kSerializedFieldNumber: { + if (!ProcSerializedField(field, &input, num_bytes)) { + return ret; + } + break; + } + case vr::kRowsFieldNumber: { + PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS || + meta_.type() == sendrecv::LOD_TENSOR) && + meta_.varname() != "", + "meta info should be got first!"); + + if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) { + return ret; + } + break; + } + default: { + PADDLE_ENFORCE(false, "not surpported %u fieldnumber", field); + return ret; + } + } + } + + return 0; +} +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_variable_response.h b/paddle/fluid/operators/distributed/brpc/brpc_variable_response.h new file mode 100644 index 00000000..6282f08a --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc/brpc_variable_response.h @@ -0,0 +1,67 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "brpc/channel.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type.h" + +#include "paddle/fluid/operators/distributed/distributed_pb.h" + +#include "google/protobuf/io/coded_stream.h" +#include "google/protobuf/io/zero_copy_stream.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/distributed/variable_response.h" + +namespace paddle { +namespace operators { +namespace distributed { + +class BRPCSourceWrapper : public Source { + public: + explicit BRPCSourceWrapper(const butil::IOBuf& iobuf) : source_(iobuf) {} + ::google::protobuf::io::ZeroCopyInputStream* contents() override { + return &source_; + } + + private: + butil::IOBufAsZeroCopyInputStream source_; +}; + +class BRPCVariableResponse : public VariableResponse { + public: + BRPCVariableResponse(const framework::Scope* scope, + const platform::DeviceContext* dev_ctx, + bool create_scope = false) + : VariableResponse(scope, dev_ctx, create_scope) {} + + virtual ~BRPCVariableResponse() {} + + // parse attachment from iobuf + int Parse(Source* source) override; + int Parse(const butil::IOBuf& iobuf, const sendrecv::VariableMessage& meta) { + BRPCSourceWrapper wrapper(iobuf); + return VariableResponse::Parse(&wrapper, meta); + } +}; + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_client.cc b/paddle/fluid/operators/distributed/collective_client.cc new file mode 100644 index 00000000..6d3f5343 --- /dev/null +++ b/paddle/fluid/operators/distributed/collective_client.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include // NOLINT +#include +#include "gflags/gflags.h" + +#include "paddle/fluid/operators/distributed/collective_client.h" + +DECLARE_int32(rpc_deadline); + +namespace paddle { +namespace operators { +namespace distributed { +std::once_flag CollectiveClient::init_flag_; +std::unique_ptr CollectiveClient::client_(nullptr); + +bool CollectiveClient::Gather(const std::vector& remote_vars, + std::vector* dst, + const platform::DeviceContext& ctx, + framework::Scope* scope, int64_t time_out) { + for (auto r : remote_vars) { + VLOG(50) << "begin gather from ep:" << r.String(); + scope->Var(r.var_name_)->GetMutable(); + VarHandlePtr ptr = rpc_client_->AsyncGetMonomerVariable( + r.ep_, ctx, *scope, r.var_name_, time_out); + } + + rpc_client_->Wait(); + + for (auto r : remote_vars) { + auto select_rows = + scope->FindVar(r.var_name_)->GetMutable(); + dst->push_back(select_rows); + + VLOG(4) << "gather from ep:" << r.String() + << ", select_rows:" << GetSelectedRowsInfo(*select_rows); + + rpc_client_->AsyncGetMonomerBarrier(r.ep_, r.var_name_); + } + + rpc_client_->Wait(); + return true; +} + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_client.h b/paddle/fluid/operators/distributed/collective_client.h new file mode 100644 index 00000000..6a3a450a --- /dev/null +++ b/paddle/fluid/operators/distributed/collective_client.h @@ -0,0 +1,93 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include // NOLINT +#include +#include +#include "gflags/gflags.h" + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/operators/distributed/distributed.h" +#include "paddle/fluid/operators/distributed/request_handler.h" + +DECLARE_int32(rpc_deadline); + +namespace paddle { +namespace operators { +namespace distributed { + +inline std::string GetSelectedRowsInfo(const framework::SelectedRows& slr) { + std::stringstream ss; + ss << ", height:" << slr.height() << ", rows:["; + for (unsigned int i = 0; i < slr.rows().size(); i++) { + if (i != slr.rows().size() - 1) { + ss << slr.rows()[i] << ","; + } else { + ss << slr.rows()[i]; + } + } + ss << "], dims:" << slr.value().dims(); + return ss.str(); +} + +struct RemoteVar { + std::string ep_; + std::string var_name_; + int trainer_id_{0}; + + std::string String() { + std::stringstream ss; + ss << "ep:" << ep_ << ", var_name:" << var_name_ + << ", trainer_id:" << trainer_id_; + + return ss.str(); + } +}; + +class CollectiveClient { + public: + CollectiveClient() { + rpc_client_.reset(new RPCCLIENT_T()); + rpc_client_->InitImpl(); + } + virtual ~CollectiveClient() {} + + // note this function will retain the rank order. + bool Gather(const std::vector& remote_vars, + std::vector* dst, + const platform::DeviceContext& ctx, framework::Scope* scope, + int64_t time_out = FLAGS_rpc_deadline); + + static CollectiveClient* GetInstance() { + std::call_once(init_flag_, [&]() { + if (client_.get() == nullptr) { + client_.reset(new CollectiveClient()); + } + }); + return client_.get(); + } + + private: + std::unique_ptr rpc_client_; + + static std::once_flag init_flag_; + static std::unique_ptr client_; +}; +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_server.cc b/paddle/fluid/operators/distributed/collective_server.cc new file mode 100644 index 00000000..c9565240 --- /dev/null +++ b/paddle/fluid/operators/distributed/collective_server.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include // for removing the port file +#include +#include +#include +#include // NOLINT +#include + +#include "paddle/fluid/operators/distributed/collective_server.h" + +DEFINE_int32(collective_get_thread_num, 5, "number of threads for rpc get"); + +namespace paddle { +namespace operators { +namespace distributed { + +std::once_flag CollectiveServer::init_flag_; +std::shared_ptr CollectiveServer::collective_server_(nullptr); + +CollectiveServer::CollectiveServer(const std::string& end_point, int fan_in) { + VLOG(1) << "Create colllective server:" << end_point << ", fan_in:" << fan_in; + rpc_server_.reset(new RPCSERVER_T(end_point, fan_in)); +} + +void CollectiveServer::Stop() { + rpc_server_->ShutDown(); + server_thread_->join(); + loop_thread_->join(); +} + +void CollectiveServer::StartServer() { + get_monomer_handler_.reset(new GetMonomerHandler()); + get_monomer_handler_->SetRPCServer(rpc_server_.get()); + + get_barrier_handler_.reset(new GetMonomerBarrierHandler()); + get_barrier_handler_->SetRPCServer(rpc_server_.get()); + + rpc_server_->RegisterRPC(distributed::kRequestGetMonomerVariable, + get_monomer_handler_.get(), + FLAGS_collective_get_thread_num); + rpc_server_->RegisterRPC(distributed::kRequestGetMonomerBarrier, + get_barrier_handler_.get(), 1); + + server_thread_.reset(new std::thread([&]() { rpc_server_->StartServer(); })); + rpc_server_->WaitServerReady(); + + loop_thread_.reset(new std::thread([&]() { + while (true) { + if (rpc_server_->IsExit()) { + LOG(WARNING) << "get exit!rpc_processor break!"; + break; + } + sleep(1); + } + VLOG(1) << "CollectiveServer loop_thread end"; + })); +} + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_server.h b/paddle/fluid/operators/distributed/collective_server.h new file mode 100644 index 00000000..03c688a7 --- /dev/null +++ b/paddle/fluid/operators/distributed/collective_server.h @@ -0,0 +1,110 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include // NOLINT +#include +#include + +#include "gflags/gflags.h" + +#include "paddle/fluid/operators/distributed/distributed.h" +#include "paddle/fluid/operators/distributed/request_handler.h" +#include "paddle/fluid/operators/distributed/request_handler_impl.h" +#include "paddle/fluid/operators/distributed/rpc_server.h" + +namespace paddle { +namespace operators { +namespace distributed { + +class CollectiveServer; + +class GetMonomerHandler final : public RequestHandler { + public: + GetMonomerHandler() : RequestHandler(true) {} + virtual ~GetMonomerHandler() {} + bool Handle(const std::string& var_name, framework::Scope* scope, + framework::Variable* var, framework::Variable** outvar, + const int trainer_id, const std::string& out_var_name = "", + const std::string& table_name = "") override { + VLOG(50) << "GetMonomerHandler recv " << var_name; + + *outvar = scope->FindVar(var_name); + PADDLE_ENFORCE(outvar != nullptr, "%s not found", var_name); + + return true; + } +}; + +class GetMonomerBarrierHandler final : public RequestHandler { + public: + GetMonomerBarrierHandler() : RequestHandler(true) {} + virtual ~GetMonomerBarrierHandler() {} + bool Handle(const std::string& var_name, framework::Scope* scope, + framework::Variable* var, framework::Variable** outvar, + const int trainer_id, const std::string& out_var_name = "", + const std::string& table_name = "") override { + VLOG(50) << "GetMonomerHandler recv " << var_name; + + rpc_server_->IncreaseVarBarrier(var_name); + + return true; + } +}; + +class CollectiveServer final { + public: + explicit CollectiveServer(const std::string& end_point, int fan_in); + + virtual ~CollectiveServer() {} + + void StartServer(); + + static CollectiveServer* GetInstance(const std::string& end_point, + int fan_in) { + std::call_once(init_flag_, [&]() { + if (collective_server_.get() == nullptr) { + collective_server_.reset(new CollectiveServer(end_point, fan_in)); + collective_server_->StartServer(); + } + }); + + return collective_server_.get(); + } + + std::shared_ptr GetRPCServer() { return rpc_server_; } + + void Stop(); + + private: + std::unique_ptr get_monomer_handler_; + std::unique_ptr get_barrier_handler_; + + std::shared_ptr rpc_server_; + std::shared_ptr server_thread_; + std::shared_ptr loop_thread_; + + bool ready_{false}; + + static std::once_flag init_flag_; + static std::shared_ptr collective_server_; +}; + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_server_test.cc b/paddle/fluid/operators/distributed/collective_server_test.cc new file mode 100644 index 00000000..90f2f9fd --- /dev/null +++ b/paddle/fluid/operators/distributed/collective_server_test.cc @@ -0,0 +1,130 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include // NOLINT + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor_util.h" + +#include "paddle/fluid/operators/distributed/collective_client.h" +#include "paddle/fluid/operators/distributed/collective_server.h" +#include "paddle/fluid/operators/distributed/distributed.h" +#include "paddle/fluid/operators/distributed/request_handler_impl.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace distributed = paddle::operators::distributed; + +std::unique_ptr StartServer( + const std::string& ep, int fan_in, framework::Scope* scope, + platform::DeviceContext* dev_ctx) { + distributed::CollectiveServer* server = + distributed::CollectiveServer::GetInstance(ep, fan_in); + + auto rpc_server = server->GetRPCServer(); + rpc_server->RegisterVar("var1", distributed::kRequestGetMonomerVariable, + scope, dev_ctx); + + std::cout << "StartServer return" << std::endl; + return std::unique_ptr(server); +} + +std::unique_ptr GenerateVars(platform::Place place) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + framework::Scope* scope = new framework::Scope(); + framework::Variable* var = scope->Var("var1"); + auto* slr = var->GetMutable(); + slr->set_height(20000); + + auto* tensor = slr->mutable_value(); + auto* rows = slr->mutable_rows(); + + tensor->Resize(framework::make_ddim({3, 1024})); + tensor->mutable_data(place); + + paddle::operators::math::set_constant(ctx, tensor, 32.7); + for (int i = 0; i < 3; ++i) rows->push_back(i); + + std::cout << "src:" << distributed::GetSelectedRowsInfo(*slr); + + return std::unique_ptr(scope); +} + +void Gather(const std::vector& vars, + platform::DeviceContext* dev_ctx) { + distributed::CollectiveClient* client = + distributed::CollectiveClient::GetInstance(); + + framework::Scope* scope = new framework::Scope(); + framework::Variable* var = scope->Var("var1"); + var->GetMutable(); + + std::vector dst; + client->Gather(vars, &dst, *dev_ctx, scope); + std::cout << "dst:" << distributed::GetSelectedRowsInfo(*dst[0]); + dev_ctx->Wait(); + + ASSERT_EQ(dst[0]->value().dims(), framework::make_ddim({3, 1024})); + ASSERT_EQ(dst[0]->height(), 20000); + ASSERT_EQ(dst[0]->rows().size(), static_cast(3)); + for (int i = 0; i < 3; i++) { + ASSERT_EQ(dst[0]->rows()[i], i); + } + + std::vector vec; + TensorToVector(dst[0]->value(), *dev_ctx, &vec); + for (size_t i = 0; i < 3 * 1024; i++) { + ASSERT_FLOAT_EQ(vec[i], 32.7); + } +} + +TEST(CollectiveServer, GPU) { + platform::CUDAPlace place; + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + std::string ep = "127.0.0.1:7164"; + auto scope = GenerateVars(place); + + auto* v1 = scope->FindVar("var1"); + std::cout << "var1:" << v1 << std::endl; + + auto server = StartServer(ep, 2, scope.get(), &ctx); + auto rpc_server = server->GetRPCServer(); + + distributed::RemoteVar var; + var.ep_ = ep; + var.var_name_ = "var1"; + var.trainer_id_ = 0; + + std::vector vars{var}; + Gather(vars, &ctx); + Gather(vars, &ctx); + + std::cout << "begin WaitVarBarrier" << std::endl; + rpc_server->WaitVarBarrier("var1"); + rpc_server->ClearRegisteredVars(); + server->Stop(); + + scope.release(); + server.release(); +} diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc new file mode 100644 index 00000000..af277d69 --- /dev/null +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -0,0 +1,325 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/distributed/communicator.h" + +#include +#include +#include // NOLINT +#include // NOLINT + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/operators/distributed/parameter_recv.h" +#include "paddle/fluid/operators/distributed/parameter_send.h" + +DEFINE_bool(communicator_independent_recv_thread, true, + "use an independent to recv vars from parameter server"); +DEFINE_int32(communicator_send_queue_size, 20, + "queue size to recv gradient before send"); +DEFINE_int32(communicator_min_send_grad_num_before_recv, 20, + "max grad num to send before recv parameters"); +DEFINE_int32(communicator_thread_pool_size, 5, "thread num to do send or recv"); +DEFINE_int32(communicator_send_wait_times, 5, + "times that send thread will wait if merge num does not reach " + "max_merge_var_num"); +DEFINE_int32(communicator_max_merge_var_num, 20, + "max var num to merge and send"); +DEFINE_bool(communicator_fake_rpc, false, + "fake mode does not really send any thing"); +DEFINE_bool(communicator_merge_sparse_grad, true, + "merge sparse gradient before sending"); + +namespace paddle { +namespace operators { +namespace distributed { + +inline double GetCurrentUS() { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; +} + +std::shared_ptr Communicator::communicator_(nullptr); + +Communicator::Communicator(const RpcCtxMap &send_varname_to_ctx, + const RpcCtxMap &recv_varname_to_ctx, + Scope *recv_scope) + : send_varname_to_ctx_(send_varname_to_ctx), + recv_varname_to_ctx_(recv_varname_to_ctx), + recv_scope_(recv_scope) { + // get all send information from graph, build vars_to_send + VLOG(0) << "communicator_independent_recv_thread: " + << FLAGS_communicator_independent_recv_thread; + VLOG(0) << "communicator_send_queue_size: " + << FLAGS_communicator_send_queue_size; + VLOG(0) << "communicator_min_send_grad_num_before_recv: " + << FLAGS_communicator_min_send_grad_num_before_recv; + VLOG(0) << "communicator_thread_pool_size: " + << FLAGS_communicator_thread_pool_size; + VLOG(0) << "communicator_send_wait_times: " + << FLAGS_communicator_send_wait_times; + VLOG(0) << "communicator_max_merge_var_num: " + << FLAGS_communicator_max_merge_var_num; + VLOG(0) << "communicator_fake_rpc: " << FLAGS_communicator_fake_rpc; + VLOG(0) << "communicator_merge_sparse_grad: " + << FLAGS_communicator_merge_sparse_grad; + send_scope_.reset(new Scope()); + for (auto &iter : send_varname_to_ctx_) { + send_varname_to_queue_[iter.first] = + std::make_shared>>( + FLAGS_communicator_send_queue_size); + } + send_threadpool_.reset(new ::ThreadPool(FLAGS_communicator_thread_pool_size)); + recv_threadpool_.reset(new ::ThreadPool(FLAGS_communicator_thread_pool_size)); +} + +Communicator::~Communicator() { + if (FLAGS_v >= 3) { + std::string msg("~Communicator"); + fwrite(msg.c_str(), msg.length(), 1, stdout); + } + running_ = false; + if (send_thread_) send_thread_->join(); + if (recv_thread_) recv_thread_->join(); + if (FLAGS_v >= 3) { + std::string msg("~Communicator done"); + fwrite(msg.c_str(), msg.length(), 1, stdout); + } +} + +void Communicator::SendThread() { + VLOG(3) << "SendThread start!"; + while (running_) { + std::vector> task_futures; + task_futures.reserve(send_varname_to_ctx_.size()); + VLOG(3) << "run send graph"; + auto before_run_send_graph = GetCurrentUS(); + for (auto &iter : send_varname_to_queue_) { + auto &var_name = iter.first; + auto &var_queue = iter.second; + if (var_queue->Size() > 0) { + auto send_task = [this, &var_name, &var_queue] { + VLOG(3) << var_name << " merge and send"; + std::vector> vars; + size_t merged_var_num = 0; + size_t wait_times = 0; + while (merged_var_num < FLAGS_communicator_max_merge_var_num) { + if (var_queue->Size() == 0) { + VLOG(3) << "wait_times -> " << wait_times; + if (wait_times >= FLAGS_communicator_send_wait_times) { + break; + } + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + wait_times++; + continue; + } else { + wait_times = 0; + + vars.push_back(var_queue->Pop()); + // only count the send number of the first var + if (var_name == send_varname_to_queue_.begin()->first) { + grad_num_.fetch_add(1, std::memory_order_relaxed); + } + merged_var_num++; + } + } + auto before_merge = GetCurrentUS(); + MergeVars(var_name, vars, send_scope_.get()); + auto after_merge = GetCurrentUS(); + VLOG(3) << "merge " << merged_var_num << " " << var_name + << " use time " << after_merge - before_merge; + auto send_functor = distributed::ParameterSend(); + auto &ctx = send_varname_to_ctx_.at(var_name); + if (!FLAGS_communicator_fake_rpc) { + send_functor(ctx, *send_scope_, true); + } + auto after_send = GetCurrentUS(); + VLOG(3) << "send " << var_name << " use time " + << after_send - after_merge; + }; + task_futures.emplace_back( + send_threadpool_->enqueue(std::move(send_task))); + } else { + VLOG(4) << var_name << " queue empty"; + } + } + for (auto &task_f : task_futures) { + task_f.wait(); + } + auto after_run_send_graph = GetCurrentUS(); + auto send_graph_use_time = after_run_send_graph - before_run_send_graph; + if (send_graph_use_time > 100) { + VLOG(1) << "run send graph use time " + << after_run_send_graph - before_run_send_graph; + } + if (!FLAGS_communicator_independent_recv_thread) { + RecvAll(); + } + } + VLOG(0) << "communicator stopped, send thread exit"; +} + +void Communicator::RecvAll() { + VLOG(3) << "parallel run recv graph"; + if (!running_) return; + auto before_send = GetCurrentUS(); + std::vector> task_futures; + task_futures.reserve(recv_varname_to_ctx_.size()); + for (auto &iter : recv_varname_to_ctx_) { + auto recv_task = [this, &iter] { + auto &var_name = iter.first; + VLOG(4) << "recv var " << var_name; + auto recv_functor = distributed::ParameterRecv(); + if (!FLAGS_communicator_fake_rpc) { + recv_functor(iter.second, *recv_scope_); + } + }; + task_futures.emplace_back(recv_threadpool_->enqueue(std::move(recv_task))); + } + for (auto &task : task_futures) { + task.wait(); + } + auto after_recv = GetCurrentUS(); + VLOG(1) << "run recv graph use time " << after_recv - before_send; +} + +void Communicator::RecvThread() { + VLOG(3) << "RecvThread start!"; + while (running_) { + auto grad_num = grad_num_.load(); + if (grad_num > FLAGS_communicator_min_send_grad_num_before_recv) { + VLOG(1) << "current grad num " << grad_num; + RecvAll(); + grad_num_.store(0); + } else { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } + } + VLOG(0) << "communicator stopped, recv thread exit"; +} + +void Communicator::Send(const std::string &var_name, + const framework::Scope &scope) { + VLOG(3) << "communicator send " << var_name; + // push var into send queue by var_name + auto *grad_var = scope.FindVar(var_name); + PADDLE_ENFORCE(grad_var->IsInitialized(), "grad var should be inited"); + if (grad_var->IsType() && + !FLAGS_communicator_merge_sparse_grad) { + auto send_functor = distributed::ParameterSend(); + auto &ctx = send_varname_to_ctx_.at(var_name); + if (!FLAGS_communicator_fake_rpc) { + send_functor(ctx, scope, true); + } + } else { + auto tmp_grad_var = std::make_shared(); + framework::CopyVariable(*grad_var, tmp_grad_var.get()); + auto &queue = send_varname_to_queue_.at(var_name); + VLOG(3) << "send " << var_name << " queue size " << queue->Size(); + queue->Push(tmp_grad_var); + } +} + +void Communicator::Init(const paddle::framework::ProgramDesc &program, + Scope *param_scope) { + using RpcCtxMap = operators::distributed::RpcCtxMap; + VLOG(3) << "ProcessGraph"; + RpcCtxMap send_varname_to_ctx; + RpcCtxMap recv_varname_to_ctx; + for (auto *op : program.Block(0).AllOps()) { + VLOG(3) << "node name " << op->Type(); + if (op->Type() == "send") { + auto send_var_name = op->Input("X")[0]; + auto send_varnames = boost::get>( + op->GetNullableAttr("send_varnames")); + auto epmap = + boost::get>(op->GetNullableAttr("epmap")); + auto height_section = + boost::get>(op->GetNullableAttr("sections")); + auto trainer_id = boost::get(op->GetNullableAttr("trainer_id")); + send_varname_to_ctx[send_var_name] = operators::distributed::RpcContext( + send_var_name, send_varnames, epmap, height_section, trainer_id); + VLOG(3) << "find and init an send op: " + << send_varname_to_ctx[send_var_name]; + } else if (op->Type() == "recv") { + auto do_not_run = boost::get(op->GetNullableAttr("do_not_run")); + PADDLE_ENFORCE_GT(do_not_run, 0, "recv should not run!"); + auto recv_var_name = op->Output("Out")[0]; + auto recv_varnames = boost::get>( + op->GetNullableAttr("recv_varnames")); + auto epmap = + boost::get>(op->GetNullableAttr("epmap")); + auto trainer_id = boost::get(op->GetNullableAttr("trainer_id")); + recv_varname_to_ctx[recv_var_name] = operators::distributed::RpcContext( + recv_var_name, recv_varnames, epmap, {}, trainer_id); + } + } + + // init communicator here + if (send_varname_to_ctx.size() == 0 && recv_varname_to_ctx.size() == 0) { + LOG(WARNING) << "no var need to send and recv!!"; + } + operators::distributed::Communicator::Init(send_varname_to_ctx, + recv_varname_to_ctx, param_scope); +} + +Communicator *Communicator::GetInstance() { return communicator_.get(); } + +std::shared_ptr Communicator::GetInstantcePtr() { + return communicator_; +} + +void Communicator::Start() { + VLOG(0) << "Communicator start"; + if (!communicator_) { + VLOG(0) << "Communicator is not inited, do nothing"; + } else { + VLOG(1) << "start send thread and recv thread"; + running_ = true; + // start send and recv thread + send_thread_.reset( + new std::thread(std::bind(&Communicator::SendThread, this))); + if (FLAGS_communicator_independent_recv_thread) { + recv_thread_.reset( + new std::thread(std::bind(&Communicator::RecvThread, this))); + } + } +} + +void Communicator::Stop() { + VLOG(0) << "Communicator stop"; + running_ = false; + if (!communicator_) { + VLOG(0) << "Communicator is not inited, do nothing"; + } else { + if (send_thread_) { + VLOG(1) << "stop send thread"; + send_thread_->join(); + send_thread_.reset(nullptr); + } + if (recv_thread_) { + VLOG(1) << "stop recv thread"; + recv_thread_->join(); + recv_thread_.reset(nullptr); + } + } + VLOG(0) << "Communicator stop done"; +} + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h new file mode 100644 index 00000000..6db02fc8 --- /dev/null +++ b/paddle/fluid/operators/distributed/communicator.h @@ -0,0 +1,218 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/operators/distributed/rpc_common.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { +namespace distributed { + +using Scope = framework::Scope; +using Variable = framework::Variable; + +template +class BlockingQueue { + public: + explicit BlockingQueue(size_t capacity) : capacity_(capacity) { + PADDLE_ENFORCE_GT(capacity_, 0, "The capacity must be greater than 0."); + } + + bool Push(const T& elem) { + { + std::unique_lock lock(mutex_); + cv_.wait(lock, [&] { return queue_.size() < capacity_; }); + PADDLE_ENFORCE_LT(queue_.size(), capacity_); + queue_.push_back(elem); + } + cv_.notify_one(); + return true; + } + + bool Push(T&& elem) { + { + std::unique_lock lock(mutex_); + cv_.wait(lock, [&] { return queue_.size() < capacity_; }); + PADDLE_ENFORCE_LT(queue_.size(), capacity_); + queue_.emplace_back(std::move(elem)); + } + cv_.notify_one(); + return true; + } + + T Pop() { + std::unique_lock lock(mutex_); + cv_.wait(lock, [=] { return !queue_.empty(); }); + T rc(std::move(queue_.front())); + queue_.pop_front(); + cv_.notify_one(); + return rc; + } + + size_t Cap() const { + std::lock_guard lock(mutex_); + return capacity_; + } + + size_t Size() const { + std::lock_guard lock(mutex_); + return queue_.size(); + } + + private: + const size_t capacity_; + std::deque queue_; + + mutable std::mutex mutex_; + std::condition_variable cv_; +}; + +template +using EigenVector = framework::EigenVector; + +inline void MergeVars(const std::string& var_name, + const std::vector>& vars, + Scope* scope) { + PADDLE_ENFORCE(!vars.empty(), "should have value to merge!"); + auto cpu_place = platform::CPUPlace(); + auto& var0 = vars[0]; + auto* out_var = scope->Var(var_name); + if (var0->IsType()) { + auto dims = var0->Get().dims(); + VLOG(3) << "merge " << var_name << " LoDTensor dims " << dims; + + // init output tensor + auto* out_t = out_var->GetMutable(); + out_t->mutable_data(dims, cpu_place); + + // check the input dims + for (auto& var : vars) { + auto& var_t = var->Get(); + PADDLE_ENFORCE_EQ(var_t.dims(), dims, "should have the same dims"); + } + + // set output tensor to 0. + auto cpu_ctx = paddle::platform::CPUDeviceContext(); + math::SetConstant + constant_functor; + constant_functor(cpu_ctx, out_t, static_cast(0)); + + // sum all vars to out + auto result = EigenVector::Flatten(*out_t); + for (auto& var : vars) { + auto& in_t = var->Get(); + auto in = EigenVector::Flatten(in_t); + result.device(*cpu_ctx.eigen_device()) = result + in; + } + } else if (var0->IsType()) { + auto& slr0 = var0->Get(); + auto* out_slr = out_var->GetMutable(); + out_slr->mutable_rows()->clear(); + out_slr->mutable_value()->mutable_data({{}}, cpu_place); + std::vector inputs; + inputs.reserve(vars.size()); + for (auto& var : vars) { + inputs.push_back(&var->Get()); + } + math::scatter::MergeAdd + merge_add; + auto dev_ctx = paddle::platform::CPUDeviceContext(); + merge_add(dev_ctx, inputs, out_slr, false); + VLOG(3) << "merge " << var_name << " SelectedRows height: " << slr0.height() + << " dims: " << slr0.value().dims(); + } else { + PADDLE_THROW("unsupported var type!"); + } +} + +using RpcCtxMap = std::unordered_map; + +class Communicator { + public: + Communicator(const RpcCtxMap& send_varname_to_ctx, + const RpcCtxMap& recv_varname_to_ctx, Scope* recv_scope); + + ~Communicator(); + + void Start(); + void Stop(); + + bool IsRunning() { return running_; } + + // send grad + void Send(const std::string& var_name, const framework::Scope& scope); + + private: + // recv all parameter + void RecvAll(); + void SendThread(); + void RecvThread(); + + bool running_ = false; + std::unordered_map>>> + send_varname_to_queue_; + RpcCtxMap send_varname_to_ctx_; + RpcCtxMap recv_varname_to_ctx_; + std::unique_ptr send_thread_{nullptr}; + std::unique_ptr recv_thread_{nullptr}; + Scope* recv_scope_; // should be global scope + std::unique_ptr send_scope_; // an independent scope + std::unique_ptr<::ThreadPool> send_threadpool_{nullptr}; + std::unique_ptr<::ThreadPool> recv_threadpool_{nullptr}; + std::atomic_uint grad_num_{0}; // the num of gradient sent since last recv + + // the following code is for initialize the commnunicator + public: + static void Init(const RpcCtxMap& send_varname_to_ctx, + const RpcCtxMap& recv_varname_to_ctx, Scope* recv_scope) { + if (communicator_ == nullptr) { + communicator_.reset(new Communicator(send_varname_to_ctx, + recv_varname_to_ctx, recv_scope)); + } + } + + static void Init(const paddle::framework::ProgramDesc& program, + Scope* param_scope); + + static Communicator* GetInstance(); + + static std::shared_ptr GetInstantcePtr(); + + private: + static std::shared_ptr communicator_; +}; + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/communicator_test.cc b/paddle/fluid/operators/distributed/communicator_test.cc new file mode 100644 index 00000000..5294ac33 --- /dev/null +++ b/paddle/fluid/operators/distributed/communicator_test.cc @@ -0,0 +1,110 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include "paddle/fluid/operators/distributed/communicator.h" + +namespace paddle { +namespace operators { +namespace distributed { + +using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; + +TEST(communicator, merge_lod_tensors) { + auto cpu_place = platform::CPUPlace(); + auto dims = framework::make_ddim({2, 3}); + std::vector> in_vars; + float out_value = 0; + for (auto i = 0; i < 10; ++i) { + auto var = std::make_shared(); + in_vars.emplace_back(var); + auto *tensor = var->GetMutable(); + auto *data = tensor->mutable_data(dims, cpu_place); + for (auto j = 0; j < tensor->numel(); ++j) { + data[j] = static_cast(i); + } + out_value += static_cast(i); + } + const std::string out_name = "Out"; + std::unique_ptr scope; + scope.reset(new framework::Scope()); + scope->Var(out_name); + for (auto i = 0; i < 10; ++i) { + MergeVars(out_name, in_vars, scope.get()); + } + auto &out_tensor = scope->FindVar(out_name)->Get(); + auto *out_data = out_tensor.data(); + ASSERT_EQ(out_tensor.dims(), dims); + for (auto i = 0; i < out_tensor.numel(); ++i) { + ASSERT_EQ(out_data[i], out_value); + } +} + +TEST(communicator, merge_selected_rows) { + auto cpu_place = platform::CPUPlace(); + int64_t width = 10; + std::vector> in_vars; + const int64_t height = 100; + for (auto i = 0; i < 10; ++i) { + std::vector rows; + for (auto k = 0; k <= i; ++k) { + rows.push_back(k); + } + auto var = std::make_shared(); + in_vars.emplace_back(var); + auto *slr = var->GetMutable(); + slr->set_height(height); + slr->set_rows(rows); + auto dims = + framework::make_ddim({static_cast(rows.size()), width}); + auto *data = slr->mutable_value()->mutable_data(dims, cpu_place); + for (auto i = 0; i < rows.size(); ++i) { + for (auto j = 0; j < width; ++j) { + data[i * width + j] = static_cast(rows[i]); + } + } + } + const std::string out_name = "Out"; + std::unique_ptr scope; + scope.reset(new framework::Scope()); + scope->Var(out_name); + for (auto i = 0; i < 10; ++i) { + MergeVars(out_name, in_vars, scope.get()); + } + auto &out_slr = scope->FindVar(out_name)->Get(); + auto &out_t = out_slr.value(); + auto *out_data = out_t.data(); + ASSERT_EQ(out_t.dims(), framework::make_ddim({10, width})); + std::vector out_values; + out_values.reserve(10); + for (auto i = 0; i < 10; ++i) { + out_values.push_back(static_cast(i * (10 - i))); + } + for (auto i = 0; i < out_slr.rows().size(); ++i) { + ASSERT_EQ(out_slr.rows()[i], i); + for (auto j = 0; j < width; ++j) { + ASSERT_EQ(out_data[i * width + j], out_values[i]); + } + } +} + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/distributed.h b/paddle/fluid/operators/distributed/distributed.h new file mode 100644 index 00000000..3a9f9225 --- /dev/null +++ b/paddle/fluid/operators/distributed/distributed.h @@ -0,0 +1,35 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifdef PADDLE_WITH_DISTRIBUTE + +#ifdef PADDLE_WITH_GRPC + +#include "paddle/fluid/operators/distributed/grpc/grpc_client.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_server.h" +#define RPCSERVER_T paddle::operators::distributed::AsyncGRPCServer +#define RPCCLIENT_T paddle::operators::distributed::GRPCClient + +#else // PADDLE_WITH_GRPC + +#include "paddle/fluid/operators/distributed/brpc/brpc_client.h" +#include "paddle/fluid/operators/distributed/brpc/brpc_server.h" +#define RPCSERVER_T paddle::operators::distributed::AsyncBRPCServer +#define RPCCLIENT_T paddle::operators::distributed::BRPCClient + +#endif // PADDLE_WITH_GRPC + +#endif // PADDLE_WITH_DISTRIBUTE diff --git a/paddle/fluid/operators/distributed/distributed_pb.h b/paddle/fluid/operators/distributed/distributed_pb.h new file mode 100644 index 00000000..f1c662be --- /dev/null +++ b/paddle/fluid/operators/distributed/distributed_pb.h @@ -0,0 +1,30 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifdef PADDLE_WITH_DISTRIBUTE + +#ifdef PADDLE_WITH_GRPC + +#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" + +#else // PADDLE_WITH_GRPC + +#include "paddle/fluid/operators/distributed/send_recv.pb.h" + +#endif // PADDLE_WITH_GRPC + +#endif // PADDLE_WITH_DISTRIBUTE diff --git a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc new file mode 100644 index 00000000..c2cb0d7f --- /dev/null +++ b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc @@ -0,0 +1,88 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +// NOTE: This file was originally created by tensorflow +// (https://github.com/tensorflow/tensorflow/) we borrow this +// file and did some modifications so that we can send gRPC +// requests without too much copying of the tensor data. + +#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h" + +namespace paddle { +namespace operators { +namespace distributed { + +GrpcByteBufferSource::GrpcByteBufferSource() {} + +bool GrpcByteBufferSource::Init(const grpc::ByteBuffer& src) { + cur_ = -1; + left_ = 0; + ptr_ = nullptr; + byte_count_ = 0; + bool ok = src.Dump(&slices_).ok(); + if (!ok) { + slices_.clear(); + } + return ok; +} + +bool GrpcByteBufferSource::Next(const void** data, int* size) { + // Use loop instead of if in case buffer contained empty slices. + while (left_ == 0) { + // Advance to next slice. + cur_++; + if (cur_ >= slices_.size()) { + return false; + } + const ::grpc::Slice& s = slices_[cur_]; + left_ = s.size(); + ptr_ = reinterpret_cast(s.begin()); + } + + *data = ptr_; + *size = left_; + byte_count_ += left_; + ptr_ += left_; + left_ = 0; + return true; +} + +void GrpcByteBufferSource::BackUp(int count) { + ptr_ -= count; + left_ += count; + byte_count_ -= count; +} + +bool GrpcByteBufferSource::Skip(int count) { + const void* data; + int size; + while (Next(&data, &size)) { + if (size >= count) { + BackUp(size - count); + return true; + } + // size < count; + count -= size; + } + // error or we have too large count; + return false; +} + +google::protobuf::int64 GrpcByteBufferSource::ByteCount() const { + return byte_count_; +} + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h new file mode 100644 index 00000000..e9074574 --- /dev/null +++ b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h @@ -0,0 +1,170 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +// NOTE: This file was originally created by tensorflow +// (https://github.com/tensorflow/tensorflow/) we borrow this +// file and did some modifications so that we can send gRPC +// requests without too much copying of the tensor data. + +#pragma once + +#include + +#include "google/protobuf/io/coded_stream.h" +#include "google/protobuf/io/zero_copy_stream.h" +#include "grpc++/grpc++.h" +#include "paddle/fluid/operators/distributed/variable_response.h" + +namespace grpc { +// A ZeroCopyInputStream that reads from grpc_byte_buffer +class GrpcBufferReader final + : public ::google::protobuf::io::ZeroCopyInputStream { + typedef void (CoreCodegenInterface::*OldReaderInitAPI)( + grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer); + typedef int (CoreCodegenInterface::*NewReaderInitAPI)( + grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer); + void ReaderInit(OldReaderInitAPI ptr, grpc_byte_buffer_reader* reader, + grpc_byte_buffer* buffer) { + (g_core_codegen_interface->*ptr)(reader, buffer); + } + void ReaderInit(NewReaderInitAPI ptr, grpc_byte_buffer_reader* reader, + grpc_byte_buffer* buffer) { + int result = (g_core_codegen_interface->*ptr)(reader, buffer); + (void)result; + } + + public: + explicit GrpcBufferReader(grpc_byte_buffer* buffer) + : byte_count_(0), backup_count_(0) { + ReaderInit(&CoreCodegenInterface::grpc_byte_buffer_reader_init, &reader_, + buffer); + } + ~GrpcBufferReader() override { + g_core_codegen_interface->grpc_byte_buffer_reader_destroy(&reader_); + } + + bool Next(const void** data, int* size) override { + if (backup_count_ > 0) { + *data = GRPC_SLICE_START_PTR(slice_) + GRPC_SLICE_LENGTH(slice_) - + backup_count_; + GPR_CODEGEN_ASSERT(backup_count_ <= INT_MAX); + *size = static_cast(backup_count_); + backup_count_ = 0; + return true; + } + if (!g_core_codegen_interface->grpc_byte_buffer_reader_next(&reader_, + &slice_)) { + return false; + } + g_core_codegen_interface->grpc_slice_unref(slice_); + *data = GRPC_SLICE_START_PTR(slice_); + // On win x64, int is only 32bit + GPR_CODEGEN_ASSERT(GRPC_SLICE_LENGTH(slice_) <= INT_MAX); + byte_count_ += * size = static_cast(GRPC_SLICE_LENGTH(slice_)); + return true; + } + + void BackUp(int count) override { backup_count_ = count; } + + bool Skip(int count) override { + const void* data; + int size; + while (Next(&data, &size)) { + if (size >= count) { + BackUp(size - count); + return true; + } + // size < count; + count -= size; + } + // error or we have too large count; + return false; + } + + ::google::protobuf::int64 ByteCount() const override { + return byte_count_ - backup_count_; + } + + private: + int64_t byte_count_; + int64_t backup_count_; + grpc_byte_buffer_reader reader_; + grpc_slice slice_; +}; + +}; // namespace grpc + +namespace paddle { +namespace operators { +namespace distributed { + +// A ZeroCopyInputStream that reads from a grpc::ByteBuffer. +class GrpcByteBufferSource + : public ::google::protobuf::io::ZeroCopyInputStream { + public: + GrpcByteBufferSource(); + bool Init(const ::grpc::ByteBuffer& src); // Can be called multiple times. + bool Next(const void** data, int* size) override; + void BackUp(int count) override; + bool Skip(int count) override; + ::google::protobuf::int64 ByteCount() const override; + + private: + std::vector<::grpc::Slice> slices_; + size_t cur_; // Current slice index. + int left_; // Number of bytes in slices_[cur_] left to yield. + const char* ptr_; // Address of next byte in slices_[cur_] to yield. + ::google::protobuf::int64 byte_count_; +}; + +class GrpcByteBufferSourceWrapper : public Source { + public: + explicit GrpcByteBufferSourceWrapper(GrpcByteBufferSource* source) + : source_(source) {} + ::google::protobuf::io::ZeroCopyInputStream* contents() override { + return source_; + } + + private: + GrpcByteBufferSource* source_; +}; + +class GrpcByteSource : public Source { + public: + explicit GrpcByteSource(grpc_byte_buffer* buffer) : buffer_(buffer) {} + ~GrpcByteSource() override { DeleteStream(); } + + typedef ::grpc::GrpcBufferReader Reader; + + ::google::protobuf::io::ZeroCopyInputStream* contents() override { + DeleteStream(); + stream_ = new (&space_) Reader(buffer_); + return stream_; + } + + private: + void DeleteStream() { + if (stream_) { + stream_->~Reader(); + } + } + + grpc_byte_buffer* buffer_; // Not owned + Reader* stream_ = nullptr; // Points into space_ if non-nullptr + char space_[sizeof(Reader)]; +}; + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.cc b/paddle/fluid/operators/distributed/grpc/grpc_client.cc new file mode 100644 index 00000000..8504110c --- /dev/null +++ b/paddle/fluid/operators/distributed/grpc/grpc_client.cc @@ -0,0 +1,481 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "glog/logging.h" // For VLOG +#include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_client.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" +#include "paddle/fluid/operators/distributed/request_handler.h" +#include "paddle/fluid/platform/port.h" +#include "paddle/fluid/platform/profiler.h" + +DECLARE_bool(rpc_disable_reuse_port); + +namespace paddle { +namespace operators { +namespace distributed { + +void GRPCClient::InitImpl() { + // start the client process thread + // TODO(wuyi): can make this in a threadpool + PADDLE_ENFORCE(client_thread_ == nullptr, + "please not re init proceed thread"); + client_thread_.reset(new std::thread(std::bind(&GRPCClient::Proceed, this))); +} + +void GRPCClient::SendComplete() { + std::unique_lock lk(completed_mutex_); + if (!completed_) { + for (auto& it : channels_) { + VLOG(3) << "send complete message to " << it.first; + this->AsyncSendComplete(it.first); + } + PADDLE_ENFORCE(this->Wait(), "internal grpc error"); + completed_ = true; + } +} + +GRPCClient::~GRPCClient() { + stopped_ = true; + Wait(); + cq_.Shutdown(); + { + std::lock_guard guard(chan_mutex_); + for (auto& it : channels_) { + it.second.reset(); + } + channels_.clear(); + } + client_thread_->join(); +} + +VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out) { + const platform::DeviceContext* p_ctx = &ctx; + const std::string ep_val = ep; + const std::string var_name_val = var_name; + const framework::Scope* p_scope = &scope; + const auto ch = GetChannel(ep_val); + SendProcessor* s = new SendProcessor(ch); + const std::string method = kSendRPC; + VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); + s->Prepare(h, time_out); + + framework::AsyncIO([var_name_val, p_scope, p_ctx, s, method, h, this] { + auto* var = p_scope->FindVar(var_name_val); + + ::grpc::ByteBuffer req; + SerializeToByteBuffer(var_name_val, var, *p_ctx, &req, "", trainer_id_); + + VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; + + // stub context + s->response_call_back_ = nullptr; + + platform::RecordRPCEvent record_event(method); + + auto call = s->stub_g_.PrepareUnaryCall( + s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_); + call->StartCall(); + call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); + + if (UNLIKELY(platform::IsProfileEnabled())) { + h->Wait(); + } + }); + req_count_++; + + return h; +} + +void ProcGetResponse(const VarHandle& var_h, + const ::grpc::ByteBuffer& ret_msg) { + VLOG(4) << "ProcGetResponse"; + framework::Variable* outvar = nullptr; + // get response's trainer_id is not used + int trainer_id; + DeserializeFromByteBuffer(ret_msg, *var_h.ctx(), var_h.scope(), &outvar, + &trainer_id); +} + +template +void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) { + ::grpc::Slice slice(proto.ByteSizeLong()); + proto.SerializeWithCachedSizesToArray(const_cast(slice.begin())); + ::grpc::ByteBuffer tmp(&slice, 1); + result->Swap(&tmp); +} + +VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + const std::string& out_varname, + const std::string& table_name, + int64_t time_out) { + return _AsyncGetVar(ep, ctx, scope, kGetRPC, var_name, out_varname, + "/sendrecv.SendRecvService/GetVariable", table_name, + time_out); +} + +VarHandlePtr GRPCClient::AsyncGetVarNoBarrier( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + const std::string& out_varname, int64_t time_out) { + std::string var_name_no_barrier = + string::Sprintf("%s%s", var_name, WITHOUT_BARRIER_MESSAGE); + + return _AsyncGetVar( + ep, ctx, scope, kGetNoBarrierRPC, var_name_no_barrier, out_varname, + "/sendrecv.SendRecvService/GetVariableNoBarrier", "", time_out); +} + +VarHandlePtr GRPCClient::AsyncGetMonomerVariable( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out) { + return _AsyncGetVar(ep, ctx, scope, kGetMonomerRPC, var_name, var_name, + "/sendrecv.SendRecvService/GetMonomerVariable", "", + time_out); +} + +VarHandlePtr GRPCClient::_AsyncGetVar( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& method, + const std::string& var_name, const std::string& out_varname, + const std::string& rpc_path, const std::string& table_name, + int64_t time_out) { + const platform::DeviceContext* p_ctx = &ctx; + const std::string ep_val = ep; + const std::string var_name_val = var_name; + const std::string out_varname_val = out_varname; + const std::string table_name_val = table_name; + const framework::Scope* p_scope = &scope; + const auto ch = GetChannel(ep_val); + GetProcessor* s = new GetProcessor(ch); + + VarHandlePtr h(new VarHandle(ep, method, out_varname_val, p_ctx, p_scope)); + s->Prepare(h, time_out); + + framework::AsyncIO([var_name_val, out_varname_val, table_name_val, s, method, + p_ctx, h, rpc_path, this] { + // prepare input + sendrecv::VariableMessage req; + req.set_varname(var_name_val); + req.set_out_varname(out_varname_val); + req.set_trainer_id(trainer_id_); + req.set_table_name(table_name_val); + ::grpc::ByteBuffer buf; + RequestToByteBuffer(req, &buf); + + VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; + + // stub context + s->response_call_back_ = ProcGetResponse; + + platform::RecordRPCEvent record_event(method); + + auto call = + s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_); + call->StartCall(); + call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); + + if (UNLIKELY(platform::IsProfileEnabled())) { + h->Wait(); + } + }); + + req_count_++; + + return h; +} + +VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + const std::string& table_name, + int64_t time_out) { + const platform::DeviceContext* p_ctx = &ctx; + const std::string ep_val = ep; + const std::string in_var_name_val = in_var_name; + const std::string out_var_name_val = out_var_name; + const std::string table_name_val = table_name; + const framework::Scope* p_scope = &scope; + const auto ch = GetChannel(ep_val); + GetProcessor* s = new GetProcessor(ch); + + const std::string method = kPrefetchRPC; + + VarHandlePtr h(new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); + s->Prepare(h, time_out); + + framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, + s, method, h, table_name_val, this] { + auto* var = p_scope->FindVar(in_var_name_val); + + ::grpc::ByteBuffer req; + SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val, + 0, table_name_val); + + VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; + + // stub context + s->response_call_back_ = ProcGetResponse; + + platform::RecordRPCEvent record_event(method); + + auto call = s->stub_g_.PrepareUnaryCall( + s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req, + &cq_); + call->StartCall(); + call->Finish(&s->reply_, &s->status_, static_cast(s)); + + if (UNLIKELY(platform::IsProfileEnabled())) { + h->Wait(); + } + }); + + req_count_++; + return h; +} + +VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep, + int64_t time_out) { + const auto ch = GetChannel(ep); + + BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); + const std::string method = kBatchBarrierRPC; + VarHandlePtr h( + new VarHandle(ep, method, BATCH_BARRIER_MESSAGE, nullptr, nullptr)); + s->Prepare(h, time_out); + + sendrecv::VariableMessage req; + req.set_varname(BATCH_BARRIER_MESSAGE); + + platform::RecordRPCEvent record_event(method); + + auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); + rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); + req_count_++; + + if (UNLIKELY(platform::IsProfileEnabled())) { + h->Wait(); + } + + return h; +} + +VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep, + int64_t time_out) { + const auto ch = GetChannel(ep); + FetchBarrierProcessor* s = new FetchBarrierProcessor(ch); + const std::string method = kFetchBarrierRPC; + VarHandlePtr h( + new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); + s->Prepare(h, time_out); + + sendrecv::VariableMessage req; + req.set_varname(FETCH_BARRIER_MESSAGE); + + platform::RecordRPCEvent record_event(method); + + auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); + rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); + req_count_++; + + if (UNLIKELY(platform::IsProfileEnabled())) { + h->Wait(); + } + + return h; +} + +VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep, + const std::string& var_name, + int64_t time_out) { + const auto ch = GetChannel(ep); + BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); + const std::string method = kSendMonomerFetchBarrierRPC; + VarHandlePtr h(new VarHandle(ep, method, var_name, nullptr, nullptr)); + s->Prepare(h, time_out); + + VLOG(30) << s->GetVarHandlePtr()->String() << " begin"; + + sendrecv::VariableMessage req; + req.set_varname(var_name); + + platform::RecordRPCEvent record_event(method); + + auto rpc = s->stub_->AsyncGetMonomerBarrier(s->context_.get(), req, &cq_); + rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); + req_count_++; + + if (UNLIKELY(platform::IsProfileEnabled())) { + h->Wait(); + } + + return h; +} + +VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep, + int64_t time_out) { + const auto ch = GetChannel(ep); + + BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); + const std::string method = kSendCompleteRPC; + VarHandlePtr h(new VarHandle(ep, method, COMPLETE_MESSAGE, nullptr, nullptr)); + s->Prepare(h, time_out); + + sendrecv::VariableMessage req; + req.set_varname(COMPLETE_MESSAGE); + + platform::RecordRPCEvent record_event(method); + + auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); + rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); + req_count_++; + + if (UNLIKELY(platform::IsProfileEnabled())) { + h->Wait(); + } + + return h; +} + +VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep, + const std::string& dir, + int64_t time_out) { + const auto ch = GetChannel(ep); + + CheckpointNotifyProcessor* s = new CheckpointNotifyProcessor(ch); + + const std::string method = kCheckPointNotifyRPC; + + VarHandlePtr h( + new VarHandle(ep, method, CHECKPOINT_SAVE_MESSAGE, nullptr, nullptr)); + s->Prepare(h, time_out); + + sendrecv::VariableMessage req; + req.set_varname(CHECKPOINT_SAVE_MESSAGE); + req.set_out_varname(dir); + + platform::RecordRPCEvent record_event(method); + + auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_); + rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); + req_count_++; + + if (UNLIKELY(platform::IsProfileEnabled())) { + h->Wait(); + } + + return h; +} + +bool GRPCClient::Wait() { + std::unique_lock lk(sync_mutex_); + sync_cond_.wait(lk, [this] { return (req_count_ == 0 || ok_ == false); }); + return ok_; +} + +void GRPCClient::Proceed() { + void* tag = nullptr; + bool ok = false; + + VLOG(3) << "GRPCClient Proceed begin"; + while (!stopped_ && cq_.Next(&tag, &ok)) { + BaseProcessor* c = static_cast(tag); + GPR_ASSERT(ok); + PADDLE_ENFORCE(c); + + if (c->status_.ok()) { + VLOG(3) << c->GetVarHandlePtr()->String() << " process"; + c->Process(); + } else if (c->status_.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED) { + LOG(FATAL) << c->GetVarHandlePtr()->String() + << " meets grpc error, error_code:" << c->status_.error_code() + << " error_message:" << c->status_.error_message() + << " error_details:" << c->status_.error_details(); + { + std::lock_guard lk(sync_mutex_); + ok_ = false; + } + c->Finish(false); + } else { + LOG(FATAL) << c->GetVarHandlePtr()->String() + << " meets grpc error, error_code:" << c->status_.error_code() + << " error_message:" << c->status_.error_message() + << " error_details:" << c->status_.error_details(); + + c->Finish(false); + } + + bool notify = false; + { + std::lock_guard lk(sync_mutex_); + req_count_--; + notify = (req_count_ <= 0 || !c->status_.ok()); + } + + delete c; + + if (notify) { + sync_cond_.notify_all(); + } + } + + // Last log message + // Avoid using VLOG() and LOG(): in the destructor of google::LogMessage() a + // static Mutex log_mutex is used for synchronization, which might have been + // destructed at this moment. + if (FLAGS_v >= 3) { + std::string msg("GRPCClient Proceed end"); + fwrite(msg.c_str(), msg.length(), 1, stdout); + } +} + +std::shared_ptr GRPCClient::GetChannel(const std::string& ep) { + std::lock_guard guard(chan_mutex_); + auto it = channels_.find(ep); + if (it != channels_.end()) { + return it->second; + } + + // Channel configurations: + grpc::ChannelArguments args; + args.SetInt(GRPC_ARG_MAX_RECONNECT_BACKOFF_MS, 2000); + if (FLAGS_rpc_disable_reuse_port) { + args.SetInt(GRPC_ARG_ALLOW_REUSEPORT, 0); + } + args.SetCompressionAlgorithm(GRPC_COMPRESS_NONE); + args.SetMaxSendMessageSize(std::numeric_limits::max()); + args.SetMaxReceiveMessageSize(std::numeric_limits::max()); + + auto ch = + grpc::CreateCustomChannel(ep, grpc::InsecureChannelCredentials(), args); + channels_[ep] = ch; + return ch; +} + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.h b/paddle/fluid/operators/distributed/grpc/grpc_client.h new file mode 100644 index 00000000..ad2f04a6 --- /dev/null +++ b/paddle/fluid/operators/distributed/grpc/grpc_client.h @@ -0,0 +1,272 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#include // NOLINT +#include // NOLINT +#include +#include +#include +#include +#include +#include // NOLINT +#include +#include // NOLINT +#include +#include + +#include "grpc++/channel.h" +#include "grpc++/generic/generic_stub.h" +#include "grpc++/grpc++.h" +#include "grpc++/support/byte_buffer.h" +#include "grpc++/support/slice.h" +#include "grpc/support/log.h" +#include "paddle/fluid/framework/blocking_queue.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" +#include "paddle/fluid/operators/distributed/request_handler.h" +#include "paddle/fluid/operators/distributed/rpc_client.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" +#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN + +namespace paddle { +namespace operators { +namespace distributed { + +void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg); + +class BaseProcessor { + public: + BaseProcessor() { context_ = nullptr; } + + virtual ~BaseProcessor() {} + + virtual void Prepare(VarHandlePtr h, int64_t time_out) { + var_h_ = h; + + context_.reset(new grpc::ClientContext()); + context_->set_wait_for_ready(true); + if (time_out) { + std::chrono::system_clock::time_point deadline = + std::chrono::system_clock::now() + + std::chrono::milliseconds(time_out); + context_->set_deadline(deadline); + } + } + + void Process() { + ProcessImpl(); + var_h_->Finish(true); + } + + VarHandlePtr GetVarHandlePtr() { return var_h_; } + bool Wait() { return var_h_->Wait(); } + void Finish(bool ok) { return var_h_->Finish(ok); } + virtual void ProcessImpl() = 0; + + std::unique_ptr context_; + grpc::Status status_; + + protected: + VarHandlePtr var_h_; +}; + +typedef std::function + RequestSendCallBack; + +class SendProcessor : public BaseProcessor { + public: + explicit SendProcessor(std::shared_ptr ch) + : BaseProcessor(), stub_g_(ch) {} + + virtual ~SendProcessor() {} + + void ProcessImpl() override { + if (response_call_back_) { + response_call_back_(*var_h_.get(), reply_); + } + } + + ::grpc::GenericStub stub_g_; + ::grpc::ByteBuffer reply_; + RequestSendCallBack response_call_back_ = nullptr; +}; + +typedef std::function + RequestGetCallBack; + +class GetProcessor : public BaseProcessor { + public: + explicit GetProcessor(std::shared_ptr ch) + : BaseProcessor(), stub_g_(ch) {} + + virtual ~GetProcessor() {} + + void ProcessImpl() override { + if (response_call_back_) { + response_call_back_(*var_h_.get(), reply_); + } + } + + ::grpc::ByteBuffer reply_; + ::grpc::GenericStub stub_g_; + RequestGetCallBack response_call_back_ = ProcGetResponse; +}; + +class BatchBarrierProcessor : public BaseProcessor { + public: + explicit BatchBarrierProcessor(std::shared_ptr ch) + : BaseProcessor() { + stub_ = sendrecv::SendRecvService::NewStub(ch); + } + + virtual ~BatchBarrierProcessor() {} + + void ProcessImpl() override {} + sendrecv::VoidMessage reply_; + std::unique_ptr stub_; +}; + +class FetchBarrierProcessor : public BaseProcessor { + public: + explicit FetchBarrierProcessor(std::shared_ptr ch) + : BaseProcessor() { + stub_ = sendrecv::SendRecvService::NewStub(ch); + } + + virtual ~FetchBarrierProcessor() {} + + void ProcessImpl() override {} + sendrecv::VariableMessage reply_; + std::unique_ptr stub_; +}; + +class CheckpointNotifyProcessor : public BaseProcessor { + public: + explicit CheckpointNotifyProcessor(std::shared_ptr ch) + : BaseProcessor() { + stub_ = sendrecv::SendRecvService::NewStub(ch); + } + + virtual ~CheckpointNotifyProcessor() {} + + void ProcessImpl() override {} + sendrecv::VoidMessage reply_; + std::unique_ptr stub_; +}; + +class GRPCClient : public RPCClient { + public: + GRPCClient() : ok_(true), completed_(false), stopped_(false) {} + virtual ~GRPCClient(); + + VarHandlePtr AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + const std::string& out_varname, + const std::string& table_name = "", + int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncGetVarNoBarrier( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + const std::string& out_varname, + int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncGetMonomerVariable( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + const std::string& table_name = "", + int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncSendBatchBarrier( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncSendFetchBarrier(const std::string& ep, + int64_t time_out) override; + + VarHandlePtr AsyncGetMonomerBarrier( + const std::string& ep, const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncCheckpointNotify( + const std::string& ep, const std::string& dir, + int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncSendComplete( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; + + bool Wait() override; + + void SendComplete() override; + + void InitImpl() override; + + private: + void Proceed(); + + std::shared_ptr GetChannel(const std::string& ep); + VarHandlePtr _AsyncGetVar( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& method, + const std::string& var_name, const std::string& out_varname, + const std::string& rpc_path, const std::string& table_name = "", + int64_t time_out = FLAGS_rpc_deadline); + + private: + grpc::CompletionQueue cq_; + std::unordered_map> channels_; + std::unique_ptr client_thread_{nullptr}; + + // mutex for Wait client sync + std::mutex sync_mutex_; + std::condition_variable sync_cond_; + std::atomic req_count_{0}; + bool ok_; + + // mutex for GetChannel thread safety + std::mutex chan_mutex_; + DISABLE_COPY_AND_ASSIGN(GRPCClient); + + // mutex for sending complete message only once + std::mutex completed_mutex_; + bool completed_; + + volatile bool stopped_; +}; + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc new file mode 100644 index 00000000..91c398d0 --- /dev/null +++ b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc @@ -0,0 +1,162 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_CUDA +#include +#endif +#include +#include +#include // NOLINT + +#include "google/protobuf/io/coded_stream.h" +#include "google/protobuf/io/zero_copy_stream.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" +#include "paddle/fluid/operators/distributed/proto_encoder_helper.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" +#include "paddle/fluid/platform/port.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace operators { +namespace distributed { + +void SerializeToByteBuffer(const std::string& name, framework::Variable* var, + const platform::DeviceContext& ctx, + ::grpc::ByteBuffer* msg, const std::string& out_name, + const int trainer_id, + const std::string& table_name) { + platform::RecordRPCEvent record_event("serial"); + VarMsg request; + TensorPayload* payload = nullptr; + + request.set_varname(name); + request.set_trainer_id(trainer_id); + // Note: normally the profiler is enabled in 1 trainer, hence only + // 1 trainer returns true for ShouldSendProfileState(). It tells PS + // servers the trainer's profiling state so that PS can follow the + // trainer. + if (platform::ShouldSendProfileState()) { + if (platform::IsProfileEnabled()) { + request.set_profile(platform::kEnableProfiler); + } else { + request.set_profile(platform::kDisableProfiler); + } + } + if (!out_name.empty()) { + request.set_out_varname(out_name); + } + if (!table_name.empty()) { + request.set_table_name(table_name); + } + if (var->IsType()) { + request.set_type(::sendrecv::LOD_TENSOR); + payload = new TensorPayload(GetTensorPayload(var, ctx, &request)); + } else if (var->IsType()) { + request.set_type(::sendrecv::SELECTED_ROWS); + payload = new TensorPayload(GetSelectedRowsPayload(var, ctx, &request)); +#ifdef PADDLE_WITH_CUDA + } else if (var->IsType()) { + request.set_type(::sendrecv::NCCL_ID); +#endif + } else { + PADDLE_THROW("Serialize does not support type: %s", + typeid(var->Type()).name()); + } + + std::string header; + request.AppendToString(&header); + auto buffer = std::unique_ptr(new char[1024]); + void* buf = buffer.get(); + ProtoEncodeHelper e(static_cast(buf), 1024); + e.WriteRawBytes(std::string(header.data(), header.size())); +// NCCLID is copied directly to the message, return bytebuffer +// with only one slice if serializing NCCLID. +#ifdef PADDLE_WITH_CUDA + if (var->IsType()) { + e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, + NCCL_UNIQUE_ID_BYTES); + const ncclUniqueId& uid = var->Get(); + e.WriteRawBytes(std::string(uid.internal, NCCL_UNIQUE_ID_BYTES)); + + // for serialize NCCL_ID + ::grpc::Slice slices(e.size()); + memcpy(const_cast(slices.begin()), e.data(), e.size()); + ::grpc::ByteBuffer tmp(&slices, 1); + msg->Swap(&tmp); + return; + } +#endif + PADDLE_ENFORCE_NOT_NULL(payload); + + e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, + payload->memory_size()); + if (payload->memory_size() >= std::numeric_limits::max()) { + LOG(FATAL) << "FATAL error: varname:" << name + << ", vlen:" << payload->memory_size() + << " >= std::numeric_limits::max():" + << std::numeric_limits::max() << ", so exit!"; + } + // steal reference of tensor data + ::grpc::Slice slices[4]; // metadata, tensor, rows meta, rows + int num_slices = 2; // only SelectedRows have rows buffer + slices[0] = ::grpc::Slice(e.size()); + memcpy(const_cast(slices[0].begin()), e.data(), e.size()); + slices[1] = ::grpc::Slice( + grpc_slice_new_with_user_data(payload->ptr(), payload->memory_size(), + SerializeDestroyCallback, payload), + ::grpc::Slice::STEAL_REF); + + if (var->IsType()) { + auto* slr = var->GetMutable(); + ProtoEncodeHelper e2(static_cast(buf), 128); + + PADDLE_ENFORCE(VectorElemName(slr->rows()) == typeid(int64_t).name()); + size_t rows_memory_size = slr->rows().size() * sizeof(int64_t); + + e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size); + slices[2] = ::grpc::Slice(e2.size()); + memcpy(const_cast(slices[2].begin()), e2.data(), e2.size()); + + slices[3] = ::grpc::Slice( + grpc_slice_new_with_user_data( + const_cast( + reinterpret_cast(slr->rows().data())), + rows_memory_size, [](void* backing) {}, + const_cast( + reinterpret_cast(slr->rows().data()))), + ::grpc::Slice::STEAL_REF); + num_slices = 4; + } + + ::grpc::ByteBuffer tmp(&slices[0], num_slices); + msg->Swap(&tmp); +} + +void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, + const platform::DeviceContext& ctx, + const framework::Scope* scope, + framework::Variable** var, int* trainer_id) { + platform::RecordRPCEvent record_event("deserial"); + operators::distributed::GRPCVariableResponse resp(scope, &ctx); + PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!"); + *var = resp.GetVar(); + *trainer_id = resp.GetTrainerId(); +} + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde.h b/paddle/fluid/operators/distributed/grpc/grpc_serde.h new file mode 100644 index 00000000..c9a57beb --- /dev/null +++ b/paddle/fluid/operators/distributed/grpc/grpc_serde.h @@ -0,0 +1,52 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" +#include "paddle/fluid/platform/port.h" + +#include "paddle/fluid/operators/distributed/distributed_pb.h" + +namespace paddle { +namespace operators { +namespace distributed { + +typedef void (*DestroyCallback)(void*); + +void SerializeToByteBuffer(const std::string& name, framework::Variable* var, + const platform::DeviceContext& ctx, + ::grpc::ByteBuffer* msg, + const std::string& out_varname = std::string(), + const int trainer_id = 0, + const std::string& table_name = std::string()); + +void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, + const platform::DeviceContext& ctx, + const framework::Scope* scope, + framework::Variable** var, int* trainer_id); + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc new file mode 100644 index 00000000..749c1bf3 --- /dev/null +++ b/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc @@ -0,0 +1,224 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include // NOLINT + +#include "google/protobuf/text_format.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/operators/distributed/distributed.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/printf.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace operators = paddle::operators; +namespace math = paddle::operators::math; +namespace memory = paddle::memory; + +void RunSerdeTestSelectedRows(platform::Place place) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + // serialize var to ByteBuffer + framework::Variable var; + auto* slr = var.GetMutable(); + slr->set_height(1000); + auto* tensor = slr->mutable_value(); + auto* rows = slr->mutable_rows(); + tensor->Resize(framework::make_ddim({564, 128})); + tensor->mutable_data(place); + int tensor_numel = 564 * 128; + math::set_constant(ctx, tensor, 32.7); + for (int i = 0; i < 564; ++i) rows->push_back(i); + + ::grpc::ByteBuffer msg; + operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg); + EXPECT_GT(msg.Length(), static_cast(0)); + + // deserialize + std::vector<::grpc::Slice> slices; + (void)msg.Dump(&slices); + std::string tmp; + for (const auto& s : slices) { + tmp.append(reinterpret_cast(s.begin()), s.size()); + } + + sendrecv::VariableMessage varmsg; + EXPECT_TRUE(varmsg.ParseFromString(tmp)); + + // deserialize bytebuffer + EXPECT_EQ(varmsg.varname(), "myvar"); + EXPECT_EQ(varmsg.type(), 1); + + const float* tensor_data = + reinterpret_cast(varmsg.serialized().data()); + const int64_t* rows_data = + reinterpret_cast(varmsg.rows().data()); + for (int i = 0; i < tensor_numel; ++i) { + EXPECT_FLOAT_EQ(tensor_data[i], 32.7); + } + for (int i = 0; i < 564; ++i) { + EXPECT_EQ(rows_data[i], i); + } + + // deserialize zero-copy + // framework::Variable var2; + // operators::distributed::DeserializeFromByteBuffer(msg, ctx, &var2); + framework::Scope scope; + scope.Var("myvar"); + operators::distributed::GRPCVariableResponse resp(&scope, &ctx); + EXPECT_EQ(resp.Parse(msg), 0); + + framework::Variable* var2 = resp.GetVar(); + + auto* slr2 = var2->GetMutable(); + auto* tensor2 = slr2->mutable_value(); + auto* rows2 = slr2->mutable_rows(); + float* tensor_data2 = nullptr; + framework::Tensor tmp_tensor; + + if (platform::is_gpu_place(ctx.GetPlace())) { + platform::CPUPlace cpu; + framework::TensorCopy(*tensor2, cpu, &tmp_tensor); + tensor_data2 = tmp_tensor.data(); + } else { + tensor_data2 = const_cast(tensor2->data()); + } + const int64_t* rows_data2 = rows2->data(); + + for (int i = 0; i < tensor_numel; ++i) { + EXPECT_FLOAT_EQ(tensor_data2[i], 32.7); + } + for (size_t i = 0; i < rows2->size(); ++i) { + EXPECT_EQ(rows_data2[i], static_cast(i)); + } + EXPECT_EQ(slr2->height(), 1000); +} + +void RunTestLodTensor(platform::Place place, int from_type = 0) { + // serialize var to ByteBuffer + framework::Variable var; + auto* tensor = var.GetMutable(); + tensor->Resize(framework::make_ddim({512, 8, 4, 2})); + framework::LoD lod; + lod.push_back(framework::Vector({1, 3, 8})); + tensor->set_lod(lod); + int tensor_numel = 512 * 8 * 4 * 2; + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + tensor->mutable_data(place); + math::set_constant(ctx, tensor, 31.9); + + ::grpc::ByteBuffer msg; + operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg, + "outvar", 0, "table_name"); + EXPECT_GT(msg.Length(), static_cast(0)); + + // deserialize + std::vector<::grpc::Slice> slices; + (void)msg.Dump(&slices); + std::string tmp; + for (const auto& s : slices) { + tmp.append(reinterpret_cast(s.begin()), s.size()); + } + sendrecv::VariableMessage varmsg; + EXPECT_TRUE(varmsg.ParseFromString(tmp)); + EXPECT_EQ(varmsg.varname(), "myvar"); + EXPECT_EQ(varmsg.type(), 0); + EXPECT_EQ(varmsg.dims()[0], 512); + EXPECT_EQ(varmsg.dims()[1], 8); + EXPECT_EQ(varmsg.dims()[2], 4); + EXPECT_EQ(varmsg.dims()[3], 2); + EXPECT_EQ(varmsg.lod_level(), 1); + EXPECT_EQ(varmsg.lod(0).lod_data(0), 1); + EXPECT_EQ(varmsg.lod(0).lod_data(1), 3); + EXPECT_EQ(varmsg.lod(0).lod_data(2), 8); + + const float* tensor_data = + reinterpret_cast(varmsg.serialized().data()); + for (int i = 0; i < tensor_numel; ++i) { + EXPECT_FLOAT_EQ(tensor_data[i], 31.9); + } + + // message binary + std::string str; + varmsg.SerializeToString(&str); + + // message bytebuffer + ::grpc::Slice slices_2[1]; + int num_slices = 1; + slices_2[0] = ::grpc::Slice(str.length()); + memcpy(const_cast(slices_2[0].begin()), str.c_str(), str.length()); + ::grpc::ByteBuffer bytebuffer2(&slices_2[0], num_slices); + + // deserialize zero-copy + framework::Scope scope; + scope.Var("myvar"); + operators::distributed::GRPCVariableResponse resp(&scope, &ctx); + if (from_type == 0) { + EXPECT_EQ(resp.Parse(msg), 0); + } else { + EXPECT_EQ(resp.Parse(bytebuffer2), 0); + } + + framework::Variable* var2 = resp.GetVar(); + + auto tensor2 = var2->Get(); + float* tensor_data2 = nullptr; + framework::Tensor tmp_tensor; + + if (platform::is_gpu_place(ctx.GetPlace())) { + platform::CPUPlace cpu; + framework::TensorCopy(tensor2, cpu, &tmp_tensor); + tensor_data2 = tmp_tensor.data(); + } else { + tensor_data2 = const_cast(tensor2.data()); + } + + EXPECT_EQ(varmsg.lod_level(), 1); + EXPECT_EQ(varmsg.lod(0).lod_data(0), 1); + EXPECT_EQ(varmsg.lod(0).lod_data(1), 3); + EXPECT_EQ(varmsg.lod(0).lod_data(2), 8); + for (int i = 0; i < tensor_numel; ++i) EXPECT_FLOAT_EQ(tensor_data2[i], 31.9); +} + +TEST(LodTensor, Run) { + platform::CPUPlace place; + RunTestLodTensor(place); + RunTestLodTensor(place, 1); +#ifdef PADDLE_WITH_CUDA + platform::CUDAPlace gpu(0); + RunTestLodTensor(gpu); + RunTestLodTensor(gpu, 1); +#endif +} + +TEST(SelectedRows, Run) { + platform::CPUPlace place; + RunSerdeTestSelectedRows(place); + +#ifdef PADDLE_WITH_CUDA + platform::CUDAPlace gpu; + RunSerdeTestSelectedRows(gpu); +#endif +} diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.cc b/paddle/fluid/operators/distributed/grpc/grpc_server.cc new file mode 100644 index 00000000..75526bed --- /dev/null +++ b/paddle/fluid/operators/distributed/grpc/grpc_server.cc @@ -0,0 +1,594 @@ +/*Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_server.h" + +using ::grpc::ServerAsyncResponseWriter; + +DECLARE_bool(rpc_disable_reuse_port); + +namespace paddle { +namespace operators { +namespace distributed { +enum CallStatus { PROCESS = 0, FINISH }; + +// reference: +// https://stackoverflow.com/questions/41732884/grpc-multiple-services-in-cpp-async-server +class RequestBase { + public: + explicit RequestBase(GrpcService::AsyncService* service, + ::grpc::ServerCompletionQueue* cq, + RequestHandler* request_handler, int req_id) + : service_(service), + cq_(cq), + status_(PROCESS), + request_handler_(request_handler), + req_id_(req_id) { + PADDLE_ENFORCE(cq_); + } + virtual ~RequestBase() {} + virtual void Process() = 0; + + std::string Status2String(const std::string& method) { + std::string status = "Process"; + if (status_ == FINISH) { + status = "Finish"; + } + + std::ostringstream s; + s << method << " name:[" << GetReqName() << "]" + << ", ep:[" << ctx_.peer() << "]" + << " " << status << " using req_id:" << req_id_; + return s.str(); + } + + CallStatus Status() const { + std::lock_guard l(status_mu_); + return status_; + } + + template + void Finish(const T& reply, ServerAsyncResponseWriter* responder) { + std::lock_guard l(status_mu_); + status_ = FINISH; + responder->Finish(reply, ::grpc::Status::OK, + reinterpret_cast(static_cast(req_id_))); + } + virtual std::string GetReqName() = 0; + + protected: + mutable std::mutex status_mu_; + ::grpc::ServerContext ctx_; + GrpcService::AsyncService* service_; + ::grpc::ServerCompletionQueue* cq_; + CallStatus status_; + RequestHandler* request_handler_; + int req_id_; +}; + +class RequestSend final : public RequestBase { + public: + explicit RequestSend(GrpcService::AsyncService* service, + ::grpc::ServerCompletionQueue* cq, + RequestHandler* request_handler, int req_id) + : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { + request_.reset(new GRPCVariableResponse(request_handler->scope(), + request_handler->dev_ctx(), + !request_handler->sync_mode())); + int method_id = static_cast(distributed::GrpcMethod::kSendVariable); + service_->RequestAsyncUnary( + method_id, &ctx_, request_.get(), &responder_, cq_, cq_, + reinterpret_cast(static_cast(req_id))); + } + virtual ~RequestSend() {} + std::string GetReqName() override { return request_->Varname(); } + + void Process() override { + std::string varname = GetReqName(); + VLOG(4) << "RequestSend var_name:" << varname; + + auto scope = request_->GetMutableLocalScope(); + auto invar = request_->GetVar(); + int trainer_id = request_->GetTrainerId(); + framework::Variable* outvar = nullptr; + request_handler_->Handle(varname, scope, invar, &outvar, trainer_id); + Finish(reply_, &responder_); + } + + protected: + sendrecv::VoidMessage reply_; + std::shared_ptr request_; + ServerAsyncResponseWriter responder_; +}; + +class RequestGet final : public RequestBase { + public: + explicit RequestGet(GrpcService::AsyncService* service, + ::grpc::ServerCompletionQueue* cq, + RequestHandler* request_handler, int req_id) + : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { + auto method_id = static_cast(distributed::GrpcMethod::kGetVariable); + service_->RequestAsyncUnary( + method_id, &ctx_, &request_, &responder_, cq_, cq_, + reinterpret_cast(static_cast(req_id))); + } + + virtual ~RequestGet() {} + + std::string GetReqName() override { return request_.varname(); } + + void Process() override { + // proc request. + std::string varname = request_.varname(); + std::string out_varname = request_.out_varname(); + std::string table_name = request_.table_name(); + int trainer_id = request_.trainer_id(); + + VLOG(4) << "RequestGet " << out_varname << " from " << varname; + + auto scope = request_handler_->scope(); + framework::Variable* invar = nullptr; + framework::Variable* outvar = nullptr; + + tmp_scope_ = std::move(scope->NewTmpScope()); + request_handler_->Handle(varname, tmp_scope_.get(), invar, &outvar, + trainer_id, out_varname, table_name); + + VLOG(1) << "before SerializeToByteBuffer"; + if (outvar) { + SerializeToByteBuffer(out_varname, outvar, *request_handler_->dev_ctx(), + &reply_); + } + VLOG(1) << "after SerializeToByteBuffer"; + Finish(reply_, &responder_); + } + + protected: + sendrecv::VariableMessage request_; + ::grpc::ByteBuffer reply_; + std::unique_ptr tmp_scope_; + ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; +}; + +class RequestGetNoBarrier final : public RequestBase { + public: + explicit RequestGetNoBarrier(GrpcService::AsyncService* service, + ::grpc::ServerCompletionQueue* cq, + RequestHandler* request_handler, int req_id) + : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { + auto method_id = + static_cast(distributed::GrpcMethod::kGetVariableNoBarrier); + service_->RequestAsyncUnary( + method_id, &ctx_, &request_, &responder_, cq_, cq_, + reinterpret_cast(static_cast(req_id))); + } + + virtual ~RequestGetNoBarrier() {} + + std::string GetReqName() override { return request_.varname(); } + + void Process() override { + // proc request. + std::string varname = request_.varname(); + std::string out_varname = request_.out_varname(); + int trainer_id = request_.trainer_id(); + + VLOG(4) << "RequestGetNoBarrier " << out_varname << " from " << varname; + + auto scope = request_handler_->scope(); + framework::Variable* invar = nullptr; + framework::Variable* outvar = nullptr; + + request_handler_->Handle(varname, scope, invar, &outvar, trainer_id, + out_varname); + + if (outvar) { + SerializeToByteBuffer(out_varname, outvar, *request_handler_->dev_ctx(), + &reply_); + } + Finish(reply_, &responder_); + } + + protected: + sendrecv::VariableMessage request_; + ::grpc::ByteBuffer reply_; + ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; +}; + +class RequestGetMonomerVariable final : public RequestBase { + public: + explicit RequestGetMonomerVariable(GrpcService::AsyncService* service, + ::grpc::ServerCompletionQueue* cq, + RequestHandler* request_handler, + int req_id, RPCServer* rpc_server) + : RequestBase(service, cq, request_handler, req_id), + responder_(&ctx_), + rpc_server_(rpc_server) { + auto method_id = + static_cast(distributed::GrpcMethod::kGetMonomerVariable); + service_->RequestAsyncUnary( + method_id, &ctx_, &request_, &responder_, cq_, cq_, + reinterpret_cast(static_cast(req_id))); + } + + virtual ~RequestGetMonomerVariable() {} + + std::string GetReqName() override { return request_.varname(); } + + void Process() override { + // proc request. + std::string varname = request_.varname(); + + rpc_server_->WaitVarCond(varname); + MonomerHandle h = rpc_server_->GetMonomer(varname); + + auto scope = h.scope_; + auto invar = scope->FindVar(varname); + framework::Variable* outvar = nullptr; + + request_handler_->Handle(varname, scope, invar, &outvar, + request_.trainer_id()); + + if (outvar) { + SerializeToByteBuffer(varname, outvar, *h.dev_ctx_, &reply_); + } + Finish(reply_, &responder_); + } + + protected: + sendrecv::VariableMessage request_; + ::grpc::ByteBuffer reply_; + ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; + RPCServer* rpc_server_{nullptr}; +}; + +class RequestGetMonomerBarrier final : public RequestBase { + public: + explicit RequestGetMonomerBarrier(GrpcService::AsyncService* service, + ::grpc::ServerCompletionQueue* cq, + RequestHandler* request_handler, int req_id, + RPCServer* rpc_server) + : RequestBase(service, cq, request_handler, req_id), + responder_(&ctx_), + rpc_server_(rpc_server) { + auto method_id = + static_cast(distributed::GrpcMethod::kGetMonomerBarrier); + service_->RequestAsyncUnary( + method_id, &ctx_, &request_, &responder_, cq_, cq_, + reinterpret_cast(static_cast(req_id))); + } + + virtual ~RequestGetMonomerBarrier() {} + + std::string GetReqName() override { return request_.varname(); } + + void Process() override { + // proc request. + std::string varname = request_.varname(); + VLOG(4) << "RequestGetMonomerBarrier " << varname; + + rpc_server_->WaitVarCond(varname); + MonomerHandle h = rpc_server_->GetMonomer(varname); + + framework::Scope* scope = nullptr; + framework::Variable* invar = nullptr; + framework::Variable* outvar = nullptr; + + request_handler_->Handle(varname, scope, invar, &outvar, + request_.trainer_id()); + + Finish(reply_, &responder_); + } + + protected: + sendrecv::VariableMessage request_; + sendrecv::VoidMessage reply_; + ServerAsyncResponseWriter responder_; + RPCServer* rpc_server_{nullptr}; +}; + +class RequestPrefetch final : public RequestBase { + public: + explicit RequestPrefetch(GrpcService::AsyncService* service, + ::grpc::ServerCompletionQueue* cq, + RequestHandler* request_handler, int req_id) + : RequestBase(service, cq, request_handler, req_id), + responder_(&ctx_), + local_scope_(nullptr) { + request_.reset(new GRPCVariableResponse(request_handler->scope(), + request_handler->dev_ctx(), true)); + int method_id = + static_cast(distributed::GrpcMethod::kPrefetchVariable); + service_->RequestAsyncUnary( + method_id, &ctx_, request_.get(), &responder_, cq_, cq_, + reinterpret_cast(static_cast(req_id))); + } + + virtual ~RequestPrefetch() {} + + std::string GetReqName() override { return request_->Varname(); } + + void Process() override { + // prefetch process... + std::string in_var_name = request_->Varname(); + std::string out_var_name = request_->OutVarname(); + std::string table_name = request_->TableName(); + int trainer_id = request_->GetTrainerId(); + VLOG(4) << "RequestPrefetch, in_var_name: " << in_var_name + << " out_var_name: " << out_var_name; + + auto scope = request_->GetMutableLocalScope(); + auto invar = scope->FindVar(in_var_name); + // out var must be created in local scope! + framework::Variable* outvar = scope->Var(out_var_name); + + request_handler_->Handle(in_var_name, scope, invar, &outvar, trainer_id, + out_var_name, table_name); + + SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(), + &reply_); + Finish(reply_, &responder_); + } + + protected: + std::shared_ptr request_; + ::grpc::ByteBuffer reply_; + ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; + framework::Scope* local_scope_; +}; + +class RequestCheckpointNotify final : public RequestBase { + public: + explicit RequestCheckpointNotify(GrpcService::AsyncService* service, + ::grpc::ServerCompletionQueue* cq, + RequestHandler* request_handler, int req_id) + : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { + request_.reset(new GRPCVariableResponse(request_handler->scope(), + request_handler->dev_ctx())); + int method_id = + static_cast(distributed::GrpcMethod::kCheckpointNotify); + service_->RequestAsyncUnary( + method_id, &ctx_, request_.get(), &responder_, cq_, cq_, + reinterpret_cast(static_cast(req_id))); + } + + virtual ~RequestCheckpointNotify() {} + + std::string GetReqName() override { return request_->Varname(); } + + void Process() override { + auto scope = request_->GetMutableLocalScope(); + + std::string checkpoint_notify = request_->Varname(); + std::string checkpoint_dir = request_->OutVarname(); + int trainer_id = request_->GetTrainerId(); + + VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify + << ", dir: " << checkpoint_dir; + + request_handler_->Handle(checkpoint_notify, scope, nullptr, nullptr, + trainer_id, checkpoint_dir); + Finish(reply_, &responder_); + } + + protected: + std::shared_ptr request_; + sendrecv::VoidMessage reply_; + ServerAsyncResponseWriter responder_; +}; + +void AsyncGRPCServer::WaitServerReady() { + VLOG(4) << "AsyncGRPCServer is waiting server ready"; + std::unique_lock lock(this->mutex_ready_); + condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); + VLOG(4) << "AsyncGRPCServer WaitSeverReady"; +} + +// Define an option subclass in order to disable SO_REUSEPORT for the +// server socket. +// Come from: +// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc +class NoReusePortOption : public ::grpc::ServerBuilderOption { + public: + void UpdateArguments(::grpc::ChannelArguments* args) override { + args->SetInt(GRPC_ARG_ALLOW_REUSEPORT, 0); + } + + void UpdatePlugins(std::vector>* + plugins) override {} +}; + +void AsyncGRPCServer::StartServer() { + ::grpc::ServerBuilder builder; + builder.AddListeningPort(bind_address_, ::grpc::InsecureServerCredentials(), + &selected_port_); + + builder.SetMaxSendMessageSize(std::numeric_limits::max()); + builder.SetMaxReceiveMessageSize(std::numeric_limits::max()); + if (FLAGS_rpc_disable_reuse_port) { + builder.SetOption( + std::unique_ptr<::grpc::ServerBuilderOption>(new NoReusePortOption)); + } + builder.RegisterService(&service_); + + for (auto t : rpc_call_map_) { + rpc_cq_[t.first].reset(builder.AddCompletionQueue().release()); + } + + server_ = builder.BuildAndStart(); + LOG(INFO) << "Server listening on " << bind_address_ + << " selected port: " << selected_port_; + + std::function f = + std::bind(&AsyncGRPCServer::TryToRegisterNewOne, this, + std::placeholders::_1, std::placeholders::_2); + + for (auto& t : rpc_call_map_) { + auto& rpc_name = t.first; + auto& cq = rpc_cq_[rpc_name]; + auto threadnum = rpc_thread_num_[rpc_name]; + auto& reqs = rpc_reqs_[rpc_name]; + + reqs.reserve(kRequestBufSize); + + for (int i = 0; i < kRequestBufSize; i++) { + VLOG(6) << "TryToRegisterNewOne on RPC NAME: " << rpc_name << " I: " << i; + TryToRegisterNewOne(rpc_name, i); + } + + for (int i = 0; i < threadnum; i++) { + rpc_threads_[rpc_name].emplace_back(new std::thread(std::bind( + &AsyncGRPCServer::HandleRequest, this, cq.get(), rpc_name, f))); + VLOG(4) << t.first << " creates threads!"; + } + } + + { + std::lock_guard lock(this->mutex_ready_); + ready_ = 1; + } + condition_ready_.notify_all(); + + // wait server + server_->Wait(); + + for (auto& t : rpc_threads_) { + auto& threads = t.second; + for (size_t i = 0; i < threads.size(); ++i) { + threads[i]->join(); + VLOG(4) << t.first << " threads ends!"; + } + } +} + +void AsyncGRPCServer::ShutdownQueue() { + for (auto& t : rpc_cq_) { + t.second->Shutdown(); + VLOG(4) << t.first << " queue shutdown!"; + } +} + +void AsyncGRPCServer::ShutDownImpl() { + std::unique_lock lock(cq_mutex_); + is_shut_down_ = true; + ShutdownQueue(); + + VLOG(4) << "server_ shutdown!"; + server_->Shutdown(); +} + +void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, + int req_id) { + std::unique_lock lock(cq_mutex_); + if (is_shut_down_) { + VLOG(4) << "shutdown, do not TryToRegisterNewSendOne"; + return; + } + + VLOG(4) << "TryToRegisterNewOne on RPC NAME: " << rpc_name + << " REQ ID: " << req_id; + + auto& reqs = rpc_reqs_[rpc_name]; + auto& handler = rpc_call_map_[rpc_name]; + auto& cq = rpc_cq_[rpc_name]; + + RequestBase* b = nullptr; + if (rpc_name == kRequestSend) { + b = new RequestSend(&service_, cq.get(), handler, req_id); + } else if (rpc_name == kRequestGet) { + b = new RequestGet(&service_, cq.get(), handler, req_id); + + } else if (rpc_name == kRequestGetNoBarrier) { + b = new RequestGetNoBarrier(&service_, cq.get(), handler, req_id); + } else if (rpc_name == kRequestGetMonomerVariable) { + b = new RequestGetMonomerVariable(&service_, cq.get(), handler, req_id, + this); + } else if (rpc_name == kRequestGetMonomerBarrier) { + b = new RequestGetMonomerBarrier(&service_, cq.get(), handler, req_id, + this); + } else if (rpc_name == kRequestPrefetch) { + b = new RequestPrefetch(&service_, cq.get(), handler, req_id); + } else if (rpc_name == kRequestCheckpoint) { + b = new RequestCheckpointNotify(&service_, cq.get(), handler, req_id); + } else { + PADDLE_ENFORCE(false, "not supported rpc"); + } + + reqs[req_id] = b; + + VLOG(4) << "TryToRegisterNewOne status:" << b->Status(); +} + +void AsyncGRPCServer::HandleRequest( + ::grpc::ServerCompletionQueue* cq, const std::string& rpc_name, + std::function TryToRegisterNewOne) { + void* tag = NULL; + bool ok = false; + + while (true) { + VLOG(4) << "HandleRequest " << rpc_name << " wait next"; + if (!cq->Next(&tag, &ok)) { + LOG(WARNING) << "CompletionQueue " << rpc_name << " shutdown!"; + break; + } + + int req_id = static_cast(reinterpret_cast(tag)); + VLOG(4) << "HandleRequest " << rpc_name << ", req_id:" << req_id + << " get next"; + + auto& reqs = rpc_reqs_[rpc_name]; + RequestBase* base = nullptr; + { + PADDLE_ENFORCE(req_id >= 0 && req_id < kRequestBufSize); + std::unique_lock lock(cq_mutex_); + base = reqs[req_id]; + } + + VLOG(3) << base->Status2String(rpc_name); + + // reference: + // https://github.com/tensorflow/tensorflow/issues/5596 + // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM + // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I + if (!ok) { + VLOG(4) << "completion queue:" << rpc_name << " recv no regular event" + << " context:" << base->Status2String(rpc_name); + TryToRegisterNewOne(rpc_name, req_id); + delete base; + continue; + } + + switch (base->Status()) { + case PROCESS: { + base->Process(); + break; + } + case FINISH: { + TryToRegisterNewOne(rpc_name, req_id); + delete base; + break; + } + default: { assert(false); } + } + } +} + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.h b/paddle/fluid/operators/distributed/grpc/grpc_server.h new file mode 100644 index 00000000..2fd3a7a7 --- /dev/null +++ b/paddle/fluid/operators/distributed/grpc/grpc_server.h @@ -0,0 +1,88 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include // NOLINT +#include +#include + +#include "grpc++/grpc++.h" +#include "paddle/fluid/framework/blocking_queue.h" +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_service.h" +#include "paddle/fluid/operators/distributed/request_handler.h" +#include "paddle/fluid/operators/distributed/rpc_server.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace operators { +namespace distributed { + +class RequestBase; + +class AsyncGRPCServer final : public RPCServer { + public: + explicit AsyncGRPCServer(const std::string& address, int client_num) + : RPCServer(address, client_num), ready_(0) {} + + virtual ~AsyncGRPCServer() {} + void WaitServerReady() override; + void StartServer() override; + + private: + // HandleRequest needs to be thread-safe. + void HandleRequest( + ::grpc::ServerCompletionQueue* cq, const std::string& rpc_name, + std::function TryToRegisterNewOne); + + void TryToRegisterNewOne(const std::string& rpc_name, int req_id); + void ShutdownQueue(); + void ShutDownImpl() override; + + private: + static const int kRequestBufSize = 100; + + std::mutex cq_mutex_; + volatile bool is_shut_down_ = false; + + GrpcService::AsyncService service_; + std::unique_ptr<::grpc::Server> server_; + + // condition of the sub program + std::condition_variable barrier_condition_; + + std::mutex mutex_ready_; + std::condition_variable condition_ready_; + + int ready_; + + std::map> rpc_cq_; + std::map>> rpc_threads_; + std::map> rpc_reqs_; +}; + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_service.h b/paddle/fluid/operators/distributed/grpc/grpc_service.h new file mode 100644 index 00000000..2965fe44 --- /dev/null +++ b/paddle/fluid/operators/distributed/grpc/grpc_service.h @@ -0,0 +1,136 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" +#include "paddle/fluid/platform/profiler.h" + +// NOTE: This method was originally created by tensorflow +// (https://github.com/tensorflow/tensorflow/) we borrow this +// method and did some modifications so that we can parse gRPC +// requests without too much copying of the tensor data. + +namespace grpc { +class CompletionQueue; +class Channel; +class RpcService; +class ServerCompletionQueue; +class ServerContext; + +// Support parsing/unparsing of tensorflow::VariableResponse. +// Wire-format is identical to RecvVariableResponse. +template <> +class SerializationTraits< + paddle::operators::distributed::GRPCVariableResponse> { + public: + static Status Serialize( + const paddle::operators::distributed::GRPCVariableResponse& msg, + grpc_byte_buffer** bp, bool* own_buffer) { + PADDLE_ENFORCE(false, "SerializationTraits::Serialize not implemented!"); + return Status(); + } + static Status Deserialize( + grpc_byte_buffer* buffer, + paddle::operators::distributed::GRPCVariableResponse* msg, + int max_message_size = INT_MAX) { + if (buffer == nullptr) { + return Status(StatusCode::INTERNAL, "No payload"); + } + + Status result = g_core_codegen_interface->ok(); + if (result.ok()) { + paddle::operators::distributed::GrpcByteSource source(buffer); + int ret = msg->Parse(&source); + if (ret != 0) { + result = Status(StatusCode::INTERNAL, "VariableResponse parse error"); + } + } + g_core_codegen_interface->grpc_byte_buffer_destroy(buffer); + return result; + } +}; +} // namespace grpc + +namespace paddle { +namespace operators { +namespace distributed { + +enum class GrpcMethod { + kSendVariable, + kGetVariable, + kPrefetchVariable, + kCheckpointNotify, + kGetVariableNoBarrier, + kGetMonomerVariable, + kGetMonomerBarrier, +}; + +static const int kGrpcNumMethods = + static_cast(GrpcMethod::kGetMonomerBarrier) + 1; + +inline const char* GrpcMethodName(GrpcMethod id) { + switch (id) { + case GrpcMethod::kSendVariable: + return "/sendrecv.SendRecvService/SendVariable"; + case GrpcMethod::kGetVariable: + return "/sendrecv.SendRecvService/GetVariable"; + case GrpcMethod::kGetVariableNoBarrier: + return "/sendrecv.SendRecvService/GetVariableNoBarrier"; + case GrpcMethod::kGetMonomerVariable: + return "/sendrecv.SendRecvService/GetMonomerVariable"; + case GrpcMethod::kGetMonomerBarrier: + return "/sendrecv.SendRecvService/GetMonomerBarrier"; + case GrpcMethod::kPrefetchVariable: + return "/sendrecv.SendRecvService/PrefetchVariable"; + case GrpcMethod::kCheckpointNotify: + return "/sendrecv.SendRecvService/CheckpointNotify"; + } + + // Shouldn't be reached. + PADDLE_ENFORCE(false, "Invalid id: not found valid method name"); + return nullptr; +} + +class GrpcService final { + public: + class AsyncService : public ::grpc::Service { + public: + AsyncService() { + for (int i = 0; i < kGrpcNumMethods; ++i) { + AddMethod(new ::grpc::internal::RpcServiceMethod( + GrpcMethodName(static_cast(i)), + ::grpc::internal::RpcMethod::NORMAL_RPC, nullptr)); + ::grpc::Service::MarkMethodAsync(i); + } + } + virtual ~AsyncService() {} + + // Make RequestAsyncUnary public for grpc_call.h + using ::grpc::Service::RequestAsyncUnary; + }; +}; + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc new file mode 100644 index 00000000..87e83ca5 --- /dev/null +++ b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc @@ -0,0 +1,330 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#ifdef PADDLE_WITH_CUDA +#include +#endif + +#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace operators { +namespace distributed { + +enum WireType { + WIRETYPE_VARINT = 0, + WIRETYPE_LENGTH_DELIMITED = 2, +}; + +inline int GetTagFieldNumber(uint32_t tag) { return tag >> 3; } + +inline WireType GetTagWireType(uint32_t tag) { + return static_cast(tag & 0x7); +} + +bool ReadVarintSizeAsInt(::google::protobuf::io::CodedInputStream* input, + int* result) { + uint64_t v; + if (input->ReadVarint64(&v) && v <= static_cast(INT_MAX)) { + *result = static_cast(v); + return true; + } else { + return false; + } +} + +int GRPCVariableResponse::Parse(const ::grpc::ByteBuffer& byte_buffer) { + GrpcByteBufferSource source; + source.Init(byte_buffer); + GrpcByteBufferSourceWrapper r(&source); + + return Parse(&r); +} + +bool ParseLodData(::google::protobuf::io::CodedInputStream* input, + std::vector* lod) { + while (true) { + auto p = input->ReadTagWithCutoff(127); + int tag = GetTagFieldNumber(p.first); + WireType wt = GetTagWireType(p.first); + + if (!p.second) { + return (tag == 0); + } + + switch (tag) { + case sendrecv::VariableMessage_LodData::kLodDataFieldNumber: { + uint64_t v; + if (wt == WIRETYPE_VARINT) { + if (!input->ReadVarint64(&v)) { + return false; + } + lod->push_back(v); + break; + } + + if (wt == WIRETYPE_LENGTH_DELIMITED) { + int num_bytes = 0; + if (!input->ReadVarintSizeAsInt(&num_bytes)) { + return tag; + } + int start_pos = input->CurrentPosition(); + while (input->CurrentPosition() - start_pos < num_bytes) { + uint64_t v; + if (!input->ReadVarint64(&v)) { + return tag; + } + lod->push_back(v); + } + break; + } + + return false; + } + default: { return false; } + } + } + + return true; +} + +int GRPCVariableResponse::Parse(Source* source) { + ::google::protobuf::io::ZeroCopyInputStream* input_stream = + source->contents(); + ::google::protobuf::io::CodedInputStream input(input_stream); + input.SetTotalBytesLimit(INT_MAX, INT_MAX); + + while (true) { + auto p = input.ReadTagWithCutoff(127); + int tag = GetTagFieldNumber(p.first); + WireType wt = GetTagWireType(p.first); + if (!p.second) { + if (tag != 0) { + return -1; + } + return 0; + } + + switch (tag) { + case sendrecv::VariableMessage::kVarnameFieldNumber: { + uint32_t length; + if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { + return tag; + } + + std::string temp; + if (!input.ReadString(&temp, length)) { + return tag; + } + + meta_.set_varname(temp); + break; + } + case sendrecv::VariableMessage::kTypeFieldNumber: { + uint32_t v; + if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) { + return tag; + } + + meta_.set_type(static_cast<::sendrecv::VarType>(v)); + break; + } + case sendrecv::VariableMessage::kDataTypeFieldNumber: { + uint32_t v = 0; + if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) { + return tag; + } + + meta_.set_data_type(static_cast<::sendrecv::VariableMessage_Type>(v)); + break; + } + case sendrecv::VariableMessage::kDimsFieldNumber: { + // not packed + if (wt == WIRETYPE_VARINT) { + uint64_t v; + if (!input.ReadVarint64(&v)) { + return tag; + } + meta_.add_dims(v); + break; + } + + // packed + if (wt == WIRETYPE_LENGTH_DELIMITED) { + int num_bytes = 0; + if (!input.ReadVarintSizeAsInt(&num_bytes)) { + return tag; + } + int start_pos = input.CurrentPosition(); + while (input.CurrentPosition() - start_pos < num_bytes) { + uint64_t v; + if (!input.ReadVarint64(&v)) { + return tag; + } + meta_.add_dims(v); + } + break; + } + return tag; + } + case sendrecv::VariableMessage::kLodLevelFieldNumber: { + uint64_t v = 0; + if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) { + return tag; + } + meta_.set_lod_level(static_cast(v)); + break; + } + case sendrecv::VariableMessage::kLodFieldNumber: { + int length = 0; + if (wt != WIRETYPE_LENGTH_DELIMITED || + !ReadVarintSizeAsInt(&input, &length)) { + return tag; + } + + std::pair<::google::protobuf::io::CodedInputStream::Limit, int> p = + input.IncrementRecursionDepthAndPushLimit(length); + + std::vector lod_data; + if (p.second < 0 || !ParseLodData(&input, &lod_data)) { + return tag; + } + + if (!input.DecrementRecursionDepthAndPopLimit(p.first)) { + return tag; + } + + if (lod_data.size() == 0) { + break; + } + + auto lod = meta_.add_lod(); + for (uint32_t i = 0; i < lod_data.size(); i++) { + lod->add_lod_data(lod_data[i]); + } + break; + } + case sendrecv::VariableMessage::kSlrHeightFieldNumber: { + uint64_t v = 0; + if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) { + return tag; + } + meta_.set_slr_height(static_cast(v)); + break; + } + case sendrecv::VariableMessage::kSerializedFieldNumber: { + int num_bytes = 0; + if (wt != WIRETYPE_LENGTH_DELIMITED || + !ReadVarintSizeAsInt(&input, &num_bytes)) { + return tag; + } + + if (!ProcSerializedField(tag, &input, num_bytes)) { + return tag; + } + + break; + } + case sendrecv::VariableMessage::kRowsFieldNumber: { + PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS || + meta_.type() == sendrecv::LOD_TENSOR) && + meta_.varname() != "", + "meta info should be got first!"); + + int num_bytes = 0; + if (wt != WIRETYPE_LENGTH_DELIMITED || + !ReadVarintSizeAsInt(&input, &num_bytes)) { + return tag; + } + + if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) { + return tag; + } + break; + } + case sendrecv::VariableMessage::kOutVarnameFieldNumber: { + uint32_t length; + if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { + return tag; + } + + std::string temp; + if (!input.ReadString(&temp, length)) { + return tag; + } + + meta_.set_out_varname(temp); + break; + } + case sendrecv::VariableMessage::kProfileFieldNumber: { + uint64_t profiling = 0; + if (!input.ReadVarint64(&profiling)) { + return tag; + } + meta_.set_profile(profiling); + int64_t listener_id = platform::ListenerId(); + if (listener_id <= 0) { + break; + } + if (profiling == platform::kEnableProfiler && + !platform::IsProfileEnabled()) { + platform::EnableProfiler(platform::ProfilerState::kCPU); + } else if (profiling == platform::kDisableProfiler && + platform::IsProfileEnabled()) { + platform::DisableProfiler( + platform::EventSortingKey::kDefault, + string::Sprintf("%s_%lld", FLAGS_rpc_server_profile_path, + listener_id)); + } + break; + } + case sendrecv::VariableMessage::kTrainerIdFieldNumber: { + uint64_t trainer_id = 0; + if (!input.ReadVarint64(&trainer_id)) { + return tag; + } + meta_.set_trainer_id(trainer_id); + break; + } + case sendrecv::VariableMessage::kTableNameFieldNumber: { + uint32_t length; + if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { + return tag; + } + + std::string temp; + if (!input.ReadString(&temp, length)) { + return tag; + } + + meta_.set_table_name(temp); + break; + } + default: { + // Unknown tag, return unknown error. + return -1; + } + } + } + + return 0; +} + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h new file mode 100644 index 00000000..3ca1d89f --- /dev/null +++ b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h @@ -0,0 +1,56 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type.h" + +#include "google/protobuf/io/coded_stream.h" +#include "google/protobuf/io/zero_copy_stream.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" +#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h" +#include "paddle/fluid/operators/distributed/variable_response.h" + +namespace paddle { +namespace operators { +namespace distributed { + +class GRPCVariableResponse : public VariableResponse { + public: + GRPCVariableResponse(const framework::Scope* scope, + const platform::DeviceContext* dev_ctx, + bool create_scope = false) + : VariableResponse(scope, dev_ctx, create_scope) {} + + virtual ~GRPCVariableResponse() {} + + int Parse(Source* source) override; + + // return: + // 0:ok. + // -1: unkown error. + // other: number of error field. + int Parse(const ::grpc::ByteBuffer& byte_buffer); +}; + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc new file mode 100644 index 00000000..0e8d877e --- /dev/null +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -0,0 +1,241 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include "paddle/fluid/operators/distributed/parameter_prefetch.h" + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor.h" + +#include "paddle/fluid/operators/distributed/distributed.h" +#include "paddle/fluid/operators/distributed/rpc_client.h" +#include "paddle/fluid/operators/distributed/variable_response.h" +#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" + +namespace paddle { +namespace operators { +namespace distributed { + +using LoDTensor = framework::LoDTensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; +using DDim = framework::DDim; + +static std::vector> SplitIds( + const std::vector& ids_vector, + const std::vector& height_section) { + std::set all_ids; + for (auto id : ids_vector) { + all_ids.insert(id); + } + + auto abs_sections = ToAbsoluteSection(height_section); + std::vector> splited_ids; + splited_ids.resize(height_section.size() + 1); + for (auto& id : all_ids) { + auto section_index = GetSectionIndex(id, abs_sections); + splited_ids[section_index].push_back(id - abs_sections[section_index]); + } + return splited_ids; +} + +static void SplitIdsIntoMultipleVarsBySection( + const std::vector& in_var_names, + const std::vector& height_section, + const std::vector>& splited_ids, + framework::Scope* scope) { + PADDLE_ENFORCE_EQ(in_var_names.size(), height_section.size(), ""); + + auto place = platform::CPUPlace(); + + for (size_t i = 0; i < in_var_names.size(); ++i) { + auto* id_tensor = + scope->Var(in_var_names[i])->GetMutable(); + auto& ids = splited_ids[i]; + if (!ids.empty()) { + auto* id_tensor_data = id_tensor->mutable_data( + framework::make_ddim({static_cast(ids.size()), 1}), place); + memcpy(id_tensor_data, ids.data(), sizeof(int64_t) * ids.size()); + } + } +} + +static void MergeMultipleVarsIntoOneBySection( + const std::string& id_name, const std::vector& ids_vector, + const std::string& out_name, const std::vector& out_var_names, + const std::vector& height_section, + const std::vector>& splited_ids, + const framework::ExecutionContext& context, framework::Scope* scope, + platform::DeviceContext* actual_ctx) { + PADDLE_ENFORCE_EQ(out_var_names.size(), height_section.size(), ""); + + auto cpu_place = platform::CPUPlace(); + + auto abs_sections = ToAbsoluteSection(height_section); + std::unordered_map> id_to_offset; + for (size_t i = 0; i < ids_vector.size(); ++i) { + id_to_offset[ids_vector[i]].push_back(i); + } + + auto& id_tensor = scope->FindVar(id_name)->Get(); + auto* out_tensor = + scope->FindVar(out_name)->GetMutable(); + + PADDLE_ENFORCE_GT( + out_tensor->numel(), 0, + "When calling this method, the LoDTensor's numel must larger than zero. " + "Please check LoDTensor::Resize has been called first."); + + auto* out_tensor_data = out_tensor->mutable_data(id_tensor.place()); + + bool is_on_cpu_place = true; + if (!platform::is_cpu_place(id_tensor.place())) { + is_on_cpu_place = false; + } + + for (size_t section_idx = 0; section_idx < out_var_names.size(); + ++section_idx) { + auto& ids_in_this_section = splited_ids[section_idx]; + if (!ids_in_this_section.empty()) { + auto& prefetch_out_var = + scope->Var(out_var_names[section_idx])->Get(); + const auto* out_var_data = prefetch_out_var.data(); + auto& dims = prefetch_out_var.dims(); + + PADDLE_ENFORCE_EQ(dims.size(), 2, ""); + PADDLE_ENFORCE_EQ(ids_in_this_section.size(), dims[0]); + + auto row_numel = dims[1]; + + for (int64_t i = 0; i < dims[0]; ++i) { + auto id = ids_in_this_section[i]; + auto origin_id = id + abs_sections[section_idx]; + auto& offsets = id_to_offset[origin_id]; + for (auto& offset : offsets) { + // should support GPU tensor + if (is_on_cpu_place) { + memory::Copy(cpu_place, out_tensor_data + offset * row_numel, + cpu_place, out_var_data + i * row_numel, + sizeof(float) * row_numel); + } else { +#ifndef PADDLE_WITH_CUDA + PADDLE_THROW("paddle is not compiled with CUDA!"); +#else + auto stream = + static_cast(actual_ctx)->stream(); + memory::Copy(boost::get(id_tensor.place()), + out_tensor_data + offset * row_numel, cpu_place, + out_var_data + i * row_numel, + sizeof(float) * row_numel, stream); +#endif + } + } + } + } else { + VLOG(3) << "ids in this section is empty"; + } + } +} + +void prefetch(const std::string& id_name, const std::string& out_name, + const std::vector& table_names, + const std::vector& epmap, + const std::vector& height_sections, + const framework::ExecutionContext& context, + const framework::Scope& scope) { + std::unique_ptr local_scope = scope.NewTmpScope(); + + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& cpu_ctx = *pool.Get(platform::CPUPlace()); + auto& actual_ctx = *pool.Get(context.GetPlace()); + + distributed::RPCClient* rpc_client = + distributed::RPCClient::GetInstance( + context.Attr("trainer_id")); + + std::vector in_var_names; + std::vector out_var_names; + for (size_t i = 0; i < epmap.size(); ++i) { + in_var_names.push_back(id_name + "@" + epmap[i]); + out_var_names.push_back(out_name + "@" + epmap[i]); + } + + auto& id_tensor = scope.FindVar(id_name)->Get(); + std::vector ids_vector; + if (platform::is_cpu_place(id_tensor.place())) { + auto* id_data = id_tensor.data(); + for (int64_t i = 0; i < id_tensor.numel(); ++i) { + ids_vector.push_back(id_data[i]); + } + } else { +#ifndef PADDLE_WITH_CUDA + PADDLE_THROW("paddle is not compiled with CUDA!"); +#else + auto cpu_place = platform::CPUPlace(); + framework::LoDTensor cpu_tensor; + auto* cpu_tensor_data = + cpu_tensor.mutable_data(id_tensor.dims(), cpu_place); + auto stream = + static_cast(&actual_ctx)->stream(); + memory::Copy(cpu_place, cpu_tensor_data, + boost::get(id_tensor.place()), + id_tensor.data(), sizeof(int64_t) * id_tensor.numel(), + stream); + for (int64_t i = 0; i < cpu_tensor.numel(); ++i) { + ids_vector.push_back(cpu_tensor_data[i]); + } +#endif + } + + auto splited_ids = SplitIds(ids_vector, height_sections); + SplitIdsIntoMultipleVarsBySection(in_var_names, height_sections, splited_ids, + local_scope.get()); + + // create output var in local scope + for (auto& name : out_var_names) { + local_scope->Var(name)->GetMutable(); + } + + std::vector rets; + for (size_t i = 0; i < in_var_names.size(); i++) { + if (NeedSend(*local_scope.get(), in_var_names[i])) { + VLOG(3) << "sending " << in_var_names[i] << " to " << epmap[i] + << " to get " << out_var_names[i] << " back"; + rets.push_back(rpc_client->AsyncPrefetchVar( + epmap[i], cpu_ctx, *local_scope.get(), in_var_names[i], + out_var_names[i], table_names[i])); + } else { + VLOG(3) << "don't send no-initialied variable: " << out_var_names[i]; + } + } + + for (size_t i = 0; i < rets.size(); i++) { + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + } + + MergeMultipleVarsIntoOneBySection(id_name, ids_vector, out_name, + out_var_names, height_sections, splited_ids, + context, local_scope.get(), &actual_ctx); +} + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.h b/paddle/fluid/operators/distributed/parameter_prefetch.h new file mode 100644 index 00000000..0429ec44 --- /dev/null +++ b/paddle/fluid/operators/distributed/parameter_prefetch.h @@ -0,0 +1,83 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { +namespace distributed { + +void prefetch(const std::string& id_name, const std::string& out_name, + const std::vector& table_names, + const std::vector& epmap, + const std::vector& height_sections, + const framework::ExecutionContext& context, + const framework::Scope& scope); + +template +void prefetch_with_reconstruct(const std::string& id_name, + const std::string& out_name, + const std::vector& table_names, + const std::vector& epmap, + const std::vector& height_sections, + const framework::ExecutionContext& context, + const framework::Scope& scope, + framework::LoDTensor* original) { + prefetch(id_name, out_name, table_names, epmap, height_sections, context, + scope); + auto& out = scope.FindVar(out_name)->Get(); + auto& ids = scope.FindVar(id_name)->Get(); + auto* original_value = original->data(); + auto* out_value = out.data(); + size_t original_width = original->numel() / original->dims()[0]; + + bool is_on_cpu_place = true; + if (!platform::is_cpu_place(ids.place())) { + is_on_cpu_place = false; + } + if (is_on_cpu_place) { + for (int64_t i = 0; i < ids.numel(); i++) { + const T* out_rows = out_value + original_width * i; + T* original_row = + original_value + original_width * ids.data()[i]; + std::memcpy(original_row, out_rows, original_width * sizeof(T)); + } + } else { +#ifndef PADDLE_WITH_CUDA + PADDLE_THROW("paddle is not compiled with CUDA!"); +#else + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& actual_ctx = *pool.Get(context.GetPlace()); + for (int64_t i = 0; i < ids.numel(); i++) { + const T* out_rows = out_value + original_width * i; + T* original_row = + original_value + original_width * ids.data()[i]; + auto stream = + static_cast(&actual_ctx)->stream(); + memory::Copy(boost::get(ids.place()), original_row, + platform::CPUPlace(), out_rows, original_width * sizeof(T), + stream); + } +#endif + } +} + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc new file mode 100644 index 00000000..da73167a --- /dev/null +++ b/paddle/fluid/operators/distributed/parameter_recv.cc @@ -0,0 +1,138 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "paddle/fluid/operators/distributed/parameter_recv.h" + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor.h" + +#include "paddle/fluid/operators/distributed/distributed.h" +#include "paddle/fluid/operators/distributed/rpc_client.h" +#include "paddle/fluid/operators/distributed/variable_response.h" +#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" +#include "paddle/fluid/operators/strided_memcpy.h" + +namespace paddle { +namespace operators { +namespace distributed { + +using LoDTensor = framework::LoDTensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; +using DDim = framework::DDim; + +template +void ParameterRecv::operator()(const RpcContext &rpc_ctx, + const framework::Scope &scope) { + VLOG(3) << "ParameterRecv in " << rpc_ctx.var_name; + std::unique_ptr local_scope = scope.NewTmpScope(); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &cpu_ctx = *pool.Get(platform::CPUPlace()); + + distributed::RPCClient *rpc_client = + distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); + + auto *recv_var = scope.FindVar(rpc_ctx.var_name); + + // recv all vars to local scope + if (recv_var->IsType()) { + std::vector rets; + for (size_t i = 0; i < rpc_ctx.splited_var_names.size(); i++) { + auto &recv_var_name = rpc_ctx.splited_var_names[i]; + local_scope->Var(recv_var_name); + VLOG(3) << "recv " << recv_var_name << " from " << rpc_ctx.epmap[i]; + rets.push_back(rpc_client->AsyncGetVar(rpc_ctx.epmap[i], cpu_ctx, + *local_scope.get(), recv_var_name, + recv_var_name)); + } + for (size_t i = 0; i < rets.size(); i++) { + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + } + } else { + PADDLE_THROW("unsupported var type to recv!"); + } + + // concat recved tensor into one var + { + size_t output_offset = 0; + size_t row_offset = 0; + framework::Tensor *recv_tensor = + recv_var->GetMutable(); + auto dev_ctx = paddle::platform::CPUDeviceContext(); + int64_t recv_numel = 0; + for (auto &recv_var_name : rpc_ctx.splited_var_names) { + auto *recv_var = local_scope->FindVar(recv_var_name); + if (recv_var->IsType()) { + auto &in = recv_var->Get(); + recv_numel += in.numel(); + auto in_stride = framework::stride_numel(in.dims()); + auto out_stride = framework::stride_numel(recv_tensor->dims()); + StridedNumelCopyWithAxis( + dev_ctx, 0, recv_tensor->data() + output_offset, out_stride, + in.data(), in_stride, in_stride[0]); + output_offset += in_stride[0]; + } else if (recv_var->IsType()) { + auto &recv_slr = recv_var->Get(); + auto &recv_dims = recv_tensor->dims(); + int64_t width = recv_dims[1]; + recv_numel += recv_slr.height() * width; + PADDLE_ENFORCE_EQ(recv_slr.value().dims()[1], width); + PADDLE_ENFORCE_EQ(recv_slr.value().dims()[0], recv_slr.rows().size()); + VLOG(3) << "recv slr " << recv_var_name << " dims " + << recv_slr.value().dims(); + if (VLOG_IS_ON(3)) { + std::ostringstream sstream; + sstream << "["; + for (auto &row_id : recv_slr.rows()) { + sstream << row_id << ", "; + } + sstream << "]"; + VLOG(3) << "recv_slr size: " << recv_slr.rows().size() << " " + << sstream.str(); + } + + for (auto i = 0; i < recv_slr.rows().size(); ++i) { + auto row_id = recv_slr.rows()[i] + row_offset; + PADDLE_ENFORCE_LT(row_id, recv_dims[0]); + memcpy(recv_tensor->data() + row_id * width, + recv_slr.value().data() + i * width, sizeof(T) * width); + } + row_offset += recv_slr.height(); + } else { + PADDLE_THROW("unsupported recieved var type"); + } + } + auto numel = recv_tensor->numel(); + if (recv_numel != numel) { + LOG(FATAL) << "recv_numel: " << recv_numel << " acture numel: " << numel; + } + PADDLE_ENFORCE_EQ(recv_numel, numel); + } + + VLOG(3) << "ParameterRecv out " << rpc_ctx.var_name; +} + +template struct ParameterRecv; + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_recv.h b/paddle/fluid/operators/distributed/parameter_recv.h new file mode 100644 index 00000000..e955fca7 --- /dev/null +++ b/paddle/fluid/operators/distributed/parameter_recv.h @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/distributed/rpc_common.h" + +namespace paddle { +namespace operators { +namespace distributed { + +template +struct ParameterRecv { + void operator()(const RpcContext &rpc_ctx, const framework::Scope &scope); +}; + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc new file mode 100644 index 00000000..dfabad56 --- /dev/null +++ b/paddle/fluid/operators/distributed/parameter_send.cc @@ -0,0 +1,175 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "paddle/fluid/operators/distributed/parameter_send.h" + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor.h" + +#include "paddle/fluid/operators/distributed/distributed.h" +#include "paddle/fluid/operators/distributed/rpc_client.h" +#include "paddle/fluid/operators/distributed/variable_response.h" +#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" + +namespace paddle { +namespace operators { +namespace distributed { + +using LoDTensor = framework::LoDTensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; +using DDim = framework::DDim; + +template +void ParameterSend::operator()(const RpcContext &rpc_ctx, + const framework::Scope &scope, bool sync) { + std::unique_ptr local_scope = scope.NewTmpScope(); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &cpu_ctx = *pool.Get(platform::CPUPlace()); + + distributed::RPCClient *rpc_client = + distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); + + auto *send_var = scope.FindVar(rpc_ctx.var_name); + size_t out_num = rpc_ctx.splited_var_names.size(); + if (send_var->IsType()) { + if (out_num > 1) { + auto &send_tensor = send_var->Get(); + auto &send_tensor_dims = send_tensor.dims(); + std::vector outs_dims; + outs_dims.reserve(out_num); + + // infer output shape + PADDLE_ENFORCE_EQ(rpc_ctx.height_sections.size(), out_num, + "tensor split sections size" + "should be equal to output size."); + for (size_t i = 0; i < out_num; ++i) { + auto dim = send_tensor_dims; + dim[0] = rpc_ctx.height_sections[i]; + outs_dims.push_back(dim); + } + + // create output var in local scope + size_t row_offset = 0; + for (auto i = 0; i < out_num; ++i) { + framework::Tensor *out = local_scope->Var(rpc_ctx.splited_var_names[i]) + ->GetMutable(); + *out = send_tensor.Slice(row_offset, row_offset + outs_dims[i][0]); + row_offset += outs_dims[i][0]; + } + } + } else if (send_var->IsType()) { + auto &send_slr = send_var->Get(); + auto abs_sections = ToAbsoluteSection(rpc_ctx.height_sections); + + auto &send_rows = send_slr.rows(); + std::vector> outs_rows_idx; + std::vector> outs_dense_idx; + + outs_rows_idx.resize(out_num); + outs_dense_idx.resize(out_num); + + auto row_numel = send_slr.value().numel() / send_slr.value().dims()[0]; + auto *src = send_slr.value().data(); + + // create output var in local scope + std::vector outs; + for (auto &name : rpc_ctx.splited_var_names) { + auto *out = local_scope->Var(name)->GetMutable(); + outs.push_back(out); + } + + // split rows index into output sparse vars + for (size_t i = 0; i < send_rows.size(); ++i) { + size_t out_idx = GetSectionIndex(send_rows[i], abs_sections); + outs_rows_idx[out_idx].push_back(send_rows[i]); + outs_dense_idx[out_idx].push_back(i); + } + auto place = platform::CPUPlace(); + + for (size_t i = 0; i < outs_rows_idx.size(); ++i) { + auto rows_idx = outs_rows_idx[i]; + outs[i]->set_height(rpc_ctx.height_sections[i]); + auto dims = send_slr.GetCompleteDims(); + dims[0] = rows_idx.size(); + outs[i]->mutable_rows()->clear(); + outs[i]->mutable_value()->mutable_data(dims, send_slr.place()); + if (rows_idx.size() > 0) { + for (auto idx : rows_idx) { + outs[i]->mutable_rows()->push_back(idx - abs_sections[i]); + } + auto dst = outs[i]->mutable_value()->mutable_data(place); + for (size_t j = 0; j < rows_idx.size(); j++) { + if (platform::is_cpu_place(place)) { + memory::Copy( + platform::CPUPlace(), dst + j * row_numel, platform::CPUPlace(), + src + outs_dense_idx[i][j] * row_numel, sizeof(T) * row_numel); + } else { + PADDLE_THROW("do not support GPU now"); + /* + #ifdef PADDLE_WITH_CUDA + auto stream = ctx.cuda_device_context().stream(); + memory::Copy(platform::CUDAPlace(), dst + j * row_numel, + platform::CUDAPlace(), + src + outs_dense_idx[i][j] * row_numel, + sizeof(T) * row_numel, stream); + #else + PADDLE_THROW("Paddle is not compiled with GPU"); + #endif + */ + } + } + } + PADDLE_ENFORCE_EQ(rows_idx.size(), outs[i]->rows().size(), + "rows should has the same size with tensor dim 0"); + } + + } else { + PADDLE_THROW("unsupported var type to send!"); + } + + std::vector rets; + for (size_t i = 0; i < rpc_ctx.splited_var_names.size(); i++) { + auto &send_var_name = rpc_ctx.splited_var_names[i]; + auto &endpoint = rpc_ctx.epmap[i]; + if (NeedSend(*local_scope.get(), send_var_name)) { + VLOG(3) << "sending " << send_var_name << " to " << endpoint; + rets.push_back(rpc_client->AsyncSendVar( + endpoint, cpu_ctx, *local_scope.get(), send_var_name)); + } else { + VLOG(3) << "don't send non-initialized variable: " + << rpc_ctx.splited_var_names[i]; + } + } + + if (sync) { + for (auto &handle : rets) { + PADDLE_ENFORCE(handle->Wait(), "internal error in RPCClient"); + } + } +} + +template struct ParameterSend; + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_send.h b/paddle/fluid/operators/distributed/parameter_send.h new file mode 100644 index 00000000..9077f4a4 --- /dev/null +++ b/paddle/fluid/operators/distributed/parameter_send.h @@ -0,0 +1,35 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/distributed/rpc_common.h" + +namespace paddle { +namespace operators { +namespace distributed { + +template +struct ParameterSend { + void operator()(const RpcContext &rpc_ctx, const framework::Scope &scope, + bool sync); +}; + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/proto_encoder_helper.h b/paddle/fluid/operators/distributed/proto_encoder_helper.h new file mode 100644 index 00000000..e9f06f54 --- /dev/null +++ b/paddle/fluid/operators/distributed/proto_encoder_helper.h @@ -0,0 +1,153 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +// NOTE: This file was originally created by tensorflow +// (https://github.com/tensorflow/tensorflow/) we borrow this +// file and did some modifications so that we can send gRPC +// requests without too much copying of the tensor data. + +#pragma once + +#include + +#include "grpc++/grpc++.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace distributed { + +char* EncodeVarint32(char* dst, uint32_t v) { + // Operate on characters as unsigneds + unsigned char* ptr = reinterpret_cast(dst); + static const int B = 128; + if (v < (1 << 7)) { + *(ptr++) = v; + } else if (v < (1 << 14)) { + *(ptr++) = v | B; + *(ptr++) = v >> 7; + } else if (v < (1 << 21)) { + *(ptr++) = v | B; + *(ptr++) = (v >> 7) | B; + *(ptr++) = v >> 14; + } else if (v < (1 << 28)) { + *(ptr++) = v | B; + *(ptr++) = (v >> 7) | B; + *(ptr++) = (v >> 14) | B; + *(ptr++) = v >> 21; + } else { + *(ptr++) = v | B; + *(ptr++) = (v >> 7) | B; + *(ptr++) = (v >> 14) | B; + *(ptr++) = (v >> 21) | B; + *(ptr++) = v >> 28; + } + return reinterpret_cast(ptr); +} + +char* EncodeVarint64(char* dst, uint64_t v) { + static const int B = 128; + unsigned char* ptr = reinterpret_cast(dst); + while (v >= B) { + *(ptr++) = (v & (B - 1)) | B; + v >>= 7; + } + *(ptr++) = static_cast(v); + return reinterpret_cast(ptr); +} + +int VarintLength(uint64_t v) { + int len = 1; + while (v >= 128) { + v >>= 7; + len++; + } + return len; +} + +class ProtoEncodeHelper { + public: + ProtoEncodeHelper(char* buf, int max_size) + : base_(buf), p_(buf), limit_(base_ + max_size) {} + + ~ProtoEncodeHelper() { +#define REPLACE_ENFORCE_GLOG 1 + // Make sure callers didn't do operations that went over max_size promised + if (paddle::platform::is_error(p_ <= limit_)) { + paddle::platform::throw_on_error(p_ <= limit_, ""); + } +#undef REPLACE_ENFORCE_GLOG + } + + const char* data() const { return base_; } + size_t size() const { return p_ - base_; } + + void WriteUint64(int tag, uint64_t v) { + Encode32(combine(tag, WIRETYPE_VARINT)); + Encode64(v); + } + void WriteBool(int tag, bool v) { + Encode32(combine(tag, WIRETYPE_VARINT)); + EncodeBool(v); + } + void WriteString(int tag, const std::string& v) { + Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED)); + Encode32(v.size()); + EncodeBytes(v.data(), v.size()); + } + void WriteVarlengthBeginning(int tag, uint32_t len) { + Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED)); + Encode32(len); + } + void WriteRawBytes(const std::string& v) { EncodeBytes(v.data(), v.size()); } + + private: + // Note: this module's behavior must match the protocol buffer wire encoding + // format. + enum { + WIRETYPE_VARINT = 0, + WIRETYPE_LENGTH_DELIMITED = 2, + }; + static uint32_t combine(uint32_t tag, uint32_t type) { + return ((tag << 3) | type); + } + inline void Encode32(uint32_t v) { + if (v < 128) { + // Fast path for single-byte values. Many of the calls will use a + // constant value for v, so the comparison will get optimized away + // when Encode32 is inlined into the caller. + *p_ = v; + p_++; + } else { + p_ = EncodeVarint32(p_, v); + } + } + void Encode64(uint64_t v) { p_ = EncodeVarint64(p_, v); } + void EncodeBool(bool v) { + *p_ = (v ? 1 : 0); // Equal to varint32 encoding of 0 or 1 + p_++; + } + void EncodeBytes(const char* bytes, int N) { + memcpy(p_, bytes, N); + p_ += N; + } + + char* base_; + char* p_; + char* limit_; // Just for CHECKs +}; + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h new file mode 100644 index 00000000..de8f3018 --- /dev/null +++ b/paddle/fluid/operators/distributed/request_handler.h @@ -0,0 +1,244 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include // NOLINT + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace operators { +namespace distributed { + +constexpr char kRequestSend[] = "RequestSend"; +constexpr char kRequestGet[] = "RequestGet"; +constexpr char kRequestGetMonomerVariable[] = "RequestGetMonomerVariable"; +constexpr char kRequestGetMonomerBarrier[] = "RequestGetMonomerBarrier"; +constexpr char kRequestPrefetch[] = "RequestPrefetch"; +constexpr char kRequestCheckpoint[] = "RequestCheckpoint"; +constexpr char kRequestPassBarrier[] = "RequestPassBarrier"; +constexpr char kRequestGetNoBarrier[] = "GetVariableNoBarrier"; + +constexpr char kSendRPC[] = "SendRPC"; +constexpr char kGetRPC[] = "GetRPC"; +constexpr char kGetNoBarrierRPC[] = "GetNoBarrierRPC"; +constexpr char kGetMonomerRPC[] = "GetMonomerRPC"; +constexpr char kPrefetchRPC[] = "PrefetchRPC"; +constexpr char kBatchBarrierRPC[] = "BatchBarrierRPC"; +constexpr char kFetchBarrierRPC[] = "FetchBarrierRPC"; +constexpr char kSendMonomerFetchBarrierRPC[] = "SendMonomerFetchBarrierRPC"; +constexpr char kSendCompleteRPC[] = "SendCompleteRPC"; +constexpr char kCheckPointNotifyRPC[] = "CheckPointNotifyRPC"; + +#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" +#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" +#define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV" +#define COMPLETE_MESSAGE "COMPLETE@RECV" +#define WITHOUT_BARRIER_MESSAGE "@WITHOUT_BARRIER@RECV" + +#define CHECKPOINT_SAVE_MESSAGE "SAVE@CHECKPOINTNOTIFY" +#define CHECKPOINT_LOAD_MESSAGE "LOAD@CHECKPOINTNOTIFY" + +class RPCServer; + +class VarHandle { + public: + VarHandle(const std::string ep, const std::string& method, + const std::string& name, + const platform::DeviceContext* p_ctx = nullptr, + const framework::Scope* p_scope = nullptr) + : status_(kDefaultState) { + ep_ = ep; + ctx_ = p_ctx; + scope_ = p_scope; + name_ = name; + method_ = method; + } + + virtual ~VarHandle() {} + + public: + bool Wait() { + int ret = kDefaultState; + { + std::unique_lock lk(sync_mutex_); + wait_cond_.wait(lk, [this] { return status_ != kDefaultState; }); + ret = status_; + } + VLOG(7) << "VarHandle wait:" << ret; + return ret != kErrorState; + } + + void Finish(bool ok) { + { + std::unique_lock lk(sync_mutex_); + status_ = ok ? kFinishState : kErrorState; + } + VLOG(7) << "VarHandle finish:" << ok; + wait_cond_.notify_all(); + } + + std::string String() const { + std::ostringstream s; + s << method_ << " name:[" << name_ << "], ep:[" << ep_ << "], status:[" + << status_ << "]"; + return s.str(); + } + + std::string ep() const { return ep_; } + const platform::DeviceContext* ctx() const { return ctx_; } + const framework::Scope* scope() const { return scope_; } + std::string name() const { return name_; } + std::string method() const { return method_; } + + protected: + // RPC endpoint. + std::string ep_; + const platform::DeviceContext* ctx_; + const framework::Scope* scope_; + // Variable name. + std::string name_; + // RPC method name. + std::string method_; + + protected: + std::mutex sync_mutex_; + std::condition_variable wait_cond_; + + enum VarHandleStatus { + kDefaultState = -1, + kErrorState = 0, + kFinishState = 1, + }; + VarHandleStatus status_; + + private: + DISABLE_COPY_AND_ASSIGN(VarHandle); +}; + +typedef std::shared_ptr VarHandlePtr; + +class RequestHandler { + public: + explicit RequestHandler(bool sync_mode) + : sync_mode_(sync_mode), + dev_ctx_(nullptr), + executor_(nullptr), + scope_(nullptr), + program_(nullptr), + rpc_server_(nullptr) {} + + virtual ~RequestHandler() {} + + // Set attributes. + void SetScope(framework::Scope* scope) { scope_ = scope; } + void SetDevCtx(const platform::DeviceContext* dev_ctx) { dev_ctx_ = dev_ctx; } + void SetProgram(framework::ProgramDesc* program) { program_ = program; } + void SetExecutor(framework::Executor* executor) { executor_ = executor; } + + // Used for dist lookup table prefetch + void SetPrefetchPreparedCtx( + std::unordered_map< + std::string, std::shared_ptr>* g) { + prefetch_var_name_to_prepared_ctx_ = g; + } + + void SetCheckpointNotifyPreparedCtx( + std::shared_ptr g) { + checkpoint_prepared_ctx_ = g; + } + + // Used for async. + void SetGradToPreparedCtx( + std::unordered_map< + std::string, std::shared_ptr>* g) { + grad_to_prepared_ctx_ = g; + } + + void SetSparseGradToParam(std::unordered_map* g) { + sparse_grad_to_param_ = g; + } + + void SetRPCServer(RPCServer* rpc_server) { rpc_server_ = rpc_server; } + + // Get attributes. + bool sync_mode() { return sync_mode_; } + framework::Scope* scope() { return scope_; } + const platform::DeviceContext* dev_ctx() { return dev_ctx_; } + framework::ProgramDesc* program() { return program_; } + framework::Executor* executor() { return executor_; } + + // This function processes user's rpc request. + // The implemention is in request_handler_impl. + // example: + // std::string varname = request_.varname(); + // + // auto scope = request_handler_->scope(); + // auto invar = scope->FindVar(varname); + // framework::Variable* outvar = nullptr; + // + // request_handler_->Handle(varname, scope, invar, &outvar); + // if (outvar) { + // SerializeToByteBuffer(varname, outvar, + // *request_handler_->dev_ctx(), &reply_); + // } + virtual bool Handle(const std::string& varname, framework::Scope* scope, + framework::Variable* var, framework::Variable** outvar, + const int trainer_id, + const std::string& out_var_name = "", + const std::string& table_name = "") = 0; + + protected: + const bool sync_mode_; + + const platform::DeviceContext* dev_ctx_; + framework::Executor* executor_; + framework::Scope* scope_; + framework::ProgramDesc* program_; + + // used for distribute lookup table prefetch + std::unordered_map>* + prefetch_var_name_to_prepared_ctx_; + // used for checkpoint notify + std::shared_ptr checkpoint_prepared_ctx_; + + // Used for async. + std::unordered_map>* + grad_to_prepared_ctx_; + std::unordered_map* sparse_grad_to_param_; + + RPCServer* rpc_server_; +}; + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc new file mode 100644 index 00000000..876b764a --- /dev/null +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -0,0 +1,233 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/distributed/request_handler_impl.h" +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h" +#include "paddle/fluid/operators/distributed/rpc_server.h" +#include "paddle/fluid/string/piece.h" +#include "paddle/fluid/string/printf.h" + +namespace paddle { +namespace operators { +namespace distributed { + +// define LOOKUP_TABLE_PATH for checkpoint notify to save lookup table variables +// to directory specified. +constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath"; + +bool RequestSendHandler::Handle(const std::string& varname, + framework::Scope* scope, + framework::Variable* invar, + framework::Variable** outvar, + const int trainer_id, + const std::string& out_var_name, + const std::string& table_name) { + VLOG(4) << "RequestSendHandler:" << varname; + + // Sync + if (varname == BATCH_BARRIER_MESSAGE) { + VLOG(3) << "sync: recv BATCH_BARRIER_MESSAGE"; + rpc_server_->IncreaseBatchBarrier(kRequestSend); + } else if (varname == COMPLETE_MESSAGE) { + VLOG(3) << "sync: recv complete message"; + rpc_server_->Complete(); + } else { + // Async + if (!sync_mode_) { + VLOG(3) << "async process var: " << varname; + if (varname == BATCH_BARRIER_MESSAGE) { + PADDLE_THROW( + "async mode should not recv BATCH_BARRIER_MESSAGE or " + "COMPLETE_MESSAGE"); + } + if (AsyncSparseParamUpdateRecorder::GetInstance()->HasGrad(varname)) { + auto& grad_slr = + scope->FindVar(varname)->Get(); + AsyncSparseParamUpdateRecorder::GetInstance()->Update(varname, + grad_slr.rows()); + } + executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), + scope); + return true; + } else { // sync + rpc_server_->WaitCond(kRequestSend); + VLOG(3) << "sync: processing received var: " << varname; + + if (invar == nullptr) { + LOG(FATAL) << "sync: Can not find server side var: " << varname; + return false; + } + } + } + return true; +} + +bool RequestGetHandler::Handle(const std::string& varname, + framework::Scope* scope, + framework::Variable* invar, + framework::Variable** outvar, + const int trainer_id, + const std::string& out_var_name, + const std::string& table_name) { + VLOG(3) << "RequestGetHandler:" << varname + << " out_var_name: " << out_var_name << " trainer_id: " << trainer_id + << " table_name: " << table_name; + + if (sync_mode_) { + if (varname == FETCH_BARRIER_MESSAGE) { + VLOG(3) << "sync: recv fetch barrier message"; + rpc_server_->IncreaseBatchBarrier(kRequestGet); + } else { + rpc_server_->WaitCond(kRequestGet); + *outvar = scope_->FindVar(varname); + } + } else { + if (varname != FETCH_BARRIER_MESSAGE && varname != COMPLETE_MESSAGE) { + if (enable_dc_asgd_) { + // NOTE: the format is determined by distribute_transpiler.py + std::string param_bak_name = + string::Sprintf("%s.trainer_%d_bak", varname, trainer_id); + VLOG(3) << "getting " << param_bak_name << " trainer_id " << trainer_id; + auto var = scope_->FindVar(varname); + auto t_orig = var->Get(); + auto param_bak = scope_->Var(param_bak_name); + auto t = param_bak->GetMutable(); + t->mutable_data(dev_ctx_->GetPlace(), t_orig.type()); + VLOG(3) << "copying " << varname << " to " << param_bak_name; + framework::TensorCopy(t_orig, dev_ctx_->GetPlace(), t); + } + if (AsyncSparseParamUpdateRecorder::GetInstance()->HasParam(varname) && + !table_name.empty()) { + std::vector updated_rows; + AsyncSparseParamUpdateRecorder::GetInstance()->GetAndClear( + varname, trainer_id, &updated_rows); + if (VLOG_IS_ON(3)) { + std::ostringstream sstream; + sstream << "["; + for (auto& row_id : updated_rows) { + sstream << row_id << ", "; + } + sstream << "]"; + VLOG(3) << "updated_rows size: " << updated_rows.size() << " " + << sstream.str(); + } + auto& origin_tensor = + scope_->FindVar(varname)->Get(); + auto* origin_tensor_data = origin_tensor.data(); + auto& dims = origin_tensor.dims(); + *outvar = scope->Var(); + auto* out_slr = (*outvar)->GetMutable(); + out_slr->set_rows(updated_rows); + out_slr->set_height(dims[0]); + auto out_dims = framework::make_ddim( + {static_cast(updated_rows.size()), dims[1]}); + auto* data = out_slr->mutable_value()->mutable_data( + out_dims, origin_tensor.place()); + auto width = dims[1]; + for (auto i = 0; i < updated_rows.size(); ++i) { + PADDLE_ENFORCE_LT(updated_rows[i], dims[0]); + memcpy(data + i * width, origin_tensor_data + updated_rows[i] * width, + sizeof(float) * width); + } + } else { + *outvar = scope_->FindVar(varname); + } + } + } + return true; +} + +bool RequestGetNoBarrierHandler::Handle(const std::string& varname, + framework::Scope* scope, + framework::Variable* invar, + framework::Variable** outvar, + const int trainer_id, + const std::string& out_var_name, + const std::string& table_name) { + VLOG(4) << "RequestGetNoBarrierHandler:" << varname + << " out_var_name: " << out_var_name; + + // get var from pserver immediately without barriers + string::Piece without_barrier_piece(WITHOUT_BARRIER_MESSAGE); + string::Piece var_name_piece = string::Piece(varname); + + if (string::Contains(var_name_piece, without_barrier_piece)) { + var_name_piece = string::TrimSuffix(var_name_piece, without_barrier_piece); + VLOG(4) << "Get var " << var_name_piece << " with " + << WITHOUT_BARRIER_MESSAGE; + *outvar = scope_->FindVar(var_name_piece.ToString()); + return true; + } else { + PADDLE_THROW("GetNoBarrier must contain %s", WITHOUT_BARRIER_MESSAGE); + } + return true; +} + +bool RequestPrefetchHandler::Handle(const std::string& varname, + framework::Scope* scope, + framework::Variable* invar, + framework::Variable** outvar, + const int trainer_id, + const std::string& out_var_name, + const std::string& table_name) { + VLOG(4) << "RequestPrefetchHandler " << varname; + + if (table_name.empty()) { + auto var_desc = program_->Block(0).FindVar(out_var_name); + InitializeVariable(*outvar, var_desc->GetType()); + executor_->RunPreparedContext( + (*prefetch_var_name_to_prepared_ctx_)[varname].get(), scope); + } else { + (*outvar)->GetMutable(); + auto lookup_table_op = + BuildLookupTableOp(table_name, varname, out_var_name); + paddle::platform::CPUPlace cpu_place; + lookup_table_op->Run(*scope, cpu_place); + } + return true; +} + +bool RequestCheckpointHandler::Handle(const std::string& varname, + framework::Scope* scope, + framework::Variable* invar, + framework::Variable** outvar, + const int trainer_id, + const std::string& out_var_name, + const std::string& table_name) { + PADDLE_ENFORCE( + checkpoint_notify_id != -1, + "when checkpoint_notify_id = -1, there should be no RPC invoke."); + + // TODO(tangwei12): find out why scope will be error. + auto* lt_var = scope_->FindVar(LOOKUP_TABLE_PATH)->GetMutable(); + lt_var->clear(); + lt_var->append(out_var_name); + VLOG(4) << "RequestCheckpointHandler update var kLookupTablePath to: " + << out_var_name; + executor_->RunPreparedContext(checkpoint_prepared_ctx_.get(), scope_); + return true; +} + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/request_handler_impl.h b/paddle/fluid/operators/distributed/request_handler_impl.h new file mode 100644 index 00000000..f3c1b245 --- /dev/null +++ b/paddle/fluid/operators/distributed/request_handler_impl.h @@ -0,0 +1,131 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/distributed/request_handler.h" + +namespace paddle { +namespace operators { +namespace distributed { + +class RequestSendHandler final : public RequestHandler { + public: + explicit RequestSendHandler(bool sync_mode, bool enable_dc_asgd = false) + : RequestHandler(sync_mode) { + enable_dc_asgd_ = enable_dc_asgd; + } + virtual ~RequestSendHandler() {} + bool Handle(const std::string& varname, framework::Scope* scope, + framework::Variable* var, framework::Variable** outvar, + const int trainer_id, const std::string& out_var_name = "", + const std::string& table_name = "") override; + + private: + bool enable_dc_asgd_; +}; + +class RequestGetHandler final : public RequestHandler { + public: + explicit RequestGetHandler(bool sync_mode, bool enable_dc_asgd = false) + : RequestHandler(sync_mode) { + enable_dc_asgd_ = enable_dc_asgd; + } + virtual ~RequestGetHandler() {} + bool Handle(const std::string& varname, framework::Scope* scope, + framework::Variable* var, framework::Variable** outvar, + const int trainer_id, const std::string& out_var_name = "", + const std::string& table_name = "") override; + + private: + bool enable_dc_asgd_; +}; + +class RequestGetNoBarrierHandler final : public RequestHandler { + public: + RequestGetNoBarrierHandler() : RequestHandler(false) {} + virtual ~RequestGetNoBarrierHandler() {} + bool Handle(const std::string& varname, framework::Scope* scope, + framework::Variable* var, framework::Variable** outvar, + const int trainer_id, const std::string& out_var_name = "", + const std::string& table_name = "") override; +}; + +static inline void BuildVar(const std::string& param_name, + std::initializer_list arguments, + paddle::framework::proto::OpDesc::Var* var) { + var->set_parameter(param_name); + for (auto& arg_name : arguments) { + *var->mutable_arguments()->Add() = arg_name; + } +} + +class RequestPrefetchHandler final : public RequestHandler { + public: + explicit RequestPrefetchHandler(bool sync_mode) : RequestHandler(sync_mode) {} + virtual ~RequestPrefetchHandler() {} + bool Handle(const std::string& varname, framework::Scope* scope, + framework::Variable* var, framework::Variable** outvar, + const int trainer_id, const std::string& out_var_name = "", + const std::string& table_name = "") override; + + private: + std::unique_ptr BuildLookupTableOp( + const std::string& table_name, const std::string& id_name, + const std::string& out_name) { + paddle::framework::proto::OpDesc op_desc; + op_desc.set_type("lookup_table"); + BuildVar("W", {table_name.data()}, op_desc.add_inputs()); + BuildVar("Ids", {id_name.data()}, op_desc.add_inputs()); + BuildVar("Out", {out_name.data()}, op_desc.add_outputs()); + + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + return op; + } +}; + +class RequestCheckpointHandler final : public RequestHandler { + public: + explicit RequestCheckpointHandler(bool sync_mode, int checkpoint_notify_id) + : RequestHandler(sync_mode) { + this->checkpoint_notify_id = checkpoint_notify_id; + } + virtual ~RequestCheckpointHandler() {} + bool Handle(const std::string& varname, framework::Scope* scope, + framework::Variable* var, framework::Variable** outvar, + const int trainer_id, const std::string& out_var_name = "", + const std::string& table_name = "") override; + + private: + int checkpoint_notify_id; +}; + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_client.cc b/paddle/fluid/operators/distributed/rpc_client.cc new file mode 100644 index 00000000..390e9af0 --- /dev/null +++ b/paddle/fluid/operators/distributed/rpc_client.cc @@ -0,0 +1,31 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/distributed/rpc_client.h" +#include "gflags/gflags.h" + +// default to 3min to avoid temprary network failures. +DEFINE_int32(rpc_deadline, 180000, "deadline timeouts for rpc"); + +namespace paddle { +namespace operators { +namespace distributed { + +std::once_flag RPCClient::init_flag_; +std::unique_ptr RPCClient::rpc_client_(nullptr); +int RPCClient::trainer_id_ = 0; + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h new file mode 100644 index 00000000..d4be2c28 --- /dev/null +++ b/paddle/fluid/operators/distributed/rpc_client.h @@ -0,0 +1,121 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include // NOLINT +#include +#include +#include "gflags/gflags.h" + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/operators/distributed/request_handler.h" + +DECLARE_int32(rpc_deadline); + +namespace paddle { +namespace operators { +namespace distributed { + +class RPCClient { + public: + RPCClient() {} + virtual ~RPCClient() {} + virtual VarHandlePtr AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual VarHandlePtr AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + const std::string& out_varname, + const std::string& table_name = "", + int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual VarHandlePtr AsyncGetVarNoBarrier( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + const std::string& out_varname, + int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual VarHandlePtr AsyncGetMonomerVariable( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual VarHandlePtr AsyncPrefetchVar( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& in_var_name, + const std::string& out_var_name, const std::string& table_name = "", + int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual VarHandlePtr AsyncSendBatchBarrier( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual VarHandlePtr AsyncSendFetchBarrier( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual VarHandlePtr AsyncGetMonomerBarrier( + const std::string& ep, const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual VarHandlePtr AsyncCheckpointNotify( + const std::string& ep, const std::string& dir, + int64_t time_out = FLAGS_rpc_deadline) = 0; + + virtual VarHandlePtr AsyncSendComplete( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; + + // Complete tells all the pserver instances that finishe the training, + // the pserver can reduce it's barrier count, and continue to train + // with other trainers. + virtual void SendComplete() = 0; + + virtual bool Wait() = 0; + + template + static RPCClient* GetInstance(int trainer_id) { + std::call_once(init_flag_, &RPCClient::Init, trainer_id); + return rpc_client_.get(); + } + + // Init is called by GetInstance. + template + static void Init(int trainer_id) { + VLOG(0) << "init rpc client with trainer_id " << trainer_id; + trainer_id_ = trainer_id; + if (rpc_client_.get() == nullptr) { + rpc_client_.reset(new T()); + rpc_client_->InitImpl(); + } + } + + virtual void InitImpl() {} + + protected: + // each trainer have exact one trainer id, it should be static + static int trainer_id_; + + private: + static std::once_flag init_flag_; + static std::unique_ptr rpc_client_; +}; +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_common.h b/paddle/fluid/operators/distributed/rpc_common.h new file mode 100644 index 00000000..eb127bf4 --- /dev/null +++ b/paddle/fluid/operators/distributed/rpc_common.h @@ -0,0 +1,79 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +namespace paddle { +namespace operators { +namespace distributed { + +struct RpcContext { + RpcContext() = default; + + RpcContext(const std::string &name, const std::vector &names, + const std::vector &emap, + const std::vector §ions, int id) + : var_name(name), + splited_var_names(names), + epmap(emap), + height_sections(sections), + trainer_id(id) {} + + RpcContext(const RpcContext &ctx) { + var_name = ctx.var_name; + splited_var_names = ctx.splited_var_names; + epmap = ctx.epmap; + height_sections = ctx.height_sections; + trainer_id = ctx.trainer_id; + } + + std::string var_name; + std::vector splited_var_names; + std::vector epmap; + std::vector height_sections; + int trainer_id; +}; + +inline std::ostream &operator<<(std::ostream &os, const RpcContext &rpc_ctx) { + os << "{"; + os << "var_name: " << rpc_ctx.var_name << "\n"; + + os << "splited_var_names: ["; + for (auto &name : rpc_ctx.splited_var_names) { + os << name << ", "; + } + os << "]\n"; + + os << "epmap: ["; + for (auto &ep : rpc_ctx.epmap) { + os << ep << ", "; + } + os << "]\n"; + + os << "height_sections: ["; + for (auto §ion : rpc_ctx.height_sections) { + os << section << ", "; + } + os << "]\n"; + os << "}"; + return os; +} + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc new file mode 100644 index 00000000..c3a46e34 --- /dev/null +++ b/paddle/fluid/operators/distributed/rpc_server.cc @@ -0,0 +1,234 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/distributed/rpc_server.h" + +#include +#include +#include +#include +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace operators { +namespace distributed { + +void RPCServer::ShutDown() { + LOG(INFO) << "RPCServer ShutDown "; + ShutDownImpl(); + + exit_flag_ = true; + barrier_cond_.notify_all(); + rpc_cond_.notify_all(); +} + +void RPCServer::SavePort() const { + auto file_path = string::Sprintf("/tmp/paddle.%d.port", ::getpid()); + std::ofstream port_file; + port_file.open(file_path); + port_file << selected_port_; + port_file.close(); + VLOG(3) << "selected port written to " << file_path; +} + +void RPCServer::WaitBarrier(const std::string& rpc_name) { + VLOG(3) << "WaitBarrier in: " << rpc_name; + std::unique_lock lock(this->mutex_); + barrier_cond_.wait(lock, [this, &rpc_name] { + return ((barrier_counter_[rpc_name] == client_num_ && client_num_ != 0) || + exit_flag_.load()); + }); + + VLOG(3) << "WaitBarrier out: " << rpc_name + << " counter: " << barrier_counter_[rpc_name]; +} + +void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) { + VLOG(3) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; + // barrier msg should make sure that it's in the right cond(send|recv) + WaitCond(rpc_name); + int b = 0; + std::unique_lock lock(mutex_); + b = ++barrier_counter_[rpc_name]; + VLOG(3) << rpc_name << " barrier_counter: " << b; + if (b >= client_num_) { + lock.unlock(); + VLOG(3) << "BatchBarrier counter reach " << client_num_ << " for " + << rpc_name; + barrier_cond_.notify_all(); + lock.lock(); + } +} + +void RPCServer::Complete() { + { + std::unique_lock lock(mutex_); + client_num_--; + need_reset_all_vars_ = true; + + VLOG(3) << "decrease client_num to: " << client_num_; + if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) { + barrier_counter_[kRequestGet]--; + } + } + barrier_cond_.notify_all(); +} + +bool RPCServer::NeedResetAllVars() { + std::unique_lock lock(mutex_); + return need_reset_all_vars_; +} + +int RPCServer::GetClientNum() { + std::unique_lock lock(mutex_); + return client_num_; +} + +void RPCServer::ResetBarrierCounter() { + VLOG(3) << "RPCServer ResetBarrierCounter "; + std::unique_lock lock(mutex_); + for (auto& t : barrier_counter_) { + t.second = 0; + } + need_reset_all_vars_ = false; +} + +void RPCServer::RegisterRPC(const std::string& rpc_name, + RequestHandler* handler, int thread_num) { + rpc_call_map_[rpc_name] = handler; + rpc_thread_num_[rpc_name] = thread_num; + + static int cond = -1; + rpc_cond_map_[rpc_name] = ++cond; + VLOG(3) << "RegisterRPC rpc_name: " << rpc_name << ", handler: " << handler + << ", cond: " << rpc_cond_map_[rpc_name]; +} + +void RPCServer::SetCond(const std::string& rpc_name) { + VLOG(3) << "RPCServer SetCond " << rpc_name; + { + std::unique_lock lock(mutex_); + cur_cond_ = rpc_cond_map_[rpc_name]; + } + + rpc_cond_.notify_all(); +} + +void RPCServer::WaitCond(const std::string& rpc_name) { + VLOG(3) << "RPCServer WaitCond in " << rpc_name; + int cond = 0; + { + std::unique_lock lock(mutex_); + cond = rpc_cond_map_[rpc_name]; + } + + std::unique_lock lock(mutex_); + rpc_cond_.wait( + lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); }); + VLOG(3) << "RPCServer WaitCond out " << rpc_name; +} + +void RPCServer::RegisterVar(const std::string& var_name, + const std::string& rpc_name, + framework::Scope* scope, + platform::DeviceContext* dev_ctx) { + MonomerHandle h; + h.var_name_ = var_name; + h.rpc_name_ = rpc_name; + h.scope_ = scope; + h.dev_ctx_ = dev_ctx; + + { + std::unique_lock lock(mutex_); + if (var_map_.find(var_name) != var_map_.end()) { + PADDLE_ENFORCE(false, "%s alreay in var_map", var_name); + } + var_map_[var_name] = h; + } + + rpc_cond_.notify_all(); + VLOG(3) << "RegisterVar context:" << h.String(); +} + +void RPCServer::IncreaseVarBarrier(const std::string& var_name) { + int b = 0; + MonomerHandle h; + { + std::unique_lock lock(mutex_); + b = ++var_map_[var_name].barrier_; + h = var_map_[var_name]; + } + + if (b >= client_num_) { + barrier_cond_.notify_all(); + } + + VLOG(3) << "IncreaseVarBarrier context:" << h.String(); +} + +void RPCServer::WaitVarBarrier(const std::string& var_name) { + VLOG(3) << "WaitVarBarrier var_name:" << var_name; + + std::unique_lock lock(mutex_); + barrier_cond_.wait(lock, [&]() { + return ((var_map_[var_name].barrier_ >= client_num_ && client_num_ != 0) || + exit_flag_.load()); + }); + + VLOG(3) << "WaitVarBarrier context: " << var_map_[var_name].String(); +} + +void RPCServer::SetVarCond(const std::string& var_name) { + VLOG(3) << "SetVarCond var_name:" << var_name; + { + std::unique_lock lock(mutex_); + if (var_map_.find(var_name) != var_map_.end()) { + rpc_cond_.notify_all(); + } + } +} + +void RPCServer::WaitVarCond(const std::string& var_name) { + VLOG(3) << "WaitVarCond var_name:" << var_name; + + std::unique_lock lock(mutex_); + rpc_cond_.wait(lock, [=] { + return (var_map_.find(var_name) != var_map_.end() || exit_flag_.load()); + }); + + VLOG(3) << "WaitVarCond var_name:" << var_name << " end"; +} + +MonomerHandle RPCServer::GetMonomer(const std::string& var_name) { + MonomerHandle h; + { + std::unique_lock lock(mutex_); + h = var_map_[var_name]; + } + + return h; +} + +void RPCServer::ClearRegisteredVars() { + std::unique_lock lock(mutex_); + var_map_.clear(); +} + +void RPCServer::ClearVar(const std::string& var_name) { + std::unique_lock lock(mutex_); + var_map_.erase(var_name); +} +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h new file mode 100644 index 00000000..8c7b7f1d --- /dev/null +++ b/paddle/fluid/operators/distributed/rpc_server.h @@ -0,0 +1,137 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include // NOLINT +#include +#include + +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/operators/distributed/request_handler.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace distributed { + +struct MonomerHandle { + std::string var_name_; + std::string rpc_name_; + framework::Scope* scope_{nullptr}; + platform::DeviceContext* dev_ctx_{nullptr}; + int64_t barrier_{0}; + + std::string String() { + std::stringstream ss; + ss << "var_name:" << var_name_ << ", rpc_name:" << rpc_name_ + << ", scope:" << scope_ << ", dev_ctx:" << dev_ctx_ + << ", barrier_:" << barrier_; + return ss.str(); + } +}; + +class RPCServer { + public: + explicit RPCServer(const std::string& address, int client_num) + : cur_cond_(0), + bind_address_(address), + exit_flag_(false), + selected_port_(0), + client_num_(client_num), + need_reset_all_vars_(false) {} + + virtual ~RPCServer() {} + virtual void StartServer() = 0; + virtual void WaitServerReady() = 0; + + void ShutDown(); + + bool IsExit() { return exit_flag_.load(); } + + int GetSelectedPort() const { return selected_port_; } + + int GetClientNum(); + + void SavePort() const; + + // RegisterRPC, register the rpc method name to a handler + // class, and auto generate a condition id for this call + // to be used for the barrier. + void RegisterRPC(const std::string& rpc_name, RequestHandler* handler, + int thread_num = 5); + + int GetThreadNum(const std::string& rpc_name) { + return rpc_thread_num_[rpc_name]; + } + + // Wait util all the clients have reached the barrier for one + // rpc method. This function should be called in the + // RequestHandler if you want to run the server/client in a + // synchronous mode. + void WaitBarrier(const std::string& rpc_name); + + void SetCond(const std::string& rpc_name); + void WaitCond(const std::string& rpc_name); + void IncreaseBatchBarrier(const std::string rpc_name); + + void RegisterVar(const std::string& var_name, const std::string& rpc_name, + framework::Scope* scope, platform::DeviceContext* dev_ctx); + void IncreaseVarBarrier(const std::string& var_name); + void WaitVarBarrier(const std::string& var_name); + void SetVarCond(const std::string& var_name); + void WaitVarCond(const std::string& var_name); + void ClearRegisteredVars(); + void ClearVar(const std::string& var_name); + MonomerHandle GetMonomer(const std::string& var_name); + + void Complete(); + + void ResetBarrierCounter(); + + bool NeedResetAllVars(); + + protected: + virtual void ShutDownImpl() = 0; + + private: + std::mutex mutex_; + std::unordered_map barrier_counter_; + std::condition_variable barrier_cond_; + + std::unordered_map rpc_cond_map_; + std::atomic cur_cond_; + std::condition_variable rpc_cond_; + + protected: + std::string bind_address_; + std::atomic exit_flag_; + int selected_port_; + int client_num_; + bool need_reset_all_vars_; + + std::unordered_map rpc_call_map_; + std::unordered_map rpc_thread_num_; + friend class RequestHandler; + + // TODO(gongwb): use more cond to notify or wait; + std::unordered_map var_map_; +}; + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_server_test.cc b/paddle/fluid/operators/distributed/rpc_server_test.cc new file mode 100644 index 00000000..089ea623 --- /dev/null +++ b/paddle/fluid/operators/distributed/rpc_server_test.cc @@ -0,0 +1,183 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include // NOLINT + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +#include "paddle/fluid/operators/distributed/distributed.h" +#include "paddle/fluid/operators/distributed/request_handler_impl.h" +#include "paddle/fluid/operators/distributed/rpc_client.h" +#include "paddle/fluid/operators/distributed/rpc_server.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace distributed = paddle::operators::distributed; + +USE_NO_KERNEL_OP(lookup_sparse_table); + +std::unique_ptr g_rpc_service; +std::unique_ptr g_req_handler; + +framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) { + auto root_block = program->MutableBlock(0); + auto* block = program->AppendBlock(*root_block); + + framework::VariableNameMap input({{"W", {"w"}}, {"Ids", {"ids"}}}); + framework::VariableNameMap output({{"Output", {"out"}}}); + auto op = block->AppendOp(); + op->SetType("lookup_sparse_table"); + op->SetInput("W", {"w"}); + op->SetInput("Ids", {"ids"}); + op->SetOutput("Out", {"out"}); + + auto& out = *root_block->Var("out"); + out.SetType(framework::proto::VarType::LOD_TENSOR); + out.SetShape({10, 10}); + + return block; +} + +void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) { + auto w_var = scope->Var("w"); + w_var->GetMutable(); + + auto out_var = scope->Var("out"); + out_var->GetMutable(); + + auto ids_var = scope->Var("ids"); + ids_var->GetMutable(); +} + +void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place, + int64_t rows_numel) { + CreateVarsOnScope(scope, place); + auto ids_var = scope->Var("ids")->GetMutable(); + int64_t* ids_ptr = + ids_var->mutable_data(framework::DDim({rows_numel, 1}), *place); + for (int64_t i = 0; i < rows_numel; ++i) ids_ptr[i] = i * 2; +} + +void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, + int64_t rows_numel) { + CreateVarsOnScope(scope, place); + auto w = scope->Var("w")->GetMutable(); + auto w_value = w->mutable_value(); + w_value->Resize({rows_numel, 10}); + for (int64_t i = 0; i < rows_numel; ++i) w->AutoGrownIndex(i, true); + + auto ptr = w_value->mutable_data(*place); + + for (int64_t i = 0; i < w_value->numel(); ++i) { + ptr[i] = static_cast(i / 10); + } +} + +void StartServer(const std::string& rpc_name) { + framework::ProgramDesc program; + framework::Scope scope; + platform::CPUPlace place; + framework::Executor exe(place); + platform::CPUDeviceContext ctx(place); + auto* block = AppendPrefetchBlcok(&program); + std::string in_var_name("ids"); + std::vector prefetch_block_ids{block->ID()}; + auto prepared = exe.Prepare(program, prefetch_block_ids); + InitTensorsOnServer(&scope, &place, 10); + + std::unordered_map> + prefetch_var_name_to_prepared; + prefetch_var_name_to_prepared[in_var_name] = prepared[0]; + + g_req_handler->SetProgram(&program); + g_req_handler->SetPrefetchPreparedCtx(&prefetch_var_name_to_prepared); + g_req_handler->SetDevCtx(&ctx); + g_req_handler->SetScope(&scope); + g_req_handler->SetExecutor(&exe); + + g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get()); + g_req_handler->SetRPCServer(g_rpc_service.get()); + + std::thread server_thread( + std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get())); + + server_thread.join(); +} + +TEST(PREFETCH, CPU) { + g_req_handler.reset(new distributed::RequestPrefetchHandler(true)); + g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1)); + distributed::RPCClient* client = + distributed::RPCClient::GetInstance(0); + + std::thread server_thread(StartServer, distributed::kRequestPrefetch); + g_rpc_service->WaitServerReady(); + + int port = g_rpc_service->GetSelectedPort(); + std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port); + + framework::Scope scope; + platform::CPUPlace place; + platform::CPUDeviceContext ctx(place); + { + // create var on local scope + int64_t rows_numel = 5; + InitTensorsOnClient(&scope, &place, rows_numel); + std::string in_var_name("ids"); + std::string out_var_name("out"); + + client->AsyncPrefetchVar(ep, ctx, scope, in_var_name, out_var_name); + client->Wait(); + auto var = scope.Var(out_var_name); + auto value = var->GetMutable(); + auto ptr = value->mutable_data(place); + + for (int64_t i = 0; i < rows_numel; ++i) { + EXPECT_EQ(ptr[0 + i * value->dims()[1]], static_cast(i * 2)); + } + } + + g_rpc_service->ShutDown(); + server_thread.join(); + LOG(INFO) << "begin reset"; + g_rpc_service.reset(nullptr); + g_req_handler.reset(nullptr); +} + +TEST(COMPLETE, CPU) { + g_req_handler.reset(new distributed::RequestSendHandler(true)); + g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 2)); + distributed::RPCClient* client = + distributed::RPCClient::GetInstance(0); + PADDLE_ENFORCE(client != nullptr); + std::thread server_thread(StartServer, distributed::kRequestSend); + g_rpc_service->WaitServerReady(); + int port = g_rpc_service->GetSelectedPort(); + std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port); + client->AsyncSendComplete(ep); + client->Wait(); + + EXPECT_EQ(g_rpc_service->GetClientNum(), 1); + + g_rpc_service->ShutDown(); + server_thread.join(); + g_rpc_service.reset(nullptr); + g_req_handler.reset(nullptr); +} diff --git a/paddle/fluid/operators/distributed/send_recv.proto.in b/paddle/fluid/operators/distributed/send_recv.proto.in new file mode 100644 index 00000000..63036678 --- /dev/null +++ b/paddle/fluid/operators/distributed/send_recv.proto.in @@ -0,0 +1,87 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under +the Apache License, Version 2.0 (the "License"); you may not use this file +except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +syntax = "proto3"; +package sendrecv; + +option cc_generic_services = @cc_generic_services@; + +service SendRecvService { + // For parameter server round-robin like hashing, do not split tensors. + // Send and recv only one tensor + // TODO(typhoonzero): add streaming API + rpc SendVariable(VariableMessage) returns (VoidMessage) {} + // Argument VariableMessage for GetVariable should only contain varname. + rpc GetVariable(VariableMessage) returns (VariableMessage) {} + rpc GetVariableNoBarrier(VariableMessage) returns (VariableMessage) {} + // pre-fetch variable by given variable name and Ids + rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {} + + rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {} + + rpc GetMonomerVariable(VariableMessage) returns (VariableMessage) {} + rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {} +} + +// It can be: LoDTensor、SelectedRows or NCCL_ID +enum VarType { + LOD_TENSOR = 0; + SELECTED_ROWS = 1; + NCCL_ID = 2; +} + +// VariableMessage is serialized paddle variable message. +// NOTICE(gongwb):don't modify this proto if you are not +// not familar with how we serialize in sendrecvop_utils.h +// and deserilize it in variable_response.h. +message VariableMessage { + enum Type { + // Pod Types + BOOL = 0; + INT16 = 1; + INT32 = 2; + INT64 = 3; + FP16 = 4; + FP32 = 5; + FP64 = 6; + } + + message LodData { repeated int64 lod_data = 1; } + string varname = 1; + // TODO(Yancey1989): reference framework::proto::VarDesc::VarType + VarType type = 2; + // bool persistable is not needed for sending. + // tensor info: + Type data_type = 3; + repeated int64 dims = 4; + + // lod details: + int64 lod_level = 5; + repeated LodData lod = 6; + // selected_rows height, aka. original dim0 + int64 slr_height = 7; + // tensor data + bytes serialized = 8; + // selected_rows data + bytes rows = 9; + // Look up table block execution output variable name. + string out_varname = 10; + // If 1, the ps server will start profiling, the ps + // server stops profiling and generates a profile to /tmp/profile_ps_* + // when profile switches from 1 to 2. + int64 profile = 11; + int64 trainer_id = 12; + string table_name = 13; +} + +message VoidMessage {} diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc new file mode 100644 index 00000000..9bd2c992 --- /dev/null +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc @@ -0,0 +1,109 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_CUDA +#include +#endif +#include +#include // NOLINT + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/variable_response.h" +#include "paddle/fluid/platform/port.h" + +DEFINE_bool(rpc_disable_reuse_port, false, "Disable SO_REUSEPORT or not."); + +namespace paddle { +namespace operators { +namespace distributed { + +using VarMsg = sendrecv::VariableMessage; + +static TensorPayload GetCommunicationAllocationFromTensor( + const platform::DeviceContext& ctx, const framework::Tensor& tensor) { + if (is_gpu_place(ctx.GetPlace())) { +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE(is_gpu_place(tensor.place())); + auto& gpu_dev_ctx = + reinterpret_cast(ctx); + auto copy_size = tensor.numel() * framework::SizeOfType(tensor.type()); + platform::CUDAPinnedPlace cuda_pinned; + auto result = memory::AllocShared(cuda_pinned, copy_size); + + memory::Copy(cuda_pinned, result->ptr(), + boost::get(tensor.place()), + tensor.data(), copy_size, gpu_dev_ctx.stream()); + ctx.Wait(); + return TensorPayload(result); +#else + PADDLE_THROW("This situation should not be happened"); +#endif + } else { + return TensorPayload(tensor); + } +} +TensorPayload GetTensorPayload(framework::Variable* var, + const platform::DeviceContext& ctx, + VarMsg* request) { + auto tensor = var->Get(); + // FIXME(wuyi): data types in send_recv.proto is copied from + // framework.proto + request->set_data_type(static_cast(tensor.type())); + for (auto& dim : framework::vectorize(tensor.dims())) { + request->add_dims(dim); + } + const framework::LoD lod = tensor.lod(); + if (lod.size() > 0) { + request->set_lod_level(lod.size()); + for (auto& each : lod) { + VarMsg::LodData* lod_inner = request->add_lod(); + for (auto& d : each) { + lod_inner->add_lod_data(d); + } + } + } + return GetCommunicationAllocationFromTensor(ctx, tensor); +} + +TensorPayload GetSelectedRowsPayload(framework::Variable* var, + const platform::DeviceContext& ctx, + VarMsg* request) { + auto* slr = var->GetMutable(); + request->set_data_type(static_cast(slr->value().type())); + request->set_lod_level(0); + request->set_slr_height(slr->height()); + + for (auto& dim : framework::vectorize(slr->value().dims())) { + request->add_dims(dim); + } + + auto* tensor = slr->mutable_value(); + return GetCommunicationAllocationFromTensor(ctx, *tensor); +} + +TensorPayload::TensorPayload(std::shared_ptr allocation) + : allocation_(allocation), offset_(0), memory_size_(allocation->size()) {} +TensorPayload::TensorPayload(const framework::Tensor& tensor) + : allocation_(tensor.Holder()), + offset_(tensor.offset()), + memory_size_(tensor.numel() * framework::SizeOfType(tensor.type())) {} +void* TensorPayload::ptr() const { + return reinterpret_cast( + reinterpret_cast(allocation_->ptr()) + offset_); +} +size_t TensorPayload::memory_size() const { return memory_size_; } +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h new file mode 100644 index 00000000..5457101a --- /dev/null +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/distributed/distributed_pb.h" +#include "paddle/fluid/platform/port.h" + +namespace paddle { +namespace operators { +namespace distributed { + +using VarMsg = sendrecv::VariableMessage; + +class TensorPayload final { + public: + explicit TensorPayload(const framework::Tensor& tensor); + explicit TensorPayload(std::shared_ptr allocation); + + TensorPayload(const TensorPayload& o) = default; + TensorPayload& operator=(const TensorPayload& o) = default; + + void* ptr() const; + size_t memory_size() const; + + private: + std::shared_ptr allocation_; + size_t offset_; + size_t memory_size_; +}; + +inline void SerializeDestroyCallback(void* payload) { + if (payload != nullptr) { + auto* shared_payload = reinterpret_cast(payload); + delete shared_payload; + } +} + +TensorPayload GetTensorPayload(framework::Variable* var, + const platform::DeviceContext& ctx, + VarMsg* request); + +TensorPayload GetSelectedRowsPayload(framework::Variable* var, + const platform::DeviceContext& ctx, + VarMsg* request); + +inline framework::proto::VarType::Type ToVarType( + sendrecv::VariableMessage::Type type) { + switch (type) { + case sendrecv::VariableMessage::FP32: + return framework::proto::VarType::FP32; // NOLINT + case sendrecv::VariableMessage::FP64: + return framework::proto::VarType::FP64; // NOLINT + case sendrecv::VariableMessage::INT32: + return framework::proto::VarType::INT32; // NOLINT + case sendrecv::VariableMessage::INT64: + return framework::proto::VarType::INT64; // NOLINT + case sendrecv::VariableMessage::BOOL: + return framework::proto::VarType::BOOL; // NOLINT + default: + PADDLE_THROW("Not support type %d", type); + } +} + +template