From 73fa5ef3922ddf69c0e434d44bea91ff93ab7ecb Mon Sep 17 00:00:00 2001 From: xiexionghang Date: Mon, 11 Nov 2019 13:46:23 +0800 Subject: [PATCH] fork from paddlev1.4, branch:paddle_feed_news_201910 --- CMakeLists.txt | 37 +- Dockerfile | 9 +- README.md | 33 +- README_cn.md | 29 +- cmake/configure.cmake | 26 +- cmake/copyfile.py | 44 + cmake/cuda.cmake | 4 - cmake/external/box_ps.cmake | 68 + cmake/external/brpc.cmake | 4 +- cmake/external/dgc.cmake | 10 +- cmake/external/eigen.cmake | 9 - cmake/external/gflags.cmake | 5 + cmake/external/gtest.cmake | 13 +- cmake/external/leveldb.cmake | 2 - cmake/external/mklml.cmake | 2 +- cmake/external/ngraph.cmake | 3 +- cmake/external/openblas.cmake | 36 +- cmake/external/protobuf.cmake | 1 + cmake/external/snappy.cmake | 65 - cmake/external/snappystream.cmake | 63 - cmake/external/yaml-cpp.cmake | 59 - cmake/flags.cmake | 31 +- cmake/generic.cmake | 6 +- cmake/inference_lib.cmake | 337 +- cmake/operators.cmake | 5 +- cmake/package.cmake | 21 - cmake/tensorrt.cmake | 18 +- paddle/fluid/API.spec | 329 +- paddle/fluid/CMakeLists.txt | 1 - paddle/fluid/framework/CMakeLists.txt | 18 +- paddle/fluid/framework/archive.h | 34 +- paddle/fluid/framework/channel.h | 10 +- paddle/fluid/framework/commit.h | 21 - paddle/fluid/framework/data_feed.cc | 114 +- paddle/fluid/framework/data_feed.h | 55 + .../fluid/framework/data_layout_transform.cc | 30 +- .../fluid/framework/data_layout_transform.h | 21 +- paddle/fluid/framework/data_set.cc | 376 +- paddle/fluid/framework/data_set.h | 50 +- paddle/fluid/framework/ddim.cc | 16 - paddle/fluid/framework/ddim.h | 9 +- paddle/fluid/framework/details/CMakeLists.txt | 25 +- .../framework/details/all_reduce_op_handle.cc | 205 +- .../framework/details/all_reduce_op_handle.h | 10 +- .../framework/details/broadcast_op_handle.cc | 7 +- .../fluid/framework/details/build_strategy.cc | 405 +- .../fluid/framework/details/build_strategy.h | 20 +- .../details/eager_deletion_op_handle.cc | 3 +- .../framework/details/execution_strategy.h | 2 +- .../fast_threaded_ssa_graph_executor.cc | 16 +- .../framework/details/fetch_op_handle.cc | 11 +- .../details/fused_all_reduce_op_handle.cc | 129 +- .../details/fused_all_reduce_op_handle.h | 23 +- .../framework/details/multi_devices_helper.h | 5 + .../fluid/framework/details/op_handle_base.h | 2 + .../framework/details/reduce_and_gather.h | 85 +- .../framework/details/reduce_op_handle.cc | 20 +- .../details/scale_loss_grad_op_handle.cc | 8 +- .../details/scope_buffered_monitor.cc | 202 + .../details/scope_buffered_monitor.h | 49 + .../scope_buffered_ssa_graph_executor.cc | 60 +- .../scope_buffered_ssa_graph_executor.h | 13 +- .../details/share_tensor_buffer_functor.cc | 126 + .../details/share_tensor_buffer_functor.h | 73 + .../details/share_tensor_buffer_op_handle.cc | 107 +- .../details/share_tensor_buffer_op_handle.h | 41 +- .../details/sparse_all_reduce_op_handle.cc | 8 +- paddle/fluid/framework/details/var_handle.h | 8 + paddle/fluid/framework/device_worker.h | 32 +- paddle/fluid/framework/dim.h | 1 - paddle/fluid/framework/dist_multi_trainer.cc | 121 +- paddle/fluid/framework/downpour_worker.cc | 273 +- paddle/fluid/framework/executor.cc | 33 +- paddle/fluid/framework/executor.h | 3 +- paddle/fluid/framework/executor_gc_helper.cc | 14 +- paddle/fluid/framework/executor_gc_helper.h | 12 +- paddle/fluid/framework/fleet/CMakeLists.txt | 5 + paddle/fluid/framework/fleet/box_wrapper.cc | 247 + paddle/fluid/framework/fleet/box_wrapper.h | 126 + paddle/fluid/framework/fleet/fleet_wrapper.cc | 154 +- 
paddle/fluid/framework/fleet/fleet_wrapper.h | 36 +- paddle/fluid/framework/garbage_collector.cc | 22 +- paddle/fluid/framework/garbage_collector.h | 4 +- paddle/fluid/framework/hogwild_worker.cc | 54 +- paddle/fluid/framework/inplace_op_inference.h | 15 +- .../framework/inplace_op_inference_test.cc | 328 -- paddle/fluid/framework/io/fs.cc | 21 +- paddle/fluid/framework/io/fs.h | 4 - paddle/fluid/framework/io/shell.cc | 14 +- paddle/fluid/framework/ir/CMakeLists.txt | 59 +- .../framework/ir/coalesce_grad_tensor_pass.cc | 56 +- paddle/fluid/framework/ir/codegen.cc | 96 + paddle/fluid/framework/ir/codegen.h | 36 + paddle/fluid/framework/ir/codegen_helper.cc | 61 + paddle/fluid/framework/ir/codegen_helper.h | 70 + paddle/fluid/framework/ir/codegen_test.cc | 43 + .../ir/conv_elementwise_add2_act_fuse.cc | 104 - .../ir/cudnn_placement_pass.cc} | 5 +- .../fluid/framework/ir/cudnn_placement_pass.h | 41 + .../ir/cudnn_placement_pass_tester.cc | 119 + .../ir/embedding_fc_lstm_fuse_pass.cc | 6 +- .../ir/fc_elementwise_layernorm_fuse_pass.cc | 259 + .../fc_elementwise_layernorm_fuse_pass.h} | 28 +- ..._elementwise_layernorm_fuse_pass_tester.cc | 67 + paddle/fluid/framework/ir/fc_fuse_pass.cc | 105 +- paddle/fluid/framework/ir/fc_fuse_pass.h | 4 +- .../fluid/framework/ir/fc_fuse_pass_tester.cc | 102 +- paddle/fluid/framework/ir/fc_gru_fuse_pass.cc | 4 +- .../fluid/framework/ir/fc_lstm_fuse_pass.cc | 5 +- .../fuse_adam_op_pass.cc | 102 +- .../fuse_momentum_op_pass.cc | 12 +- .../fuse_optimizer_op_pass.cc | 191 +- .../fuse_optimizer_op_pass.h | 10 +- .../fuse_sgd_op_pass.cc | 12 +- paddle/fluid/framework/ir/graph.h | 39 +- .../framework/ir/graph_pattern_detector.cc | 126 +- .../framework/ir/graph_pattern_detector.h | 80 +- ...ces_graph_print_pass.h => graph_printer.h} | 2 +- paddle/fluid/framework/ir/graph_test.cc | 46 + paddle/fluid/framework/ir/graph_viz_pass.cc | 18 +- .../framework/ir/infer_clean_graph_pass.cc | 67 - .../ir/memory_optimize_pass/CMakeLists.txt | 16 +- ...uffer_shared_cross_op_memory_reuse_pass.cc | 422 ++ .../buffer_shared_inplace_op_pass.cc | 6 +- ...onditional_block_op_eager_deletion_pass.cc | 61 + .../eager_deletion_pass.cc | 8 +- .../memory_optimize_pass/inplace_op_pass.cc | 487 -- .../memory_optimization_var_info.h | 48 +- .../memory_optimize_helper.cc | 569 -- .../memory_optimize_helper.h | 189 - .../memory_optimize_helper_test.cc | 525 -- .../memory_optimize_pass.cc | 224 - .../memory_optimize_pass.h | 72 - .../memory_optimize_pass/memory_reuse_pass.cc | 222 +- .../memory_optimize_pass/memory_reuse_pass.h | 29 +- .../ir/memory_optimize_pass/op_graph_view.cc | 18 + .../ir/memory_optimize_pass/op_graph_view.h | 57 + .../record_skip_memory_opt_vars_pass.cc | 170 - .../recurrent_op_eager_deletion_pass.cc | 3 +- .../reference_count_pass.cc | 21 +- .../reference_count_pass_helper.h | 24 +- .../while_op_eager_deletion_pass.cc | 2 +- .../conv_activation_mkldnn_fuse_pass.cc | 97 + ...s.h => conv_activation_mkldnn_fuse_pass.h} | 26 +- ...onv_activation_mkldnn_fuse_pass_tester.cc} | 78 +- .../ir/mkldnn/conv_brelu_mkldnn_fuse_pass.cc | 71 - .../conv_brelu_mkldnn_fuse_pass_tester.cc | 135 - .../conv_concat_relu_mkldnn_fuse_pass.cc | 2 +- ...onv_concat_relu_mkldnn_fuse_pass_tester.cc | 7 +- .../conv_elementwise_add_mkldnn_fuse_pass.cc | 6 +- .../conv_elementwise_add_mkldnn_fuse_pass.h | 5 + .../ir/mkldnn/conv_relu_mkldnn_fuse_pass.cc | 76 - .../framework/ir/mkldnn/cpu_quantize_pass.cc | 8 + .../ir/mkldnn/cpu_quantize_squash_pass.cc | 86 +- .../ir/mkldnn/cpu_quantize_squash_pass.h | 15 +- 
.../mkldnn/cpu_quantize_squash_pass_tester.cc | 320 +- .../ir/mkldnn/mkldnn_placement_pass.cc | 33 - .../ir/mkldnn/mkldnn_placement_pass.h | 17 +- .../all_reduce_deps_pass.cc | 2 +- .../fuse_all_reduce_op_pass.cc | 21 +- .../multi_devices_graph_pass.cc | 10 +- .../multi_devices_graph_pass.h | 10 +- .../multi_devices_graph_print_pass.cc | 9 +- .../sequential_execution_pass.cc | 1 - .../framework/ir/ngraph_subgraph_pass.cc | 116 +- paddle/fluid/framework/ir/node.h | 6 +- paddle/fluid/framework/ir/pass.cc | 8 +- paddle/fluid/framework/ir/pass.h | 8 + paddle/fluid/framework/ir/pass_builder.cc | 3 + paddle/fluid/framework/ir/pass_test.cc | 2 +- .../fluid/framework/ir/pass_tester_helper.h | 338 ++ .../fluid/framework/ir/placement_pass_base.cc | 69 + .../fluid/framework/ir/placement_pass_base.h | 42 + .../ir/repeated_fc_relu_fuse_pass.cc | 297 +- .../ir/repeated_fc_relu_fuse_pass_tester.cc | 71 + .../ir/seqpool_cvm_concat_fuse_pass.cc | 153 + .../ir/seqpool_cvm_concat_fuse_pass.h | 54 + .../ir/seqpool_cvm_concat_fuse_pass_tester.cc | 239 + .../ir/simplify_with_basic_ops_pass.cc | 202 + .../ir/simplify_with_basic_ops_pass.h | 42 + .../ir/simplify_with_basic_ops_pass_tester.cc | 77 + .../framework/ir/sync_batch_norm_pass.cc | 2 +- paddle/fluid/framework/lod_tensor.cc | 76 +- paddle/fluid/framework/lod_tensor.h | 14 - paddle/fluid/framework/lod_tensor_test.cc | 80 +- paddle/fluid/framework/lod_tensor_test.cu | 1 - paddle/fluid/framework/multi_trainer.cc | 6 + .../framework/no_need_buffer_vars_inference.h | 19 +- paddle/fluid/framework/op_call_stack.cc | 47 + .../framework/{revision.h => op_call_stack.h} | 13 +- paddle/fluid/framework/op_compatible_info.cc | 104 + paddle/fluid/framework/op_compatible_info.h | 69 + .../framework/op_compatible_info_test.cc | 58 + paddle/fluid/framework/op_desc.cc | 51 +- paddle/fluid/framework/op_desc.h | 9 + paddle/fluid/framework/op_info.h | 27 +- paddle/fluid/framework/op_registry.h | 9 +- paddle/fluid/framework/operator.cc | 31 +- paddle/fluid/framework/operator.h | 12 +- .../fluid/framework/operator_kernel_configs.h | 2 + paddle/fluid/framework/parallel_executor.cc | 147 +- paddle/fluid/framework/parallel_executor.h | 4 +- paddle/fluid/framework/pipeline_trainer.cc | 1 + paddle/fluid/framework/prune.cc | 229 +- paddle/fluid/framework/prune.h | 11 +- paddle/fluid/framework/prune_test.cc | 64 +- paddle/fluid/framework/scope.h | 3 + paddle/fluid/framework/tensor.cc | 10 +- paddle/fluid/framework/tensor_test.cc | 17 +- paddle/fluid/framework/tensor_util.cc | 10 +- paddle/fluid/framework/tensor_util.h | 3 +- paddle/fluid/framework/threadpool.cc | 5 +- paddle/fluid/framework/trainer.h | 15 + paddle/fluid/framework/trainer_desc.proto | 17 + .../fluid/framework/transfer_scope_cache.cc | 49 - paddle/fluid/imperative/CMakeLists.txt | 13 +- paddle/fluid/imperative/backward_strategy.h | 7 +- paddle/fluid/imperative/engine.cc | 241 +- paddle/fluid/imperative/engine.h | 82 +- .../fluid/imperative/gradient_accumulator.cc | 175 + .../fluid/imperative/gradient_accumulator.h | 63 + paddle/fluid/imperative/layer.cc | 555 +- paddle/fluid/imperative/layer.h | 570 +- paddle/fluid/imperative/nccl_context.cc | 9 +- paddle/fluid/imperative/prepared_operator.cc | 123 + paddle/fluid/imperative/prepared_operator.h | 62 + paddle/fluid/imperative/tests/CMakeLists.txt | 5 + .../{ => tests}/nccl_context_test.cc | 0 .../tests/test_gradient_accmulator.cc | 121 + paddle/fluid/imperative/tests/test_layer.cc | 154 + .../fluid/imperative/tests/test_prepare_op.cc | 216 + 
paddle/fluid/imperative/tests/test_tracer.cc | 193 + paddle/fluid/imperative/tracer.cc | 436 +- paddle/fluid/imperative/tracer.h | 52 +- paddle/fluid/imperative/type_defs.h | 16 +- paddle/fluid/inference/CMakeLists.txt | 2 +- paddle/fluid/inference/anakin/CMakeLists.txt | 2 +- .../fluid/inference/anakin/convert/conv2d.cc | 2 +- .../inference/anakin/convert/conv2d_fusion.cc | 4 +- paddle/fluid/inference/anakin/convert/fc.cc | 2 +- .../fluid/inference/anakin/convert/helper.h | 2 +- .../inference/anakin/convert/op_converter.h | 2 +- .../inference/anakin/convert/ut_helper.h | 2 +- paddle/fluid/inference/anakin/engine.cc | 2 +- paddle/fluid/inference/analysis/analyzer.cc | 3 + .../inference/analysis/analyzer_tester.cc | 4 +- paddle/fluid/inference/analysis/argument.h | 8 +- paddle/fluid/inference/analysis/helper.cc | 12 + .../inference/analysis/ir_pass_manager.cc | 9 +- .../analysis/ir_passes/subgraph_detector.cc | 7 + .../ir_passes/tensorrt_subgraph_pass.cc | 60 +- .../ir_passes/tensorrt_subgraph_pass.h | 1 + .../inference/analysis/passes/CMakeLists.txt | 2 + .../passes/inference_op_replace_pass.cc | 2 +- .../analysis/passes/ir_graph_clean_pass.cc | 47 + .../analysis/passes/ir_graph_clean_pass.h} | 31 +- .../analysis/passes/memory_optimize_pass.cc | 596 +- .../analysis/passes/memory_optimize_pass.h | 68 +- .../fluid/inference/analysis/passes/passes.cc | 3 + paddle/fluid/inference/api/CMakeLists.txt | 4 +- paddle/fluid/inference/api/analysis_config.cc | 44 +- .../fluid/inference/api/analysis_predictor.cc | 124 +- .../fluid/inference/api/analysis_predictor.h | 9 +- .../api/analysis_predictor_tester.cc | 3 - paddle/fluid/inference/api/api.cc | 11 +- .../fluid/inference/api/api_anakin_engine.cc | 43 +- .../fluid/inference/api/api_anakin_engine.h | 8 +- paddle/fluid/inference/api/api_impl_tester.cc | 2 +- .../inference/api/demo_ci/CMakeLists.txt | 43 +- .../fluid/inference/api/demo_ci/vis_demo.cc | 3 + .../inference/api/details/zero_copy_tensor.cc | 19 +- paddle/fluid/inference/api/helper.h | 2 +- .../fluid/inference/api/mkldnn_quantizer.cc | 13 +- .../inference/api/paddle_analysis_config.h | 27 +- paddle/fluid/inference/api/paddle_api.h | 12 +- .../inference/api/paddle_inference_pass.h | 33 - .../inference/api/paddle_pass_builder.cc | 29 +- .../fluid/inference/api/paddle_pass_builder.h | 12 +- paddle/fluid/inference/io.cc | 2 +- paddle/fluid/inference/paddle_fluid.map | 3 +- .../fluid/inference/tensorrt/CMakeLists.txt | 2 +- .../inference/tensorrt/convert/CMakeLists.txt | 7 + .../tensorrt/convert/activation_op.cc | 20 +- .../tensorrt/convert/batch_norm_op.cc | 9 +- .../inference/tensorrt/convert/dropout_op.cc | 18 +- .../tensorrt/convert/elementwise_op.cc | 2 +- .../fluid/inference/tensorrt/convert/fc_op.cc | 4 +- .../tensorrt/convert/leaky_relu_op.cc | 13 +- .../inference/tensorrt/convert/op_converter.h | 6 +- .../inference/tensorrt/convert/prelu_op.cc | 4 +- .../tensorrt/convert/shuffle_channel_op.cc | 57 + .../inference/tensorrt/convert/swish_op.cc | 53 + .../tensorrt/convert/test_activation_op.cc | 3 + .../tensorrt/convert/test_dropout_op.cc | 3 + .../tensorrt/convert/test_op_converter.cc | 6 +- .../convert/test_shuffle_channel_op.cc | 48 + .../tensorrt/convert/test_swish_op.cc | 47 + .../inference/tensorrt/convert/ut_helper.h | 3 +- paddle/fluid/inference/tensorrt/engine.cc | 63 +- paddle/fluid/inference/tensorrt/engine.h | 53 +- paddle/fluid/inference/tensorrt/op_teller.cc | 32 +- paddle/fluid/inference/tensorrt/op_teller.h | 1 + .../inference/tensorrt/plugin/CMakeLists.txt | 2 +- 
.../tensorrt/plugin/prelu_op_plugin.cu | 1 + .../tensorrt/plugin/split_op_plugin.cu | 152 +- .../tensorrt/plugin/split_op_plugin.h | 1 + .../tensorrt/plugin/swish_op_plugin.cu | 76 + .../tensorrt/plugin/swish_op_plugin.h | 72 + .../tensorrt/plugin/trt_plugin_factory.h | 2 +- .../fluid/inference/tests/api/CMakeLists.txt | 41 +- .../tests/api/analyzer_bert_tester.cc | 36 +- .../tests/api/analyzer_dam_tester.cc | 27 - .../analyzer_int8_object_detection_tester.cc | 12 +- .../tests/api/analyzer_mm_dnn_tester.cc | 268 - .../tests/api/analyzer_seq_conv1_tester.cc | 2 +- .../api/full_pascalvoc_test_preprocess.py | 147 +- .../api/test_detection_dataset_preprocess.py | 35 + .../fluid/inference/tests/api/tester_helper.h | 186 +- .../tests/api/trt_cascade_rcnn_test.cc | 62 + .../inference/tests/api/trt_fc_prelu_test.cc | 58 + .../inference/tests/api/trt_mobilenet_test.cc | 1 + .../inference/tests/api/trt_resnet50_test.cc | 2 +- .../inference/tests/api/trt_test_helper.h | 1 + paddle/fluid/memory/CMakeLists.txt | 17 +- paddle/fluid/memory/allocation/CMakeLists.txt | 10 + paddle/fluid/memory/allocation/allocator.h | 13 +- .../memory/allocation/allocator_facade.cc | 29 +- .../memory/allocation/allocator_strategy.cc | 6 +- .../auto_growth_best_fit_allocator.cc | 15 + .../auto_growth_best_fit_allocator.h | 4 +- .../memory/allocation/best_fit_allocator.cc | 4 +- .../fluid/memory/allocation/cuda_allocator.cc | 8 +- .../cuda_device_context_allocator.h | 167 + .../allocation/naive_best_fit_allocator.cc | 7 +- .../allocation/naive_best_fit_allocator.h | 2 + .../memory/allocation/retry_allocator.cc | 53 +- .../fluid/memory/allocation/retry_allocator.h | 6 +- .../memory/allocation/retry_allocator_test.cc | 53 +- paddle/fluid/memory/detail/CMakeLists.txt | 6 +- paddle/fluid/memory/detail/buddy_allocator.cc | 90 +- paddle/fluid/memory/detail/buddy_allocator.h | 10 - .../memory/detail/buddy_allocator_test.cc | 159 +- paddle/fluid/memory/detail/memory_block.cc | 12 +- paddle/fluid/memory/detail/meta_cache.cc | 6 +- .../fluid/memory/detail/system_allocator.cc | 56 +- paddle/fluid/memory/detail/system_allocator.h | 1 - .../memory/detail/system_allocator_test.cc | 19 + paddle/fluid/memory/malloc.cc | 6 +- paddle/fluid/memory/malloc.h | 8 + paddle/fluid/memory/malloc_test.cu | 137 + .../fluid/op_use_default_grad_op_maker.spec | 15 +- paddle/fluid/operators/CMakeLists.txt | 18 +- paddle/fluid/operators/activation_op.cc | 173 +- paddle/fluid/operators/activation_op.cu | 14 + paddle/fluid/operators/activation_op.h | 260 +- paddle/fluid/operators/affine_channel_op.cc | 62 +- paddle/fluid/operators/affine_channel_op.cu | 15 +- paddle/fluid/operators/argsort_op.cu | 1 - paddle/fluid/operators/assign_op.cc | 64 +- paddle/fluid/operators/attention_lstm_op.cc | 17 +- paddle/fluid/operators/batch_norm_op.cc | 93 +- paddle/fluid/operators/batch_norm_op.cu | 131 +- paddle/fluid/operators/batch_norm_op.h | 22 +- paddle/fluid/operators/bpr_loss_op.h | 2 +- paddle/fluid/operators/center_loss_op.cc | 157 + paddle/fluid/operators/center_loss_op.cu | 146 + paddle/fluid/operators/center_loss_op.h | 155 + paddle/fluid/operators/clip_op.cc | 9 +- .../operators/collective/c_allgather_op.cu.cc | 6 +- .../operators/collective/c_allreduce_op.h | 4 +- .../operators/collective/c_broadcast_op.cu.cc | 12 +- .../collective/c_comm_init_all_op.cc | 93 + .../collective/c_reducescatter_op.cu.cc | 6 +- .../collective/c_sync_comm_stream_op.cc | 7 +- paddle/fluid/operators/concat_op.cc | 2 +- .../operators/controlflow/CMakeLists.txt | 3 + 
.../controlflow/conditional_block_op.cc | 71 +- .../controlflow/conditional_block_op.h | 20 +- .../conditional_block_op_helper.cc | 167 + .../controlflow/conditional_block_op_helper.h | 35 + .../fluid/operators/controlflow/fetch_op.cc | 12 +- .../controlflow/recurrent_op_helper.cc | 21 +- .../controlflow/recurrent_op_helper.h | 4 +- .../fluid/operators/controlflow/while_op.cc | 24 +- .../operators/controlflow/while_op_helper.cc | 26 +- .../operators/controlflow/while_op_helper.h | 3 +- paddle/fluid/operators/conv_cudnn_helper.h | 23 + paddle/fluid/operators/conv_cudnn_op.cu.cc | 22 +- paddle/fluid/operators/conv_cudnn_op_cache.h | 4 - paddle/fluid/operators/conv_fusion_op.cu.cc | 10 +- paddle/fluid/operators/conv_op.cc | 71 +- paddle/fluid/operators/conv_op.h | 213 + .../operators/conv_transpose_cudnn_op.cu.cc | 13 +- paddle/fluid/operators/conv_transpose_op.cc | 8 + paddle/fluid/operators/crf_decoding_op.cc | 90 +- paddle/fluid/operators/crf_decoding_op.h | 66 +- paddle/fluid/operators/crop_tensor_op.cc | 300 + ...near_chain_crf_op.cu => crop_tensor_op.cu} | 19 +- paddle/fluid/operators/crop_tensor_op.h | 284 + paddle/fluid/operators/cross_entropy_op.cc | 83 +- paddle/fluid/operators/cross_entropy_op.h | 22 +- paddle/fluid/operators/ctc_align_op.cc | 50 +- paddle/fluid/operators/ctc_align_op.cu | 111 +- paddle/fluid/operators/ctc_align_op.h | 105 +- paddle/fluid/operators/data_norm_op.cc | 23 +- .../operators/deformable_conv_filter.cu.h | 37 + paddle/fluid/operators/deformable_conv_func.h | 149 + paddle/fluid/operators/deformable_conv_op.cc | 10 +- paddle/fluid/operators/deformable_conv_op.cu | 61 +- paddle/fluid/operators/deformable_conv_op.h | 613 ++ .../fluid/operators/deformable_conv_v1_op.cc | 272 + .../fluid/operators/deformable_conv_v1_op.cu | 609 ++ .../fluid/operators/deformable_conv_v1_op.h | 564 ++ .../operators/deformable_psroi_pooling_op.cu | 9 +- paddle/fluid/operators/dequantize_op.cc | 3 +- paddle/fluid/operators/detail/safe_ref.h | 2 +- .../fluid/operators/detection/box_coder_op.cu | 6 +- .../detection/box_decoder_and_assign_op.cc | 24 +- .../detection/generate_mask_labels_op.cc | 8 +- paddle/fluid/operators/detection/gpc.cc | 7 +- .../operators/detection/multiclass_nms_op.cc | 85 +- .../detection/roi_perspective_transform_op.cc | 8 +- .../detection/roi_perspective_transform_op.cu | 8 +- .../operators/detection/target_assign_op.h | 1 - .../fluid/operators/detection/yolo_box_op.cu | 5 +- paddle/fluid/operators/dgc_op.h | 12 +- .../distributed/collective_server_test.cc | 5 + .../operators/distributed/communicator.cc | 59 +- .../operators/distributed/communicator.h | 9 +- .../distributed/communicator_test.cc | 3 +- .../operators/distributed/grpc/grpc_client.cc | 190 +- .../distributed/parameter_prefetch.cc | 237 +- .../distributed/parameter_prefetch.h | 63 +- .../operators/distributed/request_handler.h | 2 + .../distributed/request_handler_impl.cc | 37 +- .../fluid/operators/distributed/rpc_client.cc | 1 + .../fluid/operators/distributed/rpc_client.h | 1 + .../operators/distributed/rpc_server_test.cc | 7 + .../distributed_lookup_table_op.cc | 166 + .../distributed_ops/fetch_barrier_op.cc | 10 +- .../distributed_ops/fl_listen_and_serv_op.cc | 279 + .../distributed_ops/fl_listen_and_serv_op.h | 91 + .../distributed_ops/listen_and_serv_op.cc | 2 + .../operators/distributed_ops/recv_op.cc | 22 +- .../distributed_ops/send_barrier_op.cc | 11 +- .../operators/distributed_ops/send_op.cc | 15 +- .../operators/distributed_ops/split_ids_op.cc | 17 +- 
paddle/fluid/operators/dropout_op.cu | 22 +- paddle/fluid/operators/dropout_op.h | 17 +- .../operators/elementwise/CMakeLists.txt | 2 + .../elementwise/elementwise_add_op.cc | 4 +- .../elementwise/elementwise_add_op.cu | 4 +- .../elementwise/elementwise_div_op.cc | 3 +- .../elementwise/elementwise_div_op.cu | 2 + .../elementwise/elementwise_div_op.h | 38 +- .../elementwise/elementwise_mod_op.cc | 4 +- .../elementwise/elementwise_mod_op.cu | 4 +- .../elementwise/elementwise_mod_op.h | 29 + .../elementwise/elementwise_mul_op.cc | 3 +- .../elementwise/elementwise_mul_op.cu | 4 +- .../elementwise/elementwise_mul_op.h | 54 +- .../operators/elementwise/elementwise_op.h | 35 +- .../elementwise/elementwise_op_function.h | 214 +- .../elementwise/elementwise_pow_op.cc | 39 +- .../elementwise/elementwise_pow_op.cu | 11 +- .../elementwise/elementwise_pow_op.h | 41 +- .../elementwise/elementwise_sub_op.cc | 4 +- .../mkldnn/elementwise_add_mkldnn_op.cc | 44 +- .../mkldnn/elementwise_mul_mkldnn_op.cc | 25 +- .../test_elementwise_add_grad_grad.cc | 83 + .../test_elementwise_div_grad_grad.cc | 97 + .../test_elementwise_op_grad_grad.h | 151 + paddle/fluid/operators/expand_op.cc | 56 +- paddle/fluid/operators/expand_op.h | 21 +- paddle/fluid/operators/eye_op.cc | 91 + paddle/fluid/operators/eye_op.cu | 24 + paddle/fluid/operators/eye_op.h | 61 + paddle/fluid/operators/fake_quantize_op.h | 9 +- paddle/fluid/operators/fc_op.cc | 200 +- .../fc_op.cu.cc} | 11 +- paddle/fluid/operators/fc_op.h | 52 +- paddle/fluid/operators/fill_op.cc | 104 +- .../fill_op.cu.cc} | 7 +- paddle/fluid/operators/fill_op.h | 80 + paddle/fluid/operators/filter_by_instag_op.cc | 146 + paddle/fluid/operators/filter_by_instag_op.h | 201 + paddle/fluid/operators/flatten_op.cc | 236 +- paddle/fluid/operators/flatten_op.cu.cc | 44 + paddle/fluid/operators/flatten_op.h | 116 + paddle/fluid/operators/fused/CMakeLists.txt | 4 +- .../fused/fused_embedding_fc_lstm_op.cc | 4 +- .../fused/fused_embedding_seq_pool_op.cc | 6 + .../fused/fused_embedding_seq_pool_op.h | 131 +- .../fused_fc_elementwise_layernorm_op.cc | 185 + .../fused_fc_elementwise_layernorm_op.cu | 201 + .../fused/fusion_conv_inception_op.cu | 8 +- paddle/fluid/operators/fused/fusion_gru_op.cc | 26 +- .../fluid/operators/fused/fusion_lstm_op.cc | 20 +- .../fused/fusion_repeated_fc_relu_op.cc | 3 +- .../fused/fusion_seqconv_eltadd_relu_op.cc | 11 +- .../fused/fusion_seqexpand_concat_fc_op.cc | 12 +- .../fused/fusion_seqpool_concat_op.cc | 3 +- .../fused/fusion_seqpool_cvm_concat_op.cc | 148 + .../fused/fusion_seqpool_cvm_concat_op.h | 41 + .../fused/fusion_squared_mat_sub_op.cc | 3 +- paddle/fluid/operators/gather.cu.h | 90 +- paddle/fluid/operators/gather.h | 58 +- paddle/fluid/operators/gather_nd_op.cc | 183 + paddle/fluid/operators/gather_nd_op.cu | 105 + paddle/fluid/operators/gather_nd_op.h | 91 + .../operators/grid_sampler_cudnn_op.cu.cc | 10 +- paddle/fluid/operators/group_norm_op.cc | 29 +- paddle/fluid/operators/group_norm_op.cu | 138 +- paddle/fluid/operators/group_norm_op.h | 215 +- .../fluid/operators/hierarchical_sigmoid_op.h | 8 +- paddle/fluid/operators/huber_loss_op.cc | 42 +- paddle/fluid/operators/huber_loss_op.h | 8 +- paddle/fluid/operators/instance_norm_op.cc | 646 +++ paddle/fluid/operators/instance_norm_op.cu | 655 +++ paddle/fluid/operators/instance_norm_op.h | 121 + paddle/fluid/operators/interpolate_op.cc | 295 +- paddle/fluid/operators/interpolate_op.cu | 917 ++- paddle/fluid/operators/interpolate_op.h | 778 ++- paddle/fluid/operators/jit/gen/seqpool.cc | 2 +-
paddle/fluid/operators/jit/kernels.h | 160 - paddle/fluid/operators/label_smooth_op.cu | 94 +- paddle/fluid/operators/linear_chain_crf_op.cc | 125 +- paddle/fluid/operators/linear_chain_crf_op.h | 190 +- paddle/fluid/operators/load_combine_op.h | 9 +- paddle/fluid/operators/lod_reset_op.cc | 10 +- paddle/fluid/operators/lod_reset_op.h | 12 +- paddle/fluid/operators/lookup_table_op.cu | 86 +- paddle/fluid/operators/lookup_table_op.h | 41 +- paddle/fluid/operators/lookup_table_v2_op.cc | 192 + paddle/fluid/operators/lookup_table_v2_op.cu | 201 + paddle/fluid/operators/lookup_table_v2_op.h | 218 + paddle/fluid/operators/lstm_unit_op.cu | 1 - .../fluid/operators/match_matrix_tensor_op.cc | 334 ++ .../fluid/operators/match_matrix_tensor_op.h | 41 + paddle/fluid/operators/math.h | 4 + paddle/fluid/operators/math/CMakeLists.txt | 3 +- paddle/fluid/operators/math/blas.h | 37 + paddle/fluid/operators/math/blas_impl.cu.h | 28 +- paddle/fluid/operators/math/blas_impl.h | 193 +- .../fluid/operators/math/concat_and_split.cu | 15 +- paddle/fluid/operators/math/cpu_vec.h | 2 +- paddle/fluid/operators/math/cross_entropy.cu | 5 +- paddle/fluid/operators/math/cross_entropy.h | 3 +- paddle/fluid/operators/math/depthwise_conv.cu | 8 +- paddle/fluid/operators/math/fc.cc | 62 + paddle/fluid/operators/math/fc.cu | 73 + paddle/fluid/operators/math/fc.h | 34 + paddle/fluid/operators/math/fc_compute.h | 55 - paddle/fluid/operators/math/im2col.cu | 16 +- paddle/fluid/operators/math/sample_prob.cu | 6 +- .../operators/math/selected_rows_functor.cc | 104 + .../operators/math/selected_rows_functor.cu | 10 +- .../operators/math/selected_rows_functor.h | 12 + .../math/selected_rows_functor_test.cc | 40 + .../fluid/operators/math/sequence_pooling.cc | 84 +- .../fluid/operators/math/sequence_pooling.cu | 14 +- .../fluid/operators/math/sequence_pooling.h | 4 +- paddle/fluid/operators/math/softmax.cu | 4 +- paddle/fluid/operators/math/softmax_impl.h | 42 +- paddle/fluid/operators/math/unpooling.cu | 10 +- paddle/fluid/operators/math/vol2col.cu | 8 +- paddle/fluid/operators/matmul_op.cc | 43 +- paddle/fluid/operators/mean_iou_op.cu | 5 +- paddle/fluid/operators/merge_lod_tensor_op.cc | 37 +- .../operators/mkldnn/activation_mkldnn_op.cc | 161 +- .../operators/mkldnn/batch_norm_mkldnn_op.cc | 87 +- .../operators/mkldnn/concat_mkldnn_op.cc | 53 +- .../fluid/operators/mkldnn/conv_mkldnn_op.cc | 815 ++- .../mkldnn/conv_transpose_mkldnn_op.cc | 81 +- .../operators/mkldnn/dequantize_mkldnn_op.cc | 23 +- paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc | 47 +- .../mkldnn/gaussian_random_mkldnn_op.cc | 2 - .../fluid/operators/mkldnn/lrn_mkldnn_op.cc | 99 +- .../fluid/operators/mkldnn/mul_mkldnn_op.cc | 81 +- .../fluid/operators/mkldnn/pool_mkldnn_op.cc | 162 +- .../operators/mkldnn/quantize_mkldnn_op.cc | 21 +- .../operators/mkldnn/requantize_mkldnn_op.cc | 12 +- .../operators/mkldnn/softmax_mkldnn_op.cc | 243 +- .../fluid/operators/mkldnn/sum_mkldnn_op.cc | 33 +- .../operators/mkldnn/transpose_mkldnn_op.cc | 42 +- .../fluid/operators/modified_huber_loss_op.h | 5 +- paddle/fluid/operators/mul_op.cc | 16 +- paddle/fluid/operators/nce_op.h | 7 +- .../fluid/operators/ngraph/ngraph_bridge.cc | 3 +- .../fluid/operators/ngraph/ngraph_engine.cc | 201 +- paddle/fluid/operators/ngraph/ngraph_engine.h | 123 +- paddle/fluid/operators/ngraph/ops/concat_op.h | 5 +- paddle/fluid/operators/ngraph/ops/conv2d_op.h | 4 +- .../operators/ngraph/ops/cross_entropy_op.h | 3 +- .../fluid/operators/ngraph/ops/dropout_op.h | 4 +-
.../operators/ngraph/ops/lookup_table_op.h | 22 +- .../fluid/operators/ngraph/ops/reshape_op.h | 3 +- paddle/fluid/operators/ngraph/ops/slice_op.h | 12 +- paddle/fluid/operators/norm_utils.h | 46 + paddle/fluid/operators/one_hot_v2_op.cc | 122 + paddle/fluid/operators/one_hot_v2_op.cu | 99 + paddle/fluid/operators/one_hot_v2_op.h | 94 + .../fluid/operators/optimizers/adadelta_op.cc | 5 + .../fluid/operators/optimizers/adagrad_op.cc | 5 + paddle/fluid/operators/optimizers/adam_op.cc | 5 + .../fluid/operators/optimizers/adamax_op.cc | 5 + .../optimizers/decayed_adagrad_op.cc | 5 + paddle/fluid/operators/optimizers/dpsgd_op.cc | 107 + paddle/fluid/operators/optimizers/dpsgd_op.h | 114 + paddle/fluid/operators/optimizers/ftrl_op.cc | 5 + .../operators/optimizers/lars_momentum_op.h | 2 +- .../fluid/operators/optimizers/momentum_op.h | 11 +- paddle/fluid/operators/optimizers/sgd_op.cc | 5 + paddle/fluid/operators/pool_cudnn_op.cu.cc | 8 +- paddle/fluid/operators/prelu_op.cu | 4 +- paddle/fluid/operators/prroi_pool_op.cc | 188 + paddle/fluid/operators/prroi_pool_op.cu | 309 + paddle/fluid/operators/prroi_pool_op.h | 364 ++ paddle/fluid/operators/pull_box_sparse_op.cc | 121 + paddle/fluid/operators/pull_box_sparse_op.cu | 44 + paddle/fluid/operators/pull_box_sparse_op.h | 90 + paddle/fluid/operators/quantize_op.cc | 3 +- paddle/fluid/operators/random_crop_op.cc | 2 +- paddle/fluid/operators/random_crop_op.h | 11 +- paddle/fluid/operators/reader/CMakeLists.txt | 7 - .../fluid/operators/reader/buffered_reader.cc | 13 +- .../reader/create_batch_reader_op.cc | 151 - .../reader/create_custom_reader_op.cc | 2 +- .../reader/create_multi_pass_reader_op.cc | 93 - .../reader/create_random_data_generator_op.cc | 107 - .../reader/create_recordio_file_reader_op.cc | 93 - .../reader/create_shuffle_reader_op.cc | 124 - .../fluid/operators/reader/open_files_op.cc | 277 - .../operators/reader/reader_op_registry.cc | 15 - .../operators/reader/reader_op_registry.h | 3 - paddle/fluid/operators/recurrent_op.cc | 182 +- paddle/fluid/operators/recurrent_op.h | 27 +- .../fluid/operators/reduce_ops/cub_reduce.h | 2 +- .../operators/reduce_ops/reduce_mean_op.cc | 24 +- .../reduce_ops/reduce_mean_op.part.cu | 18 +- paddle/fluid/operators/reduce_ops/reduce_op.h | 22 +- .../operators/reduce_ops/reduce_sum_op.cc | 61 +- .../operators/reduce_ops/reduce_sum_op.h | 5 +- .../reduce_ops/reduce_sum_op.part.cu | 18 +- paddle/fluid/operators/requantize_op.cc | 3 +- paddle/fluid/operators/reshape_op.cc | 99 +- .../fluid/operators/rnn_memory_helper_op.cc | 2 +- paddle/fluid/operators/roi_align_op.cc | 7 +- paddle/fluid/operators/roi_align_op.cu | 11 +- paddle/fluid/operators/roi_align_op.h | 22 +- paddle/fluid/operators/roi_pool_op.cu | 10 +- paddle/fluid/operators/row_conv_op.cc | 94 +- paddle/fluid/operators/row_conv_op.cu | 61 +- paddle/fluid/operators/sample_logits_op.h | 26 +- paddle/fluid/operators/scale_op.cc | 3 + paddle/fluid/operators/scale_op.cu | 5 + paddle/fluid/operators/scatter.cu.h | 89 +- paddle/fluid/operators/scatter.h | 69 +- paddle/fluid/operators/scatter_nd_add_op.cc | 186 + paddle/fluid/operators/scatter_nd_add_op.cu | 98 + paddle/fluid/operators/scatter_nd_add_op.h | 86 + paddle/fluid/operators/scatter_op.cu | 35 +- paddle/fluid/operators/scatter_op.h | 17 +- paddle/fluid/operators/search_compute.h | 138 + .../sequence_ops/sequence_mask_op.cc | 2 +- .../operators/sequence_ops/sequence_mask_op.h | 4 +- .../operators/sequence_ops/sequence_pad_op.cc | 47 +- .../sequence_ops/sequence_pool_op.cc | 20 +- 
.../operators/sequence_ops/sequence_pool_op.h | 21 +- .../sequence_ops/sequence_softmax_op.h | 3 + .../sequence_topk_avg_pooling_op.cc | 130 + .../sequence_topk_avg_pooling_op.h | 213 + .../sequence_ops/sequence_unpad_op.cc | 30 +- .../sigmoid_cross_entropy_with_logits_op.cc | 12 +- .../sigmoid_cross_entropy_with_logits_op.cu | 13 +- paddle/fluid/operators/slice_op.cc | 167 +- paddle/fluid/operators/slice_op.cu | 14 +- paddle/fluid/operators/slice_op.h | 139 +- paddle/fluid/operators/softmax_op.cc | 25 +- .../softmax_with_cross_entropy_op.cc | 20 +- paddle/fluid/operators/squeeze_op.cc | 275 +- paddle/fluid/operators/squeeze_op.cu.cc | 44 + paddle/fluid/operators/squeeze_op.h | 146 + paddle/fluid/operators/stack_op.h | 5 +- paddle/fluid/operators/strided_slice_op.cc | 272 + paddle/fluid/operators/strided_slice_op.cu | 30 + paddle/fluid/operators/strided_slice_op.h | 350 ++ paddle/fluid/operators/sum_op.cc | 10 +- paddle/fluid/operators/sum_op.cu | 16 +- paddle/fluid/operators/sum_op.h | 10 +- paddle/fluid/operators/sync_batch_norm_op.cu | 311 +- paddle/fluid/operators/temporal_shift_op.cc | 15 +- .../operators/tensor_array_to_tensor_op.cc | 15 +- .../operators/tensorrt/tensorrt_engine_op.h | 33 +- .../tensorrt/tensorrt_engine_op_test.cc | 2 + .../test_leaky_relu_grad_grad_functor.cc} | 30 +- .../test_leaky_relu_grad_grad_functor.cu} | 26 +- .../test_leaky_relu_grad_grad_functor.h | 124 + paddle/fluid/operators/top_k_op.cu | 1 - .../uniform_random_batch_size_like_op.cc | 8 + paddle/fluid/operators/uniform_random_op.cc | 35 +- paddle/fluid/operators/uniform_random_op.cu | 33 +- paddle/fluid/operators/unique_op.h | 41 +- .../fluid/operators/unique_with_counts_op.cc | 71 + .../fluid/operators/unique_with_counts_op.h | 43 + paddle/fluid/operators/unsqueeze_op.cc | 232 +- paddle/fluid/operators/unsqueeze_op.cu.cc | 45 + paddle/fluid/operators/unsqueeze_op.h | 137 + paddle/fluid/operators/unstack_op.cc | 150 +- paddle/fluid/operators/unstack_op.cu | 32 + paddle/fluid/operators/unstack_op.h | 254 +- paddle/fluid/operators/var_conv_2d_op.cc | 431 ++ paddle/fluid/operators/var_conv_2d_op.h | 45 + paddle/fluid/operators/warpctc_cudnn_op.cu.cc | 197 - paddle/fluid/operators/warpctc_op.cc | 66 +- paddle/fluid/operators/warpctc_op.h | 162 +- paddle/fluid/platform/CMakeLists.txt | 21 +- paddle/fluid/platform/assert.h | 48 - paddle/fluid/platform/collective_helper.cc | 82 +- paddle/fluid/platform/collective_helper.h | 50 +- paddle/fluid/platform/cpu_info.cc | 13 +- paddle/fluid/platform/cuda_helper.h | 10 +- paddle/fluid/platform/cudnn_desc.h | 4 +- paddle/fluid/platform/cudnn_helper.h | 46 +- .../fluid/platform/cudnn_workspace_helper.h | 2 +- paddle/fluid/platform/device_code.cc | 123 + paddle/fluid/platform/device_code.h | 64 + paddle/fluid/platform/device_code_test.cc | 78 + paddle/fluid/platform/device_context.cc | 165 +- paddle/fluid/platform/device_context.h | 219 +- paddle/fluid/platform/dynload/CMakeLists.txt | 7 +- paddle/fluid/platform/dynload/cublas.h | 18 +- paddle/fluid/platform/dynload/cuda_driver.cc | 30 + paddle/fluid/platform/dynload/cuda_driver.h | 79 + .../fluid/platform/dynload/dynamic_loader.cc | 46 +- .../fluid/platform/dynload/dynamic_loader.h | 5 +- paddle/fluid/platform/dynload/mklml.cc | 5 + paddle/fluid/platform/dynload/mklml.h | 5 + paddle/fluid/platform/dynload/nvrtc.cc | 30 + paddle/fluid/platform/dynload/nvrtc.h | 77 + paddle/fluid/platform/dynload/tensorrt.h | 4 +- paddle/fluid/platform/enforce.h | 237 +- paddle/fluid/platform/enforce_test.cc | 106 + 
paddle/fluid/platform/flags.cc | 453 ++ paddle/fluid/platform/float16_test.cc | 3 +- paddle/fluid/platform/float16_test.cu | 5 +- paddle/fluid/platform/gpu_info.cc | 144 +- paddle/fluid/platform/gpu_info.h | 4 + paddle/fluid/platform/init.cc | 11 +- paddle/fluid/platform/init.h | 4 + paddle/fluid/platform/init_test.cc | 7 + paddle/fluid/platform/mkldnn_helper.h | 111 +- paddle/fluid/platform/mkldnn_reuse.h | 1033 ++-- paddle/fluid/platform/nccl_helper.h | 24 +- paddle/fluid/platform/profiler.cu | 8 +- paddle/fluid/platform/temporary_allocator.cc | 121 - paddle/fluid/platform/temporary_allocator.h | 68 - .../platform/temporary_allocator_test.cc | 222 - paddle/fluid/pybind/.gitignore | 1 + paddle/fluid/pybind/CMakeLists.txt | 8 +- paddle/fluid/pybind/box_helper_py.cc | 50 + .../pybind/{recordio.h => box_helper_py.h} | 5 +- paddle/fluid/pybind/const_value.cc | 2 - paddle/fluid/pybind/data_set_py.cc | 177 +- paddle/fluid/pybind/fleet_wrapper_py.cc | 9 +- paddle/fluid/pybind/imperative.cc | 220 +- paddle/fluid/pybind/imperative.h | 4 - paddle/fluid/pybind/inference_api.cc | 155 +- paddle/fluid/pybind/pybind.cc | 307 +- paddle/fluid/pybind/pybind.h | 553 -- paddle/fluid/pybind/reader_py.cc | 13 +- paddle/fluid/pybind/recordio.cc | 88 - paddle/fluid/recordio/CMakeLists.txt | 9 - paddle/fluid/recordio/README.md | 13 - paddle/fluid/recordio/chunk.cc | 174 - paddle/fluid/recordio/chunk.h | 73 - paddle/fluid/recordio/chunk_test.cc | 47 - paddle/fluid/recordio/header.cc | 70 - paddle/fluid/recordio/header.h | 66 - paddle/fluid/recordio/scanner.cc | 57 - paddle/fluid/recordio/writer.cc | 40 - paddle/fluid/recordio/writer.h | 44 - paddle/fluid/recordio/writer_scanner_test.cc | 70 - paddle/fluid/string/CMakeLists.txt | 6 +- paddle/fluid/string/piece.cc | 22 +- paddle/fluid/string/string_helper.cc | 20 + paddle/fluid/string/string_helper.h | 32 +- paddle/fluid/string/to_string.h | 10 - paddle/fluid/train/CMakeLists.txt | 4 - .../fluid/train/custom_trainer/CMakeLists.txt | 1 - .../train/custom_trainer/feed/.clang-format | 33 - .../train/custom_trainer/feed/CMakeLists.txt | 19 - .../custom_trainer/feed/accessor/accessor.h | 19 - .../feed/accessor/dense_input_accessor.cc | 191 - .../feed/accessor/epoch_accessor.cc | 187 - .../feed/accessor/epoch_accessor.h | 111 - .../feed/accessor/input_data_accessor.h | 208 - .../feed/accessor/label_input_accessor.cc | 80 - .../feed/accessor/sparse_input_accessor.cc | 286 - .../feed/accessor/weights_input_accessor.cc | 74 - .../custom_trainer/feed/common/CMakeLists.txt | 1 - .../feed/common/bthread_task_runner.cc | 22 - .../feed/common/bthread_task_runner.h | 50 - .../custom_trainer/feed/common/pipeline.h | 161 - .../feed/common/pslib_warpper.cc | 89 - .../feed/common/pslib_warpper.h | 43 - .../custom_trainer/feed/common/registerer.cc | 18 - .../custom_trainer/feed/common/registerer.h | 114 - .../feed/common/runtime_environment.cc | 236 - .../feed/common/runtime_environment.h | 125 - .../custom_trainer/feed/common/scope_helper.h | 80 - .../custom_trainer/feed/common/yaml_helper.h | 32 - .../train/custom_trainer/feed/conf/env.conf | 19 - .../custom_trainer/feed/conf/gflags.conf | 5 - .../custom_trainer/feed/conf/ps_table_config | 120 - .../custom_trainer/feed/conf/trainer.yaml | 58 - .../feed/dataset/abacus_data_reader.cc | 76 - .../feed/dataset/archive_data_reader.cc | 390 -- .../feed/dataset/data_reader.cc | 194 - .../custom_trainer/feed/dataset/data_reader.h | 140 - .../custom_trainer/feed/dataset/dataset.cc | 80 - .../custom_trainer/feed/dataset/dataset.h | 49 - 
.../feed/dataset/dataset_container.cc | 199 - .../feed/dataset/dataset_container.h | 89 - .../custom_trainer/feed/executor/executor.cc | 126 - .../custom_trainer/feed/executor/executor.h | 35 - .../feed/executor/multi_thread_executor.cc | 258 - .../feed/executor/multi_thread_executor.h | 103 - .../feed/io/auto_file_system.cc | 96 - .../custom_trainer/feed/io/file_system.cc | 78 - .../custom_trainer/feed/io/file_system.h | 42 - .../feed/io/hadoop_file_system.cc | 197 - .../feed/io/local_file_system.cc | 122 - .../train/custom_trainer/feed/io/shell.cc | 367 -- .../train/custom_trainer/feed/io/shell.h | 64 - .../fluid/train/custom_trainer/feed/main.cc | 81 - .../feed/model/epoch_donefile.txt | 3 - .../feed/monitor/auc_monitor.cc | 154 - .../custom_trainer/feed/monitor/auc_monitor.h | 61 - .../feed/monitor/cost_monitor.cc | 32 - .../feed/monitor/cost_monitor.h | 54 - .../custom_trainer/feed/monitor/monitor.h | 56 - .../feed/process/CMakeLists.txt | 1 - .../feed/process/data_set_process.h | 23 - .../feed/process/init_env_process.cc | 57 - .../feed/process/init_env_process.h | 22 - .../feed/process/learner_process.cc | 302 - .../feed/process/learner_process.h | 37 - .../custom_trainer/feed/process/process.cc | 17 - .../custom_trainer/feed/process/process.h | 25 - .../feed/scripts/compake_runable_package.sh | 44 - .../feed/scripts/create_programs.py | 203 - .../custom_trainer/feed/scripts/example.py | 53 - .../train/custom_trainer/feed/scripts/join.py | 100 - .../feed/scripts/model/example/main_program | Bin 73923 -> 0 bytes .../feed/scripts/model/example/model.yaml | 49 - .../scripts/model/example/startup_program | Bin 29236 -> 0 bytes .../feed/scripts/model/example/test_program | Bin 32767 -> 0 bytes .../feed/scripts/model/join/inference_program | Bin 37270 -> 0 bytes .../model/join/inference_program.pbtxt | 2462 -------- .../feed/scripts/model/join/main_program | Bin 77253 -> 0 bytes .../scripts/model/join/main_program.pbtxt | 5141 ----------------- .../feed/scripts/model/join/model.yaml | 109 - .../feed/scripts/model/join/startup_program | Bin 29929 -> 0 bytes .../scripts/model/join/startup_program.pbtxt | 1464 ----- .../feed/scripts/model/join/test_program | Bin 34269 -> 0 bytes .../scripts/model/join/test_program.pbtxt | 2305 -------- .../scripts/model/update/inference_program | Bin 27921 -> 0 bytes .../model/update/inference_program.pbtxt | 1799 ------ .../feed/scripts/model/update/main_program | Bin 57844 -> 0 bytes .../scripts/model/update/main_program.pbtxt | 3791 ------------ .../feed/scripts/model/update/model.yaml | 90 - .../feed/scripts/model/update/startup_program | Bin 19253 -> 0 bytes .../model/update/startup_program.pbtxt | 949 --- .../feed/scripts/model/update/test_program | Bin 24920 -> 0 bytes .../scripts/model/update/test_program.pbtxt | 1642 ------ .../feed/scripts/start_feed_trainer.sh | 50 - .../custom_trainer/feed/scripts/submit_mpi.sh | 32 - .../custom_trainer/feed/scripts/update.py | 95 - .../custom_trainer/feed/shuffler/shuffler.cc | 248 - .../custom_trainer/feed/shuffler/shuffler.h | 69 - .../custom_trainer/feed/temp/feed_trainer.cpp | 115 - .../feed/tool/format_newcate_hotnews.awk | 21 - .../train/custom_trainer/feed/tool/gdbinit | 697 --- .../custom_trainer/feed/tool/ins_weight.py | 122 - .../feed/tool/xbox_compressor_mf.py | 162 - .../feed/tool/xbox_decompressor_mf.awk | 52 - .../feed/tool/xbox_pb_converter | Bin 6983561 -> 0 bytes .../feed/tool/xbox_pb_deconverter | Bin 6970751 -> 0 bytes .../custom_trainer/feed/trainer_context.h | 144 - 
.../custom_trainer/feed/unit_test/main.cc | 13 - .../feed/unit_test/test_archive_dataitem.cc | 20 - .../feed/unit_test/test_create_programs.cc | 147 - .../feed/unit_test/test_datareader.cc | 265 - .../feed/unit_test/test_datareader_omp.cc | 214 - .../feed/unit_test/test_executor.cc | 122 - paddle/fluid/train/demo/CMakeLists.txt | 6 +- paddle/fluid/train/imdb_demo/CMakeLists.txt | 74 + paddle/fluid/train/imdb_demo/README.md | 97 + paddle/fluid/train/imdb_demo/demo_trainer.cc | 184 + .../fluid/train/imdb_demo/generate_program.py | 72 + paddle/fluid/train/imdb_demo/imdb_reader.py | 75 + .../train/imdb_demo/include/save_model.h | 41 + paddle/fluid/train/imdb_demo/nets.py | 140 + paddle/fluid/train/imdb_demo/run.sh | 3 + paddle/fluid/train/imdb_demo/save_model.cc | 77 + paddle/fluid/train/imdb_demo/train.cfg | 7 + .../fluid/train/imdb_demo/train_filelist.txt | 12 + .../train/test_train_recognize_digits.cc | 3 +- paddle/scripts/Dockerfile.tmp | 177 - paddle/scripts/build_docker_images.sh | 22 +- paddle/scripts/fast_install.sh | 149 +- paddle/scripts/paddle_build.sh | 227 +- paddle/testing/paddle_gtest_main.cc | 17 +- python/paddle/__init__.py | 5 + python/paddle/check_import_scipy.py | 29 + python/paddle/dataset/cifar.py | 32 +- python/paddle/dataset/common.py | 55 +- python/paddle/dataset/conll05.py | 10 +- python/paddle/dataset/imdb.py | 11 +- python/paddle/dataset/imikolov.py | 14 +- python/paddle/dataset/mnist.py | 10 +- python/paddle/dataset/movielens.py | 11 +- python/paddle/dataset/sentiment.py | 10 +- python/paddle/dataset/tests/common_test.py | 97 - python/paddle/dataset/uci_housing.py | 10 +- python/paddle/dataset/wmt14.py | 10 - python/paddle/dataset/wmt16.py | 31 - python/paddle/distributed/launch.py | 113 +- python/paddle/distributed/launch_ps.py | 32 +- python/paddle/fluid/__init__.py | 25 +- python/paddle/fluid/backward.py | 548 +- python/paddle/fluid/clip.py | 56 +- python/paddle/fluid/compiler.py | 136 +- .../paddle/fluid/contrib/layers/__init__.py | 4 + .../paddle/fluid/contrib/layers/metric_op.py | 188 + python/paddle/fluid/contrib/layers/nn.py | 341 +- .../contrib/mixed_precision/decorator.py | 59 +- .../contrib/mixed_precision/fp16_lists.py | 15 +- .../contrib/mixed_precision/fp16_utils.py | 170 +- .../contrib/quantize/quantize_transpiler.py | 62 +- .../fluid/contrib/slim/core/compressor.py | 61 +- .../paddle/fluid/contrib/slim/core/config.py | 2 +- .../fluid/contrib/slim/core/strategy.py | 7 + .../distillation/distillation_strategy.py | 3 +- .../contrib/slim/distillation/distiller.py | 20 +- .../fluid/contrib/slim/graph/executor.py | 4 +- .../fluid/contrib/slim/graph/graph_wrapper.py | 60 +- .../contrib/slim/nas/light_nas_strategy.py | 51 +- .../fluid/contrib/slim/nas/search_space.py | 9 + .../contrib/slim/prune/prune_strategy.py | 48 +- .../mkldnn_post_training_strategy.py | 19 +- .../quantization/quantization_mkldnn_pass.py | 378 +- .../slim/quantization/quantization_pass.py | 144 +- .../quantization/quantization_strategy.py | 60 +- .../fluid/contrib/slim/tests/CMakeLists.txt | 30 +- .../slim/tests/QAT_mkldnn_int8_readme.md | 10 +- .../contrib/slim/tests/configs/compress.yaml | 4 + .../slim/tests/distillation/compress.yaml | 8 +- .../slim/tests/filter_pruning/compress.yaml | 2 +- .../tests/filter_pruning/uniform_restore.yaml | 21 + .../filter_pruning/uniform_restore_0.yaml | 21 + .../filter_pruning/uniform_restore_1.yaml | 21 + .../slim/tests/light_nas/compress.yaml | 1 + .../slim/tests/light_nas/light_nas_space.py | 11 + .../contrib/slim/tests/qat_int8_comparison.py | 44 +-
.../slim/tests/quantization/compress_1.yaml | 50 + .../contrib/slim/tests/test_compressor.py | 99 + .../contrib/slim/tests/test_filter_pruning.py | 75 + .../fluid/contrib/slim/tests/test_graph.py | 2 +- .../contrib/slim/tests/test_graph_wrapper.py | 20 +- .../contrib/slim/tests/test_light_nas.py | 90 +- .../test_mkldnn_int8_quantization_strategy.py | 2 +- .../tests/test_quantization_mkldnn_pass.py | 6 +- .../slim/tests/test_quantization_pass.py | 93 +- .../slim/tests/test_quantization_strategy.py | 44 +- ....py => test_slim_distillation_strategy.py} | 0 .../tests/test_image_classification_fp16.py | 152 +- .../paddle/fluid/contrib/utils/hdfs_utils.py | 2 +- python/paddle/fluid/core.py | 26 +- python/paddle/fluid/data_feeder.py | 22 +- python/paddle/fluid/dataset.py | 234 +- python/paddle/fluid/device_worker.py | 47 +- python/paddle/fluid/dygraph/base.py | 20 +- python/paddle/fluid/dygraph/checkpoint.py | 24 +- python/paddle/fluid/dygraph/layers.py | 2 + .../fluid/dygraph/learning_rate_scheduler.py | 21 + python/paddle/fluid/dygraph/nn.py | 106 +- python/paddle/fluid/dygraph/parallel.py | 69 +- python/paddle/fluid/dygraph/tracer.py | 31 +- python/paddle/fluid/evaluator.py | 2 +- python/paddle/fluid/executor.py | 123 +- python/paddle/fluid/framework.py | 343 +- .../fluid/incubate/data_generator/__init__.py | 2 +- .../fluid/incubate/fleet/base/fleet_base.py | 20 +- .../fluid/incubate/fleet/base/role_maker.py | 145 +- .../incubate/fleet/collective/__init__.py | 363 +- .../distribute_transpiler/__init__.py | 70 +- .../fleet/parameter_server/pslib/__init__.py | 152 +- .../fleet/parameter_server/pslib/node.py | 311 +- .../pslib/optimizer_factory.py | 134 +- .../fleet/parameter_server/pslib/ps_pb2.py | 226 +- .../fleet/utils/fleet_barrier_util.py | 55 + .../fluid/incubate/fleet/utils/fleet_util.py | 1433 +++++ .../paddle/fluid/incubate/fleet/utils/hdfs.py | 144 +- python/paddle/fluid/initializer.py | 97 +- python/paddle/fluid/input.py | 137 + python/paddle/fluid/io.py | 71 +- python/paddle/fluid/layer_helper_base.py | 24 +- python/paddle/fluid/layers/control_flow.py | 34 +- python/paddle/fluid/layers/detection.py | 254 +- python/paddle/fluid/layers/distributions.py | 207 +- python/paddle/fluid/layers/io.py | 445 +- .../fluid/layers/layer_function_generator.py | 2 +- .../fluid/layers/learning_rate_scheduler.py | 18 +- python/paddle/fluid/layers/math_op_patch.py | 127 +- python/paddle/fluid/layers/metric_op.py | 4 +- python/paddle/fluid/layers/nn.py | 2987 ++++++++-- python/paddle/fluid/layers/ops.py | 44 +- python/paddle/fluid/layers/tensor.py | 78 +- python/paddle/fluid/optimizer.py | 618 +- python/paddle/fluid/parallel_executor.py | 7 +- python/paddle/fluid/param_attr.py | 14 +- python/paddle/fluid/profiler.py | 2 +- python/paddle/fluid/reader.py | 872 ++- python/paddle/fluid/recordio_writer.py | 132 - python/paddle/fluid/sampcd_processor.py | 915 ++- python/paddle/fluid/tests/CMakeLists.txt | 1 - .../tests/book/test_image_classification.py | 17 +- .../paddle/fluid/tests/book/test_word2vec.py | 2 +- .../book_memory_optimization/CMakeLists.txt | 11 - .../test_memopt_image_classification_train.py | 168 - .../test_memopt_machine_translation.py | 139 - .../fluid/tests/demo/file_reader/.gitignore | 1 - .../file_reader/convert_data_to_recordio.py | 63 - .../fluid/tests/demo/file_reader/train.py | 140 - .../paddle/fluid/tests/test_communicator.py | 39 + python/paddle/fluid/tests/test_cpp_reader.py | 94 - python/paddle/fluid/tests/test_detection.py | 23 + .../paddle/fluid/tests/unittests/.gitignore | 8 -
.../fluid/tests/unittests/CMakeLists.txt | 133 +- .../fluid/tests/unittests/dist_mnist.py | 12 +- .../fluid/tests/unittests/dist_save_load.py | 3 - .../fluid/tests/unittests/dist_test_utils.py | 27 + .../fluid/tests/unittests/feed_data_reader.py | 78 + .../fluid/tests/unittests/gradient_checker.py | 1 + .../unittests/ir_memory_optimize_net_base.py | 12 +- .../tests/unittests/mkldnn/mkldnn_op_test.py | 87 +- .../mkldnn/test_activation_mkldnn_op.py | 23 +- .../mkldnn/test_batch_norm_mkldnn_op.py | 33 +- .../mkldnn/test_conv2d_int8_mkldnn_op.py | 49 +- .../unittests/mkldnn/test_conv2d_mkldnn_op.py | 23 +- .../mkldnn/test_conv2d_transpose_mkldnn_op.py | 12 +- .../mkldnn/test_elementwise_mul_mkldnn_op.py | 162 +- .../mkldnn/test_gaussian_random_mkldnn_op.py | 17 +- .../mkldnn/test_softmax_mkldnn_op.py | 28 +- .../fluid/tests/unittests/multi_process.py | 44 +- .../unittests/ngraph/test_assign_ngraph_op.py | 2 +- .../unittests/ngraph/test_concat_ngraph_op.py | 2 +- .../ngraph/test_elementwise_max_ngraph_op.py | 2 +- .../ngraph/test_elementwise_min_ngraph_op.py | 2 +- .../ngraph/test_elementwise_pow_ngraph_op.py | 2 +- .../ngraph/test_elementwise_sub_ngraph_op.py | 2 +- .../ngraph/test_layer_norm_ngraph_op.py | 6 +- .../ngraph/test_lookup_table_ngraph_op.py | 2 +- .../ngraph/test_parallel_executor_ngraph.py | 87 + .../ngraph/test_reshape_ngraph_op.py | 2 +- .../unittests/ngraph/test_slice_ngraph_op.py | 2 +- .../paddle/fluid/tests/unittests/op_test.py | 441 +- .../unittests/parallel_dygraph_se_resnext.py | 121 +- .../unittests/parallel_executor_test_base.py | 151 +- .../tests/unittests/py_precise_roi_pool.py | 151 + .../fluid/tests/unittests/seresnext_net.py | 204 + .../tests/unittests/seresnext_test_base.py | 56 + .../fluid/tests/unittests/simple_nets.py | 13 +- .../tests/unittests/test_activation_op.py | 92 +- .../tests/unittests/test_affine_channel_op.py | 4 +- .../test_avoid_twice_initialization.py | 50 + ..._find_no_grad_vars.py => test_backward.py} | 43 +- .../tests/unittests/test_batch_norm_op.py | 21 +- .../unittests/test_bilinear_interp_op.py | 200 +- .../fluid/tests/unittests/test_boxps.py | 104 + ...> test_buffer_shared_memory_reuse_pass.py} | 128 +- ...euse_pass_and_fuse_optimization_op_pass.py | 46 + .../unittests/test_c_comm_init_all_op.py | 50 + .../fluid/tests/unittests/test_center_loss.py | 95 + .../unittests/test_check_import_scipy.py | 33 + .../fluid/tests/unittests/test_conv2d_op.py | 42 + .../tests/unittests/test_conv_nn_grad.py | 129 + .../tests/unittests/test_crf_decoding_op.py | 50 + .../tests/unittests/test_crop_tensor_op.py | 218 + .../tests/unittests/test_cross_entropy2_op.py | 31 +- .../tests/unittests/test_cross_entropy_op.py | 54 + .../fluid/tests/unittests/test_ctc_align.py | 164 +- .../unittests/test_cyclic_cifar_dataset.py | 38 + .../tests/unittests/test_data_norm_op.py | 203 + .../fluid/tests/unittests/test_dataset.py | 256 +- .../unittests/test_dataset_dataloader.py | 221 + .../test_decoupled_py_reader_data_check.py | 84 +- .../unittests/test_deformable_conv_op.py | 53 +- .../unittests/test_deformable_conv_v1_op.py | 240 + ...t_deprecated_memory_optimize_interfaces.py | 68 + .../fluid/tests/unittests/test_detach.py | 163 + .../fluid/tests/unittests/test_dist_base.py | 219 +- .../fluid/tests/unittests/test_dist_ctr.py | 15 +- .../tests/unittests/test_dist_fleet_ctr.py | 12 + .../unittests/test_dist_mnist_fleetapi.py | 35 + .../unittests/test_dist_mnist_with_program.py | 51 + .../unittests/test_dist_se_resnext_async.py | 43 + ...resnext.py => test_dist_se_resnext_dgc.py} | 31 -
.../unittests/test_dist_se_resnext_sync.py | 43 + .../test_dist_se_resnext_sync_with_memopt.py | 44 + .../fluid/tests/unittests/test_dist_train.py | 13 +- .../tests/unittests/test_dist_word2vec.py | 3 + .../tests/unittests/test_distributions.py | 253 +- .../fluid/tests/unittests/test_downpoursgd.py | 150 + .../fluid/tests/unittests/test_dpsgd_op.py | 73 + .../unittests/test_dygraph_mnist_fp16.py | 135 + .../test_eager_deletion_dynamic_rnn_base.py | 5 +- .../test_eager_deletion_recurrent_op.py | 3 +- .../test_eager_deletion_transformer.py | 2 - .../unittests/test_eager_deletion_while_op.py | 5 +- .../unittests/test_elementwise_add_op.py | 28 + .../unittests/test_elementwise_div_op.py | 20 + .../unittests/test_elementwise_max_op.py | 12 + .../unittests/test_elementwise_min_op.py | 22 +- .../unittests/test_elementwise_mod_op.py | 21 +- .../unittests/test_elementwise_mul_op.py | 20 + .../unittests/test_elementwise_pow_op.py | 81 +- .../unittests/test_elementwise_sub_op.py | 10 + ..._executor_return_tensor_not_overwriting.py | 112 + .../fluid/tests/unittests/test_expand_op.py | 196 +- .../fluid/tests/unittests/test_eye_op.py | 74 + .../fluid/tests/unittests/test_fc_op.py | 90 +- .../fluid/tests/unittests/test_fetch_var.py | 18 +- .../fluid/tests/unittests/test_fill_op.py | 58 +- .../unittests/test_filter_by_instag_op.py | 223 + .../unittests/test_fl_listen_and_serv_op.py | 178 + .../fluid/tests/unittests/test_flatten2_op.py | 73 + .../fluid/tests/unittests/test_flatten_op.py | 11 +- .../tests/unittests/test_fleet_api_input.py | 208 + .../tests/unittests/test_fleet_rolemaker.py | 56 + .../fluid/tests/unittests/test_fleet_utils.py | 35 + .../unittests/test_fuse_all_reduce_pass.py | 2 - .../test_fuse_elewise_add_act_pass.py | 2 - .../unittests/test_fuse_optimizer_pass.py | 50 +- .../test_fuse_relu_depthwise_conv_pass.py | 35 +- .../unittests/test_fused_emb_seq_pool_op.py | 53 +- .../test_fused_fc_elementwise_layernorm_op.py | 82 + .../test_fusion_seqpool_cvm_concat_op.py | 125 + .../tests/unittests/test_gather_nd_op.py | 169 + .../unittests/test_generator_dataloader.py | 196 + .../tests/unittests/test_group_norm_op.py | 104 +- .../test_hsigmoid_remote_table_op.py | 2 + .../tests/unittests/test_huber_loss_op.py | 35 +- .../unittests/test_imperative_auto_prune.py | 336 ++ .../tests/unittests/test_imperative_basic.py | 17 +- .../unittests/test_imperative_checkpoint.py | 4 + .../unittests/test_imperative_debug_string.py | 75 + .../tests/unittests/test_imperative_deepcf.py | 1 - .../unittests/test_imperative_framework.py | 66 + .../test_imperative_partitial_backward.py | 53 + .../test_imperative_recurrent_usage.py | 2 + .../unittests/test_imperative_transformer.py | 1082 ---- ..._imperative_transformer_sorted_gradient.py | 726 ++- .../tests/unittests/test_inference_api.py | 74 + .../unittests/test_inference_model_io.py | 35 +- ...test_inplace_softmax_with_cross_entropy.py | 9 +- .../tests/unittests/test_instance_norm_op.py | 195 + .../tests/unittests/test_ir_inplace_pass.py | 6 +- .../test_ir_memory_optimize_ifelse_op.py | 14 +- .../unittests/test_ir_memory_optimize_pass.py | 52 +- .../test_ir_memory_optimize_transformer.py | 27 +- .../fluid/tests/unittests/test_launch.sh | 57 +- .../tests/unittests/test_layer_norm_op.py | 2 +- .../fluid/tests/unittests/test_layers.py | 517 +- .../unittests/test_learning_rate_scheduler.py | 2 - .../unittests/test_linear_chain_crf_op.py | 78 +- .../tests/unittests/test_listen_and_serv.sh | 49 + .../unittests/test_listen_and_serv_op.py | 31 +-
.../tests/unittests/test_lod_reset_op.py | 21 + .../unittests/test_lookup_remote_table_op.py | 4 +- .../unittests/test_lookup_table_v2_op.py | 216 + .../unittests/test_match_matrix_tensor_op.py | 132 + .../unittests/test_matmul_op_with_head.py | 294 + .../test_memory_optimization_transpiler.py | 118 - .../test_memory_reuse_exclude_feed_var.py | 66 + .../fluid/tests/unittests/test_mse_loss.py | 53 + .../tests/unittests/test_multi_file_reader.py | 81 - .../tests/unittests/test_multi_pass_reader.py | 69 - .../tests/unittests/test_multiclass_nms_op.py | 178 +- .../test_multiprocess_reader_exception.py | 126 + .../unittests/test_nce_remote_table_op.py | 2 + .../tests/unittests/test_nearest_interp_op.py | 215 +- .../fluid/tests/unittests/test_nn_grad.py | 24 - .../tests/unittests/test_norm_nn_grad.py | 53 + .../tests/unittests/test_one_hot_v2_op.py | 208 + .../fluid/tests/unittests/test_optimizer.py | 278 +- .../unittests/test_parallel_dygraph_mnist.py | 12 +- .../test_parallel_dygraph_se_resnext.py | 14 +- ..._parallel_executor_feed_persistable_var.py | 88 + .../test_parallel_executor_fetch_feed.py | 4 - .../unittests/test_parallel_executor_mnist.py | 2 - .../unittests/test_parallel_executor_pg.py | 2 - ...arallel_executor_run_load_infer_program.py | 85 + .../test_parallel_executor_seresnext.py | 396 -- ...st_parallel_executor_seresnext_base_cpu.py | 37 + ...st_parallel_executor_seresnext_base_gpu.py | 37 + ...utor_seresnext_with_fuse_all_reduce_cpu.py | 38 + ...utor_seresnext_with_fuse_all_reduce_gpu.py | 39 + ...llel_executor_seresnext_with_reduce_cpu.py | 94 + ...llel_executor_seresnext_with_reduce_gpu.py | 26 + ...test_parallel_executor_test_while_train.py | 6 +- .../test_parallel_executor_transformer.py | 67 +- ...rallel_executor_transformer_auto_growth.py | 3 - ...test_partial_eager_deletion_transformer.py | 3 - .../tests/unittests/test_preprocessor.py | 96 - .../unittests/test_program_prune_backward.py | 212 + .../tests/unittests/test_prroi_pool_op.py | 138 + .../fluid/tests/unittests/test_py_func_op.py | 7 +- .../unittests/test_py_reader_combination.py | 99 + ...eader.py => test_py_reader_return_list.py} | 11 +- .../test_py_reader_using_executor.py | 137 +- .../tests/unittests/test_reader_reset.py | 75 +- .../tests/unittests/test_recordio_reader.py | 92 - .../tests/unittests/test_recurrent_op.py | 138 + .../fluid/tests/unittests/test_reshape_op.py | 117 +- .../test_roi_perspective_transform_op.py | 8 +- .../fluid/tests/unittests/test_row_conv_op.py | 49 +- .../test_runtime_and_compiletime_exception.py | 56 + .../unittests/test_save_model_without_var.py | 57 + .../tests/unittests/test_scatter_nd_op.py | 291 + .../fluid/tests/unittests/test_scatter_op.py | 42 + .../fluid/tests/unittests/test_seq_conv.py | 16 + .../fluid/tests/unittests/test_seq_pool.py | 195 +- .../tests/unittests/test_sequence_pad_op.py | 2 +- .../test_sequence_topk_avg_pooling.py | 158 + .../tests/unittests/test_sequence_unpad_op.py | 5 +- .../fluid/tests/unittests/test_slice_op.py | 363 +- .../fluid/tests/unittests/test_softmax_op.py | 16 +- .../test_split_and_merge_lod_tensor_op.py | 65 +- .../tests/unittests/test_square_error_cost.py | 55 + .../fluid/tests/unittests/test_squeeze2_op.py | 75 + .../fluid/tests/unittests/test_squeeze_op.py | 11 +- .../tests/unittests/test_strided_slice_op.py | 505 ++ .../unittests/test_sync_batch_norm_op.py | 74 +- .../fluid/tests/unittests/test_trainable.py | 83 + .../tests/unittests/test_trainer_desc.py | 50 + .../unittests/test_trilinear_interp_op.py | 640 ++
.../tests/unittests/test_uniform_random_op.py | 81 +- .../unittests/test_unique_with_counts.py | 84 + .../tests/unittests/test_unsqueeze2_op.py | 83 + .../tests/unittests/test_unsqueeze_op.py | 9 +- .../fluid/tests/unittests/test_var_conv_2d.py | 305 + .../fluid/tests/unittests/test_warpctc_op.py | 115 +- .../fluid/tests/unittests/test_while_op.py | 10 + .../tests/unittests/transformer_model.py | 77 +- python/paddle/fluid/trainer_desc.py | 42 +- python/paddle/fluid/trainer_factory.py | 7 + python/paddle/fluid/transpiler/__init__.py | 1 - python/paddle/fluid/transpiler/collective.py | 6 +- .../fluid/transpiler/distribute_transpiler.py | 164 +- .../fluid/transpiler/inference_transpiler.py | 661 --- .../memory_optimization_transpiler.py | 539 +- python/paddle/reader/__init__.py | 4 +- python/paddle/reader/creator.py | 96 - python/paddle/reader/decorator.py | 38 +- python/paddle/reader/tests/CMakeLists.txt | 1 - python/paddle/reader/tests/creator_test.py | 75 - .../paddle/reader/tests/test_data_creator.txt | 3 - .../reader/tests/test_reader_recordio.dat | Bin 76 -> 0 bytes .../reader/tests/test_recordio_creator.dat | Bin 88 -> 0 bytes python/requirements.txt | 5 +- python/setup.py.in | 25 +- tools/aws_benchmarking/README.md | 184 - tools/aws_benchmarking/client/Dockerfile | 7 - .../client/cluster_launcher.py | 415 -- .../aws_benchmarking/client/requirements.txt | 6 - tools/aws_benchmarking/diagram.png | Bin 40790 -> 0 bytes tools/aws_benchmarking/server/Dockerfile | 7 - .../aws_benchmarking/server/cluster_master.py | 735 --- tools/aws_benchmarking/server/logs/master.log | 0 .../server/pserver.sh.template | 2 - .../aws_benchmarking/server/requirements.txt | 4 - .../server/trainer.sh.template | 2 - tools/check_api_approvals.sh | 138 + tools/diff_api.py | 2 +- tools/print_signatures.py | 2 +- 1288 files changed, 67553 insertions(+), 56995 deletions(-) create mode 100644 cmake/copyfile.py create mode 100644 cmake/external/box_ps.cmake delete mode 100644 cmake/external/snappy.cmake delete mode 100644 cmake/external/snappystream.cmake delete mode 100644 cmake/external/yaml-cpp.cmake delete mode 100644 cmake/package.cmake mode change 100755 => 100644 paddle/fluid/framework/archive.h delete mode 100644 paddle/fluid/framework/commit.h mode change 100755 => 100644 paddle/fluid/framework/data_feed.cc create mode 100644 paddle/fluid/framework/details/scope_buffered_monitor.cc create mode 100644 paddle/fluid/framework/details/scope_buffered_monitor.h create mode 100644 paddle/fluid/framework/details/share_tensor_buffer_functor.cc create mode 100644 paddle/fluid/framework/details/share_tensor_buffer_functor.h create mode 100644 paddle/fluid/framework/fleet/box_wrapper.cc create mode 100644 paddle/fluid/framework/fleet/box_wrapper.h delete mode 100644 paddle/fluid/framework/inplace_op_inference_test.cc create mode 100644 paddle/fluid/framework/ir/codegen.cc create mode 100644 paddle/fluid/framework/ir/codegen.h create mode 100644 paddle/fluid/framework/ir/codegen_helper.cc create mode 100644 paddle/fluid/framework/ir/codegen_helper.h create mode 100644 paddle/fluid/framework/ir/codegen_test.cc delete mode 100644 paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc rename paddle/fluid/{platform/dynload/warpctc_lib_path.h => framework/ir/cudnn_placement_pass.cc} (76%) create mode 100644 paddle/fluid/framework/ir/cudnn_placement_pass.h create mode 100644 paddle/fluid/framework/ir/cudnn_placement_pass_tester.cc create mode 100644 paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc rename 
paddle/fluid/framework/{revision.cc => ir/fc_elementwise_layernorm_fuse_pass.h} (59%) create mode 100644 paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass_tester.cc rename paddle/fluid/framework/ir/{multi_devices_graph_pass/multi_devices_graph_print_pass.h => graph_printer.h} (95%) delete mode 100644 paddle/fluid/framework/ir/infer_clean_graph_pass.cc create mode 100644 paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc create mode 100644 paddle/fluid/framework/ir/memory_optimize_pass/conditional_block_op_eager_deletion_pass.cc delete mode 100644 paddle/fluid/framework/ir/memory_optimize_pass/inplace_op_pass.cc delete mode 100644 paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.cc delete mode 100644 paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h delete mode 100644 paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper_test.cc delete mode 100644 paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.cc delete mode 100644 paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.h delete mode 100644 paddle/fluid/framework/ir/memory_optimize_pass/record_skip_memory_opt_vars_pass.cc create mode 100644 paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc rename paddle/fluid/framework/ir/mkldnn/{conv_relu_mkldnn_fuse_pass.h => conv_activation_mkldnn_fuse_pass.h} (60%) rename paddle/fluid/framework/ir/mkldnn/{conv_relu_mkldnn_fuse_pass_tester.cc => conv_activation_mkldnn_fuse_pass_tester.cc} (60%) delete mode 100644 paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass.cc delete mode 100644 paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass_tester.cc delete mode 100644 paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/pass_tester_helper.h create mode 100644 paddle/fluid/framework/ir/placement_pass_base.cc create mode 100644 paddle/fluid/framework/ir/placement_pass_base.h create mode 100644 paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass_tester.cc create mode 100644 paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h create mode 100644 paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass_tester.cc create mode 100644 paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc create mode 100644 paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h create mode 100644 paddle/fluid/framework/ir/simplify_with_basic_ops_pass_tester.cc create mode 100644 paddle/fluid/framework/op_call_stack.cc rename paddle/fluid/framework/{revision.h => op_call_stack.h} (64%) create mode 100644 paddle/fluid/framework/op_compatible_info.cc create mode 100644 paddle/fluid/framework/op_compatible_info.h create mode 100644 paddle/fluid/framework/op_compatible_info_test.cc create mode 100644 paddle/fluid/imperative/gradient_accumulator.cc create mode 100644 paddle/fluid/imperative/gradient_accumulator.h create mode 100644 paddle/fluid/imperative/prepared_operator.cc create mode 100644 paddle/fluid/imperative/prepared_operator.h create mode 100644 paddle/fluid/imperative/tests/CMakeLists.txt rename paddle/fluid/imperative/{ => tests}/nccl_context_test.cc (100%) create mode 100644 paddle/fluid/imperative/tests/test_gradient_accmulator.cc create mode 100644 paddle/fluid/imperative/tests/test_layer.cc create mode 100644 paddle/fluid/imperative/tests/test_prepare_op.cc create mode 100644 
paddle/fluid/imperative/tests/test_tracer.cc create mode 100644 paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.cc rename paddle/fluid/{recordio/scanner.h => inference/analysis/passes/ir_graph_clean_pass.h} (58%) delete mode 100644 paddle/fluid/inference/api/paddle_inference_pass.h create mode 100644 paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/swish_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/test_shuffle_channel_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/test_swish_op.cc create mode 100644 paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu create mode 100644 paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h delete mode 100644 paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc create mode 100644 paddle/fluid/inference/tests/api/test_detection_dataset_preprocess.py create mode 100644 paddle/fluid/inference/tests/api/trt_cascade_rcnn_test.cc create mode 100644 paddle/fluid/inference/tests/api/trt_fc_prelu_test.cc create mode 100644 paddle/fluid/memory/allocation/cuda_device_context_allocator.h create mode 100644 paddle/fluid/memory/malloc_test.cu create mode 100644 paddle/fluid/operators/center_loss_op.cc create mode 100644 paddle/fluid/operators/center_loss_op.cu create mode 100644 paddle/fluid/operators/center_loss_op.h create mode 100644 paddle/fluid/operators/collective/c_comm_init_all_op.cc create mode 100644 paddle/fluid/operators/controlflow/conditional_block_op_helper.cc create mode 100644 paddle/fluid/operators/controlflow/conditional_block_op_helper.h create mode 100644 paddle/fluid/operators/crop_tensor_op.cc rename paddle/fluid/operators/{linear_chain_crf_op.cu => crop_tensor_op.cu} (53%) create mode 100644 paddle/fluid/operators/crop_tensor_op.h create mode 100644 paddle/fluid/operators/deformable_conv_filter.cu.h create mode 100644 paddle/fluid/operators/deformable_conv_func.h create mode 100644 paddle/fluid/operators/deformable_conv_op.h create mode 100644 paddle/fluid/operators/deformable_conv_v1_op.cc create mode 100644 paddle/fluid/operators/deformable_conv_v1_op.cu create mode 100644 paddle/fluid/operators/deformable_conv_v1_op.h create mode 100644 paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc create mode 100644 paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.cc create mode 100644 paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.h create mode 100644 paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc create mode 100644 paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc create mode 100644 paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h create mode 100644 paddle/fluid/operators/eye_op.cc create mode 100644 paddle/fluid/operators/eye_op.cu create mode 100644 paddle/fluid/operators/eye_op.h rename paddle/fluid/{platform/dynload/cupti_lib_path.h => operators/fc_op.cu.cc} (58%) rename paddle/fluid/{platform/dynload/warpctc_lib_path.h.in => operators/fill_op.cu.cc} (66%) create mode 100644 paddle/fluid/operators/fill_op.h create mode 100644 paddle/fluid/operators/filter_by_instag_op.cc create mode 100644 paddle/fluid/operators/filter_by_instag_op.h create mode 100644 paddle/fluid/operators/flatten_op.cu.cc create mode 100644 paddle/fluid/operators/flatten_op.h create mode 100644 paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc create mode 100644 paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu 
create mode 100644 paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc create mode 100644 paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.h create mode 100644 paddle/fluid/operators/gather_nd_op.cc create mode 100644 paddle/fluid/operators/gather_nd_op.cu create mode 100644 paddle/fluid/operators/gather_nd_op.h create mode 100644 paddle/fluid/operators/instance_norm_op.cc create mode 100644 paddle/fluid/operators/instance_norm_op.cu create mode 100644 paddle/fluid/operators/instance_norm_op.h delete mode 100644 paddle/fluid/operators/jit/kernels.h mode change 100644 => 100755 paddle/fluid/operators/linear_chain_crf_op.h create mode 100644 paddle/fluid/operators/lookup_table_v2_op.cc create mode 100644 paddle/fluid/operators/lookup_table_v2_op.cu create mode 100644 paddle/fluid/operators/lookup_table_v2_op.h create mode 100644 paddle/fluid/operators/match_matrix_tensor_op.cc create mode 100644 paddle/fluid/operators/match_matrix_tensor_op.h create mode 100644 paddle/fluid/operators/math/fc.cc create mode 100644 paddle/fluid/operators/math/fc.cu create mode 100644 paddle/fluid/operators/math/fc.h delete mode 100644 paddle/fluid/operators/math/fc_compute.h create mode 100644 paddle/fluid/operators/norm_utils.h create mode 100644 paddle/fluid/operators/one_hot_v2_op.cc create mode 100644 paddle/fluid/operators/one_hot_v2_op.cu create mode 100644 paddle/fluid/operators/one_hot_v2_op.h create mode 100644 paddle/fluid/operators/optimizers/dpsgd_op.cc create mode 100644 paddle/fluid/operators/optimizers/dpsgd_op.h create mode 100644 paddle/fluid/operators/prroi_pool_op.cc create mode 100644 paddle/fluid/operators/prroi_pool_op.cu create mode 100644 paddle/fluid/operators/prroi_pool_op.h create mode 100644 paddle/fluid/operators/pull_box_sparse_op.cc create mode 100644 paddle/fluid/operators/pull_box_sparse_op.cu create mode 100644 paddle/fluid/operators/pull_box_sparse_op.h delete mode 100644 paddle/fluid/operators/reader/create_batch_reader_op.cc delete mode 100644 paddle/fluid/operators/reader/create_multi_pass_reader_op.cc delete mode 100644 paddle/fluid/operators/reader/create_random_data_generator_op.cc delete mode 100644 paddle/fluid/operators/reader/create_recordio_file_reader_op.cc delete mode 100644 paddle/fluid/operators/reader/create_shuffle_reader_op.cc delete mode 100644 paddle/fluid/operators/reader/open_files_op.cc create mode 100644 paddle/fluid/operators/scatter_nd_add_op.cc create mode 100644 paddle/fluid/operators/scatter_nd_add_op.cu create mode 100644 paddle/fluid/operators/scatter_nd_add_op.h create mode 100644 paddle/fluid/operators/search_compute.h create mode 100644 paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc create mode 100644 paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h create mode 100644 paddle/fluid/operators/squeeze_op.cu.cc create mode 100644 paddle/fluid/operators/squeeze_op.h create mode 100644 paddle/fluid/operators/strided_slice_op.cc create mode 100644 paddle/fluid/operators/strided_slice_op.cu create mode 100644 paddle/fluid/operators/strided_slice_op.h rename paddle/fluid/{framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass.h => operators/test_leaky_relu_grad_grad_functor.cc} (56%) rename paddle/fluid/{recordio/header_test.cc => operators/test_leaky_relu_grad_grad_functor.cu} (52%) create mode 100644 paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h create mode 100644 paddle/fluid/operators/unique_with_counts_op.cc create mode 100644 paddle/fluid/operators/unique_with_counts_op.h create mode 
100644 paddle/fluid/operators/unsqueeze_op.cu.cc create mode 100644 paddle/fluid/operators/unsqueeze_op.h create mode 100644 paddle/fluid/operators/unstack_op.cu create mode 100644 paddle/fluid/operators/var_conv_2d_op.cc create mode 100644 paddle/fluid/operators/var_conv_2d_op.h delete mode 100644 paddle/fluid/operators/warpctc_cudnn_op.cu.cc delete mode 100644 paddle/fluid/platform/assert.h create mode 100644 paddle/fluid/platform/device_code.cc create mode 100644 paddle/fluid/platform/device_code.h create mode 100644 paddle/fluid/platform/device_code_test.cc create mode 100644 paddle/fluid/platform/dynload/cuda_driver.cc create mode 100644 paddle/fluid/platform/dynload/cuda_driver.h create mode 100644 paddle/fluid/platform/dynload/nvrtc.cc create mode 100644 paddle/fluid/platform/dynload/nvrtc.h create mode 100644 paddle/fluid/platform/flags.cc delete mode 100644 paddle/fluid/platform/temporary_allocator.cc delete mode 100644 paddle/fluid/platform/temporary_allocator.h delete mode 100644 paddle/fluid/platform/temporary_allocator_test.cc create mode 100644 paddle/fluid/pybind/.gitignore create mode 100644 paddle/fluid/pybind/box_helper_py.cc rename paddle/fluid/pybind/{recordio.h => box_helper_py.h} (87%) delete mode 100644 paddle/fluid/pybind/pybind.h delete mode 100644 paddle/fluid/pybind/recordio.cc delete mode 100644 paddle/fluid/recordio/CMakeLists.txt delete mode 100644 paddle/fluid/recordio/README.md delete mode 100644 paddle/fluid/recordio/chunk.cc delete mode 100644 paddle/fluid/recordio/chunk.h delete mode 100644 paddle/fluid/recordio/chunk_test.cc delete mode 100644 paddle/fluid/recordio/header.cc delete mode 100644 paddle/fluid/recordio/header.h delete mode 100644 paddle/fluid/recordio/scanner.cc delete mode 100644 paddle/fluid/recordio/writer.cc delete mode 100644 paddle/fluid/recordio/writer.h delete mode 100644 paddle/fluid/recordio/writer_scanner_test.cc mode change 100755 => 100644 paddle/fluid/string/string_helper.cc mode change 100755 => 100644 paddle/fluid/string/string_helper.h delete mode 100644 paddle/fluid/train/custom_trainer/CMakeLists.txt delete mode 100644 paddle/fluid/train/custom_trainer/feed/.clang-format delete mode 100644 paddle/fluid/train/custom_trainer/feed/CMakeLists.txt delete mode 100644 paddle/fluid/train/custom_trainer/feed/accessor/accessor.h delete mode 100644 paddle/fluid/train/custom_trainer/feed/accessor/dense_input_accessor.cc delete mode 100755 paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.cc delete mode 100644 paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.h delete mode 100644 paddle/fluid/train/custom_trainer/feed/accessor/input_data_accessor.h delete mode 100644 paddle/fluid/train/custom_trainer/feed/accessor/label_input_accessor.cc delete mode 100644 paddle/fluid/train/custom_trainer/feed/accessor/sparse_input_accessor.cc delete mode 100644 paddle/fluid/train/custom_trainer/feed/accessor/weights_input_accessor.cc delete mode 100644 paddle/fluid/train/custom_trainer/feed/common/CMakeLists.txt delete mode 100644 paddle/fluid/train/custom_trainer/feed/common/bthread_task_runner.cc delete mode 100644 paddle/fluid/train/custom_trainer/feed/common/bthread_task_runner.h delete mode 100644 paddle/fluid/train/custom_trainer/feed/common/pipeline.h delete mode 100644 paddle/fluid/train/custom_trainer/feed/common/pslib_warpper.cc delete mode 100644 paddle/fluid/train/custom_trainer/feed/common/pslib_warpper.h delete mode 100644 paddle/fluid/train/custom_trainer/feed/common/registerer.cc delete mode 100644 
paddle/fluid/train/custom_trainer/feed/common/registerer.h delete mode 100755 paddle/fluid/train/custom_trainer/feed/common/runtime_environment.cc delete mode 100755 paddle/fluid/train/custom_trainer/feed/common/runtime_environment.h delete mode 100644 paddle/fluid/train/custom_trainer/feed/common/scope_helper.h delete mode 100644 paddle/fluid/train/custom_trainer/feed/common/yaml_helper.h delete mode 100644 paddle/fluid/train/custom_trainer/feed/conf/env.conf delete mode 100644 paddle/fluid/train/custom_trainer/feed/conf/gflags.conf delete mode 100644 paddle/fluid/train/custom_trainer/feed/conf/ps_table_config delete mode 100644 paddle/fluid/train/custom_trainer/feed/conf/trainer.yaml delete mode 100755 paddle/fluid/train/custom_trainer/feed/dataset/abacus_data_reader.cc delete mode 100755 paddle/fluid/train/custom_trainer/feed/dataset/archive_data_reader.cc delete mode 100755 paddle/fluid/train/custom_trainer/feed/dataset/data_reader.cc delete mode 100755 paddle/fluid/train/custom_trainer/feed/dataset/data_reader.h delete mode 100644 paddle/fluid/train/custom_trainer/feed/dataset/dataset.cc delete mode 100644 paddle/fluid/train/custom_trainer/feed/dataset/dataset.h delete mode 100755 paddle/fluid/train/custom_trainer/feed/dataset/dataset_container.cc delete mode 100644 paddle/fluid/train/custom_trainer/feed/dataset/dataset_container.h delete mode 100644 paddle/fluid/train/custom_trainer/feed/executor/executor.cc delete mode 100644 paddle/fluid/train/custom_trainer/feed/executor/executor.h delete mode 100755 paddle/fluid/train/custom_trainer/feed/executor/multi_thread_executor.cc delete mode 100644 paddle/fluid/train/custom_trainer/feed/executor/multi_thread_executor.h delete mode 100644 paddle/fluid/train/custom_trainer/feed/io/auto_file_system.cc delete mode 100644 paddle/fluid/train/custom_trainer/feed/io/file_system.cc delete mode 100644 paddle/fluid/train/custom_trainer/feed/io/file_system.h delete mode 100644 paddle/fluid/train/custom_trainer/feed/io/hadoop_file_system.cc delete mode 100644 paddle/fluid/train/custom_trainer/feed/io/local_file_system.cc delete mode 100644 paddle/fluid/train/custom_trainer/feed/io/shell.cc delete mode 100644 paddle/fluid/train/custom_trainer/feed/io/shell.h delete mode 100644 paddle/fluid/train/custom_trainer/feed/main.cc delete mode 100644 paddle/fluid/train/custom_trainer/feed/model/epoch_donefile.txt delete mode 100644 paddle/fluid/train/custom_trainer/feed/monitor/auc_monitor.cc delete mode 100644 paddle/fluid/train/custom_trainer/feed/monitor/auc_monitor.h delete mode 100644 paddle/fluid/train/custom_trainer/feed/monitor/cost_monitor.cc delete mode 100755 paddle/fluid/train/custom_trainer/feed/monitor/cost_monitor.h delete mode 100755 paddle/fluid/train/custom_trainer/feed/monitor/monitor.h delete mode 100644 paddle/fluid/train/custom_trainer/feed/process/CMakeLists.txt delete mode 100644 paddle/fluid/train/custom_trainer/feed/process/data_set_process.h delete mode 100644 paddle/fluid/train/custom_trainer/feed/process/init_env_process.cc delete mode 100644 paddle/fluid/train/custom_trainer/feed/process/init_env_process.h delete mode 100755 paddle/fluid/train/custom_trainer/feed/process/learner_process.cc delete mode 100644 paddle/fluid/train/custom_trainer/feed/process/learner_process.h delete mode 100644 paddle/fluid/train/custom_trainer/feed/process/process.cc delete mode 100644 paddle/fluid/train/custom_trainer/feed/process/process.h delete mode 100755 paddle/fluid/train/custom_trainer/feed/scripts/compake_runable_package.sh delete mode 100644 
paddle/fluid/train/custom_trainer/feed/scripts/create_programs.py delete mode 100644 paddle/fluid/train/custom_trainer/feed/scripts/example.py delete mode 100644 paddle/fluid/train/custom_trainer/feed/scripts/join.py delete mode 100644 paddle/fluid/train/custom_trainer/feed/scripts/model/example/main_program delete mode 100644 paddle/fluid/train/custom_trainer/feed/scripts/model/example/model.yaml delete mode 100644 paddle/fluid/train/custom_trainer/feed/scripts/model/example/startup_program delete mode 100644 paddle/fluid/train/custom_trainer/feed/scripts/model/example/test_program delete mode 100644 paddle/fluid/train/custom_trainer/feed/scripts/model/join/inference_program delete mode 100644 paddle/fluid/train/custom_trainer/feed/scripts/model/join/inference_program.pbtxt delete mode 100644 paddle/fluid/train/custom_trainer/feed/scripts/model/join/main_program delete mode 100644 paddle/fluid/train/custom_trainer/feed/scripts/model/join/main_program.pbtxt delete mode 100644 paddle/fluid/train/custom_trainer/feed/scripts/model/join/model.yaml delete mode 100644 paddle/fluid/train/custom_trainer/feed/scripts/model/join/startup_program delete mode 100644 paddle/fluid/train/custom_trainer/feed/scripts/model/join/startup_program.pbtxt delete mode 100644 paddle/fluid/train/custom_trainer/feed/scripts/model/join/test_program delete mode 100644 paddle/fluid/train/custom_trainer/feed/scripts/model/join/test_program.pbtxt delete mode 100644 paddle/fluid/train/custom_trainer/feed/scripts/model/update/inference_program delete mode 100644 paddle/fluid/train/custom_trainer/feed/scripts/model/update/inference_program.pbtxt delete mode 100644 paddle/fluid/train/custom_trainer/feed/scripts/model/update/main_program delete mode 100644 paddle/fluid/train/custom_trainer/feed/scripts/model/update/main_program.pbtxt delete mode 100644 paddle/fluid/train/custom_trainer/feed/scripts/model/update/model.yaml delete mode 100644 paddle/fluid/train/custom_trainer/feed/scripts/model/update/startup_program delete mode 100644 paddle/fluid/train/custom_trainer/feed/scripts/model/update/startup_program.pbtxt delete mode 100644 paddle/fluid/train/custom_trainer/feed/scripts/model/update/test_program delete mode 100644 paddle/fluid/train/custom_trainer/feed/scripts/model/update/test_program.pbtxt delete mode 100755 paddle/fluid/train/custom_trainer/feed/scripts/start_feed_trainer.sh delete mode 100755 paddle/fluid/train/custom_trainer/feed/scripts/submit_mpi.sh delete mode 100644 paddle/fluid/train/custom_trainer/feed/scripts/update.py delete mode 100644 paddle/fluid/train/custom_trainer/feed/shuffler/shuffler.cc delete mode 100644 paddle/fluid/train/custom_trainer/feed/shuffler/shuffler.h delete mode 100644 paddle/fluid/train/custom_trainer/feed/temp/feed_trainer.cpp delete mode 100755 paddle/fluid/train/custom_trainer/feed/tool/format_newcate_hotnews.awk delete mode 100644 paddle/fluid/train/custom_trainer/feed/tool/gdbinit delete mode 100755 paddle/fluid/train/custom_trainer/feed/tool/ins_weight.py delete mode 100755 paddle/fluid/train/custom_trainer/feed/tool/xbox_compressor_mf.py delete mode 100755 paddle/fluid/train/custom_trainer/feed/tool/xbox_decompressor_mf.awk delete mode 100755 paddle/fluid/train/custom_trainer/feed/tool/xbox_pb_converter delete mode 100755 paddle/fluid/train/custom_trainer/feed/tool/xbox_pb_deconverter delete mode 100755 paddle/fluid/train/custom_trainer/feed/trainer_context.h delete mode 100644 paddle/fluid/train/custom_trainer/feed/unit_test/main.cc delete mode 100644 
paddle/fluid/train/custom_trainer/feed/unit_test/test_archive_dataitem.cc delete mode 100644 paddle/fluid/train/custom_trainer/feed/unit_test/test_create_programs.cc delete mode 100644 paddle/fluid/train/custom_trainer/feed/unit_test/test_datareader.cc delete mode 100644 paddle/fluid/train/custom_trainer/feed/unit_test/test_datareader_omp.cc delete mode 100644 paddle/fluid/train/custom_trainer/feed/unit_test/test_executor.cc create mode 100644 paddle/fluid/train/imdb_demo/CMakeLists.txt create mode 100644 paddle/fluid/train/imdb_demo/README.md create mode 100644 paddle/fluid/train/imdb_demo/demo_trainer.cc create mode 100644 paddle/fluid/train/imdb_demo/generate_program.py create mode 100644 paddle/fluid/train/imdb_demo/imdb_reader.py create mode 100644 paddle/fluid/train/imdb_demo/include/save_model.h create mode 100644 paddle/fluid/train/imdb_demo/nets.py create mode 100644 paddle/fluid/train/imdb_demo/run.sh create mode 100644 paddle/fluid/train/imdb_demo/save_model.cc create mode 100644 paddle/fluid/train/imdb_demo/train.cfg create mode 100644 paddle/fluid/train/imdb_demo/train_filelist.txt delete mode 100644 paddle/scripts/Dockerfile.tmp create mode 100644 python/paddle/check_import_scipy.py delete mode 100644 python/paddle/dataset/tests/common_test.py create mode 100644 python/paddle/fluid/contrib/layers/metric_op.py create mode 100644 python/paddle/fluid/contrib/slim/tests/configs/compress.yaml create mode 100644 python/paddle/fluid/contrib/slim/tests/filter_pruning/uniform_restore.yaml create mode 100644 python/paddle/fluid/contrib/slim/tests/filter_pruning/uniform_restore_0.yaml create mode 100644 python/paddle/fluid/contrib/slim/tests/filter_pruning/uniform_restore_1.yaml create mode 100644 python/paddle/fluid/contrib/slim/tests/quantization/compress_1.yaml create mode 100644 python/paddle/fluid/contrib/slim/tests/test_compressor.py rename python/paddle/fluid/contrib/slim/tests/{test_distillation_strategy.py => test_slim_distillation_strategy.py} (100%) create mode 100644 python/paddle/fluid/incubate/fleet/utils/fleet_barrier_util.py create mode 100644 python/paddle/fluid/incubate/fleet/utils/fleet_util.py create mode 100644 python/paddle/fluid/input.py mode change 100644 => 100755 python/paddle/fluid/layers/layer_function_generator.py mode change 100644 => 100755 python/paddle/fluid/layers/metric_op.py mode change 100644 => 100755 python/paddle/fluid/layers/nn.py delete mode 100644 python/paddle/fluid/recordio_writer.py delete mode 100644 python/paddle/fluid/tests/book_memory_optimization/CMakeLists.txt delete mode 100644 python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py delete mode 100644 python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py delete mode 100644 python/paddle/fluid/tests/demo/file_reader/.gitignore delete mode 100644 python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py delete mode 100644 python/paddle/fluid/tests/demo/file_reader/train.py delete mode 100644 python/paddle/fluid/tests/test_cpp_reader.py delete mode 100644 python/paddle/fluid/tests/unittests/.gitignore create mode 100644 python/paddle/fluid/tests/unittests/dist_test_utils.py create mode 100644 python/paddle/fluid/tests/unittests/feed_data_reader.py create mode 100644 python/paddle/fluid/tests/unittests/ngraph/test_parallel_executor_ngraph.py create mode 100644 python/paddle/fluid/tests/unittests/py_precise_roi_pool.py create mode 100644 python/paddle/fluid/tests/unittests/seresnext_net.py create mode 100644 
python/paddle/fluid/tests/unittests/seresnext_test_base.py create mode 100644 python/paddle/fluid/tests/unittests/test_avoid_twice_initialization.py rename python/paddle/fluid/tests/unittests/{test_backward_find_no_grad_vars.py => test_backward.py} (52%) create mode 100644 python/paddle/fluid/tests/unittests/test_boxps.py rename python/paddle/fluid/tests/unittests/{test_buffer_shared_inplace_pass.py => test_buffer_shared_memory_reuse_pass.py} (54%) create mode 100644 python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass.py create mode 100644 python/paddle/fluid/tests/unittests/test_c_comm_init_all_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_center_loss.py create mode 100644 python/paddle/fluid/tests/unittests/test_check_import_scipy.py create mode 100644 python/paddle/fluid/tests/unittests/test_conv_nn_grad.py create mode 100644 python/paddle/fluid/tests/unittests/test_crop_tensor_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_cyclic_cifar_dataset.py create mode 100644 python/paddle/fluid/tests/unittests/test_data_norm_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_dataset_dataloader.py create mode 100644 python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_deprecated_memory_optimize_interfaces.py create mode 100644 python/paddle/fluid/tests/unittests/test_detach.py create mode 100644 python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py create mode 100644 python/paddle/fluid/tests/unittests/test_dist_mnist_with_program.py create mode 100644 python/paddle/fluid/tests/unittests/test_dist_se_resnext_async.py rename python/paddle/fluid/tests/unittests/{test_dist_se_resnext.py => test_dist_se_resnext_dgc.py} (63%) create mode 100644 python/paddle/fluid/tests/unittests/test_dist_se_resnext_sync.py create mode 100644 python/paddle/fluid/tests/unittests/test_dist_se_resnext_sync_with_memopt.py create mode 100644 python/paddle/fluid/tests/unittests/test_downpoursgd.py create mode 100644 python/paddle/fluid/tests/unittests/test_dpsgd_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_dygraph_mnist_fp16.py create mode 100644 python/paddle/fluid/tests/unittests/test_executor_return_tensor_not_overwriting.py create mode 100644 python/paddle/fluid/tests/unittests/test_eye_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_filter_by_instag_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_fl_listen_and_serv_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_flatten2_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_fleet_api_input.py create mode 100644 python/paddle/fluid/tests/unittests/test_fleet_rolemaker.py create mode 100644 python/paddle/fluid/tests/unittests/test_fleet_utils.py create mode 100644 python/paddle/fluid/tests/unittests/test_fused_fc_elementwise_layernorm_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_fusion_seqpool_cvm_concat_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_gather_nd_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_generator_dataloader.py create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_debug_string.py create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_framework.py create mode 100644 
python/paddle/fluid/tests/unittests/test_imperative_partitial_backward.py delete mode 100644 python/paddle/fluid/tests/unittests/test_imperative_transformer.py create mode 100644 python/paddle/fluid/tests/unittests/test_inference_api.py create mode 100644 python/paddle/fluid/tests/unittests/test_instance_norm_op.py mode change 100644 => 100755 python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_listen_and_serv.sh create mode 100644 python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_match_matrix_tensor_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_matmul_op_with_head.py delete mode 100644 python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py create mode 100644 python/paddle/fluid/tests/unittests/test_memory_reuse_exclude_feed_var.py create mode 100644 python/paddle/fluid/tests/unittests/test_mse_loss.py delete mode 100644 python/paddle/fluid/tests/unittests/test_multi_file_reader.py delete mode 100644 python/paddle/fluid/tests/unittests/test_multi_pass_reader.py create mode 100644 python/paddle/fluid/tests/unittests/test_multiprocess_reader_exception.py create mode 100644 python/paddle/fluid/tests/unittests/test_norm_nn_grad.py create mode 100644 python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_executor_feed_persistable_var.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_executor_run_load_infer_program.py delete mode 100644 python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_cpu.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_cpu.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_gpu.py delete mode 100644 python/paddle/fluid/tests/unittests/test_preprocessor.py create mode 100755 python/paddle/fluid/tests/unittests/test_program_prune_backward.py create mode 100644 python/paddle/fluid/tests/unittests/test_prroi_pool_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_py_reader_combination.py rename python/paddle/fluid/tests/unittests/{test_pyreader.py => test_py_reader_return_list.py} (87%) delete mode 100644 python/paddle/fluid/tests/unittests/test_recordio_reader.py create mode 100644 python/paddle/fluid/tests/unittests/test_runtime_and_compiletime_exception.py create mode 100644 python/paddle/fluid/tests/unittests/test_save_model_without_var.py create mode 100644 python/paddle/fluid/tests/unittests/test_scatter_nd_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_sequence_topk_avg_pooling.py create mode 100644 python/paddle/fluid/tests/unittests/test_square_error_cost.py create mode 100644 python/paddle/fluid/tests/unittests/test_squeeze2_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_strided_slice_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_trainable.py create mode 100644 
python/paddle/fluid/tests/unittests/test_trainer_desc.py
 create mode 100644 python/paddle/fluid/tests/unittests/test_trilinear_interp_op.py
 create mode 100644 python/paddle/fluid/tests/unittests/test_unique_with_counts.py
 create mode 100644 python/paddle/fluid/tests/unittests/test_unsqueeze2_op.py
 create mode 100644 python/paddle/fluid/tests/unittests/test_var_conv_2d.py
 delete mode 100644 python/paddle/fluid/transpiler/inference_transpiler.py
 delete mode 100644 python/paddle/reader/creator.py
 delete mode 100644 python/paddle/reader/tests/creator_test.py
 delete mode 100644 python/paddle/reader/tests/test_data_creator.txt
 delete mode 100644 python/paddle/reader/tests/test_reader_recordio.dat
 delete mode 100644 python/paddle/reader/tests/test_recordio_creator.dat
 delete mode 100644 tools/aws_benchmarking/README.md
 delete mode 100644 tools/aws_benchmarking/client/Dockerfile
 delete mode 100644 tools/aws_benchmarking/client/cluster_launcher.py
 delete mode 100644 tools/aws_benchmarking/client/requirements.txt
 delete mode 100644 tools/aws_benchmarking/diagram.png
 delete mode 100644 tools/aws_benchmarking/server/Dockerfile
 delete mode 100644 tools/aws_benchmarking/server/cluster_master.py
 delete mode 100644 tools/aws_benchmarking/server/logs/master.log
 delete mode 100644 tools/aws_benchmarking/server/pserver.sh.template
 delete mode 100644 tools/aws_benchmarking/server/requirements.txt
 delete mode 100644 tools/aws_benchmarking/server/trainer.sh.template
 create mode 100644 tools/check_api_approvals.sh

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 37b6e992..8d1c3d49 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -27,18 +27,27 @@ message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
 message(STATUS "AR tools: ${CMAKE_AR}")
 
 if(WIN32)
+  option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)
+
   set(CMAKE_SUPPRESS_REGENERATION ON)
   set(CMAKE_STATIC_LIBRARY_PREFIX lib)
   add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
-  set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
-  set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
-  set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
-  set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
+
+  if (MSVC_STATIC_CRT)
+    message(STATUS "Use static C runtime, refer to https://docs.microsoft.com/en-us/cpp/c-runtime-library/crt-library-features?view=vs-2019")
+    set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
+    set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
+    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
+    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
+  endif()
+
   add_compile_options(/wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838)
   set(PADDLE_LINK_FLAGS "/IGNORE:4006 /IGNORE:4098 /IGNORE:4217 /IGNORE:4221")
   set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
   set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
   set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
+else(WIN32)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations")
 endif(WIN32)
 
 find_package(CUDA QUIET)
@@ -54,7 +63,6 @@ option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
 option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
 option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
 option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND})
-option(WITH_CUSTOM_TRAINER "Turn on trainer implement by custom" OFF)
 option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
 option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
 option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocol" OFF)
@@ -66,14 +74,15 @@ option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools"
 option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF)
 option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF)
 option(WITH_PSLIB "Compile with pslib support" OFF)
+option(WITH_BOX_PS "Compile with box_ps support" OFF)
 option(WITH_CONTRIB "Compile the third-party contribution" OFF)
 option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
 option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
 option(WITH_INFERENCE_API_TEST "Test fluid inference C++ high-level api interface" OFF)
 option(WITH_HIGH_LEVEL_API_TEST "Test fluid python high-level api interface" OFF)
 option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
-option(WITH_FAST_MATH "Make use of fast math library, might affect the precision to some extent" ON)
 option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ON)
+option(SANITIZER_TYPE "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" OFF)
 
 # PY_VERSION
 if(NOT PY_VERSION)
@@ -83,7 +92,7 @@ set(PYBIND11_PYTHON_VERSION ${PY_VERSION})
 
 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
-  set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
+  set(CMAKE_BUILD_TYPE "Release" CACHE STRING
       "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
       FORCE)
 endif()
@@ -122,6 +131,12 @@ endif()
 if (REPLACE_ENFORCE_GLOG)
   add_definitions("-DREPLACE_ENFORCE_GLOG")
 endif()
+
+if (SANITIZER_TYPE AND NOT "${SANITIZER_TYPE}" MATCHES "^(Address|Leak|Memory|Thread|Undefined)$")
+  message("Choose the correct type of sanitizer")
+  return()
+endif()
+
 ########################################################################################
 
 include(external/mklml)     # download mklml package
@@ -144,15 +159,11 @@ include(external/cub)
 include(external/rocprim)
 include(external/xxhash)    # download xxhash
 include(external/dlpack)
-include(external/snappy)    # download snappy
-include(external/snappystream) # download snappystream
 include(external/warpctc)   # download, build, install warpctc
-include(external/yaml-cpp)  # download yaml
 
 if (NOT WIN32)
 # there is no official support of nccl, cupti in windows
 include(cupti)
-include(external/gzstream)
 endif (NOT WIN32)
 
 if(WITH_PSLIB)
@@ -160,6 +171,9 @@ if(WITH_PSLIB)
   include(external/pslib_brpc)
   include(external/pslib)
 endif(WITH_PSLIB)
+if(WITH_BOX_PS)
+  include(external/box_ps)
+endif(WITH_BOX_PS)
 
 if(WITH_DISTRIBUTE)
   if(WITH_GRPC)
@@ -211,7 +225,6 @@ if (WITH_PROFILER)
 endif()
 
 include(generic)  # simplify cmake module
-include(package)  # set paddle packages
 include(ccache)   # set ccache for compilation
 include(util)     # set unittest and link libs
 include(version)  # set PADDLE_VERSION
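For reference, the new switches above can be exercised at configure time roughly as follows. This is an illustrative sketch, not part of the patch: only the option names come from the hunks above; the build directory and values are placeholders.

```
# Illustrative configure lines (run from a build directory):
cmake .. -DSANITIZER_TYPE=Address   # accepted values: Address, Leak, Memory, Thread, Undefined
cmake .. -DWITH_BOX_PS=ON           # enables cmake/external/box_ps.cmake (added below)
cmake .. -DMSVC_STATIC_CRT=OFF      # Windows only: skip the /MT and /MTd CRT flags
```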
diff --git a/Dockerfile b/Dockerfile
index 5db7fa50..9e460988 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -54,8 +54,8 @@ RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \
 RUN rm -r /root/python_build
 
 RUN apt-get update && \
-    apt-get install -y --allow-downgrades patchelf \
-    python3 python3-dev python3-pip \
+    apt-get install -y --allow-downgrades --allow-change-held-packages \
+    patchelf python3 python3-dev python3-pip \
     git python-pip python-dev python-opencv openssh-server bison \
     libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
     wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
@@ -172,6 +172,11 @@ RUN pip3.6 --no-cache-dir install pylint pytest astroid isort
 RUN pip3.7 --no-cache-dir install pylint pytest astroid isort
 RUN pip --no-cache-dir install pylint pytest astroid isort LinkChecker
 
+RUN pip3 --no-cache-dir install coverage
+RUN pip3.6 --no-cache-dir install coverage
+RUN pip3.7 --no-cache-dir install coverage
+RUN pip --no-cache-dir install coverage
+
 COPY ./python/requirements.txt /root/
 RUN pip3 --no-cache-dir install -r /root/requirements.txt
 RUN pip3.6 --no-cache-dir install -r /root/requirements.txt
diff --git a/README.md b/README.md
index 1e00f9e8..44c8c38b 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,10 @@
-# PaddlePaddle (clone from /baidu/paddlepaddle/paddle@feed-trainer)
-
+# PaddlePaddle
+Fork From http://icode.baidu.com/repos/baidu/paddlepaddle/paddle/tree/paddle_feed_news_201910 (commitid:f50e701) v1.4
 
 English | [简体中文](./README_cn.md)
 
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/documentation/docs/en/1.4/beginners_guide/index_en.html)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/index_cn.html)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org.cn/documentation/docs/en/1.5/beginners_guide/index_en.html)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/index_cn.html)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
 
@@ -18,17 +18,18 @@ learning to many products at Baidu.
 Our vision is to enable deep learning for everyone via PaddlePaddle.
 Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
 
-### Latest PaddlePaddle Release: [Fluid 1.5.1](https://github.com/PaddlePaddle/Paddle/tree/release/1.5)
+### Latest PaddlePaddle Release: [Fluid 1.5.2](https://github.com/PaddlePaddle/Paddle/tree/release/1.5)
 ### Install Latest Stable Release:
 ```
 # Linux CPU
 pip install paddlepaddle
-# Linux GPU cuda9cudnn7
-pip install paddlepaddle-gpu
 # Linux GPU cuda10cudnn7
-pip install paddlepaddle-gpu==1.5.1.post107
+pip install paddlepaddle-gpu
 # Linux GPU cuda8cudnn7
-pip install paddlepaddle-gpu==1.5.1.post87
+pip install paddlepaddle-gpu==1.5.2.post87
+# Linux GPU cuda9cudnn7
+pip install paddlepaddle-gpu==1.5.2.post97
+
 # For installation on other platform, refer to http://paddlepaddle.org/
 ```
@@ -76,33 +77,33 @@ Now our developers could acquire Tesla V100 online computing resources for free.
 
 ## Installation
 
-It is recommended to read [this doc](http://www.paddlepaddle.org/documentation/docs/en/1.4/beginners_guide/index_en.html) on our website.
+It is recommended to read [this doc](http://www.paddlepaddle.org.cn/documentation/docs/en/1.5/beginners_guide/index_en.html) on our website.
 
 ## Documentation
 
-We provide [English](http://www.paddlepaddle.org/documentation/docs/en/1.4/beginners_guide/index_en.html) and
-[Chinese](http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/install/index_cn.html) documentation.
+We provide [English](http://www.paddlepaddle.org.cn/documentation/docs/en/1.5/beginners_guide/index_en.html) and
+[Chinese](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/install/index_cn.html) documentation.
 
 - [Deep Learning 101](https://github.com/PaddlePaddle/book)
 
   You might want to start from this online interactive book that can run in a Jupyter Notebook.
 
-- [Distributed Training](http://paddlepaddle.org/documentation/docs/en/1.4/user_guides/howto/training/multi_node_en.html)
+- [Distributed Training](http://paddlepaddle.org.cn/documentation/docs/en/1.5/user_guides/howto/training/multi_node_en.html)
 
   You can run distributed training jobs on MPI clusters.
 
-- [Python API](http://paddlepaddle.org/documentation/docs/en/1.4/api/index_en.html)
+- [Python API](http://paddlepaddle.org.cn/documentation/docs/en/1.5/api/index_en.html)
 
   Our new API enables much shorter programs.
 
-- [How to Contribute](http://paddlepaddle.org/documentation/docs/en/1.4/advanced_usage/development/contribute_to_paddle/index_en.html)
+- [How to Contribute](http://paddlepaddle.org.cn/documentation/docs/en/1.5/advanced_usage/development/contribute_to_paddle/index_en.html)
 
   We appreciate your contributions!
 
 ## Communication
 
 - [Github Issues](https://github.com/PaddlePaddle/Paddle/issues): bug reports, feature requests, install issues, usage issues, etc.
-- QQ discussion group: 432676488 (PaddlePaddle).
+- QQ discussion group: 796771754 (PaddlePaddle).
 - [Forums](http://ai.baidu.com/forum/topic/list/168?pageNo=1): discuss implementations, research, etc.
 
 ## Copyright and License
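As an aside on the install commands above: the wheel suffixes encode the CUDA/cuDNN pairing (post87 = CUDA 8 + cuDNN 7, post97 = CUDA 9 + cuDNN 7, per the comments in the README block). A minimal post-install smoke test might look like the following sketch; it assumes the wheel exposes the conventional `paddle.__version__` attribute.

```
# Hypothetical smoke test after `pip install paddlepaddle`; should print e.g. 1.5.2.
python -c "import paddle; print(paddle.__version__)"
```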
diff --git a/README_cn.md b/README_cn.md
index a89b3f05..cde308c9 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -3,8 +3,8 @@
 [English](./README.md) | 简体中文
 
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/documentation/docs/en/1.4/beginners_guide/index_en.html)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/index_cn.html)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org.cn/documentation/docs/en/1.5/beginners_guide/index_en.html)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/index_cn.html)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
 
@@ -16,17 +16,18 @@ PaddlePaddle (PArallel Distributed Deep LEarning) 是一个简单易用、高效
 跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases)
 
-### PaddlePaddle最新版本: [Fluid 1.5.1](https://github.com/PaddlePaddle/Paddle/tree/release/1.5)
+### PaddlePaddle最新版本: [Fluid 1.5.2](https://github.com/PaddlePaddle/Paddle/tree/release/1.5)
 ### 安装最新稳定版本:
 ```
 # Linux CPU
 pip install paddlepaddle
-# Linux GPU cuda9cudnn7
-pip install paddlepaddle-gpu
 # Linux GPU cuda10cudnn7
-pip install paddlepaddle-gpu==1.5.1.post107
+pip install paddlepaddle-gpu
 # Linux GPU cuda8cudnn7
-pip install paddlepaddle-gpu==1.5.1.post87
+pip install paddlepaddle-gpu==1.5.2.post87
+# Linux GPU cuda9cudnn7
+pip install paddlepaddle-gpu==1.5.2.post97
+
 # 其他平台上的安装指引请参考 http://paddlepaddle.org/
 ```
@@ -58,33 +59,33 @@ PaddlePaddle用户可领取**免费Tesla V100在线算力资源**，训练模型
 
 ## 安装
 
-推荐阅读官网上的[安装说明](http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/install/index_cn.html)
+推荐阅读官网上的[安装说明](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/install/index_cn.html)
 
 ## 文档
 
-我们提供[英文](http://www.paddlepaddle.org/documentation/docs/en/1.4/beginners_guide/index_en.html)和
-[中文](http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/install/index_cn.html) 文档
+我们提供[英文](http://www.paddlepaddle.org.cn/documentation/docs/en/1.5/beginners_guide/index_en.html)和
+[中文](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/install/index_cn.html) 文档
 
 - [深度学习101](https://github.com/PaddlePaddle/book)
 
   或许您想从这个在线交互式书籍开始，可以在Jupyter Notebook中运行
 
-- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.4/user_guides/howto/training/multi_node.html)
+- [分布式训练](http://paddlepaddle.org.cn/documentation/docs/zh/1.5/user_guides/howto/training/multi_node.html)
 
   可以在MPI集群上运行分布式训练任务
 
-- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.4/api_cn/index_cn.html)
+- [Python API](http://paddlepaddle.org.cn/documentation/docs/zh/1.5/api_cn/index_cn.html)
 
   新的API支持代码更少更简洁的程序
 
-- [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.4/advanced_usage/development/contribute_to_paddle/index_cn.html)
+- [贡献方式](http://paddlepaddle.org.cn/documentation/docs/zh/1.5/advanced_usage/development/contribute_to_paddle/index_cn.html)
 
   欢迎您的贡献！
 
 ## 交流与反馈
 
 - 欢迎您通过[Github Issues](https://github.com/PaddlePaddle/Paddle/issues)来提交问题、报告与建议
-- QQ群: 432676488 (PaddlePaddle)
+- QQ群: 796771754 (PaddlePaddle)
 - [论坛](http://ai.baidu.com/forum/topic/list/168): 欢迎大家在PaddlePaddle论坛分享在使用PaddlePaddle中遇到的问题和经验, 营造良好的论坛氛围
 
 ## 版权和许可证
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 5f7b4a46..816314dd 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -62,6 +62,10 @@ if(WITH_PSLIB)
   add_definitions(-DPADDLE_WITH_PSLIB)
 endif()
 
+if(WITH_BOX_PS)
+  add_definitions(-DPADDLE_WITH_BOX_PS)
+endif()
+
 if(WITH_GPU)
   add_definitions(-DPADDLE_WITH_CUDA)
   add_definitions(-DEIGEN_USE_GPU)
@@ -88,14 +92,20 @@ if(WITH_GPU)
   include_directories(${CUDA_TOOLKIT_INCLUDE})
 
   if(TENSORRT_FOUND)
-    if(${CUDA_VERSION_MAJOR} VERSION_LESS 8)
-      message(FATAL_ERROR "TensorRT needs CUDA >= 8.0 to compile")
-    endif()
-    if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
-      message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile")
-    endif()
-    if(${TENSORRT_MAJOR_VERSION} VERSION_LESS 4)
-      message(FATAL_ERROR "Paddle needs TensorRT >= 4.0 to compile")
+    if(WIN32)
+      if(${CUDA_VERSION_MAJOR} VERSION_LESS 9)
+        message(FATAL_ERROR "TensorRT needs CUDA >= 9.0 to compile on Windows")
+      endif()
+    else()
+      if(${CUDA_VERSION_MAJOR} VERSION_LESS 8)
+        message(FATAL_ERROR "TensorRT needs CUDA >= 8.0 to compile")
+      endif()
+      if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
+        message(FATAL_ERROR "TensorRT needs CUDNN >= 7.0 to compile")
+      endif()
+      if(${TENSORRT_MAJOR_VERSION} VERSION_LESS 4)
+        message(FATAL_ERROR "Paddle needs TensorRT >= 4.0 to compile")
+      endif()
     endif()
     include_directories(${TENSORRT_INCLUDE_DIR})
   endif()
diff --git a/cmake/copyfile.py b/cmake/copyfile.py
new file mode 100644
index 00000000..7ba4d950
--- /dev/null
+++ b/cmake/copyfile.py
@@ -0,0 +1,44 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+import shutil
+import glob
+
+
+def main():
+    src = sys.argv[1]
+    dst = sys.argv[2]
+    if os.path.isdir(src):  # copy directory
+        pathList = os.path.split(src)
+        dst = os.path.join(dst, pathList[-1])
+        if not os.path.exists(dst):
+            shutil.copytree(src, dst)
+            print("first copy directory: {0} --->>> {1}".format(src, dst))
+        else:
+            shutil.rmtree(dst)
+            shutil.copytree(src, dst)
+            print("overwritten copy directory: {0} --->>> {1}".format(src, dst))
+    else:  # copy file, wildcard
+        if not os.path.exists(dst):
+            os.makedirs(dst)
+        srcFiles = glob.glob(src)
+        for srcFile in srcFiles:
+            shutil.copy(srcFile, dst)
+            print("copy file: {0} --->>> {1}".format(srcFile, dst))
+
+
+if __name__ == "__main__":
+    main()
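A usage sketch for the new copy helper above (all paths are illustrative, not from the patch): the script takes exactly two arguments, a source (directory, file, or glob pattern) and a destination directory.

```
# Directory source: copied recursively to <dst>/<basename of src>;
# an existing copy is removed and rewritten.
python cmake/copyfile.py third_party/install/mklml /tmp/fluid_pkg
# File or glob source: each match is copied into <dst> (created if missing).
python cmake/copyfile.py "paddle/fluid/inference/*.h" /tmp/fluid_pkg/include
```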
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index b9c72c04..09d71364 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -186,10 +186,6 @@ list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
 list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
 endif(NOT WIN32)
 
-if(WITH_FAST_MATH)
-  # Make use of fast math library. https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
-  list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
-endif()
 # in cuda9, suppress cuda warning on eigen
 list(APPEND CUDA_NVCC_FLAGS "-w")
 # Set :expt-relaxed-constexpr to suppress Eigen warnings
diff --git a/cmake/external/box_ps.cmake b/cmake/external/box_ps.cmake
new file mode 100644
index 00000000..ddb4c82e
--- /dev/null
+++ b/cmake/external/box_ps.cmake
@@ -0,0 +1,68 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+IF(NOT ${WITH_BOX_PS})
+  return()
+ENDIF(NOT ${WITH_BOX_PS})
+
+IF(WIN32 OR APPLE)
+  MESSAGE(WARNING
+    "Windows or Mac is not supported with BOX_PS in Paddle yet."
+    "Force WITH_BOX_PS=OFF")
+  SET(WITH_BOX_PS OFF CACHE STRING "Disable BOX_PS package in Windows and MacOS" FORCE)
+  return()
+ENDIF()
+
+INCLUDE(ExternalProject)
+
+SET(BOX_PS_PROJECT "extern_box_ps")
+IF((NOT DEFINED BOX_PS_VER) OR (NOT DEFINED BOX_PS_URL))
+  MESSAGE(STATUS "use predefined download url")
+  SET(BOX_PS_VER "0.1.1" CACHE STRING "" FORCE)
+  SET(BOX_PS_NAME "box_ps" CACHE STRING "" FORCE)
+  SET(BOX_PS_URL "http://box-ps.gz.bcebos.com/box_ps_stub.tar.gz" CACHE STRING "" FORCE)
+ENDIF()
+MESSAGE(STATUS "BOX_PS_NAME: ${BOX_PS_NAME}, BOX_PS_URL: ${BOX_PS_URL}")
+SET(BOX_PS_SOURCE_DIR "${THIRD_PARTY_PATH}/box_ps")
+SET(BOX_PS_DOWNLOAD_DIR "${BOX_PS_SOURCE_DIR}/src/${BOX_PS_PROJECT}")
+SET(BOX_PS_DST_DIR "box_ps")
+SET(BOX_PS_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
+SET(BOX_PS_INSTALL_DIR ${BOX_PS_INSTALL_ROOT}/${BOX_PS_DST_DIR})
+SET(BOX_PS_ROOT ${BOX_PS_INSTALL_DIR})
+SET(BOX_PS_INC_DIR ${BOX_PS_ROOT}/include)
+SET(BOX_PS_LIB_DIR ${BOX_PS_ROOT}/lib)
+SET(BOX_PS_LIB ${BOX_PS_LIB_DIR}/libbox_ps.so)
+SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${BOX_PS_ROOT}/lib")
+
+INCLUDE_DIRECTORIES(${BOX_PS_INC_DIR})
+FILE(WRITE ${BOX_PS_DOWNLOAD_DIR}/CMakeLists.txt
+  "PROJECT(BOX_PS)\n"
+  "cmake_minimum_required(VERSION 3.0)\n"
+  "install(DIRECTORY ${BOX_PS_NAME}/include ${BOX_PS_NAME}/lib \n"
+  "        DESTINATION ${BOX_PS_DST_DIR})\n")
+ExternalProject_Add(
+  ${BOX_PS_PROJECT}
+  ${EXTERNAL_PROJECT_LOG_ARGS}
+  PREFIX ${BOX_PS_SOURCE_DIR}
+  DOWNLOAD_DIR ${BOX_PS_DOWNLOAD_DIR}
+  DOWNLOAD_COMMAND wget --no-check-certificate ${BOX_PS_URL} -c -q -O ${BOX_PS_NAME}.tar.gz
+                   && tar zxvf ${BOX_PS_NAME}.tar.gz
+  DOWNLOAD_NO_PROGRESS 1
+  UPDATE_COMMAND ""
+  CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${BOX_PS_INSTALL_ROOT}
+  CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${BOX_PS_INSTALL_ROOT}
+)
+ADD_LIBRARY(box_ps SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET box_ps PROPERTY IMPORTED_LOCATION ${BOX_PS_LIB})
+ADD_DEPENDENCIES(box_ps ${BOX_PS_PROJECT})
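A sketch of how this hook is driven. The first line uses the defaults hard-coded above; the second shows an override, where the URL is a placeholder. Note that because the defaults block is skipped once BOX_PS_VER and BOX_PS_URL are both defined, BOX_PS_NAME would need to be supplied as well.

```
# Default stub download (box_ps 0.1.1 from the URL defined above):
cmake .. -DWITH_BOX_PS=ON
# Hypothetical override of the download source:
cmake .. -DWITH_BOX_PS=ON -DBOX_PS_VER=0.1.2 \
         -DBOX_PS_NAME=box_ps -DBOX_PS_URL=http://example.com/box_ps_stub.tar.gz
```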
https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args -set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog") +set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog") # If minimal .a is need, you can set WITH_DEBUG_SYMBOLS=OFF ExternalProject_Add( @@ -62,7 +62,7 @@ ExternalProject_Add( -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) -ADD_DEPENDENCIES(extern_brpc protobuf ssl crypto leveldb gflags glog gtest snappy) +ADD_DEPENDENCIES(extern_brpc protobuf ssl crypto leveldb gflags glog gtest) ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES}) ADD_DEPENDENCIES(brpc extern_brpc) diff --git a/cmake/external/dgc.cmake b/cmake/external/dgc.cmake index 05e63bfe..5d5fcc3d 100644 --- a/cmake/external/dgc.cmake +++ b/cmake/external/dgc.cmake @@ -23,14 +23,14 @@ INCLUDE_DIRECTORIES(${DGC_INCLUDE_DIR}) ExternalProject_Add( extern_dgc ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/PaddlePaddle/Fleet" - GIT_TAG "2d04dc3800cdd0601f1b65d547dabcc60b0cf9dc" + URL "http://fleet.bj.bcebos.com/collective.tgz" + URL_MD5 "015d565156c3de4e30fe25473f47e7a9" SOURCE_DIR "${DGC_SOURCES_DIR}" CONFIGURE_COMMAND "" - BUILD_COMMAND cd collective && make -j + BUILD_COMMAND make -j INSTALL_COMMAND mkdir -p ${DGC_INSTALL_DIR}/lib/ ${DGC_INCLUDE_DIR}/dgc - && cp ${DGC_SOURCES_DIR}/collective/build/lib/libdgc.a ${DGC_LIBRARIES} - && cp ${DGC_SOURCES_DIR}/collective/build/include/dgc.h ${DGC_INCLUDE_DIR}/dgc/ + && cp ${DGC_SOURCES_DIR}/build/lib/libdgc.a ${DGC_LIBRARIES} + && cp ${DGC_SOURCES_DIR}/build/include/dgc.h ${DGC_INCLUDE_DIR}/dgc/ BUILD_IN_SOURCE 1 ) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index d6d4b79c..bea65d2d 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -3,15 +3,6 @@ INCLUDE(ExternalProject) SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3) SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3) INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR}) -if(NOT WITH_FAST_MATH) - # EIGEN_FAST_MATH: https://eigen.tuxfamily.org/dox/TopicPreprocessorDirectives.html - # enables some optimizations which might affect the accuracy of the result. - # This currently enables the SSE vectorization of sin() and cos(), - # and speedups sqrt() for single precision. - # Defined to 1 by default. Define it to 0 to disable. 
- add_definitions(-DEIGEN_FAST_MATH=0) -endif() - if(WIN32) set(EIGEN_GIT_REPOSITORY https://github.com/wopeizl/eigen-git-mirror) diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index 343b7544..fae1e20d 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -21,6 +21,8 @@ IF(WIN32) set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) ELSE(WIN32) set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) + set(BUILD_COMMAND $(MAKE) --silent) + set(INSTALL_COMMAND $(MAKE) install) ENDIF(WIN32) INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR}) @@ -31,6 +33,8 @@ ExternalProject_Add( GIT_REPOSITORY "https://github.com/gflags/gflags.git" GIT_TAG 77592648e3f3be87d6c7123eb81cbad75f9aef5a PREFIX ${GFLAGS_SOURCES_DIR} + BUILD_COMMAND ${BUILD_COMMAND} + INSTALL_COMMAND ${INSTALL_COMMAND} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} @@ -50,6 +54,7 @@ ExternalProject_Add( -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) + ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES}) ADD_DEPENDENCIES(gflags extern_gflags) diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index e4595265..04189c4f 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -13,6 +13,9 @@ # limitations under the License. #FIXME:(gongwb) Move brpc's gtest dependency. + +include(GNUInstallDirs) + IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC)) IF(WITH_TESTING) ENABLE_TESTING() @@ -28,14 +31,14 @@ IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC)) IF(WIN32) set(GTEST_LIBRARIES - "${GTEST_INSTALL_DIR}/lib/gtest.lib" CACHE FILEPATH "gtest libraries." FORCE) + "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/gtest.lib" CACHE FILEPATH "gtest libraries." FORCE) set(GTEST_MAIN_LIBRARIES - "${GTEST_INSTALL_DIR}/lib/gtest_main.lib" CACHE FILEPATH "gtest main libraries." FORCE) + "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/gtest_main.lib" CACHE FILEPATH "gtest main libraries." FORCE) ELSE(WIN32) set(GTEST_LIBRARIES - "${GTEST_INSTALL_DIR}/lib/libgtest.a" CACHE FILEPATH "gtest libraries." FORCE) + "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest.a" CACHE FILEPATH "gtest libraries." FORCE) set(GTEST_MAIN_LIBRARIES - "${GTEST_INSTALL_DIR}/lib/libgtest_main.a" CACHE FILEPATH "gtest main libraries." FORCE) + "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgtest_main.a" CACHE FILEPATH "gtest main libraries." 
FORCE)
     ENDIF(WIN32)
 
     IF(WITH_MKLML)
@@ -48,7 +51,7 @@ IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
         ${EXTERNAL_PROJECT_LOG_ARGS}
         DEPENDS ${GTEST_DEPENDS}
         GIT_REPOSITORY "https://github.com/google/googletest.git"
-        GIT_TAG "release-1.8.0"
+        GIT_TAG "release-1.8.1"
         PREFIX ${GTEST_SOURCES_DIR}
         UPDATE_COMMAND ""
         CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
diff --git a/cmake/external/leveldb.cmake b/cmake/external/leveldb.cmake
index ac0febd0..3ba8a466 100644
--- a/cmake/external/leveldb.cmake
+++ b/cmake/external/leveldb.cmake
@@ -34,8 +34,6 @@ ExternalProject_Add(
         BUILD_IN_SOURCE 1
 )
 
-ADD_DEPENDENCIES(extern_leveldb snappy)
-
 ADD_LIBRARY(leveldb STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET leveldb PROPERTY IMPORTED_LOCATION ${LEVELDB_LIBRARIES})
 ADD_DEPENDENCIES(leveldb extern_leveldb)
diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
index 06681129..17556afe 100644
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -43,7 +43,7 @@ IF(WIN32)
 ELSE()
     #TODO(intel-huying):
     #  Now enable Erf function in mklml library temporarily, it will be updated as official version later.
-    SET(MKLML_VER "Glibc225_vsErf_mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE)
+    SET(MKLML_VER "csrmm2_mklml_lnx_2019.0.2" CACHE STRING "" FORCE)
     SET(MKLML_URL "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
     SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
     SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
diff --git a/cmake/external/ngraph.cmake b/cmake/external/ngraph.cmake
index cdcbdd46..d28cc1c3 100644
--- a/cmake/external/ngraph.cmake
+++ b/cmake/external/ngraph.cmake
@@ -37,7 +37,7 @@ INCLUDE(GNUInstallDirs)
 INCLUDE(ExternalProject)
 
 SET(NGRAPH_PROJECT "extern_ngraph")
-SET(NGRAPH_GIT_TAG "4ec94acc11084a5d53418f565529310fa584899a")
+SET(NGRAPH_GIT_TAG "e26d602a756f5f83e6c8220f910b61d7089fa951")
 SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph)
 SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph)
 SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include)
@@ -76,6 +76,7 @@ ExternalProject_Add(
         CMAKE_ARGS -DMKLDNN_INCLUDE_DIR=${MKLDNN_INC_DIR}
         CMAKE_ARGS -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}
         CMAKE_ARGS -DMKLML_LIB_DIR=${MKLML_INSTALL_DIR}/lib
+        CMAKE_ARGS -DNGRAPH_USE_LEGACY_MKLDNN=TRUE
 )
 
 add_dependencies(ngraph ${NGRAPH_PROJECT})
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index d8a4a0be..1d40ad10 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -58,7 +58,41 @@ IF(NOT ${CBLAS_FOUND})
             UPDATE_COMMAND ""
             CONFIGURE_COMMAND ""
         )
-    ELSE()
+    ELSE(NOT WIN32)
+        SET(CBLAS_FOUND false)
+        SET(CBLAS_LIBRARIES
+            "${CBLAS_INSTALL_DIR}/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
+            CACHE FILEPATH "openblas library." FORCE)
+        INCLUDE_DIRECTORIES(${CBLAS_INC_DIR}/openblas) # For openblas code to include its own headers.
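+        # Also expose the third-party install root, as the snappy/snappystream rules
+        # removed elsewhere in this patch did, so headers staged by other external
+        # packages stay reachable from Paddle code.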
+ INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) + ExternalProject_Add( + extern_openblas + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git + GIT_TAG "v0.3.7" + PREFIX ${CBLAS_SOURCES_DIR} + INSTALL_DIR ${CBLAS_INSTALL_DIR} + BUILD_IN_SOURCE 0 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_INSTALL_PREFIX=${CBLAS_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + -DBUILD_SHARED_LIBS=ON + -DMSVC_STATIC_CRT=${MSVC_STATIC_CRT} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${CBLAS_INSTALL_DIR} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + ) + add_custom_command(TARGET extern_openblas POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${CBLAS_INSTALL_DIR}/bin/openblas${CMAKE_SHARED_LIBRARY_SUFFIX} ${CBLAS_INSTALL_DIR}/lib ) + ADD_LIBRARY(openblas STATIC IMPORTED GLOBAL) + SET_PROPERTY(TARGET openblas PROPERTY IMPORTED_LOCATION ${CBLAS_LIBRARIES}) + ADD_DEPENDENCIES(openblas extern_openblas) ENDIF(NOT WIN32) SET(CBLAS_PROVIDER openblas) ENDIF(NOT ${CBLAS_FOUND}) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 09eb437a..e746a7a5 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -222,6 +222,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR} -DCMAKE_INSTALL_LIBDIR=lib -DBUILD_SHARED_LIBS=OFF + -Dprotobuf_MSVC_STATIC_RUNTIME=${MSVC_STATIC_CRT} CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR} -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake deleted file mode 100644 index 3fb6b49f..00000000 --- a/cmake/external/snappy.cmake +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -include (ExternalProject) - -# NOTE: snappy is needed when linking with recordio - -set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy) -set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy) -set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." 
FORCE) - -if(WIN32) - SET(SNAPPY_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4244 /wd4267") -else() - SET(SNAPPY_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) -endif() - -ExternalProject_Add( - extern_snappy - GIT_REPOSITORY "https://github.com/google/snappy" - GIT_TAG "1.1.7" - PREFIX ${SNAPPY_SOURCES_DIR} - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS=${SNAPPY_CMAKE_CXX_FLAGS} - -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} - -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DBUILD_TESTING=OFF - -DSNAPPY_BUILD_TESTS:BOOL=OFF - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - ${EXTERNAL_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPY_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPY_INSTALL_DIR}/lib - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} -) -IF(WIN32) - set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/snappy.lib") -else(WIN32) - set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a") -endif (WIN32) - -add_library(snappy STATIC IMPORTED GLOBAL) -set_property(TARGET snappy PROPERTY IMPORTED_LOCATION ${SNAPPY_LIBRARIES}) - -include_directories(${SNAPPY_INCLUDE_DIR}) -add_dependencies(snappy extern_snappy) diff --git a/cmake/external/snappystream.cmake b/cmake/external/snappystream.cmake deleted file mode 100644 index 392f186b..00000000 --- a/cmake/external/snappystream.cmake +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -include (ExternalProject) - -set(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream) -set(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream) -set(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include" CACHE PATH "snappy stream include directory." 
FORCE) - -if(WIN32) - # Fix me, VS2015 come without VLA support - set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/snappystream.lib") - MESSAGE(WARNING, "In windows, snappystream has no compile support for windows, - please build it manually and put it at " ${SNAPPYSTREAM_INSTALL_DIR}) -else(WIN32) - set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a") - - ExternalProject_Add( - extern_snappystream - GIT_REPOSITORY "https://github.com/hoxnox/snappystream.git" - GIT_TAG "0.2.8" - PREFIX ${SNAPPYSTREAM_SOURCES_DIR} - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} - -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - -DSNAPPY_ROOT=${SNAPPY_INSTALL_DIR} - ${EXTERNAL_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS - -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPYSTREAM_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPYSTREAM_INSTALL_DIR}/lib - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - DEPENDS snappy - ) -endif(WIN32) - -add_library(snappystream STATIC IMPORTED GLOBAL) -set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION ${SNAPPYSTREAM_LIBRARIES}) - -include_directories(${SNAPPYSTREAM_INCLUDE_DIR}) # For snappysteam to include its own headers. -include_directories(${THIRD_PARTY_PATH}/install) # For Paddle to include snappy stream headers. - -add_dependencies(snappystream extern_snappystream) diff --git a/cmake/external/yaml-cpp.cmake b/cmake/external/yaml-cpp.cmake deleted file mode 100644 index 88904aed..00000000 --- a/cmake/external/yaml-cpp.cmake +++ /dev/null @@ -1,59 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -include (ExternalProject) - -IF(NOT ${WITH_CUSTOM_TRAINER}) - return() -ENDIF(NOT ${WITH_CUSTOM_TRAINER}) - -set(YAML_SOURCES_DIR ${THIRD_PARTY_PATH}/yaml-cpp) -set(YAML_INSTALL_DIR ${THIRD_PARTY_PATH}/install/yaml-cpp) -set(YAML_INCLUDE_DIR "${YAML_INSTALL_DIR}/include" CACHE PATH "yaml include directory." 
FORCE) - -SET(YAML_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) - -ExternalProject_Add( - extern_yaml - GIT_REPOSITORY "https://github.com/jbeder/yaml-cpp" - GIT_TAG "yaml-cpp-0.6.2" - PREFIX ${YAML_SOURCES_DIR} - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS=${YAML_CMAKE_CXX_FLAGS} - -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} - -DCMAKE_INSTALL_PREFIX=${YAML_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR=${YAML_INSTALL_DIR}/lib - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DBUILD_TESTING=OFF - -DYAML_BUILD_TESTS:BOOL=OFF - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - ${EXTERNAL_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${YAML_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR:PATH=${YAML_INSTALL_DIR}/lib - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} -) -set(YAML_LIBRARIES "${YAML_INSTALL_DIR}/lib/libyaml-cpp.a") - -add_library(yaml-cpp STATIC IMPORTED GLOBAL) -set_property(TARGET yaml-cpp PROPERTY IMPORTED_LOCATION ${YAML_LIBRARIES}) - -include_directories(${YAML_INCLUDE_DIR}) -add_dependencies(yaml-cpp extern_yaml) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 36b533aa..cfd5e177 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -37,6 +37,12 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") function(safe_set_flag is_c src_list flag_name) string(REPLACE "-" "_" safe_name ${flag_name}) string(REPLACE "=" "_" safe_name ${safe_name}) + + if(${flag_name} MATCHES "fsanitize") + set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS}) + set(CMAKE_REQUIRED_FLAGS ${flag_name}) + endif() + if(is_c) CHECK_C_COMPILER_FLAG(${flag_name} C_COMPILER_SUPPORT_FLAG_${safe_name}) set(safe_name C_COMPILER_SUPPORT_FLAG_${safe_name}) @@ -47,6 +53,10 @@ function(safe_set_flag is_c src_list flag_name) if(${safe_name}) set(${src_list} "${${src_list}} ${flag_name}" PARENT_SCOPE) endif() + + if(${flag_name} MATCHES "fsanitize") + set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED}) + endif() endfunction() # helper macro to set cflag @@ -108,6 +118,20 @@ if(BARRIER_FOUND) endif(BARRIER_FOUND) SET(CMAKE_EXTRA_INCLUDE_FILES "") +# Only one sanitizer is allowed in compile time +string(TOLOWER "${SANITIZER_TYPE}" sanitizer_type) +if(sanitizer_type STREQUAL "address") + set(fsanitize "-fsanitize=address") +elseif(sanitizer_type STREQUAL "leak") + set(fsanitize "-fsanitize=leak") +elseif(sanitizer_type STREQUAL "memory") + set(fsanitize "-fsanitize=memory") +elseif(sanitizer_type STREQUAL "thread") + set(fsanitize "-fsanitize=thread") +elseif(sanitizer_type STREQUAL "undefined") + set(fsanitize "-fsanitize=undefined") +endif() + # Common flags. the compiler flag used for C/C++ sources whenever release or debug # Do not care if this flag is support for gcc. 
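Note: the SANITIZER_TYPE handling added above is driven by a user-supplied cache
variable, and only one sanitizer can be enabled per build. A minimal usage sketch,
assuming the variable is set before flags.cmake is processed (typically via
-DSANITIZER_TYPE=... on the cmake command line):

    # pick exactly one of: address, leak, memory, thread, undefined
    set(SANITIZER_TYPE "address" CACHE STRING "which -fsanitize= flavor to enable")

An empty or unrecognized value leaves ${fsanitize} unset, so the COMMON_FLAGS list in
the next hunk gains no extra flag; when it is set, safe_set_flag() above temporarily
routes the flag through CMAKE_REQUIRED_FLAGS so that the compiler probe itself runs
with the sanitizer enabled.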
@@ -131,7 +155,7 @@ set(COMMON_FLAGS -Wno-error=terminate # Warning in PADDLE_ENFORCE -Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2 -Wimplicit-fallthrough=0 # Warning in tinyformat.h - -Wno-error=maybe-uninitialized # Warning in boost gcc 7.2 + ${fsanitize} ) set(GPU_COMMON_FLAGS @@ -173,14 +197,13 @@ endif(UNIX AND NOT APPLE) foreach(flag ${COMMON_FLAGS}) safe_set_cflag(CMAKE_C_FLAGS ${flag}) safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag}) - endforeach() foreach(flag ${GPU_COMMON_FLAGS}) safe_set_nvflag(${flag}) endforeach() -if(WIN32) +if(WIN32 AND MSVC_STATIC_CRT) # windows build turn off warnings. safe_set_static_flag() foreach(flag_var @@ -191,4 +214,4 @@ safe_set_static_flag() string(REGEX REPLACE "(^| )/W[0-9]( |$)" " " ${flag_var} "${${flag_var}}") set(flag_var "${flag_var} /w") endforeach(flag_var) -endif(WIN32) +endif() diff --git a/cmake/generic.cmake b/cmake/generic.cmake index d846e08b..f6749c2a 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -389,7 +389,6 @@ function(cc_test_run TARGET_NAME) WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) - set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) # No unit test should exceed 10 minutes. set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600) @@ -472,7 +471,6 @@ function(nv_test TARGET_NAME) add_test(${TARGET_NAME} ${TARGET_NAME}) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) - set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_limit_of_tmp_allocation=4294967296) # 4G set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) endif() endfunction(nv_test) @@ -725,7 +723,7 @@ function(py_test TARGET_NAME) if(WITH_COVERAGE) add_test(NAME ${TARGET_NAME} COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true - FLAGS_cpu_deterministic=true FLAGS_limit_of_tmp_allocation=4294967296 # 4G + FLAGS_cpu_deterministic=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS} COVERAGE_FILE=${PADDLE_BINARY_DIR}/python-coverage.data ${PYTHON_EXECUTABLE} -m coverage run --branch -p ${py_test_SRCS} ${py_test_ARGS} @@ -733,7 +731,7 @@ function(py_test TARGET_NAME) else() add_test(NAME ${TARGET_NAME} COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true - FLAGS_cpu_deterministic=true FLAGS_limit_of_tmp_allocation=4294967296 # 4G + FLAGS_cpu_deterministic=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS} ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 2a3962b9..9503d1dc 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -13,12 +13,19 @@ # limitations under the License. 
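Note: the copy() helper rewritten below hands Windows copies to the cmake/copyfile.py
script added earlier in this patch, invoked as

    ${PYTHON_EXECUTABLE} ${COPY_SCRIPT_DIR}/copyfile.py <src-or-glob> <dst-dir>

Per the script's source, a directory source is replicated as a whole tree into the
destination (an existing copy is removed first), while any other source is treated as
a wildcard pattern whose matches are copied into the destination directory, which is
created on demand.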
# make package for paddle fluid shared and static library + +if(WIN32) + if(NOT PYTHON_EXECUTABLE) + FIND_PACKAGE(PythonInterp REQUIRED) + endif() +endif() + +set(COPY_SCRIPT_DIR ${PADDLE_SOURCE_DIR}/cmake) function(copy TARGET) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DSTS DEPS) + set(multiValueArgs SRCS DSTS) cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - set(fluid_lib_dist_dep ${TARGET} ${fluid_lib_dist_dep} PARENT_SCOPE) list(LENGTH copy_lib_SRCS copy_lib_SRCS_len) list(LENGTH copy_lib_DSTS copy_lib_DSTS_len) @@ -26,43 +33,16 @@ function(copy TARGET) message(FATAL_ERROR "${TARGET} source numbers are not equal to destination numbers") endif () math(EXPR len "${copy_lib_SRCS_len} - 1") - - add_custom_target(${TARGET} DEPENDS ${copy_lib_DEPS}) foreach (index RANGE ${len}) list(GET copy_lib_SRCS ${index} src) list(GET copy_lib_DSTS ${index} dst) - if (WIN32) - if(IS_DIRECTORY ${src}) - get_filename_component(last_path ${src} NAME) - string(APPEND dst "/" ${last_path}) - add_custom_command(TARGET ${TARGET} PRE_BUILD - COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}" - ) - if(EXISTS ${src}) - add_custom_command(TARGET ${TARGET} PRE_BUILD - COMMAND cmake -E copy_directory "${src}" "${dst}" - COMMENT "copying ${src} -> ${dst}") - else() - message(WARNING "${src} not exist!") - endif() - else() - # windows cmd shell will not expand wildcard automatically. - # below expand the files, and copy them by rules. - file(GLOB src_files ${src}) - if (NOT "${src_files}" STREQUAL "") - list(REMOVE_DUPLICATES src_files) - endif () - add_custom_command(TARGET ${TARGET} PRE_BUILD - COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}" - ) - foreach (src_file ${src_files}) - add_custom_command(TARGET ${TARGET} PRE_BUILD - COMMAND ${CMAKE_COMMAND} -E copy "${src_file}" "${dst}" - COMMENT "copying ${src_file} -> ${dst}") - endforeach () - endif() - else (WIN32) # not windows - add_custom_command(TARGET ${TARGET} PRE_BUILD + if (WIN32) #windows + file(TO_NATIVE_PATH ${src} native_src) + file(TO_NATIVE_PATH ${dst} native_dst) + add_custom_command(TARGET ${TARGET} POST_BUILD + COMMAND ${PYTHON_EXECUTABLE} ${COPY_SCRIPT_DIR}/copyfile.py ${native_src} ${native_dst}) + else (WIN32) #not windows + add_custom_command(TARGET ${TARGET} POST_BUILD COMMAND mkdir -p "${dst}" COMMAND cp -r "${src}" "${dst}" COMMENT "copying ${src} -> ${dst}") @@ -71,210 +51,189 @@ function(copy TARGET) endfunction() # third party -set(dst_dir "${FLUID_INSTALL_DIR}/third_party/eigen3") -copy(eigen3_lib - SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen - DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported - DEPS eigen3 - ) +set(third_party_deps eigen3 gflags glog boost xxhash zlib) +if(NOT PROTOBUF_FOUND OR WIN32) + list(APPEND third_party_deps extern_protobuf) +endif () -set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/gflags") -copy(gflags_lib - SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS gflags - ) +if (WITH_MKLML) + list(APPEND third_party_deps mklml) +elseif (NOT CBLAS_FOUND OR WIN32) + list(APPEND third_party_deps extern_openblas) +endif () -set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/glog") -copy(glog_lib - SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS glog - ) +if (WITH_MKLDNN) + list(APPEND third_party_deps mkldnn_shared_lib) +endif () -set(dst_dir "${FLUID_INSTALL_DIR}/third_party/boost/") -copy(boost_lib 
- SRCS ${BOOST_INCLUDE_DIR}/boost - DSTS ${dst_dir} - DEPS boost - ) +if (WITH_NGRAPH) + list(APPEND third_party_deps ngraph) +endif () -set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/xxhash") -copy(xxhash_lib - SRCS ${XXHASH_INCLUDE_DIR} ${XXHASH_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS xxhash - ) +add_custom_target(third_party DEPENDS ${third_party_deps}) -if (NOT PROTOBUF_FOUND OR WIN32) - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf") - copy(protobuf_lib - SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS extern_protobuf - ) -endif () +# inference-only library +set(inference_lib_deps third_party paddle_fluid paddle_fluid_shared) +add_custom_target(inference_lib_dist DEPENDS ${inference_lib_deps}) -if (WITH_MKLML) - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mklml") +set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/eigen3") +copy(inference_lib_dist + SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen + DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported) + +set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/boost") +copy(inference_lib_dist + SRCS ${BOOST_INCLUDE_DIR}/boost + DSTS ${dst_dir}) + +if(WITH_MKLML) + set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/mklml") if(WIN32) - copy(mklml_lib - SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_SHARED_LIB} - ${MKLML_SHARED_LIB_DEPS} ${MKLML_SHARED_IOMP_LIB} ${MKLML_INC_DIR} - DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}/lib - ${dst_dir}/lib ${dst_dir}/lib ${dst_dir} - DEPS mklml - ) + copy(inference_lib_dist + SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_SHARED_LIB} + ${MKLML_SHARED_LIB_DEPS} ${MKLML_SHARED_IOMP_LIB} ${MKLML_INC_DIR} + DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}/lib + ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}) else() - copy(mklml_lib - SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR} - DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir} - DEPS mklml - ) + copy(inference_lib_dist + SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR} + DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}) endif() elseif (NOT CBLAS_FOUND OR WIN32) - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/openblas") - copy(openblas_lib + set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/openblas") + copy(inference_lib_dist SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include - DSTS ${dst_dir} ${dst_dir} - DEPS extern_openblas - ) + DSTS ${dst_dir} ${dst_dir}) endif () -if (WITH_MKLDNN) - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mkldnn") - if(WIN32) - copy(mkldnn_lib - SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} ${MKLDNN_LIB} - DSTS ${dst_dir} ${dst_dir}/lib ${dst_dir}/lib - DEPS mkldnn_shared_lib - ) - else() - copy(mkldnn_lib - SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS mkldnn_shared_lib - ) - endif() -endif () - -if (WITH_NGRAPH) - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/ngraph") - copy(ngraph_lib - SRCS ${NGRAPH_INC_DIR} ${NGRAPH_LIB_DIR} - DSTS ${dst_dir} ${dst_dir} - DEPS ngraph - ) -endif () +if(WITH_MKLDNN) +set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/mkldnn") +if(WIN32) + copy(inference_lib_dist + SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} ${MKLDNN_LIB} + DSTS ${dst_dir} ${dst_dir}/lib ${dst_dir}/lib) +else() + copy(inference_lib_dist + SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB} + DSTS ${dst_dir} ${dst_dir}/lib) +endif() +endif() + +set(dst_dir 
"${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/gflags") +copy(inference_lib_dist + SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib) -set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy") -copy(snappy_lib - SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS snappy) +set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/glog") +copy(inference_lib_dist + SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib) -set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream") -copy(snappystream_lib - SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS snappystream) +set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/xxhash") +copy(inference_lib_dist + SRCS ${XXHASH_INCLUDE_DIR} ${XXHASH_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib) -set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib") -copy(zlib_lib +set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/zlib") +copy(inference_lib_dist SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS zlib) - -# paddle fluid module -set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") -set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid") -set(module "framework") -if (NOT WIN32) - set(framework_lib_deps framework_py_proto) -endif (NOT WIN32) - -copy(framework_lib DEPS ${framework_lib_deps} - SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/data_feed.pb.h ${src_dir}/${module}/ir/memory_optimize_pass/*.h - ${src_dir}/${module}/ir/*.h ${src_dir}/${module}/fleet/*.h - DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}/ir/memory_optimize_pass ${dst_dir}/${module}/ir ${dst_dir}/${module}/fleet - ) + DSTS ${dst_dir} ${dst_dir}/lib) -set(module "memory") -copy(memory_lib - SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h ${src_dir}/${module}/allocation/*.h - DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail ${dst_dir}/${module}/allocation - ) - -set(inference_deps paddle_fluid_shared paddle_fluid) +if (NOT PROTOBUF_FOUND OR WIN32) + set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/protobuf") + copy(inference_lib_dist + SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY} + DSTS ${dst_dir} ${dst_dir}/lib) +endif () -set(module "inference/api") +if (WITH_NGRAPH) + set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/ngraph") + copy(inference_lib_dist + SRCS ${NGRAPH_INC_DIR} ${NGRAPH_LIB_DIR} + DSTS ${dst_dir} ${dst_dir}) +endif () if (TENSORRT_FOUND) - copy(tensorrt_lib DEPS ${inference_deps} - SRCS ${TENSORRT_ROOT}/include/Nv*.h ${TENSORRT_ROOT}/lib/libnvinfer* - DSTS ${FLUID_INSTALL_DIR}/third_party/install/tensorrt/include ${FLUID_INSTALL_DIR}/third_party/install/tensorrt/lib) + set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/tensorrt") + copy(inference_lib_dist + SRCS ${TENSORRT_ROOT}/include/Nv*.h ${TENSORRT_ROOT}/lib/*nvinfer* + DSTS ${dst_dir}/include ${dst_dir}/lib) endif () if (ANAKIN_FOUND) - copy(anakin_lib DEPS ${inference_deps} + set(dst_dir "${FLUID_INFERENCE_INSTALL_DIR}/third_party/install/anakin") + copy(inference_lib_dist SRCS ${ANAKIN_ROOT}/* - DSTS ${FLUID_INSTALL_DIR}/third_party/install/anakin) + DSTS ${dst_dir}) endif () -set(module "inference") +copy(inference_lib_dist + SRCS 
${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt + DSTS ${FLUID_INFERENCE_INSTALL_DIR}) + +set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") if(WIN32) set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/libpaddle_fluid.*) else(WIN32) set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*) endif(WIN32) -copy(inference_lib DEPS ${inference_deps} - SRCS ${src_dir}/${module}/*.h ${paddle_fluid_lib} - ${src_dir}/${module}/api/paddle_*.h + +copy(inference_lib_dist + SRCS ${src_dir}/inference/api/paddle_*.h ${paddle_fluid_lib} + DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib) + + +# fluid library for both train and inference +set(fluid_lib_deps inference_lib_dist) +add_custom_target(fluid_lib_dist ALL DEPENDS ${fluid_lib_deps}) + +set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid") +set(module "inference") +copy(fluid_lib_dist + SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/api/paddle_*.h ${paddle_fluid_lib} DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} +) + +set(module "framework") +set(framework_lib_deps framework_proto) +add_dependencies(fluid_lib_dist ${framework_lib_deps}) +copy(fluid_lib_dist + SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/data_feed.pb.h ${src_dir}/${module}/ir/memory_optimize_pass/*.h + ${src_dir}/${module}/ir/*.h ${src_dir}/${module}/fleet/*.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}/ir/memory_optimize_pass ${dst_dir}/${module}/ir ${dst_dir}/${module}/fleet) + +set(module "memory") +copy(fluid_lib_dist + SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h ${src_dir}/${module}/allocation/*.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail ${dst_dir}/${module}/allocation ) set(module "platform") -copy(platform_lib DEPS profiler_py_proto - SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h - DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details +set(platform_lib_deps profiler_proto) +add_dependencies(fluid_lib_dist ${platform_lib_deps}) +copy(fluid_lib_dist + SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/platform/profiler.pb.h + DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details ${dst_dir}/${module} ) set(module "string") -copy(string_lib +copy(fluid_lib_dist SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/tinyformat/*.h DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat ) set(module "pybind") -copy(pybind_lib +copy(fluid_lib_dist SRCS ${CMAKE_CURRENT_BINARY_DIR}/paddle/fluid/${module}/pybind.h DSTS ${dst_dir}/${module} ) # CMakeCache Info -copy(cmake_cache - SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt - DSTS ${FLUID_INSTALL_DIR}) - -# This command generates a complete fluid library for both train and inference -add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep}) - -# Following commands generate a inference-only fluid library -# third_party, version.txt and CMakeCache.txt are the same position with ${FLUID_INSTALL_DIR} -copy(third_party DEPS fluid_lib_dist - SRCS ${FLUID_INSTALL_DIR}/third_party ${FLUID_INSTALL_DIR}/CMakeCache.txt - DSTS ${FLUID_INFERENCE_INSTALL_DIR} ${FLUID_INFERENCE_INSTALL_DIR} +copy(fluid_lib_dist + SRCS 
${FLUID_INFERENCE_INSTALL_DIR}/third_party ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt + DSTS ${FLUID_INSTALL_DIR} ${FLUID_INSTALL_DIR} ) -# only need libpaddle_fluid.so/a and paddle_*.h for inference-only library -copy(inference_api_lib DEPS fluid_lib_dist - SRCS ${paddle_fluid_lib} - ${FLUID_INSTALL_DIR}/paddle/fluid/inference/paddle_*.h - DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include -) - -add_custom_target(inference_lib_dist DEPENDS third_party inference_api_lib) - # paddle fluid version function(version version_file) execute_process( diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 134c8943..28e880fb 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -110,7 +110,7 @@ function(op_library TARGET) # Define operators that don't need pybind here. foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op" -"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op" "deformable_conv_op" "dgc_op") +"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op") if ("${TARGET}" STREQUAL "${manual_pybind_op}") set(pybind_flag 1) endif() @@ -191,9 +191,6 @@ function(op_library TARGET) file(APPEND ${pybind_file} "USE_OP(fake_quantize_abs_max);\n") elseif(${TARGET} STREQUAL "tensorrt_engine_op") message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference") - elseif(${TARGET} STREQUAL "fc") - # HACK: fc only have mkldnn and cpu, which would mismatch the cpu only condition - file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n") else() file(APPEND ${pybind_file} "USE_OP(${TARGET});\n") endif() diff --git a/cmake/package.cmake b/cmake/package.cmake deleted file mode 100644 index 79e02147..00000000 --- a/cmake/package.cmake +++ /dev/null @@ -1,21 +0,0 @@ -set(CPACK_PACKAGE_NAME paddle) -set(CPACK_PACKAGE_VERSION_MAJOR ${PADDLE_MAJOR_VERSION}) -set(CPACK_PACKAGE_VERSION_MINOR ${PADDLE_MINOR_VERSION}) -set(CPACK_PACKAGE_VERSION_PATCH ${PADDLE_PATCH_VERSION}) -set(CPACK_PACKAGE_VERSION ${PADDLE_VERSION}) -## DEB Settings -set(CPACK_DEBIAN_PACKAGE_NAME paddle) -set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE amd64) -set(CPACK_DEBIAN_PACKAGE_MAINTAINER PaddlePaddle Dev ) -set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Paddle") -set(CPACK_PACKAGE_DESCRIPTION "") -set(CPACK_DEBIAN_PACKAGE_DEPENDS "libpython2.7-dev, libstdc++6, python-pip, curl, libgfortran3, python-pip-whl") -set(CPACK_DEBIAN_PACKAGE_SECTION Devel) -set(CPACK_DEBIAN_PACKAGE_VERSION ${PADDLE_VERSION}) -set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PADDLE_SOURCE_DIR}/paddle/scripts/deb/postinst") -#set(CPACK_GENERATOR "DEB") -# Start cpack -include (CMakePackageConfigHelpers) -include (CPack) - - diff --git a/cmake/tensorrt.cmake b/cmake/tensorrt.cmake index 3bf12094..fc97fcbf 100644 --- a/cmake/tensorrt.cmake +++ b/cmake/tensorrt.cmake @@ -2,14 +2,28 @@ if(NOT WITH_GPU) return() endif() -set(TENSORRT_ROOT "/usr" CACHE PATH "TENSORRT ROOT") +if(WIN32) + if("${TENSORRT_ROOT}" STREQUAL "") + message(WARNING "Please specify the TensorRT root path: TENSORRT_ROOT.") + endif() + string(REPLACE "\\" "/" TENSORRT_ROOT "${TENSORRT_ROOT}") + set(TR_INFER_LIB nvinfer.lib) + set(TR_INFER_RT nvinfer.dll) + set(TR_INFER_PLUGIN_RT nvinfer_plugin.dll) +else() + set(TENSORRT_ROOT "/usr" CACHE PATH "TENSORRT ROOT") + set(TR_INFER_LIB libnvinfer.a) + set(TR_INFER_RT libnvinfer.so) + set(TR_INFER_PLUGIN_RT 
libnvinfer_plugin.so) +endif() + find_path(TENSORRT_INCLUDE_DIR NvInfer.h PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/include $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/include NO_DEFAULT_PATH ) -find_library(TENSORRT_LIBRARY NAMES libnvinfer.so libnvinfer.a +find_library(TENSORRT_LIBRARY NAMES ${TR_INFER_LIB} ${TR_INFER_RT} PATHS ${TENSORRT_ROOT} ${TENSORRT_ROOT}/lib $ENV{TENSORRT_ROOT} $ENV{TENSORRT_ROOT}/lib NO_DEFAULT_PATH diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 7c31b5b4..77ad4f54 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -1,7 +1,7 @@ paddle.fluid.Program ('paddle.fluid.framework.Program', ('document', '7364a01d7b9132a435e46162c7fbd6c6')) paddle.fluid.Program.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.Program.block (ArgSpec(args=['self', 'index'], varargs=None, keywords=None, defaults=None), ('document', '86cd9499e226be661a3d686260ee1150')) -paddle.fluid.Program.clone (ArgSpec(args=['self', 'for_test'], varargs=None, keywords=None, defaults=(False,)), ('document', 'a65221387f84c74eee5130d7678ca900')) +paddle.fluid.Program.clone (ArgSpec(args=['self', 'for_test'], varargs=None, keywords=None, defaults=(False,)), ('document', '11777d4121a64566a746e55497a4b78c')) paddle.fluid.Program.current_block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'd601c7719e425e3d9cf862ea4ad194ca')) paddle.fluid.Program.global_block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'd64ea1dc96e9f674499ea3006d470aa4')) paddle.fluid.Program.list_vars (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '32c14b0f12baae4b352200fa09b5e789')) @@ -34,11 +34,11 @@ paddle.fluid.DistributeTranspiler.transpile (ArgSpec(args=['self', 'trainer_id', paddle.fluid.memory_optimize (ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level', 'skip_grads'], varargs=None, keywords=None, defaults=(None, False, 0, True)), ('document', '2348247f684bfd5bb9466470f35be064')) paddle.fluid.release_memory (ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd38c5b8b2b2e0bb19bcf1b581a80a7e4')) paddle.fluid.DistributeTranspilerConfig ('paddle.fluid.transpiler.distribute_transpiler.DistributeTranspilerConfig', ('document', '550b8c767a8ae1a2eb74b18924ddc975')) -paddle.fluid.DistributeTranspilerConfig.__init__ +paddle.fluid.DistributeTranspilerConfig.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.ParallelExecutor ('paddle.fluid.parallel_executor.ParallelExecutor', ('document', '2b4d2e859f2e0c6161f4fed995f7956d')) paddle.fluid.ParallelExecutor.__init__ (ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.ParallelExecutor.drop_local_exe_scopes (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '77c739744ea5708b80fb1b37cc89db40')) -paddle.fluid.ParallelExecutor.run (ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '33ce6ec50f8eeb05d340e6b114b026fd')) 
+paddle.fluid.ParallelExecutor.run (ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '0af092676e5b1320bb4232396154ce4b')) paddle.fluid.create_lod_tensor (ArgSpec(args=['data', 'recursive_seq_lens', 'place'], varargs=None, keywords=None, defaults=None), ('document', 'b82ea20e2dc5ff2372e0643169ca47ff')) paddle.fluid.create_random_int_lodtensor (ArgSpec(args=['recursive_seq_lens', 'base_shape', 'place', 'low', 'high'], varargs=None, keywords=None, defaults=None), ('document', '74dc6d23185d90a7a50fbac19f5b65fb')) paddle.fluid.DataFeedDesc ('paddle.fluid.data_feed_desc.DataFeedDesc', ('document', '43877a0d9357db94d3dbc7359cbe8c73')) @@ -47,10 +47,9 @@ paddle.fluid.DataFeedDesc.desc (ArgSpec(args=['self'], varargs=None, keywords=No paddle.fluid.DataFeedDesc.set_batch_size (ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', 'a34790bff4a2891713ddd644db56418d')) paddle.fluid.DataFeedDesc.set_dense_slots (ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None), ('document', 'fdd07ce63e72bed57f2c0db5bec5720f')) paddle.fluid.DataFeedDesc.set_use_slots (ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None), ('document', 'c23a79dfa04edd014b477bd4b183da06')) -paddle.fluid.CompiledProgram ('paddle.fluid.compiler.CompiledProgram', ('document', '6c45b5ccc24ae62d10115ce8abdc29a5')) -paddle.fluid.CompiledProgram.__init__ (ArgSpec(args=['self', 'program_or_graph'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.CompiledProgram.with_data_parallel (ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from', 'places'], varargs=None, keywords=None, defaults=(None, None, None, None, None)), ('document', '0e17773521634ef798fddd7d2ea3ef96')) -paddle.fluid.CompiledProgram.with_inference_optimize (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=None), ('document', '9e5b009d850191a010e859189c127fd8')) +paddle.fluid.CompiledProgram ('paddle.fluid.compiler.CompiledProgram', ('document', '598d294107d44d7620bce76527a92c37')) +paddle.fluid.CompiledProgram.__init__ (ArgSpec(args=['self', 'program_or_graph', 'build_strategy'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.CompiledProgram.with_data_parallel (ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from', 'places'], varargs=None, keywords=None, defaults=(None, None, None, None, None)), ('document', '1c7c6171bbf6d77f2fce0166aa0ec43b')) paddle.fluid.ExecutionStrategy ('paddle.fluid.core_avx.ExecutionStrategy', ('document', '535ce28c4671176386e3cd283a764084')) paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core_avx.ParallelExecutor.ExecutionStrategy) -> None paddle.fluid.BuildStrategy ('paddle.fluid.core_avx.BuildStrategy', ('document', 'eec64b9b7cba58b0a63687b4c34ffe56')) @@ -68,47 +67,70 @@ paddle.fluid.io.load_params (ArgSpec(args=['executor', 'dirname', 'main_program' paddle.fluid.io.load_persistables (ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'cfa84ef7c5435625bff4cc132cb8a0e3')) paddle.fluid.io.save_inference_model (ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 
'export_for_deployment', 'program_only'], varargs=None, keywords=None, defaults=(None, None, None, True, False)), ('document', 'fc82bfd137a9b1ab8ebd1651bd35b6e5')) paddle.fluid.io.load_inference_model (ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename', 'pserver_endpoints'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '2f54d7c206b62f8c10f4f9d78c731cfd')) -paddle.fluid.io.PyReader ('paddle.fluid.reader.PyReader', ('document', 'e37efae53f3935b32aec37eda9f3d906')) +paddle.fluid.io.batch (ArgSpec(args=['reader', 'batch_size', 'drop_last'], varargs=None, keywords=None, defaults=(False,)), ('document', 'cf2869b408b39cadadd95206b4e03b39')) +paddle.fluid.io.PyReader ('paddle.fluid.reader.PyReader', ('document', 'b03399246f69cd6fc03b43e87af8bd4e')) paddle.fluid.io.PyReader.__init__ (ArgSpec(args=['self', 'feed_list', 'capacity', 'use_double_buffer', 'iterable', 'return_list'], varargs=None, keywords=None, defaults=(None, None, True, True, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.io.PyReader.decorate_batch_generator (ArgSpec(args=['self', 'reader', 'places'], varargs=None, keywords=None, defaults=(None,)), ('document', '4364e836e3cb8ab5e68e411b763c50c7')) paddle.fluid.io.PyReader.decorate_sample_generator (ArgSpec(args=['self', 'sample_generator', 'batch_size', 'drop_last', 'places'], varargs=None, keywords=None, defaults=(True, None)), ('document', 'efa4c8b90fe6d99dcbda637b70351bb1')) paddle.fluid.io.PyReader.decorate_sample_list_generator (ArgSpec(args=['self', 'reader', 'places'], varargs=None, keywords=None, defaults=(None,)), ('document', '6c11980092720de304863de98074a64a')) +paddle.fluid.io.PyReader.next (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '08b2fd1463f3ea99d79d17303988349b')) paddle.fluid.io.PyReader.reset (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '7432197701fdaab1848063860dc0b97e')) -paddle.fluid.io.PyReader.start (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'f6395fd95b025000c5c7a5be31aebc4e')) +paddle.fluid.io.PyReader.start (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'a0983fb21a0a51e6a31716009fe9a9c1')) +paddle.fluid.io.DataLoader ('paddle.fluid.reader.DataLoader', ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.io.DataLoader.__init__ +paddle.fluid.io.DataLoader.from_dataset (ArgSpec(args=['dataset', 'places', 'drop_last'], varargs=None, keywords=None, defaults=(True,)), ('document', '58e8bffa033f26b00b256c8bb1daff11')) +paddle.fluid.io.DataLoader.from_generator (ArgSpec(args=['feed_list', 'capacity', 'use_double_buffer', 'iterable', 'return_list'], varargs=None, keywords=None, defaults=(None, None, True, True, False)), ('document', '8034bdb488fa18d60c4ffb0ba9658337')) +paddle.fluid.io.cache (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', '1676886070eb607cb608f7ba47be0d3c')) +paddle.fluid.io.map_readers (ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None), ('document', '77cbadb09df588e21e5cc0819b69c87d')) +paddle.fluid.io.buffered (ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None), ('document', '0d6186f109feceb99f60ec50a0a624cb')) +paddle.fluid.io.compose (ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None), ('document', '884291104e1c3f37f33aae44b7deeb0d')) +paddle.fluid.io.chain (ArgSpec(args=[], varargs='readers', keywords=None, 
defaults=None), ('document', 'd22c34e379a53901ae67a6bca7f4def4')) +paddle.fluid.io.shuffle (ArgSpec(args=['reader', 'buf_size'], varargs=None, keywords=None, defaults=None), ('document', 'e42ea6fee23ce26b23cb142cd1d6522d')) +paddle.fluid.io.firstn (ArgSpec(args=['reader', 'n'], varargs=None, keywords=None, defaults=None), ('document', 'c5bb8f7dd4f917f1569a368aab5b8aad')) +paddle.fluid.io.xmap_readers (ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,)), ('document', '9c804a42f8a4dbaa76b3c98e0ab7f796')) +paddle.fluid.io.PipeReader ('paddle.reader.decorator.PipeReader', ('document', 'd3c250618f98c1a5fb646f869016a98e')) +paddle.fluid.io.PipeReader.__init__ (ArgSpec(args=['self', 'command', 'bufsize', 'file_type'], varargs=None, keywords=None, defaults=(8192, 'plain')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.io.PipeReader.get_line (ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n')), ('document', '9621ae612e595b6c34eb3bb5f3eb1a45')) +paddle.fluid.io.multiprocess_reader (ArgSpec(args=['readers', 'use_pipe', 'queue_size'], varargs=None, keywords=None, defaults=(True, 1000)), ('document', '7d8b3a96e592107c893d5d51ce968ba0')) +paddle.fluid.io.Fake ('paddle.reader.decorator.Fake', ('document', '0d8f4847b99bed6d456ade0d903202e1')) +paddle.fluid.io.Fake.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.initializer.ConstantInitializer ('paddle.fluid.initializer.ConstantInitializer', ('document', '798f1fd87cbe9798d001ffb6e616415d')) paddle.fluid.initializer.ConstantInitializer.__init__ (ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.initializer.UniformInitializer ('paddle.fluid.initializer.UniformInitializer', ('document', 'a8f1177e4ce29766853e801d5b0a3635')) -paddle.fluid.initializer.UniformInitializer.__init__ (ArgSpec(args=['self', 'low', 'high', 'seed'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.initializer.NormalInitializer ('paddle.fluid.initializer.NormalInitializer', ('document', '2171207fb07293603e0fd2ff01234b3e')) +paddle.fluid.initializer.UniformInitializer ('paddle.fluid.initializer.UniformInitializer', ('document', '587b7035cd1d56f76f2ded617b92521d')) +paddle.fluid.initializer.UniformInitializer.__init__ (ArgSpec(args=['self', 'low', 'high', 'seed', 'diag_num', 'diag_step', 'diag_val'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0, 0, 0, 1.0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.initializer.NormalInitializer ('paddle.fluid.initializer.NormalInitializer', ('document', '279a0d89bf01138fbf4c4ba14f22099b')) paddle.fluid.initializer.NormalInitializer.__init__ (ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.initializer.TruncatedNormalInitializer ('paddle.fluid.initializer.TruncatedNormalInitializer', ('document', 'b8e90aad6ee5687cb5f2b6fd404370d1')) paddle.fluid.initializer.TruncatedNormalInitializer.__init__ (ArgSpec(args=['self', 'loc', 'scale', 'seed'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.initializer.XavierInitializer 
('paddle.fluid.initializer.XavierInitializer', ('document', '3d5676f1a5414aa0c815d793a795ccb3'))
 paddle.fluid.initializer.XavierInitializer.__init__ (ArgSpec(args=['self', 'uniform', 'fan_in', 'fan_out', 'seed'], varargs=None, keywords=None, defaults=(True, None, None, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.initializer.BilinearInitializer ('paddle.fluid.initializer.BilinearInitializer', ('document', '5646a5cd44f0c9111344d13f46d31169'))
+paddle.fluid.initializer.BilinearInitializer ('paddle.fluid.initializer.BilinearInitializer', ('document', '8a40b54fe33c19c3edcf6624ffae5d03'))
 paddle.fluid.initializer.BilinearInitializer.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'd389912dc079cbef432335a00017cec0'))
-paddle.fluid.initializer.MSRAInitializer ('paddle.fluid.initializer.MSRAInitializer', ('document', 'ecfadb28c52d01496d107835a69ec3f9'))
+paddle.fluid.initializer.MSRAInitializer ('paddle.fluid.initializer.MSRAInitializer', ('document', 'b99e0ee95e2fd02640cb4b08a7ae80cc'))
 paddle.fluid.initializer.MSRAInitializer.__init__ (ArgSpec(args=['self', 'uniform', 'fan_in', 'seed'], varargs=None, keywords=None, defaults=(True, None, 0)), ('document', '53c757bed9345f2ad3361902531e7cf5'))
-paddle.fluid.initializer.force_init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '53c01b661feb8e60d0efa2066976c1a8'))
-paddle.fluid.initializer.init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '68bebc3963526880a07c98a5d6226794'))
+paddle.fluid.initializer.force_init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '5f55553caf939d270c7fe8dc418084b2'))
+paddle.fluid.initializer.init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'eaa04fd68661a3af59abd0e19b3b6eda'))
 paddle.fluid.initializer.NumpyArrayInitializer ('paddle.fluid.initializer.NumpyArrayInitializer', ('document', '064f134a27c16372967d450f499762ab'))
 paddle.fluid.initializer.NumpyArrayInitializer.__init__ (ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.fc (ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None)), ('document', '1c74f52549814235077ecc34856a95eb'))
-paddle.fluid.layers.embedding (ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')), ('document', '1b4916f765620374ad0fdefe5a352993'))
+paddle.fluid.embedding (ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')), ('document', 'd4ac047e0d5e6b7b1c5ff6ef7d7cfff5'))
+paddle.fluid.one_hot (ArgSpec(args=['input', 'depth', 'allow_out_of_range'], varargs=None, keywords=None, defaults=(False,)), ('document', 'eef66730acc806088f9e8ba90252bda1'))
+paddle.fluid.layers.fc (ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, None)), ('document', '0dc8181f14a33f91fbae9385a9b3d9fd'))
+paddle.fluid.layers.center_loss (ArgSpec(args=['input', 'label', 'num_classes', 'alpha', 'param_attr', 'update_center'], varargs=None, keywords=None, defaults=(True,)), ('document', '7129819d94625c6104054e8187768589'))
+paddle.fluid.layers.embedding (ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')), ('document', 'd8e405486a1e4e189b51d6ee28d67b1e'))
 paddle.fluid.layers.dynamic_lstm (ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)), ('document', '6d3ee14da70adfa36d85c40b18716ef2'))
 paddle.fluid.layers.dynamic_lstmp (ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name', 'h_0', 'c_0', 'cell_clip', 'proj_clip'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None, None, None, None, None)), ('document', 'c37d51aad655c8a9f9b045c64717320a'))
 paddle.fluid.layers.dynamic_gru (ArgSpec(args=['input', 'size', 'param_attr', 'bias_attr', 'is_reverse', 'gate_activation', 'candidate_activation', 'h_0', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, False, 'sigmoid', 'tanh', None, False)), ('document', '83617c165827e030636c80486d5de6f3'))
 paddle.fluid.layers.gru_unit (ArgSpec(args=['input', 'hidden', 'size', 'param_attr', 'bias_attr', 'activation', 'gate_activation', 'origin_mode'], varargs=None, keywords=None, defaults=(None, None, 'tanh', 'sigmoid', False)), ('document', '33974b9bfa69f2f1eb85e6f956dff04e'))
-paddle.fluid.layers.linear_chain_crf (ArgSpec(args=['input', 'label', 'param_attr'], varargs=None, keywords=None, defaults=(None,)), ('document', '34f96be41684b0959897a9e735997e20'))
-paddle.fluid.layers.crf_decoding (ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c469f22029f7b5d41ecd44dfa1e81ffd'))
+paddle.fluid.layers.linear_chain_crf (ArgSpec(args=['input', 'label', 'param_attr', 'length'], varargs=None, keywords=None, defaults=(None, None)), ('document', '9045b8971e4232132ec9952695f4c3ae'))
+paddle.fluid.layers.crf_decoding (ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,)), ('document', '5ce117258e243be1c81539e254178d90'))
 paddle.fluid.layers.cos_sim (ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None), ('document', '8e6ce424cf9e261ef32ee229c06a6e66'))
 paddle.fluid.layers.cross_entropy (ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)), ('document', 'f43c659ca1749a3f0ff2231e6dfda07d'))
 paddle.fluid.layers.bpr_loss (ArgSpec(args=['input', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6263dfdeb6c670fa0922c9cbc8fb1bf4'))
 paddle.fluid.layers.square_error_cost (ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None), ('document', 'bbb9e708bab250359864fefbdf48e9d9'))
 paddle.fluid.layers.chunk_eval (ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types', 'seq_length'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'b02844e0ad4bd713c5fe6802aa13219c'))
-paddle.fluid.layers.sequence_conv (ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None)), ('document', '3d8e8f3e0e1cf520156be37605e83ccd'))
-paddle.fluid.layers.conv2d (ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)), ('document', '114c7fe6b0adfc6d6371122f9b9f506e'))
-paddle.fluid.layers.conv3d (ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)), ('document', '367293b5bada54136a91621078d38334'))
+paddle.fluid.layers.sequence_conv (ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'padding_start', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, True, None, None, None, None, None)), ('document', '2bf23e7884c380c3b27f2709aa322cb9'))
+paddle.fluid.layers.conv2d (ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)), ('document', '06de9adb5994f6f8cb806c75b55550af'))
+paddle.fluid.layers.conv3d (ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)), ('document', '71b09227709475fa178c1739dff64af6'))
 paddle.fluid.layers.sequence_pool (ArgSpec(args=['input', 'pool_type', 'is_test', 'pad_value'], varargs=None, keywords=None, defaults=(False, 0.0)), ('document', 'e90a93251c52dc4e6fb34fb3991b3f82'))
 paddle.fluid.layers.sequence_softmax (ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', 'eaa9d0bbd3d4e017c8bc4ecdac483711'))
 paddle.fluid.layers.softmax (ArgSpec(args=['input', 'use_cudnn', 'name', 'axis'], varargs=None, keywords=None, defaults=(False, None, -1)), ('document', 'cee673c79e3ff4582656a24e04f841e5'))
@@ -116,15 +138,16 @@ paddle.fluid.layers.pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'po
 paddle.fluid.layers.pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)), ('document', '053b1a855f13a066d005759171724bc6'))
 paddle.fluid.layers.adaptive_pool2d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '52343203de40afe29607397e13aaf0d2'))
 paddle.fluid.layers.adaptive_pool3d (ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)), ('document', '55db6ae7275fb9678a6814aebab81a9c'))
-paddle.fluid.layers.batch_norm (ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)), ('document', '4cc22c3553e73a958e8b9a240d894431'))
+paddle.fluid.layers.batch_norm (ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)), ('document', '9e5a9f4f6d82d34a33d9ca632379cbcc'))
+paddle.fluid.layers.instance_norm (ArgSpec(args=['input', 'epsilon', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None)), ('document', '02972097e089629efdb0ed9404fd36ae'))
 paddle.fluid.layers.data_norm (ArgSpec(args=['input', 'act', 'epsilon', 'param_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var'], varargs=None, keywords=None, defaults=(None, 1e-05, None, 'NCHW', False, None, None, None, False)), ('document', '2460b30fb87037555208fa8ac6fc1787'))
 paddle.fluid.layers.beam_search_decode (ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '83e08f21af41ac8bac37aeab1f86fdd0'))
-paddle.fluid.layers.conv2d_transpose (ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)), ('document', '903ac9a778e0bf1bf649bd71e9d0ba0c'))
-paddle.fluid.layers.conv3d_transpose (ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)), ('document', '9b1f13c1fc872f76f8f84cf11e955f53'))
+paddle.fluid.layers.conv2d_transpose (ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)), ('document', 'ab58296b567bf0c686084add7f3280a4'))
+paddle.fluid.layers.conv3d_transpose (ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)), ('document', 'fe15dbfb17d97d3d29b2fa7ee6390ee6'))
 paddle.fluid.layers.sequence_expand (ArgSpec(args=['x', 'y', 'ref_level', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '10e122eb755c2bd1f78ef2332b28f1a0'))
 paddle.fluid.layers.sequence_expand_as (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '858c432e7cbd8bb952cc2eb555457d50'))
-paddle.fluid.layers.sequence_pad (ArgSpec(args=['x', 'pad_value', 'maxlen', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '1ba3ccfe13ed5091e113c09c13dc3a20'))
-paddle.fluid.layers.sequence_unpad (ArgSpec(args=['x', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7f5ce36fb0016621e6bc001f4236d978'))
+paddle.fluid.layers.sequence_pad (ArgSpec(args=['x', 'pad_value', 'maxlen', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'df08b9c499ab3a90f95d08ab5b6c6c62'))
+paddle.fluid.layers.sequence_unpad (ArgSpec(args=['x', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e478180d5bc010a84f35af958cafa62c'))
 paddle.fluid.layers.lstm_unit (ArgSpec(args=['x_t', 'hidden_t_prev', 'cell_t_prev', 'forget_bias', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(0.0, None, None, None)), ('document', 'fe126c58e4339410e875ab1eba246d21'))
 paddle.fluid.layers.reduce_sum (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'dd5f06fb7cf39ca06cbab4abd03e6893'))
 paddle.fluid.layers.reduce_mean (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'a3024789eba11a70c2ef27c358173400'))
@@ -138,60 +161,66 @@ paddle.fluid.layers.sequence_last_step (ArgSpec(args=['input'], varargs=None, ke
 paddle.fluid.layers.sequence_slice (ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '39fbc5437be389f6c0c769f82fc1fba2'))
 paddle.fluid.layers.dropout (ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name', 'dropout_implementation'], varargs=None, keywords=None, defaults=(False, None, None, 'downgrade_in_infer')), ('document', '558d13133596209190df9a624264f28f'))
 paddle.fluid.layers.split (ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '78cf3a7323d1a7697658242e13f63759'))
-paddle.fluid.layers.ctc_greedy_decoder (ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '2bc3a59efa9d52b628a6255422d9f0e8'))
+paddle.fluid.layers.ctc_greedy_decoder (ArgSpec(args=['input', 'blank', 'input_length', 'padding_value', 'name'], varargs=None, keywords=None, defaults=(None, 0, None)), ('document', '9abb7bb8d267e017620a39a146dc47ea'))
 paddle.fluid.layers.edit_distance (ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens', 'input_length', 'label_length'], varargs=None, keywords=None, defaults=(True, None, None, None)), ('document', '77cbfb28cd2fc589f589c7013c5086cd'))
 paddle.fluid.layers.l2_normalize (ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None)), ('document', 'c1df110ea65998984f564c5c10abc54a'))
-paddle.fluid.layers.matmul (ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None)), ('document', 'fa2081f6e731bb9de7cd535ca07f523a'))
+paddle.fluid.layers.matmul (ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None)), ('document', '3720b4a386585094435993deb028b592'))
 paddle.fluid.layers.topk (ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e50940f3ce5a08cc477b72f517491bf3'))
-paddle.fluid.layers.warpctc (ArgSpec(args=['input', 'label', 'blank', 'norm_by_times', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, False, False)), ('document', '4aa9df890b47eb67d5442f04aaf9eeec'))
+paddle.fluid.layers.warpctc (ArgSpec(args=['input', 'label', 'blank', 'norm_by_times', 'input_length', 'label_length'], varargs=None, keywords=None, defaults=(0, False, None, None)), ('document', 'a5be881ada816e47ea7a6ee4396da357'))
 paddle.fluid.layers.sequence_reshape (ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None), ('document', 'f568714a876425004aca4ea2d4a27701'))
 paddle.fluid.layers.transpose (ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '8e72db173d4c082e27cb11f31d8c9bfa'))
 paddle.fluid.layers.im2sequence (ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None)), ('document', '33134416fc27dd65a767e5f15116ee16'))
-paddle.fluid.layers.nce (ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False)), ('document', '11a544a6e3fd0482509712dd54377fa1'))
+paddle.fluid.layers.nce (ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False)), ('document', '83d4ca6dfb957912807f535756e76992'))
 paddle.fluid.layers.sampled_softmax_with_cross_entropy (ArgSpec(args=['logits', 'label', 'num_samples', 'num_true', 'remove_accidental_hits', 'use_customized_samples', 'customized_samples', 'customized_probabilities', 'seed'], varargs=None, keywords=None, defaults=(1, True, False, None, None, 0)), ('document', 'd4435a63d34203339831ee6a86ef9242'))
 paddle.fluid.layers.hsigmoid (ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False)), ('document', 'b83e7dfa81059b39bb137922dc914f50'))
 paddle.fluid.layers.beam_search (ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False)), ('document', '1270395ce97a4e1b556104abbb14f096'))
-paddle.fluid.layers.row_conv (ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)), ('document', '17485788fffe4e2d36dc58c2ac8d174e'))
+paddle.fluid.layers.row_conv (ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)), ('document', '1d8a1c8b686b55631ba1b77805e4eacf'))
 paddle.fluid.layers.multiplex (ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None), ('document', '2c4d1ae83da6ed35e3b36ba1b3b51d23'))
 paddle.fluid.layers.layer_norm (ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)), ('document', '79797f827d89ae72c77960e9696883a9'))
-paddle.fluid.layers.group_norm (ArgSpec(args=['input', 'groups', 'epsilon', 'param_attr', 'bias_attr', 'act', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None, 'NCHW', None)), ('document', '96b24820e8863d6044d5be4eaaddb9fd'))
+paddle.fluid.layers.group_norm (ArgSpec(args=['input', 'groups', 'epsilon', 'param_attr', 'bias_attr', 'act', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None, 'NCHW', None)), ('document', '65231cc8281815124934b1439fbb750c'))
 paddle.fluid.layers.spectral_norm (ArgSpec(args=['weight', 'dim', 'power_iters', 'eps', 'name'], varargs=None, keywords=None, defaults=(0, 1, 1e-12, None)), ('document', '9461e67095a6fc5d568fb2ce8fef66ff'))
 paddle.fluid.layers.softmax_with_cross_entropy (ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode', 'return_softmax', 'axis'], varargs=None, keywords=None, defaults=(False, -100, True, False, -1)), ('document', '54e1675aa0364f4a78fa72804ec0f413'))
 paddle.fluid.layers.smooth_l1 (ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'ecb75c1b00c4c76c98b482f633b7a10c'))
 paddle.fluid.layers.one_hot (ArgSpec(args=['input', 'depth', 'allow_out_of_range'], varargs=None, keywords=None, defaults=(False,)), ('document', 'ec4115591be842868c86b2e5334245c6'))
 paddle.fluid.layers.autoincreased_step_counter (ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1)), ('document', '98e7927f09ee2270535b29f048e481ec'))
-paddle.fluid.layers.reshape (ArgSpec(args=['x', 'shape', 'actual_shape', 'act', 'inplace', 'name'], varargs=None, keywords=None, defaults=(None, None, False, None)), ('document', '6196c9ec3075ca5a9c058ea1f8492256'))
+paddle.fluid.layers.reshape (ArgSpec(args=['x', 'shape', 'actual_shape', 'act', 'inplace', 'name'], varargs=None, keywords=None, defaults=(None, None, False, None)), ('document', 'ca73fdc4551c5765c92eb00f24874289'))
 paddle.fluid.layers.squeeze (ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ebbac07662a6e22e8e299ced880c7775'))
 paddle.fluid.layers.unsqueeze (ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b9bd3129d36a70e7c4385df51ff71c62'))
-paddle.fluid.layers.lod_reset (ArgSpec(args=['x', 'y', 'target_lod'], varargs=None, keywords=None, defaults=(None, None)), ('document', '9a72a7c8c80926150ea826e94efd7e9b'))
+paddle.fluid.layers.lod_reset (ArgSpec(args=['x', 'y', 'target_lod'], varargs=None, keywords=None, defaults=(None, None)), ('document', '74498d37dd622ac472cb36887fce09ea'))
+paddle.fluid.layers.lod_append (ArgSpec(args=['x', 'level'], varargs=None, keywords=None, defaults=None), ('document', '37663c7c179e920838a250ea0e28d909'))
 paddle.fluid.layers.lrn (ArgSpec(args=['input', 'n', 'k', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(5, 1.0, 0.0001, 0.75, None)), ('document', '73d297256da8954617996958d26ee93d'))
-paddle.fluid.layers.pad (ArgSpec(args=['x', 'paddings', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None)), ('document', '2f189f8ef61f1c23779e1593b78755c0'))
+paddle.fluid.layers.pad (ArgSpec(args=['x', 'paddings', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None)), ('document', '36b6e58678956585e5b30aa3de123a60'))
 paddle.fluid.layers.pad_constant_like (ArgSpec(args=['x', 'y', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0.0, None)), ('document', '95aa1972983f30fe9b5a3713e523e20f'))
 paddle.fluid.layers.label_smooth (ArgSpec(args=['label', 'prior_dist', 'epsilon', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, 0.1, 'float32', None)), ('document', '214f1dfbe95a628600bbe99e836319cf'))
-paddle.fluid.layers.roi_pool (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0)), ('document', 'ceedc8c22752c623d6e1ea2e8df0f43f'))
-paddle.fluid.layers.roi_align (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None)), ('document', '6f65342f646ef04ae705080a7dfee63f'))
+paddle.fluid.layers.roi_pool (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0)), ('document', '49368d724023a66b41b0071be41c0ba5'))
+paddle.fluid.layers.roi_align (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None)), ('document', '9a7a3b88a4fae41d58d3ca9b10ba0591'))
 paddle.fluid.layers.dice_loss (ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,)), ('document', '7e8e4bf1f0f8612961ed113e8af8f0c5'))
-paddle.fluid.layers.image_resize (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None, True, 1)), ('document', 'a29488d94d9a4bc4434d8a3529b4c6fe'))
+paddle.fluid.layers.image_resize (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape', 'align_corners', 'align_mode', 'data_format'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None, True, 1, 'NCHW')), ('document', 'd29d829607b5ff12924197a3ba296c89'))
 paddle.fluid.layers.image_resize_short (ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',)), ('document', 'bd97ebfe4bdf5110a5fcb8ecb626a447'))
-paddle.fluid.layers.resize_bilinear (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 1)), ('document', '548c7c2ead5771d15abbaad505f901e9'))
-paddle.fluid.layers.resize_nearest (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners'], varargs=None, keywords=None, defaults=(None, None, None, None, True)), ('document', 'b7d810d1e251c5957c1efa6aa699d2d0'))
+paddle.fluid.layers.resize_bilinear (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'align_mode', 'data_format'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 1, 'NCHW')), ('document', '44da7890c8a362a83a1c0902a1dc1e4d'))
+paddle.fluid.layers.resize_trilinear (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'align_mode', 'data_format'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 1, 'NCDHW')), ('document', '5b4d0f823f94c260fe5e6f7eec60a797'))
+paddle.fluid.layers.resize_nearest (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'data_format'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 'NCHW')), ('document', '0107a5cbae1aef3f381d3d769a6068eb'))
 paddle.fluid.layers.gather (ArgSpec(args=['input', 'index', 'overwrite'], varargs=None, keywords=None, defaults=(True,)), ('document', 'f985c9b66e3aec96fa753a8eb44c991c'))
+paddle.fluid.layers.gather_nd (ArgSpec(args=['input', 'index', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '3cc24f9cf135770aa6263dba25b457f9'))
 paddle.fluid.layers.scatter (ArgSpec(args=['input', 'index', 'updates', 'name', 'overwrite'], varargs=None, keywords=None, defaults=(None, True)), ('document', '69b22affd4a6326502af166f04c095ab'))
+paddle.fluid.layers.scatter_nd_add (ArgSpec(args=['ref', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c2fa5ee7484b52b95a28abf1d8827cd0'))
+paddle.fluid.layers.scatter_nd (ArgSpec(args=['index', 'updates', 'shape', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '14b5449ce42f8ff4ac4ce79b41c86cc5'))
 paddle.fluid.layers.sequence_scatter (ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'abe3f714120117a5a3d3e639853932bf'))
 paddle.fluid.layers.random_crop (ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,)), ('document', '042af0b8abea96b40c22f6e70d99e042'))
-paddle.fluid.layers.mean_iou (ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None), ('document', 'e3b6630ba43cb13dfeeb1601cb64d671'))
+paddle.fluid.layers.mean_iou (ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None), ('document', 'e714b4aa7993dfe9c1a38886875dbaac'))
 paddle.fluid.layers.relu (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0942c174f4f6fb274976d4357356f6a2'))
 paddle.fluid.layers.selu (ArgSpec(args=['x', 'scale', 'alpha', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'f93c61f5b0bf933cd425a64dca2c4fdd'))
 paddle.fluid.layers.log (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '02f668664e3bfc4df6c00d7363467140'))
-paddle.fluid.layers.crop (ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'ddf9837ee83e549119210a3d714d5f44'))
+paddle.fluid.layers.crop (ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'ba3621917d5beffd3d022b88fbf6dc46'))
+paddle.fluid.layers.crop_tensor (ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'cb855453e3506bf54c5c013616ffddfb'))
 paddle.fluid.layers.rank_loss (ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '8eb36596bb43d7a907d3397c7aedbdb3'))
 paddle.fluid.layers.margin_rank_loss (ArgSpec(args=['label', 'left', 'right', 'margin', 'name'], varargs=None, keywords=None, defaults=(0.1, None)), ('document', '6fc86ed23b420c8a0f6c043563cf3937'))
 paddle.fluid.layers.elu (ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(1.0, None)), ('document', '9af1926c06711eacef9e82d7a9e4d308'))
 paddle.fluid.layers.relu6 (ArgSpec(args=['x', 'threshold', 'name'], varargs=None, keywords=None, defaults=(6.0, None)), ('document', '538fc860b2a1734e118b94e4a1a3ee67'))
-paddle.fluid.layers.pow (ArgSpec(args=['x', 'factor', 'name'], varargs=None, keywords=None, defaults=(1.0, None)), ('document', '35fa2b79b1ae6968d4a69788051c1d27'))
+paddle.fluid.layers.pow (ArgSpec(args=['x', 'factor', 'name'], varargs=None, keywords=None, defaults=(1.0, None)), ('document', 'ca34f88ff61cf2a7f4c97a493d6000d0'))
 paddle.fluid.layers.stanh (ArgSpec(args=['x', 'scale_a', 'scale_b', 'name'], varargs=None, keywords=None, defaults=(0.6666666666666666, 1.7159, None)), ('document', '1e1efad868714425da15c785dfb533a1'))
 paddle.fluid.layers.hard_sigmoid (ArgSpec(args=['x', 'slope', 'offset', 'name'], varargs=None, keywords=None, defaults=(0.2, 0.5, None)), ('document', '607d79ca873bee40eed1c79a96611591'))
-paddle.fluid.layers.swish (ArgSpec(args=['x', 'beta', 'name'], varargs=None, keywords=None, defaults=(1.0, None)), ('document', 'b511609e3e0e8b636bf19f8b98249897'))
+paddle.fluid.layers.swish (ArgSpec(args=['x', 'beta', 'name'], varargs=None, keywords=None, defaults=(1.0, None)), ('document', 'e0dc7bc66cba939033bc028d7a62c5f4'))
 paddle.fluid.layers.prelu (ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '2da40e447716338affebfe058d05d9a9'))
 paddle.fluid.layers.brelu (ArgSpec(args=['x', 't_min', 't_max', 'name'], varargs=None, keywords=None, defaults=(0.0, 24.0, None)), ('document', '49580538249a52c857fce75c94ad8af7'))
 paddle.fluid.layers.leaky_relu (ArgSpec(args=['x', 'alpha', 'name'], varargs=None, keywords=None, defaults=(0.02, None)), ('document', '1eb3009c69060299ec87949ee0d4b9ae'))
@@ -203,7 +232,8 @@ paddle.fluid.layers.pad2d (ArgSpec(args=['input', 'paddings', 'mode', 'pad_value
 paddle.fluid.layers.unstack (ArgSpec(args=['x', 'axis', 'num'], varargs=None, keywords=None, defaults=(0, None)), ('document', 'b0c4ca08d4eb295189e1b107c920d093'))
 paddle.fluid.layers.sequence_enumerate (ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0, None)), ('document', 'b870fed41abd2aecf929ece65f555fa1'))
 paddle.fluid.layers.unique (ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=('int32',)), ('document', 'cab0b06e5683875f12f0efc62fa230a9'))
-paddle.fluid.layers.expand (ArgSpec(args=['x', 'expand_times', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '33bc4f6010282ffe044d77be7ba7c275'))
+paddle.fluid.layers.unique_with_counts (ArgSpec(args=['x', 'dtype'], varargs=None, keywords=None, defaults=('int32',)), ('document', '1cb59c65b41766116944b8ed1e6ad345'))
+paddle.fluid.layers.expand (ArgSpec(args=['x', 'expand_times', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7b97042c3ba55fb5fec6a06308523b73'))
 paddle.fluid.layers.sequence_concat (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b992616c1afbd6b0c2a897ac23036381'))
 paddle.fluid.layers.scale (ArgSpec(args=['x', 'scale', 'bias', 'bias_after_scale', 'act', 'name'], varargs=None, keywords=None, defaults=(1.0, 0.0, True, None, None)), ('document', '463e4713806e5adaa4d20a41e2218453'))
 paddle.fluid.layers.elementwise_add (ArgSpec(args=['x', 'y', 'axis', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, None, None)), ('document', '5c0fb7298aec32525f96d451ae4c2851'))
@@ -220,14 +250,15 @@ paddle.fluid.layers.gaussian_random (ArgSpec(args=['shape', 'mean', 'std', 'seed
 paddle.fluid.layers.sampling_id (ArgSpec(args=['x', 'min', 'max', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32')), ('document', 'c39b647b6cf08e058d96ee503d5284fe'))
 paddle.fluid.layers.gaussian_random_batch_size_like (ArgSpec(args=['input', 'shape', 'input_dim_idx', 'output_dim_idx', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0, 0, 0.0, 1.0, 0, 'float32')), ('document', 'b24d0b21361c4bb8ef2cec8c26fb12b2'))
 paddle.fluid.layers.sum (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', 'f4b60847cb0f1ae00823ba6fb1b11310'))
-paddle.fluid.layers.slice (ArgSpec(args=['input', 'axes', 'starts', 'ends'], varargs=None, keywords=None, defaults=None), ('document', '3ca6a761570d86e303e473afba99bb49'))
+paddle.fluid.layers.slice (ArgSpec(args=['input', 'axes', 'starts', 'ends'], varargs=None, keywords=None, defaults=None), ('document', '315b4870f294e33a27ecbdf440bed3ff'))
+paddle.fluid.layers.strided_slice (ArgSpec(args=['input', 'axes', 'starts', 'ends', 'strides'], varargs=None, keywords=None, defaults=None), ('document', '340d8d656272ea396b441aab848429a2'))
 paddle.fluid.layers.shape (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', 'bf61c8f79d795a8371bdb3b5468aa82b'))
 paddle.fluid.layers.rank (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', '096df0e0273145ab80ed119a4c294db3'))
 paddle.fluid.layers.size (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', 'cf2e156beae36378722666c4c33bebfe'))
-paddle.fluid.layers.logical_and (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '1d6777f61831c54bea3a0029e2118448'))
-paddle.fluid.layers.logical_or (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '4d51a5a453755e0eb8c5ff6910a00dca'))
-paddle.fluid.layers.logical_xor (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '1840f54c5bd5338bdf854980d47bf771'))
-paddle.fluid.layers.logical_not (ArgSpec(args=['x', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'd8fc1c5a5535736d4cd44c893a9701c9'))
+paddle.fluid.layers.logical_and (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '12db97c6c459c0f240ec7006737174f2'))
+paddle.fluid.layers.logical_or (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '15adbc561618b7db69671e02009bea67'))
+paddle.fluid.layers.logical_xor (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '77ccf37b710c507dd97e03f08ce8bb29'))
+paddle.fluid.layers.logical_not (ArgSpec(args=['x', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6e2fe8a322ec69811f6507d22acf8f9f'))
 paddle.fluid.layers.clip (ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0ce33756573c572da67302499455dbcd'))
 paddle.fluid.layers.clip_by_norm (ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '99a1b9012d9c4495efc89d69958c3be7'))
 paddle.fluid.layers.mean (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '597257fb94d0597c404a6a5c91ab5258'))
@@ -251,34 +282,28 @@ paddle.fluid.layers.shuffle_channel (ArgSpec(args=['x', 'group', 'name'], vararg
 paddle.fluid.layers.temporal_shift (ArgSpec(args=['x', 'seg_num', 'shift_ratio', 'name'], varargs=None, keywords=None, defaults=(0.25, None)), ('document', '13b1cdcb01f5ffdc26591ff9a2ec4669'))
 paddle.fluid.layers.py_func (ArgSpec(args=['func', 'x', 'out', 'backward_func', 'skip_vars_in_backward_input'], varargs=None, keywords=None, defaults=(None, None)), ('document', '8404e472ac12b4a30a505d3d3a3e5fdb'))
 paddle.fluid.layers.psroi_pool (ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '42d5155374f69786300d90d751956998'))
+paddle.fluid.layers.prroi_pool (ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(1.0, 1, 1, None)), ('document', '454c7ea8c73313dd41513929d7526303'))
 paddle.fluid.layers.teacher_student_sigmoid_loss (ArgSpec(args=['input', 'label', 'soft_max_up_bound', 'soft_max_lower_bound'], varargs=None, keywords=None, defaults=(15.0, -15.0)), ('document', '07cb0d95a646dba1b9cc7cdce89e59f0'))
 paddle.fluid.layers.huber_loss (ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None), ('document', '11bb8e62cc9256958eff3991fe4834da'))
 paddle.fluid.layers.kldiv_loss (ArgSpec(args=['x', 'target', 'reduction', 'name'], varargs=None, keywords=None, defaults=('mean', None)), ('document', '18bc95c62d3300456c3c7da5278b47bb'))
-paddle.fluid.layers.tree_conv (ArgSpec(args=['nodes_vector', 'edge_set', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None)), ('document', '864f3cdc5e0c6152e2a39b136171644f'))
 paddle.fluid.layers.npair_loss (ArgSpec(args=['anchor', 'positive', 'labels', 'l2_reg'], varargs=None, keywords=None, defaults=(0.002,)), ('document', '6b6ee1170fe20a79cf0631a1f49b0df2'))
 paddle.fluid.layers.pixel_shuffle (ArgSpec(args=['x', 'upscale_factor'], varargs=None, keywords=None, defaults=None), ('document', '7e5cac851fd9bad344230e1044b6a565'))
 paddle.fluid.layers.fsp_matrix (ArgSpec(args=['x', 'y'], varargs=None, keywords=None, defaults=None), ('document', '20992b20d19c2e5983f366150827b4a6'))
 paddle.fluid.layers.continuous_value_model (ArgSpec(args=['input', 'cvm', 'use_cvm'], varargs=None, keywords=None, defaults=(True,)), ('document', 'c03490ffaa1b78258747157c313db4cd'))
 paddle.fluid.layers.where (ArgSpec(args=['condition'], varargs=None, keywords=None, defaults=None), ('document', 'b1e1487760295e1ff55307b880a99e18'))
 paddle.fluid.layers.sign (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', 'fa2f457a81714430c5677c2d68744728'))
-paddle.fluid.layers.deformable_conv (ArgSpec(args=['input', 'offset', 'mask', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'deformable_groups', 'im2col_step', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, None, None, None)), ('document', '4d83ba6b971cfd590493b0925b3e081e'))
+paddle.fluid.layers.deformable_conv (ArgSpec(args=['input', 'offset', 'mask', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'deformable_groups', 'im2col_step', 'param_attr', 'bias_attr', 'modulated', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, None, None, True, None)), ('document', '335193ac57d41d7199f8d26d30c069b1'))
 paddle.fluid.layers.unfold (ArgSpec(args=['x', 'kernel_sizes', 'strides', 'paddings', 'dilations', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None)), ('document', '3f884662ad443d9ecc2b3734b4f61ad6'))
-paddle.fluid.layers.deformable_roi_pooling (ArgSpec(args=['input', 'rois', 'trans', 'no_trans', 'spatial_scale', 'group_size', 'pooled_height', 'pooled_width', 'part_size', 'sample_per_part', 'trans_std', 'position_sensitive', 'name'], varargs=None, keywords=None, defaults=(False, 1.0, [1, 1], 1, 1, None, 1, 0.1, False, None)), ('document', '99c03e3f249e36854f87dedaa17c8f35'))
-paddle.fluid.layers.shard_index (ArgSpec(args=['input', 'index_num', 'nshards', 'shard_id', 'ignore_value'], varargs=None, keywords=None, defaults=(-1,)), ('document', '5786fdbba6753ecd6cbce5e6b0889924'))
+paddle.fluid.layers.deformable_roi_pooling (ArgSpec(args=['input', 'rois', 'trans', 'no_trans', 'spatial_scale', 'group_size', 'pooled_height', 'pooled_width', 'part_size', 'sample_per_part', 'trans_std', 'position_sensitive', 'name'], varargs=None, keywords=None, defaults=(False, 1.0, [1, 1], 1, 1, None, 1, 0.1, False, None)), ('document', '47c5d1c890b36fa00ff3285c9398f613'))
+paddle.fluid.layers.filter_by_instag (ArgSpec(args=['ins', 'ins_tag', 'filter_tag', 'is_lod'], varargs=None, keywords=None, defaults=None), ('document', '7703a2088af8de4128b143ff1164ca4a'))
+paddle.fluid.layers.shard_index (ArgSpec(args=['input', 'index_num', 'nshards', 'shard_id', 'ignore_value'], varargs=None, keywords=None, defaults=(-1,)), ('document', 'c4969dd6bf164f9e6a90414ea4f4e5ad'))
+paddle.fluid.layers.hard_swish (ArgSpec(args=['x', 'threshold', 'scale', 'offset', 'name'], varargs=None, keywords=None, defaults=(6.0, 6.0, 3.0, None)), ('document', '6a5152a7015c62cb8278fc24cb456459'))
+paddle.fluid.layers.mse_loss (ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None), ('document', 'd9ede6469288636e1b3233b461a165c9'))
 paddle.fluid.layers.data (ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)), ('document', '9d7806e31bdf727c1a23b8782a09b545'))
-paddle.fluid.layers.open_files (ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)), ('document', 'cccb6eb5410c822e5307c947aca2c899'))
-paddle.fluid.layers.read_file (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', '32181f6037e387fb6e68a5beaafe33b6'))
-paddle.fluid.layers.shuffle (ArgSpec(args=['reader', 'buffer_size'], varargs=None, keywords=None, defaults=None), ('document', 'aa5803d1eccdaef03cdfb0b7ca088071'))
-paddle.fluid.layers.batch (ArgSpec(args=['reader', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', '3007211c84c5c77eda8dc83619a6eaf8'))
-paddle.fluid.layers.double_buffer (ArgSpec(args=['reader', 'place', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '7241dd1c142f4c65c8d7f66948140aa7'))
-paddle.fluid.layers.random_data_generator (ArgSpec(args=['low', 'high', 'shapes', 'lod_levels', 'for_parallel'], varargs=None, keywords=None, defaults=(True,)), ('document', '290f5b97f24f0022e195f7228dd56fd9'))
+paddle.fluid.layers.read_file (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', '88367daf9a30c9ab83adc5d7221e23ef'))
+paddle.fluid.layers.double_buffer (ArgSpec(args=['reader', 'place', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '44724c493f41a124abc7531c2740e2e3'))
 paddle.fluid.layers.py_reader (ArgSpec(args=['capacity', 'shapes', 'dtypes', 'lod_levels', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', 'd78a1c7344955c5caed8dc13adb7beb6'))
 paddle.fluid.layers.create_py_reader_by_data (ArgSpec(args=['capacity', 'feed_list', 'name', 'use_double_buffer'], varargs=None, keywords=None, defaults=(None, True)), ('document', '2edf37d57862b24a7a26aa19a3573f73'))
-paddle.fluid.layers.Preprocessor ('paddle.fluid.layers.io.Preprocessor', ('document', '1c2efbbc1197b44941a95b9ec4e737ae'))
-paddle.fluid.layers.Preprocessor.__init__ (ArgSpec(args=['self', 'reader', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.Preprocessor.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.Preprocessor.inputs (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.Preprocessor.outputs (ArgSpec(args=['self'], varargs='outs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.layers.load (ArgSpec(args=['out', 'file_path', 'load_as_fp16'], varargs=None, keywords=None, defaults=(None,)), ('document', '9d1a4bc97bbce9fa1d4f7a4200a771ff'))
 paddle.fluid.layers.create_tensor (ArgSpec(args=['dtype', 'name', 'persistable'], varargs=None, keywords=None, defaults=(None, False)), ('document', 'aaf0176c743c43e9bc684dd7dfac25c5'))
 paddle.fluid.layers.create_parameter (ArgSpec(args=['shape', 'dtype', 'name', 'attr', 'is_bias', 'default_initializer'], varargs=None, keywords=None, defaults=(None, None, False, None)), ('document', '021272f30e0cdf7503586815378abfb8'))
@@ -298,12 +323,13 @@ paddle.fluid.layers.zeros (ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs
 paddle.fluid.layers.reverse (ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None), ('document', '628135603692137d52bcf5a8d8d6816d'))
 paddle.fluid.layers.has_inf (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '51a0fa1cfaf2507c00a215adacdb8a63'))
 paddle.fluid.layers.has_nan (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '129cf426e71452fe8276d616a6dc21ae'))
-paddle.fluid.layers.isfinite (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '23c66e5918040fcc11c8fa8c5da1b38e'))
+paddle.fluid.layers.isfinite (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', 'b9fff4ffc8d11934cde099f4c39bf841'))
 paddle.fluid.layers.range (ArgSpec(args=['start', 'end', 'step', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', 'a45b42f21bc5a4e84b60981a3d629ab3'))
 paddle.fluid.layers.linspace (ArgSpec(args=['start', 'stop', 'num', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '3663d1148946eed4c1c34c81be586b9e'))
 paddle.fluid.layers.zeros_like (ArgSpec(args=['x', 'out'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd88a23bcdc443719b3953593f7cef14a'))
-paddle.fluid.layers.ones_like (ArgSpec(args=['x', 'out'], varargs=None, keywords=None, defaults=(None,)), ('document', '642afd126553337d6796600e886a6525'))
+paddle.fluid.layers.ones_like (ArgSpec(args=['x', 'out'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd18d42059c6b189cbd3fab2fcb206c15'))
 paddle.fluid.layers.diag (ArgSpec(args=['diagonal'], varargs=None, keywords=None, defaults=None), ('document', '88a15e15f0098d549f07a01eaebf9ce3'))
+paddle.fluid.layers.eye (ArgSpec(args=['num_rows', 'num_columns', 'batch_shape', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 'float32')), ('document', '25389d1e239a5d1cda66298f908ec549'))
 paddle.fluid.layers.While ('paddle.fluid.layers.control_flow.While', ('document', '50110155608a00f43d3d3fd1be41dcb4'))
 paddle.fluid.layers.While.__init__ (ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.layers.While.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
@@ -345,30 +371,30 @@ paddle.fluid.layers.StaticRNN.step_input (ArgSpec(args=['self', 'x'], varargs=No
 paddle.fluid.layers.StaticRNN.step_output (ArgSpec(args=['self', 'o'], varargs=None, keywords=None, defaults=None), ('document', '252890d4c3199a7623ab8667e13fd837'))
 paddle.fluid.layers.StaticRNN.update_memory (ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None), ('document', '7a0000520f179f35239956a5ba55119f'))
 paddle.fluid.layers.reorder_lod_tensor_by_rank (ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None), ('document', '5b552a1f0f7eb4dacb768a975ba15d08'))
-paddle.fluid.layers.Print (ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, -1, True, True, True, True, 'both')), ('document', 'ee6c70867d317b0a87094ed23546215f'))
+paddle.fluid.layers.Print (ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, 20, True, True, True, True, 'both')), ('document', '3130bed32922b9fd84ce2dea6250f635'))
 paddle.fluid.layers.is_empty (ArgSpec(args=['x', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '3011dc695f490afdf504dc24f628319a'))
-paddle.fluid.layers.sigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd894323f31a913c4a5bd4cc764f6a76a'))
-paddle.fluid.layers.logsigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd083538e3439ed6b28b00207e0f321d5'))
-paddle.fluid.layers.exp (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '9ef0909adb4d8c9430fcd595bab72dc1'))
-paddle.fluid.layers.tanh (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'f820eeaf81dfbdd1c360122cd5795cc8'))
+paddle.fluid.layers.sigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'bdc9a71908d3c9748532ff44c2f31034'))
+paddle.fluid.layers.logsigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '9a4c346630a042454f727ad5e0cffc11'))
+paddle.fluid.layers.exp (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '92bec0a7fdec48ad78effdf30b02c6fa'))
+paddle.fluid.layers.tanh (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e7a81a4af62b6c6ce858c897f74a4f0f'))
 paddle.fluid.layers.atan (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '2dde114018cbcaff9b24c566bf6704a5'))
-paddle.fluid.layers.tanh_shrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '63d198e36e1d85dcfb454c1a3cb3b38e'))
-paddle.fluid.layers.softshrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '4f53a5e7f50c55ea516375ef8f46316b'))
-paddle.fluid.layers.sqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '893ec81a025f3c82f1c8fca6aa84d39f'))
-paddle.fluid.layers.rsqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c629f5163fa04f80abb3d0240c462fa6'))
-paddle.fluid.layers.abs (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'f6d5642f52e357f3cec89cc9c15dc66c'))
-paddle.fluid.layers.ceil (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e5ccc5339056e947272c1921d11e6cfe'))
-paddle.fluid.layers.floor (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b9c1474e5d0f83e4a15a5cd827abbf9c'))
-paddle.fluid.layers.cos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd7d3af92e8c1d93aeeb4d6bc2e0fc9b6'))
+paddle.fluid.layers.tanh_shrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '000a76652c8e59e21e7fb6d87cc7a668'))
+paddle.fluid.layers.sqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e3dce5e892ce63cc9c6ed87a7e6206d5'))
+paddle.fluid.layers.rsqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0b90c858d4d71a58896537c1bd7acb09'))
+paddle.fluid.layers.abs (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '503f4d5723bbe1b6c9f24058078709ed'))
+paddle.fluid.layers.ceil (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5602b78da33c4b0ccaea0374411de423'))
+paddle.fluid.layers.floor (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'a0977ab14448ba472e5c2e152f42a818'))
+paddle.fluid.layers.cos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e94c8569179ffa3a0dca028a5b518dbf'))
 paddle.fluid.layers.acos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5c9a00178c5c28bb824f7d6c25060d3b'))
 paddle.fluid.layers.asin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '20d1d49fe4d13430a63c57fc4b29a677'))
-paddle.fluid.layers.sin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0da9ea1a725c3d91ca0c37cea951ba29'))
-paddle.fluid.layers.round (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e91cb3422c0ffdc04375752143179b47'))
-paddle.fluid.layers.reciprocal (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '30bd7174c21294230616a22cd87b0035'))
-paddle.fluid.layers.square (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '200916f013bad0b052b13dc43901f0b8'))
-paddle.fluid.layers.softplus (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'be4533a4cd97c84424512dca76142083'))
-paddle.fluid.layers.softsign (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '410f27a44b7365cc60d5d5ff5a53407e'))
+paddle.fluid.layers.sin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '4441e4e5e9934eb98760e31330e7a13c'))
+paddle.fluid.layers.round (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '40132ef34808ed621c63ed4fd886fd1c'))
+paddle.fluid.layers.reciprocal (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '578106495166d0fb65ade2bb51cdf926'))
+paddle.fluid.layers.square (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '728233aff902803f5f62e2d340c3bcbb'))
+paddle.fluid.layers.softplus (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '74c4e6dfbdfc3453301ea11d722ad3d6'))
+paddle.fluid.layers.softsign (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'a70e9320b113ca33c1299bbc032f09d4'))
 paddle.fluid.layers.uniform_random (ArgSpec(args=['shape', 'dtype', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', -1.0, 1.0, 0)), ('document', '6de6775d9e9ed885056e764982130cfd'))
+paddle.fluid.layers.softshrink (ArgSpec(args=['x', 'alpha'], varargs=None, keywords=None, defaults=(None,)), ('document', '958c7bfdfb0b5e92af6ca4a90d24e5ef'))
 paddle.fluid.layers.hard_shrink (ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', '386a4103d2884b2f1312ebc1e8ee6486'))
 paddle.fluid.layers.cumsum (ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '5ab9d5721a6734fe127069e4314e1309'))
 paddle.fluid.layers.thresholded_relu (ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', '9a0464425426a9b9c1b7500ede2836c1'))
@@ -377,8 +403,8 @@ paddle.fluid.layers.density_prior_box (ArgSpec(args=['input', 'image', 'densitie
 paddle.fluid.layers.multi_box_head (ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False)), ('document', 'fd58078fdfffd899b91f992ba224628f'))
 paddle.fluid.layers.bipartite_match (ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '080ce0d54d3f1950ad5a3a8e5ae529e9'))
 paddle.fluid.layers.target_assign (ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'e9685f32d21bec8c013626c0254502c5'))
-paddle.fluid.layers.detection_output (ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0)), ('document', 'efae414c1137c7944d6174dd08c5347a'))
-paddle.fluid.layers.ssd_loss (ArgSpec(args=['location', 'confidence', 'gt_box', 'gt_label', 'prior_box', 'prior_box_var', 'background_label', 'overlap_threshold', 'neg_pos_ratio', 'neg_overlap', 'loc_loss_weight', 'conf_loss_weight', 'match_type', 'mining_type', 'normalize', 'sample_size'], varargs=None, keywords=None, defaults=(None, 0, 0.5, 3.0, 0.5, 1.0, 1.0, 'per_prediction', 'max_negative', True, None)), ('document', '8edacd4b9bd02dd68931b9fa6bfe0cbd'))
+paddle.fluid.layers.detection_output (ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta', 'return_index'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0, False)), ('document', '5485bcaceb0cde2695565a2ffd5bbd40'))
+paddle.fluid.layers.ssd_loss (ArgSpec(args=['location', 'confidence', 'gt_box', 'gt_label', 'prior_box', 'prior_box_var', 'background_label', 'overlap_threshold', 'neg_pos_ratio', 'neg_overlap', 'loc_loss_weight', 'conf_loss_weight', 'match_type', 'mining_type', 'normalize', 'sample_size'], varargs=None, keywords=None, defaults=(None, 0, 0.5, 3.0, 0.5, 1.0, 1.0, 'per_prediction', 'max_negative', True, None)), ('document', '14d1eeae0f41b6792be43c1c0be0589b'))
 paddle.fluid.layers.rpn_target_assign (ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'is_crowd', 'im_info', 'rpn_batch_size_per_im', 'rpn_straddle_thresh', 'rpn_fg_fraction', 'rpn_positive_overlap', 'rpn_negative_overlap', 'use_random'], varargs=None, keywords=None, defaults=(256, 0.0, 0.5, 0.7, 0.3, True)), ('document', '651d98d51879dfa1bc1cd40391786a41'))
 paddle.fluid.layers.retinanet_target_assign (ArgSpec(args=['bbox_pred', 'cls_logits', 'anchor_box', 'anchor_var', 'gt_boxes', 'gt_labels', 'is_crowd', 'im_info', 'num_classes', 'positive_overlap', 'negative_overlap'], varargs=None, keywords=None, defaults=(1, 0.5, 0.4)), ('document', 'fa1d1c9d5e0111684c0db705f86a2595'))
 paddle.fluid.layers.sigmoid_focal_loss (ArgSpec(args=['x', 'label', 'fg_num', 'gamma', 'alpha'], varargs=None, keywords=None, defaults=(2, 0.25)), ('document', 'aeac6aae100173b3fc7f102cf3023a3d'))
@@ -391,22 +417,23 @@ paddle.fluid.layers.iou_similarity (ArgSpec(args=['x', 'y', 'name'], varargs=Non
 paddle.fluid.layers.box_coder (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)), ('document', '4c6225fc1a1c0b84955a8f0013008243'))
 paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e308ce1661cb722b220a6f482f85b9e4'))
 paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gt_box', 'gt_label', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'gt_score', 'use_label_smooth', 'name'], varargs=None, keywords=None, defaults=(None, True, None)), ('document', '400403175718d5a632402cdae88b01b8'))
-paddle.fluid.layers.yolo_box (ArgSpec(args=['x', 'img_size', 'anchors', 'class_num', 'conf_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '11b463ae2ad4c797fb91b3ee9864c4b4'))
+paddle.fluid.layers.yolo_box (ArgSpec(args=['x', 'img_size', 'anchors', 'class_num', 'conf_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ed56ff21536ca5c8ad418d0cfaf6a7b9'))
 paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '9ddee76cb808db83768bf68010e39b2b'))
-paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', '76d74056e9eedcacf013d8e3b115cbd3'))
+paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'f6e333d76922c6e564413b4d216c245c'))
+paddle.fluid.layers.multiclass_nms2 (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'return_index', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, False, None)), ('document', 'be156186ee7a2ee56ab30b964acb15e5'))
 paddle.fluid.layers.retinanet_detection_output (ArgSpec(args=['bboxes', 'scores', 'anchors', 'im_info', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0.05, 1000, 100, 0.3, 1.0)), ('document', '078d28607ce261a0cba2b965a79f6bb8'))
 paddle.fluid.layers.distribute_fpn_proposals (ArgSpec(args=['fpn_rois', 'min_level', 'max_level', 'refer_level', 'refer_scale', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6c023b9401214ae387a8b2d92638e5e4'))
 paddle.fluid.layers.box_decoder_and_assign (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'box_score', 'box_clip', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '3619a7847709f5868f5e929065947b38'))
 paddle.fluid.layers.collect_fpn_proposals (ArgSpec(args=['multi_rois', 'multi_scores', 'min_level', 'max_level', 'post_nms_top_n', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '80a75103e001ca1ba056fbbe0c6a19f3'))
 paddle.fluid.layers.accuracy (ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)), ('document', 'ef799022a6040597462ae2b3d2f1c407'))
-paddle.fluid.layers.auc (ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)), ('document', '300537e259bba86fdefa13a133a0587d'))
+paddle.fluid.layers.auc (ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)), ('document', '34b4575807f955f7e8698b8dead23858'))
 paddle.fluid.layers.exponential_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)), ('document', 'eaf430c5a0380fb11bfe9a8922cd6295'))
-paddle.fluid.layers.natural_exp_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)), ('document', '63a9e96d446d7de1289f30b832bce36a'))
+paddle.fluid.layers.natural_exp_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)), ('document', 'aa3146f64d5d508e4e50687603aa7b15'))
 paddle.fluid.layers.inverse_time_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)), ('document', 'ea37a3a8a0b3ce2254e7bc49a0951dbe'))
 paddle.fluid.layers.polynomial_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'end_learning_rate', 'power', 'cycle'], varargs=None, keywords=None, defaults=(0.0001, 1.0, False)), ('document', 'a343254c36c2e89512cd8cd8a1960ead'))
 paddle.fluid.layers.piecewise_decay (ArgSpec(args=['boundaries', 'values'], varargs=None, keywords=None, defaults=None), ('document', 'd9f654117542c6b702963dda107a247f'))
 paddle.fluid.layers.noam_decay (ArgSpec(args=['d_model', 'warmup_steps'], varargs=None, keywords=None, defaults=None), ('document', 'fd57228fb76195e66bbcc8d8e42c494d'))
-paddle.fluid.layers.cosine_decay (ArgSpec(args=['learning_rate', 'step_each_epoch', 'epochs'], varargs=None, keywords=None, defaults=None), ('document', 'f0d65d8c89d0fe78051ca689daa15e35'))
+paddle.fluid.layers.cosine_decay (ArgSpec(args=['learning_rate', 'step_each_epoch', 'epochs'], varargs=None, keywords=None, defaults=None), ('document', '1062e487dd3b50a6e58b5703b4f594c9'))
 paddle.fluid.layers.linear_lr_warmup (ArgSpec(args=['learning_rate', 'warmup_steps', 'start_lr', 'end_lr'], varargs=None, keywords=None, defaults=None), ('document', 'dc7292c456847ba41cfd318e9f7f4363'))
 paddle.fluid.layers.Uniform ('paddle.fluid.layers.distributions.Uniform', ('document', 'af70e7003f437e7a8a9e28cded35c433'))
 paddle.fluid.layers.Uniform.__init__ (ArgSpec(args=['self', 'low', 'high'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
@@ -420,6 +447,18 @@ paddle.fluid.layers.Normal.entropy (ArgSpec(args=['self'], varargs=None, keyword
 paddle.fluid.layers.Normal.kl_divergence (ArgSpec(args=['self', 'other'], varargs=None, keywords=None, defaults=None), ('document', '2e8845cdf1129647e6fa6e816876cd3b'))
 paddle.fluid.layers.Normal.log_prob (ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None), ('document', 'b79091014ceaffb6a7372a198a341c23'))
 paddle.fluid.layers.Normal.sample (ArgSpec(args=['self', 'shape', 'seed'], varargs=None, keywords=None, defaults=(0,)), ('document', 'adac334af13f6984e991b3ecf12b8cb7'))
+paddle.fluid.layers.Categorical ('paddle.fluid.layers.distributions.Categorical', ('document', '865c9dac8af6190e05588486ba091ee8'))
+paddle.fluid.layers.Categorical.__init__ (ArgSpec(args=['self', 'logits'], varargs=None, keywords=None, defaults=None), ('document', '933b96c9ebab8e2c1f6007a50287311e'))
+paddle.fluid.layers.Categorical.entropy (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'b360a2a7a4da07c2d268b329e09c82c1'))
+paddle.fluid.layers.Categorical.kl_divergence (ArgSpec(args=['self', 'other'], varargs=None, keywords=None, defaults=None), ('document', 'c2c4c37376584178025f0a4a61c4b862'))
+paddle.fluid.layers.Categorical.log_prob (ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None), ('document', 'c0edd2e2fc76711477b32dc4da9de768'))
+paddle.fluid.layers.Categorical.sample (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '08a2bbcaa20ee176ee7ec3d05737a0f6'))
+paddle.fluid.layers.MultivariateNormalDiag ('paddle.fluid.layers.distributions.MultivariateNormalDiag', ('document', 'f6ee0e8b2898796dcff2a68c9fda19f0'))
+paddle.fluid.layers.MultivariateNormalDiag.__init__ (ArgSpec(args=['self', 'loc', 'scale'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.layers.MultivariateNormalDiag.entropy (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '3c679b573ba975c5067c8ebfd4354b02'))
+paddle.fluid.layers.MultivariateNormalDiag.kl_divergence (ArgSpec(args=['self', 'other'], varargs=None, keywords=None, defaults=None), ('document', 'd9190d29dbd54c81f747a6436c35f062'))
+paddle.fluid.layers.MultivariateNormalDiag.log_prob (ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None), ('document', 'c0edd2e2fc76711477b32dc4da9de768'))
+paddle.fluid.layers.MultivariateNormalDiag.sample (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '08a2bbcaa20ee176ee7ec3d05737a0f6'))
 paddle.fluid.contrib.InitState ('paddle.fluid.contrib.decoder.beam_search_decoder.InitState', ('document', '3afd1f84232718e628e9e566941c5f05'))
 paddle.fluid.contrib.InitState.__init__ (ArgSpec(args=['self', 'init', 'shape', 'value', 'init_boot', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, None, False, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.contrib.StateCell ('paddle.fluid.contrib.decoder.beam_search_decoder.StateCell', ('document',
'ecd0066c02867d445d7b461e28220c50')) @@ -449,11 +488,11 @@ paddle.fluid.contrib.op_freq_statistic (ArgSpec(args=['program'], varargs=None, paddle.fluid.contrib.QuantizeTranspiler ('paddle.fluid.contrib.quantize.quantize_transpiler.QuantizeTranspiler', ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.contrib.QuantizeTranspiler.__init__ (ArgSpec(args=['self', 'weight_bits', 'activation_bits', 'activation_quantize_type', 'weight_quantize_type', 'window_size', 'moving_rate'], varargs=None, keywords=None, defaults=(8, 8, 'abs_max', 'abs_max', 10000, 0.9)), ('document', '14b39f1fcd5667ff556b1aad94357d1d')) paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 (ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.contrib.QuantizeTranspiler.freeze_program (ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)), ('document', '909675a1ab055c69b436a7893fcae4fd')) +paddle.fluid.contrib.QuantizeTranspiler.freeze_program (ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)), ('document', '909675a1ab055c69b436a7893fcae4fd')) paddle.fluid.contrib.QuantizeTranspiler.training_transpile (ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6dd9909f10b283ba2892a99058a72884')) paddle.fluid.contrib.distributed_batch_reader (ArgSpec(args=['batch_reader'], varargs=None, keywords=None, defaults=None), ('document', 'b60796eb0a481484dd34e345f0eaa4d5')) paddle.fluid.contrib.Compressor ('paddle.fluid.contrib.slim.core.compressor.Compressor', ('document', 'a5417774a94aa9ae5560a42b96527e7d')) -paddle.fluid.contrib.Compressor.__init__ (ArgSpec(args=['self', 'place', 'scope', 'train_program', 'train_reader', 'train_feed_list', 'train_fetch_list', 'eval_program', 'eval_reader', 'eval_feed_list', 'eval_fetch_list', 'teacher_programs', 'checkpoint_path', 'train_optimizer', 'distiller_optimizer', 'search_space'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, [], None, None, None, None)), ('document', 'c195b3bba26169cff9439e8c467557c0')) +paddle.fluid.contrib.Compressor.__init__ (ArgSpec(args=['self', 'place', 'scope', 'train_program', 'train_reader', 'train_feed_list', 'train_fetch_list', 'eval_program', 'eval_reader', 'eval_feed_list', 'eval_fetch_list', 'eval_func', 'save_eval_model', 'prune_infer_model', 'teacher_programs', 'checkpoint_path', 'train_optimizer', 'distiller_optimizer', 'search_space'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, None, True, None, [], None, None, None, None)), ('document', '05119e0fa0fc07f5cf848ebf0a2cf070')) paddle.fluid.contrib.Compressor.config (ArgSpec(args=['self', 'config_file'], varargs=None, keywords=None, defaults=None), ('document', '780d9c007276ccbb95b292400d7807b0')) paddle.fluid.contrib.Compressor.run (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'c6e43d6a078d307672283c1f36e04fe9')) paddle.fluid.contrib.load_persistables_for_increment (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None), ('document', '2ab36d4f7a564f5f65e455807ad06c67')) @@ -474,10 +513,14 @@ paddle.fluid.contrib.HDFSClient.upload (ArgSpec(args=['self', 'hdfs_path', 'loca paddle.fluid.contrib.multi_download 
(ArgSpec(args=['client', 'hdfs_path', 'local_path', 'trainer_id', 'trainers', 'multi_processes'], varargs=None, keywords=None, defaults=(5,)), ('document', '100927be598ed8f9eaa1f3ef1b23568a')) paddle.fluid.contrib.multi_upload (ArgSpec(args=['client', 'hdfs_path', 'local_path', 'multi_processes', 'overwrite', 'sync'], varargs=None, keywords=None, defaults=(5, False, True)), ('document', '183f34c83d30dbe16e09e8716c41958a')) paddle.fluid.contrib.extend_with_decoupled_weight_decay (ArgSpec(args=['base_optimizer'], varargs=None, keywords=None, defaults=None), ('document', 'a1095dfd4ec725747f662d69cd7659d4')) -paddle.fluid.contrib.mixed_precision.decorate (ArgSpec(args=['optimizer', 'amp_lists', 'init_loss_scaling', 'incr_every_n_steps', 'decr_every_n_nan_or_inf', 'incr_ratio', 'decr_ratio', 'use_dynamic_loss_scaling'], varargs=None, keywords=None, defaults=(None, 1.0, 1000, 2, 2.0, 0.8, False)), ('document', 'd05e71f5b0bd6d92bb94e70e00b3f9cf')) +paddle.fluid.contrib.mixed_precision.decorate (ArgSpec(args=['optimizer', 'amp_lists', 'init_loss_scaling', 'incr_every_n_steps', 'decr_every_n_nan_or_inf', 'incr_ratio', 'decr_ratio', 'use_dynamic_loss_scaling'], varargs=None, keywords=None, defaults=(None, 1.0, 1000, 2, 2.0, 0.8, True)), ('document', '5f118631fc8632afb981b3a26daae731')) paddle.fluid.contrib.mixed_precision.AutoMixedPrecisionLists ('paddle.fluid.contrib.mixed_precision.fp16_lists.AutoMixedPrecisionLists', ('document', 'c116ec6bb5d30998792daea8db21ee40')) paddle.fluid.contrib.mixed_precision.AutoMixedPrecisionLists.__init__ (ArgSpec(args=['self', 'custom_white_list', 'custom_black_list'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.contrib.fused_elemwise_activation (ArgSpec(args=['x', 'y', 'functor_list', 'axis', 'scale', 'save_intermediate_out'], varargs=None, keywords=None, defaults=(-1, 0.0, True)), ('document', '1c4b247a2858cea8d9d8750693688270')) +paddle.fluid.contrib.sequence_topk_avg_pooling (ArgSpec(args=['input', 'row', 'col', 'topks', 'channel_num'], varargs=None, keywords=None, defaults=None), ('document', '5218c85dd4122b626da9bb92f3b50042')) +paddle.fluid.contrib.var_conv_2d (ArgSpec(args=['input', 'row', 'col', 'input_channel', 'output_channel', 'filter_size', 'stride', 'param_attr', 'act', 'dtype', 'name'], varargs=None, keywords=None, defaults=(1, None, None, 'float32', None)), ('document', 'f52a6edf6d3e970568788604da3329c2')) +paddle.fluid.contrib.match_matrix_tensor (ArgSpec(args=['x', 'y', 'channel_num', 'act', 'param_attr', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, 'float32', None)), ('document', '3bdc4b2891c1460bc630fdcd22766b21')) +paddle.fluid.contrib.tree_conv (ArgSpec(args=['nodes_vector', 'edge_set', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None)), ('document', '7c727562ebdda38274106d1a9b338e5b')) paddle.fluid.contrib.BasicGRUUnit ('paddle.fluid.contrib.layers.rnn_impl.BasicGRUUnit', ('document', '2aed2540ed1540f081be9f4d08f2a65e')) paddle.fluid.contrib.BasicGRUUnit.__init__ (ArgSpec(args=['self', 'name_scope', 'hidden_size', 'param_attr', 'bias_attr', 'gate_activation', 'activation', 'dtype'], varargs=None, keywords=None, defaults=(None, None, None, None, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.contrib.BasicGRUUnit.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), 
('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1')) @@ -512,6 +555,7 @@ paddle.fluid.contrib.BasicLSTMUnit.state_dict (ArgSpec(args=['self', 'destinatio paddle.fluid.contrib.BasicLSTMUnit.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62')) paddle.fluid.contrib.BasicLSTMUnit.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.contrib.basic_lstm (ArgSpec(args=['input', 'init_hidden', 'init_cell', 'hidden_size', 'num_layers', 'sequence_length', 'dropout_prob', 'bidirectional', 'batch_first', 'param_attr', 'bias_attr', 'gate_activation', 'activation', 'forget_bias', 'dtype', 'name'], varargs=None, keywords=None, defaults=(1, None, 0.0, False, True, None, None, None, None, 1.0, 'float32', 'basic_lstm')), ('document', 'fe4d0c3c55a162b8cfe10b05fabb7ce4')) +paddle.fluid.contrib.ctr_metric_bundle (ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None), ('document', 'b68d12366896c41065fc3738393da2aa')) paddle.fluid.dygraph.Layer ('paddle.fluid.dygraph.layers.Layer', ('document', 'a889d5affd734ede273e94d4257163ab')) paddle.fluid.dygraph.Layer.__init__ (ArgSpec(args=['self', 'name_scope', 'dtype'], varargs=None, keywords=None, defaults=(VarType.FP32,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.dygraph.Layer.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1')) @@ -530,7 +574,7 @@ paddle.fluid.dygraph.Layer.sublayers (ArgSpec(args=['self', 'include_sublayers'] paddle.fluid.dygraph.Layer.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.dygraph.__impl__ (ArgSpec(args=['func'], varargs=None, keywords=None, defaults=()), ('document', 'fa71ad4e6c2b5bf2b5258bd1959f9b2a')) paddle.fluid.dygraph.guard (ArgSpec(args=['place'], varargs=None, keywords=None, defaults=(None,)), ('document', '7071320ffe2eec9aacdae574951278c6')) -paddle.fluid.dygraph.to_variable (ArgSpec(args=['value', 'block', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '9a65d87163a2c6b00fb78f4e61fb3300')) +paddle.fluid.dygraph.to_variable (ArgSpec(args=['value', 'block', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '0e69fa3666f15dd01b6e3e270b9371cd')) paddle.fluid.dygraph.Conv2D ('paddle.fluid.dygraph.nn.Conv2D', ('document', 'baafe7ae0d3a61ae79cf4c7443e2c37c')) paddle.fluid.dygraph.Conv2D.__init__ (ArgSpec(args=['self', 'name_scope', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'dtype'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.dygraph.Conv2D.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1')) @@ -643,7 +687,7 @@ paddle.fluid.dygraph.GRUUnit.parameters (ArgSpec(args=['self', 'include_sublayer paddle.fluid.dygraph.GRUUnit.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.dygraph.GRUUnit.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, 
keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62')) paddle.fluid.dygraph.GRUUnit.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.dygraph.LayerNorm ('paddle.fluid.dygraph.nn.LayerNorm', ('document', '85ea3ae0e470704546cabcafd61192e1')) +paddle.fluid.dygraph.LayerNorm ('paddle.fluid.dygraph.nn.LayerNorm', ('document', 'b44f5d3d10386c460094e21f24ff272b')) paddle.fluid.dygraph.LayerNorm.__init__ (ArgSpec(args=['self', 'name_scope', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.dygraph.LayerNorm.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1')) paddle.fluid.dygraph.LayerNorm.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995')) @@ -659,8 +703,8 @@ paddle.fluid.dygraph.LayerNorm.parameters (ArgSpec(args=['self', 'include_sublay paddle.fluid.dygraph.LayerNorm.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers'], varargs=None, keywords=None, defaults=(None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.dygraph.LayerNorm.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62')) paddle.fluid.dygraph.LayerNorm.train (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.dygraph.NCE ('paddle.fluid.dygraph.nn.NCE', ('document', '47eb439a5568468fad70235f1e61ead9')) -paddle.fluid.dygraph.NCE.__init__ (ArgSpec(args=['self', 'name_scope', 'num_total_classes', 'param_attr', 'bias_attr', 'num_neg_samples', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, 'uniform', None, 0, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.NCE ('paddle.fluid.dygraph.nn.NCE', ('document', '2d579e8d9ce31bb29e079e5f6108fc73')) +paddle.fluid.dygraph.NCE.__init__ (ArgSpec(args=['self', 'name_scope', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, 'uniform', None, 0, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.dygraph.NCE.add_parameter (ArgSpec(args=['self', 'name', 'parameter'], varargs=None, keywords=None, defaults=None), ('document', 'f35ab374c7d5165c3daf3bd64a5a2ec1')) paddle.fluid.dygraph.NCE.add_sublayer (ArgSpec(args=['self', 'name', 'sublayer'], varargs=None, keywords=None, defaults=None), ('document', '839ff3c0534677ba6ad8735c3fd4e995')) paddle.fluid.dygraph.NCE.backward (ArgSpec(args=['self'], varargs='inputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) @@ -788,41 +832,41 @@ paddle.fluid.dygraph.TreeConv.state_dict (ArgSpec(args=['self', 'destination', ' paddle.fluid.dygraph.TreeConv.sublayers (ArgSpec(args=['self', 'include_sublayers'], varargs=None, keywords=None, defaults=(True,)), ('document', '00a881005ecbc96578faf94513bf0d62')) paddle.fluid.dygraph.TreeConv.train (ArgSpec(args=['self'], varargs=None, 
keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.dygraph.Tracer ('paddle.fluid.dygraph.tracer.Tracer', ('document', '28d72409112111274c33e1f07229d5da')) -paddle.fluid.dygraph.Tracer.__init__ (ArgSpec(args=['self', 'block'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Tracer.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.dygraph.Tracer.all_parameters (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.dygraph.Tracer.eval_mode (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.dygraph.Tracer.trace 1. trace(self: paddle.fluid.core_avx.Tracer, arg0: paddle.fluid.core_avx.OpBase, arg1: Dict[unicode, handle], arg2: Dict[unicode, handle], arg3: Dict[unicode, Variant], arg4: paddle::platform::CPUPlace, arg5: bool) -> None 2. trace(self: paddle.fluid.core_avx.Tracer, arg0: paddle.fluid.core_avx.OpBase, arg1: Dict[unicode, handle], arg2: Dict[unicode, handle], arg3: Dict[unicode, Variant], arg4: paddle::platform::CUDAPlace, arg5: bool) -> None -paddle.fluid.dygraph.Tracer.trace_op (ArgSpec(args=['self', 'op', 'inputs', 'outputs', 'stop_gradient'], varargs=None, keywords=None, defaults=(False,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.dygraph.Tracer.trace 1. trace(self: paddle.fluid.core_avx.Tracer, arg0: unicode, arg1: Dict[unicode, handle], arg2: Dict[unicode, handle], arg3: Dict[unicode, Variant], arg4: paddle::platform::CUDAPlace, arg5: bool) -> None 2. trace(self: paddle.fluid.core_avx.Tracer, arg0: unicode, arg1: Dict[unicode, handle], arg2: Dict[unicode, handle], arg3: Dict[unicode, Variant], arg4: paddle::platform::CPUPlace, arg5: bool) -> None +paddle.fluid.dygraph.Tracer.trace_op (ArgSpec(args=['self', 'type', 'inputs', 'outputs', 'attrs', 'stop_gradient'], varargs=None, keywords=None, defaults=(False,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.dygraph.Tracer.trace_var (ArgSpec(args=['self', 'name', 'var'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.dygraph.Tracer.train_mode (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.dygraph.prepare_context (ArgSpec(args=['strategy'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.dygraph.save_persistables (ArgSpec(args=['model_dict', 'dirname', 'optimizers'], varargs=None, keywords=None, defaults=('save_dir', None)), ('document', '7f526f879139a14cda8e0b5a9171f264')) -paddle.fluid.dygraph.load_persistables (ArgSpec(args=['dirname'], varargs=None, keywords=None, defaults=('save_dir',)), ('document', '2574d50a7a9f89fb0d74ddf73d8128f0')) -paddle.fluid.dygraph.NoamDecay ('paddle.fluid.dygraph.learning_rate_scheduler.NoamDecay', ('document', 'e45b81ab71653cb8ad7384671e6238e4')) +paddle.fluid.dygraph.save_persistables (ArgSpec(args=['model_dict', 'dirname', 'optimizers'], varargs=None, keywords=None, defaults=('save_dir', None)), ('document', 'b0b2ec2a502214a737300fb648cb9dc7')) +paddle.fluid.dygraph.load_persistables (ArgSpec(args=['dirname'], varargs=None, keywords=None, defaults=('save_dir',)), ('document', 
'e0709f8259620fdcfd2c0c1b23348852')) +paddle.fluid.dygraph.NoamDecay ('paddle.fluid.dygraph.learning_rate_scheduler.NoamDecay', ('document', '9ccfea97dbf15134d406a23aae1e1fa2')) paddle.fluid.dygraph.NoamDecay.__init__ (ArgSpec(args=['self', 'd_model', 'warmup_steps', 'begin', 'step', 'dtype'], varargs=None, keywords=None, defaults=(1, 1, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.dygraph.NoamDecay.create_lr_var (ArgSpec(args=['self', 'lr'], varargs=None, keywords=None, defaults=None), ('document', '013bc233558149d0757b3df57845b866')) paddle.fluid.dygraph.NoamDecay.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.dygraph.PiecewiseDecay ('paddle.fluid.dygraph.learning_rate_scheduler.PiecewiseDecay', ('document', '139b30620ffd26ed3f4da24b954a4022')) +paddle.fluid.dygraph.PiecewiseDecay ('paddle.fluid.dygraph.learning_rate_scheduler.PiecewiseDecay', ('document', '8f4d37eaad4e2f5b12850f3663856758')) paddle.fluid.dygraph.PiecewiseDecay.__init__ (ArgSpec(args=['self', 'boundaries', 'values', 'begin', 'step', 'dtype'], varargs=None, keywords=None, defaults=(1, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.dygraph.PiecewiseDecay.create_lr_var (ArgSpec(args=['self', 'lr'], varargs=None, keywords=None, defaults=None), ('document', '013bc233558149d0757b3df57845b866')) paddle.fluid.dygraph.PiecewiseDecay.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.dygraph.NaturalExpDecay ('paddle.fluid.dygraph.learning_rate_scheduler.NaturalExpDecay', ('document', 'ed584947bab492fb5263d1474dcab709')) +paddle.fluid.dygraph.NaturalExpDecay ('paddle.fluid.dygraph.learning_rate_scheduler.NaturalExpDecay', ('document', '94bed58b392a5a71b6d1abd39eed7111')) paddle.fluid.dygraph.NaturalExpDecay.__init__ (ArgSpec(args=['self', 'learning_rate', 'decay_steps', 'decay_rate', 'staircase', 'begin', 'step', 'dtype'], varargs=None, keywords=None, defaults=(False, 0, 1, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.dygraph.NaturalExpDecay.create_lr_var (ArgSpec(args=['self', 'lr'], varargs=None, keywords=None, defaults=None), ('document', '013bc233558149d0757b3df57845b866')) paddle.fluid.dygraph.NaturalExpDecay.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.dygraph.ExponentialDecay ('paddle.fluid.dygraph.learning_rate_scheduler.ExponentialDecay', ('document', '2d620b5c4ae70cf64c6d710647ef48c6')) +paddle.fluid.dygraph.ExponentialDecay ('paddle.fluid.dygraph.learning_rate_scheduler.ExponentialDecay', ('document', 'a259689c649c5f82636536386ce2ef19')) paddle.fluid.dygraph.ExponentialDecay.__init__ (ArgSpec(args=['self', 'learning_rate', 'decay_steps', 'decay_rate', 'staircase', 'begin', 'step', 'dtype'], varargs=None, keywords=None, defaults=(False, 0, 1, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.dygraph.ExponentialDecay.create_lr_var (ArgSpec(args=['self', 'lr'], varargs=None, keywords=None, defaults=None), ('document', '013bc233558149d0757b3df57845b866')) paddle.fluid.dygraph.ExponentialDecay.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.dygraph.InverseTimeDecay ('paddle.fluid.dygraph.learning_rate_scheduler.InverseTimeDecay', ('document', 
'599c7c42b0a27b83acfd648c705ac622')) +paddle.fluid.dygraph.InverseTimeDecay ('paddle.fluid.dygraph.learning_rate_scheduler.InverseTimeDecay', ('document', '6a868b2c7cc0f09f57ef71902bbc93ca')) paddle.fluid.dygraph.InverseTimeDecay.__init__ (ArgSpec(args=['self', 'learning_rate', 'decay_steps', 'decay_rate', 'staircase', 'begin', 'step', 'dtype'], varargs=None, keywords=None, defaults=(False, 0, 1, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.dygraph.InverseTimeDecay.create_lr_var (ArgSpec(args=['self', 'lr'], varargs=None, keywords=None, defaults=None), ('document', '013bc233558149d0757b3df57845b866')) paddle.fluid.dygraph.InverseTimeDecay.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.dygraph.PolynomialDecay ('paddle.fluid.dygraph.learning_rate_scheduler.PolynomialDecay', ('document', '19080eb899a7102ce33b43c17b5e8043')) +paddle.fluid.dygraph.PolynomialDecay ('paddle.fluid.dygraph.learning_rate_scheduler.PolynomialDecay', ('document', 'bb90314cee58952f13522dcd571ca832')) paddle.fluid.dygraph.PolynomialDecay.__init__ (ArgSpec(args=['self', 'learning_rate', 'decay_steps', 'end_learning_rate', 'power', 'cycle', 'begin', 'step', 'dtype'], varargs=None, keywords=None, defaults=(0.0001, 1.0, False, 0, 1, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.dygraph.PolynomialDecay.create_lr_var (ArgSpec(args=['self', 'lr'], varargs=None, keywords=None, defaults=None), ('document', '013bc233558149d0757b3df57845b866')) paddle.fluid.dygraph.PolynomialDecay.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.dygraph.CosineDecay ('paddle.fluid.dygraph.learning_rate_scheduler.CosineDecay', ('document', 'd21fe863218f9bcc4a7216c628cc041f')) +paddle.fluid.dygraph.CosineDecay ('paddle.fluid.dygraph.learning_rate_scheduler.CosineDecay', ('document', '46dadadee1a8a92d70bd277d9345bfb0')) paddle.fluid.dygraph.CosineDecay.__init__ (ArgSpec(args=['self', 'learning_rate', 'step_each_epoch', 'epochs', 'begin', 'step', 'dtype'], varargs=None, keywords=None, defaults=(0, 1, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.dygraph.CosineDecay.create_lr_var (ArgSpec(args=['self', 'lr'], varargs=None, keywords=None, defaults=None), ('document', '013bc233558149d0757b3df57845b866')) paddle.fluid.dygraph.CosineDecay.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) @@ -846,7 +890,7 @@ paddle.fluid.transpiler.RoundRobin.__init__ (ArgSpec(args=['self', 'pserver_endp paddle.fluid.transpiler.RoundRobin.dispatch (ArgSpec(args=['self', 'varlist'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.transpiler.RoundRobin.reset (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.transpiler.DistributeTranspilerConfig ('paddle.fluid.transpiler.distribute_transpiler.DistributeTranspilerConfig', ('document', '550b8c767a8ae1a2eb74b18924ddc975')) -paddle.fluid.transpiler.DistributeTranspilerConfig.__init__ +paddle.fluid.transpiler.DistributeTranspilerConfig.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.nets.simple_img_conv_pool (ArgSpec(args=['input', 'num_filters', 'filter_size', 
'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True)), ('document', '13f01ff80e8dfbd3427d90cf49bc62eb')) paddle.fluid.nets.sequence_conv_pool (ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type', 'bias_attr'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max', None)), ('document', 'd6a1e527b53f5cc15594fee307dfc5cf')) paddle.fluid.nets.glu (ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,)), ('document', 'b87bacfc70dd3477ed25ef14aa01389a')) @@ -892,6 +936,14 @@ paddle.fluid.optimizer.AdamaxOptimizer.backward (ArgSpec(args=['self', 'loss', ' paddle.fluid.optimizer.AdamaxOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.AdamaxOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) paddle.fluid.optimizer.AdamaxOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) +paddle.fluid.optimizer.DpsgdOptimizer ('paddle.fluid.optimizer.DpsgdOptimizer', ('document', '71113c30b66c0f4035b10ebd8af8c5ad')) +paddle.fluid.optimizer.DpsgdOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'clip', 'batch_size', 'sigma'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.DpsgdOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610')) +paddle.fluid.optimizer.DpsgdOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) +paddle.fluid.optimizer.DpsgdOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f')) +paddle.fluid.optimizer.DpsgdOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.DpsgdOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) +paddle.fluid.optimizer.DpsgdOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) paddle.fluid.optimizer.DecayedAdagradOptimizer ('paddle.fluid.optimizer.DecayedAdagradOptimizer', ('document', 'e76838a8586bf2e58e6b5cdd2f67f780')) paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) 
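[Editor's aside] The DpsgdOptimizer entries above are new in this patch (the DecayedAdagradOptimizer method entries continue below). Its ArgSpec records init parameters (learning_rate, clip, batch_size, sigma) for differentially private SGD. A minimal sketch of the wiring, assuming the usual fluid program-building flow; the toy network and the privacy parameter values are placeholders, not values from the spec:

    import paddle.fluid as fluid

    # Toy regression network; only the optimizer hookup matters here.
    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    pred = fluid.layers.fc(input=x, size=1)
    loss = fluid.layers.mean(fluid.layers.square_error_cost(input=pred, label=y))

    # clip = per-sample gradient clipping bound, batch_size = lot size,
    # sigma = Gaussian noise scale (all three values here are illustrative).
    opt = fluid.optimizer.DpsgdOptimizer(learning_rate=0.01, clip=10.0,
                                         batch_size=16.0, sigma=1.0)
    opt.minimize(loss)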
paddle.fluid.optimizer.DecayedAdagradOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610')) @@ -916,7 +968,7 @@ paddle.fluid.optimizer.RMSPropOptimizer.backward (ArgSpec(args=['self', 'loss', paddle.fluid.optimizer.RMSPropOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.RMSPropOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde')) paddle.fluid.optimizer.RMSPropOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b')) -paddle.fluid.optimizer.AdadeltaOptimizer ('paddle.fluid.optimizer.AdadeltaOptimizer', ('document', 'b5e33fa8aca6cfbcaebfc6cd7742908a')) +paddle.fluid.optimizer.AdadeltaOptimizer ('paddle.fluid.optimizer.AdadeltaOptimizer', ('document', '3f1c5385519a3674c18c3a1ab34ac04f')) paddle.fluid.optimizer.AdadeltaOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'epsilon', 'rho', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, 0.95, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.AdadeltaOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610')) paddle.fluid.optimizer.AdadeltaOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae')) @@ -966,7 +1018,18 @@ paddle.fluid.optimizer.ExponentialMovingAverage.update (ArgSpec(args=['self'], v paddle.fluid.optimizer.PipelineOptimizer ('paddle.fluid.optimizer.PipelineOptimizer', ('document', '6f85382abedb922387b08d98e8d0b69c')) paddle.fluid.optimizer.PipelineOptimizer.__init__ (ArgSpec(args=['self', 'optimizer', 'cut_list', 'place_list', 'concurrency_list', 'queue_size', 'sync_steps', 'start_cpu_core_id'], varargs=None, keywords=None, defaults=(None, None, None, 30, 1, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.optimizer.PipelineOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.backward.append_backward (ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '1b7b2bfb986e93048e75ba69f2f490ab')) +paddle.fluid.optimizer.LookaheadOptimizer ('paddle.fluid.optimizer.LookaheadOptimizer', ('document', 'c291cadfa7452c7bf58b9e2f900a3511')) +paddle.fluid.optimizer.LookaheadOptimizer.__init__ (ArgSpec(args=['self', 'inner_optimizer', 'alpha', 'k'], varargs=None, keywords=None, defaults=(0.5, 5)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.LookaheadOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.RecomputeOptimizer ('paddle.fluid.optimizer.RecomputeOptimizer', ('document', '05769ba1182270f808f85488a50c8caa')) 
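[Editor's aside] The LookaheadOptimizer entries above are likewise new (the RecomputeOptimizer method entries continue below). Per the ArgSpec, Lookahead wraps an inner optimizer and takes only alpha and k; a minimal sketch under that assumption, with an illustrative toy model:

    import paddle.fluid as fluid

    # Toy classifier; the point is the wrapping pattern, not the model.
    x = fluid.layers.data(name='x', shape=[2], dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    y = fluid.layers.fc(input=x, size=2, act='softmax')
    loss = fluid.layers.reduce_mean(
        fluid.layers.cross_entropy(input=y, label=label))

    sgd = fluid.optimizer.SGD(learning_rate=0.01)  # fast inner optimizer
    lookahead = fluid.optimizer.LookaheadOptimizer(sgd, alpha=0.5, k=5)
    lookahead.minimize(loss)  # slow weights sync every k steps, blended by alpha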
+paddle.fluid.optimizer.RecomputeOptimizer.__init__ (ArgSpec(args=['self', 'optimizer'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.RecomputeOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '7838e157ec5ff4f835f814adf3a2b9cc')) +paddle.fluid.optimizer.RecomputeOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'ec8dfa14fcd958d7c196f3d1a0ce6fa7')) +paddle.fluid.optimizer.RecomputeOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks', 'checkpoints'], varargs=None, keywords=None, defaults=(None, None, None, None, None)), ('document', 'a26b3dbb0f63ee81d847d92e9fb942dc')) +paddle.fluid.optimizer.RecomputeOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.optimizer.RecomputeOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '7b2b8ae72011bc4decb67e97623f2c56')) +paddle.fluid.optimizer.RecomputeOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.backward.append_backward (ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks', 'checkpoints'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', '52488008103886c793843a3828bacd5e')) paddle.fluid.backward.gradients (ArgSpec(args=['targets', 'inputs', 'target_gradients', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'e2097e1e0ed84ae44951437bfe269a1b')) paddle.fluid.regularizer.L1DecayRegularizer ('paddle.fluid.regularizer.L1DecayRegularizer', ('document', '34603757e70974d2fcc730643b382925')) paddle.fluid.regularizer.L1DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) @@ -989,15 +1052,16 @@ paddle.fluid.CUDAPlace ('paddle.fluid.core_avx.CUDAPlace', ('document', '6a6cd8e paddle.fluid.CUDAPlace.__init__ __init__(self: paddle.fluid.core_avx.CUDAPlace, arg0: int) -> None paddle.fluid.CUDAPinnedPlace ('paddle.fluid.core_avx.CUDAPinnedPlace', ('document', 'afd58ea5d390b5ea06ca70291a266d45')) paddle.fluid.CUDAPinnedPlace.__init__ __init__(self: paddle.fluid.core_avx.CUDAPinnedPlace) -> None -paddle.fluid.ParamAttr ('paddle.fluid.param_attr.ParamAttr', ('document', 'fa47fa251f727c4a4f638d61e3c7c141')) -paddle.fluid.ParamAttr.__init__ (ArgSpec(args=['self', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, 1.0, None, True, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.WeightNormParamAttr ('paddle.fluid.param_attr.WeightNormParamAttr', ('document', '48ab4f49c7eeeade5958b731b6a96aa0')) +paddle.fluid.ParamAttr ('paddle.fluid.param_attr.ParamAttr', ('document', 'cd667b4ee96d7d6fca40aa722d67d744')) +paddle.fluid.ParamAttr.__init__ (ArgSpec(args=['self', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, 
keywords=None, defaults=(None, None, 1.0, None, True, None, True)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.WeightNormParamAttr ('paddle.fluid.param_attr.WeightNormParamAttr', ('document', 'b5ae1698ea72d5a9428000b916a67379')) paddle.fluid.WeightNormParamAttr.__init__ (ArgSpec(args=['self', 'dim', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, None, 1.0, None, True, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.DataFeeder ('paddle.fluid.data_feeder.DataFeeder', ('document', 'a39802654f20692ad49c340cef7c6556')) +paddle.fluid.DataFeeder ('paddle.fluid.data_feeder.DataFeeder', ('document', 'd9e64be617bd5f49dbb08ac2bc8665e6')) paddle.fluid.DataFeeder.__init__ (ArgSpec(args=['self', 'feed_list', 'place', 'program'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.DataFeeder.decorate_reader (ArgSpec(args=['self', 'reader', 'multi_devices', 'num_places', 'drop_last'], varargs=None, keywords=None, defaults=(None, True)), ('document', '449ec75d35b3498091908714e35e6686')) +paddle.fluid.DataFeeder.decorate_reader (ArgSpec(args=['self', 'reader', 'multi_devices', 'num_places', 'drop_last'], varargs=None, keywords=None, defaults=(None, True)), ('document', 'a0ed5ce816b5d603cb595aacb922335a')) paddle.fluid.DataFeeder.feed (ArgSpec(args=['self', 'iterable'], varargs=None, keywords=None, defaults=None), ('document', 'ce65fe1d81dcd7067d5092a5667f35cc')) paddle.fluid.DataFeeder.feed_parallel (ArgSpec(args=['self', 'iterable', 'num_places'], varargs=None, keywords=None, defaults=(None,)), ('document', '334c6af750941a4397a2dd2ea8a4d76f')) +paddle.fluid.clip.set_gradient_clip (ArgSpec(args=['clip', 'param_list', 'program'], varargs=None, keywords=None, defaults=(None, None)), ('document', '77ca02bb37b70d226510df9cf5e45965')) paddle.fluid.clip.ErrorClipByValue ('paddle.fluid.clip.ErrorClipByValue', ('document', 'e6f815a03be88dee2537707d9e6b9209')) paddle.fluid.clip.ErrorClipByValue.__init__ (ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.clip.GradientClipByValue ('paddle.fluid.clip.GradientClipByValue', ('document', 'b7a22f687269cae0c338ef3866322db7')) @@ -1012,7 +1076,7 @@ paddle.fluid.dygraph_grad_clip.GradClipByNorm ('paddle.fluid.dygraph_grad_clip.G paddle.fluid.dygraph_grad_clip.GradClipByNorm.__init__ (ArgSpec(args=['self', 'clip_norm'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.dygraph_grad_clip.GradClipByGlobalNorm ('paddle.fluid.dygraph_grad_clip.GradClipByGlobalNorm', ('document', 'd1872377e7d7a5fe0dd2e8c42e4c9656')) paddle.fluid.dygraph_grad_clip.GradClipByGlobalNorm.__init__ (ArgSpec(args=['self', 'max_global_norm'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.profiler.cuda_profiler (ArgSpec(args=['output_file', 'output_mode', 'config'], varargs=None, keywords=None, defaults=(None, None)), ('document', '49f5db5da13cfd8c069754dd11be3901')) +paddle.fluid.profiler.cuda_profiler (ArgSpec(args=['output_file', 'output_mode', 'config'], varargs=None, keywords=None, defaults=(None, None)), ('document', '4053b45953807a24e28027dc86829d6c')) paddle.fluid.profiler.reset_profiler (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 
'fd1f25a7a06516ca9a1f4ab0783a4d70')) paddle.fluid.profiler.profiler (ArgSpec(args=['state', 'sorted_key', 'profile_path'], varargs=None, keywords=None, defaults=(None, '/tmp/profile')), ('document', 'a2be24e028dffa06ab28cc55a27c59e4')) paddle.fluid.profiler.start_profiler (ArgSpec(args=['state'], varargs=None, keywords=None, defaults=None), ('document', '4c192ea399e6e80b1ab47a8265b022a5')) @@ -1020,24 +1084,5 @@ paddle.fluid.profiler.stop_profiler (ArgSpec(args=['sorted_key', 'profile_path'] paddle.fluid.unique_name.generate (ArgSpec(args=['key'], varargs=None, keywords=None, defaults=None), ('document', '4d68cde4c4df8f1b8018620b4dc19b42')) paddle.fluid.unique_name.switch (ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)), ('document', '695a6e91afbcdbafac69a069038811be')) paddle.fluid.unique_name.guard (ArgSpec(args=['new_generator'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ead717d6d440a1eb11971695cd1727f4')) -paddle.fluid.recordio_writer.convert_reader_to_recordio_file (ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)), ('document', '65c7523e86f0c50bb729b01667f36310')) -paddle.fluid.recordio_writer.convert_reader_to_recordio_files (ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)), ('document', 'bc643f0f5f1b9db57ff0d8a57d379bd7')) paddle.fluid.Scope Scope() -> paddle.fluid.core_avx._Scope paddle.fluid.install_check.run_check (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '66b7c84a17ed32fec2df9628367be2b9')) -paddle.reader.cache (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', '1676886070eb607cb608f7ba47be0d3c')) -paddle.reader.map_readers (ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None), ('document', '77cbadb09df588e21e5cc0819b69c87d')) -paddle.reader.buffered (ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None), ('document', '0d6186f109feceb99f60ec50a0a624cb')) -paddle.reader.compose (ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None), ('document', '884291104e1c3f37f33aae44b7deeb0d')) -paddle.reader.chain (ArgSpec(args=[], varargs='readers', keywords=None, defaults=None), ('document', 'd22c34e379a53901ae67a6bca7f4def4')) -paddle.reader.shuffle (ArgSpec(args=['reader', 'buf_size'], varargs=None, keywords=None, defaults=None), ('document', 'e42ea6fee23ce26b23cb142cd1d6522d')) -paddle.reader.firstn (ArgSpec(args=['reader', 'n'], varargs=None, keywords=None, defaults=None), ('document', 'c5bb8f7dd4f917f1569a368aab5b8aad')) -paddle.reader.xmap_readers (ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,)), ('document', '9c804a42f8a4dbaa76b3c98e0ab7f796')) -paddle.reader.PipeReader ('paddle.reader.decorator.PipeReader', ('document', 'd3c250618f98c1a5fb646f869016a98e')) -paddle.reader.PipeReader.__init__ (ArgSpec(args=['self', 'command', 'bufsize', 'file_type'], varargs=None, keywords=None, defaults=(8192, 'plain')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.reader.PipeReader.get_line (ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n')), ('document', '9621ae612e595b6c34eb3bb5f3eb1a45')) -paddle.reader.multiprocess_reader 
(ArgSpec(args=['readers', 'use_pipe', 'queue_size'], varargs=None, keywords=None, defaults=(True, 1000)), ('document', '7d8b3a96e592107c893d5d51ce968ba0')) -paddle.reader.Fake ('paddle.reader.decorator.Fake', ('document', '0d8f4847b99bed6d456ade0d903202e1')) -paddle.reader.Fake.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.reader.creator.np_array (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '28d457fbc9a71efa4ac91a3be179cada')) -paddle.reader.creator.text_file (ArgSpec(args=['path'], varargs=None, keywords=None, defaults=None), ('document', 'f45fcb7add066c8e042c6774fc7c3db2')) -paddle.reader.creator.recordio (ArgSpec(args=['paths', 'buf_size'], varargs=None, keywords=None, defaults=(100,)), ('document', 'b4a94ee0e2cefb495619275c2f8c61d2')) diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index 595454e9..16457b56 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -4,7 +4,6 @@ add_subdirectory(framework) add_subdirectory(imperative) add_subdirectory(operators) add_subdirectory(string) -add_subdirectory(recordio) add_subdirectory(pybind) # NOTE: please add subdirectory inference at last. diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 5dc6e74b..51efe60c 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -63,7 +63,7 @@ if(WITH_GPU) else() cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor) endif() -cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version) +cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version) cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory) nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor) @@ -123,8 +123,8 @@ cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_co cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto device_context) cc_library(op_kernel_type SRCS op_kernel_type.cc DEPS device_context place) -cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog - shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type) +cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog data_feed_proto + shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context) @@ -133,7 +133,9 @@ cc_test(version_test SRCS version_test.cc DEPS version) cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version) -cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc memory_optimize_helper) +cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) + +cc_library(op_call_stack SRCS op_call_stack.cc DEPS op_proto_maker enforce) nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) @@ -193,18 +195,17 @@ else() cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) endif() -target_link_libraries(executor while_op_helper executor_gc_helper recurrent_op_helper) +target_link_libraries(executor while_op_helper 
executor_gc_helper recurrent_op_helper conditional_block_op_helper) cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor async_ssa_graph_executor graph build_strategy fast_threaded_ssa_graph_executor variable_helper) -cc_library(prune SRCS prune.cc DEPS framework_proto) +cc_library(prune SRCS prune.cc DEPS framework_proto boost) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry proto_desc) -cc_test(inplace_op_inference_test SRCS inplace_op_inference_test.cc DEPS inplace_op_pass op_registry proto_desc op_info memory_optimize_helper pass_builder) cc_library(selected_rows SRCS selected_rows.cc DEPS tensor) cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) @@ -222,6 +223,9 @@ endif (NOT WIN32) cc_library(dlpack_tensor SRCS dlpack_tensor.cc DEPS tensor dlpack) cc_test(dlpack_tensor_test SRCS dlpack_tensor_test.cc DEPS dlpack_tensor glog) +cc_library(op_compatible_info SRCS op_compatible_info DEPS string_helper) +cc_test(op_compatible_info_test SRCS op_compatible_info_test.cc DEPS op_compatible_info string_helper glog) + # Get the current working branch execute_process( COMMAND git rev-parse --abbrev-ref HEAD diff --git a/paddle/fluid/framework/archive.h b/paddle/fluid/framework/archive.h old mode 100755 new mode 100644 index 30b5e0e8..73fcc742 --- a/paddle/fluid/framework/archive.h +++ b/paddle/fluid/framework/archive.h @@ -168,10 +168,10 @@ class ArchiveBase { #else if (newsize > Capacity()) { #endif - Reserve(std::max(Capacity() * 2, newsize)); + Reserve((std::max)(Capacity() * 2, newsize)); } finish_ = buffer_ + newsize; - cursor_ = std::min(cursor_, finish_); + cursor_ = (std::min)(cursor_, finish_); } void Reserve(size_t newcap) { @@ -207,7 +207,7 @@ class ArchiveBase { #else if (size > size_t(limit_ - finish_)) { #endif - Reserve(std::max(Capacity() * 2, Length() + size)); + Reserve((std::max)(Capacity() * 2, Length() + size)); } } @@ -311,6 +311,18 @@ class Archive : public ArchiveBase { *this >> x; return x; } + + template + void Printf(const char* fmt, ARGS&&... args) { + size_t temp = Limit() - Finish(); + int len = snprintf(Finish(), temp, fmt, args...); + CHECK(len >= 0); // NOLINT + if ((size_t)len >= temp) { + PrepareWrite(len + 1); + CHECK(snprintf(Finish(), (size_t)len + 1, fmt, args...) 
== len); + } + AdvanceFinish(len); + } }; template @@ -518,11 +530,11 @@ Archive& operator>>(Archive& ar, std::tuple& x) { } \ template \ Archive& operator>>(Archive& ar, MAP_TYPE& p) { \ - size_t size = ar.template Get(); \ + size_t size = ar.template get(); \ p.clear(); \ RESERVE_STATEMENT; \ for (size_t i = 0; i < size; i++) { \ - p.insert(ar.template Get>()); \ + p.insert(ar.template get>()); \ } \ return ar; \ } @@ -539,11 +551,11 @@ Archive& operator>>(Archive& ar, std::tuple& x) { } \ template \ Archive& operator>>(Archive& ar, MAP_TYPE& p) { \ - size_t size = ar.template Get(); \ + size_t size = ar.template get(); \ p.clear(); \ RESERVE_STATEMENT; \ for (size_t i = 0; i < size; i++) { \ - p.insert(ar.template Get>()); \ + p.insert(ar.template get>()); \ } \ return ar; \ } @@ -568,11 +580,11 @@ ARCHIVE_REPEAT(std::unordered_multimap, p.reserve(size)) } \ template \ Archive& operator>>(Archive& ar, SET_TYPE& p) { \ - size_t size = ar.template Get(); \ + size_t size = ar.template get(); \ p.clear(); \ RESERVE_STATEMENT; \ for (size_t i = 0; i < size; i++) { \ - p.insert(ar.template Get()); \ + p.insert(ar.template get()); \ } \ return ar; \ } @@ -588,11 +600,11 @@ ARCHIVE_REPEAT(std::unordered_multimap, p.reserve(size)) } \ template \ Archive& operator>>(Archive& ar, SET_TYPE& p) { \ - size_t size = ar.template Get(); \ + size_t size = ar.template get(); \ p.clear(); \ RESERVE_STATEMENT; \ for (size_t i = 0; i < size; i++) { \ - p.insert(ar.template Get()); \ + p.insert(ar.template get()); \ } \ return ar; \ } diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h index f0658ed1..d186ef12 100644 --- a/paddle/fluid/framework/channel.h +++ b/paddle/fluid/framework/channel.h @@ -40,7 +40,7 @@ class ChannelObject { // capacity can be zero explicit ChannelObject(size_t capacity) { - capacity_ = std::min(MaxCapacity(), capacity); + capacity_ = (std::min)(MaxCapacity(), capacity); } void Clear() { @@ -192,7 +192,7 @@ class ChannelObject { std::condition_variable full_cond_; static constexpr size_t MaxCapacity() { - return std::numeric_limits::max() / 2; + return (std::numeric_limits::max)() / 2; } void Notify() { @@ -289,7 +289,7 @@ template using Channel = std::shared_ptr>; template -Channel MakeChannel(size_t capacity = std::numeric_limits::max()) { +Channel MakeChannel(size_t capacity = (std::numeric_limits::max)()) { return std::make_shared>(capacity); } @@ -332,7 +332,7 @@ class ChannelReader { } if (cursor_ >= buffer_.size()) { cursor_ = 0; - if (channel_->Read(buffer_) == 0) { + if (channel_->read(buffer_) == 0) { failed_ = true; return *this; } @@ -370,7 +370,7 @@ class ChannelWriter { void Reset(ChannelObject* channel) { CHECK(buffer_.empty()) << "Forgot to flush"; - CHECK(channel != nullptr) << "Channel can not be nullptr"; + // CHECK(channel != nullptr) << "Channel can not be nullptr"; channel_ = channel; buffer_.clear(); failed_ = !channel; diff --git a/paddle/fluid/framework/commit.h b/paddle/fluid/framework/commit.h deleted file mode 100644 index 343bf82f..00000000 --- a/paddle/fluid/framework/commit.h +++ /dev/null @@ -1,21 +0,0 @@ -#pragma once - -#include - -namespace paddle { -namespace framework { - -static std::string paddle_commit() { - return "95c1816ec0"; -} - -static std::string paddle_compile_branch() { - return "develop"; -} - -static std::string paddle_version() { - return "0.0.0"; -} - -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc old mode 100755 new 
mode 100644 index ed94e30e..bfeb2977 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -33,11 +33,53 @@ limitations under the License. */ #include "io/shell.h" #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/fleet/fleet_wrapper.h" #include "paddle/fluid/platform/timer.h" namespace paddle { namespace framework { +void RecordCandidateList::ReSize(size_t length) { + _mutex.lock(); + _capacity = length; + CHECK(_capacity > 0); // NOLINT + _candidate_list.clear(); + _candidate_list.resize(_capacity); + _full = false; + _cur_size = 0; + _total_size = 0; + _mutex.unlock(); +} + +void RecordCandidateList::ReInit() { + _mutex.lock(); + _full = false; + _cur_size = 0; + _total_size = 0; + _mutex.unlock(); +} + +void RecordCandidateList::AddAndGet(const Record& record, + RecordCandidate* result) { + _mutex.lock(); + size_t index = 0; + ++_total_size; + auto fleet_ptr = FleetWrapper::GetInstance(); + if (!_full) { + _candidate_list[_cur_size++] = record; + _full = (_cur_size == _capacity); + } else { + CHECK(_cur_size == _capacity); + index = fleet_ptr->LocalRandomEngine()() % _total_size; + if (index < _capacity) { + _candidate_list[index] = record; + } + } + index = fleet_ptr->LocalRandomEngine()() % _cur_size; + *result = _candidate_list[index]; + _mutex.unlock(); +} + void DataFeed::AddFeedVar(Variable* var, const std::string& name) { CheckInit(); for (size_t i = 0; i < use_slots_.size(); ++i) { @@ -101,11 +143,24 @@ void DataFeed::AssignFeedVar(const Scope& scope) { } } +void DataFeed::CopyToFeedTensor(void* dst, const void* src, size_t size) { + if (platform::is_cpu_place(this->place_)) { + memcpy(dst, src, size); + } else { +#ifdef PADDLE_WITH_CUDA + cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice); +#else + PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option"); +#endif + } +} + template void PrivateQueueDataFeed::SetQueueSize(int queue_size) { PADDLE_ENFORCE(queue_size > 0, "Illegal queue size: %d.", queue_size); queue_size_ = queue_size; queue_ = paddle::framework::MakeChannel(); + queue_->SetCapacity(queue_size); } template @@ -169,6 +224,7 @@ InMemoryDataFeed::InMemoryDataFeed() { this->thread_id_ = 0; this->thread_num_ = 1; this->parse_ins_id_ = false; + this->parse_content_ = false; this->input_channel_ = nullptr; this->output_channel_ = nullptr; this->consume_channel_ = nullptr; @@ -252,6 +308,11 @@ void InMemoryDataFeed::SetThreadNum(int thread_num) { thread_num_ = thread_num; } +template +void InMemoryDataFeed::SetParseContent(bool parse_content) { + parse_content_ = parse_content; +} + template void InMemoryDataFeed::SetParseInsId(bool parse_ins_id) { parse_ins_id_ = parse_ins_id; @@ -301,7 +362,8 @@ void MultiSlotDataFeed::Init( paddle::framework::MultiSlotDesc multi_slot_desc = data_feed_desc.multi_slot_desc(); SetBatchSize(data_feed_desc.batch_size()); - SetQueueSize(data_feed_desc.batch_size()); + // temporarily set queue size = batch size * 100 + SetQueueSize(data_feed_desc.batch_size() * 100); size_t all_slot_num = multi_slot_desc.slots_size(); all_slots_.resize(all_slot_num); all_slots_type_.resize(all_slot_num); @@ -610,15 +672,16 @@ void MultiSlotDataFeed::PutToFeedVec( if (type[0] == 'f') { // float const auto& feasign = ins_vec[i].GetFloatData(); - float* tensor_ptr = feed_vec_[i]->mutable_data( - {total_instance, 1}, platform::CPUPlace()); - memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float)); + float* tensor_ptr = + 
feed_vec_[i]->mutable_data({total_instance, 1}, this->place_); + CopyToFeedTensor(tensor_ptr, &feasign[0], total_instance * sizeof(float)); } else if (type[0] == 'u') { // uint64 // no uint64_t type in paddlepaddle const auto& feasign = ins_vec[i].GetUint64Data(); int64_t* tensor_ptr = feed_vec_[i]->mutable_data( - {total_instance, 1}, platform::CPUPlace()); - memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t)); + {total_instance, 1}, this->place_); + CopyToFeedTensor(tensor_ptr, &feasign[0], + total_instance * sizeof(int64_t)); } LoD data_lod{offset}; @@ -709,6 +772,18 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstanceFromPipe(Record* instance) { pos += len + 1; VLOG(3) << "ins_id " << instance->ins_id_; } + if (parse_content_) { + int num = strtol(&str[pos], &endptr, 10); + CHECK(num == 1); // NOLINT + pos = endptr - str + 1; + size_t len = 0; + while (str[pos + len] != ' ') { + ++len; + } + instance->content_ = std::string(str + pos, len); + pos += len + 1; + VLOG(3) << "content " << instance->content_; + } for (size_t i = 0; i < use_slots_index_.size(); ++i) { int idx = use_slots_index_[i]; int num = strtol(&str[pos], &endptr, 10); @@ -833,8 +908,14 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec( std::vector> offset(use_slots_.size(), std::vector{0}); std::vector visit(use_slots_.size(), false); + ins_content_vec_.clear(); + ins_content_vec_.reserve(ins_vec.size()); + ins_id_vec_.clear(); + ins_id_vec_.reserve(ins_vec.size()); for (size_t i = 0; i < ins_vec.size(); ++i) { auto& r = ins_vec[i]; + ins_id_vec_.push_back(r.ins_id_); + ins_content_vec_.push_back(r.content_); for (auto& item : r.float_feasigns_) { batch_float_feasigns[item.slot()].push_back(item.sign().float_feasign_); visit[item.slot()] = true; @@ -872,15 +953,15 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec( const auto& type = all_slots_type_[i]; if (type[0] == 'f') { // float float* feasign = batch_float_feasigns[i].data(); - float* tensor_ptr = feed_vec_[i]->mutable_data( - {total_instance, 1}, platform::CPUPlace()); - memcpy(tensor_ptr, feasign, total_instance * sizeof(float)); + float* tensor_ptr = + feed_vec_[i]->mutable_data({total_instance, 1}, this->place_); + CopyToFeedTensor(tensor_ptr, feasign, total_instance * sizeof(float)); } else if (type[0] == 'u') { // uint64 // no uint64_t type in paddlepaddle uint64_t* feasign = batch_uint64_feasigns[i].data(); int64_t* tensor_ptr = feed_vec_[i]->mutable_data( - {total_instance, 1}, platform::CPUPlace()); - memcpy(tensor_ptr, feasign, total_instance * sizeof(int64_t)); + {total_instance, 1}, this->place_); + CopyToFeedTensor(tensor_ptr, feasign, total_instance * sizeof(int64_t)); } auto& slot_offset = offset[i]; LoD data_lod{slot_offset}; @@ -906,15 +987,16 @@ void PrivateInstantDataFeed::PutToFeedVec() { if (type[0] == 'f') { // float const auto& feasign = ins_vec_[i].GetFloatData(); - float* tensor_ptr = feed_vec_[i]->mutable_data( - {total_instance, 1}, platform::CPUPlace()); - memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float)); + float* tensor_ptr = + feed_vec_[i]->mutable_data({total_instance, 1}, this->place_); + CopyToFeedTensor(tensor_ptr, &feasign[0], total_instance * sizeof(float)); } else if (type[0] == 'u') { // uint64 // no uint64_t type in paddlepaddle const auto& feasign = ins_vec_[i].GetUint64Data(); int64_t* tensor_ptr = feed_vec_[i]->mutable_data( - {total_instance, 1}, platform::CPUPlace()); - memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t)); + {total_instance, 1}, this->place_); + 
CopyToFeedTensor(tensor_ptr, &feasign[0], + total_instance * sizeof(int64_t)); } LoD data_lod{offset}; diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index 7164834c..9ea9be41 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -26,6 +26,7 @@ limitations under the License. */ #include #include #include // NOLINT +#include #include #include @@ -104,13 +105,25 @@ class DataFeed { virtual void SetThreadNum(int thread_num) {} // This function will do nothing at default virtual void SetParseInsId(bool parse_ins_id) {} + virtual void SetParseContent(bool parse_content) {} virtual void SetFileListMutex(std::mutex* mutex) { mutex_for_pick_file_ = mutex; } virtual void SetFileListIndex(size_t* file_index) { file_idx_ = file_index; } + virtual const std::vector& GetInsIdVec() const { + return ins_id_vec_; + } + virtual const std::vector& GetInsContentVec() const { + return ins_content_vec_; + } + virtual int GetCurBatchSize() { return batch_size_; } virtual void LoadIntoMemory() { PADDLE_THROW("This function(LoadIntoMemory) is not implemented."); } + virtual void SetPlace(const paddle::platform::Place& place) { + place_ = place; + } + virtual const paddle::platform::Place& GetPlace() const { return place_; } protected: // The following three functions are used to check if it is executed in this @@ -124,6 +137,7 @@ class DataFeed { // This function is used to pick one file from the global filelist(thread // safe). virtual bool PickOneFile(std::string* filename); + virtual void CopyToFeedTensor(void* dst, const void* src, size_t size); std::vector filelist_; size_t* file_idx_; @@ -158,6 +172,9 @@ class DataFeed { bool finish_set_filelist_; bool finish_start_; std::string pipe_command_; + std::vector ins_id_vec_; + std::vector ins_content_vec_; + platform::Place place_; }; // PrivateQueueDataFeed is the base virtual class for ohther DataFeeds. 
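The new SetPlace()/CopyToFeedTensor() pair above is what lets PutToFeedVec() fill feed tensors that live on either CPU or GPU memory. A minimal standalone sketch of that dispatch, assuming the usual PADDLE_WITH_CUDA build flag; this is illustrative only, not the framework's exact code:

    #include <cstring>
    #include <stdexcept>
    #ifdef PADDLE_WITH_CUDA
    #include <cuda_runtime.h>
    #endif

    // Copy a host-side buffer into a feed tensor that may live on CPU or GPU.
    static void CopyToFeedTensorSketch(void* dst, const void* src, size_t size,
                                       bool is_cpu_place) {
      if (is_cpu_place) {
        std::memcpy(dst, src, size);  // host-to-host copy
      } else {
    #ifdef PADDLE_WITH_CUDA
        cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice);  // host-to-device copy
    #else
        throw std::runtime_error("GPU place requires a WITH_GPU build");
    #endif
      }
    }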
@@ -215,6 +232,7 @@ class InMemoryDataFeed : public DataFeed { virtual void SetThreadId(int thread_id); virtual void SetThreadNum(int thread_num); virtual void SetParseInsId(bool parse_ins_id); + virtual void SetParseContent(bool parse_content); virtual void LoadIntoMemory(); protected: @@ -225,6 +243,7 @@ class InMemoryDataFeed : public DataFeed { int thread_id_; int thread_num_; bool parse_ins_id_; + bool parse_content_; std::ifstream file_; std::shared_ptr fp_; paddle::framework::ChannelObject* input_channel_; @@ -419,6 +438,42 @@ struct Record { std::vector uint64_feasigns_; std::vector float_feasigns_; std::string ins_id_; + std::string content_; +}; + +struct RecordCandidate { + std::string ins_id_; + std::unordered_multimap feas; + + RecordCandidate& operator=(const Record& rec) { + feas.clear(); + ins_id_ = rec.ins_id_; + for (auto& fea : rec.uint64_feasigns_) { + feas.insert({fea.slot(), fea.sign()}); + } + return *this; + } +}; + +class RecordCandidateList { + public: + RecordCandidateList() = default; + RecordCandidateList(const RecordCandidateList&) = delete; + RecordCandidateList& operator=(const RecordCandidateList&) = delete; + + void ReSize(size_t length); + + void ReInit(); + + void AddAndGet(const Record& record, RecordCandidate* result); + + private: + size_t _capacity = 0; + std::mutex _mutex; + bool _full = false; + size_t _cur_size = 0; + size_t _total_size = 0; + std::vector _candidate_list; }; template diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index bbcd3426..fe53c6f9 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -18,7 +18,6 @@ #include "paddle/fluid/operators/math/math_function.h" #ifdef PADDLE_WITH_MKLDNN -#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_reuse.h" #endif @@ -121,28 +120,35 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, const Tensor& in, Tensor* out) { auto in_layout = kernel_type_for_var.data_layout_; auto out_layout = expected_kernel_type.data_layout_; + auto place = expected_kernel_type.place_; PADDLE_ENFORCE( in_layout == DataLayout::kMKLDNN && out_layout != DataLayout::kMKLDNN, "TransDataLayoutFromMKLDNN only supports transform from MKLDNN to " "non-MKLDNN"); + innerTransDataLayoutFromMKLDNN(in_layout, out_layout, in, out, place); +} + +void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout, + const Tensor& in, Tensor* out, + platform::Place place) { #ifdef PADDLE_WITH_MKLDNN - PADDLE_ENFORCE(in.format() != memory::format::format_undef && - in.format() != memory::format::any, - "Input tensor should have specified memory format"); + PADDLE_ENFORCE_NE(in.format(), MKLDNNMemoryFormat::format_undef, + "Input tensor should have specified memory format"); + PADDLE_ENFORCE_NE(in.format(), MKLDNNMemoryFormat::any, + "Input tensor should have specified memory format"); // Set default as NCHW in case not specified out_layout = out_layout == DataLayout::kAnyLayout ? 
DataLayout::kNCHW : out_layout; auto& pool = platform::DeviceContextPool::Instance(); - auto* dev_ctx = dynamic_cast( - pool.Get(expected_kernel_type.place_)); + auto* dev_ctx = dynamic_cast(pool.Get(place)); auto& cpu_engine = dev_ctx->GetEngine(); - std::vector in_tz = paddle::framework::vectorize2int(in.dims()); - std::vector out_tz = in_tz; + auto in_tz = paddle::framework::vectorize(in.dims()); + auto out_tz = in_tz; memory::data_type in_type = ToMKLDNNDataType(in.type()); PADDLE_ENFORCE(in_type != memory::data_type::data_undef, @@ -157,15 +163,15 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, if (in_format != out_format) { void* in_data = GetDataFromTensor(in, in_type); - const std::string key = platform::ReorderMKLDNNHandler::GetHash( - in_tz, in_format, out_format, std::to_string(in_type)); + const std::string key = platform::CreateKey(in_tz, in_format, out_format, + std::to_string(in_type)); platform::ReorderMKLDNNHandler handler(in_tz, in.type(), in_type, *dev_ctx, cpu_engine, key); auto reorder_src_memory_p = handler.AcquireSrcMemory(in_format, in_data); auto reorder_dst_memory_p = - handler.AcquireDstMemory(out, out_format, expected_kernel_type.place_); + handler.AcquireDstMemory(out, out_format, place); auto reorder_p = handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p); @@ -177,7 +183,7 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, } out->set_layout(out_layout); // reset format since the out tensor will be feed to non-MKLDNN OPkernel - out->set_format(memory::format::format_undef); + out->set_format(MKLDNNMemoryFormat::format_undef); #endif } diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h index 2c0a34b8..d67ea1e5 100644 --- a/paddle/fluid/framework/data_layout_transform.h +++ b/paddle/fluid/framework/data_layout_transform.h @@ -21,30 +21,33 @@ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + namespace paddle { namespace framework { #ifdef PADDLE_WITH_MKLDNN -using MKLDNNFormat = mkldnn::memory::format; using MKLDNNDataType = mkldnn::memory::data_type; -inline MKLDNNFormat ToMKLDNNFormat(const DataLayout& layout) { +inline MKLDNNMemoryFormat ToMKLDNNFormat(const DataLayout& layout) { switch (layout) { case DataLayout::kNHWC: - return MKLDNNFormat::nhwc; + return MKLDNNMemoryFormat::nhwc; case DataLayout::kNCHW: - return MKLDNNFormat::nchw; + return MKLDNNMemoryFormat::nchw; default: PADDLE_THROW("Fail to convert layout %s to MKLDNN format", DataLayoutToString(layout)); } } -inline DataLayout ToPaddleLayout(const MKLDNNFormat& format) { +inline DataLayout ToPaddleLayout(const MKLDNNMemoryFormat& format) { switch (format) { - case MKLDNNFormat::nhwc: + case MKLDNNMemoryFormat::nhwc: return DataLayout::kNHWC; - case MKLDNNFormat::nchw: + case MKLDNNMemoryFormat::nchw: return DataLayout::kNCHW; default: PADDLE_THROW("Fail to convert MKLDNN format to paddle layout"); @@ -69,6 +72,10 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, const OpKernelType& expected_kernel_type, const Tensor& in, Tensor* out); +void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, DataLayout out_layout, + const Tensor& in, Tensor* out, + platform::Place place); + std::vector GetAxis(const DataLayout& from, const DataLayout& to); void TransDataLayout(const OpKernelType& kernel_type_for_var, diff --git 
a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc index f0c8ccc2..541b3fed 100644 --- a/paddle/fluid/framework/data_set.cc +++ b/paddle/fluid/framework/data_set.cc @@ -42,12 +42,16 @@ DatasetImpl::DatasetImpl() { channel_num_ = 1; file_idx_ = 0; cur_channel_ = 0; - fleet_send_batch_size_ = 80000; - fleet_send_sleep_seconds_ = 2; + fleet_send_batch_size_ = 1024; + fleet_send_sleep_seconds_ = 0; merge_by_insid_ = false; erase_duplicate_feas_ = true; keep_unmerged_ins_ = true; min_merge_size_ = 2; + parse_ins_id_ = false; + parse_content_ = false; + preload_thread_num_ = 0; + global_index_ = 0; } // set filelist, file_idx_ will reset to zero. @@ -103,17 +107,36 @@ void DatasetImpl::SetChannelNum(int channel_num) { channel_num_ = channel_num; } +template +void DatasetImpl::SetParseInsId(bool parse_ins_id) { + parse_ins_id_ = parse_ins_id; +} + +template +void DatasetImpl::SetParseContent(bool parse_content) { + parse_content_ = parse_content; +} + template void DatasetImpl::SetMergeByInsId( const std::vector& merge_slot_list, bool erase_duplicate_feas, int min_merge_size, bool keep_unmerged_ins) { merge_by_insid_ = true; + parse_ins_id_ = true; merge_slots_list_ = merge_slot_list; erase_duplicate_feas_ = erase_duplicate_feas; min_merge_size_ = min_merge_size; keep_unmerged_ins_ = keep_unmerged_ins; } +template +void DatasetImpl::SetFeaEval(bool fea_eval, int record_candidate_size) { + slots_shuffle_fea_eval_ = fea_eval; + slots_shuffle_rclist_.ReSize(record_candidate_size); + VLOG(3) << "SetFeaEval fea eval mode: " << fea_eval + << " with record candidate size: " << record_candidate_size; +} + template std::vector DatasetImpl::GetReaders() { std::vector ret; @@ -182,10 +205,21 @@ void DatasetImpl::LoadIntoMemory() { template void DatasetImpl::PreLoadIntoMemory() { VLOG(3) << "DatasetImpl::PreLoadIntoMemory() begin"; - preload_threads_.clear(); - for (int64_t i = 0; i < thread_num_; ++i) { - preload_threads_.push_back(std::thread( - &paddle::framework::DataFeed::LoadIntoMemory, readers_[i].get())); + if (preload_thread_num_ != 0) { + CHECK(preload_thread_num_ == preload_readers_.size()); + preload_threads_.clear(); + for (int64_t i = 0; i < preload_thread_num_; ++i) { + preload_threads_.push_back( + std::thread(&paddle::framework::DataFeed::LoadIntoMemory, + preload_readers_[i].get())); + } + } else { + CHECK(thread_num_ == readers_.size()); + preload_threads_.clear(); + for (int64_t i = 0; i < thread_num_; ++i) { + preload_threads_.push_back(std::thread( + &paddle::framework::DataFeed::LoadIntoMemory, readers_[i].get())); + } } VLOG(3) << "DatasetImpl::PreLoadIntoMemory() end"; } @@ -258,7 +292,7 @@ void DatasetImpl::LocalShuffle() { } template -void DatasetImpl::GlobalShuffle() { +void DatasetImpl::GlobalShuffle(int thread_num) { VLOG(3) << "DatasetImpl::GlobalShuffle() begin"; platform::Timer timeline; timeline.Start(); @@ -325,13 +359,21 @@ void DatasetImpl::GlobalShuffle() { ars.shrink_to_fit(); data.clear(); data.shrink_to_fit(); - sleep(this->fleet_send_sleep_seconds_); + // Currently we find the bottleneck is that the server cannot handle large + // data in time, so we can remove this sleep, set fleet_send_batch_size to + // 1024, and set the server thread number to 24.
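// Hedged usage sketch: with the new constructor defaults above
// (fleet_send_batch_size_ = 1024, fleet_send_sleep_seconds_ = 0) the send
// loop no longer sleeps between batches; throttling can be re-enabled via
// the SetFleetSendSleepSeconds(int) setter added later in this file, e.g.
//   dataset->SetFleetSendSleepSeconds(2);  // sleep 2s between sends again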
+ if (fleet_send_sleep_seconds_ != 0) { + sleep(this->fleet_send_sleep_seconds_); + } } }; - VLOG(3) << "start global shuffle threads"; std::vector global_shuffle_threads; - for (int i = 0; i < thread_num_; ++i) { + if (thread_num == -1) { + thread_num = thread_num_; + } + VLOG(3) << "start global shuffle threads, num = " << thread_num; + for (int i = 0; i < thread_num; ++i) { global_shuffle_threads.push_back(std::thread(global_shuffle_func)); } for (std::thread& t : global_shuffle_threads) { @@ -345,6 +387,101 @@ void DatasetImpl::GlobalShuffle() { << timeline.ElapsedSec() << " seconds"; } +template +void DatasetImpl::DynamicAdjustChannelNum(int channel_num) { + if (channel_num_ == channel_num) { + VLOG(3) << "DatasetImpl::DynamicAdjustChannelNum channel_num_=" + << channel_num_ << ", channel_num_=channel_num, no need to adjust"; + return; + } + VLOG(3) << "adjust channel num from " << channel_num_ << " to " + << channel_num; + channel_num_ = channel_num; + std::vector>* origin_channels = nullptr; + std::vector>* other_channels = nullptr; + // find out which channel (output or consume) has data + int cur_channel = 0; + uint64_t output_channels_data_size = 0; + uint64_t consume_channels_data_size = 0; + CHECK(multi_output_channel_.size() == multi_consume_channel_.size()); + for (int i = 0; i < multi_output_channel_.size(); ++i) { + output_channels_data_size += multi_output_channel_[i]->Size(); + consume_channels_data_size += multi_consume_channel_[i]->Size(); + } + if (output_channels_data_size != 0) { + CHECK(consume_channels_data_size == 0); // NOLINT + cur_channel = 0; + } else { + CHECK(output_channels_data_size == 0); // NOLINT + cur_channel = 1; + } + if (cur_channel == 0) { + origin_channels = &multi_output_channel_; + other_channels = &multi_consume_channel_; + } else { + origin_channels = &multi_consume_channel_; + other_channels = &multi_output_channel_; + } + CHECK(origin_channels != nullptr); // NOLINT + CHECK(other_channels != nullptr); // NOLINT + + paddle::framework::Channel total_data_channel = + paddle::framework::MakeChannel(); + std::vector> new_channels; + std::vector> new_other_channels; + std::vector local_vec; + for (int i = 0; i < origin_channels->size(); ++i) { + local_vec.clear(); + (*origin_channels)[i]->Close(); + (*origin_channels)[i]->ReadAll(local_vec); + total_data_channel->Write(std::move(local_vec)); + } + total_data_channel->Close(); + total_data_channel->SetBlockSize(total_data_channel->Size() / channel_num + + 1); + + for (int i = 0; i < channel_num; ++i) { + local_vec.clear(); + total_data_channel->Read(local_vec); + new_other_channels.push_back(paddle::framework::MakeChannel()); + new_channels.push_back(paddle::framework::MakeChannel()); + new_channels[i]->Write(std::move(local_vec)); + } + + total_data_channel->Clear(); + origin_channels->clear(); + other_channels->clear(); + *origin_channels = new_channels; + *other_channels = new_other_channels; + + new_channels.clear(); + new_other_channels.clear(); + std::vector>().swap(new_channels); + std::vector>().swap(new_other_channels); + local_vec.clear(); + std::vector().swap(local_vec); + VLOG(3) << "adjust channel num done"; +} + +template +void DatasetImpl::DynamicAdjustReadersNum(int thread_num) { + if (thread_num_ == thread_num) { + VLOG(3) << "DatasetImpl::DynamicAdjustReadersNum thread_num_=" + << thread_num_ << ", thread_num_=thread_num, no need to adjust"; + return; + } + VLOG(3) << "adjust readers num from " << thread_num_ << " to " << thread_num; + thread_num_ = thread_num; + 
std::vector>().swap(readers_); + CreateReaders(); + VLOG(3) << "adjust readers num done"; +} + +template +void DatasetImpl::SetFleetSendSleepSeconds(int seconds) { + fleet_send_sleep_seconds_ = seconds; +} + template void DatasetImpl::CreateReaders() { VLOG(3) << "Calling CreateReaders()"; @@ -352,8 +489,6 @@ void DatasetImpl::CreateReaders() { VLOG(3) << "Filelist size in Dataset: " << filelist_.size(); VLOG(3) << "channel num in Dataset: " << channel_num_; CHECK(thread_num_ > 0) << "thread num should > 0"; - CHECK(thread_num_ <= filelist_.size()) - << "thread num should <= filelist size"; CHECK(channel_num_ > 0) << "channel num should > 0"; CHECK(channel_num_ <= thread_num_) << "channel num should <= thread num"; VLOG(3) << "readers size: " << readers_.size(); @@ -372,7 +507,8 @@ void DatasetImpl::CreateReaders() { readers_[i]->SetFileListMutex(&mutex_for_pick_file_); readers_[i]->SetFileListIndex(&file_idx_); readers_[i]->SetFileList(filelist_); - readers_[i]->SetParseInsId(merge_by_insid_); + readers_[i]->SetParseInsId(parse_ins_id_); + readers_[i]->SetParseContent(parse_content_); if (input_channel_ != nullptr) { readers_[i]->SetInputChannel(input_channel_.get()); } @@ -401,6 +537,47 @@ void DatasetImpl::DestroyReaders() { cur_channel_ = 1 - cur_channel_; } +template +void DatasetImpl::SetPreLoadThreadNum(int thread_num) { + preload_thread_num_ = thread_num; +} + +template +void DatasetImpl::CreatePreLoadReaders() { + VLOG(3) << "Begin CreatePreLoadReaders"; + if (preload_thread_num_ == 0) { + preload_thread_num_ = thread_num_; + } + CHECK(preload_thread_num_ > 0) << "thread num should > 0"; + CHECK(input_channel_ != nullptr); + preload_readers_.clear(); + for (int i = 0; i < preload_thread_num_; ++i) { + preload_readers_.push_back( + DataFeedFactory::CreateDataFeed(data_feed_desc_.name())); + preload_readers_[i]->Init(data_feed_desc_); + preload_readers_[i]->SetThreadId(i); + preload_readers_[i]->SetThreadNum(preload_thread_num_); + preload_readers_[i]->SetFileListMutex(&mutex_for_pick_file_); + preload_readers_[i]->SetFileListIndex(&file_idx_); + preload_readers_[i]->SetFileList(filelist_); + preload_readers_[i]->SetParseInsId(parse_ins_id_); + preload_readers_[i]->SetInputChannel(input_channel_.get()); + preload_readers_[i]->SetOutputChannel(nullptr); + preload_readers_[i]->SetConsumeChannel(nullptr); + } + VLOG(3) << "End CreatePreLoadReaders"; +} + +template +void DatasetImpl::DestroyPreLoadReaders() { + VLOG(3) << "Begin DestroyPreLoadReaders"; + preload_readers_.clear(); + std::vector>().swap( + preload_readers_); + file_idx_ = 0; + VLOG(3) << "End DestroyPreLoadReaders"; +} + template int64_t DatasetImpl::GetMemoryDataSize() { return input_channel_->Size(); @@ -436,7 +613,16 @@ int DatasetImpl::ReceiveFromClient(int msg_type, int client_id, CHECK(ar.Cursor() == ar.Finish()); auto fleet_ptr = FleetWrapper::GetInstance(); - int64_t index = fleet_ptr->LocalRandomEngine()() % channel_num_; + // We do not use a random index here because it doesn't perform well. + // To make sure each channel gets data equally, we just put data into the + // channels one by one.
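// Worked example of the round-robin assignment implemented just below:
// with channel_num_ = 4, successive records get
//   index = global_index_++ % channel_num_  ->  0, 1, 2, 3, 0, 1, ...
// so the per-channel sizes can differ by at most one record.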
+ // int64_t index = fleet_ptr->LocalRandomEngine()() % channel_num_; + int64_t index = 0; + { + std::unique_lock lk(global_index_mutex_); + index = global_index_++; + } + index = index % channel_num_; VLOG(3) << "ramdom index=" << index; multi_output_channel_[index]->Write(std::move(data)); @@ -648,5 +834,167 @@ void MultiSlotDataset::MergeByInsId() { VLOG(3) << "MultiSlotDataset::MergeByInsId end"; } +void MultiSlotDataset::GetRandomData(const std::set& slots_to_replace, + std::vector* result) { + int debug_erase_cnt = 0; + int debug_push_cnt = 0; + auto multi_slot_desc = data_feed_desc_.multi_slot_desc(); + slots_shuffle_rclist_.ReInit(); + for (const auto& rec : slots_shuffle_original_data_) { + RecordCandidate rand_rec; + Record new_rec = rec; + slots_shuffle_rclist_.AddAndGet(rec, &rand_rec); + for (auto it = new_rec.uint64_feasigns_.begin(); + it != new_rec.uint64_feasigns_.end();) { + if (slots_to_replace.find(it->slot()) != slots_to_replace.end()) { + it = new_rec.uint64_feasigns_.erase(it); + debug_erase_cnt += 1; + } else { + ++it; + } + } + for (auto slot : slots_to_replace) { + auto range = rand_rec.feas.equal_range(slot); + for (auto it = range.first; it != range.second; ++it) { + new_rec.uint64_feasigns_.push_back({it->second, it->first}); + debug_push_cnt += 1; + } + } + result->push_back(std::move(new_rec)); + } + VLOG(2) << "erase feasign num: " << debug_erase_cnt + << " repush feasign num: " << debug_push_cnt; +} + +// slots shuffle to input_channel_ with needed-shuffle slots +void MultiSlotDataset::SlotsShuffle( + const std::set& slots_to_replace) { + int out_channel_size = 0; + if (cur_channel_ == 0) { + for (size_t i = 0; i < multi_output_channel_.size(); ++i) { + out_channel_size += multi_output_channel_[i]->Size(); + } + } else { + for (size_t i = 0; i < multi_consume_channel_.size(); ++i) { + out_channel_size += multi_consume_channel_[i]->Size(); + } + } + VLOG(2) << "DatasetImpl::SlotsShuffle() begin with input channel size: " + << input_channel_->Size() + << " output channel size: " << out_channel_size; + if (!slots_shuffle_fea_eval_) { + VLOG(3) << "DatasetImpl::SlotsShuffle() end," + "fea eval mode off, need to set on for slots shuffle"; + return; + } + if ((!input_channel_ || input_channel_->Size() == 0) && + slots_shuffle_original_data_.size() == 0 && out_channel_size == 0) { + VLOG(3) << "DatasetImpl::SlotsShuffle() end, no data to slots shuffle"; + return; + } + platform::Timer timeline; + timeline.Start(); + auto multi_slot_desc = data_feed_desc_.multi_slot_desc(); + std::set index_slots; + for (size_t i = 0; i < multi_slot_desc.slots_size(); ++i) { + std::string cur_slot = multi_slot_desc.slots(i).name(); + if (slots_to_replace.find(cur_slot) != slots_to_replace.end()) { + index_slots.insert(i); + } + } + if (slots_shuffle_original_data_.size() == 0) { + // before first slots shuffle, instances could be in + // input_channel, output_channel or consume_channel + if (input_channel_ && input_channel_->Size() != 0) { + slots_shuffle_original_data_.reserve(input_channel_->Size()); + input_channel_->Close(); + input_channel_->ReadAll(slots_shuffle_original_data_); + } else { + CHECK(out_channel_size > 0); // NOLINT + if (cur_channel_ == 0) { + for (size_t i = 0; i < multi_output_channel_.size(); ++i) { + std::vector vec_data; + multi_output_channel_[i]->Close(); + multi_output_channel_[i]->ReadAll(vec_data); + slots_shuffle_original_data_.reserve( + slots_shuffle_original_data_.size() + vec_data.size()); + slots_shuffle_original_data_.insert( +
slots_shuffle_original_data_.end(), + std::make_move_iterator(vec_data.begin()), + std::make_move_iterator(vec_data.end())); + vec_data.clear(); + vec_data.shrink_to_fit(); + multi_output_channel_[i]->Clear(); + } + } else { + for (size_t i = 0; i < multi_consume_channel_.size(); ++i) { + std::vector vec_data; + multi_consume_channel_[i]->Close(); + multi_consume_channel_[i]->ReadAll(vec_data); + slots_shuffle_original_data_.reserve( + slots_shuffle_original_data_.size() + vec_data.size()); + slots_shuffle_original_data_.insert( + slots_shuffle_original_data_.end(), + std::make_move_iterator(vec_data.begin()), + std::make_move_iterator(vec_data.end())); + vec_data.clear(); + vec_data.shrink_to_fit(); + multi_consume_channel_[i]->Clear(); + } + } + } + } else { + // if already have original data for slots shuffle, clear channel + input_channel_->Clear(); + if (cur_channel_ == 0) { + for (size_t i = 0; i < multi_output_channel_.size(); ++i) { + if (!multi_output_channel_[i]) { + continue; + } + multi_output_channel_[i]->Clear(); + } + } else { + for (size_t i = 0; i < multi_consume_channel_.size(); ++i) { + if (!multi_consume_channel_[i]) { + continue; + } + multi_consume_channel_[i]->Clear(); + } + } + } + int end_size = 0; + if (cur_channel_ == 0) { + for (size_t i = 0; i < multi_output_channel_.size(); ++i) { + if (!multi_output_channel_[i]) { + continue; + } + end_size += multi_output_channel_[i]->Size(); + } + } else { + for (size_t i = 0; i < multi_consume_channel_.size(); ++i) { + if (!multi_consume_channel_[i]) { + continue; + } + end_size += multi_consume_channel_[i]->Size(); + } + } + CHECK(input_channel_->Size() == 0) + << "input channel should be empty before slots shuffle"; + std::vector random_data; + random_data.clear(); + // get slots shuffled random_data + GetRandomData(index_slots, &random_data); + input_channel_->Open(); + input_channel_->Write(std::move(random_data)); + random_data.clear(); + random_data.shrink_to_fit(); + input_channel_->Close(); + + timeline.Pause(); + VLOG(2) << "DatasetImpl::SlotsShuffle() end" + << ", memory data size for slots shuffle=" << input_channel_->Size() + << ", cost time=" << timeline.ElapsedSec() << " seconds"; +} + } // end namespace framework } // end namespace paddle diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h index 3c40a7c0..bcf344d2 100644 --- a/paddle/fluid/framework/data_set.h +++ b/paddle/fluid/framework/data_set.h @@ -17,6 +17,7 @@ #include #include #include // NOLINT +#include #include #include // NOLINT #include @@ -57,10 +58,15 @@ class Dataset { virtual void SetDataFeedDesc(const std::string& data_feed_desc_str) = 0; // set channel num virtual void SetChannelNum(int channel_num) = 0; + // set parse ins id + virtual void SetParseInsId(bool parse_ins_id) = 0; + virtual void SetParseContent(bool parse_content) = 0; // set merge by ins id virtual void SetMergeByInsId(const std::vector& merge_slot_list, bool erase_duplicate_feas, int min_merge_size, bool keep_unmerged_ins) = 0; + // set fea eval mode + virtual void SetFeaEval(bool fea_eval, int record_candidate_size) = 0; // get file list virtual const std::vector& GetFileList() = 0; // get thread num @@ -93,7 +99,11 @@ class Dataset { // local shuffle data virtual void LocalShuffle() = 0; // global shuffle data - virtual void GlobalShuffle() = 0; + virtual void GlobalShuffle(int thread_num = -1) = 0; + // for slots shuffle + virtual void SlotsShuffle(const std::set& slots_to_replace) = 0; + virtual void GetRandomData(const std::set& 
slots_to_replace, + std::vector* result) = 0; // create readers virtual void CreateReaders() = 0; // destroy readers @@ -104,6 +114,17 @@ virtual int64_t GetShuffleDataSize() = 0; // merge by ins id virtual void MergeByInsId() = 0; + // create preload readers + virtual void CreatePreLoadReaders() = 0; + // destroy preload readers after preload is done + virtual void DestroyPreLoadReaders() = 0; + // set preload thread num + virtual void SetPreLoadThreadNum(int thread_num) = 0; + // separate train thread and dataset thread + virtual void DynamicAdjustChannelNum(int channel_num) = 0; + virtual void DynamicAdjustReadersNum(int thread_num) = 0; + // set fleet send sleep seconds + virtual void SetFleetSendSleepSeconds(int seconds) = 0; protected: virtual int ReceiveFromClient(int msg_type, int client_id, @@ -126,13 +147,17 @@ class DatasetImpl : public Dataset { const std::string& fs_ugi); virtual void SetDataFeedDesc(const std::string& data_feed_desc_str); virtual void SetChannelNum(int channel_num); + virtual void SetParseInsId(bool parse_ins_id); + virtual void SetParseContent(bool parse_content); virtual void SetMergeByInsId(const std::vector& merge_slot_list, bool erase_duplicate_feas, int min_merge_size, bool keep_unmerged_ins); + virtual void SetFeaEval(bool fea_eval, int record_candidate_size); virtual const std::vector& GetFileList() { return filelist_; } virtual int GetThreadNum() { return thread_num_; } virtual int GetTrainerNum() { return trainer_num_; } + virtual Channel GetInputChannel() { return input_channel_; } virtual int64_t GetFleetSendBatchSize() { return fleet_send_batch_size_; } virtual std::pair GetHdfsConfig() { return std::make_pair(fs_name_, fs_ugi_); @@ -149,17 +174,27 @@ class DatasetImpl : public Dataset { virtual void WaitPreLoadDone(); virtual void ReleaseMemory(); virtual void LocalShuffle(); - virtual void GlobalShuffle(); + virtual void GlobalShuffle(int thread_num = -1); + virtual void SlotsShuffle(const std::set& slots_to_replace) {} + virtual void GetRandomData(const std::set& slots_to_replace, + std::vector* result) {} virtual void CreateReaders(); virtual void DestroyReaders(); virtual int64_t GetMemoryDataSize(); virtual int64_t GetShuffleDataSize(); virtual void MergeByInsId() {} + virtual void CreatePreLoadReaders(); + virtual void DestroyPreLoadReaders(); + virtual void SetPreLoadThreadNum(int thread_num); + virtual void DynamicAdjustChannelNum(int channel_num); + virtual void DynamicAdjustReadersNum(int thread_num); + virtual void SetFleetSendSleepSeconds(int seconds); protected: virtual int ReceiveFromClient(int msg_type, int client_id, const std::string& msg); std::vector> readers_; + std::vector> preload_readers_; paddle::framework::Channel input_channel_; int channel_num_; std::vector> multi_output_channel_; @@ -168,6 +203,8 @@ // and when finish reading, we set cur_channel = 1 - cur_channel, // so if cur_channel=0, all data are in output_channel, else consume_channel int cur_channel_; + std::vector slots_shuffle_original_data_; + RecordCandidateList slots_shuffle_rclist_; int thread_num_; paddle::framework::DataFeedDesc data_feed_desc_; int trainer_num_; @@ -180,10 +217,16 @@ int64_t fleet_send_sleep_seconds_; std::vector preload_threads_; bool merge_by_insid_; + bool parse_ins_id_; + bool parse_content_; bool erase_duplicate_feas_; bool keep_unmerged_ins_; int min_merge_size_; std::vector merge_slots_list_; + bool slots_shuffle_fea_eval_ = false; + int
preload_thread_num_; + std::mutex global_index_mutex_; + int64_t global_index_ = 0; }; // use std::vector or Record as data type @@ -191,6 +234,9 @@ class MultiSlotDataset : public DatasetImpl { public: MultiSlotDataset() {} virtual void MergeByInsId(); + virtual void SlotsShuffle(const std::set& slots_to_replace); + virtual void GetRandomData(const std::set& slots_to_replace, + std::vector* result); virtual ~MultiSlotDataset() {} }; diff --git a/paddle/fluid/framework/ddim.cc b/paddle/fluid/framework/ddim.cc index d4e46924..b3aaa01d 100644 --- a/paddle/fluid/framework/ddim.cc +++ b/paddle/fluid/framework/ddim.cc @@ -48,22 +48,6 @@ bool DDim::operator==(const DDim& d) const { bool DDim::operator!=(const DDim& d) const { return !(*this == d); } -std::vector vectorize(const DDim& ddim) { - std::vector result(DDim::kMaxRank); - dynamic_dim_assign(ddim.Get(), result.data(), ddim.size()); - result.resize(ddim.size()); - return result; -} - -// NOTE: framework::vectorize converts to type int64_t -// which does not fit cudnn inputs. -std::vector vectorize2int(const DDim& ddim) { - std::vector result(DDim::kMaxRank); - dynamic_dim_assign(ddim.Get(), result.data(), ddim.size()); - result.resize(ddim.size()); - return result; -} - struct ProductVisitor { template inline int64_t operator()(const Dim& dim) { diff --git a/paddle/fluid/framework/ddim.h b/paddle/fluid/framework/ddim.h index bfe3e55a..14824afb 100644 --- a/paddle/fluid/framework/ddim.h +++ b/paddle/fluid/framework/ddim.h @@ -170,8 +170,13 @@ DDim make_ddim(const std::vector& dims); */ DDim make_ddim(std::initializer_list dims); -std::vector vectorize(const DDim& ddim); -std::vector vectorize2int(const DDim& ddim); +template +std::vector vectorize(const DDim& ddim) { + std::vector result(DDim::kMaxRank); + dynamic_dim_assign(ddim.Get(), result.data(), ddim.size()); + result.resize(ddim.size()); + return result; +} int64_t product(const DDim& ddim); diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index c566f0a5..1e87eabc 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -3,7 +3,10 @@ cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) + +cc_library(share_tensor_buffer_functor SRCS share_tensor_buffer_functor.cc DEPS framework_proto scope place operator op_registry) cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry) +cc_library(share_tensor_buffer_op_handle SRCS share_tensor_buffer_op_handle.cc DEPS op_handle_base scope computation_op_handle share_tensor_buffer_functor) cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place operator op_registry) cc_library(fetch_barrier_op_handle SRCS fetch_barrier_op_handle.cc DEPS framework_proto scope place operator op_registry) cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper) @@ -59,12 +62,7 @@ cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope d cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper) -cc_library(share_tensor_buffer_op_handle SRCS share_tensor_buffer_op_handle.cc DEPS op_handle_base scope) - 
-set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass inplace_op_pass buffer_shared_inplace_op_pass) -if (WITH_GPU) - list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass) -endif() +set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass buffer_shared_inplace_op_pass buffer_shared_cross_op_memory_reuse_pass) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS}) cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope @@ -82,18 +80,27 @@ cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_ha device_context broadcast_op_handle) cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory device_context gather_op_handle) -cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_executor.cc DEPS ssa_graph_executor) + +cc_library(scope_buffered_monitor SRCS scope_buffered_monitor.cc DEPS scope profiler selected_rows) +cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_executor.cc DEPS ssa_graph_executor scope_buffered_monitor) #cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory # device_context reduce_op_handle ) cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context) cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fused_broadcast_op_handle) +if(WITH_NGRAPH) + set(NGRAPH_BS_DEPS ngraph) +else() + set(NGRAPH_BS_DEPS) +endif() + cc_library(build_strategy SRCS build_strategy.cc DEPS graph_viz_pass multi_devices_graph_pass multi_devices_graph_print_pass multi_devices_graph_check_pass fuse_elewise_add_act_pass multi_batch_merge_pass fuse_relu_depthwise_conv_pass - memory_optimize_pass lock_free_optimize_pass + lock_free_optimize_pass coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass - fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass record_skip_memory_opt_vars_pass) + fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass + ${NGRAPH_BS_DEPS}) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index f806a4fa..a367772a 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -20,12 +20,9 @@ #include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/profiler.h" -// asynchronous nccl allreduce or synchronous issue: -// https://github.com/PaddlePaddle/Paddle/issues/15049 -DEFINE_bool( - sync_nccl_allreduce, true, - "If set true, will call `cudaStreamSynchronize(nccl_stream)`" - "after allreduce, this mode can get better performance in some scenarios."); +#ifdef PADDLE_WITH_CUDA +DECLARE_bool(sync_nccl_allreduce); +#endif namespace paddle { namespace framework { @@ -43,11 +40,124 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places) - : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {} + : OpHandleBase(node), 
local_scopes_(local_scopes), places_(places) { + PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); +} #endif +void AllReduceOpHandle::RunImpl() { + platform::RecordEvent record_event(Name()); + + WaitInputVarGenerated(); + std::vector inputs = this->Inputs(); + std::vector outputs = this->Outputs(); + auto in_var_handles = DynamicCast(inputs); + auto out_var_handles = DynamicCast(outputs); + AllReduceImpl(in_var_handles, out_var_handles); +} + +void AllReduceOpHandle::AllReduceImpl( + const std::vector &in_var_handles, + const std::vector &out_var_handles) { + size_t num_places = places_.size(); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), num_places, + "The NoDummyInputSize should be equal to the number of places."); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), out_var_handles.size(), + "The NoDummyInputSize and NoDummyOutputSize should be equal."); + PADDLE_ENFORCE_EQ(local_exec_scopes_.size(), num_places); + + std::vector lod_tensor_data; + std::vector places; + lod_tensor_data.reserve(num_places); + places.reserve(num_places); + int64_t numel = -1; + bool is_gpu_place = false; + auto dtype = static_cast(0); + for (size_t i = 0; i < local_exec_scopes_.size(); ++i) { + auto &local_scope = local_exec_scopes_[i]; + auto var = local_scope->FindVar(in_var_handles[i]->name()); + PADDLE_ENFORCE_NOT_NULL(var, "%s is not found in scope.", + in_var_handles[i]->name()); + auto &lod_tensor = var->Get(); + + if (i == 0) { + numel = static_cast(lod_tensor.numel()); + dtype = lod_tensor.type(); + is_gpu_place = platform::is_gpu_place(lod_tensor.place()); + } + PADDLE_ENFORCE_EQ(numel, static_cast(lod_tensor.numel())); + PADDLE_ENFORCE_EQ(dtype, lod_tensor.type()); + PADDLE_ENFORCE_EQ(is_gpu_place, platform::is_gpu_place(lod_tensor.place())); + + lod_tensor_data.emplace_back(lod_tensor.data()); + places.emplace_back(lod_tensor.place()); + + VLOG(10) << "place:" << i << ", input_name:" << in_var_handles[i]->name() + << ", out_name:" << out_var_handles[i]->name(); + + PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(), + "The name of input and output should be equal."); + } + + std::vector grad_var_names; + grad_var_names.reserve(num_places); + for (auto &out_var : out_var_handles) { + grad_var_names.emplace_back(out_var->Name()); + } + + AllReduceFunc(lod_tensor_data, dtype, numel, places, grad_var_names); +} + +void AllReduceOpHandle::AllReduceFunc( + std::vector lod_tensor_data, + const framework::proto::VarType::Type &dtype, int64_t numel, + const std::vector &places, + const std::vector &out_var_names) { + if (is_gpu_place(places[0])) { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + PADDLE_ENFORCE_NOT_NULL(nccl_ctxs_, "nccl_ctxs should not be nullptr."); + ncclDataType_t nccl_dtype = platform::ToNCCLDataType(dtype); + std::vector> all_reduce_calls; + for (size_t i = 0; i < local_exec_scopes_.size(); ++i) { + auto &p = places[i]; + void *buffer = const_cast(lod_tensor_data.at(i)); + all_reduce_calls.emplace_back([=] { + NCCLAllReduce(p, buffer, buffer, numel, nccl_dtype, ncclSum); + }); + } + NCCLAllReduceFunc(all_reduce_calls); +#else + PADDLE_THROW("Not compiled with CUDA."); +#endif + } else { // Special handle CPU only Operator's gradient.
Like CRF + auto &trg = *local_exec_scopes_[0] + ->FindVar(out_var_names[0]) + ->GetMutable(); + + // Reduce All Tensor to trg in CPU + ReduceBufferData func(lod_tensor_data, trg.data(), numel); + VisitDataType(trg.type(), func); + + for (size_t i = 1; i < local_exec_scopes_.size(); ++i) { + auto &scope = local_exec_scopes_[i]; + auto &p = places[i]; + auto *var = scope->FindVar(out_var_names[i]); + + size_t size = numel * SizeOfType(trg.type()); + RunAndRecordEvent(p, [&trg, var, p, size] { + auto dst_ptr = var->GetMutable()->data(); + platform::CPUPlace cpu_place; + memory::Copy(cpu_place, dst_ptr, cpu_place, trg.data(), size); + }); + } + } + VLOG(10) << Name() << " size:" << numel * SizeOfType(dtype); +} + #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) -void AllReduceOpHandle::RunAllReduceFuncs( +void AllReduceOpHandle::NCCLAllReduceFunc( const std::vector> &all_reduce_calls) { this->RunAndRecordEvent([&] { if (all_reduce_calls.size() == 1UL) { @@ -83,85 +193,6 @@ void AllReduceOpHandle::RunAllReduceFuncs( } #endif -void AllReduceOpHandle::RunImpl() { - platform::RecordEvent record_event(Name()); - - WaitInputVarGenerated(); - - auto in_var_handles = DynamicCast(this->Inputs()); - auto out_var_handles = DynamicCast(this->Outputs()); - PADDLE_ENFORCE_EQ( - in_var_handles.size(), places_.size(), - "The NoDummyInputSize should be equal to the number of places."); - PADDLE_ENFORCE_EQ( - in_var_handles.size(), out_var_handles.size(), - "The NoDummyInputSize and NoDummyOutputSize should be equal."); - - std::vector lod_tensors; - for (size_t i = 0; i < local_scopes_.size(); ++i) { - auto &local_scope = local_exec_scopes_[i]; - auto &lod_tensor = - local_scope->FindVar(in_var_handles[i]->name())->Get(); - lod_tensors.emplace_back(&lod_tensor); - VLOG(10) << "place:" << i << ", input_name:" << in_var_handles[i]->name() - << ", out_name:" << out_var_handles[i]->name(); - PADDLE_ENFORCE_EQ(in_var_handles[i]->name(), out_var_handles[i]->name(), - "The name of input and output should be equal."); - } - - if (platform::is_gpu_place(lod_tensors[0]->place())) { -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr."); - int dtype = -1; - size_t numel = 0; - std::vector> all_reduce_calls; - for (size_t i = 0; i < local_scopes_.size(); ++i) { - auto &p = places_[i]; - auto &lod_tensor = *lod_tensors[i]; - void *buffer = const_cast(lod_tensor.data()); - - if (dtype == -1) { - dtype = platform::ToNCCLDataType(lod_tensor.type()); - } - - if (numel == 0) { - numel = static_cast(lod_tensor.numel()); - } - - all_reduce_calls.emplace_back([=] { - NCCLAllReduce(p, buffer, buffer, numel, - static_cast(dtype), ncclSum); - }); - } - VLOG(10) << "allreduce size:" << numel * SizeOfType(lod_tensors[0]->type()); - RunAllReduceFuncs(all_reduce_calls); -#else - PADDLE_THROW("Not compiled with CUDA"); -#endif - } else { // Special handle CPU only Operator's gradient. 
Like CRF - auto &trg = *this->local_exec_scopes_[0] - ->FindVar(out_var_handles[0]->name()) - ->GetMutable(); - - // Reduce All Tensor to trg in CPU - ReduceLoDTensor func(lod_tensors, &trg); - VisitDataType(lod_tensors[0]->type(), func); - - for (size_t i = 1; i < local_scopes_.size(); ++i) { - auto &scope = local_exec_scopes_[i]; - auto &p = places_[i]; - auto *var = scope->FindVar(out_var_handles[i]->name()); - auto *dev_ctx = dev_ctxes_.at(p); - - RunAndRecordEvent(p, [&trg, var, dev_ctx, p] { - auto &tensor_gpu = *var->GetMutable(); - auto &tensor_cpu = trg; - TensorCopy(tensor_cpu, p, *dev_ctx, &tensor_gpu); - }); - } - } -} - std::string AllReduceOpHandle::Name() const { return "all_reduce"; } } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.h b/paddle/fluid/framework/details/all_reduce_op_handle.h index ed5e475a..c18b0ed9 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/all_reduce_op_handle.h @@ -61,9 +61,17 @@ class AllReduceOpHandle : public OpHandleBase { #endif #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - void RunAllReduceFuncs( + void NCCLAllReduceFunc( const std::vector> &all_reduce_calls); #endif + + void AllReduceImpl(const std::vector &in_var_handles, + const std::vector &out_var_handles); + + void AllReduceFunc(std::vector lod_tensor_data, + const framework::proto::VarType::Type &dtype, + int64_t numel, const std::vector &places, + const std::vector &out_var_handles); }; } // namespace details diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index 75143b9a..3637625f 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -38,8 +38,6 @@ void BroadcastOpHandle::RunImpl() { VarHandle *in_var_handle = in_var_handles[0]; - WaitInputVarGenerated(); - BroadcastOneVar(*in_var_handle, out_var_handles, local_exec_scopes_); } @@ -59,6 +57,7 @@ void BroadcastOpHandle::BroadcastOneVar( InitOutputValue(in_var_handle, out_var_handles); if (platform::is_cpu_place(in_tensor.place())) { + WaitInputVarGenerated(); for (auto *out_var_handle : out_var_handles) { if (out_var_handle->IsTheSameVar(in_var_handle)) { continue; @@ -109,6 +108,7 @@ void BroadcastOpHandle::BroadcastOneVar( }); } + WaitInputVarGenerated(); this->RunAndRecordEvent([&] { { platform::NCCLGroupGuard guard; @@ -126,6 +126,9 @@ void BroadcastOpHandle::BroadcastOneVar( &VariableVisitor::GetMutableTensor(out_var)); } }); + for (auto &p : places_) { + nccl_ctxs_->DevCtx(p)->Wait(); + } #else PADDLE_THROW("CUDA is not enabled."); #endif diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 326a4631..1f5fd015 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -21,13 +21,13 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/graph_printer.h" #include "paddle/fluid/framework/ir/graph_to_program_pass.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h" -#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h" #include "paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h" -#include "paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.h" DECLARE_bool(use_mkldnn); +DECLARE_bool(use_ngraph); namespace paddle { namespace framework { @@ -43,216 +43,239 @@ static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) { !strategy.enable_parallel_graph_; } +static inline void ConvertDefaultValue(boost::optional *default_value) { + if (*default_value == boost::none) { + *default_value = true; + } +} + class ParallelExecutorPassBuilder : public ir::PassBuilder { public: explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy) : ir::PassBuilder(), strategy_(strategy) { - // Add a graph viz pass to record a graph. - if (!strategy_.debug_graphviz_path_.empty()) { - VLOG(1) << "Add graph_viz_pass"; - auto viz_pass = AppendPass("graph_viz_pass"); - const std::string graph_path = string::Sprintf( - "%s%s", strategy_.debug_graphviz_path_.c_str(), "_original_graph"); - viz_pass->Set("graph_viz_path", new std::string(graph_path)); - } + ResolveOptionConfliction(); - // Note(zcd): record_skip_memory_opt_vars_pass should be the first pass. - VLOG(1) << "Add record_skip_memory_opt_vars_pass"; - AppendPass("record_skip_memory_opt_vars_pass"); - -#ifdef PADDLE_WITH_MKLDNN - if (FLAGS_use_mkldnn) { - VLOG(1) << "Add mkldnn_placement_pass"; - AppendPass("mkldnn_placement_pass"); - } else if (!strategy_.mkldnn_enabled_op_types_.empty()) { - LOG(WARNING) - << "mkldnn_enabled_op_types specify the operator type list to " - "use MKLDNN acceleration. It is null in default, means " - "that all the operators supported by MKLDNN will be " - "accelerated. And it should not be set when " - "FLAGS_use_mkldnn=false."; - } -#else - PADDLE_ENFORCE(!FLAGS_use_mkldnn, - "Please compile with MKLDNN first to use MKLDNN"); -#endif + AppendPrintGraphPass("graph_viz_pass", "_original_graph"); + AppendPassWithCheck(strategy_.enable_sequential_execution_, + "sequential_execution_pass"); + AppendPassWithCheck(strategy_.sync_batch_norm_, "sync_batch_norm_pass"); - if (strategy_.enable_sequential_execution_) { - VLOG(1) << "Add sequential_execution_pass"; - AppendPass("sequential_execution_pass"); - } + AppendPassToUseNgraph("ngraph_subgraph_pass"); - // Add op fusion. - if (strategy.sync_batch_norm_) { - AppendPass("sync_batch_norm_pass"); - } + AppendOpFusePasses(); + AppendPrintGraphPass("graph_viz_pass", "_fused_graph"); - // Add op fusion. 
- if (strategy.fuse_relu_depthwise_conv_) { - VLOG(1) << "Add fuse_relu_depthwise_conv_pass"; - AppendPass("fuse_relu_depthwise_conv_pass"); - } + AppendMultiDevPass(); + AppendMultiGraphOptPasses(); - // TODO(zjl): refactor MemoryOptimizePass to fit - // new strategy, which does not need to set - // var.persistable = True - if (strategy_.use_legacy_memory_optimize_strategy_) { - if (strategy_.enable_inplace_) { - VLOG(5) << "Add inplace_pass"; - AppendPass("inplace_pass"); - } - } + AppendPassToSetMkldnnAttr("mkldnn_placement_pass"); + // runtime_context_cache pass should be the last pass to enable the attr of + // all original and fused operators. But no operators can be enabled this + // attr if putting it after MultiDevPass. + AppendPassWithCheck(strategy_.cache_runtime_context_, + "runtime_context_cache_pass"); + AppendPassWithCheck(strategy_.remove_unnecessary_lock_, + "modify_op_lock_and_record_event_pass"); + // Note: This pass is used to check whether the multi_device_graph is right. + AppendPass("multi_devices_check_pass"); - if (strategy_.fuse_elewise_add_act_ops_) { - VLOG(1) << "Add fuse_elewise_add_act_pass"; - AppendPass("fuse_elewise_add_act_pass"); - } + SetCollectiveContext(); + } - // for single card training, fuse_all_reduce_ops is unnecessary. - // coalesce_grad_tensor_pass should be before of MultiDevPass. - if (strategy_.fuse_all_reduce_ops_) { - VLOG(1) << "Add coalesce_grad_tensor_pass"; - AppendPass("coalesce_grad_tensor_pass"); + void ResolveOptionConfliction() { + // Specifies the restrictions between different pass. + if (strategy_.enable_parallel_graph_) { + LOG_IF(WARNING, strategy_.fuse_all_optimizer_ops_ == true) + << "Currently, fuse_all_optimizer_ops doesn't work under " + "parallel_graph."; + strategy_.fuse_all_optimizer_ops_ = false; + LOG_IF(WARNING, strategy_.fuse_all_reduce_ops_ == true) + << "fuse_all_reduce_ops doesn't work under " + "parallel_graph."; + strategy_.fuse_all_reduce_ops_ = false; } - - // Fuse all the optimization operators. if (strategy_.is_distribution_) { - VLOG(3) << "Currently, fuse_all_optimizer_ops only works under " - "Non-distributed mode."; + LOG_IF(WARNING, strategy_.fuse_all_optimizer_ops_ == true) + << "Currently, fuse_all_optimizer_ops only works under " + "Non-distributed mode."; strategy_.fuse_all_optimizer_ops_ = false; + LOG_IF(WARNING, strategy_.fuse_all_reduce_ops_ == true) + << "Currently, fuse_all_reduce_ops_ only works under " + "Non-distributed mode."; + strategy_.fuse_all_reduce_ops_ = false; } - if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce || - strategy_.is_distribution_) { - VLOG(3) << "Currently, fuse_all_optimizer_ops only works under AllReduce " - "mode."; + if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) { + LOG_IF(WARNING, strategy_.fuse_all_optimizer_ops_ == true) + << "Currently, fuse_all_optimizer_ops only works under AllReduce " + "mode."; strategy_.fuse_all_optimizer_ops_ = false; + LOG_IF(WARNING, strategy_.fuse_all_reduce_ops_ == true) + << "fuse_all_optimizer_ops only works under AllReduce " + "mode."; + strategy_.fuse_all_reduce_ops_ = false; } - if (strategy_.fuse_all_optimizer_ops_) { - // NOTE: fuse_all_xx_ops will count the number of xx operator first, - // if the number is zero, fuse_all_reduce_ops will do nothing. - // Currently, only one type of optimization algorithm can be fused. 
- VLOG(1) << "Add fuse_adam_op_pass"; - AppendPass("fuse_adam_op_pass"); - VLOG(1) << "Add fuse_sgd_op_pass"; - AppendPass("fuse_sgd_op_pass"); - VLOG(1) << "Add fuse_momentum_op_pass"; - AppendPass("fuse_momentum_op_pass"); + if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { + LOG_IF(WARNING, strategy_.fuse_broadcast_ops_ == true) + << "Currently, fuse_broadcast_ops only works under Reduce " + "mode."; + strategy_.fuse_broadcast_ops_ = false; } - // Add a graph viz pass to record a graph. - if (!strategy.debug_graphviz_path_.empty()) { - auto viz_pass = AppendPass("graph_viz_pass"); - const std::string graph_path = string::Sprintf( - "%s%s", strategy_.debug_graphviz_path_.c_str(), "_fused_graph"); - viz_pass->Set("graph_viz_path", new std::string(graph_path)); - } + ConvertDefaultValue(&strategy_.fuse_all_optimizer_ops_); + ConvertDefaultValue(&strategy_.fuse_all_reduce_ops_); + ConvertDefaultValue(&strategy_.fuse_broadcast_ops_); - CollectiveContext *context = CollectiveContext::GetInstance(); - context->endpoints_ = strategy_.trainers_endpoints_; - context->trainer_id_ = strategy_.trainer_id_; - PADDLE_ENFORCE(strategy_.trainer_id_ >= 0, "trainer_id_ >= 0"); - if (strategy_.trainer_id_ > 0 && strategy_.trainers_endpoints_.size() > 0) { - PADDLE_ENFORCE((unsigned)(strategy_.trainer_id_) < - strategy_.trainers_endpoints_.size(), - "trainer_id_ < endpoints_ size"); + if (strategy_.fuse_all_optimizer_ops_ == true) { + LOG_IF(WARNING, strategy_.async_mode_) + << "Currently, fuse_all_optimizer_ops doesn't work under " + "async mode."; + strategy_.fuse_all_optimizer_ops_ = !strategy_.async_mode_; } - VLOG(1) << "CollectiveContext:" << context->String(); - - // NOTE(dzh): memory optimize should be a runtime pass. - // However, after multi_devices_pass, VarHandle, OpHandle is - // the de-fact IR, any reuse on Graph is meaningless. - // A side-effect of that, memory optimize cannot forsee the fetched vars - // , so fetchlist should be set persistable before call the Run interface. - if (strategy_.use_legacy_memory_optimize_strategy_) { - if (strategy_.memory_optimize_) { - VLOG(5) << "Add memory_optimize_pass"; - AppendPass("memory_optimize_pass"); - } - } - - // runtime_context_cache pass should be the last pass to enable the attr of - // all original and fused operators. But no operators can be enabled this - // attr if putting it after MultiDevPass. - if (strategy_.cache_runtime_context_) { - VLOG(1) << "Add runtime_context_cache_pass"; - AppendPass("runtime_context_cache_pass"); - } - - AppendMultiDevPass(strategy_); - - if (strategy_.fuse_all_reduce_ops_) { - // NOTE: fuse_all_reduce_ops will count the number of all_reduce operator - // first, if the number is zero, fuse_all_reduce_ops will do nothing. - VLOG(1) << "Add fuse_all_reduce_op_pass"; - AppendPass("fuse_all_reduce_op_pass"); + if (strategy_.fuse_all_reduce_ops_ == true) { + LOG_IF(WARNING, strategy_.async_mode_) + << "Currently, fuse_all_reduce_ops doesn't work under " + "async mode."; + strategy_.fuse_all_reduce_ops_ = !strategy_.async_mode_; } + } - // Add a graph print pass to record a graph with device info. 
- if (!strategy_.debug_graphviz_path_.empty()) { - VLOG(1) << "Add multi_devices_print_pass"; - auto multi_devices_print_pass = AppendPass("multi_devices_print_pass"); - const std::string graph_path = - string::Sprintf("%s%s", strategy_.debug_graphviz_path_.c_str(), - "_multi_devices_graph"); - multi_devices_print_pass->Set<std::string>(ir::kGraphvizPath, - new std::string(graph_path)); - multi_devices_print_pass->Set<ir::GraphvizSSAGraphPrinter>( - "graph_printer", new ir::GraphvizSSAGraphPrinter); - } + void AppendMultiGraphOptPasses() { + // NOTE: fuse_all_reduce_ops will count the number of all_reduce operators + // first; if the number is zero, fuse_all_reduce_ops will do nothing. + AppendPassWithCheck(strategy_.fuse_all_reduce_ops_, + "fuse_all_reduce_op_pass"); + AppendPrintGraphPass("multi_devices_print_pass", "_multi_devices_graph"); // experiments show that the program will be faster if we append // all_reduce_deps_pass here. - if (!strategy_.enable_parallel_graph_ && + bool append_all_reduce_deps_pass = + !strategy_.enable_parallel_graph_ && (SeqOnlyAllReduceOps(strategy_) || - strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce)) { - VLOG(1) << "Add all_reduce_deps_pass"; - AppendPass("all_reduce_deps_pass"); - } + strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce); + AppendPassWithCheck(append_all_reduce_deps_pass, "all_reduce_deps_pass"); - if (strategy_.num_trainers_ > 1 && !strategy_.async_mode_ && + bool append_backward_optimizer_op_deps_pass = + strategy_.num_trainers_ > 1 && !strategy_.async_mode_ && !strategy_.is_distribution_ && - strategy_.enable_backward_optimizer_op_deps_) { - VLOG(1) << "Add backward_op_deps_pass"; - AppendPass("backward_optimizer_op_deps_pass"); - } + strategy_.enable_backward_optimizer_op_deps_; + AppendPassWithCheck(append_backward_optimizer_op_deps_pass, + "backward_optimizer_op_deps_pass"); + } - if (strategy_.remove_unnecessary_lock_) { - VLOG(1) << "Add modify_op_lock_and_record_event_pass"; - AppendPass("modify_op_lock_and_record_event_pass"); + void AppendOpFusePasses() { + AppendPassWithCheck(strategy_.fuse_relu_depthwise_conv_, + "fuse_relu_depthwise_conv_pass"); + AppendPassWithCheck(strategy_.fuse_elewise_add_act_ops_, + "fuse_elewise_add_act_pass"); + // for single-card training, fuse_all_reduce_ops is unnecessary. + // coalesce_grad_tensor_pass should come before MultiDevPass. + AppendPassWithCheck(strategy_.fuse_all_reduce_ops_, + "coalesce_grad_tensor_pass"); + // Fuse all the optimization operators. + // NOTE: fuse_all_xx_ops will count the number of xx operators first; + // if the number is zero, the pass will do nothing. + // Currently, only one type of optimization algorithm can be fused. + if (strategy_.fuse_all_optimizer_ops_ == true) { + AppendPass("fuse_adam_op_pass"); + AppendPass("fuse_sgd_op_pass"); + AppendPass("fuse_momentum_op_pass"); } + } - // Verify that the graph is correct for multi-device executor.
- VLOG(1) << "Add multi_devices_check_pass"; - AppendPass("multi_devices_check_pass"); + void SetCollectiveContext() const { + CollectiveContext *context = CollectiveContext::GetInstance(); + context->endpoints_ = strategy_.trainers_endpoints_; + context->trainer_id_ = strategy_.trainer_id_; + PADDLE_ENFORCE_GE(strategy_.trainer_id_, 0, "trainer_id_ >= 0"); + if (strategy_.trainer_id_ > 0 && strategy_.trainers_endpoints_.size() > 0) { + PADDLE_ENFORCE_LT(static_cast(strategy_.trainer_id_), + strategy_.trainers_endpoints_.size(), + "trainer_id_ < endpoints_ size"); + } + VLOG(1) << "CollectiveContext:" << context->String(); } // Convert graph to run on multi-devices. - void AppendMultiDevPass(const BuildStrategy &strategy) { + void AppendMultiDevPass() { ir::Pass *multi_devices_pass = nullptr; - if (strategy_.async_mode_) { - VLOG(1) << "Add async_multi_devices_pass"; multi_devices_pass = AppendPass("async_multi_devices_pass").get(); } else if (strategy_.is_distribution_) { - VLOG(1) - << "Add dist_multi_devices_pass, multi device parameter server mode"; multi_devices_pass = AppendPass("dist_multi_devices_pass").get(); } else { - if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { - VLOG(1) << "Add all_reduce_mode_multi_devices_pass"; - multi_devices_pass = - AppendPass("all_reduce_mode_multi_devices_pass").get(); - } else if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kReduce) { - VLOG(1) << "Add reduce_mode_multi_devices_pass"; - multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get(); - } else { - PADDLE_THROW("Unknown reduce strategy."); + switch (strategy_.reduce_) { + case BuildStrategy::ReduceStrategy::kAllReduce: + multi_devices_pass = + AppendPass("all_reduce_mode_multi_devices_pass").get(); + break; + case BuildStrategy::ReduceStrategy::kReduce: + multi_devices_pass = + AppendPass("reduce_mode_multi_devices_pass").get(); + break; + default: + PADDLE_THROW("Unknown reduce strategy."); } } multi_devices_pass->SetNotOwned("strategy", &strategy_); } + void AppendPrintGraphPass(const std::string &pass_name, + const std::string &debug_file_suffix) { + if (!strategy_.debug_graphviz_path_.empty()) { + auto viz_pass = AppendPass(pass_name); + const std::string graph_path = string::Sprintf( + "%s%s", strategy_.debug_graphviz_path_.c_str(), debug_file_suffix); + viz_pass->Set(ir::kGraphvizPath, + new std::string(graph_path)); + } + } + + void AppendPassWithCheck(const boost::optional &append_pass, + const std::string &pass_name) { + AppendPassWithCheck(append_pass == true, pass_name); + } + + void AppendPassWithCheck(bool append_pass, const std::string &pass_name) { + if (append_pass) { + AppendPass(pass_name); + } + } + + void AppendPassToSetMkldnnAttr(const std::string &pass_name) { +#ifdef PADDLE_WITH_MKLDNN + if (FLAGS_use_mkldnn) { + AppendPass(pass_name); + } else if (!strategy_.mkldnn_enabled_op_types_.empty()) { + LOG(WARNING) + << "mkldnn_enabled_op_types specify the operator type list to " + "use MKLDNN acceleration. It is null in default, means " + "that all the operators supported by MKLDNN will be " + "accelerated. 
And it should not be set when " + "FLAGS_use_mkldnn=false."; + } +#else + PADDLE_ENFORCE(!FLAGS_use_mkldnn, + "Please compile with MKLDNN first to use MKLDNN"); +#endif + } + + void AppendPassToUseNgraph(const std::string &pass_name) { +#ifdef PADDLE_WITH_NGRAPH + if (FLAGS_use_ngraph) { + if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kAllReduce) { + LOG(WARNING) << "Currently ngraph_subgraph_pass works under AllReduce," + "please set FLAGS_use_ngraph=false."; + } else { + AppendPass(pass_name); + } + } +#else + PADDLE_ENFORCE_NE(FLAGS_use_ngraph, true, + "Please compile with NGRAPH first to use NGRAPH"); +#endif + } + private: BuildStrategy strategy_; }; @@ -284,12 +307,12 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, #else const bool use_cuda) const { #endif - VLOG(3) << "apply all passes"; + VLOG(1) << "apply all passes"; // Create a default one if not finalized by user. CreatePassesFromStrategy(false); for (std::shared_ptr &pass : pass_builder_->AllPasses()) { - VLOG(3) << "BuildStrategy::Apply pass:" << pass->Type(); + VLOG(1) << "BuildStrategy::Apply pass:" << pass->Type(); if (IsMultiDevPass(pass->Type())) { pass->Erase(kPlaces); pass->SetNotOwned>(kPlaces, &places); @@ -298,40 +321,33 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, pass->Erase(kLocalScopes); pass->SetNotOwned>(kLocalScopes, &local_scopes); - pass->Erase(ir::kNRanks); - pass->Set(ir::kNRanks, new size_t(nranks)); + pass->Erase(kNRanks); + pass->Set(kNRanks, new size_t(nranks)); #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr; pass->Erase(kNCCLCtxs); pass->SetNotOwned(kNCCLCtxs, nctx); #endif - } else if (pass->Type() == "coalesce_grad_tensor_pass" || - pass->Type() == "fuse_adam_op_pass" || - pass->Type() == "fuse_sgd_op_pass" || - pass->Type() == "fuse_momentum_op_pass" || - pass->Type() == "fuse_all_reduce_op_pass") { + } else if (pass->Type() == "fuse_all_reduce_op_pass") { + pass->Erase(kNRanks); + pass->Set(kNRanks, new size_t(nranks)); pass->Erase(kPlaces); pass->SetNotOwned>(kPlaces, &places); pass->Erase(kLocalScopes); pass->SetNotOwned>(kLocalScopes, &local_scopes); - if (pass->Type() == "fuse_all_reduce_op_pass") { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr; - pass->Erase(kNCCLCtxs); - pass->SetNotOwned(kNCCLCtxs, nctx); - pass->Erase(kUseHierarchicalAllReduce); - pass->Set(kUseHierarchicalAllReduce, - new bool(use_hierarchical_allreduce_)); + platform::NCCLCommunicator *nctx = use_cuda ? 
nccl_ctxs : nullptr; + pass->Erase(kNCCLCtxs); + pass->SetNotOwned(kNCCLCtxs, nctx); + pass->Erase(kUseHierarchicalAllReduce); + pass->Set(kUseHierarchicalAllReduce, + new bool(use_hierarchical_allreduce_)); #endif - } } else if (pass->Type() == "coalesce_grad_tensor_pass") { - pass->Erase(kPlaces); - pass->SetNotOwned>(kPlaces, &places); - pass->Erase(kLocalScopes); - pass->SetNotOwned>(kLocalScopes, - &local_scopes); + pass->Erase(kNRanks); + pass->Set(kNRanks, new size_t(nranks)); } else if (pass->Type() == "sequential_execution_pass") { LOG(INFO) << "set enable_sequential_execution:" << enable_sequential_execution_; @@ -352,9 +368,6 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, "GPU, skipped."; continue; } - } else if (pass->Type() == "inplace_pass") { - pass->Erase(ir::kUseCuda); - pass->Set(ir::kUseCuda, new bool(use_cuda)); } else if (pass->Type() == "mkldnn_placement_pass") { pass->Set("mkldnn_enabled_op_types", new std::unordered_set(mkldnn_enabled_op_types_)); @@ -365,11 +378,11 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, continue; } } - VLOG(3) << "Start Apply Pass " << pass->Type(); + VLOG(1) << "Start Apply Pass " << pass->Type(); graph = pass->Apply(graph); - VLOG(3) << "Finish Apply Pass " << pass->Type(); + VLOG(1) << "Finish Apply Pass " << pass->Type(); } - VLOG(3) << "All Passes Applied"; + VLOG(1) << "All Passes Applied"; return graph; } @@ -387,12 +400,10 @@ USE_PASS(all_reduce_mode_multi_devices_pass); USE_PASS(dist_multi_devices_pass); USE_PASS(multi_devices_check_pass); USE_PASS(multi_devices_print_pass); -USE_PASS(memory_optimize_pass); USE_PASS(sequential_execution_pass); USE_PASS(all_reduce_deps_pass); USE_PASS(backward_optimizer_op_deps_pass); USE_PASS(modify_op_lock_and_record_event_pass); -USE_PASS(inplace_pass); USE_PASS(lock_free_optimize_pass); USE_PASS(coalesce_grad_tensor_pass); USE_PASS(graph_to_program_pass); @@ -401,7 +412,9 @@ USE_PASS(fuse_sgd_op_pass); USE_PASS(fuse_momentum_op_pass); USE_PASS(fuse_all_reduce_op_pass); USE_PASS(runtime_context_cache_pass); -USE_PASS(record_skip_memory_opt_vars_pass); #ifdef PADDLE_WITH_MKLDNN USE_PASS(mkldnn_placement_pass); #endif +#ifdef PADDLE_WITH_NGRAPH +USE_PASS(ngraph_subgraph_pass); +#endif diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 14fb1783..5f0cc4b2 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -19,6 +19,7 @@ #include #include #include +#include "boost/optional.hpp" #include "paddle/fluid/framework/ir/pass_builder.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" @@ -88,8 +89,8 @@ struct BuildStrategy { bool fuse_elewise_add_act_ops_{false}; // Fuse_all_optimizer_ops and fuse_all_reduce_ops require that gradients // should not be sparse types - bool fuse_all_optimizer_ops_{false}; - bool fuse_all_reduce_ops_{false}; + boost::optional fuse_all_optimizer_ops_{boost::none}; + boost::optional fuse_all_reduce_ops_{boost::none}; // fuse_relu_depthwise_conv can fuse the `relu -> // depthwise_conv` bool fuse_relu_depthwise_conv_{false}; @@ -97,7 +98,7 @@ struct BuildStrategy { // faster. Because fusing broadcast OP equals delaying the execution of all // broadcast Ops, in this case, all nccl streams are used only for reduce // operations for a period of time. - bool fuse_broadcast_ops_{false}; + boost::optional fuse_broadcast_ops_{boost::none}; // replace batch_norm with sync_batch_norm. 
bool sync_batch_norm_{false}; @@ -108,19 +109,14 @@ struct BuildStrategy { // FLAGS_use_mkldnn=false std::unordered_set<std::string> mkldnn_enabled_op_types_; - // FIXME(liuwei1031) disable memory_optimzie and enable_inplace in 1.4 - // to open them by default, we need to solve the fetch variable issue - // TODO(liuwei1031): memory_optimize depends on kStaleProgramOpDescs, - // it is not appropriate, because kStaleProgramOpDescs will be removed in the - // near future. - bool memory_optimize_{false}; + // By default, memory_optimize is enabled if gc is disabled, and + // disabled if gc is enabled. + // Users can forcibly enable/disable memory_optimize by setting True/False. + boost::optional<bool> memory_optimize_{boost::none}; // Turn on inplace by default. bool enable_inplace_{true}; - // TODO(zjl): Remove this flag when MemoryOptimizePass is refactored - bool use_legacy_memory_optimize_strategy_{false}; - // FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode, // num_trainers is 1, so the current fields of build_strategy don't tell if // it's a distributed model. diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc index 817fe03c..2e64f9d4 100644 --- a/paddle/fluid/framework/details/eager_deletion_op_handle.cc +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -96,7 +96,8 @@ void EagerDeletionOpHandle::RunImpl() { std::deque<std::shared_ptr<memory::Allocation>> garbages; for (size_t i = 0; i < var_infos_.size(); ++i) { auto *var_info = var_infos_[i]; - if (var_info->IsSkipped() || !var_info->DecreaseRefCnt()) { + if (var_info->IsSkippedAllMemoryOptimization() || + !var_info->DecreaseRefCnt()) { continue; } diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h index 68de1580..b44e6b6a 100644 --- a/paddle/fluid/framework/details/execution_strategy.h +++ b/paddle/fluid/framework/details/execution_strategy.h @@ -31,7 +31,7 @@ struct ExecutionStrategy { // iterations the framework cleans up a local execution scope. // In some models, the value of this parameter has a great // influence on the performance (about 15%) of the program. - size_t num_iteration_per_drop_scope_{1}; + size_t num_iteration_per_drop_scope_{100}; // At present, the kExperimental executor is the fastest in most models. ExecutorType type_{kExperimental}; // This debug option. diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index 7daab6da..97557d2b 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License.
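The execution_strategy.h hunk above raises the default num_iteration_per_drop_scope_ from 1 to 100: local execution scopes are now cleaned up once per 100 iterations rather than every iteration, trading peak memory for speed. A minimal sketch of dialing the knob back down for a memory-tight model (how the strategy object reaches ParallelExecutor is elided; this only shows the field):

    #include "paddle/fluid/framework/details/execution_strategy.h"

    // Lower values free sub-scope memory sooner; higher values amortize the
    // cleanup cost across iterations.
    paddle::framework::details::ExecutionStrategy MakeMemoryTightStrategy() {
      paddle::framework::details::ExecutionStrategy strategy;
      strategy.num_iteration_per_drop_scope_ = 1;  // default is now 100
      return strategy;
    }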
#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" +#include #include -#include #include #include #include @@ -191,13 +191,13 @@ void FastThreadedSSAGraphExecutor::RunOpAsync( const std::shared_ptr> &complete_q) { ++remaining_; this->pool_.enqueue([=] { - std::queue op_queue; - op_queue.push(op); + std::deque op_queue; + op_queue.push_front(op); size_t complete = 0; while (!op_queue.empty()) { - OpHandleBase *op_to_run = op_queue.front(); - op_queue.pop(); + OpHandleBase *op_to_run = op_queue.back(); + op_queue.pop_back(); if (!RunOp(op_to_run, complete_q, &complete)) { return; @@ -213,7 +213,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync( // NOTE(zjl): op with highest priority should run // first without switching to another thread. if (pending_op->GetPriority() == OpHandleBase::Priority::kHighest) { - op_queue.push(pending_op); + op_queue.push_back(pending_op); } else { if (op_to_run == nullptr) { op_to_run = pending_op; @@ -224,7 +224,9 @@ void FastThreadedSSAGraphExecutor::RunOpAsync( } } - if (op_to_run != nullptr) op_queue.push(op_to_run); + if (op_to_run != nullptr) { + op_queue.push_front(op_to_run); + } } --remaining_; complete_q->Push(complete); diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index 1ac32ca9..221dec72 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -61,12 +61,17 @@ void FetchOpHandle::RunImpl() { var_handle->name()); auto &t = var->Get(); - if (platform::is_gpu_place(t.place())) { + if (t.IsInitialized() && t.numel() > 0) { + if (platform::is_gpu_place(t.place())) { #ifdef PADDLE_WITH_CUDA - TensorCopy(t, cpu, &tensors_[i]); + TensorCopy(t, cpu, &tensors_[i]); #endif + } else { + tensors_[i].ShareDataWith(t); + } } else { - tensors_[i].ShareDataWith(t); + tensors_[i].clear(); + tensors_[i].Resize({0}); } tensors_[i].set_lod(t.lod()); } diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc index 23f0b439..dce4e36e 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -33,28 +33,18 @@ FusedAllReduceOpHandle::FusedAllReduceOpHandle( ir::Node *node, const std::vector &local_scopes, const std::vector &places, const size_t num_of_all_reduce, const platform::NCCLCommunicator *ctxs) - : NCCLOpHandleBase(node, places, ctxs), - local_scopes_(local_scopes), - num_of_all_reduce_(num_of_all_reduce) { - PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); -} + : AllReduceOpHandle(node, local_scopes, places, ctxs), + num_of_all_reduce_(num_of_all_reduce) {} #else - FusedAllReduceOpHandle::FusedAllReduceOpHandle( ir::Node *node, const std::vector &local_scopes, const std::vector &places, const size_t num_of_all_reduce) - : OpHandleBase(node), - local_scopes_(local_scopes), - places_(places), - num_of_all_reduce_(num_of_all_reduce) { - PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); -} - + : AllReduceOpHandle(node, local_scopes, places), + num_of_all_reduce_(num_of_all_reduce) {} #endif void FusedAllReduceOpHandle::RunImpl() { platform::RecordEvent record_event(Name()); - VLOG(4) << this->DebugString(); WaitInputVarGenerated(); @@ -71,6 +61,30 @@ void FusedAllReduceOpHandle::RunImpl() { in_var_handles.size(), out_var_handles.size(), "The NoDummyInputSize and NoDummyOutputSize should be equal."); + // Note: some gradient op doesn't 
have a CUDA kernel, so the gradients of + // those ops are in CPUPlace; in this case, the all-reduce should not be fused. + if (InputIsInDifferentPlace(in_var_handles)) { + for (size_t j = 0; j < num_of_all_reduce_; ++j) { + std::vector<VarHandle *> dev_inputs; + std::vector<VarHandle *> dev_outputs; + dev_inputs.reserve(place_num); + dev_outputs.reserve(place_num); + for (size_t idx = 0; idx < place_num; ++idx) { + dev_inputs.emplace_back(in_var_handles.at(j * place_num + idx)); + dev_outputs.emplace_back(out_var_handles.at(j * place_num + idx)); + } + AllReduceImpl(dev_inputs, dev_outputs); + } + } else { + FusedAllReduceFunc(in_var_handles, out_var_handles); + } +} + +void FusedAllReduceOpHandle::FusedAllReduceFunc( + const std::vector<VarHandle *> &in_var_handles, + const std::vector<VarHandle *> &out_var_handles) { + size_t place_num = places_.size(); + GradientAndLoDTensor grads_tensor; grads_tensor.resize(place_num); @@ -87,14 +101,11 @@ void FusedAllReduceOpHandle::RunImpl() { static_cast<framework::proto::VarType::Type>(0); GetDTypeAndNumel(g_tensor, &ele_dtype, &element_num); - if (numel == -1) { + if (scope_idx == 0) { numel = element_num; - } - if (dtype == static_cast<framework::proto::VarType::Type>(0)) { dtype = ele_dtype; - PADDLE_ENFORCE_NE(ele_dtype, - static_cast<framework::proto::VarType::Type>(0)); } + PADDLE_ENFORCE_EQ(ele_dtype, dtype); // Check whether the address space is contiguous. @@ -134,66 +145,36 @@ void FusedAllReduceOpHandle::RunImpl() { } std::vector<const void *> lod_tensor_data; + lod_tensor_data.reserve(place_num); for (size_t scope_idx = 0; scope_idx < place_num; ++scope_idx) { auto data = grads_tensor.at(scope_idx).at(0).second->data<void>(); lod_tensor_data.emplace_back(data); } + std::vector<std::string> grad_var_names; + grad_var_names.reserve(place_num); + for (auto &grad_t : grads_tensor) { + grad_var_names.emplace_back(grad_t.at(0).first); + } - if (platform::is_gpu_place(places_[0])) { -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr."); - int nccl_dtype = platform::ToNCCLDataType(dtype); - std::vector<std::function<void()>> all_reduce_calls; - for (size_t i = 0; i < local_scopes_.size(); ++i) { - auto &p = places_[i]; - void *buffer = const_cast<void *>(lod_tensor_data.at(i)); - - all_reduce_calls.emplace_back([=] { - NCCLAllReduce(p, buffer, buffer, numel, - static_cast<ncclDataType_t>(nccl_dtype), ncclSum); - }); - } + AllReduceFunc(lod_tensor_data, dtype, numel, this->places_, grad_var_names); +} - VLOG(10) << "fusedallreduce size:" << numel * SizeOfType(dtype); - - this->RunAndRecordEvent([&] { - if (all_reduce_calls.size() == 1UL) { - // Do not use NCCLGroup when manage NCCL by per thread per device - all_reduce_calls[0](); - } else { - platform::NCCLGroupGuard guard; - for (auto &call : all_reduce_calls) { - call(); - } +bool FusedAllReduceOpHandle::InputIsInDifferentPlace( + const std::vector<VarHandle *> &in_var_handles) const { + for (size_t scope_idx = 0; scope_idx < local_scopes_.size(); ++scope_idx) { + auto *local_scope = local_exec_scopes_[scope_idx]; + size_t place_num = places_.size(); + for (size_t j = 0; j < in_var_handles.size(); j += place_num) { + auto var_name = in_var_handles[j]->name(); + auto var = local_scope->FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL(var, "%s is not found in local scope.", var_name); + auto &lod_tensor = var->Get<LoDTensor>(); + if (!is_same_place(lod_tensor.place(), places_.at(scope_idx))) { + return true; } - }); -#else - PADDLE_THROW("Not compiled with CUDA"); -#endif - } else { - // Special handle CPU only Operator's gradient.
Like CRF - auto grad_name = grads_tensor.at(0).at(0).first; - auto &trg = *this->local_exec_scopes_[0] - ->FindVar(grad_name) - ->GetMutable(); - - // Reduce All data to trg in CPU - ReduceBufferData func(lod_tensor_data, trg.data(), numel); - VisitDataType(trg.type(), func); - - for (size_t i = 1; i < local_exec_scopes_.size(); ++i) { - auto &scope = *local_exec_scopes_[i]; - auto &p = places_[i]; - auto *var = scope.FindVar(grad_name); - auto *dev_ctx = dev_ctxes_.at(p); - size_t size = numel * SizeOfType(trg.type()); - RunAndRecordEvent(p, [&trg, var, dev_ctx, p, size] { - auto dst_ptr = var->GetMutable()->data(); - platform::CPUPlace cpu_place; - memory::Copy(cpu_place, dst_ptr, cpu_place, trg.data(), size); - }); } } + return false; } void FusedAllReduceOpHandle::GetGradLoDTensor( @@ -202,12 +183,14 @@ void FusedAllReduceOpHandle::GetGradLoDTensor( std::vector> *grad_tensor) const { auto *local_scope = local_exec_scopes_[scope_idx]; size_t place_num = places_.size(); - for (size_t j = 0; j < in_var_handles.size(); j += place_num) { auto var_name = in_var_handles[j]->name(); PADDLE_ENFORCE_EQ(var_name, out_var_handles[j]->name()); - auto &lod_tensor = local_scope->FindVar(var_name)->Get(); - PADDLE_ENFORCE_EQ(lod_tensor.place(), places_.at(scope_idx)); + auto var = local_scope->FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL(var, "%s is not found in local scope.", var_name); + auto &lod_tensor = var->Get(); + PADDLE_ENFORCE_EQ(lod_tensor.place(), places_.at(scope_idx), + "%s(%d) is not in the right place.", var_name, scope_idx); grad_tensor->emplace_back(std::make_pair(var_name, &lod_tensor)); } } diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h index fccbd772..f6a11c4e 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h @@ -17,6 +17,7 @@ #include #include #include +#include "paddle/fluid/framework/details/all_reduce_op_handle.h" #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" @@ -30,14 +31,14 @@ namespace framework { namespace details { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) -struct FusedAllReduceOpHandle : public NCCLOpHandleBase { +struct FusedAllReduceOpHandle : public AllReduceOpHandle { FusedAllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, const size_t num_of_all_reduce, const platform::NCCLCommunicator *ctxs); #else -struct FusedAllReduceOpHandle : public OpHandleBase { +struct FusedAllReduceOpHandle : public AllReduceOpHandle { FusedAllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, @@ -45,22 +46,10 @@ struct FusedAllReduceOpHandle : public OpHandleBase { #endif std::string Name() const override; - // Delay and buffer nccl_all_reduce together can significantly increase - // performance. Disable this feature by returning false. - bool IsMultiDeviceTransfer() override { return true; }; - protected: void RunImpl() override; - std::vector GetLocalScopes() override { return local_scopes_; } - private: - std::vector local_scopes_; -#if !(defined(PADDLE_WITH_CUDA) && !defined(_WIN32)) - // NCCLOpHandleBase already have these attributes. - // Will polish it by class inheritance framework. 
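Returning to the fast_threaded_ssa_graph_executor.cc hunk earlier: switching std::queue to std::deque lets RunOpAsync use the back of the deque as a stack for hot work and the front as a parking spot. Highest-priority ready ops are pushed to the back and popped next, so they run on the current thread without a context switch; exactly one ordinary ready op is parked at the front and drained last; the rest go to the thread pool. A self-contained sketch of that traversal (Op, Run, Spawn, and NewReadyOps are stand-ins, not Paddle APIs):

    #include <deque>
    #include <vector>

    struct Op { bool highest_priority = false; };
    static bool Run(Op *) { return true; }                     // pretend: op succeeds
    static void Spawn(Op *) {}                                 // pretend: enqueue to pool
    static std::vector<Op *> NewReadyOps(Op *) { return {}; }  // ops this op unblocks

    static void RunOpAsyncSketch(Op *root) {
      std::deque<Op *> work;
      work.push_front(root);
      while (!work.empty()) {
        Op *op = work.back();  // back = hot end (LIFO)
        work.pop_back();
        if (!Run(op)) return;
        Op *keep = nullptr;
        for (Op *next : NewReadyOps(op)) {
          if (next->highest_priority) {
            work.push_back(next);  // runs next, no thread switch
          } else if (keep == nullptr) {
            keep = next;           // keep one ordinary op for this thread
          } else {
            Spawn(next);           // hand the rest to other worker threads
          }
        }
        if (keep != nullptr) work.push_front(keep);  // drained after hot ops
      }
    }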
- std::vector places_; -#endif size_t num_of_all_reduce_; // Check the dtype of the input @@ -74,6 +63,12 @@ struct FusedAllReduceOpHandle : public OpHandleBase { const std::vector &out_var_handles, std::vector> *grad_tensor) const; + + bool InputIsInDifferentPlace( + const std::vector &in_var_handles) const; + + void FusedAllReduceFunc(const std::vector &in_var_handles, + const std::vector &out_var_handles); }; } // namespace details diff --git a/paddle/fluid/framework/details/multi_devices_helper.h b/paddle/fluid/framework/details/multi_devices_helper.h index 8cd41954..49bc85db 100644 --- a/paddle/fluid/framework/details/multi_devices_helper.h +++ b/paddle/fluid/framework/details/multi_devices_helper.h @@ -42,6 +42,8 @@ typedef std::vector>> GraphVars; constexpr char kGraphVars[] = "vars"; +constexpr char kNRanks[] = "nranks"; + constexpr char kPlaces[] = "places"; constexpr char kLocalScopes[] = "local_scopes"; constexpr char kNCCLCtxs[] = "nccl_ctxs"; @@ -68,6 +70,9 @@ constexpr char kParamsAndSparseGrads[] = "params_and_sparse_grads"; typedef std::vector ProgramDescs; constexpr char kProgramDescs[] = "program_descs"; +typedef std::unordered_set PinnedVars; +constexpr char kPinnedVars[] = "pinned_vars"; + typedef std::vector>> GroupParamsAndGrads; constexpr char kGroupParamsAndDenseGrads[] = "group_params_dense_grads"; diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index 4c708691..16016dd3 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -108,6 +108,8 @@ class OpHandleBase { ir::Node *Node() { return node_; } + const ir::Node *Node() const { return node_; } + void SetLocalExecScopes( const std::unordered_map &scope_map); diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h index 0de8e436..11c4621f 100644 --- a/paddle/fluid/framework/details/reduce_and_gather.h +++ b/paddle/fluid/framework/details/reduce_and_gather.h @@ -78,44 +78,59 @@ struct ReduceBufferData { } }; -inline void GatherLocalSelectedRows( - const std::vector &src_selecte_rows_, - const std::vector &in_places, - const std::map &dev_ctxes, - const platform::Place &out_place, SelectedRows *dst_selecte_rows) { - PADDLE_ENFORCE(!src_selecte_rows_.empty()); - - std::vector in_tensors; - std::vector out_rows; - - for (auto in_sr_ptr : src_selecte_rows_) { - auto &in_sr = *in_sr_ptr; - in_tensors.emplace_back(in_sr.value()); - out_rows.insert(out_rows.end(), in_sr.rows().begin(), in_sr.rows().end()); +struct GatherLocalSelectedRowsFunctor { + GatherLocalSelectedRowsFunctor( + const std::vector &src_selected_rows, + const std::vector &in_places, + const std::map &dev_ctxes, + const platform::Place &out_place, SelectedRows *dst_selected_rows) + : dev_ctxes_(dev_ctxes), + in_places_(in_places), + out_place_(out_place), + dst_selected_rows_(dst_selected_rows) { + PADDLE_ENFORCE_EQ(src_selected_rows.empty(), false); + + std::vector out_rows; + + for (auto in_sr_ptr : src_selected_rows) { + auto &in_sr = *in_sr_ptr; + in_tensors_.emplace_back(in_sr.value()); + out_rows.insert(out_rows.end(), in_sr.rows().begin(), in_sr.rows().end()); + } + + auto &pre_in = src_selected_rows[0]; + + auto &dst_tensor = *dst_selected_rows_; + dst_tensor.set_height(pre_in->height()); + dst_tensor.set_rows(out_rows); + size_t rows = out_rows.size(); + DDim out_dim = pre_in->GetCompleteDims(); + out_dim[0] = static_cast(rows); + 
dst_tensor.mutable_value()->Resize(out_dim); + dst_tensor.mutable_value()->mutable_data(out_place, pre_in->value().type()); } - auto &pre_in = src_selecte_rows_[0]; - - auto &dst_tensor = *dst_selecte_rows; - dst_tensor.set_height(pre_in->height()); - dst_tensor.set_rows(out_rows); - size_t rows = out_rows.size(); - DDim out_dim = pre_in->GetCompleteDims(); - out_dim[0] = static_cast(rows); - dst_tensor.mutable_value()->Resize(out_dim); - dst_tensor.mutable_value()->mutable_data(out_place, pre_in->value().type()); - Tensor *out_tensor = dst_tensor.mutable_value(); - - // copy - int s = 0, e = 0; - for (size_t j = 0; j < in_tensors.size(); ++j) { - e += in_tensors[j].dims()[0]; - auto sub_out = out_tensor->Slice(s, e); - paddle::framework::TensorCopy(in_tensors[j], out_place, - *(dev_ctxes.at(in_places[j])), &sub_out); - s = e; + void operator()() { + auto *out_tensor = dst_selected_rows_->mutable_value(); + // copy + int s = 0, e = 0; + for (size_t j = 0; j < in_tensors_.size(); ++j) { + e += in_tensors_[j].dims()[0]; + auto sub_out = out_tensor->Slice(s, e); + paddle::framework::TensorCopy(in_tensors_[j], out_place_, + *(dev_ctxes_.at(in_places_[j])), &sub_out); + s = e; + } } -} + + private: + const std::map &dev_ctxes_; + std::vector in_places_; + std::vector in_tensors_; + + platform::Place out_place_; + SelectedRows *dst_selected_rows_; +}; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index 26153b7d..f5245713 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -66,8 +66,11 @@ void ReduceOpHandle::GatherSelectedRows( auto gathered_var_mid = scope->Var(gathered_var_name); auto gathered_select_rows = gathered_var_mid->GetMutable(); - GatherLocalSelectedRows(src_selected_rows, in_places, dev_ctxes, out_place, - gathered_select_rows); + GatherLocalSelectedRowsFunctor functor( + src_selected_rows, in_places, dev_ctxes, out_place, gathered_select_rows); + WaitInputVarGenerated(); + functor(); + // FIXME(gongwb): remove this Wait. Wait(dev_ctxes); @@ -167,9 +170,6 @@ void ReduceOpHandle::RunImpl() { var_scopes.at(in_0_handle->scope_idx())->FindVar(in_0_handle->name()); PADDLE_ENFORCE_NOT_NULL(pre_in_var); - // Wait input done, this Wait is asynchronous operation - WaitInputVarGenerated(); - // NOTE: The Places of all input tensor must be all on CPU or all on GPU. std::vector in_places; // used to get dev_ctx for (auto *in_handle : in_var_handles) { @@ -209,9 +209,11 @@ void ReduceOpHandle::RunImpl() { // TODO(gongwb): add cpu support if (collective_context.endpoints_.size() <= 1 || is_cpu_place(in_places[0]) || is_cpu_place(t_out_p)) { - GatherLocalSelectedRows(in_selected_rows, in_places, dev_ctxes_, - t_out_p, - out_var->GetMutable()); + GatherLocalSelectedRowsFunctor functor( + in_selected_rows, in_places, dev_ctxes_, t_out_p, + out_var->GetMutable()); + WaitInputVarGenerated(); + functor(); return; } @@ -236,6 +238,7 @@ void ReduceOpHandle::RunImpl() { GetInputValues(in_var_handles, var_scopes); if (paddle::platform::is_cpu_place(lod_tensors[0]->place())) { + WaitInputVarGenerated(); this->RunAndRecordEvent([&] { // FIXME(zcd): The order of summing is important, // especially when the type of data is float or double. 
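The refactor above splits GatherLocalSelectedRows into a functor on purpose: the constructor does only cheap host-side preparation (collecting rows, sizing and allocating the destination), while operator() performs the actual copies, so both call sites in reduce_op_handle.cc can slot WaitInputVarGenerated() between the two phases. The shape of the pattern in isolation (TwoPhaseCopy is an illustrative name, not a Paddle class):

    #include <cstring>
    #include <vector>

    // Illustrative two-phase copy: the constructor sizes and allocates the
    // destination (input-independent work), operator() does the data movement.
    class TwoPhaseCopy {
     public:
      TwoPhaseCopy(const std::vector<const float *> &srcs, size_t chunk)
          : srcs_(srcs), chunk_(chunk), dst_(srcs.size() * chunk) {}

      void operator()() {  // the only phase that reads the input buffers
        for (size_t i = 0; i < srcs_.size(); ++i) {
          std::memcpy(dst_.data() + i * chunk_, srcs_[i], chunk_ * sizeof(float));
        }
      }

     private:
      std::vector<const float *> srcs_;
      size_t chunk_;
      std::vector<float> dst_;
    };

    // Usage mirrors ReduceOpHandle above:
    //   TwoPhaseCopy functor(inputs, n);
    //   WaitInputVarGenerated();  // inputs become valid before any copy runs
    //   functor();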
@@ -295,6 +298,7 @@ void ReduceOpHandle::RunImpl() { }); } + WaitInputVarGenerated(); this->RunAndRecordEvent([&] { platform::NCCLGroupGuard guard; for (auto &call : all_reduce_calls) { diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index 7ab21609..517dd5ee 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -38,13 +38,11 @@ struct ScaleLossGradFunctor { float coeff_; Tensor *out_; platform::Place place_; - OpHandleBase *op_handle_; proto::VarType::Type out_dtype_; platform::DeviceContext *ctx_; ScaleLossGradFunctor(float coeff, Tensor *out, platform::Place place, - OpHandleBase *op_handle, proto::VarType::Type dtype, - platform::DeviceContext *ctx) + proto::VarType::Type dtype, platform::DeviceContext *ctx) : coeff_(coeff), out_(out), place_(place), out_dtype_(dtype), ctx_(ctx) {} template @@ -76,11 +74,11 @@ void ScaleLossGradOpHandle::RunImpl() { tensor->Resize(make_ddim({1})); #ifdef PADDLE_WITH_CUDA - ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_, + ScaleLossGradFunctor func(coeff_, tensor, place_, out_dtype_, this->dev_ctxes_.at(place_)); this->RunAndRecordEvent([&] { framework::VisitDataType(out_dtype_, func); }); #else - ScaleLossGradFunctor func(coeff_, tensor, place_, this, out_dtype_, nullptr); + ScaleLossGradFunctor func(coeff_, tensor, place_, out_dtype_, nullptr); framework::VisitDataType(out_dtype_, func); #endif } diff --git a/paddle/fluid/framework/details/scope_buffered_monitor.cc b/paddle/fluid/framework/details/scope_buffered_monitor.cc new file mode 100644 index 00000000..ecbfa17a --- /dev/null +++ b/paddle/fluid/framework/details/scope_buffered_monitor.cc @@ -0,0 +1,202 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
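ScaleLossGradFunctor above is a typical VisitDataType client: the op's runtime dtype picks which template instantiation of apply() runs. A minimal sketch of that dispatch mechanism outside Paddle (visit_dtype and the integer dtype codes are illustrative, not framework::VisitDataType itself):

    #include <cstdint>

    struct ScaleFunctor {
      double coeff;
      void *out;
      template <typename T>
      void apply() const {
        *static_cast<T *>(out) = static_cast<T>(coeff);
      }
    };

    template <typename Visitor>
    void visit_dtype(int dtype, Visitor visitor) {
      switch (dtype) {
        case 0: visitor.template apply<float>(); break;
        case 1: visitor.template apply<double>(); break;
        case 2: visitor.template apply<int64_t>(); break;
        default: break;  // unsupported dtype: no-op in this sketch
      }
    }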
+ +#include "paddle/fluid/framework/details/scope_buffered_monitor.h" +#include +#include +#include +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/profiler.h" + +DECLARE_double(local_exe_sub_scope_limit); + +namespace paddle { +namespace framework { +namespace details { + +static constexpr double kMB = 1 / (1024 * 1024); + +static void GetTensors(Variable *var, + std::unordered_set *tensor_set) { + if (var->IsType() && var->Get().IsInitialized()) { + tensor_set->insert(var->GetMutable()); + } else if (var->IsType() && + var->Get().value().IsInitialized()) { + tensor_set->insert(var->GetMutable()->mutable_value()); + } else if (var->IsType()) { + auto *tensor_arr = var->GetMutable(); + for (auto &t : *tensor_arr) { + if (t.IsInitialized()) { + tensor_set->insert(&t); + } + } + } +} + +static void GetTensors(Scope *scope, std::unordered_set *tensor_set) { + for (auto &var_name : scope->LocalVarNames()) { + GetTensors(scope->FindVar(var_name), tensor_set); + } + + for (auto *kid : scope->kids()) { + GetTensors(kid, tensor_set); + } +} + +static size_t GetTensorMemorySize(Scope *scope, bool clear_cpu_tensor) { + std::unordered_set tensor_set; + GetTensors(scope, &tensor_set); + size_t memory_size = 0; + std::unordered_set allocation_set; + for (auto *tensor : tensor_set) { + if (clear_cpu_tensor && platform::is_cpu_place(tensor->place())) { + tensor->clear(); + } else { + auto allocation = tensor->Holder().get(); + if (!allocation_set.count(allocation)) { + memory_size += allocation->size(); + allocation_set.insert(allocation); + } + } + } + return memory_size; +} + +size_t GetScopeVarMemorySize(Scope *scope) { + return GetTensorMemorySize(scope, false /*clear_cpu_tensor*/); +} + +ScopeBufferedMonitor::ScopeBufferedMonitor( + const std::vector &places, + const std::vector &local_exec_scopes) + : places_(places), local_exec_scopes_(local_exec_scopes) { + pre_local_exec_scopes_.resize(local_exec_scopes_.size()); + post_local_exec_scopes_.resize(local_exec_scopes_.size()); +} + +void ScopeBufferedMonitor::Apply(const std::function &callback, + bool has_fetch) { + std::unique_ptr pre_local_exec_scopes_event( + new platform::RecordEvent( + "ScopeBufferedMonitor::pre_local_exec_scopes_process")); + for (size_t scope_id = 0; scope_id < local_exec_scopes_.size(); ++scope_id) { + pre_local_exec_scopes_.at(scope_id).clear(); + auto scopes = local_exec_scopes_.at(scope_id)->kids(); + VLOG(10) << "pre_local_exec_scopes[" << scope_id + << "] sub-scope: " << scopes.size(); + pre_local_exec_scopes_.at(scope_id).insert(scopes.begin(), scopes.end()); + } + pre_local_exec_scopes_event.reset(); + + callback(); + + std::unique_ptr post_local_exec_scopes_event( + new platform::RecordEvent( + "ScopeBufferedMonitor::post_local_exec_scopes_process")); + for (size_t scope_id = 0; scope_id < local_exec_scopes_.size(); ++scope_id) { + post_local_exec_scopes_.at(scope_id).clear(); + auto scopes = local_exec_scopes_.at(scope_id)->kids(); + VLOG(10) << "post_local_exec_scopes[" << scope_id + << "] sub-scope: " << scopes.size(); + post_local_exec_scopes_.at(scope_id).insert(scopes.begin(), scopes.end()); + } + + history_local_exec_scopes_.emplace_back(); + auto &incr_local_exec_scopes = history_local_exec_scopes_.back(); + incr_local_exec_scopes.resize(local_exec_scopes_.size()); + for (size_t scope_id = 0; scope_id < local_exec_scopes_.size(); ++scope_id) { + for (auto &scope : post_local_exec_scopes_.at(scope_id)) { + if 
(!pre_local_exec_scopes_.at(scope_id).count(scope)) { + incr_local_exec_scopes.at(scope_id).insert(scope); + } + } + + if (VLOG_IS_ON(10)) { + if (incr_local_exec_scopes.at(scope_id).size() && + FLAGS_local_exe_sub_scope_limit > 0) { + VLOG(10) + << "FLAGS_local_exe_sub_scope_limit is " + << FLAGS_local_exe_sub_scope_limit + << " MBytes now. If you don't need to limit the memory of local " + "execution scope, you should set " + "FLAGS_local_exe_sub_scope_limit=-1."; + } + std::stringstream out; + out << scope_id << " kids: "; + for (auto &scope : incr_local_exec_scopes.at(scope_id)) { + out << scope << ", "; + } + VLOG(10) << out.str(); + } + } + + size_t history_step = history_local_exec_scopes_.size(); + if (has_fetch && history_step >= 2) { + ClearHistoryLocalExecScopes(history_step - 1); + } + + // Delete CPU Memory + std::vector gpu_memory_size_per_gpu(places_.size()); + for (auto &scope_vec : history_local_exec_scopes_) { + for (size_t idx = 0; idx < scope_vec.size(); ++idx) { + for (auto &scope : scope_vec.at(idx)) { + gpu_memory_size_per_gpu.at(idx) += + GetTensorMemorySize(scope, true /*clear_cpu_tensor*/); + } + } + } + if (VLOG_IS_ON(8)) { + for (size_t idx = 0; idx < gpu_memory_size_per_gpu.size(); ++idx) { + VLOG(8) << "history local exec scopes contains " + << string::HumanReadableSize(gpu_memory_size_per_gpu.at(idx)) + << " in " << places_.at(idx); + } + } + + if (FLAGS_local_exe_sub_scope_limit > 0) { + for (size_t idx = 0; idx < gpu_memory_size_per_gpu.size(); ++idx) { + if (gpu_memory_size_per_gpu.at(idx) / kMB >= + FLAGS_local_exe_sub_scope_limit) { + platform::DeviceContextPool::Instance().Get(places_.at(idx))->Wait(); + local_exec_scopes_.at(idx)->DropKids(); + } + for (auto &scope_vec : history_local_exec_scopes_) { + scope_vec.at(idx).clear(); + } + } + } +} + +void ScopeBufferedMonitor::ClearHistoryLocalExecScopes(size_t history_step) { + VLOG(10) << "delete pre_incr_local_exec_scopes."; + for (size_t i = 0; i < history_step; ++i) { + auto &pre_incr_local_exec_scopes = history_local_exec_scopes_.front(); + for (size_t scope_idx = 0; scope_idx < pre_incr_local_exec_scopes.size(); + ++scope_idx) { + for (auto scope : pre_incr_local_exec_scopes[scope_idx]) { + local_exec_scopes_.at(scope_idx)->DeleteScope(scope); + } + } + history_local_exec_scopes_.pop_front(); + } +} + +void ScopeBufferedMonitor::ClearHistoryLocalExecScopes() { + history_local_exec_scopes_.clear(); +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/scope_buffered_monitor.h b/paddle/fluid/framework/details/scope_buffered_monitor.h new file mode 100644 index 00000000..1246c35a --- /dev/null +++ b/paddle/fluid/framework/details/scope_buffered_monitor.h @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
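Apply() above finds the sub-scopes an iteration created by snapshotting each local scope's kids before and after the callback and keeping the difference; those increments form the history that is later size-checked and dropped. The core set-difference step in isolation:

    #include <unordered_set>

    // Anything present after the callback but absent before it is a scope the
    // iteration created, and therefore a candidate for deferred deletion.
    template <typename ScopePtr>
    std::unordered_set<ScopePtr> NewlyCreated(
        const std::unordered_set<ScopePtr> &pre,
        const std::unordered_set<ScopePtr> &post) {
      std::unordered_set<ScopePtr> incr;
      for (ScopePtr s : post) {
        if (pre.count(s) == 0) incr.insert(s);
      }
      return incr;
    }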
+ +#pragma once +#include +#include +#include +#include +#include "paddle/fluid/framework/scope.h" +namespace paddle { +namespace framework { +namespace details { + +class ScopeBufferedMonitor { + public: + ScopeBufferedMonitor(const std::vector &places, + const std::vector &local_exec_scopes); + + void Apply(const std::function &callback, bool has_fetch); + + void ClearHistoryLocalExecScopes(); + + void ClearHistoryLocalExecScopes(size_t history_step); + + private: + std::vector places_; + std::vector local_exec_scopes_; + std::vector> pre_local_exec_scopes_; + std::vector> post_local_exec_scopes_; + std::deque>> + history_local_exec_scopes_; +}; + +size_t GetScopeVarMemorySize(Scope *scope); + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index 8459f3a4..3640e9f7 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -21,10 +21,10 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/platform/profiler.h" - namespace paddle { namespace framework { namespace details { + ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( ExecutionStrategy strategy, std::vector local_scopes, std::vector local_exec_scopes, std::vector var_infos, @@ -35,7 +35,8 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( local_scopes_(std::move(local_scopes)), local_exec_scopes_(std::move(local_exec_scopes)), var_infos_(std::move(var_infos)), - places_(std::move(places)) { + places_(std::move(places)), + scope_monitor_(places_, local_exec_scopes_) { PADDLE_ENFORCE_EQ(local_scopes_.size(), local_exec_scopes_.size()); PrepareLocalExeScopes(); } @@ -49,16 +50,43 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( std::vector fetch_data; std::exception_ptr eptr = nullptr; - try { - fetch_data = underlying_executor_->Run(fetch_tensors); - } catch (...) { - eptr = std::current_exception(); + + auto exe_run_func = [&]() { + try { + fetch_data = underlying_executor_->Run(fetch_tensors); + } catch (...) 
{ + eptr = std::current_exception(); + } + }; + + if (strategy_.num_iteration_per_drop_scope_ == 1) { + exe_run_func(); + } else { + scope_monitor_.Apply(exe_run_func, fetch_tensors.size() > 0); + } + + if (VLOG_IS_ON(5)) { + for (auto *scope : local_exec_scopes_) { + VLOG(5) << "Left " + << string::HumanReadableSize(GetScopeVarMemorySize(scope)) + << " on scope " << scope << " before deleting"; + } } ++drop_scope_counter_; - if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { + if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_ || + DropScopeOrNot()) { DropLocalExeScopes(); } + + if (VLOG_IS_ON(5)) { + for (auto *scope : local_exec_scopes_) { + VLOG(5) << "Left " + << string::HumanReadableSize(GetScopeVarMemorySize(scope)) + << " on scope " << scope << " after deleting"; + } + } + if (eptr) { std::rethrow_exception(eptr); } else { @@ -66,6 +94,19 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( } } +bool ScopeBufferedSSAGraphExecutor::DropScopeOrNot() const { + for (auto &var : tensor_array_vars_) { + auto tensor_array = var->GetMutable(); + for (LoDTensor &tensor : *tensor_array) { + if (tensor.IsInitialized()) { + return true; + } + } + tensor_array->clear(); + } + return false; +} + void ScopeBufferedSSAGraphExecutor::InitVariables() { for (auto &info : tmp_var_infos_) { for (auto &pair : info) { @@ -103,7 +144,7 @@ void ScopeBufferedSSAGraphExecutor::DropLocalExeScopes() { for (auto &p : places_) { platform::DeviceContextPool::Instance().Get(p)->Wait(); } - + scope_monitor_.ClearHistoryLocalExecScopes(); for (size_t i = 0; i < local_exec_scopes_.size(); ++i) { local_exec_scopes_[i]->EraseVarsExcept(preserve_vars_[i]); local_exec_scopes_[i]->DropKids(); @@ -138,6 +179,9 @@ void ScopeBufferedSSAGraphExecutor::PrepareLocalExeScopes() { Variable *tmp_var = local_scope->Var(info.name_); preserve_vars_[idx].emplace(tmp_var); tmp_var_infos_[idx].emplace_back(tmp_var, info.type_); + if (info.type_ == proto::VarType::LOD_TENSOR_ARRAY) { + tensor_array_vars_.emplace_back(tmp_var); + } } } } diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index 988882e6..17493a89 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -14,17 +14,18 @@ #pragma once #include +#include #include #include #include #include #include #include -#include "paddle/fluid/framework/details/op_handle_base.h" -#include "paddle/fluid/framework/details/var_handle.h" - #include "paddle/fluid/framework/details/execution_strategy.h" +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/details/scope_buffered_monitor.h" #include "paddle/fluid/framework/details/ssa_graph_executor.h" +#include "paddle/fluid/framework/details/var_handle.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/place.h" namespace paddle { @@ -60,6 +61,8 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { private: void InitVariables(); + bool DropScopeOrNot() const; + size_t drop_scope_counter_{0}; ExecutionStrategy strategy_; std::unique_ptr underlying_executor_; @@ -70,8 +73,12 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { std::vector>> tmp_var_infos_; + std::vector tensor_array_vars_; + std::vector var_infos_; std::vector places_; + + ScopeBufferedMonitor scope_monitor_; }; } // namespace details } // namespace framework 
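Run() above must let the scope monitor finish its bookkeeping even when the underlying executor throws, which is why the exception is captured inside the lambda and rethrown only at the end. The idiom in isolation:

    #include <exception>
    #include <functional>

    // Capture an exception inside a callback, run cleanup, then rethrow:
    // the same std::exception_ptr idiom ScopeBufferedSSAGraphExecutor::Run uses.
    void RunThenRethrow(const std::function<void()> &body,
                        const std::function<void()> &cleanup) {
      std::exception_ptr eptr = nullptr;
      try {
        body();
      } catch (...) {
        eptr = std::current_exception();
      }
      cleanup();  // always runs, even when body() threw
      if (eptr) std::rethrow_exception(eptr);
    }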
diff --git a/paddle/fluid/framework/details/share_tensor_buffer_functor.cc b/paddle/fluid/framework/details/share_tensor_buffer_functor.cc new file mode 100644 index 00000000..fb43bfbf --- /dev/null +++ b/paddle/fluid/framework/details/share_tensor_buffer_functor.cc @@ -0,0 +1,126 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/share_tensor_buffer_functor.h" +#include +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace details { + +// TODO(zjl): support SelectedRows +static inline const Tensor &GetTensorFromVar(const Variable *var) { + if (var->IsType()) { + return var->Get(); + } else { + PADDLE_THROW("Variable must be type of LoDTensor"); + } +} + +static inline Tensor *GetMutableTensorFromVar(Variable *var) { + if (var->IsType()) { + return var->GetMutable(); + } else { + PADDLE_THROW("Variable must be type of LoDTensor"); + } +} + +ShareTensorBufferFunctor::ShareTensorBufferFunctor( + Scope *scope, size_t scope_idx, const std::string &op_type, + const std::vector &in_var_infos, + const std::vector &out_var_names) + : scope_(scope), + scope_idx_(scope_idx), + op_type_(op_type), + in_var_infos_(in_var_infos), + out_var_names_(out_var_names) { + PADDLE_ENFORCE_EQ(in_var_infos_.size(), out_var_names_.size()); + for (size_t i = 0; i < in_var_infos_.size(); ++i) { + AddReuseVarPair(in_var_infos_[i], out_var_names_[i]); + } +} + +std::unordered_map +ShareTensorBufferFunctor::ReusedVars() const { + std::unordered_map result; + for (size_t i = 0; i < in_var_infos_.size(); ++i) { + result.insert({in_var_infos_[i]->Name(), out_var_names_[i]}); + } + return result; +} + +void ShareTensorBufferFunctor::AddReuseVarPair( + const ir::MemOptVarInfo *in_var_info, const std::string &out_var_name) { + PADDLE_ENFORCE_NOT_NULL(in_var_info, "in_var_info cannot be nullptr"); + PADDLE_ENFORCE_NE(in_var_info->Name(), out_var_name, + "in/out cannot have same name: %s", out_var_name); + in_var_infos_.emplace_back(in_var_info); + out_var_names_.emplace_back(out_var_name); +} + +void ShareTensorBufferFunctor::CallOnce() { + PADDLE_ENFORCE(in_out_vars_.empty(), "in_out_vars_ must be initialized here"); + for (size_t i = 0; i < in_var_infos_.size(); ++i) { + auto *in_var = exec_scope_->FindVar(in_var_infos_[i]->Name()); + auto *out_var = exec_scope_->FindVar(out_var_names_[i]); + PADDLE_ENFORCE_NOT_NULL(in_var); + PADDLE_ENFORCE_NOT_NULL(out_var); + PADDLE_ENFORCE_NE(in_var, out_var); + in_out_vars_.emplace_back(in_var, out_var); + } +} + +void ShareTensorBufferFunctor::operator()(Scope *exec_scope) { + if (!exec_scope_) { + PADDLE_ENFORCE_NOT_NULL(exec_scope); + exec_scope_ = exec_scope; + CallOnce(); + } else { + PADDLE_ENFORCE(exec_scope_ == exec_scope, "Scope must be the same"); + } + + for (size_t i = 0; i < in_var_infos_.size(); 
++i) { + const auto &in_tensor = GetTensorFromVar(in_out_vars_[i].first); + auto *out_tensor = GetMutableTensorFromVar(in_out_vars_[i].second); + auto *in_var_info = in_var_infos_[i]; + + if (UNLIKELY(in_var_info->IsSkippedMemoryReuse())) { + // If in_var is inplaced in the previous batch and we want to fetch + // in_var in the current batch, we have to reset memory of out_var + // to avoid wrong calculation result. + if (in_tensor.Holder() == out_tensor->Holder()) { + VLOG(1) << "Clear " << out_var_names_[i] + << " because you may want to fetch an inplaced variable " + << in_var_info->Name() + << " in previous batch: " << in_var_info->Name() << " -> " + << out_var_names_[i]; + out_tensor->clear(); + } + } else { + out_tensor->ShareBufferWith(in_tensor); + + VLOG(2) << "Share tensor buffer when running " << op_type_ << " : " + << in_var_info->Name() << " -> " << out_var_names_[i]; + } + } +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/share_tensor_buffer_functor.h b/paddle/fluid/framework/details/share_tensor_buffer_functor.h new file mode 100644 index 00000000..774dcd05 --- /dev/null +++ b/paddle/fluid/framework/details/share_tensor_buffer_functor.h @@ -0,0 +1,73 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/variable.h" + +namespace paddle { +namespace framework { +namespace details { + +// NOTE(paddle-dev): ShareTensorBufferFunctor is responsible for +// performing memory reuse in run-time. ShareTensorBufferOpHandle +// is only a wrapper of ShareTensorBufferFunctor. +// Once we find the run-time memory reuse strategy is time-consuming in +// scheduling, we should need a pass to move ShareTensorBufferFunctor into +// each ComputationOpHandle. ShareTensorBufferFunctor is preserved for +// this probable movement. 
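The NOTE above explains why the reuse logic lives in a functor that the op handle merely wraps. The reuse itself is plain buffer aliasing: the output tensor adopts the input's allocation, and when reuse must be skipped for a variable that may be fetched, a stale alias is cleared so the fetch cannot observe memory the next batch overwrites. A shared_ptr-based sketch of that decision (Tensor here is a stand-in, not the Paddle class):

    #include <memory>
    #include <vector>

    struct Tensor {  // stand-in: just an owning handle to a buffer
      std::shared_ptr<std::vector<float>> holder;
      void ShareBufferWith(const Tensor &other) { holder = other.holder; }
      void Clear() { holder.reset(); }
    };

    // Mirrors ShareTensorBufferFunctor::operator(): alias when reuse is legal,
    // drop a stale alias when reuse is skipped (e.g. the input may be fetched).
    void ReuseOrClear(const Tensor &in, Tensor *out, bool skip_reuse) {
      if (skip_reuse) {
        if (out->holder == in.holder) out->Clear();
      } else {
        out->ShareBufferWith(in);
      }
    }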
+class ShareTensorBufferFunctor {
+ public:
+  ShareTensorBufferFunctor(
+      Scope *scope, size_t scope_idx, const std::string &op_type,
+      const std::vector<const ir::MemOptVarInfo *> &in_var_infos,
+      const std::vector<std::string> &out_var_names);
+
+  void AddReuseVarPair(const ir::MemOptVarInfo *in_var_info,
+                       const std::string &out_var_name);
+
+  void operator()(Scope *exec_scope);
+
+  std::unordered_map<std::string, std::string> ReusedVars() const;
+
+  size_t GetScopeIdx() const { return scope_idx_; }
+
+  Scope *GetScope() { return scope_; }
+
+ private:
+  void CallOnce();
+
+ private:
+  Scope *scope_;
+  Scope *exec_scope_{nullptr};
+
+  size_t scope_idx_;
+  std::string op_type_;
+  std::vector<const ir::MemOptVarInfo *> in_var_infos_;
+  std::vector<std::string> out_var_names_;
+
+  std::vector<std::pair<Variable *, Variable *>> in_out_vars_;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc
index 8539eb9d..01c4dc97 100644
--- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc
+++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.cc
@@ -25,55 +25,42 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-// TODO(zjl): support SelectedRows
-static inline const Tensor &GetTensorFromVar(const Variable *var) {
-  if (var->IsType<LoDTensor>()) {
-    return var->Get<LoDTensor>();
-  } else {
-    PADDLE_THROW("Variable must be type of LoDTensor");
-  }
-}
+ComputationOpHandle *GetUniquePendingComputationOpHandle(
+    ShareTensorBufferOpHandle *share_tensor_op) {
+  ComputationOpHandle *result_op = nullptr;
+  for (ir::Node *out_var : share_tensor_op->Node()->outputs) {
+    for (ir::Node *pending_op : out_var->outputs) {
+      auto &op = pending_op->Wrapper<OpHandleBase>();
+      auto *compute_op = dynamic_cast<ComputationOpHandle *>(&op);
+      PADDLE_ENFORCE_NOT_NULL(compute_op);
 
-static inline Tensor *GetMutableTensorFromVar(Variable *var) {
-  if (var->IsType<LoDTensor>()) {
-    return var->GetMutable<LoDTensor>();
-  } else {
-    PADDLE_THROW("Variable must be type of LoDTensor");
+      if (result_op == nullptr) {
+        result_op = compute_op;
+      } else {
+        PADDLE_ENFORCE_EQ(result_op, compute_op);
+      }
+    }
   }
+
+  PADDLE_ENFORCE_NOT_NULL(result_op);
+  return result_op;
 }
 
 ShareTensorBufferOpHandle::ShareTensorBufferOpHandle(
     ir::Node *node, Scope *scope, size_t scope_idx, const std::string &op_type,
-    const std::vector<ir::MemOptVarInfo *> &in_var_infos,
+    const std::vector<const ir::MemOptVarInfo *> &in_var_infos,
     const std::vector<std::string> &out_var_names)
     : OpHandleBase(node),
-      scope_(scope),
-      scope_idx_(scope_idx),
-      op_type_(op_type),
-      in_var_infos_(in_var_infos),
-      out_var_names_(out_var_names) {
-  PADDLE_ENFORCE_EQ(in_var_infos_.size(), out_var_names_.size());
-  for (size_t i = 0; i < in_var_infos_.size(); ++i) {
-    Add(in_var_infos_[i], out_var_names_[i]);
-  }
-}
+      functor_(scope, scope_idx, op_type, in_var_infos, out_var_names) {}
 
-std::unordered_set<std::string> ShareTensorBufferOpHandle::ReusedVarSet()
-    const {
-  std::unordered_set<std::string> result;
-  for (auto &in_var_info : in_var_infos_) {
-    result.insert(in_var_info->Name());
-  }
-  return result;
+std::unordered_map<std::string, std::string>
+ShareTensorBufferOpHandle::ReusedVars() const {
+  return functor_.ReusedVars();
 }
 
-void ShareTensorBufferOpHandle::Add(ir::MemOptVarInfo *in_var_info,
-                                    const std::string &out_var_name) {
-  PADDLE_ENFORCE_NOT_NULL(in_var_info, "in_var_info cannot be nullptr");
-  PADDLE_ENFORCE_NE(in_var_info->Name(), out_var_name,
-                    "in/out cannot have same name: %s", out_var_name);
-  in_var_infos_.emplace_back(in_var_info);
-  out_var_names_.emplace_back(out_var_name);
+void ShareTensorBufferOpHandle::AddReuseVarPair(
+    const ir::MemOptVarInfo *in_var_info, const
std::string &out_var_name) { + functor_.AddReuseVarPair(in_var_info, out_var_name); } void ShareTensorBufferOpHandle::InitCUDA() { @@ -84,49 +71,7 @@ void ShareTensorBufferOpHandle::InitCUDA() { #endif } -void ShareTensorBufferOpHandle::CallOnce() { - PADDLE_ENFORCE(in_out_vars_.empty(), "in_out_vars_ must be initialized here"); - Scope *exec_scope = local_exec_scopes_[0]; - for (size_t i = 0; i < in_var_infos_.size(); ++i) { - auto *in_var = exec_scope->FindVar(in_var_infos_[i]->Name()); - auto *out_var = exec_scope->FindVar(out_var_names_[i]); - PADDLE_ENFORCE_NOT_NULL(in_var); - PADDLE_ENFORCE_NOT_NULL(out_var); - PADDLE_ENFORCE_NE(in_var, out_var); - in_out_vars_.emplace_back(in_var, out_var); - } -} - -void ShareTensorBufferOpHandle::RunImpl() { - if (in_var_infos_.size() != in_out_vars_.size()) { - CallOnce(); - } - - for (size_t i = 0; i < in_var_infos_.size(); ++i) { - const auto &in_tensor = GetTensorFromVar(in_out_vars_[i].first); - auto *out_tensor = GetMutableTensorFromVar(in_out_vars_[i].second); - auto *in_var_info = in_var_infos_[i]; - - if (UNLIKELY(in_var_info->IsSkipped())) { - // If in_var is inplaced in the previous batch and we want to fetch - // in_var in the current batch, we have to reset memory of out_var - // to avoid wrong calculation result. - if (in_tensor.Holder() == out_tensor->Holder()) { - VLOG(1) << "Clear " << out_var_names_[i] - << " because you may want to fetch an inplaced variable " - << in_var_info->Name() - << " in previous batch: " << in_var_info->Name() << " -> " - << out_var_names_[i]; - out_tensor->clear(); - } - } else { - out_tensor->ShareBufferWith(in_tensor); - - VLOG(2) << "Share tensor buffer when running " << op_type_ << " : " - << in_var_info->Name() << " -> " << out_var_names_[i]; - } - } -} +void ShareTensorBufferOpHandle::RunImpl() { functor_(local_exec_scopes_[0]); } } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h index 87e971ba..b22f5621 100644 --- a/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h +++ b/paddle/fluid/framework/details/share_tensor_buffer_op_handle.h @@ -14,22 +14,15 @@ #pragma once #include -#include +#include #include #include +#include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/details/share_tensor_buffer_functor.h" namespace paddle { namespace framework { - -class Variable; -class Scope; -class Tensor; - -namespace ir { -class MemOptVarInfo; -} // namespace ir - namespace details { class ShareTensorBufferOpHandle : public OpHandleBase { @@ -37,16 +30,19 @@ class ShareTensorBufferOpHandle : public OpHandleBase { ShareTensorBufferOpHandle( ir::Node *node, Scope *scope, size_t scope_idx, const std::string &op_type, - const std::vector &in_vars_infos, + const std::vector &in_vars_infos, const std::vector &out_var_names); - std::unordered_set ReusedVarSet() const; + std::unordered_map ReusedVars() const; Priority GetPriority() const override { return Priority::kHighest; } - size_t GetScopeIdx() const { return scope_idx_; } + size_t GetScopeIdx() const { return functor_.GetScopeIdx(); } + + void AddReuseVarPair(const ir::MemOptVarInfo *in_var_info, + const std::string &out_var_name); - void Add(ir::MemOptVarInfo *in_var_info, const std::string &ou_var_name); + const ShareTensorBufferFunctor &Functor() const { return functor_; } protected: std::string Name() 
const override { return "buffer_share"; } @@ -55,20 +51,17 @@ class ShareTensorBufferOpHandle : public OpHandleBase { void InitCUDA() override; - std::vector GetLocalScopes() override { return {scope_}; } + std::vector GetLocalScopes() override { + return {functor_.GetScope()}; + } private: - void CallOnce(); - - Scope *scope_; - size_t scope_idx_; - std::string op_type_; - std::vector in_var_infos_; - std::vector out_var_names_; - - std::vector> in_out_vars_; + ShareTensorBufferFunctor functor_; }; +ComputationOpHandle *GetUniquePendingComputationOpHandle( + ShareTensorBufferOpHandle *share_tensor_op); + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc index a2461a36..070a17a9 100644 --- a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc @@ -18,6 +18,7 @@ #include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/variable_visitor.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/profiler.h" @@ -103,16 +104,15 @@ void SparseAllReduceOpHandle::RunImplEncoded() { int dev_id = boost::get(place).device; auto *nccl_ctxs = nccl_ctxs_->GetRunEnvNCCLCtx(run_order_, false); auto &nccl_ctx = nccl_ctxs->at(dev_id); + auto *dev_ctx = nccl_ctxs->DevCtx(dev_id); auto stream = nccl_ctx.stream(); auto comm = nccl_ctx.comm_; - auto &allocator = - platform::DeviceTemporaryAllocator::Instance().Get(place, stream); int encode_size = 2 * k * sizeof(int); // dgc use ncclAllGather to get all the encoded data // so the buffer need nranks. int buf_size = nranks_ * encode_size; - auto tmp_ious_data = allocator.Allocate(buf_size); + auto tmp_ious_data = memory::Alloc(*dev_ctx, buf_size); void *gather_buff = reinterpret_cast(tmp_ious_data->ptr()); VLOG(10) << "in_numel:" << in_numel << ", out_numel:" << out_numel @@ -126,7 +126,7 @@ void SparseAllReduceOpHandle::RunImplEncoded() { }); } - RunAllReduceFuncs(all_reduce_calls); + NCCLAllReduceFunc(all_reduce_calls); } int SparseAllReduceOpHandle::GetKValue(const std::string &grad_name) { diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h index 93060ef2..86428f8b 100644 --- a/paddle/fluid/framework/details/var_handle.h +++ b/paddle/fluid/framework/details/var_handle.h @@ -21,6 +21,7 @@ #include #include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/place.h" namespace paddle { @@ -74,12 +75,16 @@ struct VarHandleBase { OpHandleBase* GeneratedOp() { return generated_op_; } + const OpHandleBase* GeneratedOp() const { return generated_op_; } + const std::unordered_set& PendingOps() const { return pending_ops_; } ir::Node* Node() { return node_; } + const ir::Node* Node() const { return node_; } + protected: // The operator who generate this variable. nullptr if the variable // is a root node. @@ -96,6 +101,9 @@ struct VarHandleBase { // // NOTE: runtime variables have place. 
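For context, the DISABLE_COPY_AND_ASSIGN macro applied to VarHandle below comes from paddle/fluid/platform/macros.h and expands to roughly the following (a sketch from memory, not part of the patch):

    #define DISABLE_COPY_AND_ASSIGN(classname)         \
     private:                                          \
      classname(const classname&) = delete;            \
      classname(classname&&) = delete;                 \
      classname& operator=(const classname&) = delete; \
      classname& operator=(classname&&) = delete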
struct VarHandle : public VarHandleBase { + DISABLE_COPY_AND_ASSIGN(VarHandle); + + public: virtual ~VarHandle(); std::string DebugString() const override; diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index 8c91a5b9..61649530 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -114,12 +114,19 @@ class DeviceWorker { virtual void BindingDataFeedMemory() = 0; virtual void SetRootScope(Scope* root_scope); virtual void SetDataFeed(DataFeed* data_feed); + virtual void SetNeedDump(bool need_dump_field) {} + virtual void SetChannelWriter(ChannelObject* queue) {} virtual void SetPlace(const paddle::platform::Place& place) { place_ = place; } + virtual void SetReaderPlace(const paddle::platform::Place& place) { + device_reader_->SetPlace(place); + } + virtual Scope* GetThreadScope() { return thread_scope_; } protected: Scope* root_scope_ = nullptr; + Scope* thread_scope_; paddle::platform::Place place_; DataFeed* device_reader_ = nullptr; int64_t batch_num_; @@ -144,22 +151,30 @@ class CPUWorkerBase : public DeviceWorker { class HogwildWorker : public CPUWorkerBase { public: HogwildWorker() {} - virtual ~HogwildWorker() {} + virtual ~HogwildWorker() { + for (OperatorBase* op : ops_) { + delete op; + } + std::vector().swap(ops_); + } virtual void Initialize(const TrainerDesc& desc); virtual void TrainFiles(); virtual void TrainFilesWithProfiler(); virtual void PrintFetchVars(); virtual void CreateDeviceResource(const ProgramDesc& main_prog); virtual void BindingDataFeedMemory(); + template + void SetZero(LoDTensor* tensor, LoDTensor* root_tensor, int tensor_dim); protected: void CreateThreadOperators(const ProgramDesc& program); void CreateThreadScope(const ProgramDesc& program); std::vector op_names_; std::vector ops_; - Scope* thread_scope_; + // Scope* thread_scope_; HogwildWorkerParameter param_; std::vector skip_ops_; + std::map stat_var_name_map_; }; class DownpourWorker : public HogwildWorker { @@ -169,6 +184,8 @@ class DownpourWorker : public HogwildWorker { virtual void Initialize(const TrainerDesc& desc); virtual void TrainFiles(); virtual void TrainFilesWithProfiler(); + virtual void SetNeedDump(bool need_dump_field); + virtual void SetChannelWriter(ChannelObject* queue); protected: std::shared_ptr fleet_ptr_; @@ -176,11 +193,17 @@ class DownpourWorker : public HogwildWorker { void FillSparseValue(size_t table_id); void PushGradients(); void CollectLabelInfo(size_t table_id); + void AdjustInsWeight(); private: bool need_to_push_dense_; + bool need_dump_field_; + bool dump_slot_; bool need_to_push_sparse_; + std::vector dump_fields_; + ChannelWriter writer_; DownpourWorkerParameter param_; + float scale_datanorm_; // just save the value in param_ for easy access std::map label_var_name_; std::map> sparse_key_names_; @@ -203,6 +226,10 @@ class DownpourWorker : public HogwildWorker { std::shared_ptr _pull_dense_worker; std::vector<::std::future> push_sparse_status_; std::vector<::std::future> push_dense_status_; + + // adjust ins weight + AdjustInsWeightConfig adjust_ins_weight_config_; + std::vector nid_show_; }; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) @@ -285,7 +312,6 @@ class SectionWorker : public DeviceWorker { int section_num_; int pipeline_num_; int thread_id_; - // This worker will consume scope from in_scope_queue_ // and produce scope to out_scope_queue_ ScopeQueue* in_scope_queue_ = nullptr; diff --git a/paddle/fluid/framework/dim.h b/paddle/fluid/framework/dim.h index 
531ee844..66214b26 100644 --- a/paddle/fluid/framework/dim.h +++ b/paddle/fluid/framework/dim.h @@ -20,7 +20,6 @@ #include #include "paddle/fluid/framework/array.h" -#include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/hostdevice.h" diff --git a/paddle/fluid/framework/dist_multi_trainer.cc b/paddle/fluid/framework/dist_multi_trainer.cc index 8cd0789c..56a9ebc3 100644 --- a/paddle/fluid/framework/dist_multi_trainer.cc +++ b/paddle/fluid/framework/dist_multi_trainer.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include #include +#include "io/fs.h" #include "paddle/fluid/framework/data_feed_factory.h" #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/device_worker_factory.h" @@ -22,16 +23,34 @@ limitations under the License. */ namespace paddle { namespace framework { -void DistMultiTrainer::Initialize(const TrainerDesc& trainer_desc, - Dataset* dataset) { +void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc, + Dataset *dataset) { thread_num_ = trainer_desc.thread_num(); SetDataset(dataset); - const std::vector readers = + dump_fields_path_ = trainer_desc.dump_fields_path(); + dump_converter_ = trainer_desc.dump_converter(); + need_dump_field_ = false; + if (trainer_desc.dump_fields_size() != 0 && dump_fields_path_ != "") { + need_dump_field_ = true; + } + if (need_dump_field_) { + auto &file_list = dataset->GetFileList(); + if (file_list.size() == 0) { + need_dump_field_ = false; + } + } + mpi_rank_ = trainer_desc.mpi_rank() / 2; + const std::vector readers = dataset->GetReaders(); thread_num_ = readers.size(); workers_.resize(thread_num_); + for (int i = 0; i < trainer_desc.downpour_param().stat_var_names_size(); + i++) { + need_merge_var_names_.push_back( + trainer_desc.downpour_param().stat_var_names(i)); + } for (int i = 0; i < thread_num_; ++i) { workers_[i] = DeviceWorkerFactory::CreateDeviceWorker( @@ -39,6 +58,7 @@ void DistMultiTrainer::Initialize(const TrainerDesc& trainer_desc, workers_[i]->SetDeviceIndex(i); workers_[i]->SetDataFeed(readers[i]); workers_[i]->Initialize(trainer_desc); + workers_[i]->SetNeedDump(need_dump_field_); } VLOG(3) << "going to initialize pull dense worker"; @@ -48,7 +68,51 @@ void DistMultiTrainer::Initialize(const TrainerDesc& trainer_desc, SetDebug(trainer_desc.debug()); } -void DistMultiTrainer::InitOtherEnv(const ProgramDesc& main_program) { +void DistMultiTrainer::DumpWork() { +#ifdef _LINUX + while (1) { + std::string out_str; + if (!queue_->Get(out_str)) { + break; + } + size_t write_count = + fwrite_unlocked(out_str.data(), 1, out_str.length(), fp_.get()); + if (write_count != out_str.length()) { + VLOG(3) << "dump text failed"; + continue; + } + write_count = fwrite_unlocked("\n", 1, 1, fp_.get()); + if (write_count != 1) { + VLOG(3) << "dump text failed"; + continue; + } + } +#endif +} + +void DistMultiTrainer::InitDumpEnv() { + queue_ = paddle::framework::MakeChannel(); + int err_no = 0; + std::string path = string::format_string( + "%s/part-%03d", dump_fields_path_.c_str(), mpi_rank_); + + fp_ = fs_open_write(path, &err_no, dump_converter_); + for (int i = 0; i < thread_num_; ++i) { + workers_[i]->SetChannelWriter(queue_.get()); + } + dump_thread_ = std::thread(&DistMultiTrainer::DumpWork, this); +} + +void DistMultiTrainer::FinalizeDumpEnv() { + queue_->Close(); + dump_thread_.join(); + queue_.reset(); +} + +void DistMultiTrainer::InitOtherEnv(const ProgramDesc &main_program) { + if (need_dump_field_) { + InitDumpEnv(); + } 
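// Illustrative data-flow note (not part of the patch): the field-dump path
// added in this file is one channel drained by a single consumer thread:
//   InitDumpEnv():     queue_ = MakeChannel<std::string>();
//                      fp_ = fs_open_write("<dump_fields_path>/part-%03d", ...);
//                      dump_thread_ = std::thread(&DistMultiTrainer::DumpWork, this);
//   workers:           SetChannelWriter(queue_.get()), one dumped line per instance
//   FinalizeDumpEnv(): queue_->Close(); dump_thread_.join();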
pull_dense_worker_->SetRootScope(root_scope_); pull_dense_worker_->Start(); VLOG(3) << "init other env done."; @@ -67,12 +131,59 @@ void DistMultiTrainer::Run() { } void DistMultiTrainer::Finalize() { - for (auto& th : threads_) { + for (auto &th : threads_) { th.join(); } + for (int i = 0; i < need_merge_var_names_.size(); i++) { + Variable *root_var = root_scope_->FindVar(need_merge_var_names_[i]); + if (root_var == nullptr) { + continue; + } + LoDTensor *root_tensor = root_var->GetMutable(); + for (int j = 1; j < thread_num_; j++) { + Scope *cur_thread_scope = workers_[j]->GetThreadScope(); + Variable *thread_var = + cur_thread_scope->FindVar(need_merge_var_names_[i]); + LoDTensor *thread_tensor = thread_var->GetMutable(); + if (root_tensor->numel() != thread_tensor->numel()) { + continue; + } +#define MergeCallback(cpp_type, proto_type) \ + do { \ + if (root_tensor->type() == proto_type) { \ + if (thread_tensor->type() != proto_type) { \ + VLOG(0) << "Error: thread id=" << j << ", need_merge_var_names_[" << i \ + << "] " << need_merge_var_names_[i] \ + << ", root tensor type=" << root_tensor->type() \ + << ", thread tensor type=" << thread_tensor->type(); \ + exit(-1); \ + } \ + MergeToRootScope(root_tensor, thread_tensor); \ + } \ + } while (0) + _ForEachDataType_(MergeCallback); + } + } + + if (need_dump_field_) { + FinalizeDumpEnv(); + } pull_dense_worker_->Stop(); root_scope_->DropKids(); + + // flush local client push queue + auto fleet_ptr_ = FleetWrapper::GetInstance(); + fleet_ptr_->ClientFlush(); } +template +void DistMultiTrainer::MergeToRootScope(LoDTensor *root_tensor, + LoDTensor *tensor) { + T *root_data = root_tensor->data(); + T *data = tensor->data(); + for (int i = 0; i < tensor->numel(); i++) { + root_data[i] += data[i]; + } +} } // end namespace framework } // end namespace paddle diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc index 510a9943..e7dbf3b1 100644 --- a/paddle/fluid/framework/downpour_worker.cc +++ b/paddle/fluid/framework/downpour_worker.cc @@ -15,6 +15,12 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/string/string_helper.h" + +#if defined _WIN32 || defined __APPLE__ +#else +#define _LINUX +#endif namespace paddle { namespace framework { @@ -58,12 +64,99 @@ void DownpourWorker::Initialize(const TrainerDesc& desc) { skip_ops_[i] = param_.skip_ops(i); } + for (int i = 0; i < param_.stat_var_names_size(); ++i) { + stat_var_name_map_[param_.stat_var_names(i)] = 1; + } + need_to_push_sparse_ = param_.push_sparse(); need_to_push_dense_ = param_.push_dense(); fleet_ptr_ = FleetWrapper::GetInstance(); fetch_config_ = desc.fetch_config(); use_cvm_ = desc.use_cvm(); + scale_datanorm_ = desc.scale_datanorm(); + dump_slot_ = desc.dump_slot(); + dump_fields_.resize(desc.dump_fields_size()); + for (int i = 0; i < desc.dump_fields_size(); ++i) { + dump_fields_[i] = desc.dump_fields(i); + } + adjust_ins_weight_config_ = desc.adjust_ins_weight_config(); +} + +void DownpourWorker::SetChannelWriter(ChannelObject* queue) { + writer_.Reset(queue); +} + +void DownpourWorker::SetNeedDump(bool need_dump_field) { + need_dump_field_ = need_dump_field; +} + +template +std::string PrintLodTensorType(LoDTensor* tensor, int64_t start, int64_t end) { + auto count = tensor->numel(); + if (start < 0 || end > count) { + VLOG(3) << "access violation"; + return "access violation"; + } + std::ostringstream os; + for (int64_t i = start; i < end; i++) { + os << ":" << tensor->data()[i]; + } + return os.str(); +} + +std::string PrintLodTensorIntType(LoDTensor* tensor, int64_t start, + int64_t end) { + auto count = tensor->numel(); + if (start < 0 || end > count) { + VLOG(3) << "access violation"; + return "access violation"; + } + std::ostringstream os; + for (int64_t i = start; i < end; i++) { + os << ":" << static_cast(tensor->data()[i]); + } + return os.str(); +} + +std::string PrintLodTensor(LoDTensor* tensor, int64_t start, int64_t end) { + std::string out_val; + if (tensor->type() == proto::VarType::FP32) { + out_val = PrintLodTensorType(tensor, start, end); + } else if (tensor->type() == proto::VarType::INT64) { + out_val = PrintLodTensorIntType(tensor, start, end); + } else if (tensor->type() == proto::VarType::FP64) { + out_val = PrintLodTensorType(tensor, start, end); + } else { + out_val = "unsupported type"; + } + return out_val; +} + +std::pair GetTensorBound(LoDTensor* tensor, int index) { + auto& dims = tensor->dims(); + if (tensor->lod().size() != 0) { + auto& lod = tensor->lod()[0]; + return {lod[index] * dims[1], lod[index + 1] * dims[1]}; + } else { + return {index * dims[1], (index + 1) * dims[1]}; + } +} + +bool CheckValidOutput(LoDTensor* tensor, int batch_size) { + auto& dims = tensor->dims(); + if (dims.size() != 2) return false; + if (tensor->lod().size() != 0) { + auto& lod = tensor->lod()[0]; + if (lod.size() != batch_size + 1) { + return false; + } + } else { + if (dims[0] != batch_size) { + return false; + } + } + return true; } void DownpourWorker::CollectLabelInfo(size_t table_idx) { @@ -148,30 +241,130 @@ void DownpourWorker::FillSparseValue(size_t table_idx) { auto& tensor_lod = tensor->lod()[0]; LoD data_lod{tensor_lod}; tensor_emb->set_lod(data_lod); + + bool is_nid = (adjust_ins_weight_config_.need_adjust() && + adjust_ins_weight_config_.nid_slot() == emb_slot_name); + if (is_nid) { + nid_show_.clear(); + } + int nid_ins_index = 0; + for (int index = 0; index < len; ++index) { if (use_cvm_) { if (ids[index] == 
0u) { memcpy(ptr + table.emb_dim() * index, init_value.data(), sizeof(float) * table.emb_dim()); + if (is_nid) { + nid_show_.push_back(-1); + ++nid_ins_index; + } continue; } memcpy(ptr + table.emb_dim() * index, fea_value[fea_idx].data(), sizeof(float) * table.emb_dim()); + if (is_nid && index == tensor->lod()[0][nid_ins_index]) { + nid_show_.push_back(fea_value[fea_idx][0]); + ++nid_ins_index; + } fea_idx++; } else { if (ids[index] == 0u) { memcpy(ptr + table.emb_dim() * index, init_value.data() + 2, sizeof(float) * table.emb_dim()); + if (is_nid) { + nid_show_.push_back(-1); + ++nid_ins_index; + } continue; } memcpy(ptr + table.emb_dim() * index, fea_value[fea_idx].data() + 2, sizeof(float) * table.emb_dim()); + if (is_nid && index == tensor->lod()[0][nid_ins_index]) { + nid_show_.push_back(fea_value[fea_idx][0]); + ++nid_ins_index; + } fea_idx++; } } } } +void DownpourWorker::AdjustInsWeight() { +#ifdef _LINUX + // check var and tensor not null + if (!adjust_ins_weight_config_.need_adjust()) { + VLOG(0) << "need_adjust=false, skip adjust ins weight"; + return; + } + Variable* nid_var = + thread_scope_->FindVar(adjust_ins_weight_config_.nid_slot()); + if (nid_var == nullptr) { + VLOG(0) << "nid slot var " << adjust_ins_weight_config_.nid_slot() + << " is nullptr, skip adjust ins weight"; + return; + } + LoDTensor* nid_tensor = nid_var->GetMutable(); + if (nid_tensor == nullptr) { + VLOG(0) << "tensor of nid slot var " << adjust_ins_weight_config_.nid_slot() + << " is nullptr, skip adjust ins weight"; + return; + } + Variable* ins_weight_var = + thread_scope_->FindVar(adjust_ins_weight_config_.ins_weight_slot()); + if (ins_weight_var == nullptr) { + VLOG(0) << "ins weight var " << adjust_ins_weight_config_.ins_weight_slot() + << " is nullptr, skip adjust ins weight"; + return; + } + LoDTensor* ins_weight_tensor = ins_weight_var->GetMutable(); + if (ins_weight_tensor == nullptr) { + VLOG(0) << "tensor of ins weight tensor " + << adjust_ins_weight_config_.ins_weight_slot() + << " is nullptr, skip adjust ins weight"; + return; + } + + float* ins_weights = ins_weight_tensor->data(); + size_t len = ins_weight_tensor->numel(); // len = batch size + // here we assume nid_show slot only has one feasign in each instance + CHECK(len == nid_show_.size()) << "ins_weight size should be equal to " + << "nid_show size, " << len << " vs " + << nid_show_.size(); + float nid_adjw_threshold = adjust_ins_weight_config_.nid_adjw_threshold(); + float nid_adjw_ratio = adjust_ins_weight_config_.nid_adjw_ratio(); + int64_t nid_adjw_num = 0; + double nid_adjw_weight = 0.0; + size_t ins_index = 0; + for (int i = 0; i < len; ++i) { + float nid_show = nid_show_[i]; + VLOG(3) << "nid_show " << nid_show; + if (nid_show < 0) { + VLOG(3) << "nid_show < 0, continue"; + continue; + } + float ins_weight = 1.0; + if (nid_show >= 0 && nid_show < nid_adjw_threshold) { + ins_weight = log(M_E + + (nid_adjw_threshold - nid_show) / nid_adjw_threshold * + nid_adjw_ratio); + // count nid adjw insnum and weight + ++nid_adjw_num; + nid_adjw_weight += ins_weight; + // choose large ins weight + VLOG(3) << "ins weight new " << ins_weight << ", ins weight origin " + << ins_weights[ins_index]; + if (ins_weight > ins_weights[ins_index]) { + VLOG(3) << "ins " << ins_index << " weight changes to " << ins_weight; + ins_weights[ins_index] = ins_weight; + } + ++ins_index; + } + } + VLOG(3) << "nid adjw info: total_adjw_num: " << nid_adjw_num + << ", avg_adjw_weight: " << nid_adjw_weight; +#endif +} + void 
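/* Worked example for AdjustInsWeight() above (illustrative; the threshold and
   ratio values are invented): with nid_adjw_threshold = 1000 and
   nid_adjw_ratio = 20, an instance whose nid show count is 10 gets
     ins_weight = log(e + (1000 - 10) / 1000 * 20) = log(22.52) ~= 3.11,
   and the new weight is only written back when it exceeds the existing one;
   instances with nid_show < 0 or nid_show >= 1000 keep their weight of 1.0. */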
DownpourWorker::TrainFilesWithProfiler() { VLOG(3) << "Begin to train files with profiler"; platform::SetNumThreads(1); @@ -200,6 +393,7 @@ void DownpourWorker::TrainFilesWithProfiler() { double total_time = 0.0; double read_time = 0.0; double pull_sparse_time = 0.0; + double adjust_ins_weight_time = 0.0; double collect_label_time = 0.0; double fill_sparse_time = 0.0; double push_sparse_time = 0.0; @@ -207,8 +401,6 @@ void DownpourWorker::TrainFilesWithProfiler() { int cur_batch; int batch_cnt = 0; uint64_t total_inst = 0; - double op_sum_time = 0; - std::unordered_map op_to_time; timeline.Start(); while ((cur_batch = device_reader_->Next()) > 0) { timeline.Pause(); @@ -243,6 +435,16 @@ void DownpourWorker::TrainFilesWithProfiler() { timeline.Pause(); fill_sparse_time += timeline.ElapsedSec(); total_time += timeline.ElapsedSec(); + timeline.Start(); + auto nid_iter = std::find(sparse_value_names_[tid].begin(), + sparse_value_names_[tid].end(), + adjust_ins_weight_config_.nid_slot()); + if (nid_iter != sparse_value_names_[tid].end()) { + AdjustInsWeight(); + } + timeline.Pause(); + adjust_ins_weight_time += timeline.ElapsedSec(); + total_time += timeline.ElapsedSec(); } VLOG(3) << "Fill sparse value for all sparse table done."; @@ -282,7 +484,8 @@ void DownpourWorker::TrainFilesWithProfiler() { fleet_ptr_->PushSparseVarsWithLabelAsync( *thread_scope_, tid, features_[tid], feature_labels_[tid], sparse_key_names_[tid], sparse_grad_names_[tid], table.emb_dim(), - &feature_grads_[tid], &push_sparse_status_, cur_batch, use_cvm_); + &feature_grads_[tid], &push_sparse_status_, cur_batch, use_cvm_, + dump_slot_); timeline.Pause(); push_sparse_time += timeline.ElapsedSec(); total_time += timeline.ElapsedSec(); @@ -296,7 +499,8 @@ void DownpourWorker::TrainFilesWithProfiler() { uint64_t tid = static_cast( param_.program_config(0).push_dense_table_id(i)); fleet_ptr_->PushDenseVarsAsync( - *thread_scope_, tid, dense_grad_names_[tid], &push_sparse_status_); + *thread_scope_, tid, dense_grad_names_[tid], &push_sparse_status_, + scale_datanorm_, cur_batch); } timeline.Pause(); push_dense_time += timeline.ElapsedSec(); @@ -354,6 +558,8 @@ void DownpourWorker::TrainFilesWithProfiler() { if (thread_id_ == 0) { // should be configured here if (batch_cnt > 0 && batch_cnt % 100 == 0) { + double op_sum_time = 0; + std::unordered_map op_to_time; for (size_t i = 0; i < op_total_time.size(); ++i) { fprintf(stderr, "op_name:[%zu][%s], op_mean_time:[%fs]\n", i, op_name[i].c_str(), op_total_time[i] / batch_cnt); @@ -378,10 +584,15 @@ void DownpourWorker::TrainFilesWithProfiler() { fprintf(stderr, "push dense time: %fs\n", push_dense_time / batch_cnt); fprintf(stderr, "collect label time: %fs\n", collect_label_time / batch_cnt); + fprintf(stderr, "adjust ins weight time: %fs\n", + adjust_ins_weight_time / batch_cnt); fprintf(stderr, "mean read time: %fs\n", read_time / batch_cnt); fprintf(stderr, "IO percent: %f\n", read_time / total_time * 100); + fprintf(stderr, "op run percent: %f\n", op_sum_time / total_time * 100); fprintf(stderr, "pull sparse time percent: %f\n", pull_sparse_time / total_time * 100); + fprintf(stderr, "adjust ins weight time percent: %f\n", + adjust_ins_weight_time / total_time * 100); fprintf(stderr, "collect label time percent: %f\n", collect_label_time / total_time * 100); fprintf(stderr, "fill sparse time percent: %f\n", @@ -421,6 +632,12 @@ void DownpourWorker::TrainFiles() { &feature_values_[tid], table.fea_dim()); CollectLabelInfo(i); FillSparseValue(i); + auto nid_iter = 
std::find(sparse_value_names_[tid].begin(), + sparse_value_names_[tid].end(), + adjust_ins_weight_config_.nid_slot()); + if (nid_iter != sparse_value_names_[tid].end()) { + AdjustInsWeight(); + } } VLOG(3) << "fill sparse value for all sparse table done."; @@ -454,7 +671,8 @@ void DownpourWorker::TrainFiles() { fleet_ptr_->PushSparseVarsWithLabelAsync( *thread_scope_, tid, features_[tid], feature_labels_[tid], sparse_key_names_[tid], sparse_grad_names_[tid], table.emb_dim(), - &feature_grads_[tid], &push_sparse_status_, cur_batch, use_cvm_); + &feature_grads_[tid], &push_sparse_status_, cur_batch, use_cvm_, + dump_slot_); } } @@ -464,9 +682,9 @@ void DownpourWorker::TrainFiles() { uint64_t tid = static_cast( param_.program_config(0).push_dense_table_id(i)); fleet_ptr_->PushDenseVarsAsync( - *thread_scope_, tid, dense_grad_names_[tid], &push_sparse_status_); + *thread_scope_, tid, dense_grad_names_[tid], &push_sparse_status_, + scale_datanorm_, cur_batch); } - VLOG(3) << "push dense gradient done."; // the following code should be more precise and clean @@ -512,11 +730,52 @@ void DownpourWorker::TrainFiles() { pull_dense_worker_->IncreaseThreadVersion(thread_id_, tid); } } + if (need_dump_field_) { + int batch_size = device_reader_->GetCurBatchSize(); + std::vector ars(batch_size); + for (auto& ar : ars) { + ar.clear(); + } + auto& ins_id_vec = device_reader_->GetInsIdVec(); + auto& ins_content_vec = device_reader_->GetInsContentVec(); + for (size_t i = 0; i < ins_id_vec.size(); i++) { + ars[i] += ins_id_vec[i]; + ars[i] = ars[i] + "\t" + ins_content_vec[i]; + } + for (auto& field : dump_fields_) { + Variable* var = thread_scope_->FindVar(field); + if (var == nullptr) { + continue; + } + LoDTensor* tensor = var->GetMutable(); + if (!CheckValidOutput(tensor, batch_size)) { + continue; + } + for (int i = 0; i < batch_size; ++i) { + auto output_dim = tensor->dims()[1]; + std::string output_dimstr = + boost::lexical_cast(output_dim); + ars[i] = ars[i] + "\t" + field + ":" + output_dimstr; + auto bound = GetTensorBound(tensor, i); + ars[i] += PrintLodTensor(tensor, bound.first, bound.second); + } + } + // #pragma omp parallel for + for (size_t i = 0; i < ars.size(); i++) { + if (ars[i].length() == 0) { + continue; + } + writer_ << ars[i]; + } + } PrintFetchVars(); thread_scope_->DropKids(); ++batch_cnt; } + if (need_dump_field_) { + writer_.Flush(); + } } } // end namespace framework diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index cfab2f5f..df9b53d6 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -30,6 +30,7 @@ limitations under the License. */ #include "paddle/fluid/framework/trainer_factory.h" #include "paddle/fluid/framework/transfer_scope_cache.h" #include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h" #include "paddle/fluid/operators/controlflow/recurrent_op_helper.h" #include "paddle/fluid/operators/controlflow/while_op_helper.h" #include "paddle/fluid/operators/distributed/distributed.h" @@ -38,11 +39,11 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_NGRAPH #include "paddle/fluid/operators/ngraph/ngraph_engine.h" -DEFINE_bool(use_ngraph, false, "Use NGRAPH to run"); #endif DECLARE_bool(benchmark); DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run"); +DEFINE_bool(use_ngraph, false, "Use NGRAPH to run"); namespace paddle { namespace framework { @@ -58,10 +59,31 @@ ExecutorPrepareContext::ExecutorPrepareContext( void ExecutorPrepareContext::PrepareUnusedVars( const std::vector& keep_vars, bool force_disable_gc) { +#ifdef PADDLE_WITH_NGRAPH + if (FLAGS_use_ngraph) { + // FIXME(zjl): There is difference when ngraph and gc are both enabled + // in unittests. I do not know why it happens. Maybe ngraph engine + // would cache some variables? + LOG_FIRST_N(WARNING, 1) + << "FLAGS_use_ngraph=True, garbage collection strategy is " + "disabled in Executor"; + force_disable_gc = true; + } +#endif force_disable_gc_ = force_disable_gc; if (GetEagerDeletionThreshold() < 0 || force_disable_gc_) { return; } + + // If gc is enabled and block size > 1 + if (prog_.Size() > 1) { + operators::PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOp( + prog_, block_id_, ops_); + operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(prog_, block_id_, + ops_); + operators::PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp( + prog_, block_id_, ops_); + } unused_vars_ = GetUnusedVars(prog_.Block(block_id_), ops_, keep_vars); } @@ -388,8 +410,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, int64_t max_memory_size = GetEagerDeletionThreshold(); std::unique_ptr gc; - // FIXME(zjl): recurrent_op is rather complex, we would - // disable gc forcely in recurrent_op if (!ctx->force_disable_gc_ && max_memory_size >= 0) { #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place_)) { @@ -407,13 +427,6 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, #ifdef PADDLE_WITH_CUDA } #endif - // If gc is enabled and block size > 1 - if (gc && ctx->prog_.Size() > 1) { - operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp(ctx->block_id_, - ctx->ops_); - operators::PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp( - ctx->block_id_, ctx->ops_); - } } for (auto& op : ctx->ops_) { diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index d0d12b30..a6db5c8d 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -44,7 +44,8 @@ struct ExecutorPrepareContext { std::vector> ops_; - std::unordered_map> unused_vars_; + std::unordered_map> + unused_vars_; bool force_disable_gc_{false}; }; diff --git a/paddle/fluid/framework/executor_gc_helper.cc b/paddle/fluid/framework/executor_gc_helper.cc index 77b0977b..1712d66c 100644 --- a/paddle/fluid/framework/executor_gc_helper.cc +++ b/paddle/fluid/framework/executor_gc_helper.cc @@ -89,10 +89,10 @@ static bool VarCanBeDeleted(const std::string &name, const BlockDesc &block, type == proto::VarType::LOD_TENSOR_ARRAY; } -std::unordered_map> GetUnusedVars( - const BlockDesc &block, - const std::vector> &ops, - const std::vector &skip_var_list) { +std::unordered_map> +GetUnusedVars(const BlockDesc &block, + const std::vector> &ops, + const std::vector &skip_var_list) { std::unordered_set skip_vars(skip_var_list.begin(), skip_var_list.end()); @@ -134,7 +134,7 @@ std::unordered_map> GetUnusedVars( } } - std::unordered_map> result; + std::unordered_map> result; for (auto &name_op_idx_pair : var_op_idx_map) { auto &name = name_op_idx_pair.first; size_t op_idx = 
name_op_idx_pair.second; @@ -144,8 +144,8 @@ std::unordered_map> GetUnusedVars( } void DeleteUnusedTensors( - const Scope &scope, OperatorBase *op, - const std::unordered_map> + const Scope &scope, const OperatorBase *op, + const std::unordered_map> &delete_vars_map, GarbageCollector *gc) { auto iter = delete_vars_map.find(op); diff --git a/paddle/fluid/framework/executor_gc_helper.h b/paddle/fluid/framework/executor_gc_helper.h index 8553273f..a4c71c53 100644 --- a/paddle/fluid/framework/executor_gc_helper.h +++ b/paddle/fluid/framework/executor_gc_helper.h @@ -26,15 +26,15 @@ namespace paddle { namespace framework { // Result map: op -> variable names that can be deleted after op runs -std::unordered_map> GetUnusedVars( - const BlockDesc &block, - const std::vector> &ops, - const std::vector &skip_vars); +std::unordered_map> +GetUnusedVars(const BlockDesc &block, + const std::vector> &ops, + const std::vector &skip_vars); // Collect unused tensors after op runs void DeleteUnusedTensors( - const Scope &scope, OperatorBase *op, - const std::unordered_map> + const Scope &scope, const OperatorBase *op, + const std::unordered_map> &delete_vars_map, GarbageCollector *gc); diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index 12fc454f..42406397 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -5,3 +5,8 @@ else() endif(WITH_PSLIB) cc_library(nccl_wrapper SRCS nccl_wrapper.cc DEPS framework_proto variable_helper scope) +if(WITH_BOX_PS) + cc_library(box_wrapper SRCS box_wrapper.cc DEPS framework_proto lod_tensor box_ps) +else() + cc_library(box_wrapper SRCS box_wrapper.cc DEPS framework_proto lod_tensor) +endif(WITH_BOX_PS) diff --git a/paddle/fluid/framework/fleet/box_wrapper.cc b/paddle/fluid/framework/fleet/box_wrapper.cc new file mode 100644 index 00000000..935bcc72 --- /dev/null +++ b/paddle/fluid/framework/fleet/box_wrapper.cc @@ -0,0 +1,247 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
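// Illustrative only (not part of the patch): the const-qualified GC helpers
// above are typically chained once per executed op, e.g.:
//   auto unused_vars = GetUnusedVars(block, ops, skip_vars);
//   for (auto &op : ops) {
//     op->Run(*scope, place);
//     if (gc) DeleteUnusedTensors(*scope, op.get(), unused_vars, gc.get());
//   }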
+ +#include "paddle/fluid/framework/fleet/box_wrapper.h" +#include +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace framework { + +std::shared_ptr BoxWrapper::s_instance_ = nullptr; +#ifdef PADDLE_WITH_BOX_PS +std::shared_ptr BoxWrapper::boxps_ptr_ = nullptr; +#endif + +int BoxWrapper::GetDate() const { + time_t now = time(0); + tm t; +#ifdef _WIN32 + localtime_s(&t, &now); +#else + localtime_r(&now, &t); +#endif + char buf[10]; + snprintf(buf, sizeof(buf), "%04d%02d%02d", (1900 + t.tm_year), (1 + t.tm_mon), + t.tm_mday); + return atoi(buf); +} + +void BoxWrapper::FeedPass(const std::vector& feasgin_to_box) const { +#ifdef PADDLE_WITH_BOX_PS + int ret = boxps_ptr_->FeedPass(GetDate(), feasgin_to_box); + PADDLE_ENFORCE_EQ(ret, 0, "FeedPass failed in BoxPS."); +#endif +} + +void BoxWrapper::BeginPass() const { +#ifdef PADDLE_WITH_BOX_PS + int ret = boxps_ptr_->BeginPass(); + PADDLE_ENFORCE_EQ(ret, 0, "BeginPass failed in BoxPS."); +#endif +} + +void BoxWrapper::EndPass() const { +#ifdef PADDLE_WITH_BOX_PS + int ret = boxps_ptr_->EndPass(); + PADDLE_ENFORCE_EQ(ret, 0, "EndPass failed in BoxPS."); +#endif +} + +void BoxWrapper::PullSparse(const paddle::platform::Place& place, + const std::vector& keys, + const std::vector& values, + const std::vector& slot_lengths, + const int hidden_size) { +#ifdef PADDLE_WITH_BOX_PS + if (platform::is_cpu_place(place) || platform::is_gpu_place(place)) { + int64_t total_length = + std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); + LoDTensor total_keys_tensor; + int64_t* total_keys = + total_keys_tensor.mutable_data({total_length, 1}, place); + int64_t offset = 0; + for (size_t i = 0; i < keys.size(); ++i) { + if (platform::is_cpu_place(place)) { + memory::Copy(boost::get(place), total_keys + offset, + boost::get(place), keys[i], + slot_lengths[i] * sizeof(uint64_t)); + } else { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + memory::Copy(boost::get(place), + total_keys + offset, + boost::get(place), keys[i], + slot_lengths[i] * sizeof(uint64_t), nullptr); +#else + PADDLE_THROW( + "Please compile WITH_GPU option, and NCCL doesn't support " + "windows."); +#endif + } + offset += slot_lengths[i]; + } + PADDLE_ENFORCE_EQ(offset, total_length, + "BoxWrapper::PullSparse: total feasign keys length " + "should be equal to the sum of length of all input " + "tensors."); + + // Space allocation for FeatureValue is left for boxps + paddle::boxps::FeatureValue* total_values; + if (platform::is_cpu_place(place)) { + int ret = boxps_ptr_->PullSparseCPU( + reinterpret_cast(total_keys), &total_values, + static_cast(total_length)); + PADDLE_ENFORCE_EQ(ret, 0, "PullSparseCPU failed in BoxPS."); + } else { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + int ret = boxps_ptr_->PullSparseGPU( + reinterpret_cast(total_keys), &total_values, + static_cast(total_length), + boost::get(place).GetDeviceId()); + PADDLE_ENFORCE_EQ(ret, 0, "PullSparseGPU failed in BoxPS."); +#endif + } + + offset = 0; + for (size_t i = 0; i < values.size(); ++i) { + int64_t fea_num = slot_lengths[i]; + for (auto j = 0; j < fea_num; ++j) { + // Copy the emb from BoxPS to paddle tensor. 
Since 'show','click','emb' + // are continuous in memory, so we copy here using the 'show' address + if (platform::is_cpu_place(place)) { + memory::Copy( + boost::get(place), + values[i] + j * hidden_size, + boost::get(place), + reinterpret_cast(&((total_values + offset)->show)), + sizeof(float) * hidden_size); + } else { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + memory::Copy( + boost::get(place), + values[i] + j * hidden_size, + boost::get(place), + reinterpret_cast(&((total_values + offset)->show)), + sizeof(float) * hidden_size, nullptr); +#endif + } + ++offset; + } + } + PADDLE_ENFORCE_EQ(offset, total_length, + "BoxWrapper::PullSparse: total emb values length should " + "be equal to the sum of length of all input tensors."); + + } else { + PADDLE_THROW( + "PaddleBox: PullSparse Only Support CPUPlace and CUDAPlace Now."); + } +#endif +} + +void BoxWrapper::PushSparseGrad(const paddle::platform::Place& place, + const std::vector& keys, + const std::vector& grad_values, + const std::vector& slot_lengths, + const int hidden_size) { +#ifdef PADDLE_WITH_BOX_PS + if (platform::is_cpu_place(place) || platform::is_gpu_place(place)) { + int64_t total_length = + std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); + LoDTensor total_keys_tensor; + int64_t* total_keys = + total_keys_tensor.mutable_data({total_length, 1}, place); + int64_t offset = 0; + for (size_t i = 0; i < keys.size(); ++i) { + if (platform::is_cpu_place(place)) { + memory::Copy(boost::get(place), total_keys + offset, + boost::get(place), keys[i], + slot_lengths[i] * sizeof(uint64_t)); + } else { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + memory::Copy(boost::get(place), + total_keys + offset, + boost::get(place), keys[i], + slot_lengths[i] * sizeof(uint64_t), nullptr); +#else + PADDLE_THROW( + "Please compile WITH_GPU option, and for now NCCL doesn't support " + "windows."); +#endif + } + offset += slot_lengths[i]; + } + PADDLE_ENFORCE_EQ(offset, total_length, + "BoxWrapper::PushSparseGrad: total feasign keys length " + "should be equal to the sum of length of all input " + "tensors."); + auto buf = memory::AllocShared( + place, total_length * sizeof(paddle::boxps::FeaturePushValue)); + paddle::boxps::FeaturePushValue* total_grad_values = + reinterpret_cast(buf->ptr()); + offset = 0; + for (size_t i = 0; i < grad_values.size(); ++i) { + int64_t fea_num = slot_lengths[i]; + for (auto j = 0; j < fea_num; ++j) { + // Copy the emb grad from paddle tensor to BoxPS. 
Since + // 'show','click','emb' are continuous in memory, so we copy here using + // the 'show' address + if (platform::is_cpu_place(place)) { + memory::Copy( + boost::get(place), + reinterpret_cast(&((total_grad_values + offset)->show)), + boost::get(place), + grad_values[i] + j * hidden_size, sizeof(float) * hidden_size); + } else { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + memory::Copy( + boost::get(place), + reinterpret_cast(&((total_grad_values + offset)->show)), + boost::get(place), + grad_values[i] + j * hidden_size, sizeof(float) * hidden_size, + nullptr); +#endif + } + ++offset; + } + } + PADDLE_ENFORCE_EQ(offset, total_length, + "BoxWrapper::PushSparseGrad: total emb grad values " + "length should be equal to the sum of length of all " + "input tensors."); + if (platform::is_cpu_place(place)) { + int ret = boxps_ptr_->PushSparseCPU( + reinterpret_cast(total_keys), total_grad_values, + static_cast(total_length)); + PADDLE_ENFORCE_EQ(ret, 0, "PushSparseCPU failed in BoxPS."); + } else { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + int ret = boxps_ptr_->PushSparseGPU( + reinterpret_cast(total_keys), total_grad_values, + static_cast(total_length), + boost::get(place).GetDeviceId()); + PADDLE_ENFORCE_EQ(ret, 0, "PushSparseGPU failed in BoxPS."); +#endif + } + } else { + PADDLE_THROW( + "PaddleBox: PushSparse Only Support CPUPlace and CUDAPlace Now."); + } +#endif +} +} // end namespace framework +} // end namespace paddle diff --git a/paddle/fluid/framework/fleet/box_wrapper.h b/paddle/fluid/framework/fleet/box_wrapper.h new file mode 100644 index 00000000..c650d9cb --- /dev/null +++ b/paddle/fluid/framework/fleet/box_wrapper.h @@ -0,0 +1,126 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include // NOLINT +#include +#include +#include "paddle/fluid/framework/data_set.h" +#ifdef PADDLE_WITH_BOX_PS +#include +#endif +#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace framework { + +class BoxWrapper { + public: + virtual ~BoxWrapper() {} + BoxWrapper() {} + + void FeedPass(const std::vector& feasgin_to_box) const; + void BeginPass() const; + void EndPass() const; + void PullSparse(const paddle::platform::Place& place, + const std::vector& keys, + const std::vector& values, + const std::vector& slot_lengths, + const int hidden_size); + void PushSparseGrad(const paddle::platform::Place& place, + const std::vector& keys, + const std::vector& grad_values, + const std::vector& slot_lengths, + const int hidden_size); + + static std::shared_ptr GetInstance() { + if (nullptr == s_instance_) { + // If main thread is guaranteed to init this, this lock can be removed + static std::mutex mutex; + std::lock_guard lock(mutex); + if (nullptr == s_instance_) { + s_instance_.reset(new paddle::framework::BoxWrapper()); +#ifdef PADDLE_WITH_BOX_PS + s_instance_->boxps_ptr_.reset(new paddle::boxps::FakeBoxPS()); +#endif + } + } + return s_instance_; + } + + private: +#ifdef PADDLE_WITH_BOX_PS + static std::shared_ptr boxps_ptr_; +#endif + static std::shared_ptr s_instance_; + int GetDate() const; +}; + +class BoxHelper { + public: + explicit BoxHelper(paddle::framework::Dataset* dataset) : dataset_(dataset) {} + virtual ~BoxHelper() {} + + void BeginPass() { + auto box_ptr = BoxWrapper::GetInstance(); + box_ptr->BeginPass(); + } + + void EndPass() { + auto box_ptr = BoxWrapper::GetInstance(); + box_ptr->EndPass(); + } + void LoadIntoMemory() { + dataset_->LoadIntoMemory(); + FeedPass(); + } + void PreLoadIntoMemory() { + dataset_->PreLoadIntoMemory(); + feed_data_thread_.reset(new std::thread([&]() { + dataset_->WaitPreLoadDone(); + FeedPass(); + })); + } + void WaitFeedPassDone() { feed_data_thread_->join(); } + + private: + Dataset* dataset_; + std::shared_ptr feed_data_thread_; + // notify boxps to feed this pass feasigns from SSD to memory + void FeedPass() { + auto box_ptr = BoxWrapper::GetInstance(); + auto input_channel_ = + dynamic_cast(dataset_)->GetInputChannel(); + std::vector pass_data; + std::vector feasign_to_box; + input_channel_->ReadAll(pass_data); + for (const auto& ins : pass_data) { + const auto& feasign_v = ins.uint64_feasigns_; + for (const auto feasign : feasign_v) { + feasign_to_box.push_back(feasign.sign().uint64_feasign_); + } + } + input_channel_->Open(); + input_channel_->Write(pass_data); + input_channel_->Close(); + box_ptr->FeedPass(feasign_to_box); + } +}; + +} // end namespace framework +} // end namespace paddle diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index 62221f4a..22a9b79d 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -66,6 +66,14 @@ paddle::ps::Archive& operator>>(paddle::ps::Archive& ar, std::shared_ptr FleetWrapper::pslib_ptr_ = NULL; #endif +void FleetWrapper::SetClient2ClientConfig(int request_timeout_ms, + int connect_timeout_ms, + int max_retry) { + client2client_request_timeout_ms_ = request_timeout_ms; + client2client_connect_timeout_ms_ = connect_timeout_ms; + client2client_max_retry_ = max_retry; +} + void FleetWrapper::InitServer(const std::string& dist_desc, int index) { #ifdef PADDLE_WITH_PSLIB if 
(!is_initialized_) { @@ -142,7 +150,9 @@ std::vector FleetWrapper::GetClientsInfo() { void FleetWrapper::CreateClient2ClientConnection() { #ifdef PADDLE_WITH_PSLIB VLOG(3) << "Going to create client2client connection"; - pslib_ptr_->create_client2client_connection(); + pslib_ptr_->create_client2client_connection(client2client_request_timeout_ms_, + client2client_connect_timeout_ms_, + client2client_max_retry_); #endif } @@ -188,6 +198,7 @@ void FleetWrapper::PullSparseVarsSync( auto status = t.get(); if (status != 0) { LOG(ERROR) << "fleet pull sparse failed, status[" << status << "]"; + sleep(sleep_seconds_before_fail_exit_); exit(-1); } } @@ -264,7 +275,8 @@ void FleetWrapper::PushDenseVarsSync( void FleetWrapper::PushDenseVarsAsync( const Scope& scope, const uint64_t table_id, const std::vector& var_names, - std::vector<::std::future>* push_sparse_status) { + std::vector<::std::future>* push_sparse_status, + float scale_datanorm, int batch_size) { #ifdef PADDLE_WITH_PSLIB std::vector regions; for (auto& t : var_names) { @@ -272,6 +284,20 @@ void FleetWrapper::PushDenseVarsAsync( LoDTensor* tensor = var->GetMutable(); int count = tensor->numel(); float* g = tensor->data(); + if (scale_datanorm >= 0) { + if (t.find(".batch_size@GRAD") != std::string::npos || + t.find(".batch_sum@GRAD") != std::string::npos) { + Eigen::Map mat(g, 1, count); + float scale = 1.0 / batch_size; + mat *= scale; + } else if (t.find(".batch_square_sum@GRAD") != std::string::npos) { + VLOG(3) << "epsilon: " << scale_datanorm; + for (int i = 0; i < count; ++i) { + g[i] = (g[i] - batch_size * scale_datanorm) / batch_size + + batch_size * scale_datanorm; + } + } + } paddle::ps::Region reg(g, count); regions.emplace_back(std::move(reg)); } @@ -288,19 +314,27 @@ void FleetWrapper::PushSparseVarsWithLabelAsync( const std::vector& sparse_grad_names, const int emb_dim, std::vector>* push_values, std::vector<::std::future>* push_sparse_status, - const int batch_size, const bool use_cvm) { + const int batch_size, const bool use_cvm, const bool dump_slot) { #ifdef PADDLE_WITH_PSLIB int offset = 2; + int slot_offset = 0; int grad_dim = emb_dim; + int show_index = 0; + int click_index = 1; if (use_cvm) { offset = 0; grad_dim = emb_dim - 2; } + if (dump_slot) { + slot_offset = 1; + show_index = 1; + click_index = 2; + } CHECK_GE(grad_dim, 0); push_values->resize(fea_keys.size() + 1); for (auto& t : *push_values) { - t.resize(emb_dim + offset); + t.resize(emb_dim + offset + slot_offset); } uint64_t fea_idx = 0u; for (size_t i = 0; i < sparse_key_names.size(); ++i) { @@ -315,9 +349,14 @@ void FleetWrapper::PushSparseVarsWithLabelAsync( } int len = tensor->numel(); int64_t* ids = tensor->data(); - + int slot = 0; + if (dump_slot) { + slot = boost::lexical_cast(sparse_key_names[i]); + } Variable* g_var = scope.FindVar(sparse_grad_names[i]); - CHECK(g_var != nullptr) << "var[" << sparse_grad_names[i] << "] not found"; + if (g_var == nullptr) { + continue; + } LoDTensor* g_tensor = g_var->GetMutable(); if (g_tensor == nullptr) { LOG(ERROR) << "tensor of var[" << sparse_key_names[i] << "] is null"; @@ -339,14 +378,19 @@ void FleetWrapper::PushSparseVarsWithLabelAsync( } CHECK(fea_idx < (*push_values).size()); CHECK(fea_idx < fea_labels.size()); + if (use_cvm) { - memcpy((*push_values)[fea_idx].data() + offset, g, + memcpy((*push_values)[fea_idx].data() + offset + slot_offset, g, sizeof(float) * emb_dim); } else { - memcpy((*push_values)[fea_idx].data() + offset, g, + memcpy((*push_values)[fea_idx].data() + offset + slot_offset, g, 
sizeof(float) * emb_dim); - (*push_values)[fea_idx][0] = 1.0f; - (*push_values)[fea_idx][1] = static_cast(fea_labels[fea_idx]); + (*push_values)[fea_idx][show_index] = 1.0f; + (*push_values)[fea_idx][click_index] = + static_cast(fea_labels[fea_idx]); + } + if (dump_slot) { + (*push_values)[fea_idx][0] = static_cast(slot); } g += emb_dim; fea_idx++; @@ -370,7 +414,9 @@ void FleetWrapper::LoadFromPaddleModel(Scope& scope, const uint64_t table_id, std::vector var_list, std::string model_path, std::string model_proto_file, + std::vector table_var_list, bool load_combine) { +#ifdef PADDLE_WITH_PSLIB // load ProgramDesc from model file auto read_proto_func = [](const std::string& filename) -> ProgramDesc { std::string contents; @@ -436,7 +482,8 @@ void FleetWrapper::LoadFromPaddleModel(Scope& scope, const uint64_t table_id, } } delete old_scope; - PushDenseParamSync(scope, table_id, old_param_list); + PushDenseParamSync(scope, table_id, table_var_list); +#endif } void FleetWrapper::LoadModel(const std::string& path, const int mode) { @@ -445,6 +492,7 @@ void FleetWrapper::LoadModel(const std::string& path, const int mode) { ret.wait(); if (ret.get() != 0) { LOG(ERROR) << "load model from path:" << path << " failed"; + sleep(sleep_seconds_before_fail_exit_); exit(-1); } #else @@ -474,6 +522,7 @@ void FleetWrapper::SaveModel(const std::string& path, const int mode) { int32_t feasign_cnt = ret.get(); if (feasign_cnt == -1) { LOG(ERROR) << "save model failed"; + sleep(sleep_seconds_before_fail_exit_); exit(-1); } #else @@ -481,6 +530,60 @@ void FleetWrapper::SaveModel(const std::string& path, const int mode) { #endif } +double FleetWrapper::GetCacheThreshold() { +#ifdef PADDLE_WITH_PSLIB + double cache_threshold = 0.0; + auto ret = pslib_ptr_->_worker_ptr->flush(); + ret.wait(); + ret = pslib_ptr_->_worker_ptr->get_cache_threshold(0, cache_threshold); + ret.wait(); + if (cache_threshold < 0) { + LOG(ERROR) << "get cache threshold failed"; + sleep(sleep_seconds_before_fail_exit_); + exit(-1); + } + return cache_threshold; +#else + VLOG(0) << "FleetWrapper::GetCacheThreshold does nothing when no pslib"; + return 0.0; +#endif +} + +void FleetWrapper::CacheShuffle(int table_id, const std::string& path, + const int mode, const double cache_threshold) { +#ifdef PADDLE_WITH_PSLIB + auto ret = pslib_ptr_->_worker_ptr->cache_shuffle( + 0, path, std::to_string(mode), std::to_string(cache_threshold)); + ret.wait(); + int32_t feasign_cnt = ret.get(); + if (feasign_cnt == -1) { + LOG(ERROR) << "cache shuffle failed"; + sleep(sleep_seconds_before_fail_exit_); + exit(-1); + } +#else + VLOG(0) << "FleetWrapper::CacheShuffle does nothing when no pslib"; +#endif +} + +int32_t FleetWrapper::SaveCache(int table_id, const std::string& path, + const int mode) { +#ifdef PADDLE_WITH_PSLIB + auto ret = pslib_ptr_->_worker_ptr->save_cache(0, path, std::to_string(mode)); + ret.wait(); + int32_t feasign_cnt = ret.get(); + if (feasign_cnt == -1) { + LOG(ERROR) << "table save cache failed"; + sleep(sleep_seconds_before_fail_exit_); + exit(-1); + } + return feasign_cnt; +#else + VLOG(0) << "FleetWrapper::SaveCache does nothing when no pslib"; + return -1; +#endif +} + void FleetWrapper::ShrinkSparseTable(int table_id) { #ifdef PADDLE_WITH_PSLIB auto ret = pslib_ptr_->_worker_ptr->shrink(table_id); @@ -490,20 +593,40 @@ void FleetWrapper::ShrinkSparseTable(int table_id) { #endif } +void FleetWrapper::ClearModel() { +#ifdef PADDLE_WITH_PSLIB + auto ret = pslib_ptr_->_worker_ptr->clear(); + ret.wait(); +#else + VLOG(0) << 
"FleetWrapper::ClearModel does nothing when no pslib"; +#endif +} + void FleetWrapper::ShrinkDenseTable(int table_id, Scope* scope, std::vector var_list, - float decay) { + float decay, int emb_dim) { #ifdef PADDLE_WITH_PSLIB std::vector regions; for (std::string& name : var_list) { if (name.find("batch_sum") != std::string::npos) { Variable* var = scope->FindVar(name); CHECK(var != nullptr) << "var[" << name << "] not found"; - VLOG(3) << "prepare shrink dense batch_sum"; + VLOG(0) << "prepare shrink dense batch_sum"; LoDTensor* tensor = var->GetMutable(); float* g = tensor->data(); - Eigen::Map mat(g, 1, tensor->numel()); - mat *= decay; + + // show_batch_sum += N * log(decay) + std::string size_name = name; + size_name.replace(size_name.find("batch_sum"), size_name.length(), + "batch_size"); + Variable* var_size = scope->FindVar(size_name); + CHECK(var_size != nullptr) << "var[" << size_name << "] not found"; + VLOG(3) << "shrink dense batch_sum: " << name << ", " << size_name; + float* g_size = var_size->GetMutable()->data(); + + for (int k = 0; k < tensor->numel(); k += emb_dim) { + g[k] = g[k] + g_size[k] * log(decay); + } paddle::ps::Region reg(g, tensor->numel()); regions.emplace_back(std::move(reg)); } else { @@ -521,6 +644,7 @@ void FleetWrapper::ShrinkDenseTable(int table_id, Scope* scope, auto status = push_status.get(); if (status != 0) { LOG(FATAL) << "push shrink dense param failed, status[" << status << "]"; + sleep(sleep_seconds_before_fail_exit_); exit(-1); } #else diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index 8b375727..4aa62634 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -55,7 +55,21 @@ namespace framework { class FleetWrapper { public: virtual ~FleetWrapper() {} - FleetWrapper() { scale_sparse_gradient_with_batch_size_ = true; } + FleetWrapper() { + scale_sparse_gradient_with_batch_size_ = true; + // trainer sleep some time for pslib core dump + sleep_seconds_before_fail_exit_ = 300; + // pslib request server timeout ms + client2client_request_timeout_ms_ = 500000; + // pslib connect server timeout_ms + client2client_connect_timeout_ms_ = 10000; + // pslib request max retry + client2client_max_retry_ = 3; + } + + void SetClient2ClientConfig(int request_timeout_ms, int connect_timeout_ms, + int max_retry); + // Pull sparse variables from server in Sync mode // Param: scope, table_id, var_names, fea_keys // Param: fea_values @@ -82,7 +96,8 @@ class FleetWrapper { void PushDenseVarsAsync( const Scope& scope, const uint64_t table_id, const std::vector& var_names, - std::vector<::std::future>* push_sparse_status); + std::vector<::std::future>* push_sparse_status, + float scale_datanorm, int batch_size); void PushDenseVarsSync(Scope* scope, const uint64_t table_id, const std::vector& var_names); @@ -100,7 +115,7 @@ class FleetWrapper { const std::vector& sparse_grad_names, const int emb_dim, std::vector>* push_values, std::vector<::std::future>* push_sparse_status, - const int batch_size, const bool use_cvm); + const int batch_size, const bool use_cvm, const bool dump_slot); // Push sparse variables to server in Async mode // Param: scope, table_id, fea_keys, sparse_grad_names @@ -135,6 +150,7 @@ class FleetWrapper { void LoadFromPaddleModel(Scope& scope, const uint64_t table_id, // NOLINT std::vector var_list, std::string model_path, std::string model_proto_file, + std::vector table_var_list, bool load_combine); // mode = 0, load all feature // 
mode = 1, load delta feature, which means load diff @@ -147,9 +163,17 @@ class FleetWrapper { // mode = 1, save delta feature, which means save diff void SaveModel(const std::string& path, const int mode); + double GetCacheThreshold(); + void CacheShuffle(int table_id, const std::string& path, const int mode, + const double cache_threshold); + int32_t SaveCache(int table_id, const std::string& path, const int mode); + + void ClearModel(); + void ShrinkSparseTable(int table_id); void ShrinkDenseTable(int table_id, Scope* scope, - std::vector<std::string> var_list, float decay); + std::vector<std::string> var_list, float decay, + int emb_dim); // register client to client communication typedef std::function<int32_t(int, int, const std::string&)> MsgHandlerFunc; @@ -185,6 +209,10 @@ class FleetWrapper { protected: static bool is_initialized_; bool scale_sparse_gradient_with_batch_size_; + int32_t sleep_seconds_before_fail_exit_; + int client2client_request_timeout_ms_; + int client2client_connect_timeout_ms_; + int client2client_max_retry_; DISABLE_COPY_AND_ASSIGN(FleetWrapper); };
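A typical caller drives the three cache entry points declared above in sequence: query the threshold, shuffle the cached keys out, then persist them. A minimal sketch of that flow, assuming the usual FleetWrapper::GetInstance() singleton accessor and an illustrative table id and path (neither taken from this patch):

    auto fleet = paddle::framework::FleetWrapper::GetInstance();  // assumed accessor
    double threshold = fleet->GetCacheThreshold();  // flush, then read the cache threshold
    fleet->CacheShuffle(/*table_id=*/0, "/tmp/cache", /*mode=*/0, threshold);  // shuffle cached keys to the path
    int32_t cnt = fleet->SaveCache(/*table_id=*/0, "/tmp/cache", /*mode=*/0);  // returns -1 on failure
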
diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index 789b2ef8..f100dc63 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -25,29 +25,21 @@ #include "glog/logging.h" #include "paddle/fluid/framework/garbage_collector.h" +DECLARE_double(eager_delete_tensor_gb); +DECLARE_double(memory_fraction_of_eager_deletion); +DECLARE_bool(fast_eager_deletion_mode); + namespace paddle { namespace framework { -DEFINE_double( - eager_delete_tensor_gb, -1.0, - "Memory size threshold (GB) when the garbage collector clear tensors." - "Disabled when this value is less than 0"); - -DEFINE_bool(fast_eager_deletion_mode, true, - "Fast eager deletion mode. If enabled, memory would release " - "immediately without waiting GPU kernel ends."); - -DEFINE_double(memory_fraction_of_eager_deletion, 1.0, - "Fraction of eager deletion. If less than 1.0, all variables in " - "the program would be sorted according to its memory size, and " - "only the FLAGS_memory_fraction_of_eager_deletion of the largest " - "variables would be deleted."); - GarbageCollector::GarbageCollector(const platform::Place &place, size_t max_memory_size) : max_memory_size_((std::max)(max_memory_size, static_cast<size_t>(1))) { garbages_.reset(new GarbageQueue()); dev_ctx_ = platform::DeviceContextPool::Instance().Get(place); + if (max_memory_size_ > 1) { + mutex_.reset(new std::mutex()); + } } CPUGarbageCollector::CPUGarbageCollector(const platform::CPUPlace &place, diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index 6ce797bd..61033952 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -46,7 +46,7 @@ class GarbageCollector { platform::DeviceContext *dev_ctx_; std::unique_ptr<GarbageQueue> garbages_; - mutable std::mutex mutex_; + mutable std::unique_ptr<std::mutex> mutex_; const size_t max_memory_size_; size_t cur_memory_size_{0}; }; @@ -118,7 +118,7 @@ void GarbageCollector::Add(Container &&objs, Callback &&callback) { GarbageQueue *garbage_queue = nullptr; { - std::lock_guard<std::mutex> guard(mutex_); + std::lock_guard<std::mutex> guard(*mutex_); for (auto &obj : objs) { if (!obj) continue; cur_memory_size_ += obj->size(); diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc index a006a0fa..4aaf2569 100644 --- a/paddle/fluid/framework/hogwild_worker.cc +++ b/paddle/fluid/framework/hogwild_worker.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/platform/cpu_helper.h" @@ -20,7 +21,7 @@ limitations under the License.
*/ namespace paddle { namespace framework { -void HogwildWorker::Initialize(const TrainerDesc& desc) { +void HogwildWorker::Initialize(const TrainerDesc &desc) { fetch_config_ = desc.fetch_config(); param_ = desc.hogwild_param(); skip_ops_.resize(param_.skip_ops_size()); @@ -30,45 +31,70 @@ void HogwildWorker::Initialize(const TrainerDesc& desc) { use_cvm_ = desc.use_cvm(); } -void HogwildWorker::CreateThreadOperators(const ProgramDesc& program) { - auto& block = program.Block(0); +void HogwildWorker::CreateThreadOperators(const ProgramDesc &program) { + auto &block = program.Block(0); op_names_.clear(); - for (auto& op_desc : block.AllOps()) { + for (auto &op_desc : block.AllOps()) { std::unique_ptr local_op = OpRegistry::CreateOp(*op_desc); op_names_.push_back(op_desc->Type()); - OperatorBase* local_op_ptr = local_op.release(); + OperatorBase *local_op_ptr = local_op.release(); ops_.push_back(local_op_ptr); continue; } } -void HogwildWorker::CreateThreadScope(const ProgramDesc& program) { - auto& block = program.Block(0); +void HogwildWorker::CreateThreadScope(const ProgramDesc &program) { + auto &block = program.Block(0); PADDLE_ENFORCE_NOT_NULL( root_scope_, "root_scope should be set before creating thread scope"); thread_scope_ = &root_scope_->NewScope(); - for (auto& var : block.AllVars()) { + + for (auto &var : block.AllVars()) { if (var->Persistable()) { - auto* ptr = root_scope_->Var(var->Name()); + auto *ptr = root_scope_->Var(var->Name()); InitializeVariable(ptr, var->GetType()); + if (stat_var_name_map_.find(var->Name()) != stat_var_name_map_.end() && + thread_id_ != 0) { + int tensor_dim = + root_scope_->FindVar(var->Name())->GetMutable()->numel(); + auto *ptr1 = thread_scope_->Var(var->Name()); + InitializeVariable(ptr1, var->GetType()); + LoDTensor *thread_tensor = ptr1->GetMutable(); + LoDTensor *root_tensor = + root_scope_->FindVar(var->Name())->GetMutable(); +#define MemsetCallback(cpp_type, proto_type) \ + do { \ + if (root_tensor->type() == proto_type) { \ + SetZero(thread_tensor, root_tensor, tensor_dim); \ + } \ + } while (0) + _ForEachDataType_(MemsetCallback); + } } else { - auto* ptr = thread_scope_->Var(var->Name()); + auto *ptr = thread_scope_->Var(var->Name()); InitializeVariable(ptr, var->GetType()); } } } +template +void HogwildWorker::SetZero(LoDTensor *tensor, LoDTensor *root_tensor, + int tensor_dim) { + T *ptr = tensor->mutable_data(root_tensor->dims(), platform::CPUPlace()); + memset(ptr, 0, sizeof(T) * tensor_dim); +} + void HogwildWorker::BindingDataFeedMemory() { - const std::vector& input_feed = + const std::vector &input_feed = device_reader_->GetUseSlotAlias(); for (auto name : input_feed) { device_reader_->AddFeedVar(thread_scope_->FindVar(name), name); } } -void HogwildWorker::CreateDeviceResource(const ProgramDesc& main_prog) { +void HogwildWorker::CreateDeviceResource(const ProgramDesc &main_prog) { CreateThreadScope(main_prog); CreateThreadOperators(main_prog); } @@ -78,7 +104,7 @@ void HogwildWorker::TrainFilesWithProfiler() { device_reader_->Start(); std::vector op_total_time; std::vector op_name; - for (auto& op : ops_) { + for (auto &op : ops_) { op_name.push_back(op->Type()); } op_total_time.resize(ops_.size()); @@ -141,7 +167,7 @@ void HogwildWorker::TrainFiles() { device_reader_->Start(); int cur_batch; while ((cur_batch = device_reader_->Next()) > 0) { - for (auto& op : ops_) { + for (auto &op : ops_) { bool need_skip = false; for (auto t = 0u; t < skip_ops_.size(); ++t) { if (op->Type().find(skip_ops_[t]) != std::string::npos) { diff 
--git a/paddle/fluid/framework/inplace_op_inference.h b/paddle/fluid/framework/inplace_op_inference.h index fdc0c202..40026eac 100644 --- a/paddle/fluid/framework/inplace_op_inference.h +++ b/paddle/fluid/framework/inplace_op_inference.h @@ -13,13 +13,8 @@ // limitations under the License. #pragma once -#include -#include #include #include -#include -#include "glog/logging.h" -#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/type_defs.h" @@ -58,5 +53,15 @@ class SingleOpInplaceInToOut : public InplaceOpInference { } }; +#define DECLARE_INPLACE_OP_INFERER(class_name, ...) \ + class class_name final : public ::paddle::framework::InplaceOpInference { \ + public: \ + std::unordered_map operator()( \ + const ::paddle::framework::OpDesc& op_desc, \ + bool use_cuda) const final { \ + return {__VA_ARGS__}; \ + } \ + } + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/inplace_op_inference_test.cc b/paddle/fluid/framework/inplace_op_inference_test.cc deleted file mode 100644 index 727e579d..00000000 --- a/paddle/fluid/framework/inplace_op_inference_test.cc +++ /dev/null @@ -1,328 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ - -#include -#include -#include -#include -#include -#include "gtest/gtest.h" -#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h" -#include "paddle/fluid/framework/ir/pass_builder.h" -#include "paddle/fluid/framework/op_info.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/var_type_inference.h" - -USE_PASS(inplace_pass); - -namespace paddle { -namespace framework { - -std::unique_ptr CreateInplacePass() { - auto pass = ir::PassRegistry::Instance().Get("inplace_pass"); - pass->Set(ir::kUseCuda, new bool(true)); - return pass; -} - -class NOP : public OperatorBase { - public: - NOP(const std::string& type, const VariableNameMap& inputs, - const VariableNameMap& outputs, const AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const Scope& scope, - const platform::Place& place) const override {} -}; - -class SingleOpMaker : public OpProtoAndCheckerMaker { - public: - void Make() { - AddInput("X", "").AsDuplicable(); - AddOutput("Out", ""); - AddComment(""); - } -}; - -class SingleGradOpMaker : public framework::SingleGradOpDescMaker { - public: - using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; - - protected: - std::unique_ptr Apply() const override { - auto* op = new framework::OpDesc(); - op->SetType("single_op_grad"); - op->SetInput("Out", OutputGrad("Out")); - op->SetOutput(framework::GradVarName("X"), InputGrad("X")); - return std::unique_ptr(op); - } -}; - -class SingleOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext* ctx) const override { - ctx->HasInput("X"); - ctx->HasOutput("Out"); - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - } -}; - -class SingleGradOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext* ctx) const override { - ctx->HasInput(framework::GradVarName("Out")); - ctx->HasOutput(framework::GradVarName("X")); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out")); - } -}; - -class MultiOutOpMaker : public OpProtoAndCheckerMaker { - public: - void Make() { - AddInput("X", "").AsDuplicable(); - AddInput("Y", "").AsDuplicable(); - AddInput("Z", "").AsDuplicable(); - AddOutput("Out", ""); - AddOutput("YOut", ""); - AddOutput("ZOut", ""); - AddOutput("NotReuseOut", ""); - AddComment(""); - } -}; - -class MultiOutShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext* ctx) const override { - ctx->ShareDim("X", "Out"); - ctx->ShareDim("Y", "YOut"); - ctx->ShareDim("Z", "ZOut"); - } -}; - -class MultiGradOpMaker : public framework::SingleGradOpDescMaker { - public: - using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; - - protected: - std::unique_ptr Apply() const override { - auto* op = new framework::OpDesc(); - op->SetType("multi_out_grad"); - op->SetInput("X", Input("X")); - op->SetOutput(framework::GradVarName("Y"), OutputGrad("YOut")); - op->SetOutput(framework::GradVarName("X"), OutputGrad("Out")); - op->SetOutput(framework::GradVarName("Z"), OutputGrad("ZOut")); - return std::unique_ptr(op); - } -}; - -class MultiOutGradShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext* ctx) const override { - ctx->SetOutputDim(framework::GradVarName("Y"), - 
ctx->GetInputDim(framework::GradVarName("YOut"))); - ctx->SetOutputDim(framework::GradVarName("X"), - ctx->GetInputDim(framework::GradVarName("Out"))); - ctx->SetOutputDim(framework::GradVarName("Z"), - ctx->GetInputDim(framework::GradVarName("ZOut"))); - } -}; - -class MultiOutInplaceInToOut : public framework::InplaceOpInference { - public: - std::unordered_map operator()( - const OpDesc& op_desc, bool use_cuda) const override { - return std::unordered_map{ - {"X", "Out"}, {"Y", "YOut"}, {"Z", "ZOut"}, - }; - } -}; - -class MultiOutGradInplaceInToOut : public framework::InplaceOpInference { - public: - std::unordered_map operator()( - const OpDesc& op_desc, bool use_cuda) const override { - return std::unordered_map{ - {framework::GradVarName("YOut"), framework::GradVarName("Y")}, - {framework::GradVarName("Out"), framework::GradVarName("X")}, - {framework::GradVarName("ZOut"), framework::GradVarName("Z")}, - }; - } -}; - -} // namespace framework -} // namespace paddle - -namespace f = paddle::framework; -REGISTER_OPERATOR(single_op, f::NOP, f::SingleOpMaker, f::SingleGradOpMaker, - f::SingleOpInplaceInToOut, f::SingleOpShapeInference); -REGISTER_OPERATOR(single_op_grad, f::NOP, f::SingleOpInplaceInToOut, - f::SingleGradOpShapeInference); -REGISTER_OPERATOR(multi_out_op, f::NOP, f::MultiOutOpMaker, f::MultiGradOpMaker, - f::MultiOutInplaceInToOut, f::MultiOutShapeInference); -REGISTER_OPERATOR(multi_out_grad, f::NOP, f::MultiOutGradInplaceInToOut, - f::MultiOutGradShapeInference); - -namespace paddle { -namespace framework { - -void FakeSuccData(ProgramDesc* prog) { // NOLINT - prog->MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR); - prog->MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128}); - prog->MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR); - prog->MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR); - prog->MutableBlock(0)->Var("test2_out"); - prog->MutableBlock(0)->Var("test2_out")->SetShape({64, 32, 128, 128}); -} - -void FakeNoInplaceData(ProgramDesc* prog) { // NOLINT - prog->MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR); - prog->MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128}); - prog->MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR); - prog->MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR); - prog->MutableBlock(0)->Var("test2_out"); - prog->MutableBlock(0)->Var("test2_out")->SetShape({64, 31, 128, 128}); -} - -ir::Node* GetNodeFromGraph(ir::Graph* g, std::string name) { - ir::Node* op_node = nullptr; - for (auto& item : g->Nodes()) { - if (item->Name() == name) { - op_node = item; - break; - } - } - return op_node; -} - -std::unique_ptr test_SingleOpInplaceInToOut( - std::unique_ptr g) { - auto pass = CreateInplacePass(); - ir::Node* op_node = GetNodeFromGraph(g.get(), "single_op"); - EXPECT_NE(op_node, nullptr); - pass->Apply(g.get()); - return g; -} - -TEST(InferInplace, SingleOpInplaceInToOut) { - ProgramDesc prog; - auto* op = prog.MutableBlock(0)->AppendOp(); - op->SetType("single_op"); - op->SetInput("X", {"test2_a", "test2_b", "test2_c"}); - op->SetOutput("Out", {"test2_out"}); - - FakeSuccData(&prog); - std::unique_ptr g(new ir::Graph(prog)); - g->Set(ir::kMemOptSkipVars, new std::unordered_set()); - g = test_SingleOpInplaceInToOut(std::move(g)); - auto op_node = GetNodeFromGraph(g.get(), "single_op"); - - EXPECT_EQ(op_node->outputs[0]->Name(), "test2_a"); -} - -TEST(InferInplace, SingleOpInplaceInToOutNoInplace) { - 
ProgramDesc prog; - auto* op = prog.MutableBlock(0)->AppendOp(); - op->SetType("single_op"); - op->SetInput("X", {"test2_a", "test2_b", "test2_c"}); - op->SetOutput("Out", {"test2_out"}); - - FakeNoInplaceData(&prog); - std::unique_ptr g(new ir::Graph(prog)); - g->Set(ir::kMemOptSkipVars, new std::unordered_set()); - g = test_SingleOpInplaceInToOut(std::move(g)); - auto op_node = GetNodeFromGraph(g.get(), "single_op"); - - EXPECT_EQ(op_node->outputs[0]->Name(), "test2_out"); -} - -TEST(InferInplace, MultiOutInplaceInToOut) { - ProgramDesc prog; - auto* op = prog.MutableBlock(0)->AppendOp(); - op->SetType("multi_out_op"); - op->SetInput("X", {"a0", "a1"}); - op->SetInput("Y", {"b0"}); - op->SetInput("Z", {"c0", "c1"}); - op->SetOutput("Out", {"o0"}); - op->SetOutput("YOut", {"y0"}); - op->SetOutput("ZOut", {"z0"}); - - prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("o0"); - prog.MutableBlock(0)->Var("y0"); - prog.MutableBlock(0)->Var("z0"); - prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024}); - prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024}); - prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024}); - prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024}); - prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024}); - prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024}); - - std::unique_ptr g(new ir::Graph(prog)); - g->Set(ir::kMemOptSkipVars, new std::unordered_set()); - auto pass = CreateInplacePass(); - pass->Apply(g.get()); - auto op_node = GetNodeFromGraph(g.get(), "multi_out_op"); - ASSERT_TRUE(op_node != nullptr); - EXPECT_EQ(op_node->outputs[0]->Name(), "a0"); - EXPECT_EQ(op_node->outputs[1]->Name(), "b0"); - EXPECT_EQ(op_node->outputs[2]->Name(), "c0"); -} - -TEST(InferInplace, MultiGradInplaceInToOut) { - ProgramDesc prog; - auto* op = prog.MutableBlock(0)->AppendOp(); - op->SetType("multi_out_grad"); - op->SetInput(GradVarName("Out"), {"o0"}); - op->SetInput(GradVarName("YOut"), {"y0"}); - op->SetInput(GradVarName("ZOut"), {"z0"}); - op->SetOutput(GradVarName("X"), {"a0", "a1"}); - op->SetOutput(GradVarName("Y"), {"b0"}); - op->SetOutput(GradVarName("Z"), {"c0", "c1"}); - - prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("o0"); - prog.MutableBlock(0)->Var("y0"); - prog.MutableBlock(0)->Var("z0"); - prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024}); - prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024}); - prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024}); - prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024}); - prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024}); - prog.MutableBlock(0)->Var("z0")->SetShape({32, 15, 1024, 1024}); - - std::unique_ptr g(new ir::Graph(prog)); - g->Set(ir::kMemOptSkipVars, new std::unordered_set()); - auto pass = CreateInplacePass(); - pass->Apply(g.get()); - auto op_node = GetNodeFromGraph(g.get(), "multi_out_grad"); - ASSERT_TRUE(op_node != nullptr); - EXPECT_EQ(op_node->outputs[0]->Name(), "o0"); - 
EXPECT_EQ(op_node->outputs[2]->Name(), "y0"); - EXPECT_EQ(op_node->outputs[3]->Name(), "c0"); - - std::unordered_map expects = { - {"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"}, - }; -} - -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/io/fs.cc b/paddle/fluid/framework/io/fs.cc index fc6e9f40..d5bc5df2 100644 --- a/paddle/fluid/framework/io/fs.cc +++ b/paddle/fluid/framework/io/fs.cc @@ -149,7 +149,7 @@ std::vector localfs_list(const std::string& path) { std::shared_ptr pipe; int err_no = 0; pipe = shell_popen( - string::format_string("find %s -maxdepth 1 -type f", path.c_str()), "r", + string::format_string("find %s -type f -maxdepth 1", path.c_str()), "r", &err_no); string::LineFileReader reader; std::vector list; @@ -452,24 +452,5 @@ void fs_mkdir(const std::string& path) { LOG(FATAL) << "Not supported"; } } - -std::string fs_path_join(const std::string& dir, const std::string &path) { - if (dir.empty()) { - return path; - } - if (dir.back() == '/') { - return dir + path; - } - return dir + '/' + path; -} - -std::pair fs_path_split(const std::string &path) { - size_t pos = path.find_last_of('/'); - if (pos == std::string::npos) { - return {".", path}; - } - return {path.substr(0, pos), path.substr(pos + 1)}; -} - } // end namespace framework } // end namespace paddle diff --git a/paddle/fluid/framework/io/fs.h b/paddle/fluid/framework/io/fs.h index 3f9e7873..3f017470 100644 --- a/paddle/fluid/framework/io/fs.h +++ b/paddle/fluid/framework/io/fs.h @@ -97,9 +97,5 @@ extern std::string fs_tail(const std::string& path); extern bool fs_exists(const std::string& path); extern void fs_mkdir(const std::string& path); - -extern std::string fs_path_join(const std::string& dir, const std::string &path); - -extern std::pair fs_path_split(const std::string &path); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/io/shell.cc b/paddle/fluid/framework/io/shell.cc index ab671cb5..ce0c3a76 100644 --- a/paddle/fluid/framework/io/shell.cc +++ b/paddle/fluid/framework/io/shell.cc @@ -119,16 +119,12 @@ static int shell_popen_fork_internal(const char* real_cmd, bool do_read, close(parent_end); if (child_end != child_std_end) { - if (dup2(child_end, child_std_end) != child_std_end) { - return -1; - } + PCHECK(dup2(child_end, child_std_end) == child_std_end); close(child_end); } close_open_fds_internal(); - if (execl("/bin/bash", "bash", "-c", real_cmd, NULL) < 0) { - return -1; - } + PCHECK(execl("/bin/bash", "bash", "-c", real_cmd, NULL) >= 0); exit(127); #endif } @@ -194,7 +190,8 @@ std::shared_ptr shell_popen(const std::string& cmd, << ", err_no[" << *err_no << "]"; } if (wstatus == -1 && errno == ECHILD) { - LOG(WARNING) << "errno is ECHILD"; + // temporarily remove this warning + // LOG(WARNING) << "errno is ECHILD"; } }}; #endif @@ -285,7 +282,8 @@ std::pair, std::shared_ptr> shell_p2open( << "status[" << wstatus << "], cmd[" << cmd << "]"; if (wstatus == -1 && errno == ECHILD) { - LOG(WARNING) << "errno is ECHILD"; + // temporarily remove this warning + // LOG(WARNING) << "errno is ECHILD"; } }}; diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 0e12e356..9476256b 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -12,21 +12,14 @@ unset(INFER_IR_PASSES CACHE) # clear the global variable function(pass_library TARGET DEST) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS) + set(multiValueArgs SRCS DEPS DIR) 
set(targetPrefix "") - # Get optional argument - set(extraMacroArgs ${ARGN}) - list(LENGTH extraMacroArgs numExtraMacroArgs) - if(numExtraMacroArgs GREATER 0) - list(GET extraMacroArgs 0 targetPrefix) - endif() - - cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - if(targetPrefix) - cc_library(${TARGET} SRCS ${targetPrefix}/${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base ${op_library_DEPS}) + cmake_parse_arguments(pass_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + if(pass_library_DIR) + cc_library(${TARGET} SRCS ${pass_library_DIR}/${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base ${pass_library_DEPS}) else() - cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base ${op_library_DEPS}) + cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base ${pass_library_DEPS}) endif() # add more DEST here, such as train, dist and collect USE_PASS into a file automatically. @@ -37,6 +30,8 @@ function(pass_library TARGET DEST) endif() endfunction() +cc_library(codegen SRCS codegen.cc DEPS codegen_helper) +cc_library(codegen_helper SRCS codegen_helper.cc DEPS graph node graph_helper) cc_library(node SRCS node.cc DEPS proto_desc) cc_library(graph SRCS graph.cc DEPS node pretty_log) cc_library(graph_helper SRCS graph_helper.cc DEPS graph) @@ -44,6 +39,7 @@ cc_library(pass SRCS pass.cc DEPS graph node graph_helper) cc_library(graph_traits SRCS graph_traits.cc DEPS graph) cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph graph_helper graph_traits) cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass) +cc_library(placement_pass_base SRCS placement_pass_base.cc DEPS pass) cc_library(coalesce_grad_tensor_pass SRCS coalesce_grad_tensor_pass.cc DEPS graph graph_helper) @@ -52,7 +48,6 @@ pass_library(graph_viz_pass base) pass_library(lock_free_optimize_pass base) pass_library(fc_fuse_pass inference) pass_library(attention_lstm_fuse_pass inference) -pass_library(infer_clean_graph_pass inference) pass_library(fc_lstm_fuse_pass inference) pass_library(embedding_fc_lstm_fuse_pass inference) pass_library(fc_gru_fuse_pass inference) @@ -61,6 +56,7 @@ pass_library(multi_batch_merge_pass base) pass_library(conv_bn_fuse_pass inference) pass_library(seqconv_eltadd_relu_fuse_pass inference) pass_library(seqpool_concat_fuse_pass inference) +pass_library(seqpool_cvm_concat_fuse_pass inference) pass_library(repeated_fc_relu_fuse_pass inference) pass_library(squared_mat_sub_fuse_pass inference) pass_library(is_test_pass base) @@ -76,23 +72,27 @@ pass_library(quant_conv2d_dequant_fuse_pass inference) pass_library(fillconstant_elementwisemul_fuse inference) pass_library(shuffle_channel_detect_pass inference) pass_library(delete_quant_dequant_op_pass inference) +pass_library(simplify_with_basic_ops_pass base) +pass_library(fc_elementwise_layernorm_fuse_pass base) +if(WITH_GPU) + pass_library(cudnn_placement_pass base DEPS placement_pass_base) +endif() if(ANAKIN_SUBGRAPH) pass_library(simplify_anakin_priorbox_detection_out_pass inference) endif() if(WITH_MKLDNN) - pass_library(mkldnn_placement_pass base mkldnn) - pass_library(depthwise_conv_mkldnn_pass base mkldnn) - pass_library(conv_bias_mkldnn_fuse_pass inference mkldnn) - pass_library(conv_relu_mkldnn_fuse_pass inference mkldnn) - pass_library(conv_brelu_mkldnn_fuse_pass inference mkldnn) - pass_library(conv_concat_relu_mkldnn_fuse_pass inference mkldnn) - 
pass_library(conv_elementwise_add_mkldnn_fuse_pass inference mkldnn) - pass_library(fc_mkldnn_pass inference mkldnn) - pass_library(cpu_quantize_placement_pass base mkldnn) - pass_library(cpu_quantize_pass inference mkldnn) - pass_library(cpu_quantize_squash_pass inference mkldnn) + pass_library(mkldnn_placement_pass base DEPS placement_pass_base DIR mkldnn) + pass_library(depthwise_conv_mkldnn_pass base DIR mkldnn) + pass_library(conv_bias_mkldnn_fuse_pass inference DIR mkldnn) + pass_library(conv_activation_mkldnn_fuse_pass inference DIR mkldnn) + pass_library(conv_concat_relu_mkldnn_fuse_pass inference DIR mkldnn) + pass_library(conv_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn) + pass_library(fc_mkldnn_pass inference DIR mkldnn) + pass_library(cpu_quantize_placement_pass base DIR mkldnn) + pass_library(cpu_quantize_pass inference DIR mkldnn) + pass_library(cpu_quantize_squash_pass inference DIR mkldnn) endif() if(WITH_NGRAPH) @@ -110,6 +110,7 @@ set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library") cc_library(pass_builder SRCS pass_builder.cc DEPS pass) +cc_test(codegen_test SRCS codegen_test.cc DEPS codegen_helper codegen) cc_test(node_test SRCS node_test.cc DEPS node) cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper) cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry) @@ -118,15 +119,21 @@ cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto) +cc_test(test_seqpool_cvm_concat_fuse_pass SRCS seqpool_cvm_concat_fuse_pass_tester.cc DEPS seqpool_cvm_concat_fuse_pass framework_proto) +cc_test(test_repeated_fc_relu_fuse_pass SRCS repeated_fc_relu_fuse_pass_tester.cc DEPS repeated_fc_relu_fuse_pass framework_proto) cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass) +cc_test(test_simplify_with_basic_ops_pass SRCS simplify_with_basic_ops_pass_tester.cc DEPS simplify_with_basic_ops_pass) +cc_test(test_fc_elementwise_layernorm_fuse_pass SRCS fc_elementwise_layernorm_fuse_pass_tester.cc DEPS fc_elementwise_layernorm_fuse_pass) +if(WITH_GPU) + cc_test(test_cudnn_placement_pass SRCS cudnn_placement_pass_tester.cc DEPS cudnn_placement_pass) +endif() if(NOT WIN32) cc_test(test_sync_batch_norm_pass SRCS sync_batch_norm_pass_tester.cc DEPS sync_batch_norm_pass) endif() if (WITH_MKLDNN) cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass) cc_test(test_conv_bias_mkldnn_fuse_pass SRCS mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc DEPS conv_bias_mkldnn_fuse_pass naive_executor) - cc_test(test_conv_relu_mkldnn_fuse_pass SRCS mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass) - cc_test(test_conv_brelu_mkldnn_fuse_pass SRCS mkldnn/conv_brelu_mkldnn_fuse_pass_tester.cc DEPS conv_brelu_mkldnn_fuse_pass) + cc_test(test_conv_activation_mkldnn_fuse_pass SRCS mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc DEPS conv_activation_mkldnn_fuse_pass) cc_test(test_conv_concat_relu_mkldnn_fuse_pass SRCS mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc DEPS conv_concat_relu_mkldnn_fuse_pass) cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS 
conv_elementwise_add_mkldnn_fuse_pass) cc_test(test_mkldnn_placement_pass SRCS mkldnn/mkldnn_placement_pass_tester.cc DEPS mkldnn_placement_pass) diff --git a/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc b/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc index 3eb4ef9f..5b9742f4 100644 --- a/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc +++ b/paddle/fluid/framework/ir/coalesce_grad_tensor_pass.cc @@ -65,28 +65,33 @@ double GetFuseParameterMemorySize() { return FLAGS_fuse_parameter_memory_size; } class CoalesceGradTensorPass : public ir::Pass { protected: void ApplyImpl(ir::Graph *graph) const { + if (Get<size_t>(details::kNRanks) <= 1) { + VLOG(6) << "The number of places is " << Get<size_t>(details::kNRanks) + << ", so there is no need to apply coalesce_grad_tensor_pass."; + return; + } ir::Graph &result = *graph; - details::ParamsAndGrads params_grads; RecordParamsAndGrads(result, &params_grads); - VLOG(10) << "The number of params and grads is:" << params_grads.size(); - if (params_grads.size() == 0) { - return; - } - - auto vars_info = GetVarInfo(result); ResetAttribute<details::ParamsAndGrads>(details::kParamsAndDenseGrads, &result); ResetAttribute<details::ParamsAndGrads>(details::kParamsAndSparseGrads, &result); ResetAttribute<details::GroupParamsAndGrads>( details::kGroupParamsAndDenseGrads, &result); + + VLOG(10) << "The number of params and grads is:" << params_grads.size(); + if (params_grads.size() == 0) { + return; + } + auto &p_g_dense_grad = result.Get<details::ParamsAndGrads>(details::kParamsAndDenseGrads); auto &p_g_sparse_grad = result.Get<details::ParamsAndGrads>(details::kParamsAndSparseGrads); + auto vars_info = GetVarInfo(result); for (auto &param_grad : params_grads) { if (IsLoDTensorType(GetTypeOfVar(vars_info, param_grad.second))) { p_g_dense_grad.emplace_back(param_grad); @@ -118,33 +123,37 @@ class CoalesceGradTensorPass : public ir::Pass { p_g_dense_grad.size(), num_of_p_g_dense_grad, "The number of p_g_dense_grad is not consistent with before."); + auto &pinned_var_set = + graph->GetOrInit<details::PinnedVars>(details::kPinnedVars); if (IsUnifiedDtype(p_g_dense_grad, vars_info)) { - SetGradientPersistable(p_g_dense_grad, vars_info); + RecordGradients(p_g_dense_grad, vars_info, &pinned_var_set); CoalesceTensors(vars_info, p_g_dense_grad, &result); } else { for (auto &sub_param_grad : group_params_grads) { - SetGradientPersistable(p_g_dense_grad, vars_info); - PADDLE_ENFORCE(IsUnifiedDtype(sub_param_grad, vars_info), - "The data type of the same group is not consistent."); + RecordGradients(p_g_dense_grad, vars_info, &pinned_var_set); + PADDLE_ENFORCE_EQ(IsUnifiedDtype(sub_param_grad, vars_info), true, + "The data type of the same group is not consistent."); CoalesceTensors(vars_info, sub_param_grad, &result); } } } - void SetGradientPersistable( + void RecordGradients( const std::vector<std::pair<std::string, std::string>> &sub_param_grad, - const std::unordered_map<std::string, std::vector<Node *>> &vars_info) - const { + const std::unordered_map<std::string, std::vector<Node *>> &vars_info, + std::unordered_set<std::string> *pinned_var_set) const { + // The gradients should not be reused during memory optimization.
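+    // Recording the names in kPinnedVars replaces the old workaround of
+    // marking every gradient persistable; memory-optimize passes skip any
+    // variable whose name appears in this pinned set.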
for (auto &p_g : sub_param_grad) { auto iter = vars_info.find(p_g.second); - PADDLE_ENFORCE(iter != vars_info.end(), "%s is not found.", p_g.second); - PADDLE_ENFORCE(!iter->second.empty()); - // Set persistable + PADDLE_ENFORCE_EQ(iter != vars_info.end(), true, "%s is not found.", + p_g.second); + PADDLE_ENFORCE_EQ(!iter->second.empty(), true); for (auto it : iter->second) { PADDLE_ENFORCE_NOT_NULL(it->Var()); - it->Var()->SetPersistable(true); + pinned_var_set->insert(it->Var()->Name()); } - PADDLE_ENFORCE(IsLoDTensorType(GetTypeOfVar(vars_info, p_g.second))); + PADDLE_ENFORCE_EQ(IsLoDTensorType(GetTypeOfVar(vars_info, p_g.second)), + true); } } @@ -411,8 +420,10 @@ class CoalesceGradTensorPass : public ir::Pass { const std::unordered_map<std::string, std::vector<Node *>> &vars_info, const std::string &var_name) const { auto grad_iter = vars_info.find(var_name); - PADDLE_ENFORCE(grad_iter != vars_info.end(), "%s is not found.", var_name); - PADDLE_ENFORCE(!grad_iter->second.empty()); + PADDLE_ENFORCE_EQ(grad_iter != vars_info.end(), true, "%s is not found.", + var_name); + PADDLE_ENFORCE_EQ(!grad_iter->second.empty(), true, + "The var nodes of %s are empty.", var_name); PADDLE_ENFORCE_NOT_NULL(grad_iter->second.front()->Var()); return grad_iter->second.front()->Var(); } @@ -484,5 +495,4 @@ class CoalesceGradTensorPass : public ir::Pass { REGISTER_PASS(coalesce_grad_tensor_pass, paddle::framework::ir::CoalesceGradTensorPass) - .RequirePassAttr(paddle::framework::details::kPlaces) - .RequirePassAttr(paddle::framework::details::kLocalScopes); + .RequirePassAttr(paddle::framework::details::kNRanks); diff --git a/paddle/fluid/framework/ir/codegen.cc b/paddle/fluid/framework/ir/codegen.cc new file mode 100644 index 00000000..c3e5efcc --- /dev/null +++ b/paddle/fluid/framework/ir/codegen.cc @@ -0,0 +1,96 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ +#include "paddle/fluid/framework/ir/codegen.h" +#include <set> +#include <sstream> +#include "paddle/fluid/framework/ir/codegen_helper.h" +namespace paddle { +namespace framework { +namespace ir { + +// Generate the parameter-list code from the expression information. +std::string CodeGen::GetDeclarationCode( + std::vector<OperationExpression> expression) { + std::stringstream ret; + ret << "fuse_kernel"; + ret << R"((int N )"; + std::set<int> input_ids; + std::set<int> output_ids; + std::vector<int> last_output_ids; + + for (size_t i = 0; i < expression.size(); i++) { + std::vector<int> tmp_input = expression[i].GetInputIds(); + for (size_t j = 0; j < tmp_input.size(); j++) { + int id = tmp_input[j]; + input_ids.insert(id); + } + int tmp_output = expression[i].GetOutputId(); + output_ids.insert(tmp_output); + } + + std::set<int>::iterator it = input_ids.begin(); + while (it != input_ids.end()) { + int var_index = *it; + if (output_ids.find(var_index) != output_ids.end()) { + input_ids.erase(it++); + } else { + it++; + } + } + + for (it = input_ids.begin(); it != input_ids.end(); it++) { + int var_index = *it; + ret << R"(, const T* var)" << var_index; + } + + for (it = output_ids.begin(); it != output_ids.end(); it++) { + int var_index = *it; + ret << R"(, T* var)" << var_index; + } + + ret << R"())"; + + return ret.str(); +} + +std::string CodeGen::GetOffsetCode() { + std::stringstream ret; + ret << indentation << "int offset = idx;" << std::endl; + return ret.str(); +} + +std::string CodeGen::GetComputeCode( + std::vector<OperationExpression> expression) { + // Generate the compute code from the stored suffix (postfix) expressions. + std::stringstream ret; + for (size_t i = 0; i < expression.size(); i++) { + ret << expression[i].GetExpression(); + } + return ret.str(); +} +// To evaluate the fused op in the right order, the component expressions are +// stored as suffix (postfix) expressions in a vector. +std::string CodeGen::GetKernelCode( + std::vector<OperationExpression> expression) { + auto declaration_code = GetDeclarationCode(expression); + auto offset_code = GetOffsetCode(); + auto compute_code = GetComputeCode(expression); + auto cuda_kernel = const_kernel_start + declaration_code + const_kernel_mid + + offset_code + compute_code + const_kernel_end; + return cuda_kernel; +} +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/codegen.h b/paddle/fluid/framework/ir/codegen.h new file mode 100644 index 00000000..975d4888 --- /dev/null +++ b/paddle/fluid/framework/ir/codegen.h @@ -0,0 +1,36 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */
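Assembled, GetKernelCode yields a grid-stride CUDA kernel string. As an illustrative trace (not output captured from this patch) of the three expressions exercised by codegen_test.cc further down, var3 = var1 * var2, var5 = var3 + var4, var6 = sigmoid(var5), the emitted string is roughly the following, with real_exp assumed to be defined elsewhere:

    template <typename T>
    extern "C" __global__ void fuse_kernel(int N, const T* var1, const T* var2,
                                           const T* var4, T* var3, T* var5, T* var6)
    {
      // grid-stride loop over the N elements
      for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < N;
           idx += gridDim.x * blockDim.x) {
        int offset = idx;
        var3[offset] = var1[offset] * var2[offset];
        var5[offset] = var3[offset] + var4[offset];
        var6[offset] = 1.0 / (1.0 + real_exp(-var5[offset]));
      }
    }

Note that ids appearing only as inputs ({1, 2, 4}) become const pointers, while every output id ({3, 5, 6}) becomes a mutable pointer, matching GetDeclarationCode above.
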
+#pragma once +#include <string> +#include <vector> +#include "paddle/fluid/framework/ir/codegen_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +class CodeGen { + public: + std::string GetKernelCode(std::vector<OperationExpression> expression); + + private: + std::string GetDeclarationCode( + std::vector<OperationExpression> expression); + std::string GetOffsetCode(); + std::string GetComputeCode( + std::vector<OperationExpression> expression); +}; +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/codegen_helper.cc b/paddle/fluid/framework/ir/codegen_helper.cc new file mode 100644 index 00000000..8f14549e --- /dev/null +++ b/paddle/fluid/framework/ir/codegen_helper.cc @@ -0,0 +1,61 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ +#include "paddle/fluid/framework/ir/codegen_helper.h" +#include <iostream> +#include <sstream> +#include <string> +#include <vector> +namespace paddle { +namespace framework { +namespace ir { + +OperationExpression::OperationExpression(std::vector<int> input_ids, + int output_id, + std::string search_operation) { + input_ids_ = input_ids; + output_id_ = output_id; + search_operation_ = search_operation; +} + +// We traverse the graph to build each group; every input id and output id is +// unique among the nodes that belong to the group. +std::string OperationExpression::GetExpression() { + std::stringstream ret; + if (operator_cuda_table.find(search_operation_) == + operator_cuda_table.end()) { + std::cerr << "Not supported operation, " << search_operation_ << std::endl; + } else { + auto rhs = operator_cuda_table[search_operation_]; + std::string replaced_str = "$"; + int count = 0; + auto pos = rhs.find(replaced_str); + while (pos != std::string::npos) { + auto index = input_ids_[count]; + rhs.replace(pos, replaced_str.length(), + std::to_string(index) + R"([offset])"); + pos = rhs.find(replaced_str); + count++; + } + auto lhs = std::string(indentation) + "var" + std::to_string(output_id_) + + R"([offset])"; + auto equal_split = R"( = )"; + auto semicolon = R"(;)"; + ret << lhs << equal_split << rhs << semicolon << std::endl; + } + + return ret.str(); +} +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/codegen_helper.h b/paddle/fluid/framework/ir/codegen_helper.h new file mode 100644 index 00000000..be8d3c8a --- /dev/null +++ b/paddle/fluid/framework/ir/codegen_helper.h @@ -0,0 +1,70 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */
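Each entry in the operator_cuda_table defined in the header below is a template whose "$" placeholders GetExpression fills, left to right, with the expression's input ids. For instance, a hypothetical OperationExpression({7, 8}, 9, "elementwise_max") renders as:

    var9[offset] = real_max(var7[offset], var8[offset]);
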
+#pragma once + +#include <map> +#include <string> +#include <unordered_map> +#include <vector> + +namespace paddle { +namespace framework { +namespace ir { +static std::unordered_map<std::string, std::string> operator_cuda_table = { + {"elementwise_add", "var$ + var$"}, + {"elementwise_sub", "var$ - var$"}, + {"elementwise_mul", "var$ * var$"}, + {"elementwise_div", "var$ / var$"}, + {"elementwise_min", "real_min(var$, var$)"}, + {"elementwise_max", "real_max(var$, var$)"}, + {"relu", "real_max(var$, 0)"}, + {"sigmoid", "1.0 / (1.0 + real_exp(-var$))"}}; + +// An op computation is composed of one or more operations. +class OperationExpression { + public: + OperationExpression(std::vector<int> input_ids, int output_id, + std::string search_operation); + std::string GetExpression(); + std::vector<int> GetInputIds() { return input_ids_; } + int GetOutputId() { return output_id_; } + + private: + std::vector<int> input_ids_; + int output_id_; + std::string search_operation_; +}; + +static const char indentation[] = R"( )"; + +static const char const_kernel_start[] = R"( +template <typename T> +extern "C" __global__ void +)"; + +static const char const_kernel_mid[] = R"( +{ + for(int idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < N; + idx += gridDim.x * blockDim.x) { + +)"; + +static const char const_kernel_end[] = R"( +} +} +)"; +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/codegen_test.cc b/paddle/fluid/framework/ir/codegen_test.cc new file mode 100644 index 00000000..8fd5fde3 --- /dev/null +++ b/paddle/fluid/framework/ir/codegen_test.cc @@ -0,0 +1,43 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/codegen.h" +#include <gtest/gtest.h> +#include <string> +#include <vector> +#include "paddle/fluid/framework/ir/codegen_helper.h" +#ifdef PADDLE_WITH_CUDA +TEST(codegen, cuda) { + std::vector<int> mul_input{1, 2}; + std::vector<int> add_input{3, 4}; + std::vector<int> sigmoid_input{5}; + int mul_out = 3; + int add_out = 5; + int sigmoid_out = 6; + + std::string op1 = "elementwise_mul"; + std::string op2 = "elementwise_add"; + std::string op3 = "sigmoid"; + paddle::framework::ir::OperationExpression opexp1(mul_input, mul_out, op1); + paddle::framework::ir::OperationExpression opexp2(add_input, add_out, op2); + paddle::framework::ir::OperationExpression opexp3(sigmoid_input, sigmoid_out, + op3); + + std::vector<paddle::framework::ir::OperationExpression> fused_op = { + opexp1, opexp2, opexp3}; + paddle::framework::ir::CodeGen codegen; + std::string result = codegen.GetKernelCode(fused_op); + std::cout << result << std::endl; +} +#endif diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc deleted file mode 100644 index 99bc5fe8..00000000 --- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include "paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h" - -namespace paddle { -namespace framework { -namespace ir { - -#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); -#define GET_NODES \ - GET_IR_NODE(conv_op); \ - GET_IR_NODE(conv_out); \ - GET_IR_NODE(conv_filter); \ - GET_IR_NODE(elementwise_add_op); \ - GET_IR_NODE(elementwise_add_in_y); \ - GET_IR_NODE(elementwise_add_out); \ - GET_IR_NODE(elementwise_add_op_1); \ - GET_IR_NODE(elementwise_add_in_y_1); \ - GET_IR_NODE(elementwise_add_out_1); \ - GET_IR_NODE(act_op); \ - GET_IR_NODE(act_out); - -// Inherient the basic infomation from `base_desc`, and modify some fields. -framework::proto::OpDesc PrepareOpDesc( - const framework::proto::OpDesc& base_desc, const std::string& bias, - const std::string& bias1, const std::string& activation, - const std::string& output) { - auto proto = base_desc; - framework::OpDesc desc(proto, nullptr); - desc.SetInput("Bias", {bias}); - desc.SetInput("ResidualData", {bias1}); - desc.SetAttr("activation", activation); - desc.SetOutput("Output", {output}); - desc.SetAttr("is_test", true); - desc.SetAttr("use_cudnn", false); - - return *desc.Proto(); -} - -void ConvElementwiseAddActFusePass::ApplyImpl(ir::Graph* graph) const { - const std::string pattern_name = "conv_elementwise_add_act_fuse"; - FusePassBase::Init(pattern_name, graph); - - GraphPatternDetector gpd; - auto* x = gpd.mutable_pattern()->NewNode("x")->AsInput()->assert_is_op_input( - "conv2d", "Input"); - - patterns::ConvElementwiseaddAct pattern(gpd.mutable_pattern(), pattern_name); - pattern(x); - - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - GET_NODES; - - auto base_op_desc = *conv_op->Op()->Proto(); - std::string bias_name = elementwise_add_in_y->Name(); - std::string bias1_name = elementwise_add_in_y_1->Name(); - std::string act_op_type = act_op->Op()->Type(); - std::string act_op_out = act_out->Name(); - - auto new_op_proto = PrepareOpDesc(base_op_desc, bias_name, bias1_name, - act_op_type, act_op_out); - framework::OpDesc new_op_desc(new_op_proto, nullptr); - - // Create a new node for the fused op. - auto new_conv_op = graph->CreateOpNode(&new_op_desc); - - // Link inputs and outputs. - PADDLE_ENFORCE(subgraph.count(x)); - auto* conv_in_node = subgraph.at(x); - - IR_NODE_LINK_TO(conv_in_node, new_conv_op); // Input - IR_NODE_LINK_TO(conv_filter, new_conv_op); // Filter - IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op); // Bias - IR_NODE_LINK_TO(elementwise_add_in_y_1, new_conv_op); // ResidualData - IR_NODE_LINK_TO(new_conv_op, act_out); // Output - - // Delete the unneeded nodes. 
- GraphSafeRemoveNodes(graph.get(), - {conv_op, elementwise_add_op, elementwise_add_op_1, - elementwise_add_out}); - }; - gpd(graph.get(), handler); -} - -} // namespace ir -} // namespace framework -} // namespace paddle - -REGISTER_PASS(conv_elementwise_add2_act_fuse_pass, - paddle::framework::ir::ConvElementwiseAdd2ActFusePass); diff --git a/paddle/fluid/platform/dynload/warpctc_lib_path.h b/paddle/fluid/framework/ir/cudnn_placement_pass.cc similarity index 76% rename from paddle/fluid/platform/dynload/warpctc_lib_path.h rename to paddle/fluid/framework/ir/cudnn_placement_pass.cc index 6ff38b40..420e8ee8 100644 --- a/paddle/fluid/platform/dynload/warpctc_lib_path.h +++ b/paddle/fluid/framework/ir/cudnn_placement_pass.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once +#include "paddle/fluid/framework/ir/cudnn_placement_pass.h" -#define WARPCTC_LIB_PATH "/home/wangguibao/paddle-github/work_dir/Paddle/build/third_party/install/warpctc/lib/" +REGISTER_PASS(cudnn_placement_pass, paddle::framework::ir::CUDNNPlacementPass) + .RequirePassAttr("cudnn_enabled_op_types"); diff --git a/paddle/fluid/framework/ir/cudnn_placement_pass.h b/paddle/fluid/framework/ir/cudnn_placement_pass.h new file mode 100644 index 00000000..d3f58583 --- /dev/null +++ b/paddle/fluid/framework/ir/cudnn_placement_pass.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/ir/placement_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Specifies which operators should use cuDNN. + */ +class CUDNNPlacementPass : public PlacementPassBase { + private: + const std::string GetPlacementName() const { return "cuDNN"; } + + const std::string GetAttrName() const { return "use_cudnn"; } + + const std::unordered_set GetOpTypesList() const { + return Get>("cudnn_enabled_op_types"); + } +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/cudnn_placement_pass_tester.cc b/paddle/fluid/framework/ir/cudnn_placement_pass_tester.cc new file mode 100644 index 00000000..b4a56361 --- /dev/null +++ b/paddle/fluid/framework/ir/cudnn_placement_pass_tester.cc @@ -0,0 +1,119 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/cudnn_placement_pass.h" + +#include <gtest/gtest.h> +#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +namespace ir { + +void RegisterOpKernel() { + static bool is_registered = false; + if (!is_registered) { + auto& all_kernels = OperatorWithKernel::AllOpKernels(); + + platform::CUDAPlace place = platform::CUDAPlace(0); + OpKernelType plain_kernel_type = + OpKernelType(proto::VarType::FP32, place, DataLayout::kAnyLayout, + LibraryType::kPlain); + OpKernelType cudnn_kernel_type = + OpKernelType(proto::VarType::FP32, place, DataLayout::kAnyLayout, + LibraryType::kCUDNN); + + auto fake_kernel_func = [](const ExecutionContext&) -> void { + static int num_calls = 0; + num_calls++; + }; + + all_kernels["conv2d"][cudnn_kernel_type] = fake_kernel_func; + all_kernels["pool2d"][cudnn_kernel_type] = fake_kernel_func; + all_kernels["depthwise_conv2d"][plain_kernel_type] = fake_kernel_func; + all_kernels["relu"][plain_kernel_type] = fake_kernel_func; + + is_registered = true; + } +} + +void MainTest(std::initializer_list<std::string> cudnn_enabled_op_types, + unsigned expected_use_cudnn_true_count) { + // operator use_cudnn + // -------------------------------------------------- + // (a,b)->concat->c - + // (c,weights,bias)->conv2d->f false + // f->relu->g - + // g->pool2d->h false + // (h,weights2,bias2)->depthwise_conv2d->k false + // k->relu->l - + Layers layers; + VarDesc* a = layers.data("a"); + VarDesc* b = layers.data("b"); + VarDesc* c = layers.concat(std::vector<VarDesc*>({a, b})); + VarDesc* weights_0 = layers.data("weights_0"); + VarDesc* bias_0 = layers.data("bias_0"); + VarDesc* f = layers.conv2d(c, weights_0, bias_0, false); + VarDesc* g = layers.relu(f); + VarDesc* h = layers.pool2d(g, false); + VarDesc* weights_1 = layers.data("weights_1"); + VarDesc* bias_1 = layers.data("bias_1"); + VarDesc* k = layers.depthwise_conv2d(h, weights_1, bias_1, false); + layers.relu(k); + + RegisterOpKernel(); + + std::unique_ptr<ir::Graph> graph(new ir::Graph(layers.main_program())); + auto pass = PassRegistry::Instance().Get("cudnn_placement_pass"); + pass->Set("cudnn_enabled_op_types", + new std::unordered_set<std::string>(cudnn_enabled_op_types)); + + graph.reset(pass->Apply(graph.release())); + + unsigned use_cudnn_true_count = 0; + for (auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op()) { + auto* op = node->Op(); + if (op->HasAttr("use_cudnn") && + boost::get<bool>(op->GetAttr("use_cudnn"))) { + ++use_cudnn_true_count; + } + } + } + + EXPECT_EQ(use_cudnn_true_count, expected_use_cudnn_true_count); +} + +TEST(CUDNNPlacementPass, enable_conv2d) { + // 1 conv2d + MainTest({"conv2d"}, 1); +} + +TEST(CUDNNPlacementPass, enable_relu_pool) { + // 1 conv2d + 1 pool2d + MainTest({"conv2d", "pool2d"}, 2); +} + +TEST(CUDNNPlacementPass, enable_all) { + // 1 conv2d + 1 pool2d + // depthwise_conv2d does not have a CUDNN kernel.
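+  // With an empty enabled-op set, the pass appears to fall back to enabling
+  // use_cudnn for every op that has a cuDNN kernel registered; only conv2d
+  // and pool2d register one above, hence the expected count of 2.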
+ MainTest({}, 2); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(cudnn_placement_pass); diff --git a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc index 6462e7bf..21ceec79 100644 --- a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc @@ -21,7 +21,6 @@ #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/cpu_vec.h" -#include "paddle/fluid/operators/math/fc_compute.h" #include "paddle/fluid/platform/cpu_info.h" namespace paddle { @@ -45,7 +44,8 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, patterns::FC fc_pattern(pattern, name_scope); // fc_out is a tmp var, will be removed after fuse, so marked as intermediate. - auto* fc_out = fc_pattern(embedding_out, with_fc_bias)->AsIntermediate(); + auto* fc_out = fc_pattern(embedding_out, with_fc_bias, /* with_relu */ false) + ->AsIntermediate(); patterns::LSTM lstm_pattern(pattern, name_scope); lstm_pattern(fc_out); @@ -195,7 +195,7 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, } if (with_fc_bias) { - GET_IR_NODE_FROM_SUBGRAPH(fc_out, Out, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fc_out, elementwise_add_out, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern); embedding_lstm_creator(lookup_table, W, lstm, subgraph.at(x), w, Weight, diff --git a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc new file mode 100644 index 00000000..e2c7606c --- /dev/null +++ b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc @@ -0,0 +1,259 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.h" +#include +#include +#include +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +struct FCElementwiseLayerNorm : public PatternBase { + FCElementwiseLayerNorm(PDPattern *pattern, const std::string &name_scope) + : PatternBase(pattern, name_scope, "fc_elementwise_layernorm") {} + + PDNode *operator()(PDNode *x); + + // declare operator node's name + PATTERN_DECL_NODE(fused_fc_elementwise_layernorm); + PATTERN_DECL_NODE(fc); + PATTERN_DECL_NODE(elementwise); + PATTERN_DECL_NODE(layer_norm); + // declare variable node's name + PATTERN_DECL_NODE(fc_w); + PATTERN_DECL_NODE(fc_bias); + PATTERN_DECL_NODE(fc_out); // (x,fc_w,fc_bias) -> fc_out + PATTERN_DECL_NODE(elementwise_input); + PATTERN_DECL_NODE( + elementwise_out); // (fc_out,elementwise_input) -> elementwise_out + PATTERN_DECL_NODE(layer_norm_bias); + PATTERN_DECL_NODE(layer_norm_scale); + PATTERN_DECL_NODE(layer_norm_out); + PATTERN_DECL_NODE(layer_norm_mean); + PATTERN_DECL_NODE(layer_norm_variance); +}; + +PDNode *FCElementwiseLayerNorm::operator()(PDNode *x) { + // Create nodes for fc op. + x->assert_is_op_input("fc", "Input"); + auto *fc = pattern->NewNode(fc_repr())->assert_is_op("fc"); + auto *fc_w_var = pattern->NewNode(fc_w_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("fc", "W"); + auto *fc_bias_var = pattern->NewNode(fc_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("fc", "Bias"); + auto *fc_out_var = pattern->NewNode(fc_out_repr())->assert_is_op_output("fc"); + + // Add links for fc op. + fc->LinksFrom({x, fc_w_var, fc_bias_var}).LinksTo({fc_out_var}); + + // Create nodes for elementwise_add op. + fc_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + auto *elementwise = + pattern->NewNode(elementwise_repr())->assert_is_op("elementwise_add"); + auto *elementwise_input_var = pattern->NewNode(elementwise_input_repr()) + ->assert_is_op_input("elementwise_add"); + + auto *elementwise_out_var = pattern->NewNode(elementwise_out_repr()) + ->AsOutput() + ->assert_is_op_output("elementwise_add"); + + // Add links for elementwise_add op. + elementwise->LinksFrom({fc_out_var, elementwise_input_var}) + .LinksTo({elementwise_out_var}); + + // Create nodes for layer_norm op. + elementwise_out_var->AsIntermediate()->assert_is_op_input("layer_norm"); + auto *layer_norm = + pattern->NewNode(layer_norm_repr())->assert_is_op("layer_norm"); + auto *layer_norm_bias_var = pattern->NewNode(layer_norm_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Bias"); + auto *layer_norm_scale_var = pattern->NewNode(layer_norm_scale_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Scale"); + + auto *layer_norm_out_var = pattern->NewNode(layer_norm_out_repr()) + ->AsOutput() + ->assert_is_op_output("layer_norm", "Y"); + auto *layer_norm_mean_var = pattern->NewNode(layer_norm_mean_repr()) + ->AsOutput() + ->assert_is_op_output("layer_norm", "Mean"); + auto *layer_norm_variance_var = + pattern->NewNode(layer_norm_variance_repr()) + ->AsOutput() + ->assert_is_op_output("layer_norm", "Variance"); + + // Add links for layer_norm op. 
+  layer_norm
+      ->LinksFrom(
+          {elementwise_out_var, layer_norm_bias_var, layer_norm_scale_var})
+      .LinksTo(
+          {layer_norm_out_var, layer_norm_mean_var, layer_norm_variance_var});
+  return layer_norm_out_var;
+}
+
+}  // namespace patterns
+
+template <typename T>
+static bool IsEqual(const std::vector<T> &x, const std::vector<T> &y) {
+  if (!(x.size() > 0U && y.size() > 0U) || x.size() != y.size()) {
+    return false;
+  }
+  for (size_t i = 0; i < x.size(); ++i) {
+    if (x[i] != y[i]) {
+      return false;
+    }
+  }
+  return true;
+}
+
+void FCElementwiseLayerNormFusePass::ApplyImpl(ir::Graph *graph) const {
+  PADDLE_ENFORCE_NOT_NULL(graph);
+  FusePassBase::Init("fc_elementwise_layernorm_fuse", graph);
+  int found_subgraph_count = 0;
+
+  GraphPatternDetector gpd;
+  auto *x = gpd.mutable_pattern()
+                ->NewNode("fc_elementwise_layernorm_fuse/x")
+                ->AsInput()
+                ->assert_is_op_input("fc", "Input");
+  patterns::FCElementwiseLayerNorm fused_pattern(
+      gpd.mutable_pattern(), "fc_elementwise_layernorm_fuse");
+  fused_pattern(x);
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
+                     Graph *graph) {
+    if (subgraph.count(x) <= 0) {
+      LOG(WARNING) << "The subgraph is empty.";
+      return;
+    }
+
+    VLOG(4) << "handle FCElementwiseLayerNorm fuse";
+    GET_IR_NODE_FROM_SUBGRAPH(fc, fc, fused_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(fc_w, fc_w, fused_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(fc_bias, fc_bias, fused_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(fc_out, fc_out, fused_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise, elementwise, fused_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_input, elementwise_input,
+                              fused_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_out, elementwise_out, fused_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(layer_norm, layer_norm, fused_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(layer_norm_bias, layer_norm_bias, fused_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(layer_norm_scale, layer_norm_scale,
+                              fused_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(layer_norm_out, layer_norm_out, fused_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(layer_norm_mean, layer_norm_mean, fused_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(layer_norm_variance, layer_norm_variance,
+                              fused_pattern);
+
+    if (!IsEqual(fc_out->Var()->GetShape(),
+                 elementwise_input->Var()->GetShape())) {
+      return;
+    }
+
+    int begin_norm_axis =
+        boost::get<int>(layer_norm->Op()->GetAttr("begin_norm_axis"));
+    auto layer_norm_x_dims = fc_out->Var()->GetShape();
+    auto layer_norm_x_mat_dims = framework::flatten_to_2d(
+        framework::make_ddim(layer_norm_x_dims), begin_norm_axis);
+    if (fc_w->Var()->GetShape()[1] != layer_norm_x_mat_dims[1]) {
+      return;
+    }
+
+    if (fc_out->outputs.size() > 1U || elementwise_out->outputs.size() > 1U) {
+      // When fc_out or elementwise_out is used as input of other operators,
+      // we cannot fuse.
+ return; + } + + std::unordered_set del_node_set; + + // Create an FusedFCElementwiseLayerNorm op node + OpDesc new_desc; + new_desc.SetType("fused_fc_elementwise_layernorm"); + + // inputs + new_desc.SetInput("X", {subgraph.at(x)->Name()}); + new_desc.SetInput("W", {fc_w->Name()}); + new_desc.SetInput("Bias0", {fc_bias->Name()}); + new_desc.SetInput("Y", {elementwise_input->Name()}); + new_desc.SetInput("Scale", {layer_norm_scale->Name()}); + new_desc.SetInput("Bias1", {layer_norm_bias->Name()}); + + // outputs + new_desc.SetOutput("Out", {layer_norm_out->Name()}); + if (layer_norm_mean->outputs.size() > 0U) { + new_desc.SetOutput("Mean", {layer_norm_mean->Name()}); + } else { + del_node_set.insert(layer_norm_mean); + } + if (layer_norm_variance->outputs.size() > 0U) { + new_desc.SetOutput("Variance", {layer_norm_variance->Name()}); + } else { + del_node_set.insert(layer_norm_variance); + } + + // attrs + new_desc.SetAttr("x_num_col_dims", fc->Op()->GetAttr("in_num_col_dims")); + new_desc.SetAttr("epsilon", layer_norm->Op()->GetAttr("epsilon")); + new_desc.SetAttr("begin_norm_axis", + layer_norm->Op()->GetAttr("begin_norm_axis")); + new_desc.SetAttr("activation_type", fc->Op()->GetAttr("activation_type")); + + auto fused_node = graph->CreateOpNode(&new_desc); // OpDesc will be copied. + + del_node_set.insert(fc); + del_node_set.insert(elementwise); + del_node_set.insert(layer_norm); + del_node_set.insert(fc_out); + del_node_set.insert(elementwise_out); + GraphSafeRemoveNodes(graph, del_node_set); + + IR_NODE_LINK_TO(subgraph.at(x), fused_node); + IR_NODE_LINK_TO(fc_w, fused_node); + IR_NODE_LINK_TO(fc_bias, fused_node); + IR_NODE_LINK_TO(elementwise_input, fused_node); + IR_NODE_LINK_TO(layer_norm_scale, fused_node); + IR_NODE_LINK_TO(layer_norm_bias, fused_node); + IR_NODE_LINK_TO(fused_node, layer_norm_out); + if (layer_norm_mean->outputs.size() > 0U) { + IR_NODE_LINK_TO(fused_node, layer_norm_mean); + } + if (layer_norm_variance->outputs.size() > 0U) { + IR_NODE_LINK_TO(fused_node, layer_norm_variance); + } + + found_subgraph_count++; + }; + + gpd(graph, handler); + AddStatis(found_subgraph_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fc_elementwise_layernorm_fuse_pass, + paddle::framework::ir::FCElementwiseLayerNormFusePass); diff --git a/paddle/fluid/framework/revision.cc b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.h similarity index 59% rename from paddle/fluid/framework/revision.cc rename to paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.h index 28693a5e..ac4d0b39 100644 --- a/paddle/fluid/framework/revision.cc +++ b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.h @@ -1,10 +1,10 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -12,18 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/framework/revision.h" +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" namespace paddle { namespace framework { -#ifdef PADDLE_REVISION -const std::string kPaddleRevision(PADDLE_REVISION); -#else -const std::string kPaddleRevision("baidu/paddlepaddle/paddle@null@null"); -#endif - -const std::string GetPaddleRevision() { - return kPaddleRevision; -} +namespace ir { + +class FCElementwiseLayerNormFusePass : public FusePassBase { + public: + virtual ~FCElementwiseLayerNormFusePass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass_tester.cc new file mode 100644 index 00000000..c1f822d7 --- /dev/null +++ b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass_tester.cc @@ -0,0 +1,67 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.h" + +#include +#include "paddle/fluid/framework/ir/pass_tester_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +TEST(FCElementwiseLayerNormFusePass, basic) { + // inputs operator output + // -------------------------------------------------------------------- + // (x, weights_0, bias_0) fc -> fc_out_0 + // (fc_out_0, weights_1, bias_1) fc -> fc_out_1 + // (fc_out_1, y) elementwise_add -> elementwise_out + // (elementwise_out, scale, bias_2) layer_norm -> + Layers layers; + auto* x = layers.data("x", {128, 768}); + auto* weights_0 = layers.data("weights_0", {768, 3072}, true); + auto* bias_0 = layers.data("bias_0", {3072}, true); + auto* fc_out_0 = layers.fc(x, weights_0, bias_0); // {128, 3072} + auto* weights_1 = layers.data("weights_1", {3072, 768}, true); + auto* bias_1 = layers.data("bias_1", {768}, true); + auto* fc_out_1 = + layers.fc(fc_out_0, weights_1, bias_1, 1, "relu"); // {128, 768} + fc_out_1->SetShape({128, 768}); + auto* y = layers.data("y", {128, 768}); + auto* elementwise_out = layers.elementwise_add(fc_out_1, y); + auto* scale = layers.data("scale", {768}, true); + auto* bias_2 = layers.data("bias_2", {768}, true); + layers.layer_norm(elementwise_out, scale, bias_2); + + std::unique_ptr graph(new ir::Graph(layers.main_program())); + auto pass = + PassRegistry::Instance().Get("fc_elementwise_layernorm_fuse_pass"); + int num_nodes_before = graph->Nodes().size(); + VLOG(3) << DebugString(graph); + + graph.reset(pass->Apply(graph.release())); + int num_nodes_after = graph->Nodes().size(); + int num_fused_nodes_after = + GetNumOpNodes(graph, "fused_fc_elementwise_layernorm"); + VLOG(3) << DebugString(graph); + + PADDLE_ENFORCE_EQ(num_nodes_before, num_nodes_after + 6); + PADDLE_ENFORCE_EQ(num_fused_nodes_after, 1); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + 
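Two details of this fuse pass are easy to miss. First, the begin_norm_axis guard flattens the layer_norm input to a matrix and requires the fc weight's second dimension to match; flatten_to_2d is just a product over the leading and trailing dims. A self-contained sketch of that arithmetic (Flatten2D is a local stand-in, not Paddle's framework::flatten_to_2d):

#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

// Fold dims[0..axis) into rows and dims[axis..n) into cols.
std::pair<int64_t, int64_t> Flatten2D(const std::vector<int64_t>& dims,
                                      int axis) {
  int64_t rows = 1, cols = 1;
  for (int i = 0; i < static_cast<int>(dims.size()); ++i) {
    (i < axis ? rows : cols) *= dims[i];
  }
  return {rows, cols};
}

int main() {
  // The tester's fc output has shape {128, 768} and begin_norm_axis = 1,
  // so the matrix view is 128 x 768 and fc_w must have shape {*, 768}.
  auto mat = Flatten2D({128, 768}, 1);
  assert(mat.first == 128 && mat.second == 768);
  return 0;
}

Second, the tester's num_nodes_after + 6 follows directly from the rewrite: one match deletes seven nodes (the fc, elementwise_add, and layer_norm ops, the intermediate fc_out and elementwise_out variables, and the unconsumed Mean and Variance outputs) while creating one fused op, a net loss of six nodes.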
+USE_PASS(fc_elementwise_layernorm_fuse_pass);
diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc
index 102fd388..b53e6a25 100644
--- a/paddle/fluid/framework/ir/fc_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc
@@ -25,83 +25,110 @@ namespace framework {
 namespace ir {
 
 void FCFusePass::ApplyImpl(ir::Graph* graph) const {
-  PADDLE_ENFORCE(graph);
+  PADDLE_ENFORCE_NOT_NULL(graph);
   FusePassBase::Init("fc_fuse", graph);
 
-  std::unordered_set<const Node*> nodes2delete;
+  int found_fc_count = 0;
+  for (bool with_relu : {true, false}) {
+    found_fc_count += ApplyFCPattern(graph, with_relu);
+  }
+  AddStatis(found_fc_count);
+}
 
+int FCFusePass::ApplyFCPattern(Graph* graph, bool with_relu) const {
   GraphPatternDetector gpd;
   auto* x = gpd.mutable_pattern()
                 ->NewNode("fc_fuse/x")
                 ->AsInput()
                 ->assert_is_op_input("mul", "X");
   patterns::FC fc_pattern(gpd.mutable_pattern(), "fc_fuse");
-  fc_pattern(x, true /*with bias*/);
+  fc_pattern(x, true /*with bias*/, with_relu);
 
   int found_fc_count = 0;
   auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                      Graph* g) {
+    if (subgraph.count(x) <= 0) {
+      LOG(WARNING) << "The subgraph is empty.";
+      return;
+    }
+
     VLOG(4) << "handle FC fuse";
     GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern);
-    GET_IR_NODE_FROM_SUBGRAPH(fc_out, Out, fc_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(bias, bias, fc_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elementwise_add_out, elementwise_add_out,
+                              fc_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(mul_out, mul_out, fc_pattern);
+    Node* relu = nullptr;
+    Node* relu_out = nullptr;
+    if (with_relu) {
+      GET_IR_NODE_FROM_SUBGRAPH(tmp_relu, relu, fc_pattern);
+      GET_IR_NODE_FROM_SUBGRAPH(tmp_relu_out, relu_out, fc_pattern);
+      relu = tmp_relu;
+      relu_out = tmp_relu_out;
+    }
 
-    auto base_op_desc = mul->Op();
     // Create an FC Node.
-    // OpDesc desc(base_op_desc, nullptr);
     OpDesc desc;
-    std::string fc_x_in = subgraph.at(x)->Name();
-    std::string fc_Y_in = w->Name();
-    std::string fc_bias_in = fc_bias->Name();
-    std::string fc_out_out = fc_out->Name();
-
-    desc.SetInput("Input", std::vector<std::string>({fc_x_in}));
-    desc.SetInput("W", std::vector<std::string>({fc_Y_in}));
-    desc.SetInput("Bias", std::vector<std::string>({fc_bias_in}));
-    desc.SetOutput("Out", std::vector<std::string>({fc_out_out}));
+    desc.SetType("fc");
+
+    // Set inputs of fc
+    desc.SetInput("Input", {subgraph.at(x)->Name()});
+    desc.SetInput("W", {w->Name()});
+    desc.SetInput("Bias", {bias->Name()});
+
+    // Set output of fc
+    std::string fc_out_name =
+        with_relu ? relu_out->Name() : elementwise_add_out->Name();
+    desc.SetOutput("Out", std::vector<std::string>({fc_out_name}));
+
+    // Set attrs of fc
     desc.SetAttr("in_num_col_dims", mul->Op()->GetAttr("x_num_col_dims"));
+    std::string activation_type = with_relu ? "relu" : "";
+    desc.SetAttr("activation_type", activation_type);
 
     // For anakin subgraph int8
     // When in anakin subgraph int8 mode, the pattern like "fake_quant + mul +
-    // fake_dequant"
-    // can be detected by the quant_dequant_fuse_pass. This pass will add
-    // "input_scale",
-    // "weight_scale" which are extracted from fake_quant op and fake_dequant op
-    // to mul op,
-    // and then delete the fake_quant op and fake_dequant op in the graph. If
-    // the mul op
-    // has the scale info, we should add those to the fused fc.
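After this rewrite a single fc op stands in for mul + elementwise_add and, when activation_type is "relu", for the trailing relu as well. Numerically the fused op should compute act(X*W + b); a tiny dense sketch of that contract (toy shapes and a naive loop, not Paddle's actual fc kernel):

#include <algorithm>
#include <cstdio>
#include <vector>

// out[i][j] = act(sum_p x[i][p] * w[p][j] + b[j]); act is relu when the
// fused op carries activation_type == "relu", identity otherwise.
std::vector<std::vector<float>> FC(const std::vector<std::vector<float>>& x,
                                   const std::vector<std::vector<float>>& w,
                                   const std::vector<float>& b,
                                   bool with_relu) {
  size_t m = x.size(), k = w.size(), n = b.size();
  std::vector<std::vector<float>> out(m, std::vector<float>(n, 0.f));
  for (size_t i = 0; i < m; ++i) {
    for (size_t j = 0; j < n; ++j) {
      float acc = b[j];
      for (size_t p = 0; p < k; ++p) acc += x[i][p] * w[p][j];
      out[i][j] = with_relu ? std::max(0.f, acc) : acc;
    }
  }
  return out;
}

int main() {
  auto out = FC({{1.f, -2.f}}, {{1.f}, {1.f}}, {-1.f}, /*with_relu=*/true);
  std::printf("%g\n", out[0][0]);  // relu(1 - 2 - 1) == 0
  return 0;
}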
- if (base_op_desc->HasAttr("enable_int8")) { - desc.SetAttr("enable_int8", base_op_desc->GetAttr("enable_int8")); - desc.SetAttr("input_scale", base_op_desc->GetAttr("input_scale")); - desc.SetAttr("weight_scale", base_op_desc->GetAttr("weight_scale")); - if (base_op_desc->HasAttr("out_scale")) - desc.SetAttr("out_scale", base_op_desc->GetAttr("out_scale")); + // fake_dequant" can be detected by the quant_dequant_fuse_pass. This pass + // will add "input_scale", "weight_scale" which are extracted from + // fake_quant op and fake_dequant op to mul op, and then delete the + // fake_quant op and fake_dequant op in the graph. If the mul op has the + // scale info, we should add those to the fused fc. + auto* mul_op_desc = mul->Op(); + if (mul_op_desc->HasAttr("enable_int8")) { + desc.SetAttr("enable_int8", mul_op_desc->GetAttr("enable_int8")); + desc.SetAttr("input_scale", mul_op_desc->GetAttr("input_scale")); + desc.SetAttr("weight_scale", mul_op_desc->GetAttr("weight_scale")); + if (mul_op_desc->HasAttr("out_scale")) + desc.SetAttr("out_scale", mul_op_desc->GetAttr("out_scale")); auto elementwise_desc = elementwise_add->Op(); if (elementwise_desc->HasAttr("out_scale")) desc.SetAttr("out_scale", elementwise_desc->GetAttr("out_scale")); } - desc.SetType("fc"); - auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied. - GraphSafeRemoveNodes(graph, {mul, elementwise_add, mul_out}); + if (with_relu) { + GraphSafeRemoveNodes( + graph, {mul, elementwise_add, mul_out, elementwise_add_out, relu}); + } else { + GraphSafeRemoveNodes(graph, {mul, elementwise_add, mul_out}); + } - PADDLE_ENFORCE(subgraph.count(x)); IR_NODE_LINK_TO(subgraph.at(x), fc_node); IR_NODE_LINK_TO(w, fc_node); - IR_NODE_LINK_TO(fc_bias, fc_node); - IR_NODE_LINK_TO(fc_node, fc_out); + IR_NODE_LINK_TO(bias, fc_node); + if (with_relu) { + IR_NODE_LINK_TO(fc_node, relu_out); + } else { + IR_NODE_LINK_TO(fc_node, elementwise_add_out); + } found_fc_count++; }; - gpd(graph, handler); - - AddStatis(found_fc_count); + return found_fc_count; } } // namespace ir diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.h b/paddle/fluid/framework/ir/fc_fuse_pass.h index 0a0fcd2d..ef6636d1 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.h +++ b/paddle/fluid/framework/ir/fc_fuse_pass.h @@ -31,7 +31,9 @@ class FCFusePass : public FusePassBase { virtual ~FCFusePass() {} protected: - void ApplyImpl(ir::Graph* graph) const override; + void ApplyImpl(Graph* graph) const override; + + int ApplyFCPattern(Graph* graph, bool with_relu) const; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc index affe5069..320d28f1 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc @@ -15,81 +15,53 @@ #include "paddle/fluid/framework/ir/fc_fuse_pass.h" #include -#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/ir/pass_tester_helper.h" namespace paddle { namespace framework { namespace ir { -void SetOp(ProgramDesc* prog, const std::string& type, - const std::vector& inputs, - const std::vector& outputs) { - auto* op = prog->MutableBlock(0)->AppendOp(); - op->SetType(type); - if (type == "mul") { - op->SetInput("X", {inputs[0]}); - op->SetInput("Y", {inputs[1]}); - op->SetAttr("x_num_col_dims", {1}); - } else if (type == "elementwise_add") { - op->SetInput("X", inputs); - } - op->SetOutput("Out", outputs); - op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), - 
static_cast(OpRole::kForward)); -} - -// a->OP0->b -// a->OP1->c -// (b, c)->mul->d -// (d, e)->elementwise_add->f -ProgramDesc BuildProgramDesc() { - ProgramDesc prog; - for (auto& v : std::vector({"a", "b", "c", "d", "e", "f"})) { - auto* var = prog.MutableBlock(0)->Var(v); - var->SetType(proto::VarType::SELECTED_ROWS); - if (v == "c") { - var->SetPersistable(true); - } - } - - SetOp(&prog, "OP0", std::vector({"a"}), - std::vector({"b"})); - SetOp(&prog, "OP1", std::vector({"a"}), - std::vector({"c"})); - SetOp(&prog, "mul", std::vector({"b", "c"}), - std::vector({"d"})); - SetOp(&prog, "elementwise_add", std::vector({"d", "e"}), - std::vector({"f"})); - - return prog; -} - TEST(FCFusePass, basic) { - auto prog = BuildProgramDesc(); - - std::unique_ptr graph(new ir::Graph(prog)); - + // inputs operator output + // -------------------------------------------------------- + // (a, filters_0 bias_0) conv2d -> conv2d_out + // conv2d_out relu -> relu_out_0 + // (relu_out_0, weights_0) mul -> mul_out_0 + // (mul_out_0, bias_1) elementwise_add -> add_out_0 + // add_out_0 relu -> relu_out_1 + // (relu_out_1, weights_1) mul -> mul_out_1 + // (mul_out_1, bias_2) elementwise_add -> add_out_1 + Layers layers; + auto* a = layers.data("a"); + auto* filters_0 = layers.data("conv2d_filters_0", {}, true); + auto* bias_0 = layers.data("conv2d_bias_0", {}, true); + auto* conv2d_out = layers.conv2d(a, filters_0, bias_0, false); + auto* relu_out_0 = layers.relu(conv2d_out); + auto* weights_0 = layers.data("weights_0", {}, true); + auto* mul_out_0 = layers.mul(relu_out_0, weights_0); + auto* bias_1 = layers.data("bias_1", {}, true); + auto* add_out_0 = layers.elementwise_add(mul_out_0, bias_1); + auto* relu_out_1 = layers.relu(add_out_0); + auto* weights_1 = layers.data("weights_1", {}, true); + auto* mul_out_1 = layers.mul(relu_out_1, weights_1); + auto* bias_2 = layers.data("bias_2", {}, true); + auto* add_out_1 = layers.elementwise_add(mul_out_1, bias_2); + VLOG(4) << add_out_1; + + std::unique_ptr graph(new ir::Graph(layers.main_program())); auto pass = PassRegistry::Instance().Get("fc_fuse_pass"); - - int pre_nodes = graph->Nodes().size(); + int num_nodes_before = graph->Nodes().size(); + int num_mul_nodes_before = GetNumOpNodes(graph, "mul"); + VLOG(3) << DebugString(graph); graph.reset(pass->Apply(graph.release())); + int num_nodes_after = graph->Nodes().size(); + int num_fc_nodes_after = GetNumOpNodes(graph, "fc"); + VLOG(3) << DebugString(graph); - int after_nodes = graph->Nodes().size(); - - // Remove 3 Nodes: MUL,ELEMENTWISE_ADD, mul_out - // Add 1 Node: FC - EXPECT_EQ(pre_nodes - 2, after_nodes); - - // Assert fc op in newly generated graph - int fc_count = 0; - - for (auto* node : graph->Nodes()) { - if (node->IsOp() && node->Op()->Type() == "fc") { - ++fc_count; - } - } - EXPECT_EQ(fc_count, 1); + PADDLE_ENFORCE_EQ(num_nodes_before, num_nodes_after + 6); + PADDLE_ENFORCE_EQ(num_fc_nodes_after, 2); + PADDLE_ENFORCE_EQ(num_mul_nodes_before, num_fc_nodes_after); } } // namespace ir diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc index 10cbe319..287c6dc4 100644 --- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc @@ -33,7 +33,7 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, PDNode* x = pattern->NewNode(patterns::UniqueKey("x"))->assert_var_not_persistable(); - auto* fc_out = fc_pattern(x, with_fc_bias); + auto* fc_out = fc_pattern(x, with_fc_bias, /* with_relu */ 
false); fc_out->AsIntermediate(); // fc_out is a tmp var, will be removed after fuse. gru_pattern(fc_out); @@ -116,7 +116,7 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, auto* x_n = subgraph.at(x); GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern); - GET_IR_NODE_FROM_SUBGRAPH(fc_out, Out, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fc_out, elementwise_add_out, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, gru_pattern); GET_IR_NODE_FROM_SUBGRAPH(gru, gru, gru_pattern); GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, gru_pattern); diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc index 6858a98b..a5a72e87 100644 --- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc @@ -33,7 +33,8 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, patterns::FC fc_pattern(pattern, name_scope); // fc_out is a tmp var, will be removed after fuse, so marked as intermediate. - auto* fc_out = fc_pattern(x, with_fc_bias)->AsIntermediate(); + auto* fc_out = + fc_pattern(x, with_fc_bias, /* with_relu */ false)->AsIntermediate(); patterns::LSTM lstm_pattern(pattern, name_scope); lstm_pattern(fc_out); @@ -132,7 +133,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern); if (with_fc_bias) { - GET_IR_NODE_FROM_SUBGRAPH(fc_out, Out, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(fc_out, elementwise_add_out, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern); lstm_creator(lstm, subgraph.at(x), w, Weight, Bias, Hidden, Cell, fc_out, diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc index 504ff04c..8aec0987 100644 --- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_adam_op_pass.cc @@ -32,19 +32,63 @@ class FuseAdamOpPass : public FuseOptimizerOpPass { return {"Moment1", "Moment2", "Beta1Pow", "Beta2Pow"}; } - void FuseOptimizerOps( + ir::Node *FuseOptimizerOps( const std::unordered_map> &aux_var_set, const std::unordered_map &fused_vars_name, const std::vector &adam_ops, ir::Graph *graph) const { - FuseAdamOps(aux_var_set, fused_vars_name, adam_ops, graph); - FuseScaleOps(aux_var_set.at("Beta1Pow"), fused_vars_name.at("Beta1Pow"), - adam_ops, graph); - FuseScaleOps(aux_var_set.at("Beta2Pow"), fused_vars_name.at("Beta2Pow"), - adam_ops, graph); + auto fused_adam_node = + FuseAdamOps(aux_var_set, fused_vars_name, adam_ops, graph); + auto fused_scale1 = + FuseScaleOps(aux_var_set.at("Beta1Pow"), fused_vars_name.at("Beta1Pow"), + adam_ops, graph); + auto fused_scale2 = + FuseScaleOps(aux_var_set.at("Beta2Pow"), fused_vars_name.at("Beta2Pow"), + adam_ops, graph); + RemoveCycleDepsBetweenOpNodes(graph, fused_scale1, fused_scale2); + return fused_adam_node; } - void FuseAdamOps( + void RemoveCycleDepsBetweenOpNodes(Graph *graph, const Node *fused_scale1, + const Node *fused_scale2) const { + std::unordered_set not_need_ctrl_var_nodes; + std::unordered_set fused_scale2_in_nodes; + fused_scale2_in_nodes.insert(fused_scale2->inputs.begin(), + fused_scale2->inputs.end()); + for (auto &out_node : fused_scale1->outputs) { + if 
(fused_scale2_in_nodes.count(out_node)) { + PADDLE_ENFORCE(out_node->IsCtrlVar(), + "The dependency var only should be ctrl var."); + not_need_ctrl_var_nodes.insert(out_node); + } + } + + for (auto &node : not_need_ctrl_var_nodes) { + // remove this node from the input op node. + PADDLE_ENFORCE(!node->inputs.empty(), + "The input should not be empty here."); + auto op_node = node->inputs.front(); + PADDLE_ENFORCE(op_node->IsOp()); + op_node->outputs.erase( + remove_if( + op_node->outputs.begin(), op_node->outputs.end(), + [&node](const Node *op_out_node) { return op_out_node == node; }), + op_node->outputs.end()); + + // remove this node from the output op nodes. + for (auto &out_op_node : node->outputs) { + out_op_node->inputs.erase( + remove_if( + out_op_node->inputs.begin(), out_op_node->inputs.end(), + [&node](const Node *op_in_node) { return op_in_node == node; }), + out_op_node->inputs.end()); + } + + graph->RemoveNode(node); + } + } + + ir::Node *FuseAdamOps( const std::unordered_map> &vars_set, const std::unordered_map &fused_vars_name, const std::vector &adam_ops, ir::Graph *graph) const { @@ -80,7 +124,7 @@ class FuseAdamOpPass : public FuseOptimizerOpPass { // NOTE: fused_var is only exist in scope, so the graph doesn't have // fused_var node. - VLOG(7) << "Insert adam to graph "; + VLOG(6) << "Insert adam to graph "; OpDesc adam_desc(adam_ops[0]->Op()->Block()); adam_desc.SetType("adam"); adam_desc.SetInput(kParam, {fused_vars_name.at(kParam)}); @@ -102,16 +146,13 @@ class FuseAdamOpPass : public FuseOptimizerOpPass { adam_desc.SetAttr("min_row_size_to_use_multithread", min_row_size_to_use_multithread); adam_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role); - - auto adam_node = graph->CreateOpNode(&adam_desc); - - InserInputAndOutputForOptOps(adam_ops, adam_node); + return graph->CreateOpNode(&adam_desc); } - void FuseScaleOps(const std::vector &beta_name, - const std::string &fused_var_name, - const std::vector &adam_ops, - ir::Graph *graph) const { + ir::Node *FuseScaleOps(const std::vector &beta_name, + const std::string &fused_var_name, + const std::vector &adam_ops, + ir::Graph *graph) const { PADDLE_ENFORCE_EQ(beta_name.size(), adam_ops.size()); const std::string scale_op_name = "scale"; @@ -139,7 +180,7 @@ class FuseAdamOpPass : public FuseOptimizerOpPass { scale_ops.emplace_back(*scale_op_iter); } PADDLE_ENFORCE_EQ(scale_ops.size(), beta_name.size()); - + VLOG(6) << "The number of scale op is " << scale_ops.size() << "."; // Check attributions // NOTE: If new attribution is added, the following code maybe need change. int op_role = boost::get( @@ -164,7 +205,7 @@ class FuseAdamOpPass : public FuseOptimizerOpPass { // NOTE: fused_var is only exist in scope, so the graph doesn't have // fused_var node. 
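RemoveCycleDepsBetweenOpNodes above unlinks each redundant control-dependency variable from both its producer and its consumers before freeing it, so no op is left holding a dangling edge. The core is the erase/remove idiom on the adjacency vectors; a self-contained sketch on a toy node type (Node here is a local struct, not Paddle's ir::Node):

#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

struct Node {
  std::string name;
  std::vector<Node*> inputs;
  std::vector<Node*> outputs;
};

// Detach `var` from the graph: erase it from its producer's outputs and
// from every consumer's inputs, mirroring the pass's unlink-then-remove.
void Detach(Node* var) {
  assert(var->inputs.size() == 1);  // a ctrl var has exactly one producer
  Node* producer = var->inputs.front();
  producer->outputs.erase(
      std::remove(producer->outputs.begin(), producer->outputs.end(), var),
      producer->outputs.end());
  for (Node* consumer : var->outputs) {
    consumer->inputs.erase(
        std::remove(consumer->inputs.begin(), consumer->inputs.end(), var),
        consumer->inputs.end());
  }
}

int main() {
  Node scale1{"fused_scale1"}, ctrl{"ctrl_var"}, scale2{"fused_scale2"};
  scale1.outputs = {&ctrl};
  ctrl.inputs = {&scale1};
  ctrl.outputs = {&scale2};
  scale2.inputs = {&ctrl};
  Detach(&ctrl);  // only the node itself remains to be freed afterwards
  assert(scale1.outputs.empty() && scale2.inputs.empty());
  return 0;
}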
- VLOG(7) << "Insert fused scale to graph."; + VLOG(6) << "Insert fused scale to graph."; OpDesc scale_desc(scale_ops[0]->Op()->Block()); scale_desc.SetType("scale"); scale_desc.SetInput("X", {fused_var_name}); @@ -175,35 +216,16 @@ class FuseAdamOpPass : public FuseOptimizerOpPass { scale_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role); auto scale_node = graph->CreateOpNode(&scale_desc); - for (auto scale_op : scale_ops) { - // set inputs - scale_node->inputs.insert(scale_node->inputs.begin(), - scale_op->inputs.begin(), - scale_op->inputs.end()); - for (auto &input : scale_op->inputs) { - std::replace(input->outputs.begin(), input->outputs.end(), scale_op, - scale_node); - } - // set outputs - scale_node->outputs.insert(scale_node->outputs.begin(), - scale_op->outputs.begin(), - scale_op->outputs.end()); - for (auto &output : scale_op->outputs) { - std::replace(output->inputs.begin(), output->inputs.end(), scale_op, - scale_node); - } - } - + InsertInputAndOutputForFusedOpNode(scale_ops, graph, scale_node); // Delete scale_ops for (auto &scale_op : scale_ops) { graph->RemoveNode(scale_op); } + return scale_node; } }; } // namespace ir } // namespace framework } // namespace paddle -REGISTER_PASS(fuse_adam_op_pass, paddle::framework::ir::FuseAdamOpPass) - .RequirePassAttr(paddle::framework::details::kPlaces) - .RequirePassAttr(paddle::framework::details::kLocalScopes); +REGISTER_PASS(fuse_adam_op_pass, paddle::framework::ir::FuseAdamOpPass); diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc index 3ac92d17..8f3a623a 100644 --- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_momentum_op_pass.cc @@ -33,7 +33,7 @@ class FuseMomentumOpPass : public FuseOptimizerOpPass { } // Fuse Momentum Ops - virtual void FuseOptimizerOps( + virtual ir::Node *FuseOptimizerOps( const std::unordered_map> &vars_set, const std::unordered_map &fused_vars_name, const std::vector &momentum_ops, ir::Graph *graph) const { @@ -61,7 +61,7 @@ class FuseMomentumOpPass : public FuseOptimizerOpPass { // NOTE: fused_var is only exist in scope, so the graph doesn't have // fused_var node. 
- VLOG(7) << "Insert momentum to graph "; + VLOG(6) << "Insert momentum to graph "; OpDesc momentum_desc(momentum_ops[0]->Op()->Block()); momentum_desc.SetType("momentum"); momentum_desc.SetInput(kParam, {fused_vars_name.at(kParam)}); @@ -77,9 +77,7 @@ class FuseMomentumOpPass : public FuseOptimizerOpPass { momentum_desc.SetAttr("use_nesterov", use_nesterov); momentum_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role); - auto momentum_node = graph->CreateOpNode(&momentum_desc); - - InserInputAndOutputForOptOps(momentum_ops, momentum_node); + return graph->CreateOpNode(&momentum_desc); } }; @@ -87,6 +85,4 @@ class FuseMomentumOpPass : public FuseOptimizerOpPass { } // namespace framework } // namespace paddle -REGISTER_PASS(fuse_momentum_op_pass, paddle::framework::ir::FuseMomentumOpPass) - .RequirePassAttr(paddle::framework::details::kPlaces) - .RequirePassAttr(paddle::framework::details::kLocalScopes); +REGISTER_PASS(fuse_momentum_op_pass, paddle::framework::ir::FuseMomentumOpPass); diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc index ee601145..fcb5604a 100644 --- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h" #include +#include #include #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/op_registry.h" @@ -48,7 +49,7 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { } VLOG(6) << "Find " << fuse_op_type << " operators : " << opt_ops_num - << ", and " << opt_nodes.size() << " for dense gradients "; + << ", and " << opt_nodes.size() << " for dense gradients."; if (opt_nodes.size() == 0 || result.Has(details::kFusedOptType)) { if (result.Has(details::kFusedOptType)) { auto &opt_type = @@ -59,6 +60,20 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { } return; } + + // There should not have no-ctr-var between the op_nodes that link the op_node + // of op_nodes. + if (HasVarDepsBetweenOps(topo_nodes, opt_nodes)) { + VLOG(6) << "There are interdependent variables among these optimization " + "operators, which can not be handled well at present."; + return; + } + + LOG(WARNING) << "Find " << fuse_op_type << " operators : " << opt_ops_num + << ", and " << opt_nodes.size() << " for dense gradients. 
" + << "To make the speed faster, those optimization are fused " + "during training."; + result.Set(details::kFusedOptType, new details::FusedOptType); result.Get(details::kFusedOptType) = fuse_op_type; if (!result.Has(details::kProgramDescs)) { @@ -96,6 +111,7 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { PADDLE_ENFORCE_LE( params_and_dense_grads.size(), aux_var_set.at(kGrad).size(), "The number of dense gradients should be little than optimizer ops."); + std::unordered_set opt_grad_set(aux_var_set.at(kGrad).size()); for (auto &p_g : params_and_dense_grads) { opt_grad_set.insert(p_g.second); @@ -128,7 +144,8 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { auto &fused_vars = result.Get(details::kFusedVars); auto iter = std::find(fused_vars.begin(), fused_vars.end(), fused_grad.front()); - PADDLE_ENFORCE(iter != fused_vars.end(), "Not find the fused_grad."); + PADDLE_ENFORCE_EQ(iter != fused_vars.end(), true, + "Not find the fused_grad."); fused_vars_name[kGrad] = fused_grad.front(); // Sort the parameters and auxiliary variables according @@ -137,7 +154,7 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { &opt_nodes); grad_fused = true; } else { - VLOG(10) << "The number of new gradients is " << new_grad_idx.size(); + VLOG(6) << "The number of new gradients is " << new_grad_idx.size(); if (new_grad_idx.size() == 1) return; // NOTE(zcd): If the gradients of backward stage and optimization stage // have diff, Only take care of the the gradient of optimization stage. @@ -158,14 +175,54 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { &result); // Step 5: Fuse optimizer Ops and Scale Ops - FuseOptimizerOps(aux_var_set, fused_vars_name, opt_nodes, &result); + auto *fused_opt_node = + FuseOptimizerOps(aux_var_set, fused_vars_name, opt_nodes, &result); + InsertInputAndOutputForFusedOpNode(opt_nodes, graph, fused_opt_node); // Step 6: Remove optimizer Ops for (auto &opt_op : opt_nodes) { graph->RemoveNode(opt_op); } } +bool FuseOptimizerOpPass::HasVarDepsBetweenOps( + const std::vector &topo_nodes, + const std::vector &opt_nodes) const { + std::unordered_map> preceding_ops; + std::unordered_map> pending_ops; + for (auto &op : topo_nodes) { + preceding_ops[op]; + pending_ops[op]; + for (auto &var : op->outputs) { + if (var->IsCtrlVar()) continue; + for (auto &pending_op : var->outputs) { + preceding_ops[pending_op].insert(op); + pending_ops[op].insert(pending_op); + } + } + } + + std::unordered_set opt_node_set(opt_nodes.begin(), opt_nodes.end()); + auto has_var_deps = [](const std::unordered_set &op_set1, + const std::unordered_set &op_set2) -> bool { + std::set intersect_ops; + set_intersection(op_set1.begin(), op_set1.end(), op_set2.begin(), + op_set2.end(), + inserter(intersect_ops, intersect_ops.begin())); + return !intersect_ops.empty(); + }; + + for (auto opt_node : opt_node_set) { + if (has_var_deps(preceding_ops.at(opt_node), opt_node_set)) { + return true; + } + if (has_var_deps(pending_ops.at(opt_node), opt_node_set)) { + return true; + } + } + return false; +} + void FuseOptimizerOpPass::GradientsFilter( const std::vector &new_grad_idx, std::vector *opt_nodes, std::unordered_map> *aux_var_set) @@ -196,18 +253,24 @@ void FuseOptimizerOpPass::InitFusedGradsAndAllocSpaceForGrads( const std::vector ¶ms, const std::vector &grads, const std::string &fused_grad_name, ir::Graph *result) const { + auto &pinned_var_set = + result->GetOrInit(details::kPinnedVars); + auto vars_info = GetVarInfo(*result); - // Set Gradients as 
Persistable to prevent this var becoming reusable. + // The Gradients should not be reused during memory optimization. for (auto &grad_var_name : grads) { auto iter = vars_info.find(grad_var_name); - PADDLE_ENFORCE(iter != vars_info.end()); - PADDLE_ENFORCE(!iter->second.empty()); + PADDLE_ENFORCE_EQ(iter != vars_info.end(), true, "%s is not found.", + grad_var_name); + PADDLE_ENFORCE_EQ(!iter->second.empty(), true, "%s is not found.", + grad_var_name); PADDLE_ENFORCE_NOT_NULL(iter->second.front()->Var()); - PADDLE_ENFORCE(IsLoDTensorType(iter->second.front()->Var()->GetType()), - "Currently the gradient type only should be LoDTensor when " - "fusing optimizer ops."); + PADDLE_ENFORCE_EQ( + IsLoDTensorType(iter->second.front()->Var()->GetType()), true, + "Currently the gradient type only should be LoDTensor when " + "fusing optimizer ops."); for (auto var : iter->second) { - var->Var()->SetPersistable(true); + pinned_var_set.insert(var->Var()->Name()); } } @@ -243,8 +306,9 @@ proto::VarType::Type FuseOptimizerOpPass::GetTypeOfVar( const std::unordered_map> &var_nodes, const std::string &name) const { auto grad_iter = var_nodes.find(name); - PADDLE_ENFORCE(grad_iter != var_nodes.end()); - PADDLE_ENFORCE(grad_iter->second.size() > 0); + PADDLE_ENFORCE_EQ(grad_iter != var_nodes.end(), true, "%s is not found.", + name); + PADDLE_ENFORCE_GT(grad_iter->second.size(), 0); PADDLE_ENFORCE_NOT_NULL(grad_iter->second.front()->Var()); return grad_iter->second.front()->Var()->GetType(); } @@ -271,24 +335,25 @@ void FuseOptimizerOpPass::SortParametersAndAuxVars( const std::vector> ¶ms_grads, std::unordered_map> *aux_vars_set, std::vector *ops) const { - PADDLE_ENFORCE_NE(aux_vars_set->count(kParam), static_cast(0)); - auto ¶m_vec = aux_vars_set->at(kParam); + PADDLE_ENFORCE_NE(aux_vars_set->count(kGrad), static_cast(0)); + auto &grad_vec = aux_vars_set->at(kGrad); - std::vector param_sort_idx; - param_sort_idx.reserve(param_vec.size()); + std::vector grad_sort_idx; + grad_sort_idx.reserve(grad_vec.size()); for (auto &p_g : params_grads) { - auto iter = std::find(param_vec.begin(), param_vec.end(), p_g.first); - PADDLE_ENFORCE(iter != param_vec.end()); - auto idx = std::distance(param_vec.begin(), iter); - param_sort_idx.emplace_back(idx); + auto iter = std::find(grad_vec.begin(), grad_vec.end(), p_g.second); + PADDLE_ENFORCE_EQ(iter != grad_vec.end(), true, + "%s is not found in grad_vec", p_g.second); + auto idx = std::distance(grad_vec.begin(), iter); + grad_sort_idx.emplace_back(idx); } for (auto &aux_vars : *aux_vars_set) { std::vector sorted_vars; sorted_vars.reserve(aux_vars.second.size()); for (size_t i = 0; i < aux_vars.second.size(); ++i) { - sorted_vars.emplace_back(aux_vars.second.at(param_sort_idx[i])); + sorted_vars.emplace_back(aux_vars.second.at(grad_sort_idx[i])); } std::swap(aux_vars.second, sorted_vars); @@ -304,7 +369,7 @@ void FuseOptimizerOpPass::SortParametersAndAuxVars( std::vector sorted_ops; sorted_ops.reserve(ops->size()); for (size_t i = 0; i < ops->size(); ++i) { - sorted_ops.emplace_back(ops->at(param_sort_idx[i])); + sorted_ops.emplace_back(ops->at(grad_sort_idx[i])); } std::swap(*ops, sorted_ops); } @@ -338,26 +403,84 @@ void FuseOptimizerOpPass::AppendAllocContinuousSpace( op_desc->SetAttr("check_name", check_name); } -void FuseOptimizerOpPass::InserInputAndOutputForOptOps( - const std::vector &opt_nodes, ir::Node *opt_node) const { +void FuseOptimizerOpPass::InsertInputAndOutputForFusedOpNode( + const std::vector &op_nodes, ir::Graph *graph, + ir::Node *fused_opt_node) 
const { std::unordered_set inputs; std::unordered_set outputs; - for (auto opt_op : opt_nodes) { - // set inputs + for (auto opt_op : op_nodes) { inputs.insert(opt_op->inputs.begin(), opt_op->inputs.end()); for (auto &input : opt_op->inputs) { - replace(input->outputs.begin(), input->outputs.end(), opt_op, opt_node); + replace(input->outputs.begin(), input->outputs.end(), opt_op, + fused_opt_node); } - // set outputs outputs.insert(opt_op->outputs.begin(), opt_op->outputs.end()); for (auto &output : opt_op->outputs) { - replace(output->inputs.begin(), output->inputs.end(), opt_op, opt_node); + replace(output->inputs.begin(), output->inputs.end(), opt_op, + fused_opt_node); } } - opt_node->inputs.insert(opt_node->inputs.begin(), inputs.begin(), - inputs.end()); - opt_node->outputs.insert(opt_node->outputs.begin(), outputs.begin(), - outputs.end()); + + // Remove the dependence vars between op_nodes. + std::unordered_set out_dep_vars; + std::unordered_set not_useful_vars; + + auto deal_with_ctrl_vars = [&out_dep_vars, ¬_useful_vars, + &fused_opt_node](ir::Node *ctr_var_node) { + PADDLE_ENFORCE_EQ(ctr_var_node->inputs.size(), 1); + if (ctr_var_node->inputs.front() == fused_opt_node) { + PADDLE_ENFORCE_GT(ctr_var_node->outputs.size(), 0); + auto output_ops = ctr_var_node->outputs; + output_ops.erase(std::remove_if(output_ops.begin(), output_ops.end(), + [&fused_opt_node](const ir::Node *node) { + return node == fused_opt_node; + }), + output_ops.end()); + if (!output_ops.empty()) { + out_dep_vars.insert(ctr_var_node); + } + not_useful_vars.insert(ctr_var_node); + } + }; + + for (auto *in_node : inputs) { + if (in_node->IsCtrlVar()) { + deal_with_ctrl_vars(in_node); + } + } + + for (auto *out_node : outputs) { + if (out_node->IsCtrlVar()) { + deal_with_ctrl_vars(out_node); + } + } + + for (auto &node : not_useful_vars) { + if (inputs.count(node)) { + inputs.erase(node); + } + if (outputs.count(node)) { + outputs.erase(node); + } + } + + for (auto &dep_var : out_dep_vars) { + if (not_useful_vars.count(dep_var)) { + not_useful_vars.erase(dep_var); + } + dep_var->inputs.clear(); + dep_var->inputs.emplace_back(fused_opt_node); + } + + outputs.insert(out_dep_vars.begin(), out_dep_vars.end()); + fused_opt_node->inputs.insert(fused_opt_node->inputs.begin(), inputs.begin(), + inputs.end()); + fused_opt_node->outputs.insert(fused_opt_node->outputs.begin(), + outputs.begin(), outputs.end()); + + for (auto &ctrl_var_node : not_useful_vars) { + graph->RemoveNode(ctrl_var_node); + } } } // namespace ir } // namespace framework diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h index 0432d8c4..149bd20d 100644 --- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.h @@ -41,15 +41,16 @@ class FuseOptimizerOpPass : public ir::Pass { std::unordered_map> *aux_var_set, std::vector *ops) const; - void InserInputAndOutputForOptOps(const std::vector &opt_ops, - ir::Node *opt_node) const; + void InsertInputAndOutputForFusedOpNode( + const std::vector &opt_ops, ir::Graph *graph, + ir::Node *opt_node) const; private: virtual const std::string GetOpType() const = 0; virtual const std::vector GetAuxiliaryVarNames() const = 0; - virtual void FuseOptimizerOps( + virtual ir::Node *FuseOptimizerOps( const std::unordered_map> &vars_set, const std::unordered_map &fused_vars_name, const std::vector &adam_ops, 
ir::Graph *graph) const = 0; @@ -91,6 +92,9 @@ class FuseOptimizerOpPass : public ir::Pass { *aux_var_set) const; bool IsLoDTensorType(const proto::VarType::Type &type) const; + + bool HasVarDepsBetweenOps(const std::vector &topo_nodes, + const std::vector &opt_nodes) const; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc index 077e393c..3dd54cbc 100644 --- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_sgd_op_pass.cc @@ -31,7 +31,7 @@ class FuseSgdOpPass : public FuseOptimizerOpPass { } // Fuse Sgd Ops - virtual void FuseOptimizerOps( + virtual ir::Node *FuseOptimizerOps( const std::unordered_map> &vars_set, const std::unordered_map &fused_vars_name, const std::vector &sgd_ops, ir::Graph *graph) const { @@ -42,7 +42,7 @@ class FuseSgdOpPass : public FuseOptimizerOpPass { int op_role = boost::get( sgd_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); - VLOG(7) << "Insert sgd to graph "; + VLOG(6) << "Insert sgd to graph."; // Add fused scale OpDesc Sgd_desc(sgd_ops[0]->Op()->Block()); Sgd_desc.SetType("sgd"); @@ -56,15 +56,11 @@ class FuseSgdOpPass : public FuseOptimizerOpPass { // NOTE: multi_devices_pass requires that every op should have a role. Sgd_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role); - auto sgd_node = graph->CreateOpNode(&Sgd_desc); - - InserInputAndOutputForOptOps(sgd_ops, sgd_node); + return graph->CreateOpNode(&Sgd_desc); } }; } // namespace ir } // namespace framework } // namespace paddle -REGISTER_PASS(fuse_sgd_op_pass, paddle::framework::ir::FuseSgdOpPass) - .RequirePassAttr(paddle::framework::details::kPlaces) - .RequirePassAttr(paddle::framework::details::kLocalScopes); +REGISTER_PASS(fuse_sgd_op_pass, paddle::framework::ir::FuseSgdOpPass); diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index fff015d4..23030905 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -85,10 +85,18 @@ class Graph { return attrs_.count(attr_name) > 0; } + template + AttrType &GetOrInit(const std::string &attr_name) { + if (!Has(attr_name)) { + Set(attr_name, new AttrType); + } + return Get(attr_name); + } + template AttrType &Get(const std::string &attr_name) const { - PADDLE_ENFORCE(Has(attr_name), "%s attr not registered for graph.", - attr_name); + PADDLE_ENFORCE_EQ(Has(attr_name), true, "%s attr not registered for graph.", + attr_name); try { return *boost::any_cast(attrs_.at(attr_name)); } catch (boost::bad_any_cast &) { @@ -101,8 +109,8 @@ class Graph { template void Set(const std::string &attr_name, AttrType *attr) { - PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the graph", - attr_name); + PADDLE_ENFORCE_EQ(attrs_.count(attr_name), 0, "%s already set in the graph", + attr_name); attrs_[attr_name] = attr; attr_dels_[attr_name] = [attr, attr_name]() { VLOG(3) << "deleting " << attr_name; @@ -112,15 +120,15 @@ class Graph { template void SetNotOwned(const std::string &attr_name, AttrType *attr) { - PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the graph", - attr_name); + PADDLE_ENFORCE_EQ(attrs_.count(attr_name), 0, "%s already set in the graph", + attr_name); attrs_[attr_name] = attr; attr_dels_[attr_name] = []() {}; } void Erase(const std::string &attr_name) { - PADDLE_ENFORCE(attrs_.count(attr_name) != 0, "%s not set in the 
graph", - attr_name); + PADDLE_ENFORCE_NE(attrs_.count(attr_name), 0, "%s not set in the graph", + attr_name); attr_dels_[attr_name](); attrs_.erase(attr_name); attr_dels_.erase(attr_name); @@ -130,7 +138,7 @@ class Graph { // Create a normal variable with non-null VarDesc. ir::Node *CreateVarNode(VarDesc *var_desc) { - PADDLE_ENFORCE(var_desc); + PADDLE_ENFORCE_NOT_NULL(var_desc); auto *x = AddNode(new ir::Node(var_desc)); x->SetId(num_node_created_++); return x; @@ -138,7 +146,7 @@ class Graph { // Create a normal runnable operator with OpDesc. ir::Node *CreateOpNode(OpDesc *op_desc) { - PADDLE_ENFORCE(op_desc); + PADDLE_ENFORCE_NOT_NULL(op_desc); auto *x = AddNode(new ir::Node(op_desc)); x->SetId(num_node_created_++); return x; @@ -178,7 +186,7 @@ class Graph { } std::unique_ptr RemoveNode(ir::Node *node) { - PADDLE_ENFORCE(node_set_.find(node) != node_set_.end()); + PADDLE_ENFORCE_EQ(node_set_.find(node) != node_set_.end(), true); std::unique_ptr ret; ret.reset(nodes_.at(node).release()); nodes_.erase(node); @@ -200,16 +208,11 @@ class Graph { // WARN: After a series of passes, the current graph can be quite // different from OriginProgram. Caller shouldn't assume much from // the returned OriginProgram. - const ProgramDesc &OriginProgram() const { - LOG(WARNING) << "WARN: After a series of passes, the current graph can be " - "quite different from OriginProgram. So, please avoid " - "using the `OriginProgram()` method!"; - return program_; - } + const ProgramDesc &OriginProgram() const { return program_; } // This method takes ownership of `node`. ir::Node *AddNode(ir::Node *node) { - PADDLE_ENFORCE(node_set_.find(node) == node_set_.end()); + PADDLE_ENFORCE_EQ(node_set_.find(node) == node_set_.end(), true); nodes_[node].reset(node); node_set_.insert(node); return node; diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 969166a3..bbb2ee2f 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -771,58 +771,33 @@ PDNode *patterns::ConvBN::operator()(paddle::framework::ir::PDNode *conv_input, return bn_out_var; } -PDNode *patterns::ConvReLU::operator()( - paddle::framework::ir::PDNode *conv_input) { +PDNode *patterns::ConvActivation::operator()( + paddle::framework::ir::PDNode *conv_input, std::string conv_type, + std::string activation_type) { // Create Operators - conv_input->assert_is_op_input("conv2d", "Input"); - auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d"); - auto *relu_op = pattern->NewNode(relu_repr())->assert_is_op("relu"); - // Create variables - // Filter - auto *conv_weight_var = pattern->NewNode(conv_weight_repr()) - ->AsInput() - ->assert_is_persistable_var() - ->assert_is_op_input("conv2d", "Filter"); - // intermediate variable, will be removed in the IR after fuse. 
- auto *conv_out_var = pattern->NewNode(conv_out_repr()) - ->AsIntermediate() - ->assert_is_only_output_of_op("conv2d") - ->assert_is_op_input("relu"); - // output - auto *relu_out_var = pattern->NewNode(relu_out_repr()) - ->AsOutput() - ->assert_is_op_output("relu"); - - conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var}); - relu_op->LinksFrom({conv_out_var}).LinksTo({relu_out_var}); - return relu_out_var; -} - -PDNode *patterns::ConvBReLU::operator()( - paddle::framework::ir::PDNode *conv_input) { - // Create Operators - conv_input->assert_is_op_input("conv2d", "Input"); - auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d"); - auto *brelu_op = pattern->NewNode(brelu_repr())->assert_is_op("relu6"); + conv_input->assert_is_op_input(conv_type, "Input"); + auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op(conv_type); + auto *activation_op = + pattern->NewNode(activation_repr())->assert_is_op(activation_type); // Create variables // Filter auto *conv_weight_var = pattern->NewNode(conv_weight_repr()) ->AsInput() ->assert_is_persistable_var() - ->assert_is_op_input("conv2d", "Filter"); + ->assert_is_op_input(conv_type, "Filter"); // intermediate variable, will be removed in the IR after fuse. auto *conv_out_var = pattern->NewNode(conv_out_repr()) ->AsIntermediate() - ->assert_is_only_output_of_op("conv2d") - ->assert_is_op_input("relu6"); + ->assert_is_only_output_of_op(conv_type) + ->assert_is_op_input(activation_type); // output - auto *brelu_out_var = pattern->NewNode(brelu_out_repr()) - ->AsOutput() - ->assert_is_op_output("relu6"); + auto *activation_out_var = pattern->NewNode(activation_out_repr()) + ->AsOutput() + ->assert_is_op_output(activation_type); conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var}); - brelu_op->LinksFrom({conv_out_var}).LinksTo({brelu_out_var}); - return brelu_out_var; + activation_op->LinksFrom({conv_out_var}).LinksTo({activation_out_var}); + return activation_out_var; } PDNode *patterns::SeqConvEltAddRelu::operator()( @@ -871,7 +846,7 @@ PDNode *patterns::SeqConvEltAddRelu::operator()( } PDNode *patterns::FC::operator()(paddle::framework::ir::PDNode *x, - bool with_bias) { + bool with_bias, bool with_relu) { // Create shared nodes. x->assert_is_op_input("mul", "X"); auto *mul = pattern->NewNode(mul_repr())->assert_is_op("mul"); @@ -884,11 +859,10 @@ PDNode *patterns::FC::operator()(paddle::framework::ir::PDNode *x, auto *mul_out_var = pattern->NewNode(mul_out_repr())->assert_is_op_output("mul"); + // Add links. + mul->LinksFrom({x, mul_w_var}).LinksTo({mul_out_var}); if (!with_bias) { // not with bias - // Add links. - mul->LinksFrom({x, mul_w_var}).LinksTo({mul_out_var}); return mul_out_var; - } else { // with bias mul_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); // Create operators. @@ -897,15 +871,29 @@ PDNode *patterns::FC::operator()(paddle::framework::ir::PDNode *x, // Create variables. 
auto *bias = pattern->NewNode(bias_repr()) ->assert_is_op_input("elementwise_add") + ->assert_is_persistable_var() ->AsInput(); - auto *fc_out = pattern->NewNode(Out_repr()) - ->AsOutput() - ->assert_is_op_output("elementwise_add"); + auto *elementwise_add_out_var = + pattern->NewNode(elementwise_add_out_repr()) + ->AsOutput() + ->assert_is_op_output("elementwise_add"); - mul->LinksFrom({mul_w_var, x}).LinksTo({mul_out_var}); - elementwise_add->LinksFrom({mul_out_var, bias}).LinksTo({fc_out}); - return fc_out; + elementwise_add->LinksFrom({mul_out_var, bias}) .LinksTo({elementwise_add_out_var}); + if (!with_relu) { + return elementwise_add_out_var; + } else { + elementwise_add_out_var->AsIntermediate()->assert_is_op_input("relu"); + // Create operators. + auto *relu = pattern->NewNode(relu_repr())->assert_is_op("relu"); + auto *relu_out_var = pattern->NewNode(relu_out_repr()) + ->AsOutput() + ->assert_is_op_output("relu"); + + relu->LinksFrom({elementwise_add_out_var}).LinksTo({relu_out_var}); + return relu_out_var; + } } } @@ -1275,6 +1263,41 @@ PDNode *patterns::ConvConcatReLU::operator()() { return relu_out; } +PDNode *patterns::ConvRequant::operator()() { + // Create Operators + auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d"); + auto requant_op = + pattern->NewNode(requant_op_repr())->assert_is_op("requantize"); + auto conv_out = pattern->NewNode(conv_out_repr()) + ->assert_is_op_output("conv2d", "Output"); + auto requant_out = pattern->NewNode(requant_out_repr()) + ->AsOutput() + ->assert_is_op_output("requantize", "Output"); + + conv_op->LinksTo({conv_out}); + requant_op->LinksFrom({conv_out}).LinksTo({requant_out}); + + return requant_out; +} + +PDNode *patterns::ConvDequant::operator()() { + // Create Operators + auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d"); + auto dequant_op = + pattern->NewNode(dequant_op_repr())->assert_is_op("dequantize"); + + auto conv_out = pattern->NewNode(conv_out_repr()) + ->assert_is_op_output("conv2d", "Output"); + auto dequant_out = pattern->NewNode(dequant_out_repr()) + ->AsOutput() + ->assert_is_op_output("dequantize", "Output"); + + conv_op->LinksTo({conv_out}); + dequant_op->LinksFrom({conv_out}).LinksTo({dequant_out}); + + return dequant_out; +} + PDNode *patterns::PriorBox::operator()() { auto prior_box_op = pattern->NewNode(prior_box_op_repr())->assert_is_op("prior_box"); @@ -1880,6 +1903,9 @@ void patterns::QuantDequantOpFuse::operator()(PDNode *quant_op_input, void patterns::ShuffleChannelPattern::operator()(PDNode *reshape1_in) { auto reshape1_op = pattern->NewNode(reshape1_op_repr())->assert_is_op("reshape2"); + reshape1_op->assert_more([&](Node *x) { + return boost::get<std::vector<int>>(x->Op()->GetAttr("shape")).size() == 5; + }); auto reshape1_out = pattern->NewNode(reshape1_out_repr()) ->assert_is_op_output("reshape2", "Out") diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index c53e4e5e..0d7d56ca 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -431,46 +431,26 @@ struct ConvBN : public PatternBase { PATTERN_DECL_NODE(bn_saved_variance); }; -// CONV with ReLU -// op: conv + relu +// Conv with Activation +// op: conv + activation // named nodes: // conv_input, conv_weight, // conv_out, conv, -// relu_out, relu -struct ConvReLU : public PatternBase { - ConvReLU(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "conv_relu") {}
+// activation_out, activation +struct ConvActivation : public PatternBase { + ConvActivation(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "conv_activation") {} - PDNode* operator()(PDNode* conv_input); + PDNode* operator()(PDNode* conv_input, std::string conv_type = "conv2d", + std::string activation_type = "relu"); // declare operator node's name PATTERN_DECL_NODE(conv); - PATTERN_DECL_NODE(relu); + PATTERN_DECL_NODE(activation); // declare variable node's name PATTERN_DECL_NODE(conv_weight); PATTERN_DECL_NODE(conv_out); - PATTERN_DECL_NODE(relu_out); -}; - -// CONV with ReLU6 -// op: conv + relu6 -// named nodes: -// conv_input, conv_weight, -// conv_out, conv, -// relu6_out, relu6 -struct ConvBReLU : public PatternBase { - ConvBReLU(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "conv_bounded_relu") {} - - PDNode* operator()(PDNode* conv_input); - - // declare operator node's name - PATTERN_DECL_NODE(conv); - PATTERN_DECL_NODE(brelu); - // declare variable node's name - PATTERN_DECL_NODE(conv_weight); - PATTERN_DECL_NODE(conv_out); - PATTERN_DECL_NODE(brelu_out); + PATTERN_DECL_NODE(activation_out); }; // SEQCONV with Elementwise_Add ReLU @@ -507,17 +487,19 @@ struct FC : public PatternBase { FC(PDPattern* pattern, const std::string& name_scope) : PatternBase(pattern, name_scope, "fc") {} - PDNode* operator()(PDNode* x, bool with_bias); + PDNode* operator()(PDNode* x, bool with_bias, bool with_relu); // declare operator node's name PATTERN_DECL_NODE(fc); PATTERN_DECL_NODE(mul); PATTERN_DECL_NODE(elementwise_add); + PATTERN_DECL_NODE(relu); // declare variable node's name PATTERN_DECL_NODE(w); PATTERN_DECL_NODE(mul_out); // (x,w) -> mul_out PATTERN_DECL_NODE(bias); - PATTERN_DECL_NODE(Out); + PATTERN_DECL_NODE(elementwise_add_out); + PATTERN_DECL_NODE(relu_out); }; // MKL-DNN's FC with bias @@ -796,6 +778,40 @@ struct ConvConcatReLU : public PatternBase { PATTERN_DECL_NODE(relu_out); }; +// Conv + Requant +// named nodes: +// conv_op, conv_out +// requant_op, requant_out +struct ConvRequant : public PatternBase { + ConvRequant(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "conv_requant") {} + + PDNode* operator()(); + + PATTERN_DECL_NODE(conv_op); + PATTERN_DECL_NODE(conv_out); + + PATTERN_DECL_NODE(requant_op); + PATTERN_DECL_NODE(requant_out); +}; + +// Conv + Dequant +// named nodes: +// conv_op, conv_out +// dequant_op, dequant_out +struct ConvDequant : public PatternBase { + ConvDequant(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "conv_dequant") {} + + PDNode* operator()(); + + PATTERN_DECL_NODE(conv_op); + PATTERN_DECL_NODE(conv_out); + + PATTERN_DECL_NODE(dequant_op); + PATTERN_DECL_NODE(dequant_out); +}; + // PriorBox operator // operator: prior_box_op // inputs: prior_box_input, prior_box_image diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.h b/paddle/fluid/framework/ir/graph_printer.h similarity index 95% rename from paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.h rename to paddle/fluid/framework/ir/graph_printer.h index 8562856e..76b07f0d 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.h +++ b/paddle/fluid/framework/ir/graph_printer.h @@ -26,7 +26,7 @@ namespace paddle { namespace framework { namespace ir { -constexpr char kGraphvizPath[] = "debug_graphviz_path"; +constexpr char 
kGraphvizPath[] = "graph_viz_path"; class SSAGraphPrinter { public: diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc index a95588a5..23a61b28 100644 --- a/paddle/fluid/framework/ir/graph_test.cc +++ b/paddle/fluid/framework/ir/graph_test.cc @@ -206,5 +206,51 @@ TEST(GraphTest, WriteAfterWrite) { ASSERT_NE(control_dep2, nullptr); ASSERT_EQ(control_dep1, control_dep2); } + +TEST(GraphTest, TestException) { + ProgramDesc prog; + std::unique_ptr<ir::Graph> g(new ir::Graph(prog)); + + bool not_met_exception = false; + try { + g->Erase("no_attr"); + } catch (const platform::EnforceNotMet &e) { + not_met_exception = true; + } + ASSERT_TRUE(not_met_exception); + + not_met_exception = false; + try { + g->CreateVarNode(nullptr); + } catch (const platform::EnforceNotMet &e) { + not_met_exception = true; + } + ASSERT_TRUE(not_met_exception); + + not_met_exception = false; + try { + g->CreateOpNode(nullptr); + } catch (const platform::EnforceNotMet &e) { + not_met_exception = true; + } + ASSERT_TRUE(not_met_exception); + + not_met_exception = false; + try { + g->RemoveNode(nullptr); + } catch (const platform::EnforceNotMet &e) { + not_met_exception = true; + } + ASSERT_TRUE(not_met_exception); + + not_met_exception = false; + try { + g->AddNode(nullptr); + g->AddNode(nullptr); + } catch (const platform::EnforceNotMet &e) { + not_met_exception = true; + } + ASSERT_TRUE(not_met_exception); +} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc index f4df4cfe..fa7263b7 100644 --- a/paddle/fluid/framework/ir/graph_viz_pass.cc +++ b/paddle/fluid/framework/ir/graph_viz_pass.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include +#include "paddle/fluid/framework/ir/graph_printer.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/inference/analysis/dot.h" #include "paddle/fluid/string/printf.h" @@ -25,8 +26,6 @@ namespace framework { namespace ir { using inference::analysis::Dot; namespace { -const char kGraphVizPath[] = "graph_viz_path"; - std::string FormatName(const Node* node) { if (!node->IsOp() || !node->Op() || !node->Op()->HasAttr(OpProtoAndCheckerMaker::OpNamescopeAttrName())) { @@ -39,7 +38,7 @@ std::string FormatName(const Node* node) { } // namespace void GraphVizPass::ApplyImpl(ir::Graph* graph) const { - const std::string graph_viz_path = Get<std::string>(kGraphVizPath); + const std::string& graph_viz_path = Get<std::string>(kGraphvizPath); VLOG(3) << "draw IR graph viz to " << graph_viz_path; std::unique_ptr<std::ofstream> fout(new std::ofstream(graph_viz_path)); PADDLE_ENFORCE(fout->good()); @@ -90,6 +89,17 @@ void GraphVizPass::ApplyImpl(ir::Graph* graph) const { marked_nodes.count(n) ?
marked_op_attrs : op_attrs; dot.AddNode(node_id, attr, node_id); } else if (n->IsVar()) { + if (n->Var() && n->Var()->GetType() == proto::VarType::LOD_TENSOR) { + bool is_first = true; + for (int64_t length : n->Var()->GetShape()) { + if (is_first) { + node_id += "\n" + std::to_string(length); + is_first = false; + } else { + node_id += "," + std::to_string(length); + } + } + } decltype(op_attrs)* attr; if (marked_nodes.count(n)) { attr = &marked_var_attrs; @@ -132,4 +142,4 @@ GraphVizPass::marked_nodes_t GraphVizPass::ConsumeMarkedNodes( } // namespace paddle REGISTER_PASS(graph_viz_pass, paddle::framework::ir::GraphVizPass) - .RequirePassAttr(paddle::framework::ir::kGraphVizPath); + .RequirePassAttr(paddle::framework::ir::kGraphvizPath); diff --git a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc deleted file mode 100644 index d7692411..00000000 --- a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" - -namespace paddle { -namespace framework { -namespace ir { - -class InferCleanGraphPass : public FusePassBase { - public: - virtual ~InferCleanGraphPass() {} - - protected: - void ApplyImpl(ir::Graph* graph) const { - FusePassBase::Init("original_graph", graph); - PADDLE_ENFORCE(graph); - - auto is_valid_node = [](Node* x) { - return x && IsControlDepVar(*x) && x->IsVar() && !x->Var(); - }; - - std::unordered_set invalid_nodes; - int valid_op = 0; - for (auto* node : graph->Nodes()) { - PADDLE_ENFORCE_NOT_NULL(node); - if (is_valid_node(node)) { - invalid_nodes.insert(node); - } else if (node->IsOp()) { - // Collect all the operators to help tracking number of operators. 
- ++valid_op; - } - } - - GraphSafeRemoveNodes(graph, invalid_nodes); - - AddStatis(valid_op); - } - - void CleanEdges(std::vector* nodes, - const std::unordered_set& to_remove) const { - auto it = std::remove_if(nodes->begin(), nodes->end(), - [&](Node* x) { return to_remove.count(x); }); - nodes->erase(it, nodes->end()); - } -}; - -} // namespace ir -} // namespace framework -} // namespace paddle - -REGISTER_PASS(infer_clean_graph_pass, - paddle::framework::ir::InferCleanGraphPass); diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt index 070ea9aa..37993d3f 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt +++ b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt @@ -1,24 +1,14 @@ cc_library(op_graph_view SRCS op_graph_view.cc DEPS op_handle_base) +cc_library(conditional_block_op_eager_deletion_pass SRCS conditional_block_op_eager_deletion_pass.cc DEPS conditional_block_op_helper graph_helper pass computation_op_handle) cc_library(while_op_eager_deletion_pass SRCS while_op_eager_deletion_pass.cc DEPS while_op_helper graph_helper pass computation_op_handle) cc_library(recurrent_op_eager_deletion_pass SRCS recurrent_op_eager_deletion_pass.cc DEPS recurrent_op_helper graph_helper pass computation_op_handle) cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle var_handle) cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view reference_count_pass_helper) -if(WITH_GPU) - cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper gpu_info) -else() - cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper cpu_info) -endif() - -cc_library(memory_optimize_pass SRCS memory_optimize_pass.cc DEPS memory_optimize_helper pass) -cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info) - -cc_test(memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry) - cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle - eager_deletion_op_handle graph graph_helper pass while_op_eager_deletion_pass recurrent_op_eager_deletion_pass reference_count_pass_helper) -cc_library(record_skip_memory_opt_vars_pass SRCS record_skip_memory_opt_vars_pass.cc DEPS graph graph_helper) + eager_deletion_op_handle graph graph_helper pass conditional_block_op_eager_deletion_pass while_op_eager_deletion_pass recurrent_op_eager_deletion_pass reference_count_pass_helper) cc_library(memory_reuse_pass SRCS memory_reuse_pass.cc DEPS computation_op_handle reference_count_pass_helper share_tensor_buffer_op_handle multi_devices_helper graph pass) cc_library(buffer_shared_inplace_op_pass SRCS buffer_shared_inplace_op_pass.cc DEPS memory_reuse_pass) +cc_library(buffer_shared_cross_op_memory_reuse_pass SRCS buffer_shared_cross_op_memory_reuse_pass.cc DEPS memory_reuse_pass) diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc new file mode 100644 index 00000000..6ce14203 --- /dev/null +++ b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_cross_op_memory_reuse_pass.cc @@ -0,0 +1,422 @@ +// Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/share_tensor_buffer_op_handle.h" +#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" +#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h" +#include "paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h" +#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +using OpHandleBase = details::OpHandleBase; +using ComputationOpHandle = details::ComputationOpHandle; +using VarHandle = details::VarHandle; +using VarHandleBase = details::VarHandleBase; +using DummyVarHandle = details::DummyVarHandle; + +enum NodeDependency { kSame = 0, kNoDep = 1, kBefore = 2, kAfter = 3 }; + +static NodeDependency ReverseNodeDependency(NodeDependency dep) { + return dep == NodeDependency::kBefore + ? NodeDependency::kAfter + : (dep == NodeDependency::kAfter ? NodeDependency::kBefore : dep); +} + +class BufferSharedCrossOpMemoryReusePass : public MemoryReusePass { + protected: + std::string ReuseType() const override { return "cross_op_memory_reuse"; } + + void Run(Graph *graph) const override; + + private: + void RunOnScopeIdx(size_t idx) const; + + // Toposort ops. Different strategies can be used in the future. + std::vector<OpHandleBase *> SortOp(const OpGraphView &graph_view) const; + + // Build the initial dependency matrix, and initialize all fields, + // including `ops_`, `op_to_idx_`, `deps_` + void BuildOpDependencyMap() const; + + // Get op index inside `ops_`, used to find dependency inside `deps_` + size_t OpIndex(const ComputationOpHandle *op) const; + + size_t ResolveDependencyBetween( + ComputationOpHandle *op, + const std::unordered_set<ComputationOpHandle *> &prev_ops) const; + + // Get dependency relationship between op1 and op2 + // Notice: GetOpDep(op1, op2) == ReverseNodeDependency(GetOpDep(op2, op1)) + NodeDependency GetOpDep(const ComputationOpHandle *op1, + const ComputationOpHandle *op2) const; + + void SetOpDep(const ComputationOpHandle *op1, const ComputationOpHandle *op2, + NodeDependency dep) const; + + private: + mutable Graph *graph_; + + // All ops in the graph, grouped by scope index + mutable std::vector<std::vector<ComputationOpHandle *>> ops_; + + // Index of each op in `ops_`, grouped by scope index. + // Index of each op is the index inside `deps_`. + mutable std::vector<std::unordered_map<const ComputationOpHandle *, size_t>> + op_to_idx_; + + // Dependency matrix between any 2 ops + // If deps_[scope_idx][i][j] is equal to: + // 1. kSame, Op(i) and Op(j) are the same ops, only when i == j. + // 2. kNoDep, Op(i) and Op(j) have no dependency between each other. + // 3. kBefore, Op(i) is the preceding op of Op(j). + // 4. kAfter, Op(i) is the pending op of Op(j).
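+ //
+ // A small, hypothetical example: for a two-op chain Op(0) -> Op(1) plus an
+ // unrelated Op(2) in the same scope, the matrix holds
+ //   deps_[scope][0][1] == kBefore, deps_[scope][1][0] == kAfter,
+ //   deps_[scope][0][2] == deps_[scope][2][0] == kNoDep,
+ //   and deps_[scope][i][i] == kSame for every i.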
+ mutable std::vector<std::vector<std::vector<NodeDependency>>> deps_; +}; + +void BufferSharedCrossOpMemoryReusePass::Run(Graph *graph) const { + graph_ = graph; + BuildOpDependencyMap(); + for (size_t i = 0; i < ScopeNum(); ++i) { + RunOnScopeIdx(i); + } +} + +// Note(zjl): The reason why I separate SortOp from BuildOpDependencyMap() +// is that we can use different sorting strategies in the future to +// evaluate the effects of different sorting strategies. +// Currently, I use BFS, but we can use other kinds of sorting strategy +// in the future, as long as the new strategy reaches higher memory reuse +// ratio. +std::vector<OpHandleBase *> BufferSharedCrossOpMemoryReusePass::SortOp( + const OpGraphView &graph_view) const { + std::vector<OpHandleBase *> sorted_ops; + sorted_ops.reserve(graph_view.OpNumber()); + graph_view.BreadthFirstVisit( + [&](OpHandleBase *cur_op) { sorted_ops.emplace_back(cur_op); }); + PADDLE_ENFORCE_EQ(sorted_ops.size(), graph_view.OpNumber(), + "There are unvisited ops"); + return sorted_ops; +} + +/** + * Try to reuse unlived vars. + * + * What we do is: traverse all outputs of each op, and find a suitable + * unused var, and then reuse its memory as output. + * + * How to determine unused vars? + * + * Case 1: unlived vars after all preceding ops run. In this case, no extra + * edge would be added to the graph. + * + * Case 2: unlived vars after all preceding ops and all no-dep ops run. In + * this case, the reused var is from no-dep ops, so we have to add an + * extra edge to resolve the data hazard. + * + * + * If Case 2 occurs, what should we do to resolve the data hazard? + * + * - Step 1: add a dep var between reused_op and share_tensor_buffer_op, + * that is: reused_op -> dep_var -> share_tensor_buffer_op. + * + * - Step 2: Update deps_, all preceding ops of reused_op should be + * preceding ops of op. + */ +void BufferSharedCrossOpMemoryReusePass::RunOnScopeIdx(size_t idx) const { + auto &ops = ops_[idx]; + + auto &last_live_ops_of_vars = + Get<std::vector<LastLiveOpsOfVars>>(kLastLiveOpsOfVars)[idx]; + + // Build a reverse map of `last_live_ops_of_vars`, + // i.e., VarHandle -> last lived ops of VarHandle + std::unordered_map<VarHandle *, std::unordered_set<ComputationOpHandle *>> + var_to_ops; + for (auto &pair : last_live_ops_of_vars) { + for (auto *op : pair.second.ops()) { + var_to_ops[pair.second.var()].insert(op); + } + } + + // Deep copy of `var_to_ops`, used to get last lived ops of each unlived var + auto original_var_to_ops = var_to_ops; + + // Memory size of VarHandle -> list of unlived VarHandles with that size + std::map<int64_t, std::list<VarHandle *>> unlived_var_pool; + size_t reuse_num = 0; + + for (auto *op : ops) { + // Traverse all output args of op, and find whether there is an unlived + // var that can be reused. + auto out_args = op->Node()->Op()->OutputArgumentNames(); + for (auto &out_arg : out_args) { + auto out_nodes = this->FindNodesByName(out_arg, op->Node()->outputs); + // If out_arg is kEmptyVarName, it may not be found in output nodes. + if (out_nodes.size() != 1) { + continue; + } + + auto *out_node = *(out_nodes.begin()); + auto *out_var = + dynamic_cast<VarHandle *>(&(out_node->Wrapper<VarHandleBase>())); + PADDLE_ENFORCE_NOT_NULL(out_var); + + // If out_arg is not reusable, skip it + if (!IsOutVarReusable(*out_var)) { + continue; + } + + auto mem_size = GetMemorySize(*out_var); + // Special case: if memory size of out_var is 0, skip it + if (mem_size == 0) { + continue; + } + + // Find a suitable unlived var from `unlived_var_pool` + // Here, we use `find`, but we can perform `lower_bound` if + // it is better in the future.
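+ // For example (sizes hypothetical): if the pool currently holds
+ // { 1024 -> [var_a, var_b], 4096 -> [var_c] } and std::abs(mem_size) is
+ // 1024, find() returns the first bucket and var_a is tried first; a
+ // `lower_bound` strategy could instead accept any bucket of size >= 1024.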
+ auto iter = unlived_var_pool.find(std::abs(mem_size)); + if (iter == unlived_var_pool.end()) { + continue; + } + + // Obtain candidate_vars that can be reused. + auto &candidate_vars = iter->second; + for (auto var_iter = candidate_vars.begin(); + var_iter != candidate_vars.end(); ++var_iter) { + bool success = this->TryReuseVar(*var_iter, out_var); + if (!success) continue; + + // If memory reuse is successful, we should do some post-processing. + ++reuse_num; + auto &prev_ops = original_var_to_ops.at(*var_iter); + + // Add extra dependencies between `op` and last lived ops of reused var + // (i.e. prev_ops) if needed. + // All `prev_ops` must be preceding ops of op to avoid data hazard. + size_t new_added_dep_num = ResolveDependencyBetween(op, prev_ops); + VLOG(3) << "Variable can be reused between: " << (*var_iter)->Name() + << " -> " << out_var->Name() << " when running op " + << op->Name() << ", add extra dependency " << new_added_dep_num + << "/" << prev_ops.size(); + + // erase reused var from `original_var_to_ops` + original_var_to_ops.erase(*var_iter); + + // erase reused var from `candidate_vars` + candidate_vars.erase(var_iter); + if (candidate_vars.empty()) { + // erase reused var from `unlived_var_pool` if there are no other + // vars with the same size as the reused var. + unlived_var_pool.erase(iter); + } + break; + } + } + + // After all output args have been traversed, we should check whether + // there is a new unlived var after `op` runs. + for (auto op_iter = var_to_ops.begin(); op_iter != var_to_ops.end();) { + // erase op from `var_to_ops` first + op_iter->second.erase(op); + if (op_iter->second.empty()) { + // there is an unlived var, since all of its last lived ops have run + VarHandle *unlived_var = op_iter->first; + var_to_ops.erase(op_iter++); + if (IsInVarReusable(*unlived_var)) { + auto mem_size = GetMemorySize(*unlived_var); + if (mem_size != 0) { + unlived_var_pool[std::abs(mem_size)].push_front(unlived_var); + } + } + } else { + ++op_iter; + } + } + } + VLOG(4) << "Reuse " << reuse_num << " variable(s) in Scope " << idx; +} + +size_t BufferSharedCrossOpMemoryReusePass::ResolveDependencyBetween( + ComputationOpHandle *op, + const std::unordered_set<ComputationOpHandle *> &prev_ops) const { + size_t new_added_dep_num = 0; + size_t op_idx = OpIndex(op); + auto &deps = deps_[op->GetScopeIdx()]; + for (auto *prev_op : prev_ops) { + auto op_dep = GetOpDep(prev_op, op); + if (op_dep == NodeDependency::kBefore) continue; + PADDLE_ENFORCE_EQ(op_dep, NodeDependency::kNoDep, + "The graph has a cycle, this may be a bug"); + + auto iter = + std::find_if(prev_op->Outputs().begin(), prev_op->Outputs().end(), + [](VarHandleBase *var) { + return dynamic_cast<DummyVarHandle *>(var) != nullptr; + }); + + if (iter != prev_op->Outputs().end()) { + op->AddInput(*iter); + } else { + auto *dep_var = new DummyVarHandle(graph_->CreateControlDepVar()); + graph_->Get<details::GraphDepVars>(details::kGraphDepVars) + .emplace(dep_var); + prev_op->AddOutput(dep_var); + op->AddInput(dep_var); + } + + // All preceding ops of `prev_op` should be preceding ops of `op` + size_t prev_op_idx = OpIndex(prev_op); + for (size_t i = 0; i < deps[prev_op_idx].size(); ++i) { + if (deps[prev_op_idx][i] != NodeDependency::kAfter) { + continue; + } + + deps[i][op_idx] = NodeDependency::kBefore; + deps[op_idx][i] = NodeDependency::kAfter; + } + + // All pending ops of `op` should be pending ops of `prev_op`.
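+ // Hypothetical example: if op -> x (deps[op_idx][x] == kBefore) already
+ // holds, then once prev_op -> op is added, prev_op -> x must also hold,
+ // so the matrix is updated transitively below.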
+ for (size_t i = 0; i < deps[op_idx].size(); ++i) { + if (deps[op_idx][i] != NodeDependency::kBefore) { + continue; + } + + deps[i][prev_op_idx] = NodeDependency::kAfter; + deps[prev_op_idx][i] = NodeDependency::kBefore; + } + + // `prev_op` is one of the preceding ops of `op` + SetOpDep(prev_op, op, NodeDependency::kBefore); + ++new_added_dep_num; + } + return new_added_dep_num; +} + +void BufferSharedCrossOpMemoryReusePass::BuildOpDependencyMap() const { + PADDLE_ENFORCE(ops_.empty(), "ops_ must be initialized here"); + PADDLE_ENFORCE(op_to_idx_.empty(), "op_to_idx_ must be initialized here"); + PADDLE_ENFORCE(deps_.empty(), "deps_ must be initialized here"); + + // Toposort ops + OpGraphView graph_view(ir::FilterByNodeWrapper<OpHandleBase>(*graph_)); + auto ops = SortOp(graph_view); + + size_t scope_num = this->ScopeNum(); + size_t op_num = ops.size(); + + // A map to record all preceding ops of each op + std::unordered_map<OpHandleBase *, std::unordered_set<OpHandleBase *>> + preceding_ops; + + // BFS to fill `preceding_ops` + graph_view.BreadthFirstVisit([&](OpHandleBase *cur_op) { + // All preceding ops of cur_op should be: + // - preceding ops of cur_op, that are connected to cur_op directly + // - all preceding ops of `direct preceding ops of cur_op` + auto &all_preceding_ops_of_cur_op = preceding_ops[cur_op]; + for (auto &preceding_op : graph_view.PrecedingOps(cur_op)) { + all_preceding_ops_of_cur_op.insert(preceding_op); + auto &prev_preceding_ops = preceding_ops[preceding_op]; + all_preceding_ops_of_cur_op.insert(prev_preceding_ops.begin(), + prev_preceding_ops.end()); + } + }); + PADDLE_ENFORCE_EQ(preceding_ops.size(), op_num); + + // Find out ComputationOpHandles only + ops_.resize(scope_num); + op_to_idx_.resize(scope_num); + for (auto *op : ops) { + auto *compute_op = dynamic_cast<ComputationOpHandle *>(op); + if (compute_op == nullptr) continue; + size_t scope_idx = compute_op->GetScopeIdx(); + ops_[scope_idx].emplace_back(compute_op); + op_to_idx_[scope_idx].emplace(compute_op, op_to_idx_[scope_idx].size()); + } + + // Fill deps_ according to `preceding_ops` + deps_.resize(scope_num); + for (size_t i = 0; i < deps_.size(); ++i) { + deps_[i].resize(ops_[i].size()); + for (auto &item : deps_[i]) { + item.assign(ops_[i].size(), NodeDependency::kNoDep); + } + } + + for (auto &ops_on_each_device : ops_) { + for (auto *op : ops_on_each_device) { + SetOpDep(op, op, NodeDependency::kSame); + for (auto *preceding_op : preceding_ops[op]) { + auto *compute_preceding_op = + dynamic_cast<ComputationOpHandle *>(preceding_op); + if (compute_preceding_op != nullptr && + compute_preceding_op->GetScopeIdx() == op->GetScopeIdx()) { + SetOpDep(compute_preceding_op, op, NodeDependency::kBefore); + } + } + } + } +} + +size_t BufferSharedCrossOpMemoryReusePass::OpIndex( + const ComputationOpHandle *op) const { + auto iter = op_to_idx_[op->GetScopeIdx()].find(op); + PADDLE_ENFORCE(iter != op_to_idx_[op->GetScopeIdx()].end()); + return iter->second; +} + +NodeDependency BufferSharedCrossOpMemoryReusePass::GetOpDep( + const ComputationOpHandle *op1, const ComputationOpHandle *op2) const { + PADDLE_ENFORCE_EQ(op1->GetScopeIdx(), op2->GetScopeIdx()); + return deps_[op1->GetScopeIdx()][OpIndex(op1)][OpIndex(op2)]; +} + +void BufferSharedCrossOpMemoryReusePass::SetOpDep( + const ComputationOpHandle *op1, const ComputationOpHandle *op2, + NodeDependency dep) const { + PADDLE_ENFORCE_EQ(op1->GetScopeIdx(), op2->GetScopeIdx()); + if (op1 == op2) { + PADDLE_ENFORCE(dep == NodeDependency::kSame); + auto idx = OpIndex(op1); + deps_[op1->GetScopeIdx()][idx][idx] = NodeDependency::kSame; + } else { + auto idx1 = 
OpIndex(op1); + auto idx2 = OpIndex(op2); + PADDLE_ENFORCE(dep != NodeDependency::kSame && idx1 != idx2); + deps_[op1->GetScopeIdx()][idx1][idx2] = dep; + deps_[op1->GetScopeIdx()][idx2][idx1] = ReverseNodeDependency(dep); + } +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(buffer_shared_cross_op_memory_reuse_pass, + paddle::framework::ir::BufferSharedCrossOpMemoryReusePass) + .RequirePassAttr(paddle::framework::ir::kMemOptVarInfoMapList) + .RequirePassAttr(paddle::framework::ir::kLastLiveOpsOfVars) + .RequirePassAttr(paddle::framework::ir::kUseCuda); diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc index b5d17ef2..006e79c9 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/buffer_shared_inplace_op_pass.cc @@ -50,11 +50,11 @@ void BufferSharedInplaceOpPass::Run(Graph *graph) const { for (auto &pair : each_scope_ops) { // If variable has more than 1 last lived ops, this variable cannot // be inplaced. - if (pair.second.size() != 1) { + if (pair.second.ops().size() != 1) { continue; } - auto *op = *(pair.second.begin()); + auto *op = *(pair.second.ops().begin()); const std::string &op_type = op->GetOp()->Type(); const framework::OpDesc *op_desc = op->Node()->Op(); PADDLE_ENFORCE_NOT_NULL(op_desc); @@ -141,7 +141,7 @@ void BufferSharedInplaceOpPass::Run(Graph *graph) const { << out_var_handle_ptr->Name() << ". Debug String is: " << op->GetOp()->DebugString(); } else { - VLOG(4) << "Inplace failed in op " << op_type << ": " + VLOG(3) << "Inplace failed in op " << op_type << ": " << in_var_handle_ptr->Name() << " -> " << out_var_handle_ptr->Name(); } diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/conditional_block_op_eager_deletion_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/conditional_block_op_eager_deletion_pass.cc new file mode 100644 index 00000000..56a658d4 --- /dev/null +++ b/paddle/fluid/framework/ir/memory_optimize_pass/conditional_block_op_eager_deletion_pass.cc @@ -0,0 +1,61 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
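+
+// A hypothetical example of the effect of this pass: for a block containing
+// the ops {conditional_block, mul, conditional_block_grad}, the pass groups
+// the conditional_block op and the conditional_block_grad op by scope index
+// and hands both lists to the operator helper below, which prepares safe
+// eager deletion for the pair; unrelated ops such as mul are ignored.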
+ +#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h" +#include "paddle/fluid/operators/controlflow/op_variant.h" + +namespace paddle { +namespace framework { +namespace ir { + +class ConditionalOpEagerDeletionPass : public Pass { + protected: + void ApplyImpl(Graph *graph) const override { + auto all_ops = ir::FilterByNodeWrapper<details::OpHandleBase>(*graph); + + // Find all conditional_block ops and conditional_block_grad ops + std::unordered_map<size_t, std::pair<std::vector<OperatorBase *>, + std::vector<OperatorBase *>>> + target_ops; + for (auto *op : all_ops) { + auto compute_op = dynamic_cast<details::ComputationOpHandle *>(op); + if (compute_op == nullptr) continue; + + if (compute_op->Name() == "conditional_block") { + target_ops[compute_op->GetScopeIdx()].first.emplace_back( + compute_op->GetOp()); + } else if (compute_op->Name() == "conditional_block_grad") { + target_ops[compute_op->GetScopeIdx()].second.emplace_back( + compute_op->GetOp()); + } + } + + for (auto &ops_pair : target_ops) { + auto &ifelse_ops = ops_pair.second.first; + auto &ifelse_grad_ops = ops_pair.second.second; + operators::PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOp( + graph->OriginProgram(), ifelse_ops, ifelse_grad_ops); + } + } +}; + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conditional_block_op_eager_deletion_pass, + paddle::framework::ir::ConditionalOpEagerDeletionPass); diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc index 452255a6..962401a6 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/eager_deletion_pass.cc @@ -205,7 +205,7 @@ void EagerDeletionPass::ApplyImpl(ir::Graph *graph) const { for (auto &var_ops_map : last_live_ops) { for (auto &var_ops_pair : var_ops_map) { const std::string &var_name = var_ops_pair.first; - for (auto *op : var_ops_pair.second) { + for (auto *op : var_ops_pair.second.ops()) { op_vars_map[op].insert(var_name); } } @@ -269,6 +269,11 @@ void EagerDeletionPass::ApplyImpl(ir::Graph *graph) const { } } + auto conditional_block_op_eager_deletion_pass = + ir::PassRegistry::Instance().Get( + "conditional_block_op_eager_deletion_pass"); + conditional_block_op_eager_deletion_pass->Apply(graph); + auto while_op_eager_deletion_pass = ir::PassRegistry::Instance().Get("while_op_eager_deletion_pass"); while_op_eager_deletion_pass->Apply(graph); @@ -288,5 +293,6 @@ REGISTER_PASS(eager_deletion_pass, paddle::framework::ir::EagerDeletionPass) .RequirePassAttr(paddle::framework::ir::kAllPlaces) .RequirePassAttr(paddle::framework::ir::kGarbageCollector); +USE_PASS(conditional_block_op_eager_deletion_pass); USE_PASS(while_op_eager_deletion_pass); USE_PASS(recurrent_op_eager_deletion_pass); diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/inplace_op_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/inplace_op_pass.cc deleted file mode 100644 index 1935f5e3..00000000 --- a/paddle/fluid/framework/ir/memory_optimize_pass/inplace_op_pass.cc +++ /dev/null @@ -1,487 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.h" -#include "paddle/fluid/framework/ir/pass.h" -#include "paddle/fluid/framework/op_info.h" - -// NOTE(dzhwinter): inplace means one op output variable reuse the input space. -// By our design, one operator only can read its input(const Variable), -// write its output(non-const Variable). If one operator is inplaced, means -// user have chance to write the space before reading happens. -// Especially when some optimize code writing style is applied. -// -// -// /* wrong case in operator */ -// /*In this case, a larger allocation is allocated, input content is lost*/ -// const Tensor* in = ctx.Input("In") -// Tensor* out = ctx.Output("Out"); -// auto* out_ptr = out->mutable_data(ctx.GetPlace()); -// out_ptr[0] = 0; // input contect is overwrited. - -// NOTE(dzhwinter): -// Only for backward compacity and stable. if enable_inplace_whitelist is turn -// on. -// only the ops in whitelist will be use inplace strategy. -// if not, all the op will be inplaced if it registered with InplaceClass -DEFINE_bool( - enable_inplace_whitelist, false, - "If this option turns on, only these op in whitelist can be inplaced." - "If it turns off, all of the running op can be candidate of inplaced op." - "Such as scale, elementwise_add" - "By default, it's turned off"); - -namespace paddle { -namespace framework { -namespace ir { - -// clang-format off -const std::string kInplacedOpWhiteList[] = { // NOLINT - "sigmoid", - "exp", - "relu", - "tanh", - "sqrt", - "ceil", - "floor", - "reciprocal", - "relu6", - "soft_relu", - "hard_sigmoid", - "batch_norm", - "batch_norm_grad", - "sum", - "sum_grad", - "scale", - "reshape", - "elementwise_add", - "elementwise_add_grad", -}; - -// FIXME(zjl): Shapes of in-out of some ops are exactly the same, -// but the static size during compiling time would be wrong. -// Use a flag to indicate such ops. Please fix me when found a better way. 
-static const std::unordered_set kSameShapeOpWhiteSet{ // NOLINT - "reshape2", "reshape2_grad" -}; -// clang-format on - -class InplacePass : public ir::Pass { - public: - InplacePass(); - - protected: - void ApplyImpl(ir::Graph *graph) const override; - - private: - // Collect vars that cannot be reused - // e.g.: subblock ops in/out, distributed ops in/out, op_role_var - void CollectSkipVars(ir::Graph *graph, - const std::vector &ops) const; - - // Check whether var_name should be skipped - bool IsSkipVar(const std::string &var_name) const; - - // Rename out with name of in, and guarantee that the graph is - // still a SSA graph - void RenameInOut(ir::Node *op, ir::Node *in, ir::Node *out) const; - - // Check whether var is the last version one in SSA graph - bool IsLastVersionVar(ir::Node *var) const; - - // Check whether var is the first version one in SSA graph - bool IsFirstVersionVar(ir::Node *var) const; - - // Check whether all `ops` is the preceding ops of `op` - bool CheckOpDeps(ir::Node *op, const std::vector &ops) const; - - // Find nodes whose names are equal to the given name - static std::unordered_set FindNodesByName( - const std::string &name, const std::vector &nodes); - - // Collect inputs and outputs of op_desc - static void CollectInputArgsOfOpDesc( - const OpDesc *op_desc, std::unordered_multiset *in_args); - - // Get all versions vars named var_name - std::vector *AllVersionVars(const std::string &var_name) const; - - private: - // SSA graph. var_name -> each version of vars - mutable std::map> ssa_map_; - - // Skip vars, including subblock ops in/out, distributed ops in/out, - // op_role_var - mutable std::unordered_set skip_vars_; - - // Op whitelist which should not peform inplace - // Only enabled when FLAGS_enable_inplace_whitelist is true. - mutable std::unordered_set whitelist_ops_; -}; - -InplacePass::InplacePass() { - if (FLAGS_enable_inplace_whitelist) { - for (auto &s : kInplacedOpWhiteList) { - whitelist_ops_.emplace(s); - } - } -} - -std::vector *InplacePass::AllVersionVars( - const std::string &var_name) const { - auto iter = ssa_map_.find(var_name); - PADDLE_ENFORCE(iter != ssa_map_.end(), "cannot find var %s in ssa graph", - var_name); - PADDLE_ENFORCE(!iter->second.empty(), "var %s is empty in ssa graph", - var_name); - return &(iter->second); -} - -bool InplacePass::IsSkipVar(const std::string &var_name) const { - return skip_vars_.count(var_name) > 0; -} - -bool InplacePass::IsFirstVersionVar(ir::Node *var) const { - return AllVersionVars(var->Name())->front() == var; -} - -bool InplacePass::IsLastVersionVar(ir::Node *var) const { - return AllVersionVars(var->Name())->back() == var; -} - -bool InplacePass::CheckOpDeps(ir::Node *op, - const std::vector &ops) const { - std::unordered_set other_ops(ops.begin(), ops.end()); - other_ops.erase(op); - if (other_ops.empty()) return true; - - // Traverse all preceding ops of op - std::queue queue; - std::unordered_set visited_ops; - queue.push(op); - visited_ops.insert(op); - - // Visit all preceding ops of `op`, and erase it from other_ops if it is - // inside other_ops. Return true only if other_ops is empty(), which means - // that all `ops` are preceding ops of `op`. 
- while (!queue.empty()) { - auto *cur_op = queue.front(); - queue.pop(); - - for (auto *in_var : cur_op->inputs) { - for (auto *in_op : in_var->inputs) { - if (visited_ops.count(in_op) != 0) { - continue; - } - - visited_ops.insert(in_op); - queue.push(in_op); - other_ops.erase(in_op); - if (other_ops.empty()) return true; - } - } - } - return false; -} - -void InplacePass::CollectSkipVars(ir::Graph *graph, - const std::vector &ops) const { - // 1. Collect op role vars - PADDLE_ENFORCE(graph->Has(kMemOptSkipVars), "Graph should have attr %s", - kMemOptSkipVars); - auto &mem_opt_whitelist = graph->Get(kMemOptSkipVars); - for (const auto &var : mem_opt_whitelist) { - skip_vars_.emplace(var); - } -} - -void InplacePass::RenameInOut(ir::Node *op, ir::Node *in_var, - ir::Node *out_var) const { - auto out_var_name = out_var->Name(); - auto in_var_name = in_var->Name(); - - auto &all_out_nodes = *AllVersionVars(out_var_name); - auto &all_in_nodes = *AllVersionVars(in_var_name); - - auto iter = std::find(all_out_nodes.begin(), all_out_nodes.end(), out_var); - PADDLE_ENFORCE(iter != all_out_nodes.end(), "Cannot find out var %s", - out_var_name); - - // The following codes are designed to guarantee that ssa_map_ is still - // an ssa graph after inplace is performed. - // Step 1: Rename the following versions of out_var as the name of in_var - // Step 2: Remove the following versions of out_var and append them to in_var - // Be careful that the inputs of input op of out_var should not be renamed, - // but outputs should be renamed. - auto original_iter = iter; - while (iter != all_out_nodes.end()) { - auto *node = *iter; - /* Step 1 */ - node->RenameVar(in_var_name); - if (iter != original_iter) { - for (auto *in : node->inputs) { - if (in->IsOp() && in->Op()) { - in->Op()->RenameOutput(out_var_name, in_var_name); - in->Op()->RenameInput(out_var_name, in_var_name); - in->Op()->Flush(); - } - } - } - - for (auto *out : node->outputs) { - if (out->IsOp() && out->Op()) { - out->Op()->RenameOutput(out_var_name, in_var_name); - out->Op()->RenameInput(out_var_name, in_var_name); - out->Op()->Flush(); - } - } - - /* Step 2 */ - all_in_nodes.emplace_back(node); - ++iter; - } - - /* Step 2 */ - all_out_nodes.erase(original_iter, all_out_nodes.end()); - - if (all_out_nodes.empty()) { - ssa_map_.erase(out_var_name); - } - op->Op()->RenameOutput(out_var_name, in_var_name); - op->Op()->Flush(); -} - -std::unordered_set InplacePass::FindNodesByName( - const std::string &name, const std::vector &nodes) { - std::unordered_set ret; - for (auto *node : nodes) { - if (node->Name() == name) { - ret.insert(node); - } - } - return ret; -} - -void InplacePass::CollectInputArgsOfOpDesc( - const OpDesc *op_desc, std::unordered_multiset *in_args) { - in_args->clear(); - for (auto &in_name : op_desc->InputArgumentNames()) { - in_args->insert(in_name); - } -} - -void InplacePass::ApplyImpl(ir::Graph *graph) const { - // Step 1: topo sort ops, collect skip vars - auto ops = ir::TopologySortOperations(*graph); - CollectSkipVars(graph, ops); - - // Step 2: build ssa var map - for (auto *op_node : ops) { - for (auto *in : op_node->inputs) { - PADDLE_ENFORCE(in->IsVar()); - // Only create a new var node when var first occurs in input of op. - if (ssa_map_.count(in->Name()) == 0) { - ssa_map_[in->Name()].emplace_back(in); - } - } - - // Always create a new var node for each output of op. 
- for (auto *out : op_node->outputs) { - PADDLE_ENFORCE(out->IsVar()); - ssa_map_[out->Name()].emplace_back(out); - } - } - - // Step 3: traverse ops and try inplace if possible - bool use_cuda = Get(kUseCuda); - VLOG(4) << "Inplace pass is applied when use_cuda = " - << (use_cuda ? "true" : "false"); - - for (auto *op_node : ops) { - PADDLE_ENFORCE_NOT_NULL(op_node->Op(), "op_desc is nullptr"); - - auto *op_desc = op_node->Op(); - auto op_type = op_desc->Type(); - - // Skip op inside whitelist - if (whitelist_ops_.count(op_type) > 0) { - continue; - } - - auto &infer_inplace = OpInfoMap::Instance().Get(op_type).infer_inplace_; - - if (!infer_inplace) { - continue; - } - - auto in_to_outs = infer_inplace(*op_desc, use_cuda); - if (in_to_outs.empty()) continue; - - std::unordered_multiset all_in_args; - CollectInputArgsOfOpDesc(op_desc, &all_in_args); - - for (auto &pair : in_to_outs) { - auto &in_param = pair.first; - auto &out_param = pair.second; - - auto &in_args = op_desc->Input(in_param); - auto &out_args = op_desc->Output(out_param); - - if (in_args.empty()) { - VLOG(4) << "Cannot inplace because Input(" << in_param - << ") is empty in " << op_type; - continue; - } - - if (out_args.empty()) { - VLOG(4) << "Cannot inplace because Output(" << out_param - << ") is empty in " << op_type; - continue; - } - - auto &in_arg = in_args[0]; - auto &out_arg = out_args[0]; - - if (IsSkipVar(in_arg)) { - VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg - << " is skipped in " << op_type; - continue; - } - - if (IsSkipVar(out_arg)) { - VLOG(4) << "Cannot inplace because Output(" << out_param - << ")=" << out_arg << " is skipped in " << op_type; - continue; - } - - if (in_arg == out_arg) { - VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg - << " is the same with Output(" << out_param << ")=" << out_arg - << " in " << op_type; - continue; - } - - size_t in_arg_occur_times = all_in_args.count(in_arg); - if (in_arg_occur_times > 1) { - VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg - << " occurs " << in_arg_occur_times << " times in input of op " - << op_type; - continue; - } - - auto in_nodes = FindNodesByName(in_arg, op_node->inputs); - PADDLE_ENFORCE(!in_nodes.empty(), "Input(%s)=%s cannot be found in op %s", - in_param, in_arg, op_type); - - if (in_nodes.size() > 1) { - VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg - << " occurs in other inputs of " << op_type; - continue; - } - - auto *in_node = *in_nodes.begin(); - - if (!NodeCanReused(in_node)) { - VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg - << " is not reusable in " << op_type; - continue; - } - - if (!IsLastVersionVar(in_node)) { - VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg - << " is not the last version in " << op_type; - continue; - } - - // If in_node is used as inputs of many ops, check whether all of that ops - // depends on op_node. If not, in_node cannot be inplaced. 
- if (in_node->outputs.size() > 1 && - !CheckOpDeps(op_node, in_node->outputs)) { - VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg - << " is not lastly used in " << op_type; - continue; - } - - auto out_nodes = FindNodesByName(out_arg, op_node->outputs); - PADDLE_ENFORCE(!out_nodes.empty(), - "Output(%s)=%s cannot be found in op %s", out_param, - out_arg, op_type); - - PADDLE_ENFORCE_EQ( - out_nodes.size(), 1, - "Wrong graph: Output(%s)=%s occurs in other outputs of op %s", - out_param, out_arg, op_type); - - if (!FindNodesByName(in_arg, op_node->outputs).empty()) { - VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg - << " occurs in output of op " << op_type; - continue; - } - - if (!FindNodesByName(out_arg, op_node->inputs).empty()) { - VLOG(4) << "Cannot inplace because Output(" << out_param - << ")=" << out_arg << " occurs in input of op " << op_type; - continue; - } - - auto *out_node = *out_nodes.begin(); - - if (!IsFirstVersionVar(out_node)) { - VLOG(4) << "Cannot inplace because Output(" << out_param - << ")=" << out_arg << " does not occur first in op " << op_type; - continue; - } - - if (!NodeCanReused(out_node)) { - VLOG(4) << "Cannot inplace because Output(" << out_param - << ")=" << out_arg << " is not reusable in " << op_type; - continue; - } - - if (in_node->Var()->GetType() != out_node->Var()->GetType()) { - VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg - << " is not the same type with " - << "Output(" << out_param << ")=" << out_arg << " in " - << op_type; - continue; - } - - if (NodeSize(*in_node->Var()) != NodeSize(*out_node->Var()) && - kSameShapeOpWhiteSet.count(op_desc->Type()) == 0) { - VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg - << " is not the same size with " - << "Output(" << out_param << ")=" << out_arg << " in " - << op_type; - continue; - } - - VLOG(4) << "Rename " << out_node->Name() << " with " << in_node->Name() - << " in " << op_type; - RenameInOut(op_node, in_node, out_node); - } - } -} - -} // namespace ir -} // namespace framework -} // namespace paddle - -REGISTER_PASS(inplace_pass, paddle::framework::ir::InplacePass) - .RequirePassAttr(paddle::framework::ir::kUseCuda); diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h index 0ceac791..4f6bacec 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h +++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h @@ -35,7 +35,11 @@ class MemOptVarInfo { return ref_cnt_ == 1 || (runtime_ref_cnt_.fetch_sub(1) == 1); } - void ResetRuntimeRefCnt() { runtime_ref_cnt_ = ref_cnt_; } + void ResetRuntimeRefCnt() { + if (ref_cnt_ != 1) { + runtime_ref_cnt_ = ref_cnt_; + } + } void SetRefCnt(size_t ref_cnt) { PADDLE_ENFORCE_GE(ref_cnt, 1, @@ -44,21 +48,48 @@ class MemOptVarInfo { runtime_ref_cnt_ = ref_cnt; } - bool IsSkipped() const { return skipped_; } + // Skip all memory optimization, including memory reuse and garbage collection + void SetSkipAllMemoryOptimization(bool is_skipped) { + skip_all_memory_optimization_ = is_skipped; + } + + bool IsSkippedAllMemoryOptimization() const { + return skip_all_memory_optimization_; + } + + // Skip all memory reuse, including inplace and cross op memory reuse + void SetSkipMemoryReuse(bool is_skipped) { skip_memory_reuse_ = is_skipped; } - void SetSkip(bool skipped) { skipped_ = skipped; } + 
bool IsSkippedMemoryReuse() const { + return skip_memory_reuse_ || skip_all_memory_optimization_; + } const std::string &Name() const { return name_; } private: std::string name_; + + /** + * ref_cnt_ is the total number of last-lived ops of variable. It would not + * be changed during iterations. + * + * runtime_ref_cnt_ is the runtime reference count of variable, which would + * decrease by 1 when each EagerDeletionOpHandle runs. As a result, it should + * be reset to ref_cnt_ after each iteration ends. Since operators are + * scheduled in many threads inside ParallelExecutor, runtime_ref_cnt_ + * must be an atomic integer to guarantee the thread safety and visibility. + * + * Specially, if ref_cnt_ is 1, we do not need to reset runtime_ref_cnt_ + * after iteration ends. + */ size_t ref_cnt_; std::atomic<size_t> runtime_ref_cnt_; - bool skipped_{false}; + bool skip_memory_reuse_{false}; + bool skip_all_memory_optimization_{false}; }; using MemOptVarInfoMapList = std::vector< - std::unordered_map>>; + std::unordered_map>>; class SkipMemOptVarsGuard { public: @@ -72,8 +103,9 @@ class SkipMemOptVarsGuard { for (auto &var : vars) { for (auto &map : *list_) { auto iter = map.find(var); - if (iter != map.end() && !iter->second->IsSkipped()) { - iter->second->SetSkip(true); + if (iter != map.end() && + !iter->second->IsSkippedAllMemoryOptimization()) { + iter->second->SetSkipAllMemoryOptimization(true); skip_vars_.emplace_back(iter->second.get()); } } @@ -82,7 +114,7 @@ class SkipMemOptVarsGuard { ~SkipMemOptVarsGuard() { for (auto *var : skip_vars_) { - var->SetSkip(false); + var->SetSkipAllMemoryOptimization(false); } if (list_ && need_reset_ref_cnt_) { diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.cc b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.cc deleted file mode 100644 index 0437de68..00000000 --- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.cc +++ /dev/null @@ -1,569 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/var_desc.h" -#include "paddle/fluid/platform/cpu_info.h" - -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/gpu_info.h" -#endif // PADDLE_WITH_CUDA - -namespace paddle { -namespace framework { -namespace ir { -using paddle::framework::VarDesc; - -std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph) { - PADDLE_ENFORCE(graph.Has(details::kStaleProgramOpDescs), - "Graph has no attribute of kStaleProgramOpDescs."); - // 1. get op desc order - auto& op_descs = - graph.Get<const std::vector<OpDesc*>>(details::kStaleProgramOpDescs); - - // 2. 
topology sort order - auto nodes = graph.Nodes(); - std::deque ops; - FilterVariables(nodes, [&](ir::Node* op) { - if (op->IsOp() && op->Op() != nullptr) { - ops.emplace_back(op); - } - }); - std::unordered_map op_deps; - std::list ready_ops; - std::unordered_map> pending_ops; - - for (auto* op : ops) { - std::unordered_set preceding_op; - for (auto* in : op->inputs) { - if (in->inputs.empty()) continue; - PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp()); - preceding_op.emplace(in->inputs[0]); - pending_ops[in->inputs[0]].emplace(op); - } - op_deps[op] = preceding_op.size(); - if (preceding_op.empty()) { - ready_ops.emplace_back(op); - } - } - - // 3. generated op list based desc order and the topology order - std::vector ret; - std::list op_descs_list(op_descs.begin(), op_descs.end()); - - auto update_by_found_node = [&](ir::Node* found_node) { - for (auto* pending_op : pending_ops[found_node]) { - if (--op_deps[pending_op] == 0) { - ready_ops.emplace_back(pending_op); - } - } - ready_ops.remove(found_node); - ret.emplace_back(found_node); - }; - - while (!ready_ops.empty()) { - bool all_of_ready_op_unmatched = true; - for (auto it = op_descs_list.begin(); it != op_descs_list.end();) { - auto op_desc = *it; - ir::Node* found_node = nullptr; - for (auto* op : ready_ops) { - if (IsSameDesc(op->Op(), op_desc)) { - found_node = op; - break; - } - } - - // 3.1 op desc deleted by other pass - if (found_node == nullptr) { - ++it; - continue; - } else { - all_of_ready_op_unmatched = false; - it = op_descs_list.erase(it); - } - update_by_found_node(found_node); - } - - // 3.2 op descs are added by other pass - // preceding op non empty means some new op descs are - // created, but not contained in return node list. - // these new op desc may depend on each other. - std::list prev_ready_ops(ready_ops); - if (all_of_ready_op_unmatched) { - for (auto op : prev_ready_ops) { - update_by_found_node(op); - } - } - } - - PADDLE_ENFORCE(std::all_of( - op_deps.begin(), op_deps.end(), - [&](const std::pair& p) { return p.second == 0; })); - - return ret; -} - -size_t NodeSize(const VarDesc& node) { - auto shape = node.GetShape(); - int size = - std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); - size_t type_size = SizeOfType(node.GetDataType()); - return type_size * std::abs(size); -} - -size_t NodeSize(ir::Node* n) { return NodeSize(*(n->Var())); } - -std::string DebugStringImpl(VarDesc* var) { - std::stringstream ss; - ss << var->Name(); - ss << "["; - try { - auto shape = var->GetShape(); - for (size_t i = 0; i < shape.size(); ++i) { - if (i != shape.size() - 1) { - ss << shape[i] << ","; - } else { - ss << shape[i]; - } - } - ss << "]"; - } catch (...) { - ss << "Var has no VarDesc !!! Name:" << var->Name(); - } - return ss.str(); -} - -std::string DebugString(ir::Node* var) { - return DebugStringImpl(GetVarDesc(var)); -} - -// NOTE(dzh): based ir node, if a large node has been reused -// by a small size node, then next time it appear in pool, it will -// have the small size. Find the original node shap from blockdesc. 
-VarDesc* GetVarDesc(ir::Node* n) { - PADDLE_ENFORCE(n->IsVar() && !n->IsCtrlVar() && n->inputs.size() == 1); - return n->Var(); -} - -struct NodeComparator { - bool operator()(ir::Node* lhs, ir::Node* rhs) const { - if (lhs->Var()->GetType() != rhs->Var()->GetType()) return false; - auto* lhs_desc = GetVarDesc(lhs); - auto* rhs_desc = GetVarDesc(rhs); - // match data type - if (lhs_desc->GetDataType() != rhs_desc->GetDataType()) { - return false; - } - // match shape - auto lhs_shape = lhs_desc->GetShape(); - auto rhs_shape = rhs_desc->GetShape(); - if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) || - (lhs_shape[0] != -1 && rhs_shape[0] != -1)) { - return NodeSize(lhs) == NodeSize(rhs); - } else { - return false; - } - } -}; - -void OrderedSet::Insert(ir::Node* var) { - PADDLE_ENFORCE(var->IsVar() && !var->IsCtrlVar()); - if (mark_table_.count(var->Name()) != 0) { - mark_table_[var->Name()]->emplace_back(var); - return; - } - - auto* var_desc = var->Var(); - auto var_shape = var_desc->GetShape(); - int batch_size = static_cast(var_shape[0]); - - NodeComparator functor; - Iter it = nodes_.begin(); - while (it != nodes_.end()) { - auto& prev = it->front(); - auto* cache_desc = GetVarDesc(prev); - int cache_batch_size = cache_desc->GetShape()[0]; - if ((cache_batch_size == -1 && batch_size == -1) || - (cache_batch_size != -1 && batch_size != -1)) { - if (functor(prev, var)) { - ++it; - } else { - break; - } - } else if (cache_batch_size == -1 && batch_size != -1) { - ++it; - } else if (cache_batch_size != -1 && batch_size == -1) { - break; - } - } - - it = nodes_.insert(it, {var}); - mark_table_[var->Name()] = it; -} - -int OrderedSet::GetNodeIndexInPool(ir::Node* var) { - return std::distance(nodes_.begin(), mark_table_[var->Name()]); -} - -ir::Node* OrderedSet::FindBestFitNode(ir::Node* var) const { - ir::Node* found_node = nullptr; - NodeComparator functor; - - for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { - auto& candidate = it->front(); - if (functor(var, candidate)) { - found_node = candidate; - break; - } - } - return found_node; -} - -ir::Node* OrderedSet::FindNextBestFitNode(ir::Node* var, ir::Node* prev) const { - ir::Node* found_node = nullptr; - NodeComparator functor; - auto it = - std::find_if(nodes_.begin(), nodes_.end(), [&](const NodeVector& v) { - if (v.front() == prev) - return true; - else - return false; - }); - PADDLE_ENFORCE(it != nodes_.end(), "Not found previous in node list!"); - for (it = std::next(it); it != nodes_.end(); ++it) { - auto& candidate = it->front(); - if (functor(var, candidate)) { - found_node = candidate; - break; - } - } - return found_node; -} - -bool OrderedSet::Has(ir::Node* var) const { - if (mark_table_.count(var->Name())) { - auto& node_in_samename = mark_table_.at(var->Name()); - auto iter = - std::find_if(node_in_samename->begin(), node_in_samename->end(), - [&](ir::Node* n) { return n->Name() == var->Name(); }); - return iter != node_in_samename->end(); - } - return false; -} - -void OrderedSet::Erase(const std::string& var) { - PADDLE_ENFORCE(mark_table_.count(var)); - nodes_.erase(mark_table_[var]); - mark_table_.erase(var); -} - -void OrderedSet::Erase(ir::Node* var) { - PADDLE_ENFORCE(var != nullptr); - Erase(var->Name()); -} - -std::string OrderedSet::ToString() const { - std::stringstream ss; - for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { - for (auto& node : *it) { - ss << DebugString(node) << " "; - } - } - return ss.str(); -} - -bool NodeCanReused(ir::Node* node) { - // valid the node is a var node - // 
vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad - if (node == nullptr || !node->IsVar() || node->IsCtrlVar() || - node->Name() == kEmptyVarName) - return false; - - bool flag = true; - // op output force generated in cpu, can not be reused. - for (auto* op : node->inputs) { - if (op->Op()->HasAttr("force_cpu")) { - flag &= framework::AttrReader(op->Op()->GetAttrMap()) - .Get("force_cpu") == 0; - } - } - // var desc validation. - flag &= NodeCanReused(*node->Var()); - return flag; -} - -int MinChunkSize() { - int size{0}; -#ifdef PADDLE_WITH_CUDA - size = platform::GpuMinChunkSize(); -#else - size = platform::CpuMinChunkSize(); -#endif // PADDLE_WITH_CUDA - return size; -} - -bool NodeCanReused(const VarDesc& node) { - auto type = node.GetType(); - // only these types holds bulk of gpu memory - // FIXME(liuwei1031) did not find good ways to test SELECTED_ROWS and - // LOD_TENSOR_ARRAY re-use logic, - // disable them in version 1.4 - // if (!(type == proto::VarType::LOD_TENSOR || - // type == proto::VarType::SELECTED_ROWS || - // type == proto::VarType::LOD_TENSOR_ARRAY)) { - // return false; - // } - if (type != proto::VarType::LOD_TENSOR) return false; - - // persistable variable is parameter - if (node.Persistable()) { - return false; - } - // shape < min_chunk_size is meaningless. - // further more, fetched loss always has size = 1 - // which should not be reused. - auto shape = node.GetShape(); - int size = std::abs( - std::accumulate(shape.begin(), shape.end(), 1, std::multiplies())); - if (shape.empty() || size < MinChunkSize()) { - return false; - } - return true; -} - -bool OpHasSubBlock(OpDesc* desc) { - const AttributeMap& attrs = desc->GetAttrMap(); - for (auto& attr : attrs) { - if (attr.second.type() == typeid(BlockDesc*) || // NOLINT - attr.second.type() == typeid(std::vector)) // NOLINT - return true; - } - return false; -} - -ControlFlowGraph::ControlFlowGraph(const ir::Graph& graph) { - ops_ = SortOpLikeDescOrder(graph); - ConnectNodes(); -} - -void ControlFlowGraph::BuildCFGGraph() { - // FIXME(dzh): same effect with ConnectNodes, but use the control - // link to build dependency graph, it goes wrong in transformer. - for (ir::Node* op : ops_) { - for (auto& input_var : op->inputs) { - if (!input_var->inputs.empty()) { - PADDLE_ENFORCE( - input_var->inputs.size() == 1 && input_var->inputs[0]->IsOp(), - "Preceding Op Node of Var Node must be unique"); - auto* pred_op = input_var->inputs[0]; - if (pred_op->Op() != nullptr) { - predecessors_[op].insert(pred_op); - successors_[pred_op].insert(op); - } - } - if (input_var->IsVar() && !input_var->IsCtrlVar()) { - uses_[op].insert(input_var->Name()); - } - } - for (auto& output_var : op->outputs) { - // output var may be used by many op - for (auto* succ_op : output_var->outputs) { - if (succ_op->Op() != nullptr) { - successors_[op].insert(succ_op); - predecessors_[succ_op].insert(op); - } - } - if (output_var->IsVar() && !output_var->IsCtrlVar()) { - defs_[op].insert(output_var->Name()); - } - } - } -} - -void ControlFlowGraph::ConnectNodes() { - for (size_t i = 0; i < ops_.size(); ++i) { - auto& op = ops_[i]; - try { - auto& next_op = ops_.at(i + 1); - successors_[op].insert(next_op); - predecessors_[next_op].insert(op); - } catch (...) 
{ - // do nothing - } - - FilterVariables(op->inputs, - [&](ir::Node* var) { uses_[op].emplace(var->Name()); }); - - FilterVariables(op->outputs, - [&](ir::Node* var) { defs_[op].emplace(var->Name()); }); - } -} - -void ControlFlowGraph::LiveVariableAnalysis() { - // NOTE(dzh): variable liveless analysis (a.k.a reversed_ops algorithm) - // compute the liveness of for each variable though reversed_ops algorithm. - // It iterates the operators from end to begin, compute the live in/live out - // variable set for each op, then the diff between in/out will be used for - // the variable reuse. For detail refer to - // http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf - std::list work_list(ops_.rbegin(), ops_.rend()); - while (!work_list.empty()) { - ir::Node* op = work_list.front(); - work_list.pop_front(); - // get the live_in calculated before. Empty if first. - auto prev_live_in = std::move(live_in_[op]); - for (auto& s : successors_[op]) { - for (auto& var : live_in_[s]) { - live_out_[op].insert(var); - } - } - for (auto& var : uses_[op]) { - live_in_[op].insert(var); - } - for (auto& var : live_out_[op]) { - live_in_[op].insert(var); - } - for (auto& var : defs_[op]) { - if (uses_[op].count(var)) continue; - live_in_[op].erase(var); - } - - // If the live_in is not changed, then the liveness analysis of - // predecessors is completed. - // - // Otherwise, recalculate the predecessors liveness - if (live_in_[op] != prev_live_in) { - for (auto& pre : predecessors_[op]) { - work_list.push_back(pre); - } - } - } - - for (auto* op : ops_) { - unlived_vars_[op] = std::set(); - for (auto& var : this->LiveIn(op)) { - if (!this->LiveOut(op).count(var)) { - unlived_vars_[op].insert(var); - } - } - } -} - -void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node, - const std::string& new_node, - int begin_idx) { - std::vector need_update(ops_.size(), false); - // update graph from begin idx to the end - for (size_t i = begin_idx; i != ops_.size(); ++i) { - auto* op = ops_[i]; - if (uses_[op].find(old_node) != uses_[op].end()) { - uses_[op].erase(old_node); - uses_[op].insert(new_node); - } - if (defs_[op].find(old_node) != defs_[op].end()) { - defs_[op].erase(old_node); - defs_[op].insert(new_node); - } - if (live_in_[op].find(old_node) != live_in_[op].end()) { - live_in_[op].erase(old_node); - live_in_[op].insert(new_node); - need_update[i] = true; - } - if (live_out_[op].find(old_node) != live_out_[op].end()) { - live_out_[op].erase(old_node); - live_out_[op].insert(new_node); - need_update[i] = true; - } - } - - for (size_t i = begin_idx; i < ops_.size(); ++i) { - if (!need_update[i]) continue; - auto* op = ops_[i]; - for (auto& var : this->LiveIn(op)) { - if (!this->LiveOut(op).count(var)) { - unlived_vars_[op].insert(var); - } - } - } -} - -const std::set& ControlFlowGraph::LiveIn(ir::Node* op) const { - auto it = live_in_.find(op); - PADDLE_ENFORCE( - it != live_in_.end(), - string::Sprintf("Expect %s in live_in, but Not Found.", op->Name())); - return it->second; -} - -const std::set& ControlFlowGraph::LiveOut(ir::Node* op) const { - auto it = live_out_.find(op); - PADDLE_ENFORCE( - it != live_out_.end(), - string::Sprintf("Expect %s in live_out, but Not Found.", op->Name())); - return it->second; -} - -const std::set& ControlFlowGraph::Use(ir::Node* op) const { - auto it = uses_.find(op); - PADDLE_ENFORCE( - it != uses_.end(), - string::Sprintf("Expect %s in use, but Not Found.", op->Name())); - return it->second; -} - -const std::set& 
ControlFlowGraph::Unlived(ir::Node* op) const { - auto it = unlived_vars_.find(op); - PADDLE_ENFORCE( - it != unlived_vars_.end(), - string::Sprintf("Expect %s in unlived_set, but Not Found.", op->Name())); - return it->second; - return it->second; -} - -const std::vector& ControlFlowGraph::Ops() const { return ops_; } - -std::vector& ControlFlowGraph::Ops() { return ops_; } - -ir::Node* ControlFlowGraph::GetNodeByName(const std::string& name, - ir::Node* op) const { - // in ssa-graph, different version nodes have same name, - // this function get the latest version var before target op - // It may return nullptr, such as data node. - ir::Node* found_node = nullptr; - for (auto* node : ops_) { - if (node == op) break; - for (auto& output : node->outputs) { - PADDLE_ENFORCE((output != nullptr && output->IsVar()), - "Output is empty!"); - if (output->Var() && output->Name() == name) { - found_node = output; - } - } - } - return found_node; -} - -} // namespace ir -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h deleted file mode 100644 index cf9f4ef4..00000000 --- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h +++ /dev/null @@ -1,189 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/ir/graph.h" - -namespace paddle { -namespace framework { -namespace ir { - -/// this attribute is used to avoid some core variables removed/reused -/// in memory optimize related passes -constexpr char kMemOptSkipVars[] = "@MEM_OPT_SKIP_VARS@"; -typedef std::unordered_set MemOptSkipVars; - -constexpr char kUseCuda[] = "use_cuda"; - -std::vector SortOpLikeDescOrder(const ir::Graph& graph); - -// NOTE(dzh): A ordered set for node reuse in memory optimize. -// the orderedset sort node in ascend order(by node bytes size). -// in fluid, -1 means the batch_size, which is determined in runtime. -// So the reuse happens between nodes who's batch_size both are -1 -// simultaneously or not. -// -// sort rule: -// rule 0 : smaller node ranking in front. -// rule 1 : batch_size equal -1 ranking in the front than the node not. -// -// For example, -// node0[-1, 1] node1[-1, 1, 1], node2[1,1], node3[1,1024], .. - -class OrderedSet { - public: - // nodes with same name exists in pool. - using NodeVector = std::vector; - using Iter = typename std::list::iterator; - using ConstIter = typename std::list::const_iterator; - - void Insert(ir::Node* var); - void Erase(ir::Node* var); - void Erase(const std::string& var); - bool Has(ir::Node* var) const; - void Clear() { - mark_table_.clear(); - nodes_.clear(); - } - // find the bestfit shape node block with var. 
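FindBestFitNode only pairs vars that the comparator accepts: matching data type, batch dimension either -1 in both or static in both, and equal byte counts. A hedged standalone version of that predicate, with the ir::Node/VarDesc pair reduced to an illustrative VarMeta struct:

    #include <cstdint>
    #include <vector>

    struct VarMeta {
      int dtype;                   // numeric tag for the element type
      std::vector<int64_t> shape;  // shape[0] == -1 means runtime batch size
      int64_t bytes;               // |prod(shape)| * sizeof(dtype)
    };

    // Mirrors NodeComparator: same dtype, same "dynamic batch" category,
    // and identical byte size.
    bool CanShareMemory(const VarMeta& a, const VarMeta& b) {
      if (a.dtype != b.dtype) return false;
      bool a_dyn = !a.shape.empty() && a.shape[0] == -1;
      bool b_dyn = !b.shape.empty() && b.shape[0] == -1;
      if (a_dyn != b_dyn) return false;
      return a.bytes == b.bytes;
    }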
- ir::Node* FindBestFitNode(ir::Node* var) const; - ir::Node* FindNextBestFitNode(ir::Node* var, ir::Node* prev) const; - // map store non-const iterator, can not promise const - int GetNodeIndexInPool(ir::Node* var); - // pool all node to string - std::string ToString() const; - - Iter begin() { return nodes_.begin(); } - Iter end() { return nodes_.end(); } - ConstIter begin() const { return nodes_.begin(); } - ConstIter end() const { return nodes_.end(); } - - size_t size() const { return nodes_.size(); } - - private: - // for searching. - std::unordered_map mark_table_; - // node pool - std::list nodes_; -}; - -class ControlFlowGraph { - public: - ControlFlowGraph() = default; - // IR Graph - explicit ControlFlowGraph(const ir::Graph& graph); - - void LiveVariableAnalysis(); - - void RenameVarInCFGGraph(const std::string& old_node, - const std::string& new_node, int begin_idx); - - const std::set& LiveIn(ir::Node* op) const; - const std::set& LiveOut(ir::Node* op) const; - const std::set& Use(ir::Node* op) const; - const std::set& Unlived(ir::Node* op) const; - const std::vector& Ops() const; - std::vector& Ops(); - - // for ssa-graph nodes - ir::Node* GetNodeByName(const std::string& name, ir::Node* op) const; - - private: - void BuildCFGGraph(); - void ConnectNodes(); - - using NodeListMap = std::unordered_map>; - using VarSetMap = std::map>; - // successors ops use the output variables. - NodeListMap successors_; - // predecessors ops generated input variables. - NodeListMap predecessors_; - // variables lived before run current op. - VarSetMap live_in_; - // variables lived after run current op. - VarSetMap live_out_; - VarSetMap uses_; // op inputs - VarSetMap defs_; // op outputs - std::unordered_map> unlived_vars_; - - std::vector ops_; // op sequence by topology sort -}; - -// valid a tensor can be reuse or not -bool NodeCanReused(ir::Node* node); - -// valid a tensor can be reuse or not. -bool NodeCanReused(const VarDesc& node); - -// check op has subblock or not -bool OpHasSubBlock(OpDesc* desc); - -// node memory size in bytes -size_t NodeSize(ir::Node* n); - -// node memory size in bytes -size_t NodeSize(const VarDesc&); - -std::string DebugString(ir::Node* var); - -VarDesc* GetVarDesc(ir::Node* n); - -static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) { - return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() && - op1->Outputs() == op2->Outputs(); -} - -template -class FilterVariableImpl { - public: - void operator()(const Container& nodes, Callback callback) { - for (auto* node : nodes) { - callback(node); - } - } -}; - -// filter var node for op->inputs/outputs -template -class FilterVariableImpl, Callback> { - public: - void operator()(const std::vector& nodes, Callback callback) { - for (auto* var : nodes) { - if (var->IsVar() && !var->IsCtrlVar()) { - callback(var); - } - } - } -}; - -template -void FilterVariables(const Container& nodes, Callback callback) { - FilterVariableImpl()(nodes, callback); -} - -} // namespace ir -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper_test.cc b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper_test.cc deleted file mode 100644 index d38facd0..00000000 --- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper_test.cc +++ /dev/null @@ -1,525 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include "glog/logging.h" -#include "gtest/gtest.h" -#include "paddle/fluid/framework/details/graph_test_base.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" - -namespace paddle { -namespace framework { -namespace ir { - -TEST(OrderedSet, Normal) { - OrderedSet pool; - std::vector> nodes; - - // clang-format off - std::vector> shapes = {{-1, 10}, - {-1, 20}, - {1, 2}, - {5, 2}, - {10, 20}, - {-1, 2, 5}, - {-1, 1, 5}, - {-1, 1}}; - // clang-format on - const int COUNT = shapes.size(); - ProgramDesc prog; - BlockDesc* block_desc = prog.MutableBlock(0); - auto* op_desc = block_desc->AppendOp(); - op_desc->SetType("dummy"); - std::unique_ptr op = ir::CreateNodeForTest(op_desc); - - for (int i = 0; i < COUNT; ++i) { - auto desc = block_desc->Var(std::to_string(i)); - desc->SetShape(shapes[i]); - std::unique_ptr node = ir::CreateNodeForTest(desc); - node->inputs.emplace_back(op.get()); - nodes.emplace_back(std::move(node)); - } - - // Insert - for (auto& node : nodes) { - pool.Insert(node.get()); - } - - // Has/size - ASSERT_EQ(pool.size(), shapes.size()); - for (auto& node : nodes) { - ASSERT_TRUE(pool.Has(node.get())); - } - - // assert its order and interface. 
- std::cout << pool.ToString() << std::endl; - pool.Erase(nodes.front().get()); - std::cout << pool.ToString() << std::endl; - - ASSERT_EQ(pool.size(), static_cast(COUNT - 1)); - ASSERT_EQ(pool.GetNodeIndexInPool(nodes.back().get()), 0); - - { - auto v1 = block_desc->Var("11"); - v1->SetShape({-1, 256, 56, 56}); - std::unique_ptr node1 = ir::CreateNodeForTest(v1); - node1->inputs.emplace_back(op.get()); - auto* cache = pool.FindBestFitNode(node1.get()); - ASSERT_EQ(cache, nullptr); - } - { - auto v2 = block_desc->Var("12"); - v2->SetShape({-1, 2, 5}); - std::unique_ptr node1 = ir::CreateNodeForTest(v2); - node1->inputs.emplace_back(op.get()); - auto* cache = pool.FindBestFitNode(node1.get()); - ASSERT_EQ(pool.GetNodeIndexInPool(cache), 2); // match 6:[-1,2,5] - } - { - auto v3 = block_desc->Var("13"); - v3->SetShape({2, 5}); - std::unique_ptr node1 = ir::CreateNodeForTest(v3); - node1->inputs.emplace_back(op.get()); - auto* cache = pool.FindBestFitNode(node1.get()); - ASSERT_EQ(pool.GetNodeIndexInPool(cache), 5); // match 4:[5,2] - } -} - -TEST(OrderedSet, FindBestFitNode) { - OrderedSet pool; - std::vector> nodes; - ProgramDesc prog; - BlockDesc* block_desc = prog.MutableBlock(0); - auto* op_desc = block_desc->AppendOp(); - op_desc->SetType("dummy"); - std::unique_ptr op = ir::CreateNodeForTest(op_desc); - - { - auto desc = block_desc->Var("a"); - desc->SetShape({128, 128}); - std::unique_ptr node = ir::CreateNodeForTest(desc); - node->inputs.emplace_back(op.get()); - nodes.emplace_back(std::move(node)); - } - { - auto desc = block_desc->Var("b"); - desc->SetShape({128, 129}); - std::unique_ptr node = ir::CreateNodeForTest(desc); - node->inputs.emplace_back(op.get()); - nodes.emplace_back(std::move(node)); - } - { - auto desc = block_desc->Var("c"); - desc->SetShape({128, 128}); - std::unique_ptr node = ir::CreateNodeForTest(desc); - node->inputs.emplace_back(op.get()); - nodes.emplace_back(std::move(node)); - } - - for (auto& node : nodes) { - pool.Insert(node.get()); - } - - auto* n = nodes[0].get(); - auto* cache = pool.FindBestFitNode(n); - ASSERT_TRUE(cache->Name() == "a" || cache->Name() == "c"); - auto* cache_b = pool.FindNextBestFitNode(n, cache); - ASSERT_TRUE(cache_b->Name() != cache->Name()); - ASSERT_TRUE(cache_b->Name() == "a" || cache_b->Name() == "c"); - cache = pool.FindNextBestFitNode(n, cache_b); - ASSERT_TRUE(cache == nullptr); -} - -} // namespace ir -} // namespace framework -} // namespace paddle - -REGISTER_OPERATOR(sum, paddle::framework::DummyOp, - paddle::framework::SumOpMaker, - paddle::framework::DummyVarTypeInference); -REGISTER_OPERATOR(assign, paddle::framework::DummyOp, - paddle::framework::AssignOpMaker, - paddle::framework::DummyVarTypeInference); -REGISTER_OPERATOR(dummy, paddle::framework::DummyOp, - paddle::framework::SumOpMaker, - paddle::framework::DummyVarTypeInference); -/* - https://en.wikipedia.org/wiki/Live_variable_analysis - Create a customed classical dependency graph, left row is the instruction - number. - 1. a = 1 - 2. b = a - 3. c = a - 4. d = b + c - 5. 
e = d - - a--------+ - | | - b c - | | - d--------+ - | - e - Then analysis these variable's liveness range - */ - -namespace paddle { -namespace framework { -namespace ir { - -inline static ProgramDesc FillProgramDesc() { - ProgramDesc prog; - prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("d")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("e")->SetType(proto::VarType::LOD_TENSOR); - { - auto* op = prog.MutableBlock(0)->AppendOp(); - op->SetType("assign"); - op->SetInput("X", {"a"}); - op->SetOutput("Out", {"b"}); - } - { - auto* op = prog.MutableBlock(0)->AppendOp(); - op->SetType("assign"); - op->SetInput("X", {"a"}); - op->SetOutput("Out", {"c"}); - } - { - auto* op = prog.MutableBlock(0)->AppendOp(); - op->SetType("sum"); - op->SetInput("X", {"b", "c"}); - op->SetOutput("Out", {"d"}); - } - { - auto* op = prog.MutableBlock(0)->AppendOp(); - op->SetType("assign"); - op->SetInput("X", {"d"}); - op->SetOutput("Out", {"e"}); - } - return prog; -} - -TEST(CFGGraph, IRGraph) { - // prepare ir graph - auto prog = FillProgramDesc(); - ir::Graph graph(prog); - - ControlFlowGraph cfg(graph); - cfg.LiveVariableAnalysis(); - - // test assign op - ASSERT_TRUE((std::set{"a"} == cfg.LiveIn(cfg.Ops()[0]))); - ASSERT_TRUE((std::set{"a", "b"} == cfg.LiveOut(cfg.Ops()[0]))); - - // test assign op - ASSERT_TRUE((std::set{"a", "b"} == cfg.LiveIn(cfg.Ops()[1]))); - ASSERT_TRUE((std::set{"b", "c"} == cfg.LiveOut(cfg.Ops()[1]))); - - // test sum op - ASSERT_TRUE((std::set{"b", "c"} == cfg.LiveIn(cfg.Ops()[2]))); - ASSERT_TRUE((std::set{"d"} == cfg.LiveOut(cfg.Ops()[2]))); - - // test assign op - ASSERT_TRUE((std::set{"d"} == cfg.LiveIn(cfg.Ops()[3]))); - ASSERT_TRUE((std::set{} == cfg.LiveOut(cfg.Ops()[3]))); -} - -// 1. normal test -TEST(SortOpLikeDescOrder, NormalTest) { - auto prog = FillProgramDesc(); - ir::Graph graph(prog); - - auto nodes = SortOpLikeDescOrder(graph); - auto op_descs = prog.Block(0).AllOps(); - for (size_t i = 0; i < nodes.size(); ++i) { - auto node = nodes[i]; - auto op_desc = op_descs[i]; - ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); - } -} - -// 2. remove some op_desc -TEST(SortOpLikeDescOrder, RemoveOpDesc) { - auto prog = FillProgramDesc(); - ir::Graph graph(prog); - auto nodes = graph.Nodes(); - auto op_descs = prog.Block(0).AllOps(); - ir::Node* found_node = nullptr; - for (auto node : nodes) { - if (node->IsOp() && node->outputs.back()->Name() == "e") { - found_node = node; - break; - } - } - PADDLE_ENFORCE(found_node != nullptr); - for (auto it = op_descs.begin(); it != op_descs.end();) { - if (IsSameDesc(*it, found_node->Op())) { - it = op_descs.erase(it); - } else { - ++it; - } - } - - auto find_node_in_graph = [&](std::string s) { - ir::Node* ret = nullptr; - for (auto n : graph.Nodes()) { - if (n->Name() == s) { - ret = n; - break; - } - } - PADDLE_ENFORCE(ret != nullptr); - return ret; - }; - - ir::Node* e = find_node_in_graph("e"); - ir::Node* d = find_node_in_graph("d"); - std::remove(d->outputs.begin(), d->outputs.end(), found_node); - graph.RemoveNode(found_node); - graph.RemoveNode(e); - - // other node keeps the same order - auto remain_nodes = SortOpLikeDescOrder(graph); - for (size_t i = 0; i < remain_nodes.size(); ++i) { - auto node = remain_nodes[i]; - auto op_desc = op_descs[i]; - ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); - } -} - -// 3. 
add some op_desc -TEST(SortOpLikeDescOrder, AddOpDesc) { - auto prog = FillProgramDesc(); - ir::Graph graph(prog); - - auto find_node_in_graph = [&](std::string s) { - ir::Node* ret = nullptr; - for (auto n : graph.Nodes()) { - if (n->Name() == s) { - ret = n; - break; - } - } - PADDLE_ENFORCE(ret != nullptr); - return ret; - }; - - // cached desc different with real one - // mimic the intermidiete pass modify the programdesc. - std::vector op_descs = graph.OriginProgram().Block(0).AllOps(); - - auto op = prog.MutableBlock(0)->AppendOp(); - prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); - op->SetType("sum"); - op->SetInput("X", {"b", "c"}); - op->SetOutput("Out", {"d1"}); - ir::Node* node = graph.CreateOpNode(op); - ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); - ir::Node* b = find_node_in_graph("b"); - ir::Node* c = find_node_in_graph("c"); - node->outputs.emplace_back(d1); - node->inputs.emplace_back(b); - node->inputs.emplace_back(c); - d1->inputs.emplace_back(node); - b->outputs.emplace_back(node); - c->outputs.emplace_back(node); - op_descs.insert(op_descs.begin() + 4, op); - - auto nodes = SortOpLikeDescOrder(graph); - - for (size_t i = 0; i < nodes.size(); ++i) { - auto node = nodes[i]; - auto op_desc = op_descs[i]; - ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); - } -} - -// 4. add and delete some op_desc -TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) { - auto prog = FillProgramDesc(); - ir::Graph graph(prog); - - auto find_node_in_graph = [&](std::string s) { - ir::Node* ret = nullptr; - for (auto n : graph.Nodes()) { - if (n->Name() == s) { - ret = n; - break; - } - } - PADDLE_ENFORCE(ret != nullptr); - return ret; - }; - - std::vector op_descs = graph.OriginProgram().Block(0).AllOps(); - - // remove sum node - ir::Node* found_node = nullptr; - auto nodes = graph.Nodes(); - for (auto node : nodes) { - if (node->Name() == "sum") { - found_node = node; - break; - } - } - PADDLE_ENFORCE(found_node != nullptr); - for (auto it = op_descs.begin(); it != op_descs.end();) { - if (IsSameDesc(*it, found_node->Op())) { - it = op_descs.erase(it); - } else { - ++it; - } - } - { - ir::Node* d = find_node_in_graph("d"); - ir::Node* c = find_node_in_graph("c"); - ir::Node* e = find_node_in_graph("e"); - std::remove(d->outputs.begin(), d->outputs.end(), found_node); - std::remove(c->outputs.begin(), c->outputs.end(), found_node); - ir::Node* pending_op = found_node->outputs[0]->outputs[0]; - graph.RemoveNode(e); - graph.RemoveNode(pending_op); - graph.RemoveNode(found_node); - } - - // add node - auto op = prog.MutableBlock(0)->AppendOp(); - prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); - op->SetType("sum"); - op->SetInput("X", {"b", "c"}); - op->SetOutput("Out", {"d1"}); - { - ir::Node* node = graph.CreateOpNode(op); - ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); - ir::Node* b = find_node_in_graph("b"); - ir::Node* c = find_node_in_graph("c"); - node->outputs.emplace_back(d1); - node->inputs.emplace_back(b); - node->inputs.emplace_back(c); - b->outputs.emplace_back(node); - c->outputs.emplace_back(node); - } - op_descs.insert(op_descs.begin() + 2, op); - - // check the order - auto mynodes = SortOpLikeDescOrder(graph); - for (size_t i = 0; i < mynodes.size(); ++i) { - auto node = mynodes[i]; - auto op_desc = op_descs[i]; - ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); - } -} - -// 5. add and replace some op_desc inplace. 
-TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) { - auto prog = FillProgramDesc(); - ir::Graph graph(prog); - std::vector op_descs = graph.OriginProgram().Block(0).AllOps(); - - auto find_node_in_graph = [&](std::string s) { - ir::Node* ret = nullptr; - for (auto n : graph.Nodes()) { - if (n->Name() == s) { - ret = n; - break; - } - } - PADDLE_ENFORCE(ret != nullptr); - return ret; - }; - - // add node - auto op = prog.MutableBlock(0)->AppendOp(); - prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); - op->SetType("sum"); - op->SetInput("X", {"b", "c"}); - op->SetOutput("Out", {"d1"}); - { - ir::Node* node = graph.CreateOpNode(op); - ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); - ir::Node* b = find_node_in_graph("b"); - ir::Node* c = find_node_in_graph("c"); - node->outputs.emplace_back(d1); - node->inputs.emplace_back(b); - node->inputs.emplace_back(c); - d1->inputs.emplace_back(node); - b->outputs.emplace_back(node); - c->outputs.emplace_back(node); - } - - op_descs.emplace_back(op); - - // replace op_desc inplace - auto nodes = graph.Nodes(); - ir::Node* found_node = nullptr; - for (auto node : nodes) { - if (node->IsOp() && node->Op() && node->Name() == "assign") { - if (node->outputs.size() == 1 && node->outputs[0]->Name() == "e") { - found_node = node; - break; - } - } - } - { - ir::Node* d = find_node_in_graph("d"); - ir::Node* e = find_node_in_graph("e"); - std::remove(d->outputs.begin(), d->outputs.end(), found_node); - std::remove(e->inputs.begin(), e->inputs.end(), found_node); - graph.RemoveNode(found_node); - } - op_descs.erase(op_descs.begin() + 3); - - auto replace_op = prog.MutableBlock(0)->AppendOp(); - replace_op->SetType("sum"); - replace_op->SetInput("X", {"d", "d1"}); - replace_op->SetOutput("Out", {"e"}); - { - ir::Node* sum2 = graph.CreateOpNode(replace_op); - ir::Node* e = find_node_in_graph("e"); - ir::Node* d = find_node_in_graph("d"); - ir::Node* d1 = find_node_in_graph("d1"); - sum2->inputs.emplace_back(d); - sum2->inputs.emplace_back(d1); - sum2->outputs.emplace_back(e); - e->inputs.emplace_back(sum2); - d->outputs.emplace_back(sum2); - d1->outputs.emplace_back(sum2); - } - - op_descs.emplace_back(replace_op); - // compare op order - auto graph_nodes = SortOpLikeDescOrder(graph); - for (size_t i = 0; i < graph_nodes.size(); ++i) { - auto node = graph_nodes[i]; - auto op_desc = op_descs[i]; - ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); - } -} - -} // namespace ir -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.cc deleted file mode 100644 index af3fbb28..00000000 --- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.cc +++ /dev/null @@ -1,224 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
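The pass deleted below implements a greedy best-fit strategy over a pool of dead variables: walk ops in execution order, try to hand each reusable output a buffer from the pool, and return vars that die at the current op to the pool. A self-contained toy of the same flow (exact-size matching stands in for the dtype/shape test; all names are illustrative):

    #include <cassert>
    #include <map>
    #include <string>
    #include <vector>

    int main() {
      std::multimap<long, std::string> pool;  // byte size -> dead buffer
      long fresh_bytes = 0, reused_bytes = 0;
      struct Step { long out_size; long freed_size; };  // freed_size 0 = none
      std::vector<Step> ops = {{1024, 0}, {1024, 1024}, {256, 1024}, {1024, 0}};
      for (const Step& op : ops) {
        auto it = pool.find(op.out_size);     // best fit: exact size here
        if (it != pool.end()) { reused_bytes += op.out_size; pool.erase(it); }
        else { fresh_bytes += op.out_size; }
        if (op.freed_size) pool.emplace(op.freed_size, "dead");
      }
      // Only the last 1024-byte output finds a dead buffer of its size.
      assert(reused_bytes == 1024 && fresh_bytes == 2304);
    }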
- -#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "gflags/gflags.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_helper.h" - -namespace paddle { -namespace framework { -namespace ir { - -void MemoryOptimizePass::ApplyImpl(ir::Graph* graph) const { - CollectSkipVarsSet(graph); - - cfg_.reset(new ControlFlowGraph(*graph)); - cfg_->LiveVariableAnalysis(); - InitSSAGraphNodes(); - - int reuse_id = 0; - for (size_t idx = 0; idx < cfg_->Ops().size(); ++idx) { - auto& op = cfg_->Ops()[idx]; - auto* op_desc = op->Op(); - // some op in graph has no op desc - if (op_desc == nullptr) continue; - - for (auto& var : op->outputs) { - if (var->IsVar() && !var->IsCtrlVar() && skip_set_.count(var->Name())) { - VLOG(3) << "Skip set contains variable of " << var->Name() - << "disable reuse on it. skipped"; - continue; - } - if (NodeCanReused(var) && cfg_->Use(op).count(var->Name()) == 0) { - ir::Node* cache = pool_.FindBestFitNode(var); - while (cache != nullptr && var->Name() == cache->Name()) { - VLOG(3) << "The same cache variable is cascade reused. " - << cache->Name() << " is re-filled to the pool after " - << "the reused op is finished. Current op can not " - << "replace it again. Skip this candidate."; - cache = pool_.FindNextBestFitNode(var, cache); - } - - if (cache != nullptr) { - int node_idx_in_pool = pool_.GetNodeIndexInPool(cache); - VLOG(3) << string::Sprintf( - "!!! %s, %s => %s, cache idx %d, pool size %d", - std::to_string(reuse_id++), DebugString(var), DebugString(cache), - node_idx_in_pool, static_cast(pool_.size())); - // NOTE(dzhwinter): update the ProgramDesc/IR Graph - // and the CFG Graph on the fly. - // - // IR Graph define the dependence relationship between nodes. - // - // ProgramDesc defines the input/output vars. Its used in - // CreateOp, CreateVar when running happens. - // - // CFG Graph store the liveness information, when reuse happens - // we also need to update the variable liveness. 
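The liveness information mentioned above comes from the textbook backward dataflow computation (see the course notes linked in LiveVariableAnalysis): iterate ops from last to first until a fixed point of

    live_in(op)  = use(op) ∪ (live_out(op) − def(op))
    live_out(op) = ∪ { live_in(s) : s ∈ successors(op) }

For the sum op in the tests earlier in this patch, use = {b, c} and def = {d}, so live_in = {b, c} and live_out = {d}, which is exactly what the CFGGraph.IRGraph test asserts.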
- const std::string var_name = var->Name(); - const std::string cache_name = cache->Name(); - - cfg_->RenameVarInCFGGraph(var_name, cache_name, idx); - RenameVarInGraphDesc(var_name, cache_name, idx); - RenameVarInGraphNode(var_name, cache_name, idx, graph); - pool_.Erase(cache_name); - } - } - } - // fill the pool - for (auto& var : cfg_->Unlived(op)) { - ir::Node* var_node = cfg_->GetNodeByName(var, op); - if (var_node == nullptr || var_node->IsCtrlVar()) continue; - if (NodeCanReused(var_node) && !pool_.Has(var_node)) { - pool_.Insert(var_node); - } - } - } - graph->ResolveHazard(var_nodes_); -} - -void MemoryOptimizePass::CollectSkipVarsSet(ir::Graph* graph) const { - // fill skip_set_ - PADDLE_ENFORCE(graph->Has(kMemOptSkipVars)); - auto& mem_opt_whitelist = graph->Get(kMemOptSkipVars); - for (const auto& var : mem_opt_whitelist) { - skip_set_.emplace(var); - } -} - -void MemoryOptimizePass::RenameVarInGraphDesc(const std::string& var, - const std::string& cache_var, - size_t idx) const { - for (size_t i = idx; i < cfg_->Ops().size(); ++i) { - auto* op = cfg_->Ops()[i]; - PADDLE_ENFORCE(op->IsOp() && op->Op()); - auto* op_desc = op->Op(); - op_desc->RenameInput(var, cache_var); - op_desc->RenameOutput(var, cache_var); - if (op_desc->Block() != nullptr) { - op_desc->Block()->RemoveVar(var); - } else { - LOG(WARNING) << "op " << op->Name() << " not know its block." - << "Is the op_desc created without block pointer? " - << "Can not find " << var << " in Block(0)"; - } - op_desc->Flush(); - } -} - -void MemoryOptimizePass::InitSSAGraphNodes() const { - std::unordered_map> all_vars; - if (var_nodes_.empty()) { - for (auto* op : cfg_->Ops()) { - for (auto* node : op->inputs) { - if (all_vars[node->Name()].count(node) == 0) { - all_vars[node->Name()].emplace(node); - var_nodes_[node->Name()].emplace_back(node); - } - } - for (auto* node : op->outputs) { - if (all_vars[node->Name()].count(node) == 0) { - all_vars[node->Name()].emplace(node); - var_nodes_[node->Name()].emplace_back(node); - } - } - } - } -} - -void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var, - const std::string& cache_var, - size_t idx, - ir::Graph* graph) const { - // if replace happens, we need to create a newer version cache_var - // but use the same dims/data_type with var. 
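RenameVarInGraphDesc above rewrites every op from position idx onward, leaving earlier ops untouched because they ran on the old buffer's live range. A self-contained toy of that forward-only rename (ToyOp and RenameFrom are illustrative, not framework types):

    #include <cassert>
    #include <initializer_list>
    #include <string>
    #include <vector>

    struct ToyOp { std::vector<std::string> in, out; };

    // From op idx onward, every read/write of `var` becomes `cache_var`.
    void RenameFrom(std::vector<ToyOp>* ops, size_t idx,
                    const std::string& var, const std::string& cache_var) {
      for (size_t i = idx; i < ops->size(); ++i) {
        for (auto* names : {&(*ops)[i].in, &(*ops)[i].out})
          for (auto& n : *names)
            if (n == var) n = cache_var;
      }
    }

    int main() {
      std::vector<ToyOp> ops = {{{"a"}, {"b"}}, {{"b"}, {"c"}}, {{"c"}, {"d"}}};
      RenameFrom(&ops, 1, "c", "b");  // from op 1 on, "c" lives in "b"'s buffer
      assert(ops[1].out[0] == "b" && ops[2].in[0] == "b");
    }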
- PADDLE_ENFORCE(var_nodes_[var].size() >= 1 && - var_nodes_[var].at(0)->Var() != nullptr); - std::unique_ptr var_desc(new VarDesc(*var_nodes_[var].at(0)->Var())); - var_desc->SetName(cache_var); - - for (size_t i = idx; i < cfg_->Ops().size(); ++i) { - auto* op = cfg_->Ops()[i]; - - // redirect the input to the latest version of cache_var - for (auto* node : op->inputs) { - if (node->Name() == var) { - ir::Node* cache_node = var_nodes_[cache_var].back(); - - // swap node to cache_node - cache_node->outputs.insert(cache_node->outputs.end(), - node->outputs.begin(), node->outputs.end()); - PADDLE_ENFORCE(node->inputs.size() == 1 && node->inputs[0]->IsOp()); - auto* prev_op = node->inputs[0]; - std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node, - cache_node); - for (auto* next_op : node->outputs) { - std::replace(next_op->inputs.begin(), next_op->inputs.end(), node, - cache_node); - } - - // erase unused node - auto& nodes = var_nodes_.at(var); - nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end()); - graph->RemoveNode(node); - } - } - - // if we need to rename the output, - // always create a newer version of cache_var - for (auto* node : op->outputs) { - if (node->Name() == var) { - ir::Node* cache_node = graph->CreateVarNode(var_desc.get()); - var_nodes_[cache_var].emplace_back(cache_node); - - // swap node to cache node - cache_node->outputs.insert(cache_node->outputs.end(), - node->outputs.begin(), node->outputs.end()); - cache_node->inputs.emplace_back(op); - std::replace(op->outputs.begin(), op->outputs.end(), node, cache_node); - for (auto* next_op : node->outputs) { - std::replace(next_op->inputs.begin(), next_op->inputs.end(), node, - cache_node); - } - - // erase unused node - auto& nodes = var_nodes_.at(var); - nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end()); - graph->RemoveNode(node); - } - } - } -} - -} // namespace ir -} // namespace framework -} // namespace paddle - -REGISTER_PASS(memory_optimize_pass, paddle::framework::ir::MemoryOptimizePass) - .RequireGraphAttr(paddle::framework::details::kStaleProgramOpDescs); diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.h b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.h deleted file mode 100644 index eef289ef..00000000 --- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.h +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h" -#include "paddle/fluid/framework/ir/pass.h" - -namespace paddle { -namespace framework { -namespace ir { - -class MemoryOptimizePass : public ir::Pass { - protected: - void ApplyImpl(ir::Graph* graph) const override; - // fill the variable map(var_nodes) by version. - void InitSSAGraphNodes() const; - - private: - // update program descs - void RenameVarInGraphDesc(const std::string& var, - const std::string& cache_var, size_t idx) const; - // update ir nodes - void RenameVarInGraphNode(const std::string& var, - const std::string& cache_var, size_t idx, - ir::Graph* graph) const; - - void SubGraphOptimize(OpDesc* op_desc) const; - // 1. scan op with subblock and collect the output/input vars. - // while, while_grad, conditional_block - // 2. scan distributed ops and collect the output/input vars - // 3. op_role_vars - void CollectSkipVarsSet(ir::Graph* graph) const; - - private: - // Reuse Node Pool, Owned. - mutable OrderedSet pool_; - // controlflow Graph - mutable std::unique_ptr cfg_; - // skip set - mutable std::unordered_set skip_set_; - // var nodes - mutable std::map> var_nodes_; -}; - -} // namespace ir -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc index 9a8e2530..20c7968d 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h" +#include #include #include #include @@ -23,39 +24,23 @@ namespace paddle { namespace framework { namespace ir { -// Each ShareTensorBufferOpHandle should only have one pending -// ComputationOpHandle -static details::ComputationOpHandle *GetUniquePendingComputationOpHandle( - details::ShareTensorBufferOpHandle *share_tensor_op) { - details::ComputationOpHandle *result_op = nullptr; - for (Node *out_var : share_tensor_op->Node()->outputs) { - for (Node *pending_op : out_var->outputs) { - auto &op = pending_op->Wrapper(); - auto *compute_op = dynamic_cast(&op); - PADDLE_ENFORCE_NOT_NULL(compute_op); - - if (result_op == nullptr) { - result_op = compute_op; - } else { - PADDLE_ENFORCE_EQ(result_op, compute_op); - } - } - } - - PADDLE_ENFORCE_NOT_NULL(result_op); - return result_op; -} - void MemoryReusePass::ApplyImpl(Graph *graph) const { graph_ = graph; + use_cuda_ = Get(kUseCuda); all_vars_ = &(graph_->Get(details::kGraphVars)); var_infos_ = &(Get(kMemOptVarInfoMapList)); last_live_ops_of_vars_ = &(Get>(kLastLiveOpsOfVars)); - reused_var_names_.resize(all_vars_->size()); + reused_in_var_names_.resize(all_vars_->size()); + reused_out_var_names_.resize(all_vars_->size()); var_descs_.resize(all_vars_->size()); + pinned_var_set_ = nullptr; + if (graph->Has(details::kPinnedVars)) { + pinned_var_set_ = &graph->Get(details::kPinnedVars); + } + // Collect the existing ShareTensorBufferOpHandles. 
// This is because (1) we want to reuse the existing // ShareTensorBufferOpHandles to avoid inserting too many ops; @@ -82,7 +67,7 @@ bool MemoryReusePass::TryReuseVar(details::VarHandle *in_var, auto *op = dynamic_cast(out_var->GeneratedOp()); PADDLE_ENFORCE_NOT_NULL(op); - if (IsVarsReusable(in_var, out_var)) { + if (IsVarPairReusable(*in_var, *out_var)) { AddReuseVar(op, in_var, out_var); return true; } else { @@ -101,28 +86,39 @@ std::unordered_set MemoryReusePass::FindNodesByName( return ret; } -VarDesc *MemoryReusePass::GetVarDesc(details::VarHandle *var) const { - auto iter = var_descs_[var->scope_idx()].find(var->Name()); - if (iter == var_descs_[var->scope_idx()].end()) { - PADDLE_ENFORCE((*all_vars_)[var->scope_idx()].count(var->Name()), - "Variable %s not found", var->Name()); - auto *desc = - TryGetLatestVarDesc((*all_vars_)[var->scope_idx()].at(var->Name())); +VarDesc *MemoryReusePass::GetVarDesc(const details::VarHandle &var) const { + const auto var_name = var.Name(); + size_t scope_idx = var.scope_idx(); + auto iter = var_descs_[scope_idx].find(var_name); + if (iter == var_descs_[scope_idx].end()) { + PADDLE_ENFORCE((*all_vars_)[scope_idx].count(var_name), + "Variable %s not found", var_name); + auto *desc = TryGetLatestVarDesc((*all_vars_)[scope_idx].at(var_name)); PADDLE_ENFORCE_NOT_NULL(desc); - var_descs_[var->scope_idx()].emplace(var->Name(), desc); + var_descs_[scope_idx].emplace(var_name, desc); return desc; } else { return iter->second; } } +int64_t MemoryReusePass::GetMemorySize(const details::VarHandle &var) const { + auto *var_desc = GetVarDesc(var); + auto shapes = var_desc->GetShape(); + auto sizeof_dtype = static_cast(SizeOfType(var_desc->GetDataType())); + return std::accumulate(shapes.begin(), shapes.end(), static_cast(1), + std::multiplies()) * + sizeof_dtype; +} + void MemoryReusePass::CollectShareTensorBufferOpHandles() const { auto all_ops = FilterByNodeWrapper(*graph_); for (auto *op : all_ops) { auto *share_buffer_op = dynamic_cast(op); if (share_buffer_op != nullptr) { - auto *compute_op = GetUniquePendingComputationOpHandle(share_buffer_op); + auto *compute_op = + details::GetUniquePendingComputationOpHandle(share_buffer_op); PADDLE_ENFORCE(ops_.count(compute_op) == 0); ops_.emplace(compute_op, share_buffer_op); } @@ -131,14 +127,28 @@ void MemoryReusePass::CollectShareTensorBufferOpHandles() const { void MemoryReusePass::CollectReusedVars() const { for (auto &pair : ops_) { - auto reused_vars = pair.second->ReusedVarSet(); - reused_var_names_[pair.first->GetScopeIdx()].insert(reused_vars.begin(), - reused_vars.end()); + auto reused_vars = pair.second->ReusedVars(); + for (auto &reused_var_pair : reused_vars) { + reused_in_var_names_[pair.first->GetScopeIdx()].insert( + reused_var_pair.first); + reused_out_var_names_[pair.first->GetScopeIdx()].insert( + reused_var_pair.second); + } } } -bool MemoryReusePass::IsVarAlreadyReused(details::VarHandle *var) const { - return reused_var_names_[var->scope_idx()].count(var->Name()) > 0; +bool MemoryReusePass::IsInVarAlreadyReused( + const details::VarHandle &in_var) const { + const auto var_name = in_var.Name(); + size_t scope_idx = in_var.scope_idx(); + return reused_in_var_names_[scope_idx].count(var_name) > 0; +} + +bool MemoryReusePass::IsOutVarAlreadyReused( + const details::VarHandle &out_var) const { + const auto var_name = out_var.Name(); + size_t scope_idx = out_var.scope_idx(); + return reused_out_var_names_[scope_idx].count(var_name) > 0; } details::ShareTensorBufferOpHandle * @@ -171,57 +181,123 @@ 
MemoryReusePass::InsertShareTensorBufferOpHandleToGraph( return buffer_share_op; } -bool MemoryReusePass::IsVarsReusable(details::VarHandle *in_var, - details::VarHandle *out_var) const { - const auto in_name = in_var->Name(); - const auto out_name = out_var->Name(); +/** + * Input var is reusable only when: + * - it is not an empty var. + * - it has not been reused. If an input var is reused twice or more, + * the calculation result may be wrong. + * - it is not a persistable var. + * - it is a LoDTensor. We can support SelectedRows in the future. + */ +bool MemoryReusePass::IsInVarReusable(const details::VarHandle &in_var) const { + if (in_var.Name() == kEmptyVarName) { + return false; + } - if (in_name == out_name) { + if (IsInVarAlreadyReused(in_var)) { return false; } - if (in_name == kEmptyVarName || out_name == kEmptyVarName) { + const VarDesc *in_var_desc = GetVarDesc(in_var); + + if (IsPinnedVar(*in_var_desc)) { return false; } - if (IsVarAlreadyReused(in_var)) { + if (in_var_desc->GetType() != proto::VarType::LOD_TENSOR) { + return false; + } + + return true; +} + +/** + * Output var is reusable only when: + * - it is not an empty var. + * - it is the first version var. Otherwise, the var may be overwritten + * in the second batch, which results in a wrong calculation result. + * This is especially critical when + * ExecutionStrategy::num_iteration_per_drop_scope_ > 1. + * - it has not reused another var's memory. It is not necessary to do memory + * reuse twice for the same var. + * - it is not a persistable var. + * - it is a LoDTensor. We can support SelectedRows in the future. + * - it does not occur in inputs of the generated op. This can happen when + * an op has the same var as both input and output. + */ +bool MemoryReusePass::IsOutVarReusable( + const details::VarHandle &out_var) const { + PADDLE_ENFORCE_NOT_NULL(dynamic_cast<details::ComputationOpHandle *>( + out_var.GeneratedOp())); + const auto out_name = out_var.Name(); + if (out_name == kEmptyVarName) { return false; } // out_var must be the first version!!! - auto out_var_iter = (*all_vars_)[out_var->scope_idx()].find(out_name); - PADDLE_ENFORCE(out_var_iter != (*all_vars_)[out_var->scope_idx()].end() && + auto out_var_iter = (*all_vars_)[out_var.scope_idx()].find(out_name); + PADDLE_ENFORCE(out_var_iter != (*all_vars_)[out_var.scope_idx()].end() && !out_var_iter->second.empty(), "Cannot find variable %s", out_name); - if (out_var_iter->second[0] != out_var) { + if (out_var_iter->second[0] != &out_var) { return false; } - const VarDesc *in_var_desc = GetVarDesc(in_var); - const VarDesc *out_var_desc = GetVarDesc(out_var); + if (IsOutVarAlreadyReused(out_var)) { + return false; + } - if (in_var_desc->Persistable() || out_var_desc->Persistable()) { + const VarDesc *out_var_desc = GetVarDesc(out_var); + if (IsPinnedVar(*out_var_desc)) { return false; } - if (in_var_desc->GetType() != proto::VarType::LOD_TENSOR || - out_var_desc->GetType() != proto::VarType::LOD_TENSOR) { + if (out_var_desc->GetType() != proto::VarType::LOD_TENSOR) { return false; } - if (!FindNodesByName(in_name, out_var->GeneratedOp()->Node()->outputs) + // If out_name occurs in the inputs of the generated op, it cannot reuse others.
+ if (!FindNodesByName(out_name, out_var.GeneratedOp()->Node()->inputs) .empty()) { return false; } - if (!FindNodesByName(out_name, out_var->GeneratedOp()->Node()->inputs) - .empty()) { + return true; +} + +bool MemoryReusePass::IsPinnedVar(const VarDesc &var_desc) const { + return var_desc.Persistable() || + (pinned_var_set_ && pinned_var_set_->count(var_desc.Name())); +} + +/** + * Input-Output pair can be reused only when: + * - they are not the same var. + * - they are both reusable. + * - the input var does not occur in the outputs of the op. + * - the input var does not occur in the inputs of the op more than once. + */ +bool MemoryReusePass::IsVarPairReusable( + const details::VarHandle &in_var, const details::VarHandle &out_var) const { + auto *op = + dynamic_cast<details::ComputationOpHandle *>(out_var.GeneratedOp()); + PADDLE_ENFORCE_NOT_NULL(op); + + const auto in_name = in_var.Name(); + if (in_name == out_var.Name()) { + return false; + } + + if (!IsInVarReusable(in_var) || !IsOutVarReusable(out_var)) { return false; } - auto all_input_args = - out_var->GeneratedOp()->Node()->Op()->InputArgumentNames(); + if (!FindNodesByName(in_name, op->Node()->outputs).empty()) { + return false; + } + + auto all_input_args = op->Node()->Op()->InputArgumentNames(); if (std::count(all_input_args.begin(), all_input_args.end(), in_name) > 1) { return false; } @@ -249,10 +325,11 @@ void MemoryReusePass::AddReuseVar(details::ComputationOpHandle *op, share_buffer_op->AddInput(in_var); } - share_buffer_op->Add( + share_buffer_op->AddReuseVarPair( (*var_infos_)[op->GetScopeIdx()].at(in_var->Name()).get(), out_var->Name()); - reused_var_names_[op->GetScopeIdx()].insert(in_var->Name()); + reused_in_var_names_[op->GetScopeIdx()].insert(in_var->Name()); + reused_out_var_names_[op->GetScopeIdx()].insert(out_var->Name()); UpdateLastLiveOpOfVar(op, in_var, out_var); } @@ -265,14 +342,21 @@ void MemoryReusePass::UpdateLastLiveOpOfVar(details::ComputationOpHandle *op, size_t scope_idx = op->GetScopeIdx(); auto out_var_op_iter = (*last_live_ops_of_vars_)[scope_idx].find(out_var->Name()); - PADDLE_ENFORCE(out_var_op_iter != (*last_live_ops_of_vars_)[scope_idx].end(), - "Cannot find variable %s", out_var->Name()); - PADDLE_ENFORCE(!out_var_op_iter->second.empty()); - - auto &last_live_ops_of_in_var = - (*last_live_ops_of_vars_)[scope_idx][in_var->Name()]; - last_live_ops_of_in_var.clear(); - last_live_ops_of_in_var.insert(*(out_var_op_iter->second.begin())); + + // In Reduce mode, some output variables (gradients of parameters) do not + // have last live ops + details::ComputationOpHandle *last_live_op_of_in_var = nullptr; + if (out_var_op_iter == (*last_live_ops_of_vars_)[scope_idx].end()) { + last_live_op_of_in_var = op; + } else { + PADDLE_ENFORCE(!out_var_op_iter->second.ops().empty()); + last_live_op_of_in_var = *(out_var_op_iter->second.ops().begin()); + } + + auto *last_live_ops_of_in_var = + (*last_live_ops_of_vars_)[scope_idx][in_var->Name()].mutable_ops(); + last_live_ops_of_in_var->clear(); + last_live_ops_of_in_var->insert(last_live_op_of_in_var); auto in_var_info_iter = (*var_infos_)[scope_idx].find(in_var->Name()); PADDLE_ENFORCE(in_var_info_iter != (*var_infos_)[scope_idx].end(), diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h index f706b48e..82274419 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h +++ b/paddle/fluid/framework/ir/memory_optimize_pass/memory_reuse_pass.h @@ -81,18 +81,26 @@ class MemoryReusePass : public
Pass { bool TryReuseVar(details::VarHandle *in_var, details::VarHandle *out_var) const; - std::unordered_set FindNodesByName( - const std::string &name, const std::vector &nodes) const; + bool IsInVarReusable(const details::VarHandle &in_var) const; + + bool IsOutVarReusable(const details::VarHandle &out_var) const; + + std::unordered_set FindNodesByName( + const std::string &name, const std::vector &nodes) const; size_t ScopeNum() const { return all_vars_->size(); } + int64_t GetMemorySize(const details::VarHandle &var) const; + private: - VarDesc *GetVarDesc(details::VarHandle *var) const; + VarDesc *GetVarDesc(const details::VarHandle &var) const; + + bool IsVarPairReusable(const details::VarHandle &in_var, + const details::VarHandle &out_var) const; - bool IsVarsReusable(details::VarHandle *in_var, - details::VarHandle *out_var) const; + bool IsInVarAlreadyReused(const details::VarHandle &in_var) const; - bool IsVarAlreadyReused(details::VarHandle *var) const; + bool IsOutVarAlreadyReused(const details::VarHandle &out_var) const; details::ShareTensorBufferOpHandle *InsertShareTensorBufferOpHandleToGraph( details::ComputationOpHandle *op) const; @@ -110,17 +118,24 @@ class MemoryReusePass : public Pass { private: mutable Graph *graph_; + mutable bool use_cuda_; + mutable details::GraphVars *all_vars_; mutable MemOptVarInfoMapList *var_infos_; + mutable std::vector *last_live_ops_of_vars_; mutable std::unordered_map ops_; - mutable std::vector> reused_var_names_; + mutable std::vector> reused_in_var_names_; + mutable std::vector> reused_out_var_names_; mutable std::vector> var_descs_; + mutable details::PinnedVars *pinned_var_set_; + + bool IsPinnedVar(const VarDesc &out_var_desc) const; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc b/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc index 6b7249b1..d2cc89a2 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.cc @@ -66,6 +66,24 @@ const std::unordered_set &OpGraphView::PendingOps( return pending_ops_.at(op); } +const std::unordered_set &OpGraphView::PrecedingOps( + details::OpHandleBase *op) const { + EnforceHasOp(op); + return preceding_ops_.at(op); +} + +std::unordered_map +OpGraphView::GetPrecedingDepNum() const { + std::unordered_map result; + result.reserve(preceding_ops_.size()); + for (auto &pair : preceding_ops_) { + result.emplace(pair.first, pair.second.size()); + } + return result; +} + +size_t OpGraphView::OpNumber() const { return preceding_ops_.size(); } + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h b/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h index afd29091..86b25c13 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h +++ b/paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h @@ -33,13 +33,24 @@ class OpGraphView { const std::unordered_set &PendingOps( details::OpHandleBase *op) const; + const std::unordered_set &PrecedingOps( + details::OpHandleBase *op) const; + + std::unordered_map GetPrecedingDepNum() + const; + bool HasOp(details::OpHandleBase *op) const; + size_t OpNumber() const; + // Use a visitor to visit all pending ops of op // Stop when callback returns false template bool VisitAllPendingOps(details::OpHandleBase *op, Callback &&callback) const; + template + void BreadthFirstVisit(Callback &&callback) const; 
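A self-contained mirror of the traversal declared above (its templated implementation appears later in this header): Kahn's algorithm seeded with the zero-in-degree ops, releasing each successor once its last predecessor has been visited. Plain ints stand in for OpHandleBase pointers:

    #include <queue>
    #include <vector>

    // pending[u] lists the nodes that depend on u (u's pending ops).
    template <typename Callback>
    void BreadthFirstVisitDemo(const std::vector<std::vector<int>>& pending,
                               Callback&& cb) {
      std::vector<size_t> deps(pending.size(), 0);  // in-degree per node
      for (const auto& outs : pending)
        for (int v : outs) ++deps[v];
      std::queue<int> ready;
      for (size_t i = 0; i < deps.size(); ++i)
        if (deps[i] == 0) ready.push(static_cast<int>(i));
      while (!ready.empty()) {
        int u = ready.front();
        ready.pop();
        cb(u);  // visit exactly once, in breadth-first order
        for (int v : pending[u])
          if (--deps[v] == 0) ready.push(v);
      }
    }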
+ private: void Build(const std::vector &ops); void EnforceHasOp(details::OpHandleBase *op) const; @@ -75,6 +86,52 @@ bool OpGraphView::VisitAllPendingOps(details::OpHandleBase *op, return true; } +template +void OpGraphView::BreadthFirstVisit(Callback &&callback) const { + auto op_deps = GetPrecedingDepNum(); + size_t op_num = op_deps.size(); + + std::unordered_set visited_ops; + std::queue ready_ops; + size_t num_calls = 0; + for (auto iter = op_deps.begin(); iter != op_deps.end();) { + if (iter->second != 0) { + ++iter; + continue; + } + + visited_ops.insert(iter->first); + ready_ops.push(iter->first); + callback(iter->first); + ++num_calls; + op_deps.erase(iter++); + } + + while (!ready_ops.empty()) { + auto *cur_op = ready_ops.front(); + ready_ops.pop(); + + auto &pending_ops = PendingOps(cur_op); + for (auto *pending_op : pending_ops) { + if (visited_ops.count(pending_op) > 0) { + continue; + } + + if (--op_deps.at(pending_op) == 0) { + visited_ops.insert(pending_op); + op_deps.erase(pending_op); + ready_ops.push(pending_op); + callback(pending_op); + ++num_calls; + } + } + } + + PADDLE_ENFORCE_EQ(num_calls, op_num, "There are unvisited ops"); + PADDLE_ENFORCE_EQ(visited_ops.size(), op_num, "There are unvisited ops"); + PADDLE_ENFORCE(op_deps.empty(), "There are unvisited ops"); +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/record_skip_memory_opt_vars_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/record_skip_memory_opt_vars_pass.cc deleted file mode 100644 index 040b769f..00000000 --- a/paddle/fluid/framework/ir/memory_optimize_pass/record_skip_memory_opt_vars_pass.cc +++ /dev/null @@ -1,170 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h" -#include "paddle/fluid/framework/ir/pass.h" -#include "paddle/fluid/framework/op_proto_maker.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace framework { -namespace ir { - -class RecordSkipMemoryOptVarsPass : public ir::Pass { - protected: - void ApplyImpl(ir::Graph* graph) const override { - PADDLE_ENFORCE(!graph->Has(kMemOptSkipVars)); - graph->Set(kMemOptSkipVars, new MemOptSkipVars); - auto& skip_vars = graph->Get(kMemOptSkipVars); - - std::vector op_nodes; - for (auto& node : graph->Nodes()) { - PADDLE_ENFORCE_NOT_NULL(node, "The node should not be nullptr."); - if (node->IsOp() && node->Op()) { - op_nodes.emplace_back(node); - } - } - - // Insert kEmptyVarName to avoid optimizing empty variable - skip_vars.insert(framework::kEmptyVarName); - - // NOTE(zcd): Insert OpRoleVars to SkipVarSet to prevent the vars are rename - // in memory optimize pass. 
- InsertOpRoleVarsToSkipVarSet(op_nodes, &skip_vars); - - InsertSkipMemOptOpInOutToSkipVarSet(op_nodes, &skip_vars); - } - - private: - static void InsertOpRoleVarsToSkipVarSet(const std::vector& ops, - MemOptSkipVars* skip_vars) { - for (auto& node : ops) { - try { - auto op_role_vars = - boost::get>(node->Op()->GetNullableAttr( - OpProtoAndCheckerMaker::OpRoleVarAttrName())); - PADDLE_ENFORCE_EQ(op_role_vars.size() % 2, 0); - for (size_t i = 0; i < op_role_vars.size(); i += 2) { - auto& g_name = op_role_vars[i + 1]; - skip_vars->insert(g_name); - } - } catch (boost::bad_get& e) { - } - } - } - - static void UpdateSkipVarSet( - MemOptSkipVars* skip_vars, - const std::vector>& var_names) { - for (auto& var_name : var_names) { - skip_vars->insert(var_name.begin(), var_name.end()); - } - } - - static std::vector ToGradVarName( - const std::vector& names) { - std::vector ret; - ret.reserve(names.size()); - for (auto& name : names) { - if (name != framework::kEmptyVarName) { - ret.emplace_back(framework::GradVarName(name)); - } - } - return ret; - } - - static void InsertSkipMemOptOpInOutToSkipVarSet( - const std::vector& ops, MemOptSkipVars* skip_vars) { - static std::unordered_set kSkipMemOptOps{ - "send", "recv", "prefetch", "send_barrier", "fetch_barrier"}; - - for (auto& node : ops) { - auto* op_desc = node->Op(); - // Some ops (while, conditional_block, recurrent, etc.) have sub-blocks. - // These ops often use variables from its parent or forward blocks. - // Optimizing in/out of such ops would make these variables cannot - // be found when running sub-block ops. - if (OpHasSubBlock(op_desc)) { - UpdateSkipVarSet(skip_vars, {op_desc->InputArgumentNames(), - op_desc->OutputArgumentNames()}); - } - - // Skip ops that are related to parameter server. - // In distributed mode, trainers and parameter server use same - // variable names to track same variables. We cannot change the - // names of these variables, otherwise trainers or parameter - // server would not find them. - if (kSkipMemOptOps.count(op_desc->Type()) > 0) { - UpdateSkipVarSet(skip_vars, {op_desc->InputArgumentNames(), - op_desc->OutputArgumentNames()}); - } - - // FIXME(zjl): some ops use variables that are not from their - // inputs or outputs. We do not have a nice method to solve this - // issue yet. Currently, we should skip these variables when - // memory optimization is enabled. - auto op_type = op_desc->Type(); - if (op_type == "while_grad") { - // In while_grad, framework::GradVarName(Input("X")) is visited - // without being any in/out of while_grad. While_grad uses - // these variable to accumulate gradient of X across time steps. - UpdateSkipVarSet(skip_vars, {ToGradVarName(op_desc->Input("X"))}); - } else if (op_type == "conditional_block_grad") { - // In conditional_block_grad, framework::GradVarName(Input("Input", - // "Cond")) is visited without being any in/out of - // conditional_block_grad. Conditional_block_grad uses these - // variables to accumulate gradient of Input/Cond across time steps. - UpdateSkipVarSet(skip_vars, {ToGradVarName(op_desc->Input("Input")), - ToGradVarName(op_desc->Input("Cond"))}); - } else if (op_type == "recurrent" || op_type == "recurrent_grad") { - // Recurrent and recurrent_grad ops are implemented by a very trickly - // way. Attr("states", "ex_states") is visited without being any - // in/out of op. It is because these variables are from sub blocks, - // not main block. 
Adding these variables to input would make recurrent - // fail since "states" and "ex_states" cannot be found in main block. - // When memory optimization is enabled, "states", "ex_states" and their - // gradient should be skipped. - auto ex_states = - boost::get>(op_desc->GetAttr("ex_states")); - auto states = - boost::get>(op_desc->GetAttr("states")); - if (op_type == "recurrent") { - UpdateSkipVarSet(skip_vars, {ex_states, states}); - } else { - // In recurrent_grad, framework::GradVarName(Input("parameters", - // "input")) is visited without being any in/out of recurrent_grad. - // Recurrent_grad uses these variables to accumulate gradient of - // parameters/input across time steps. - UpdateSkipVarSet( - skip_vars, - {ToGradVarName(op_desc->Input("parameters")), - ToGradVarName(op_desc->Input("inputs")), ex_states, states, - ToGradVarName(ex_states), ToGradVarName(states)}); - } - } - } - } -}; - -} // namespace ir -} // namespace framework -} // namespace paddle - -REGISTER_PASS(record_skip_memory_opt_vars_pass, - paddle::framework::ir::RecordSkipMemoryOptVarsPass); diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc index 40e07ce8..6077069e 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc @@ -40,7 +40,8 @@ void RecurrentOpEagerDeletionPass::ApplyImpl(Graph *graph) const { // Prepare safe eager deletion on different devices because the garbage // collection may be different across devices OpAndGradOpPair &op_pair = entry.second; - PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(&op_pair); + PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp( + graph->OriginProgram(), &op_pair); } } diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass.cc index e9114156..cc26f7f9 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass.cc @@ -312,13 +312,22 @@ void ReferenceCountPass::ApplyImpl(ir::Graph *graph) const { ShrinkDepsOpFunctor shrink_func( ir::FilterByNodeWrapper(*graph)); + details::PinnedVars *pinned_var_set = nullptr; + if (graph->Has(details::kPinnedVars)) { + pinned_var_set = &graph->Get(details::kPinnedVars); + } + auto is_pinned_var = [&pinned_var_set](const VarDesc &var_desc) { + return var_desc.Persistable() || + (pinned_var_set && pinned_var_set->count(var_desc.Name())); + }; + VLOG(1) << "Place number: " << vars.size(); for (size_t i = 0; i < vars.size(); ++i) { for (auto &name_var_pair : vars[i]) { // Whether this variable can be reused or deleted? If not, we do not // compute reference counts and dependencies. 
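The net effect of the surrounding hunk is easy to state: every variable that survives these filters ends up with a counter equal to the number of its last-live ops, and its tensor may be freed once that many ops have finished. A rough sketch of that counting scheme, using hypothetical names rather than Paddle's actual MemOptVarInfo/garbage-collector interface:

#include <cstddef>
#include <string>
#include <unordered_map>

// Hypothetical model of what ReferenceCountPass precomputes: per variable,
// how many last-live ops must finish before its tensor is safe to free.
class VarRefCounts {
 public:
  void Track(const std::string& var, size_t last_live_op_num) {
    remaining_[var] = last_live_op_num;
  }

  // Called as each last-live op of `var` finishes; returns true exactly
  // once, when the final such op completes and the tensor can be freed.
  bool OnLastLiveOpFinished(const std::string& var) {
    auto it = remaining_.find(var);
    if (it == remaining_.end()) return false;  // pinned/persistable vars
    return --(it->second) == 0;
  }

 private:
  std::unordered_map<std::string, size_t> remaining_;
};

In the hunk this count is exactly the result.size() passed to MemOptVarInfo a few lines below.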
VarDesc *var_desc = TryGetLatestVarDesc(name_var_pair.second); - if (var_desc == nullptr || var_desc->Persistable()) { + if (var_desc == nullptr || is_pinned_var(*var_desc)) { continue; } @@ -337,6 +346,10 @@ void ReferenceCountPass::ApplyImpl(ir::Graph *graph) const { for (auto iter = var_handles.rbegin(); iter != var_handles.rend(); ++iter) { + if ((*iter)->Node()->IsCtrlVar()) { + break; + } + VLOG(10) << "Try to find last living ops of " << var_name << " " << (iter - var_handles.rbegin()) << " time"; LastLiveOpSearchStatus status = LastLiveOpSearchStatus::kFailure; @@ -346,6 +359,8 @@ void ReferenceCountPass::ApplyImpl(ir::Graph *graph) const { // Seldomly, some vars may have no pending or preceding computation ops // Just break; if (status == LastLiveOpSearchStatus::kFailure) { + VLOG(1) << "Cannot find last live ops of variable " << var_name + << " in scope " << (*iter)->scope_idx(); break; } @@ -362,7 +377,9 @@ void ReferenceCountPass::ApplyImpl(ir::Graph *graph) const { VLOG(10) << "Extract " << result.size() << " ops of var " << var_name; var_infos[i][var_name].reset( new MemOptVarInfo(var_name, result.size())); - last_live_ops_of_vars[i].emplace(var_name, std::move(result)); + auto &last_live_ops_of_var = last_live_ops_of_vars[i][var_name]; + last_live_ops_of_var.set_var(*iter); + *(last_live_ops_of_var.mutable_ops()) = std::move(result); break; } diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h b/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h index 3433694b..0e8f4e78 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h +++ b/paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h @@ -39,10 +39,28 @@ using GarbageCollectorMap = const char kMemOptVarInfoMapList[] = "mem_opt_var_info_map_list"; const char kGarbageCollector[] = "garbage_collector"; const char kAllPlaces[] = "all_places"; +const char kUseCuda[] = "use_cuda"; -using LastLiveOpsOfVars = - std::unordered_map>; +class LastLiveOpOfVarInfo { + public: + details::VarHandle *var() { return var_; } + + void set_var(details::VarHandle *var) { var_ = var; } + + const std::unordered_set &ops() const { + return ops_; + } + + std::unordered_set *mutable_ops() { + return &ops_; + } + + private: + details::VarHandle *var_{nullptr}; + std::unordered_set ops_; +}; + +using LastLiveOpsOfVars = std::unordered_map; const char kLastLiveOpsOfVars[] = "last_live_ops_of_var"; VarDesc *TryGetLatestVarDesc(const std::vector &vars); diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/while_op_eager_deletion_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/while_op_eager_deletion_pass.cc index 63f996ad..da0da4c7 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/while_op_eager_deletion_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/while_op_eager_deletion_pass.cc @@ -47,7 +47,7 @@ class WhileOpEagerDeletionPass : public ir::Pass { auto &while_ops = ops_pair.second.first; auto &while_grad_ops = ops_pair.second.second; operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( - while_ops, while_grad_ops); + graph->OriginProgram(), while_ops, while_grad_ops); } } }; diff --git a/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc new file mode 100644 index 00000000..2226169e --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.cc @@ -0,0 +1,97 @@ +// Copyright 
(c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h" +#include +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +void ConvActivationFusePass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL(graph, "graph cannot be nullptr."); + FusePassBase::Init("conv_activation_mkldnn_fuse", graph); + + GraphPatternDetector gpd; + auto* conv_input = gpd.mutable_pattern() + ->NewNode("conv_activation_mkldnn_fuse/conv_input") + ->AsInput() + ->assert_is_op_input(conv_type(), "Input"); + patterns::ConvActivation conv_activation_pattern( + gpd.mutable_pattern(), "conv_activation_mkldnn_fuse"); + conv_activation_pattern(conv_input, conv_type(), activation_type()); + + int found_conv_activation_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle " + conv_type() + "+" + activation_type() + " fuse"; + GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, + conv_activation_pattern); // Filter + GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, + conv_activation_pattern); // tmp + GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_activation_pattern); // CONV op + GET_IR_NODE_FROM_SUBGRAPH(activation_out, activation_out, + conv_activation_pattern); // Out + GET_IR_NODE_FROM_SUBGRAPH(activation, activation, + conv_activation_pattern); // Activation op + + // Transform Conv node into ConvActivation node. 
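+ // The rewrite below dissolves the activation op into conv2d: conv2d's
+ // "Output" is rewired to the activation's output variable, and the
+ // activation is encoded in the "fuse_activation", "fuse_alpha" and
+ // "fuse_beta" attributes that the MKL-DNN conv kernel consumes at
+ // execution time.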
+ OpDesc* desc = conv->Op(); + desc->SetOutput("Output", + std::vector({activation_out->Name()})); + + desc->SetAttr("fuse_activation", activation_type()); + + // MKL-DNN kernels take generic "fuse_alpha"/"fuse_beta" activation + // parameters, while Paddle activation ops name them differently + // (e.g. relu6 stores its bound in "threshold"), so map them here. + if (activation_type() == "relu6") { + desc->SetAttr("fuse_alpha", + boost::get(activation->Op()->GetAttr("threshold"))); + } else { + desc->SetAttr("fuse_alpha", + activation->Op()->GetAttrIfExists("alpha")); + } + desc->SetAttr("fuse_beta", + activation->Op()->GetAttrIfExists("beta")); + + GraphSafeRemoveNodes(graph, {activation, conv_out}); + + PADDLE_ENFORCE_GT(subgraph.count(conv_input), 0UL, + "subgraph has to contain conv_input node."); + IR_NODE_LINK_TO(conv, activation_out); + found_conv_activation_count++; + }; + + gpd(graph, handler); + + AddStatis(found_conv_activation_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_activation_mkldnn_fuse_pass, + paddle::framework::ir::ConvActivationFusePass); + +REGISTER_PASS(conv_relu_mkldnn_fuse_pass, + paddle::framework::ir::ConvActivationFusePass); + +REGISTER_PASS(conv_leaky_relu_mkldnn_fuse_pass, + paddle::framework::ir::Conv2DLeakyReLUFusePass); + +REGISTER_PASS(conv_relu6_mkldnn_fuse_pass, + paddle::framework::ir::Conv2DReLU6FusePass); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h similarity index 60% rename from paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h rename to paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h index 2174c22d..7c6dc238 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h @@ -14,6 +14,7 @@ #pragma once +#include #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" @@ -22,18 +23,33 @@ namespace paddle { namespace framework { namespace ir { - /* - * Fuse the CONV and ReLU to a ConvReLUOp. + * Fuse Conv and Activation base class.
*/ -class ConvReLUFusePass : public FusePassBase { +class ConvActivationFusePass : public FusePassBase { public: - virtual ~ConvReLUFusePass() {} + virtual ~ConvActivationFusePass() {} + virtual std::string conv_type() const { return "conv2d"; } + virtual std::string activation_type() const { return "relu"; } protected: void ApplyImpl(ir::Graph* graph) const override; + const std::string name_scope_{"conv_activation_mkldnn_fuse"}; +}; +/* + * Fuse Conv and LeakyReLU class + */ +class Conv2DLeakyReLUFusePass : public ConvActivationFusePass { + public: + std::string activation_type() const { return "leaky_relu"; } +}; +/* + * Fuse Conv and BoundedReLU class + */ +class Conv2DReLU6FusePass : public ConvActivationFusePass { + public: + std::string activation_type() const { return "relu6"; } }; - } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc similarity index 60% rename from paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc rename to paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc index 67a99570..ec38788b 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h" +#include "paddle/fluid/framework/ir/mkldnn/conv_activation_mkldnn_fuse_pass.h" #include #include "paddle/fluid/framework/op_proto_maker.h" @@ -23,18 +23,24 @@ namespace ir { void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, const std::vector& inputs, - const std::vector& outputs, bool use_mkldnn = false) { + const std::vector& outputs, bool is_activation = false, + bool use_mkldnn = false) { auto* op = prog->MutableBlock(0)->AppendOp(); op->SetType(type); + op->SetAttr("name", name); if (type == "conv2d") { op->SetAttr("use_mkldnn", use_mkldnn); - op->SetAttr("name", name); op->SetInput("Input", {inputs[0]}); op->SetInput("Filter", {inputs[1]}); op->SetInput("Bias", {inputs[2]}); - } else if (type == "relu") { + } else if (is_activation) { op->SetAttr("use_mkldnn", use_mkldnn); op->SetInput("X", inputs); + if (type == "leaky_relu") { + op->SetAttr("alpha", 0.02f); + } else if (type == "relu6") { + op->SetAttr("threshold", 6.0f); + } } op->SetOutput("Out", outputs); op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), @@ -44,15 +50,15 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, // a->OP0->b // b->OP1->c // (c, weights, bias)->conv->f -// (f)->relu->g -ProgramDesc BuildProgramDesc() { +// (f)->activation->g +ProgramDesc BuildProgramDesc(std::string activation) { ProgramDesc prog; for (auto& v : std::vector({"a", "b", "c", "weights", "bias", "f", "g", - "h", "weights2", "bias2", "k", "l"})) { + "h", "weights2", "bias2", "k", "l", "m"})) { auto* var = prog.MutableBlock(0)->Var(v); var->SetType(proto::VarType::SELECTED_ROWS); - if (v == "weights" || v == "bias") { + if (v == "weights" || v == "bias" || v == "weights2" || v == "bias2") { var->SetPersistable(true); } } @@ -61,30 +67,33 @@ ProgramDesc BuildProgramDesc() { std::vector({"b"})); SetOp(&prog, "OP1", "op1", std::vector({"b"}), std::vector({"c"})); - // conv+relu, both with MKL-DNN + // 
conv+activation, both with MKL-DNN SetOp(&prog, "conv2d", "conv1", std::vector({"c", "weights", "bias"}), - std::vector({"f"}), true); - SetOp(&prog, "relu", "relu1", std::vector({"f"}), - std::vector({"g"}), true); + std::vector({"f"}), false, true); + SetOp(&prog, activation, "activation1", std::vector({"f"}), + std::vector({"g"}), true, true); SetOp(&prog, "OP3", "op3", std::vector({"g"}), std::vector({"h"})); - // conv+relu, only one with MKL-DNN + // conv+activation, only one with MKL-DNN SetOp(&prog, "conv2d", "conv2", std::vector({"h", "weights2", "bias2"}), - std::vector({"k"}), true); - SetOp(&prog, "relu", "relu2", std::vector({"k"}), - std::vector({"l"})); + std::vector({"k"}), false, true); + SetOp(&prog, "activation", "activation2", std::vector({"k"}), + std::vector({"l"}), true, false); + SetOp(&prog, "OP4", "op4", std::vector({"l"}), + std::vector({"m"})); return prog; } -TEST(ConvReLUFusePass, basic) { - auto prog = BuildProgramDesc(); +void MainTest(std::string activation) { + auto prog = BuildProgramDesc(activation); std::unique_ptr graph(new ir::Graph(prog)); - auto pass = PassRegistry::Instance().Get("conv_relu_mkldnn_fuse_pass"); + auto pass = + PassRegistry::Instance().Get("conv_" + activation + "_mkldnn_fuse_pass"); int original_nodes_num = graph->Nodes().size(); @@ -92,36 +101,41 @@ TEST(ConvReLUFusePass, basic) { int current_nodes_num = graph->Nodes().size(); - // Remove 3 Nodes: CONV, RELU, conv_out - // Add 1 Node: ConvReLU + // Remove 3 Nodes: CONV, activation, conv_out + // Add 1 Node: ConvActivation EXPECT_EQ(original_nodes_num - 2, current_nodes_num); - // Assert conv_relu op in newly generated graph - int conv_relu_count = 0; + // Assert conv_activation op in newly generated graph + int conv_activation_count = 0; for (auto* node : graph->Nodes()) { if (node->IsOp() && node->Op()->Type() == "conv2d") { auto* op = node->Op(); ASSERT_TRUE(op->HasAttr("use_mkldnn")); EXPECT_TRUE(boost::get(op->GetAttr("use_mkldnn"))); - // check if only "conv1" convolution is fused auto op_name = boost::get(op->GetAttr("name")); + if (op->GetAttrIfExists("fuse_activation") == activation) { + ++conv_activation_count; + } + // check if only "conv1" convolution is fused if (op_name == "conv1") { - ASSERT_TRUE(op->HasAttr("fuse_relu")); - bool fuse_relu = boost::get(op->GetAttr("fuse_relu")); - if (fuse_relu) { - ++conv_relu_count; - } + ASSERT_TRUE(op->HasAttr("fuse_activation")); } else if (op_name == "conv2") { - ASSERT_FALSE(op->HasAttr("fuse_relu")); + ASSERT_FALSE(op->HasAttr("fuse_activation")); } } } - EXPECT_EQ(conv_relu_count, 1); + EXPECT_EQ(conv_activation_count, 1); +} + +TEST(ConvActivationFusePass, conv_relu_fuse_pass) { MainTest("relu"); } +TEST(ConvActivationFusePass, conv_leaky_relu_fuse_pass) { + MainTest("leaky_relu"); } +TEST(ConvActivationFusePass, conv_relu6_fuse_pass) { MainTest("relu6"); } } // namespace ir } // namespace framework } // namespace paddle -USE_PASS(conv_relu_mkldnn_fuse_pass); +USE_PASS(conv_activation_mkldnn_fuse_pass); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass.cc deleted file mode 100644 index dd9d4486..00000000 --- a/paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass.cc +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass.h" -#include -#include -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace framework { -namespace ir { - -void ConvBReLUFusePass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE(graph); - FusePassBase::Init("conv_bounded_relu_mkldnn_fuse", graph); - - GraphPatternDetector gpd; - auto* conv_input = gpd.mutable_pattern() - ->NewNode("conv_bounded_relu_mkldnn_fuse/conv_input") - ->AsInput() - ->assert_is_op_input("conv2d", "Input"); - patterns::ConvBReLU conv_brelu_pattern(gpd.mutable_pattern(), - "conv_bounded_relu_mkldnn_fuse"); - conv_brelu_pattern(conv_input); - - int found_conv_brelu_count = 0; - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - VLOG(4) << "handle ConvBoundedReLUFusePass fuse"; - GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, - conv_brelu_pattern); // Filter - GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_brelu_pattern); // tmp - GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_brelu_pattern); // CONV op - GET_IR_NODE_FROM_SUBGRAPH(brelu_out, brelu_out, conv_brelu_pattern); // Out - GET_IR_NODE_FROM_SUBGRAPH(brelu, brelu, conv_brelu_pattern); // ReLU op - - // Transform Conv node into ConvBReLU node. - OpDesc* desc = conv->Op(); - desc->SetOutput("Output", std::vector({brelu_out->Name()})); - desc->SetAttr("fuse_brelu", true); - desc->SetAttr("fuse_brelu_threshold", brelu->Op()->GetAttr("threshold")); - - GraphSafeRemoveNodes(graph, {brelu, conv_out}); - - PADDLE_ENFORCE(subgraph.count(conv_input)); - IR_NODE_LINK_TO(conv, brelu_out); - found_conv_brelu_count++; - }; - - gpd(graph, handler); - - AddStatis(found_conv_brelu_count); -} - -} // namespace ir -} // namespace framework -} // namespace paddle - -REGISTER_PASS(conv_brelu_mkldnn_fuse_pass, - paddle::framework::ir::ConvBReLUFusePass); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass_tester.cc deleted file mode 100644 index 5a546bfa..00000000 --- a/paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass_tester.cc +++ /dev/null @@ -1,135 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass.h" - -#include -#include "paddle/fluid/framework/op_proto_maker.h" - -namespace paddle { -namespace framework { -namespace ir { - -void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, - const std::vector& inputs, - const std::vector& outputs, bool use_mkldnn = false) { - auto* op = prog->MutableBlock(0)->AppendOp(); - op->SetType(type); - if (type == "conv2d") { - op->SetAttr("use_mkldnn", use_mkldnn); - op->SetAttr("name", name); - op->SetInput("Input", {inputs[0]}); - op->SetInput("Filter", {inputs[1]}); - op->SetInput("Bias", {inputs[2]}); - } else if (type == "relu6") { - op->SetAttr("use_mkldnn", use_mkldnn); - if (use_mkldnn) { - op->SetAttr("threshold", 6.0f); - } - op->SetInput("X", inputs); - } - op->SetOutput("Out", outputs); - op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), - static_cast(OpRole::kForward)); -} - -// a->OP0->b -// b->OP1->c -// (c, weights, bias)->conv->f -// (f)->brelu->g -ProgramDesc BuildProgramDesc() { - ProgramDesc prog; - for (auto& v : - std::vector({"a", "b", "c", "weights", "bias", "f", "g", - "h", "weights2", "bias2", "k", "l"})) { - auto* var = prog.MutableBlock(0)->Var(v); - var->SetType(proto::VarType::SELECTED_ROWS); - if (v == "weights" || v == "bias") { - var->SetPersistable(true); - } - } - - SetOp(&prog, "OP0", "op0", std::vector({"a"}), - std::vector({"b"})); - SetOp(&prog, "OP1", "op1", std::vector({"b"}), - std::vector({"c"})); - // conv+brelu, both with MKL-DNN - SetOp(&prog, "conv2d", "conv1", - std::vector({"c", "weights", "bias"}), - std::vector({"f"}), true); - SetOp(&prog, "relu6", "relu1", std::vector({"f"}), - std::vector({"g"}), true); - SetOp(&prog, "OP3", "op3", std::vector({"g"}), - std::vector({"h"})); - // conv+brelu, only one with MKL-DNN - SetOp(&prog, "conv2d", "conv2", - std::vector({"h", "weights2", "bias2"}), - std::vector({"k"}), true); - SetOp(&prog, "relu6", "relu2", std::vector({"k"}), - std::vector({"l"})); - - return prog; -} - -TEST(ConvBReLUFusePass, basic) { - auto prog = BuildProgramDesc(); - - std::unique_ptr graph(new ir::Graph(prog)); - - auto pass = PassRegistry::Instance().Get("conv_brelu_mkldnn_fuse_pass"); - - int original_nodes_num = graph->Nodes().size(); - - graph.reset(pass->Apply(graph.release())); - - int current_nodes_num = graph->Nodes().size(); - - // Remove 3 Nodes: CONV, BRELU, conv_out - // Add 1 Node: ConvBReLU - EXPECT_EQ(original_nodes_num - 2, current_nodes_num); - - // Assert conv_brelu op in newly generated graph - int conv_brelu_count = 0; - - for (auto* node : graph->Nodes()) { - if (node->IsOp() && node->Op()->Type() == "conv2d") { - auto* op = node->Op(); - ASSERT_TRUE(op->HasAttr("use_mkldnn")); - EXPECT_TRUE(boost::get(op->GetAttr("use_mkldnn"))); - // check if only "conv1" convolution is fused - auto op_name = boost::get(op->GetAttr("name")); - if (op_name == "conv1") { - ASSERT_TRUE(op->HasAttr("fuse_brelu")); - ASSERT_TRUE(op->HasAttr("fuse_brelu_threshold")); - - bool fuse_brelu = boost::get(op->GetAttr("fuse_brelu")); - if (fuse_brelu) { - ++conv_brelu_count; - float fuse_brelu_threshold = - boost::get(op->GetAttr("fuse_brelu_threshold")); - EXPECT_EQ(fuse_brelu_threshold, 6.0f); - } - } else if (op_name == "conv2") { - ASSERT_FALSE(op->HasAttr("fuse_brelu")); - } - } - } - EXPECT_EQ(conv_brelu_count, 1); -} - -} // namespace ir -} // namespace framework -} // namespace paddle - -USE_PASS(conv_brelu_mkldnn_fuse_pass); diff --git 
a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc index a037a6bf..9e8f0f0c 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass.cc @@ -83,7 +83,7 @@ void ConvConcatReLUFusePass::FuseConvConcatReLU( // Transform Conv node into ConvReLU node. OpDesc* conv_desc = conv_op->Op(); - conv_desc->SetAttr("fuse_relu", true); + conv_desc->SetAttr("fuse_activation", std::string("relu")); // Remove ReLU when all Convs were transformed. auto number_of_unfused_convs_left = diff --git a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc index 0d7ddac8..ee00a395 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc @@ -28,7 +28,7 @@ void SetOp(ProgramDesc* prog, const std::string& type, op->SetType(type); if (type == "conv2d") { op->SetAttr("use_mkldnn", use_mkldnn); - op->SetAttr("fuse_relu", false); + op->SetAttr("fuse_activation", std::string("")); op->SetInput("Input", {inputs[0]}); op->SetInput("Filter", {inputs[1]}); if (inputs.size() > 2) { @@ -109,8 +109,9 @@ void MainTest(const ProgramDesc& prog, bool fuse_relu) { if (node->IsOp()) { auto* op = node->Op(); if (op->Type() == "conv2d") { - ASSERT_TRUE(op->HasAttr("fuse_relu")); - bool fuse_relu_attr = boost::get(op->GetAttr("fuse_relu")); + ASSERT_TRUE(op->HasAttr("fuse_activation")); + bool fuse_relu_attr = + (boost::get(op->GetAttr("fuse_activation")) == "relu"); EXPECT_EQ(fuse_relu, fuse_relu_attr); } else if (op->Type() == "relu") { relu_count++; diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc index ef7874c1..1263ddd1 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc @@ -109,8 +109,7 @@ void ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::operator()( if (!IsReachable(graph, elementwise_add_identity, conv_output)) return; - auto fuse_relu = HasAttribute(*conv_op, "fuse_relu"); - if (fuse_relu && *fuse_relu) return; + if (HasFusedActivation(conv_op)) return; conv_op->Op()->SetInput("ResidualData", {elementwise_add_identity->Name()}); conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); @@ -179,8 +178,7 @@ void ResidualConnectionMKLDNNFusePass::ProjectionFuseHandle::operator()( return; } - auto fuse_relu = HasAttribute(*residual_conv_op, "fuse_relu"); - if (fuse_relu && *fuse_relu) return; + if (HasFusedActivation(residual_conv_op)) return; residual_conv_op->Op()->SetInput("ResidualData", {projection_node->Name()}); residual_conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h index 9bf1ae60..b95aec34 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h @@ -126,6 +126,11 @@ class ResidualConnectionMKLDNNFusePass : public FusePassBase { protected: void ApplyImpl(graph_ptr graph) const; + static bool 
HasFusedActivation(Node* conv_node) { + return !(conv_node->Op() + ->GetAttrIfExists("fuse_activation") + .empty()); + } const std::string name_scope_{"residual_connection_fuse_pass"}; }; diff --git a/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.cc deleted file mode 100644 index dd0fb456..00000000 --- a/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.cc +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h" -#include -#include -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace framework { -namespace ir { - -void ConvReLUFusePass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE(graph); - FusePassBase::Init("conv_relu_mkldnn_fuse", graph); - - GraphPatternDetector gpd; - auto* conv_input = gpd.mutable_pattern() - ->NewNode("conv_relu_mkldnn_fuse/conv_input") - ->AsInput() - ->assert_is_op_input("conv2d", "Input"); - patterns::ConvReLU conv_relu_pattern(gpd.mutable_pattern(), - "conv_relu_mkldnn_fuse"); - conv_relu_pattern(conv_input); - - int found_conv_relu_count = 0; - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - VLOG(4) << "handle ConvReLU fuse"; - GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, - conv_relu_pattern); // Filter - GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_relu_pattern); // tmp - GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_relu_pattern); // CONV op - GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, conv_relu_pattern); // Out - GET_IR_NODE_FROM_SUBGRAPH(relu, relu, conv_relu_pattern); // ReLU op - - FuseOptions fuse_option = FindFuseOption(*conv, *relu); - if (fuse_option == DO_NOT_FUSE) { - VLOG(3) << "do not perform conv+relu fuse"; - return; - } - - // Transform Conv node into ConvReLU node. 
- OpDesc* desc = conv->Op(); - desc->SetOutput("Output", std::vector({relu_out->Name()})); - desc->SetAttr("fuse_relu", true); - GraphSafeRemoveNodes(graph, {relu, conv_out}); - - PADDLE_ENFORCE(subgraph.count(conv_input)); - IR_NODE_LINK_TO(conv, relu_out); - - found_conv_relu_count++; - }; - - gpd(graph, handler); - - AddStatis(found_conv_relu_count); -} - -} // namespace ir -} // namespace framework -} // namespace paddle - -REGISTER_PASS(conv_relu_mkldnn_fuse_pass, - paddle::framework::ir::ConvReLUFusePass); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index 89f51bfa..9cf55ee3 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -208,6 +208,14 @@ void CPUQuantizePass::QuantizeConv(Graph* graph, DequantizeOutput(g, conv_op, conv_output, "Output", output_scale, is_output_unsigned, "Scale_out"); + // change threshold in bounded ReLu + if (conv_op->Op()->GetAttrIfExists("fuse_activation") == + "relu6") { + float scale_out = boost::get(conv_op->Op()->GetAttr("Scale_out")); + float threshold = boost::get(conv_op->Op()->GetAttr("fuse_alpha")); + conv_op->Op()->SetAttr("fuse_alpha", scale_out * threshold); + } + ++quantize_conv_count; }; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc index 2270e2b5..1e23539c 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc @@ -49,14 +49,14 @@ void CPUQuantizeSquashPass::FindNodesToKeep( AddStatis(found_count); } -void CPUQuantizeSquashPass::Squash( +void CPUQuantizeSquashPass::DequantQuantSquash( Graph* graph, std::unordered_map* nodes_keep_counter) const { GraphPatternDetector gpd; patterns::DequantQuantAny squash_pattern{gpd.mutable_pattern(), "squash"}; squash_pattern(); - int found_squash_count = 0; + int found_dequant_quant_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { VLOG(4) << "squash requantize-quantize ops pair"; @@ -96,7 +96,7 @@ void CPUQuantizeSquashPass::Squash( IR_NODE_LINK_TO(dequant_in, next_op); - found_squash_count++; + found_dequant_quant_count++; } else { // squash dequantize-quantize to requantize op OpDesc desc; @@ -116,13 +116,83 @@ void CPUQuantizeSquashPass::Squash( IR_NODE_LINK_TO(dequant_in, requant_op); IR_NODE_LINK_TO(requant_op, quant_out); - found_squash_count++; + found_dequant_quant_count++; } }; gpd(graph, handler); - AddStatis(found_squash_count); + AddStatis(found_dequant_quant_count); PrettyLogDetail("--- squashed %d dequantize-quantize pairs", - found_squash_count); + found_dequant_quant_count); +} + +void CPUQuantizeSquashPass::ConvRequantSquash(Graph* graph) const { + GraphPatternDetector gpd; + patterns::ConvRequant conv_requant_pattern{gpd.mutable_pattern(), + "conv_requant"}; + conv_requant_pattern(); + + int found_requant_squash_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "squash conv-requantize ops pair"; + + GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_requant_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_requant_pattern); + GET_IR_NODE_FROM_SUBGRAPH(requant_op, requant_op, conv_requant_pattern); + GET_IR_NODE_FROM_SUBGRAPH(requant_out, requant_out, conv_requant_pattern); + + // if conv2d has one output squash + if (conv_out->outputs.size() == 1) { + 
float requant_scale_out = + boost::get(requant_op->Op()->GetAttr("Scale_out")); + conv_op->Op()->SetAttr("Scale_out", requant_scale_out); + conv_op->Op()->SetOutput("Output", + std::vector({requant_out->Name()})); + IR_NODE_LINK_TO(conv_op, requant_out); + GraphSafeRemoveNodes(graph, {conv_out, requant_op}); + + found_requant_squash_count++; + } + }; + gpd(graph, handler); + AddStatis(found_requant_squash_count); + PrettyLogDetail("--- squashed %d requantize with convs", + found_requant_squash_count); +} + +void CPUQuantizeSquashPass::ConvDequantSquash(Graph* graph) const { + GraphPatternDetector gpd; + patterns::ConvDequant conv_dequant_pattern{gpd.mutable_pattern(), + "conv_dequant"}; + conv_dequant_pattern(); + + int found_conv_dequant_squash_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "squash conv-dequant ops pair"; + + GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_dequant_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_dequant_pattern); + GET_IR_NODE_FROM_SUBGRAPH(dequant_op, dequant_op, conv_dequant_pattern); + GET_IR_NODE_FROM_SUBGRAPH(dequant_out, dequant_out, conv_dequant_pattern); + + // Squash only when conv2d has a single output and no fused residual + // connection, because residual fusion does not support forcing the + // output to fp32. + if (conv_out->outputs.size() == 1 && + !(conv_op->Op()->GetAttrIfExists("fuse_residual_connection"))) { + conv_op->Op()->SetAttr("force_fp32_output", true); + conv_op->Op()->SetOutput("Output", + std::vector({dequant_out->Name()})); + IR_NODE_LINK_TO(conv_op, dequant_out); + GraphSafeRemoveNodes(graph, {conv_out, dequant_op}); + found_conv_dequant_squash_count++; + } + }; + gpd(graph, handler); + AddStatis(found_conv_dequant_squash_count); + PrettyLogDetail("--- squashed %d dequant with convs", + found_conv_dequant_squash_count); } void CPUQuantizeSquashPass::ApplyImpl(ir::Graph* graph) const { @@ -131,7 +201,9 @@ void CPUQuantizeSquashPass::ApplyImpl(ir::Graph* graph) const { std::unordered_map nodes_keep_counter; FindNodesToKeep(graph, &nodes_keep_counter); - Squash(graph, &nodes_keep_counter); + DequantQuantSquash(graph, &nodes_keep_counter); + ConvRequantSquash(graph); + ConvDequantSquash(graph); } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h index e873994c..7e9e92e3 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h @@ -46,8 +46,19 @@ class CPUQuantizeSquashPass : public FusePassBase { /* * Squash dequantize-quantize ops pairs into requantize or nothing */ - void Squash(Graph* graph, - std::unordered_map* nodes_keep_counter) const; + void DequantQuantSquash( + Graph* graph, + std::unordered_map* nodes_keep_counter) const; + + /* + * Squash a requantize op into the preceding conv by copying the + * requantize op's Scale_out into the conv's Scale_out attribute + */ + void ConvRequantSquash(Graph* graph) const; + + /* + * Squash conv2d with dequant when dequant is the only op after conv2d + */ + void ConvDequantSquash(Graph* graph) const; const std::string name_scope_{"squash"}; }; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc index 057a790c..08b605a7 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc @@ -30,6 +30,7 @@ void
SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetAttr("use_mkldnn", use_mkldnn); op->SetAttr("name", name); if (type == "conv2d") { + op->SetAttr("Scale_out", scale); op->SetInput("Input", {inputs[0]}); if (inputs.size() > 1) op->SetInput("Filter", {inputs[1]}); if (inputs.size() > 2) op->SetInput("Bias", {inputs[2]}); @@ -42,14 +43,22 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetInput("Input", {inputs[0]}); op->SetOutput("Output", {outputs[0]}); op->SetAttr("Scale", scale); + } else if (type == "requantize") { + op->SetInput("Input", {inputs[0]}); + op->SetOutput("Output", {outputs[0]}); + op->SetAttr("Scale_out", scale); + } else if (type == "concat") { + op->SetInput("X", inputs); + op->SetOutput("Out", outputs); } } // (a,w1,b1)->Conv1->d -// d->Dequant->e -// e->Quant->f +// d->Dequant(scale1)->e +// e->Quant(scale2)->f // (f,w2,b2)->Conv2->i -ProgramDesc BuildProgramDesc(bool use_mkldnn, float scale1, float scale2) { +ProgramDesc BuildConvRequantProgramDesc(bool use_mkldnn, float scale_out, + float scale1, float scale2) { ProgramDesc prog; for (auto& v : std::initializer_list( {"a", "w1", "b1", "d", "e", "f", "w2", "b2", "i"})) { @@ -59,42 +68,126 @@ ProgramDesc BuildProgramDesc(bool use_mkldnn, float scale1, float scale2) { } } - SetOp(&prog, "conv2d", "Conv1", {"a", "w1", "b1"}, {"d"}, use_mkldnn); + SetOp(&prog, "conv2d", "Conv1", {"a", "w1", "b1"}, {"d"}, use_mkldnn, + scale_out); SetOp(&prog, "dequantize", "Dequant", {"d"}, {"e"}, use_mkldnn, scale1); SetOp(&prog, "quantize", "Quant", {"e"}, {"f"}, use_mkldnn, scale2); - SetOp(&prog, "conv2d", "Conv2", {"f", "w2", "b2"}, {"i"}, use_mkldnn); + SetOp(&prog, "conv2d", "Conv2", {"f", "w2", "b2"}, {"i"}, use_mkldnn, + scale_out); return prog; } static const std::initializer_list variable_names{ "a", "b", "c", "d", "e", "f", "g", "h"}; + // a->Conv1->b -// b->Dequant->c -// -// c->Quant1->d and d->Conv2->e -// +// b->Dequant(scale1)->c +// c->Quant1(scale2)->d and d->Conv2->e // c->Conv3->f -// -// c->Quant2->g and g->Conv4->h -// -ProgramDesc BuildProgramDesc2(bool use_mkldnn, float scale1, float scale2, - float scale3) { +// c->Quant2(scale3)->g and g->Conv4->h +ProgramDesc BuildConvMultiOutputProgramDesc(bool use_mkldnn, float scale_out, + float scale1, float scale2, + float scale3) { ProgramDesc prog; for (auto& v : variable_names) { prog.MutableBlock(0)->Var(v); } - SetOp(&prog, "conv2d", "Conv1", {"a"}, {"b"}, use_mkldnn); + SetOp(&prog, "conv2d", "Conv1", {"a"}, {"b"}, use_mkldnn, scale_out); SetOp(&prog, "dequantize", "Dequant", {"b"}, {"c"}, use_mkldnn, scale1); SetOp(&prog, "quantize", "Quant1", {"c"}, {"d"}, use_mkldnn, scale2); - SetOp(&prog, "conv2d", "Conv2", {"d"}, {"e"}, use_mkldnn); + SetOp(&prog, "conv2d", "Conv2", {"d"}, {"e"}, use_mkldnn, scale_out); - SetOp(&prog, "conv2d", "Conv3", {"c"}, {"f"}, use_mkldnn); + SetOp(&prog, "conv2d", "Conv3", {"c"}, {"f"}, use_mkldnn, scale_out); SetOp(&prog, "quantize", "Quant2", {"c"}, {"g"}, use_mkldnn, scale3); - SetOp(&prog, "conv2d", "Conv4", {"g"}, {"h"}, use_mkldnn); + SetOp(&prog, "conv2d", "Conv4", {"g"}, {"h"}, use_mkldnn, scale_out); + + return prog; +} + +// a->Conv1->b->Requant(scale1)->c +// d->Conv2->e->Requant(scale2)->f +// {c,f}->Concat +ProgramDesc BuildConvsRequantConcatProgramDesc(bool use_mkldnn, float scale_out, + float scale1, float scale2) { + ProgramDesc prog; + for (auto& v : variable_names) { + prog.MutableBlock(0)->Var(v); + } + + SetOp(&prog, "conv2d", "Conv1", {"a"}, 
{"b"}, use_mkldnn, scale_out); + SetOp(&prog, "requantize", "Requant1", {"b"}, {"c"}, use_mkldnn, scale1); + + SetOp(&prog, "conv2d", "Conv2", {"d"}, {"e"}, use_mkldnn, scale_out); + SetOp(&prog, "requantize", "Requant2", {"e"}, {"f"}, use_mkldnn, scale2); + + SetOp(&prog, "concat", "Concat", {"c"}, {"f"}, use_mkldnn); + + return prog; +} + +// a->Concat->b +// b->Dequant(scale1)->c +// c->Quant(scale2)->d +// d->Conv->e +ProgramDesc BuildConcatDequantQuantProgramDesc(bool use_mkldnn, float scale_out, + float scale1, float scale2) { + ProgramDesc prog; + for (auto& v : variable_names) { + prog.MutableBlock(0)->Var(v); + } + SetOp(&prog, "concat", "Concat", {"a"}, {"b"}, use_mkldnn); + SetOp(&prog, "dequantize", "Dequant", {"b"}, {"c"}, use_mkldnn, scale1); + SetOp(&prog, "quantize", "Quant", {"c"}, {"d"}, use_mkldnn, scale2); + SetOp(&prog, "conv2d", "Conv2", {"d"}, {"e"}, use_mkldnn, scale_out); + return prog; +} + +// a->Conv1->b +// b->Requant1(Scale1)->c +// b->Requant2(Scale2)->d +ProgramDesc BuildConvMultiRequantProgramDesc(bool use_mkldnn, float scale_out, + float scale1, float scale2) { + ProgramDesc prog; + for (auto& v : variable_names) { + prog.MutableBlock(0)->Var(v); + } + SetOp(&prog, "conv2d", "Conv1", {"a"}, {"b"}, use_mkldnn, scale_out); + SetOp(&prog, "requantize", "Requant1", {"b"}, {"c"}, use_mkldnn, scale1); + SetOp(&prog, "requantize", "Requant2", {"b"}, {"d"}, use_mkldnn, scale2); + return prog; +} + +// a->Conv1->b +// b->Dequant1(Scale1)->c +// c->Concat +ProgramDesc BuildConvDequantConcatProgramDesc(bool use_mkldnn, float scale_out, + float scale) { + ProgramDesc prog; + for (auto& v : variable_names) { + prog.MutableBlock(0)->Var(v); + } + SetOp(&prog, "conv2d", "Conv1", {"a"}, {"b"}, use_mkldnn, scale_out); + SetOp(&prog, "dequantize", "Dequant1", {"b"}, {"c"}, use_mkldnn, scale); + SetOp(&prog, "concat", "Concat1", {"c"}, {"d"}, use_mkldnn); + return prog; +} + +// a->Conv1->b +// b->Dequant1(Scale1)->c +// b->Conv2->d +ProgramDesc BuildConvDequantConvProgramDesc(bool use_mkldnn, float scale_out, + float scale) { + ProgramDesc prog; + for (auto& v : variable_names) { + prog.MutableBlock(0)->Var(v); + } + SetOp(&prog, "conv2d", "Conv1", {"a"}, {"b"}, use_mkldnn, scale_out); + SetOp(&prog, "dequantize", "Dequant1", {"b"}, {"c"}, use_mkldnn, scale); + SetOp(&prog, "conv2d", "Conv2", {"b"}, {"d"}, use_mkldnn); return prog; } @@ -105,10 +198,7 @@ void InitTensorHolder(Scope* scope, const paddle::platform::Place& place, tensor->mutable_data(place, proto::VarType::FP32, 1); } -void MainTest(const ProgramDesc& prog, int removed_nodes_num) { - std::unique_ptr graph(new ir::Graph(prog)); - - // Init scope, as it is used in pass +void PrepareGraph(std::unique_ptr* graph, const ProgramDesc& prog) { auto place = paddle::platform::CPUPlace(); NaiveExecutor exe{place}; Scope scope; @@ -117,58 +207,198 @@ void MainTest(const ProgramDesc& prog, int removed_nodes_num) { for (auto& v : variable_names) { InitTensorHolder(&scope, place, v.c_str()); } + (*graph)->SetNotOwned(kParamScopeAttr, &scope); +} - graph->SetNotOwned(kParamScopeAttr, &scope); - +void RegisterPass(std::unique_ptr* graph) { auto pass = PassRegistry::Instance().Get("cpu_quantize_squash_pass"); + graph->reset(pass->Apply(graph->release())); +} - int original_nodes_num = graph->Nodes().size(); - - graph.reset(pass->Apply(graph.release())); +// check number of nodes +void CountNodeTest(const ProgramDesc& prog, int removed_nodes_num) { + std::unique_ptr graph(new ir::Graph(prog)); + PrepareGraph(&graph, prog); + int 
original_nodes_num = graph->Nodes().size(); + RegisterPass(&graph); int current_nodes_num = graph->Nodes().size(); EXPECT_EQ(original_nodes_num - removed_nodes_num, current_nodes_num); } +// check op->scale_out +void EqualScaleOutTest(const ProgramDesc& prog, const std::string& name, + float scale) { + std::unique_ptr graph(new ir::Graph(prog)); + PrepareGraph(&graph, prog); + RegisterPass(&graph); + + for (auto* node : graph->Nodes()) { + if (node->IsOp() && + boost::get(node->Op()->GetAttr("name")) == name) { + float scale_out = boost::get(node->Op()->GetAttr("Scale_out")); + EXPECT_EQ(scale_out, scale); + } + } +} + +// check requant_op scales +void CheckRequantScalesTest(const ProgramDesc& prog, float scale_in, + float scale_out) { + std::unique_ptr graph(new ir::Graph(prog)); + + PrepareGraph(&graph, prog); + RegisterPass(&graph); + + for (auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op()->Type() == "requantize") { + float op_scale_in = boost::get(node->Op()->GetAttr("Scale_in")); + EXPECT_EQ(op_scale_in, scale_in); + float op_scale_out = boost::get(node->Op()->GetAttr("Scale_out")); + EXPECT_EQ(op_scale_out, scale_out); + } + } +} + +// From Conv1->d->Dequant->e->Quant->f->Conv2 +// To Conv1->d->Conv2 TEST(CpuQuantizeSquashPass, equal_scales) { + auto scale_out = 1.0f; auto scale = 1.2345f; auto use_mkldnn = true; // Remove 4 nodes: Dequant, Quant, e, f auto remove_nodes = 4; - MainTest(BuildProgramDesc(use_mkldnn, scale, scale), remove_nodes); - use_mkldnn = !use_mkldnn; - MainTest(BuildProgramDesc(use_mkldnn, scale, scale), remove_nodes); + CountNodeTest( + BuildConvRequantProgramDesc(use_mkldnn, scale_out, scale, scale), + remove_nodes); } -TEST(CpuQuantizeSquashPass, inequal_scales) { +// From Conv1->d->Dequant->e->Quant->f->Conv2 +// First change to Conv1->d->Requant->f->Conv2 +// Then Conv1->f->Conv2 +TEST(CpuQuantizeSquashPass, unequal_scales) { + auto scale_out = 1.0f; auto scale1 = 1.2345f; auto scale2 = 21.0f; auto use_mkldnn = true; - // Remove 3 nodes: Dequant, Quant, e - // Insert 1 node: requantize + // Remove 4 nodes: Dequant, Quant, e, d + auto remove_nodes = 4; + + CountNodeTest( + BuildConvRequantProgramDesc(use_mkldnn, scale_out, scale1, scale2), + remove_nodes); + + EqualScaleOutTest( + BuildConvRequantProgramDesc(use_mkldnn, scale_out, scale1, scale2), + "Conv1", scale2); +} + +// from +// a->Conv1->b->Dequant(Scale1)->c +// c->Quant1(Scale1)->d and d->Conv2->e +// c->Quant2(Scale2)->g and g->Conv4->h +// c->Conv3->f +// to +// a->Conv1->b +// b->Conv2->e +// b->Requant(Scale_in = Scale1; Scale_out = Scale2)->g->Conv4->h +// b->Dequant(Scale1)->c->Conv3->f +TEST(CpuQuantizeSquashPass, branch_to_equal_unequal_and_fp32) { + auto scale_out = 1.0f; + auto scale = 1.2345f; + auto scale2 = 21.0f; + auto use_mkldnn = true; + // Remove 3 nodes: Quant1, c, Quant2, + // Insert 1 node: Requant + auto remove_nodes = 2; + + CountNodeTest(BuildConvMultiOutputProgramDesc(use_mkldnn, scale_out, scale, + scale, scale2), + remove_nodes); + CheckRequantScalesTest(BuildConvMultiOutputProgramDesc(use_mkldnn, scale_out, + scale, scale, scale2), + scale, scale2); +} + +// a->Conv1->b->Requant->c +// d->Conv2->e->Requant->f +// {c,f}->Concat +TEST(CpuQuantizeSquashPass, equal_scales_squash_requantize) { + // Delete both requantize op + auto scale_out = 1.0f; + auto scale = 1.2345f; + auto use_mkldnn = true; + // Remove 4 nodes: b, Requant1, e, Requant2 + auto remove_nodes = 4; + CountNodeTest( + BuildConvsRequantConcatProgramDesc(use_mkldnn, scale_out, scale, scale), + 
remove_nodes); + + // check equal scale conv->scale_out and requant->scale_out + EqualScaleOutTest( + BuildConvsRequantConcatProgramDesc(use_mkldnn, scale_out, scale, scale), + "Conv1", scale); + EqualScaleOutTest( + BuildConvsRequantConcatProgramDesc(use_mkldnn, scale_out, scale, scale), + "Conv2", scale); +} + +// a->Concat->b->Dequant->c->Quant->d->Conv->e +// to a->Concat->b->Requant->d->Conv->e +TEST(CpuQuantizeSquashPass, + unequal_scales_squash_dequantize_quantize_into_requantize) { + auto scale_out = 1.0f; + auto scale = 1.2345f; + auto scale2 = 21.0f; + auto use_mkldnn = true; + // Remove 3 nodes: Dequant1, c, Quant + // Insert 1 node: Requant auto remove_nodes = 2; - MainTest(BuildProgramDesc(use_mkldnn, scale1, scale2), remove_nodes); - use_mkldnn = !use_mkldnn; - MainTest(BuildProgramDesc(use_mkldnn, scale1, scale2), remove_nodes); + CountNodeTest( + BuildConcatDequantQuantProgramDesc(use_mkldnn, scale_out, scale, scale2), + remove_nodes); + CheckRequantScalesTest( + BuildConcatDequantQuantProgramDesc(use_mkldnn, scale_out, scale, scale2), + scale, scale2); } -TEST(CpuQuantizeSquashPass, branch_to_equal_inequal_and_fp32) { - // Delete both quantize ops, - // bypass dequantize in both branches, - // insert requantize on one branch +// a->Conv1->b +// b->Requant1(Scale1)->c +// b->Requant2(Scale2)->d +TEST(CpuQuantizeSquashPass, more_than_one_conv_out_outputs) { + auto scale_out = 1.0f; auto scale = 1.2345f; auto scale2 = 21.0f; auto use_mkldnn = true; - // Remove 3 nodes: Quant1, Quant2, g - // Insert 1 node: requantize + // nothing change + auto remove_nodes = 0; + CountNodeTest( + BuildConvMultiRequantProgramDesc(use_mkldnn, scale_out, scale, scale2), + remove_nodes); +} + +// a->Conv1->c->Concat +TEST(CpuQuantizeSquashPass, conv_dequant_only_one_output) { + auto scale_out = 1.0f; + auto scale = 1.2345f; + auto use_mkldnn = true; + // remove 2 nodes: Dequant1, c auto remove_nodes = 2; - MainTest(BuildProgramDesc2(use_mkldnn, scale, scale, scale2), remove_nodes); + CountNodeTest(BuildConvDequantConcatProgramDesc(use_mkldnn, scale_out, scale), + remove_nodes); +} - use_mkldnn = !use_mkldnn; - MainTest(BuildProgramDesc2(use_mkldnn, scale, scale, scale2), remove_nodes); +TEST(CpuQuantizeSquashPass, conv_dequant_more_than_one_op_after_conv) { + auto scale_out = 1.0f; + auto scale = 1.2345f; + auto use_mkldnn = true; + // nothing change + auto remove_nodes = 0; + CountNodeTest(BuildConvDequantConvProgramDesc(use_mkldnn, scale_out, scale), + remove_nodes); } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc index a2092a50..6032f38b 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc @@ -13,39 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
*/
 
 #include "paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h"
-#include
-#include
-#include
-
-namespace paddle {
-namespace framework {
-namespace ir {
-
-void MKLDNNPlacementPass::ApplyImpl(ir::Graph* graph) const {
-  VLOG(3) << "Applies MKL-DNN placement strategy.";
-  const auto& op_types_list =
-      Get<std::unordered_set<std::string>>("mkldnn_enabled_op_types");
-  if (!graph->Has("use_mkldnn")) {
-    graph->Set("use_mkldnn", new bool(true));
-  }
-  for (const Node* n : graph->Nodes()) {
-    if (n->IsOp()) {
-      auto* op = n->Op();
-      if (op->HasAttr("use_mkldnn") || op->HasProtoAttr("use_mkldnn")) {
-        if (op_types_list.empty()) {
-          op->SetAttr("use_mkldnn", true);
-        } else if (std::find(op_types_list.begin(), op_types_list.end(),
-                             n->Name()) != op_types_list.end()) {
-          op->SetAttr("use_mkldnn", true);
-        }
-      }
-    }
-  }
-}
-
-}  // namespace ir
-}  // namespace framework
-}  // namespace paddle
 REGISTER_PASS(mkldnn_placement_pass,
               paddle::framework::ir::MKLDNNPlacementPass)
     .RequirePassAttr("mkldnn_enabled_op_types");
diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h
index ffa62273..98bd2d0a 100644
--- a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h
@@ -14,8 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include
-#include "paddle/fluid/framework/ir/pass.h"
+#include <string>
+#include <unordered_set>
+#include "paddle/fluid/framework/ir/placement_pass_base.h"
 
 namespace paddle {
 namespace framework {
@@ -24,9 +25,15 @@ namespace ir {
 
 /*
  * Specifies which operators should use MKLDNN.
  */
-class MKLDNNPlacementPass : public Pass {
- protected:
-  void ApplyImpl(ir::Graph* graph) const override;
+class MKLDNNPlacementPass : public PlacementPassBase {
+ private:
+  const std::string GetPlacementName() const { return "MKLDNN"; }
+
+  const std::string GetAttrName() const { return "use_mkldnn"; }
+
+  const std::unordered_set<std::string> GetOpTypesList() const {
+    return Get<std::unordered_set<std::string>>("mkldnn_enabled_op_types");
+  }
 };
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc
index 1019c4f8..fccc36ba 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/all_reduce_deps_pass.cc
@@ -131,7 +131,7 @@ class AllReduceDepsPass : public ir::Pass {
           auto right_in_vars =
               details::DynamicCast<details::VarHandle>(right->Inputs());
           PADDLE_ENFORCE_GT(left_in_vars.size(), 0);
-          PADDLE_ENFORCE_EQ(left_in_vars.size(), right_in_vars.size());
+          PADDLE_ENFORCE_GT(right_in_vars.size(), 0);
           return left_in_vars[0]->Name() > right_in_vars[0]->Name();
         });
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc
index d0afebcb..73d7bf6d 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc
@@ -29,14 +29,21 @@ namespace ir {
 class FuseAllReduceOpPass : public ir::Pass {
  protected:
   void ApplyImpl(ir::Graph *graph) const override {
-    ir::Graph &result = *graph;
+    if (Get<size_t>(details::kNRanks) <= 1) {
+      VLOG(6) << "The number of places is " << Get<size_t>(details::kNRanks)
+              << ", no need to apply FuseAllReduceOpPass.";
+      return;
+    }
+
     auto &places = Get<const std::vector<platform::Place>>(details::kPlaces);
     auto &local_scopes = Get<const std::vector<Scope *>>(details::kLocalScopes);
+
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
     auto *multi_nccl_ctxs =
         &Get<platform::NCCLCommunicator>(details::kNCCLCtxs);
 #endif
+    ir::Graph &result = *graph;
     auto &params_grads =
         result.Get<details::ParamsAndGrads>(details::kParamsAndDenseGrads);
     size_t num_of_all_reduce = params_grads.size();
@@ -49,7 +56,7 @@ class FuseAllReduceOpPass : public ir::Pass {
     std::unordered_map<std::string, Node *> all_reduce_ops =
         GetAllReduceOps(result, places, grads);
 
-    VLOG(10) << "Find all_reduce_ops: " << all_reduce_ops.size();
+    VLOG(6) << "Find all_reduce_ops: " << all_reduce_ops.size();
     if (all_reduce_ops.size() == 0) {
       return;
     }
@@ -58,11 +65,16 @@ class FuseAllReduceOpPass : public ir::Pass {
                       "The number of all_reduce OpHandle is not equal to the "
                       "number of grads. Maybe some gradients are sparse type, "
                       "it is not supported currently.");
-    VLOG(10) << "Insert fused_all_reduce";
     auto &group_params_grads = graph->Get<details::GroupParamsAndGrads>(
         details::kGroupParamsAndDenseGrads);
 
+    LOG(WARNING) << string::Sprintf(
+        "Found %d all_reduce operators. To speed up training, some of these "
+        "all_reduce ops are fused; after fusion, the number of all_reduce "
+        "ops is %d.",
+        all_reduce_ops.size(), group_params_grads.size());
+
     for (auto &group_p_g : group_params_grads) {
       size_t group_size = group_p_g.size();
       PADDLE_ENFORCE_GT(group_size, static_cast<size_t>(0));
@@ -203,4 +215,5 @@ class FuseAllReduceOpPass : public ir::Pass {
 }  // namespace paddle
 
 REGISTER_PASS(fuse_all_reduce_op_pass,
-              paddle::framework::ir::FuseAllReduceOpPass);
+              paddle::framework::ir::FuseAllReduceOpPass)
+    .RequirePassAttr(paddle::framework::details::kNRanks);
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc
index d6d9c8bb..224ab21b 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc
@@ -205,7 +205,7 @@ void MultiDevSSAGraphBuilderBase::ApplyImpl(ir::Graph *graph) const {
     }
 
     // Insert collective ops if nranks > 1
-    if (!is_forwarding && Get<size_t>(kNRanks) > 1) {
+    if (!is_forwarding && Get<size_t>(details::kNRanks) > 1) {
       try {
         bool is_bk_op = static_cast<bool>(boost::get<int>(node->Op()->GetAttr(
@@ -273,7 +273,7 @@ void MultiDevSSAGraphBuilderBase::InsertScaleLossGradOp(
       loss_scale = 1;
       break;
     case details::BuildStrategy::GradientScaleStrategy::kCoeffNumDevice:
-      loss_scale = Get<size_t>(kNRanks);
+      loss_scale = Get<size_t>(details::kNRanks);
       break;
     case details::BuildStrategy::GradientScaleStrategy::kCustomized:
       loss_scale = 0;
@@ -699,7 +699,7 @@ bool ReduceSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result,
 void ReduceSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const {
   if (UseGPU()) {
-    if (strategy_.fuse_broadcast_ops_) {
+    if (strategy_.fuse_broadcast_ops_ == true) {
       CreateFusedBroadcastOp(result, bcast_var_name_set_);
     } else {
       for (size_t dev_id = 0; dev_id < bcast_var_name_set_.size(); ++dev_id) {
@@ -1068,7 +1068,7 @@ void DistSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const {
       strategy_.reduce_ == details::BuildStrategy::ReduceStrategy::kReduce) {
     return;
   }
-  if (strategy_.fuse_broadcast_ops_) {
+  if (strategy_.fuse_broadcast_ops_ == true) {
     CreateFusedBroadcastOp(result, bcast_var_name_set_);
   } else {
     for (size_t dev_id = 0; dev_id < bcast_var_name_set_.size(); ++dev_id) {
@@ -1106,7 +1106,7 @@ static int MultiDevSSAGraphBuilderRegister(const std::string &builder_mode) {
       .RequirePassAttr(paddle::framework::details::kPlaces)                   \
.RequirePassAttr(paddle::framework::details::kLocalScopes) \ .RequirePassAttr(paddle::framework::ir::kStrategy) \ - .RequirePassAttr(paddle::framework::ir::kNRanks) + .RequirePassAttr(paddle::framework::details::kNRanks) REGISTER_MULTI_DEVICES_PASS(reduce_mode_multi_devices_pass, paddle::framework::ir::ReduceSSAGraphBuilder); diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h index 9b36d231..ea0455b6 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h @@ -35,7 +35,6 @@ namespace ir { constexpr char kLossVarName[] = "loss_var_name"; constexpr char kStrategy[] = "strategy"; -constexpr char kNRanks[] = "nranks"; class MultiDevSSAGraphBuilderBase : public ir::Pass { protected: @@ -124,7 +123,7 @@ class AsyncSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { const std::string &g_name) const override {} bool NeedCollectiveForGrad(const std::string &grad_name, - std::vector ops) const { + std::vector ops) const override { return false; } @@ -133,13 +132,6 @@ class AsyncSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { VLOG(1) << "set recv op do_not_run to true"; node->Op()->SetAttr("do_not_run", 1); node->Op()->Flush(); - } else if (node->Name() == "lookup_table" || node->Name() == "nce" || - node->Name() == "hierarchical_sigmoid") { - // in async_mode, we do not need remote prefetch, because communicator - // will do async parameter recv. - VLOG(1) << "set " << node->Name() << " op remote_prefetch to false"; - node->Op()->SetAttr("remote_prefetch", false); - node->Op()->Flush(); } return false; } diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.cc index a6c2b282..efd549e7 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_print_pass.h" #include #include #include #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/graph_printer.h" namespace paddle { namespace framework { @@ -29,7 +29,12 @@ class SSAGraghBuilderWithPrinterPass : public ir::Pass { std::unique_ptr fout( new std::ofstream(Get(kGraphvizPath))); PADDLE_ENFORCE(fout->good()); - Get("graph_printer").Print(*graph, *fout); + if (Has("graph_printer")) { + Get("graph_printer").Print(*graph, *fout); + } else { + GraphvizSSAGraphPrinter printer; + printer.Print(*graph, *fout); + } } }; diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc index 3a5333d0..7de3b7c6 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc @@ -17,7 +17,6 @@ #include #include #include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/op_proto_maker.h" diff --git a/paddle/fluid/framework/ir/ngraph_subgraph_pass.cc b/paddle/fluid/framework/ir/ngraph_subgraph_pass.cc index 4664d13d..6198fab7 100644 --- a/paddle/fluid/framework/ir/ngraph_subgraph_pass.cc +++ b/paddle/fluid/framework/ir/ngraph_subgraph_pass.cc @@ -47,8 +47,8 @@ std::string GenerateEngineKey(const std::set &engine_inputs, return engine_key; } -void NgraphSubgraphPass::ApplyImpl(ir::Graph *graph) const { - PADDLE_ENFORCE(graph); +void NgraphSubgraphPass::ApplyImpl(Graph *graph) const { + PADDLE_ENFORCE_NOT_NULL(graph); FusePassBase::Init("ngraph_subgraph_pass", graph); std::unordered_set nodes2delete; @@ -66,15 +66,13 @@ void NgraphSubgraphPass::ApplyImpl(ir::Graph *graph) const { if (node->IsOp() && !ANAT::Agent(node).subgraph()->empty()) { OpDesc *op_desc = node->Op(); op_desc->SetType("ngraph_engine"); - for (auto it = ANAT::Agent(node).subgraph()->begin(); - it != ANAT::Agent(node).subgraph()->end(); ++it) { - } CreateNgraphEngineOp(node, graph); std::unordered_set nodes2remove( ANAT::Agent(node).subgraph()->begin(), ANAT::Agent(node).subgraph()->end()); + GraphSafeRemoveNodes(graph, nodes2remove); } } @@ -85,70 +83,100 @@ void NgraphSubgraphPass::ApplyImpl(ir::Graph *graph) const { nodes2remove.insert(node); } } + framework::ir::GraphSafeRemoveNodes(graph, nodes2remove); - std::vector nodes = ir::TopologySortOperations(*graph); + // std::vector nodes = ir::TopologySortOperations(*graph); } -void NgraphSubgraphPass::CreateNgraphEngineOp(framework::ir::Node *node, - Graph *graph) const { - auto *op_desc = node->Op(); +bool IsValid(std::string name) { + return name.find(Node::kControlDepVarName) == std::string::npos; +} + +void UpdateNgraphIO(Node *node, Graph *graph, + std::vector *input_names, + std::vector *output_names) { + bool is_test = true, has_fetch = false; + for (Node *node : graph->Nodes()) { + if (node->IsOp() && node->Name().find("_grad") != std::string::npos) { + is_test = false; + } + if (node->IsVar() && node->Var()) { + for (auto out : node->outputs) { + if (out->Name() == "fetch") has_fetch = true; + } + } + } + if (is_test && has_fetch) { + for (auto *x : node->inputs) { + (*input_names).emplace_back(x->Name()); + } + for (auto *x : node->outputs) { + 
(*output_names).emplace_back(x->Name()); + } + return; + } + auto &subgraph = *ANAT::Agent(node).subgraph(); - PADDLE_ENFORCE(!subgraph.empty()); + std::unordered_set inputs; + std::unordered_set outputs; + for (auto *node : subgraph) { + for (auto in : node->inputs) { + auto name = in->Name(); + if (!IsValid(name)) continue; + if (!outputs.count(name) && !inputs.count(name)) { + (*input_names).emplace_back(name); + inputs.insert(name); + } + } + for (auto out : node->outputs) { + auto name = out->Name(); + if (!IsValid(name)) continue; + outputs.insert(name); + (*output_names).emplace_back(name); + } + } +} - framework::ProgramDesc *program_desc = - Get("program"); - const framework::BlockDesc &main_block = - program_desc->Block(framework::kRootBlockIndex); - framework::BlockDesc *new_block = program_desc->AppendBlock(main_block); +void NgraphSubgraphPass::CreateNgraphEngineOp(Node *node, Graph *graph) const { + auto &subgraph = *ANAT::Agent(node).subgraph(); + PADDLE_ENFORCE_NE(subgraph.empty(), true, "subgraph cannot be empty"); framework::proto::BlockDesc block_proto; framework::BlockDesc block_desc(nullptr, &block_proto); block_desc.Proto()->set_parent_idx(-1); block_desc.Proto()->set_idx(0); for (auto *node : subgraph) { - auto *new_block_op = new_block->AppendOp(); auto *op = block_desc.AppendOp(); - *new_block_op->Proto() = *node->Op()->Proto(); *op->Proto() = *node->Op()->Proto(); } - - std::set input_names; - std::set input_names_with_id; - for (auto *x : node->inputs) { - input_names.insert(x->Name()); - input_names_with_id.insert(x->Name() + std::to_string(x->id())); - } - op_desc->SetInput( - "Xs", std::vector(input_names.begin(), input_names.end())); - - std::set output_names; - std::set output_names_with_id; - - for (auto *x : node->outputs) { - output_names.insert(x->Name()); - output_names_with_id.insert(x->Name() + std::to_string(x->id())); - } - op_desc->SetOutput( - "Ys", std::vector(output_names.begin(), output_names.end())); auto *vars = block_desc.Proto()->mutable_vars(); - for (framework::ir::Node *node : graph->Nodes()) { + for (Node *node : graph->Nodes()) { if (node->IsVar() && node->Var()) { *vars->Add() = *node->Var()->Proto(); } } + PADDLE_ENFORCE_NE(block_desc.Proto()->vars().empty(), true, + "the block has no var-desc"); - PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(), - "the block has no var-desc"); - - op_desc->SetType("ngraph_engine"); + std::vector input_names; + std::vector output_names; + UpdateNgraphIO(node, graph, &input_names, &output_names); + auto *op_desc = node->Op(); + op_desc->SetInput( + "Xs", std::vector(input_names.begin(), input_names.end())); + op_desc->SetOutput( + "Ys", std::vector(output_names.begin(), output_names.end())); int sgs = subgraph.size(); - std::string engine_key = GenerateEngineKey( - input_names_with_id, output_names_with_id, std::to_string(sgs)); + std::string subgraph_str = block_desc.Proto()->SerializeAsString(); + std::string engine_key = + std::to_string(std::hash()(subgraph_str)); std::vector interval{0, sgs}; + op_desc->SetType("ngraph_engine"); op_desc->SetAttr("interval", interval); - op_desc->SetAttr("graph", block_desc.Proto()->SerializeAsString()); + op_desc->SetAttr("graph", subgraph_str); op_desc->SetAttr("engine_key", engine_key); + op_desc->SetAttr("op_role", 0); } } // namespace ir diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index 276e6a5b..fbc0d759 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -66,12 +66,12 @@ class Node 
{
   std::string Name() const { return name_; }
 
   VarDesc* Var() const {
-    PADDLE_ENFORCE(IsVar());
+    PADDLE_ENFORCE_EQ(IsVar(), true);
     return var_desc_.get();
   }
 
   OpDesc* Op() const {
-    PADDLE_ENFORCE(IsOp());
+    PADDLE_ENFORCE_EQ(IsOp(), true);
     return op_desc_.get();
   }
@@ -99,7 +99,7 @@ class Node {
   // Test if the Node is wrapped by type T.
   template <typename T>
-  bool IsWrappedBy() {
+  bool IsWrappedBy() const {
     return std::type_index(typeid(T)) == wrapper_type_;
   }
diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc
index ca8e8299..b4cfda91 100644
--- a/paddle/fluid/framework/ir/pass.cc
+++ b/paddle/fluid/framework/ir/pass.cc
@@ -24,6 +24,7 @@ namespace framework {
 namespace ir {
 
 Graph* Pass::Apply(Graph* graph) const {
+  CheckPrevPass();
   PADDLE_ENFORCE(graph, "graph passed to Pass::Apply() cannot be empty.");
   for (const std::string& attr : required_pass_attrs_) {
     PADDLE_ENFORCE(attrs_.find(attr) != attrs_.end(),
@@ -36,10 +37,15 @@ Graph* Pass::Apply(Graph* graph) const {
   ApplyImpl(graph);
   // TODO(panyx0718): Add more verifications.
   PADDLE_ENFORCE(!HasCircle(*graph),
-                 "Illegal Pass. Generated graph shouldn't has cycle.");
+                 "Illegal Pass %s. Generated graph shouldn't have cycle.",
+                 Type());
   PADDLE_ENFORCE(VarDescIsConsistency(*graph),
                  "The VarDescs of persistable variable are not consistency.");
   applied_ = true;
+  if (!graph->Has(kPassRecorder)) {
+    graph->Set(kPassRecorder, new PassRecorder);
+  }
+  graph->Get<PassRecorder>(kPassRecorder).insert(Type());
   return graph;
 }
diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h
index 6cbe9a82..cf6b8d13 100644
--- a/paddle/fluid/framework/ir/pass.h
+++ b/paddle/fluid/framework/ir/pass.h
@@ -20,6 +20,7 @@ limitations under the License. */
 #include
 #include
 #include
+#include <unordered_set>
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/framework/program_desc.h"
@@ -31,6 +32,9 @@ namespace ir {
 template <typename PassType>
 struct PassRegistrar;
 
+typedef std::unordered_set<std::string> PassRecorder;
+constexpr char kPassRecorder[] = "pass_recorder";
+
 class Pass {
  public:
   Pass() = default;
@@ -104,6 +108,10 @@ class Pass {
     LOG(FATAL) << "Calling virtual Pass not implemented.";
   }
 
+  // Some Passes must be placed before this Pass, and some
+  // must be placed after it.
+  virtual void CheckPrevPass() const {}
+
  private:
   template <typename PassType>
   friend struct PassRegistrar;
diff --git a/paddle/fluid/framework/ir/pass_builder.cc b/paddle/fluid/framework/ir/pass_builder.cc
index e0719867..8355764a 100644
--- a/paddle/fluid/framework/ir/pass_builder.cc
+++ b/paddle/fluid/framework/ir/pass_builder.cc
@@ -13,12 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
*/ #include "paddle/fluid/framework/ir/pass_builder.h" +#include +#include namespace paddle { namespace framework { namespace ir { std::shared_ptr PassBuilder::AppendPass(const std::string& pass_type) { + VLOG(1) << "Append " << pass_type; auto pass = ir::PassRegistry::Instance().Get(pass_type); passes_.emplace_back(pass.release()); return passes_.back(); diff --git a/paddle/fluid/framework/ir/pass_test.cc b/paddle/fluid/framework/ir/pass_test.cc index 87e3c964..44fddd80 100644 --- a/paddle/fluid/framework/ir/pass_test.cc +++ b/paddle/fluid/framework/ir/pass_test.cc @@ -99,7 +99,7 @@ TEST(PassTest, TestPassAttrCheck) { } catch (paddle::platform::EnforceNotMet e) { exception = std::string(e.what()); } - ASSERT_TRUE(exception.find("shouldn't has cycle") != exception.npos); + ASSERT_TRUE(exception.find("shouldn't have cycle") != exception.npos); } } // namespace ir diff --git a/paddle/fluid/framework/ir/pass_tester_helper.h b/paddle/fluid/framework/ir/pass_tester_helper.h new file mode 100644 index 00000000..8df292b4 --- /dev/null +++ b/paddle/fluid/framework/ir/pass_tester_helper.h @@ -0,0 +1,338 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include "paddle/fluid/framework/op_proto_maker.h" + +namespace paddle { +namespace framework { +namespace ir { + +struct Layers { + public: + const ProgramDesc& main_program() { return program_; } + + VarDesc* data(std::string name, std::vector shape = {}, + bool is_persistable = false) { + return lod_tensor(name, shape, is_persistable); + } + + VarDesc* conv2d(VarDesc* input, VarDesc* filter, VarDesc* bias, + bool use_cudnn = false) { + VarDesc* out = lod_tensor(unique_name()); + OpDesc* op = program_.MutableBlock(0)->AppendOp(); + op->SetType("conv2d"); + op->SetInput("Input", {input->Name()}); + op->SetInput("Filter", {filter->Name()}); + op->SetInput("Bias", {bias->Name()}); + op->SetOutput("Out", {out->Name()}); + op->SetAttr("use_cudnn", use_cudnn); + op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), + static_cast(OpRole::kForward)); + return out; + } + + VarDesc* depthwise_conv2d(VarDesc* input, VarDesc* filter, VarDesc* bias, + bool use_cudnn) { + VarDesc* out = lod_tensor(unique_name()); + OpDesc* op = program_.MutableBlock(0)->AppendOp(); + op->SetType("depthwise_conv2d"); + op->SetInput("Input", {input->Name()}); + op->SetInput("Filter", {filter->Name()}); + op->SetInput("Bias", {bias->Name()}); + op->SetOutput("Out", {out->Name()}); + op->SetAttr("use_cudnn", use_cudnn); + op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), + static_cast(OpRole::kForward)); + return out; + } + + VarDesc* pool2d(VarDesc* x, bool use_cudnn) { + VarDesc* out = lod_tensor(unique_name()); + OpDesc* op = program_.MutableBlock(0)->AppendOp(); + op->SetType("pool2d"); + op->SetInput("X", {x->Name()}); + op->SetOutput("Out", {out->Name()}); + op->SetAttr("use_cudnn", use_cudnn); + op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), + 
static_cast(OpRole::kForward)); + return out; + } + + VarDesc* relu(VarDesc* x, VarDesc* out = nullptr) { + return unary_op("relu", x, out); + } + + VarDesc* fc(VarDesc* input, VarDesc* w, VarDesc* bias, + int in_num_col_dims = 1, std::string activation_type = "") { + VarDesc* out = lod_tensor(unique_name()); + OpDesc* op = program_.MutableBlock(0)->AppendOp(); + op->SetType("fc"); + op->SetInput("Input", {input->Name()}); + op->SetInput("W", {w->Name()}); + op->SetInput("Bias", {bias->Name()}); + op->SetOutput("Out", {out->Name()}); + op->SetAttr("in_num_col_dims", in_num_col_dims); + op->SetAttr("activation_type", activation_type); + op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), + static_cast(OpRole::kForward)); + return out; + } + + VarDesc* mul(VarDesc* x, VarDesc* y, VarDesc* out = nullptr, + int x_num_col_dims = 1) { + AttributeMap attrs; + attrs["x_num_col_dims"] = 1; + return binary_op("mul", x, y, out, &attrs); + } + + VarDesc* elementwise_add(VarDesc* x, VarDesc* y, VarDesc* out = nullptr) { + return binary_op("elementwise_add", x, y, out); + } + + VarDesc* dropout(VarDesc* x, float dropout_prob, + std::string dropout_implementation) { + VarDesc* out = lod_tensor(unique_name()); + OpDesc* op = program_.MutableBlock(0)->AppendOp(); + op->SetType("dropout"); + op->SetInput("X", {x->Name()}); + op->SetOutput("Out", {out->Name()}); + op->SetAttr("is_test", true); + op->SetAttr("dropout_prob", dropout_prob); + op->SetAttr("dropout_implementation", dropout_implementation); + op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), + static_cast(OpRole::kForward)); + return out; + } + + VarDesc* concat(std::vector inputs, int axis = -1) { + VarDesc* out = lod_tensor(unique_name()); + OpDesc* op = program_.MutableBlock(0)->AppendOp(); + op->SetType("concat"); + std::vector input_names(inputs.size()); + for (size_t i = 0; i < inputs.size(); ++i) { + input_names[i] = inputs[i]->Name(); + } + op->SetInput("X", input_names); + op->SetOutput("Out", {out->Name()}); + op->SetAttr("axis", axis); + op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), + static_cast(OpRole::kForward)); + return out; + } + + std::vector layer_norm(VarDesc* x, VarDesc* scale = nullptr, + VarDesc* bias = nullptr) { + VarDesc* y = lod_tensor(unique_name()); + VarDesc* mean = lod_tensor(unique_name()); + VarDesc* variance = lod_tensor(unique_name()); + OpDesc* op = program_.MutableBlock(0)->AppendOp(); + op->SetType("layer_norm"); + op->SetInput("X", {x->Name()}); + if (scale) { + op->SetInput("Scale", {scale->Name()}); + } + if (bias) { + op->SetInput("Bias", {bias->Name()}); + } + op->SetOutput("Y", {y->Name()}); + op->SetOutput("Mean", {mean->Name()}); + op->SetOutput("Variance", {variance->Name()}); + op->SetAttr("epsilon", static_cast(1E-05)); + op->SetAttr("begin_norm_axis", static_cast(1)); + op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), + static_cast(OpRole::kForward)); + std::vector outs = {y, mean, variance}; + return outs; + } + + private: + VarDesc* lod_tensor(std::string name, std::vector shape = {}, + bool is_persistable = false) { + auto* var = program_.MutableBlock(0)->Var(name); + var->SetType(proto::VarType::LOD_TENSOR); + var->SetShape(shape); + var->SetPersistable(is_persistable); + return var; + } + + VarDesc* unary_op(std::string type, VarDesc* x, VarDesc* out = nullptr) { + if (!out) { + out = lod_tensor(unique_name()); + } + OpDesc* op = program_.MutableBlock(0)->AppendOp(); + op->SetType(type); + op->SetInput("X", {x->Name()}); + op->SetOutput("Out", {out->Name()}); + 
op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), + static_cast(OpRole::kForward)); + return out; + } + + VarDesc* binary_op(std::string type, VarDesc* x, VarDesc* y, + VarDesc* out = nullptr, + const AttributeMap* attrs = nullptr) { + if (!out) { + out = lod_tensor(unique_name()); + } + OpDesc* op = program_.MutableBlock(0)->AppendOp(); + op->SetType(type); + op->SetInput("X", {x->Name()}); + op->SetInput("Y", {y->Name()}); + op->SetOutput("Out", {out->Name()}); + if (attrs) { + for (auto& iter : *attrs) { + op->SetAttr(iter.first, iter.second); + } + } + op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), + static_cast(OpRole::kForward)); + return out; + } + + std::string unique_name() { return "tmp_" + std::to_string(idx_++); } + + private: + ProgramDesc program_; + int idx_{0}; +}; + +static std::string DebugString(OpDesc* op) { + std::ostringstream os; + os << "Op(" << op->Type() << "), inputs:{"; + bool is_first = true; + for (auto& name : op->InputNames()) { + if (!is_first) { + os << ", "; + } + os << name << "["; + bool is_first_var_name = true; + for (auto& var_name : op->Input(name)) { + if (!is_first_var_name) { + os << ", "; + } + os << var_name; + is_first_var_name = false; + } + os << "]"; + is_first = false; + } + + os << "}, outputs:{"; + is_first = true; + for (auto& name : op->OutputNames()) { + if (!is_first) { + os << ", "; + } + os << name << "["; + bool is_first_var_name = true; + for (auto& var_name : op->Output(name)) { + if (!is_first_var_name) { + os << ", "; + } + os << var_name; + is_first_var_name = false; + } + os << "]"; + is_first = false; + } + os << "}"; + return os.str(); +} + +static std::string DebugString(Node* node) { + std::ostringstream os; + if (node->IsOp() && node->Op()) { + OpDesc* op = node->Op(); + os << "Node(" << DebugString(op) << "), inputs:{"; + bool is_first = true; + for (auto* in : node->inputs) { + if (!is_first) { + os << ", "; + } + os << in->Name(); + is_first = false; + } + os << "}, outputs:{"; + is_first = true; + for (auto* out : node->outputs) { + if (!is_first) { + os << ", "; + } + os << out->Name(); + is_first = false; + } + os << "}."; + } else if (node->IsVar() && node->Var()) { + os << "Node(" << node->Name() << "), inputs:{"; + bool is_first = true; + for (auto* in : node->inputs) { + if (!is_first) { + os << ", "; + } + if (in->IsOp() && in->Op()) { + os << in->Op()->Type(); + } + is_first = false; + } + os << "}, outputs:{"; + is_first = true; + for (auto* out : node->outputs) { + if (!is_first) { + os << ", "; + } + if (out->IsOp() && out->Op()) { + os << out->Op()->Type(); + } + is_first = false; + } + os << "}"; + } + return os.str(); +} + +static std::string DebugString(const std::unique_ptr& graph) { + std::ostringstream os; + os << "Graph: {\n"; + for (auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op()) { + os << " "; + } else if (node->IsVar() && node->Var()) { + os << " "; + } + os << DebugString(node) << "\n"; + } + os << "}\n"; + return os.str(); +} + +static int GetNumOpNodes(const std::unique_ptr& graph, + std::string op_type) { + int num_nodes = 0; + for (auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op() && node->Op()->Type() == op_type) { + num_nodes++; + } + } + return num_nodes; +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/placement_pass_base.cc b/paddle/fluid/framework/ir/placement_pass_base.cc new file mode 100644 index 00000000..1ac7e4d6 --- /dev/null +++ 
b/paddle/fluid/framework/ir/placement_pass_base.cc @@ -0,0 +1,69 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/placement_pass_base.h" +#include +#include +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +namespace ir { + +void PlacementPassBase::ApplyImpl(ir::Graph* graph) const { + VLOG(3) << "Applies " << GetPlacementName() << " placement strategy."; + std::string attr_name = GetAttrName(); + const auto& op_types_list = GetOpTypesList(); + if (!graph->Has(attr_name)) { + graph->Set(attr_name, new bool(true)); + } + for (const Node* n : graph->Nodes()) { + if (n->IsOp()) { + auto* op = n->Op(); + if ((op->HasAttr(attr_name) || op->HasProtoAttr(attr_name)) && + IsSupport(op->Type())) { + if (op_types_list.empty()) { + op->SetAttr(attr_name, true); + } else if (std::find(op_types_list.begin(), op_types_list.end(), + n->Name()) != op_types_list.end()) { + op->SetAttr(attr_name, true); + } + } + } + } +} + +bool PlacementPassBase::IsSupport(const std::string& op_type) const { + if (GetAttrName() == "use_cudnn") { + auto& all_kernels = OperatorWithKernel::AllOpKernels(); + auto it = all_kernels.find(op_type); + if (it == all_kernels.end()) { + // All control operators don't have kernel. + return false; + } + for (auto& kernel_pair : it->second) { + if (platform::is_gpu_place(kernel_pair.first.place_) && + (kernel_pair.first.library_type_ == LibraryType::kCUDNN)) { + return true; + } + } + } else if (GetAttrName() == "use_mkldnn") { + return true; + } + return false; +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/placement_pass_base.h b/paddle/fluid/framework/ir/placement_pass_base.h new file mode 100644 index 00000000..91693e7b --- /dev/null +++ b/paddle/fluid/framework/ir/placement_pass_base.h @@ -0,0 +1,42 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Specifies which operators should use cuDNN. 
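+ * or MKL-DNN. Subclasses provide the placement name, the attribute to set
+ * (e.g. "use_mkldnn" or "use_cudnn") and the list of enabled op types; an
+ * empty list turns the attribute on for every supported op that declares it.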
+ */ +class PlacementPassBase : public Pass { + protected: + void ApplyImpl(ir::Graph* graph) const override; + + virtual const std::string GetPlacementName() const = 0; + virtual const std::string GetAttrName() const = 0; + virtual const std::unordered_set GetOpTypesList() const = 0; + + private: + bool IsSupport(const std::string& op_type) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc index 00263b8a..45157ca1 100644 --- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #include "paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h" #include // for max @@ -25,55 +25,84 @@ namespace paddle { namespace framework { namespace ir { -PDNode* BuildRepeatedFCReluPattern(PDPattern* pattern, - const std::string& name_scope, int num_fc) { +static bool IsInputOfFC(Node* n) { + if (n && n->IsVar() && VarLinksToOp(n, "fc")) { + return true; + } + return false; +} + +static bool IsOutputOfFC(Node* n) { + if (n && n->IsVar() && VarLinksFromOp(n, "fc") && n->inputs.size() == 1U) { + return true; + } + return false; +} + +static bool IsFCWithAct(Node* n, const std::string& act_type = "relu") { + if (n && n->IsOp() && n->Op() && n->Op()->Type() == "fc" && + n->inputs.size() == 3U && n->outputs.size() == 1U) { + return boost::get(n->Op()->GetAttr("activation_type")) == + act_type; + } + return false; +} + +static bool IsParamOfFC(Node* n, const std::string& param_name) { + if (IsInputOfFC(n) && n->inputs.empty() && + (n->Name() == n->outputs[0]->Op()->Input(param_name)[0])) { + return true; + } + return false; +} + +static int FindFCIdx(Node* x, const std::string& act_type = "relu") { + if (!IsInputOfFC(x)) { + return -1; + } + for (size_t k = 0; k < x->outputs.size(); ++k) { + auto* out_op = x->outputs[k]; + if (IsFCWithAct(out_op, act_type) && out_op->outputs.size() == 1U) { + return k; + } + } + return -1; +} + +static int FindInputIdx(Node* n, const std::string& name, + const std::string& act_type = "relu") { + if (!IsFCWithAct(n, act_type)) { + return -1; + } + for (size_t i = 0; i < n->inputs.size(); ++i) { + if (n->inputs[i]->Name() == n->Op()->Input(name)[0]) { + return i; + } + } + return -1; +} + +void BuildRepeatedFCReluPattern(PDPattern* pattern, + const std::string& name_scope, int num_fc) { auto var_next_is_fc_act = [=](Node* x, const std::string& act_type = "relu", bool check_in_has_only_one_out = true, int fc_idx = 0) -> bool { - bool next_is_fc = x && x->IsVar() && VarLinksToOp(x, "fc"); - if (check_in_has_only_one_out) { - next_is_fc = next_is_fc && x->outputs.size() == 1; - } - if (!next_is_fc) { + if (!IsInputOfFC(x)) { return false; } - auto* fc_op = x->outputs[fc_idx]; - bool next_is_act = fc_op && fc_op->IsOp() && fc_op->outputs.size() == 1 && - fc_op->outputs[0] && fc_op->outputs[0]->IsVar() && - VarLinksToOp(fc_op->outputs[0], act_type) && - fc_op->outputs[0]->outputs.size() == 1; - if (!next_is_act) { + if (check_in_has_only_one_out && x->outputs.size() != 1U) { return false; } - auto* act_op = fc_op->outputs[0]->outputs[0]; - return act_op && act_op->IsOp() && act_op->outputs.size() == 1; - }; - - auto find_fc_idx = [=](Node* x, const std::string& act_type = "relu") -> int { - bool next_is_fc = x && x->IsVar() && VarLinksToOp(x, "fc"); - if (!next_is_fc) { - return 0; - } - for (size_t k = 0; k < x->outputs.size(); ++k) { - auto* fc_op = x->outputs[k]; - bool next_is_act = fc_op && fc_op->IsOp() && fc_op->outputs.size() == 1 && - fc_op->outputs[0] && fc_op->outputs[0]->IsVar() && - VarLinksToOp(fc_op->outputs[0], act_type) && - fc_op->outputs[0]->outputs.size() == 1; - if (!next_is_act) { - continue; - } - auto* act_op = fc_op->outputs[0]->outputs[0]; - if (act_op && act_op->IsOp() && act_op->outputs.size() == 1) { - return k; - } - } - return 0; + auto* fc_op = x->outputs[fc_idx]; + return IsFCWithAct(fc_op, act_type) && fc_op->outputs.size() == 1U; }; + // in -> fc -> out + // Current x is in, return fc's out which is next fc's input. 
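+// For example, with three fcs the matched chain is
+//   in -> fc_0 -> out_0 -> fc_1 -> out_1 -> fc_2 -> out_2
+// next_var_of_part(x, fc_idx) returns the output of x's fc_idx-th consumer
+// fc. Since relu is fused into each fc (activation_type == "relu"), there
+// is no separate relu op or variable between consecutive fcs.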
auto next_var_of_part = [=](Node* x, int fc_idx = 0) -> Node* { - return x->outputs[fc_idx]->outputs[0]->outputs[0]->outputs[0]; + return x->outputs[fc_idx]->outputs[0]; }; + auto var_next_is_fc_act_repeated_n_times = [=]( Node* x, int repeated_times, const std::string& act_type = "relu", bool check_in_has_only_one_out = true) -> bool { @@ -87,25 +116,14 @@ PDNode* BuildRepeatedFCReluPattern(PDPattern* pattern, return true; }; + // x is output of fc auto var_before_is_fc_act = [=](Node* x, const std::string& act_type = "relu", bool at_top = false) -> bool { - bool before_is_act = - x && x->IsVar() && x->inputs.size() == 1 && VarLinksFromOp(x, "relu"); - if (!before_is_act) { + if (!IsOutputOfFC(x)) { return false; } - auto* relu_op = x->inputs[0]; - bool before_is_fc = relu_op->IsOp() && relu_op->inputs.size() == 1 && - relu_op->inputs[0]->IsVar() && - VarLinksFromOp(relu_op->inputs[0], "fc") && - relu_op->inputs[0]->inputs.size() == 1; - - if (!before_is_fc) { - return false; - } - auto* fc_op = relu_op->inputs[0]->inputs[0]; - bool is_fc = fc_op->IsOp() && fc_op->inputs.size() == 3; - if (!is_fc) { + auto* fc_op = x->inputs[0]; + if (!IsFCWithAct(fc_op, act_type) || fc_op->inputs.size() != 3U) { return false; } for (auto* fc_i : fc_op->inputs) { @@ -113,7 +131,7 @@ PDNode* BuildRepeatedFCReluPattern(PDPattern* pattern, if (at_top) { return true; } else { - return VarLinksFromOp(fc_i, "relu"); + return VarLinksFromOp(fc_i, "fc"); } } } @@ -121,10 +139,11 @@ PDNode* BuildRepeatedFCReluPattern(PDPattern* pattern, }; auto before_var_of_part = [=](Node* x) -> Node* { - auto* fc_op = x->inputs[0]->inputs[0]; - for (auto* fc_i : fc_op->inputs) { - if (!fc_i->inputs.empty()) { - return fc_i->inputs[0]; + auto* fc_op = x->inputs[0]; + for (auto* in : fc_op->inputs) { + if (!in->inputs.empty()) { + // w and bias has no input. 
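+        // (their input lists are empty), so the first input that does have
+        // a producer is the preceding fc's output variable.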
+ return in; } } return nullptr; @@ -142,76 +161,76 @@ PDNode* BuildRepeatedFCReluPattern(PDPattern* pattern, return true; }; - std::vector fc_input_var(num_fc); + PDNode* fc_input_var_0 = nullptr; std::vector fc_output_var(num_fc); std::vector fc_weight_var(num_fc); std::vector fc_bias_var(num_fc); std::vector fc_ops(num_fc); - std::vector relu_ops(num_fc); for (int i = 0; i < num_fc; ++i) { - fc_input_var[i] = pattern->NewNode( - [=](Node* x) { - if (i == 0 && x->outputs.size() > 0) { - bool ok = x->inputs.size() > 0; - if (!ok) { + if (i == 0) { + fc_input_var_0 = pattern->NewNode( + [=](Node* x) { + if (x->outputs.size() <= 0 || x->inputs.size() <= 0U) { return false; } - int idx = find_fc_idx(x); - if (idx == 0) { + int fc_idx = FindFCIdx(x); + if (fc_idx < 0) { + return false; + } else if (fc_idx == 0) { return var_next_is_fc_act_repeated_n_times(x, num_fc - i, "relu"); } else { - x = next_var_of_part(x, idx); + x = next_var_of_part(x, fc_idx); return var_next_is_fc_act_repeated_n_times( x, std::max(1, num_fc - i - 1), "relu"); } - } else { - return var_next_is_fc_act_repeated_n_times(x, num_fc - i, "relu") && - x->inputs.size() > 0 && - var_before_is_fc_act_repeated_n_times(x, i, "relu"); - } - }, - name_scope + "/fc_in_" + std::to_string(i)); + }, + name_scope + "/fc_in_0"); + } fc_weight_var[i] = pattern->NewNode( [=](Node* x) { + if (!IsParamOfFC(x, "W")) { + return false; + } + auto* fc_op = x->outputs[0]; + int input_idx = FindInputIdx(fc_op, "Input", "relu"); return var_next_is_fc_act_repeated_n_times(x, num_fc - i, "relu") && - x->inputs.empty() && - var_before_is_fc_act_repeated_n_times(x->outputs[0]->inputs[0], - i, "relu") && - x->Name() == x->outputs[0]->Op()->Input("W")[0]; + var_before_is_fc_act_repeated_n_times(fc_op->inputs[input_idx], + i, "relu"); }, name_scope + "/fc_weight_" + std::to_string(i)); fc_bias_var[i] = pattern->NewNode( [=](Node* x) { + if (!IsParamOfFC(x, "Bias")) { + return false; + } + auto* fc_op = x->outputs[0]; + int input_idx = FindInputIdx(fc_op, "Input", "relu"); return var_next_is_fc_act_repeated_n_times(x, num_fc - i, "relu") && - x->inputs.empty() && - var_before_is_fc_act_repeated_n_times(x->outputs[0]->inputs[0], - i, "relu") && - x->Name() == x->outputs[0]->Op()->Input("Bias")[0]; + var_before_is_fc_act_repeated_n_times(fc_op->inputs[input_idx], + i, "relu"); }, name_scope + "/fc_bias_" + std::to_string(i)); fc_output_var[i] = pattern->NewNode( [=](Node* x) { - bool basic = x && x->IsVar() && VarLinksFromOp(x, "fc") && - VarLinksToOp(x, "relu") && x->inputs.size() == 1 && - x->inputs[0]->inputs.size() == 3; - if (!basic) { + if (!IsOutputOfFC(x)) { return false; } - x = x->inputs[0]->inputs[0]; - if (i == 0 && x->outputs.size() > 0) { - bool ok = x->inputs.size() > 0; - if (!ok) { + x = before_var_of_part(x); + if (i == 0 && x->outputs.size() > 0U) { + if (x->inputs.size() <= 0U) { return false; } - int idx = find_fc_idx(x); - if (idx == 0) { + int fc_idx = FindFCIdx(x); + if (fc_idx < 0) { + return false; + } else if (fc_idx == 0) { return var_next_is_fc_act_repeated_n_times(x, num_fc - i, "relu"); } else { - x = next_var_of_part(x, idx); + x = next_var_of_part(x, fc_idx); return var_next_is_fc_act_repeated_n_times( x, std::max(1, num_fc - i - 1), "relu"); } @@ -225,53 +244,29 @@ PDNode* BuildRepeatedFCReluPattern(PDPattern* pattern, fc_ops[i] = pattern->NewNode( [=](Node* x) { - bool basic = x && x->IsOp() && x->Op()->Type() == "fc" && - x->inputs.size() == 3 && x->outputs.size() == 1; - if (!basic) { + if (!IsFCWithAct(x, "relu")) { 
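+          // reject ops that are not an fc with a fused relu activation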
return false; } auto* fc_out_var = x->outputs[0]; return fc_out_var && fc_out_var->IsVar() && fc_out_var->outputs.size() == 1 && - VarLinksToOp(fc_out_var, "relu") && - fc_out_var->outputs[0]->outputs.size() == 1 && - var_next_is_fc_act_repeated_n_times( - fc_out_var->outputs[0]->outputs[0], num_fc - i - 1, - "relu") && - var_before_is_fc_act_repeated_n_times( - fc_out_var->outputs[0]->outputs[0], i + 1, "relu"); - }, - name_scope + "/fc_op_" + std::to_string(i)); - - relu_ops[i] = pattern->NewNode( - [=](Node* x) { - return x && x->IsOp() && x->Op()->Type() == "relu" && - x->inputs.size() == 1 && x->outputs.size() == 1 && - x->inputs[0]->IsVar() && VarLinksFromOp(x->inputs[0], "fc") && - x->outputs[0]->IsVar() && - var_next_is_fc_act_repeated_n_times(x->outputs[0], - num_fc - i - 1, "relu") && - var_before_is_fc_act_repeated_n_times(x->outputs[0], i + 1, + var_next_is_fc_act_repeated_n_times(fc_out_var, num_fc - i - 1, + "relu") && + var_before_is_fc_act_repeated_n_times(fc_out_var, i + 1, "relu"); }, - name_scope + "/act_op_" + std::to_string(i)); - - fc_ops[i] - ->LinksFrom({fc_input_var[i], fc_weight_var[i], fc_bias_var[i]}) - .LinksTo({fc_output_var[i]}); - relu_ops[i]->LinksFrom({fc_output_var[i]}); - } + name_scope + "/fc_op_" + std::to_string(i)); - auto* last_out_var = pattern->NewNode( - [=](Node* x) { - return var_before_is_fc_act_repeated_n_times(x, num_fc, "relu"); - }, - name_scope + "/act_out"); - for (int i = 0; i < num_fc - 1; ++i) { - relu_ops[i]->LinksTo({fc_input_var[i + 1]}); + if (i == 0) { + fc_ops[i] + ->LinksFrom({fc_input_var_0, fc_weight_var[i], fc_bias_var[i]}) + .LinksTo({fc_output_var[i]}); + } else { + fc_ops[i] + ->LinksFrom({fc_output_var[i - 1], fc_weight_var[i], fc_bias_var[i]}) + .LinksTo({fc_output_var[i]}); + } } - relu_ops[num_fc - 1]->LinksTo({last_out_var}); - return last_out_var; } static int BuildFusion(Graph* graph, const std::string& name_scope, @@ -304,11 +299,11 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, auto& fused_pattern = gpd.pattern(); for (int i = 0; i < num_fc; ++i) { - if (i >= 1) { - relu_vars[i - 1] = - retrieve_node(name_scope + "/fc_in_" + std::to_string(i), subgraph, + if (i < num_fc - 1) { + relu_vars[i] = + retrieve_node(name_scope + "/fc_out_" + std::to_string(i), subgraph, fused_pattern); - relu_names[i - 1] = relu_vars[i - 1]->Name(); + relu_names[i] = relu_vars[i]->Name(); } weights_vars[i] = @@ -324,7 +319,8 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, auto* input_var = retrieve_node(name_scope + "/fc_in_0", subgraph, fused_pattern); auto* last_out_var = - retrieve_node(name_scope + "/act_out", subgraph, fused_pattern); + retrieve_node(name_scope + "/fc_out_" + std::to_string(num_fc - 1), + subgraph, fused_pattern); // Create New OpDesc OpDesc op_desc; @@ -334,6 +330,7 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, op_desc.SetInput("Bias", bias_names); op_desc.SetOutput("ReluOut", relu_names); op_desc.SetOutput("Out", {last_out_var->Name()}); + auto* op = graph->CreateOpNode(&op_desc); IR_NODE_LINK_TO(input_var, op); for (size_t i = 0; i < weights_vars.size(); ++i) { @@ -367,7 +364,9 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, } void RepeatedFCReluFusePass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL(graph); FusePassBase::Init(name_scope_, graph); + int fusion_count = 0; for (int i = MAX_NUM_FC; i > 1; --i) { fusion_count += diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass_tester.cc 
b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass_tester.cc new file mode 100644 index 00000000..81d9476d --- /dev/null +++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass_tester.cc @@ -0,0 +1,71 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h" + +#include +#include "paddle/fluid/framework/ir/pass_tester_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +void TestMain(int num_fc) { + // inputs operator output + // ------------------------------------------------------------- + // (x, filters, bias_0) conv2d -> conv2d_out + // (conv2d_out, fc_weights_0, fc_bias_0) fc -> fc_out_0 + // (fc_out_0, fc_weights_1, fc_bias_1) fc -> fc_out_1 + // ... + Layers layers; + VarDesc* x = layers.data("x"); + VarDesc* filters = layers.data("filters", {}, true); + VarDesc* bias_0 = layers.data("bias_0", {}, true); + VarDesc* conv2d_out = layers.conv2d(x, filters, bias_0); + VarDesc* fc_in = conv2d_out; + for (int i = 0; i < num_fc; ++i) { + VarDesc* weights_i = + layers.data("fc_weights_" + std::to_string(i), {}, true); + VarDesc* bias_i = layers.data("fc_bias_" + std::to_string(i), {}, true); + std::string activation_type = i < (num_fc - 1) ? "relu" : ""; + VarDesc* fc_out = layers.fc(fc_in, weights_i, bias_i, 1, activation_type); + fc_in = fc_out; + } + + std::unique_ptr graph(new ir::Graph(layers.main_program())); + auto pass = PassRegistry::Instance().Get("repeated_fc_relu_fuse_pass"); + int num_nodes_before = graph->Nodes().size(); + int num_fc_nodes_before = GetNumOpNodes(graph, "fc"); + VLOG(3) << DebugString(graph); + + graph.reset(pass->Apply(graph.release())); + int num_nodes_after = graph->Nodes().size(); + int num_fused_nodes_after = GetNumOpNodes(graph, "fusion_repeated_fc_relu"); + VLOG(3) << DebugString(graph); + + // Delete (num_fc_nodes_before - 1) fc ops + PADDLE_ENFORCE_EQ(num_nodes_before - (num_fc_nodes_before - 1) + 1, + num_nodes_after); + PADDLE_ENFORCE_EQ(num_fused_nodes_after, 1); +} + +TEST(RepeatedFCReluFusePass, basic_3) { TestMain(3); } + +TEST(RepeatedFCReluFusePass, basic_9) { TestMain(9); } + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(repeated_fc_relu_fuse_pass); diff --git a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc new file mode 100644 index 00000000..8261bfc1 --- /dev/null +++ b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.cc @@ -0,0 +1,153 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#include "paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h"
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+#include "paddle/fluid/framework/lod_tensor.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+namespace {
+static PDNode* BuildCVMConcatPattern(PDPattern* pattern) {
+  auto cvm_behind_x = [](Node* x) -> bool {
+    Node* adj = x->inputs[0];
+    Node* alt = x->inputs[0]->inputs[0];
+    return x && adj && adj->IsVar() && alt->IsOp() &&
+           alt->Op()->Type() == "cvm";
+  };
+  auto* concat_op_node = pattern->NewNode("concat_op")
+                             ->assert_is_op("concat")
+                             ->assert_op_attr<int>("axis", 1)
+                             ->assert_more(cvm_behind_x);
+  return concat_op_node;
+}
+
+static void GetConcatNodes(ir::Graph* graph, std::vector<Node*>* concat_nodes) {
+  GraphPatternDetector gpd;
+  auto* pattern = gpd.mutable_pattern();
+  auto concat_op_node = BuildCVMConcatPattern(pattern);
+  GraphPatternDetector::handle_t handler = [&](
+      const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) {
+    Node* concat_op = subgraph.at(concat_op_node);
+    concat_nodes->push_back(concat_op);
+  };
+  gpd(graph, handler);
+}
+}  // anonymous namespace
+
+void SeqPoolCVMConcatFusePass::ApplyImpl(ir::Graph* graph) const {
+  FusePassBase::Init("seqpool_cvm_concat_fuse", graph);
+  std::vector<Node*> concat_nodes;
+  GetConcatNodes(graph, &concat_nodes);
+
+  int count = 0;
+  for (auto* concat_node : concat_nodes) {
+    GraphPatternDetector gpd;
+    auto* pattern = gpd.mutable_pattern();
+    auto concat_before_x = [=](Node* x) -> bool {
+      return x && x->outputs[0] == concat_node;
+    };
+    PDNode* seqpool_in_var_node =
+        pattern->NewNode("seqpool_in_var")
+            ->assert_is_only_input_of_op("sequence_pool");
+    PDNode* seqpool_op_node =
+        pattern->NewNode("seqpool_op")
+            ->assert_is_op("sequence_pool")
+            ->assert_op_attr<std::string>("pooltype", "SUM");
+    PDNode* seqpool_out_var_node =
+        pattern->NewNode("seqpool_out_var")
+            ->assert_is_op_nth_output("sequence_pool", "Out", 0)
+            ->assert_is_op_nth_input("cvm", "X", 0);
+    PDNode* seqpool_idx_out_var_node =
+        pattern->NewNode("seqpool_idx_out_var")
+            ->assert_is_op_nth_output("sequence_pool", "MaxIndex", 0);
+    PDNode* cvm_op_node =
+        pattern->NewNode("cvm_op")->assert_is_op("cvm")->assert_op_attr<bool>(
+            "use_cvm", true);
+    PDNode* cvm_out_var_node = pattern->NewNode("cvm_op_out_var")
+                                   ->assert_is_op_nth_output("cvm", "Y", 0)
+                                   ->assert_more(concat_before_x);
+    PDNode* cvm_cvm_in_var_node = pattern->NewNode("cvm_cvm_in_var")
+                                      ->assert_is_op_nth_input("cvm", "CVM", 0);
+
+    seqpool_op_node->LinksFrom({seqpool_in_var_node})
+        .LinksTo({seqpool_out_var_node, seqpool_idx_out_var_node});
+    seqpool_out_var_node->LinksFrom({seqpool_op_node}).LinksTo({cvm_op_node});
+    cvm_op_node->LinksTo({cvm_out_var_node})
+        .LinksFrom({cvm_cvm_in_var_node, seqpool_out_var_node});
+
+    std::unordered_map<std::string, Node*> ins_to_concat;
+    std::vector<Node*> subgraph_ins;
+    std::vector<std::string> subgraph_ins_name;
+    std::unordered_set<const Node*> marked_nodes;
+
+    Node* cvm_input_of_cvm;
+    Node* concat_out_var = concat_node->outputs[0];
+
+    GraphPatternDetector::handle_t handler = [&](
+        const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) {
+      Node* seqpool_in_var = subgraph.at(seqpool_in_var_node);
+      Node* seqpool_op = subgraph.at(seqpool_op_node);
+      Node* seqpool_out_var = subgraph.at(seqpool_out_var_node);
+      Node* seqpool_idx_out_var = subgraph.at(seqpool_idx_out_var_node);
+      Node* cvm_op = subgraph.at(cvm_op_node);
+      Node* cvm_out_var = subgraph.at(cvm_out_var_node);
+      cvm_input_of_cvm = subgraph.at(cvm_cvm_in_var_node);
+      marked_nodes.insert({seqpool_op, seqpool_out_var, seqpool_idx_out_var,
+                           cvm_op, cvm_out_var, concat_node});
+      ins_to_concat[cvm_out_var->Name()] = seqpool_in_var;
+    };
+    gpd(graph, handler);
+
+    if (!ins_to_concat.empty()) {
+      for (const auto* in : concat_node->inputs) {
+        subgraph_ins.push_back(ins_to_concat.at(in->Name()));
+        subgraph_ins_name.push_back(ins_to_concat.at(in->Name())->Name());
+      }
+
+      // Create the new OpDesc.
+      OpDesc op_desc;
+      op_desc.SetType("fusion_seqpool_cvm_concat");
+      op_desc.SetInput("X", subgraph_ins_name);
+      op_desc.SetInput("CVM", {cvm_input_of_cvm->Name()});
+      op_desc.SetAttr("pooltype", std::string("SUM"));
+      op_desc.SetAttr("use_cvm", true);
+      op_desc.SetAttr("axis", concat_node->Op()->GetAttr("axis"));
+      op_desc.SetOutput("Out", {concat_out_var->Name()});
+      auto* op = graph->CreateOpNode(&op_desc);
+
+      for (size_t i = 0; i < subgraph_ins.size(); ++i) {
+        IR_NODE_LINK_TO(subgraph_ins[i], op);
+      }
+      IR_NODE_LINK_TO(cvm_input_of_cvm, op);
+      IR_NODE_LINK_TO(op, concat_out_var);
+
+      GraphSafeRemoveNodes(graph, marked_nodes);
+      count++;
+    }
+  }
+  AddStatis(count);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(seqpool_cvm_concat_fuse_pass,
+              paddle::framework::ir::SeqPoolCVMConcatFusePass);
diff --git a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h
new file mode 100644
index 00000000..88a41983
--- /dev/null
+++ b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h
@@ -0,0 +1,54 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+/**
+ * Fuse SequencePool (SUM pooltype only), CVM and Concat.
+ *
+ * Before fuse:
+ *      |          |               |
+ *   seq_pool,  seq_pool,  ...  seq_pool
+ *      |          |               |
+ *     cvm        cvm             cvm
+ *       \         |      ...     /
+ *              concat
+ *                |
+ * After fuse:
+ *       \        |       /
+ *    FusionSeqPoolCVMConcat
+ *                |
+ */
+class SeqPoolCVMConcatFusePass : public FusePassBase {
+ public:
+  virtual ~SeqPoolCVMConcatFusePass() {}
+
+ protected:
+  void ApplyImpl(ir::Graph* graph) const override;
+
+  const std::string name_scope_{"seqpool_cvm_concat_fuse"};
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass_tester.cc b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass_tester.cc
new file mode 100644
index 00000000..bba640cf
--- /dev/null
+++ b/paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass_tester.cc
@@ -0,0 +1,239 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/seqpool_cvm_concat_fuse_pass.h"
+#include <gtest/gtest.h>
+#include "paddle/fluid/framework/op_proto_maker.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void SetOp(ProgramDesc* prog, const std::string& type,
+           const std::vector<std::string>& inputs,
+           const std::vector<std::string>& outputs) {
+  auto* op = prog->MutableBlock(0)->AppendOp();
+  op->SetType(type);
+  if (type == "sequence_pool") {
+    op->SetInput("X", {inputs[0]});
+    std::string pooltype = "SUM";
+    op->SetAttr("pooltype", pooltype);
+    op->SetOutput("MaxIndex", {outputs[0]});
+    op->SetOutput("Out", {outputs[1]});
+  } else if (type == "concat") {
+    op->SetInput("X", inputs);
+    op->SetAttr("axis", 1);
+    op->SetOutput("Out", {outputs[0]});
+  } else if (type == "cvm") {
+    op->SetInput("X", {inputs[0]});
+    op->SetInput("CVM", {inputs[1]});
+    op->SetOutput("Y", {outputs[0]});
+    op->SetAttr("use_cvm", true);
+  } else {
+    op->SetInput("X", inputs);
+    op->SetOutput("Out", outputs);
+  }
+  op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
+              static_cast<int>(OpRole::kForward));
+}
+
+int CountOpType(const ir::Graph* graph,
+                const std::string& op_type = "fusion_seqpool_cvm_concat") {
+  int count = 0;
+  for (auto* node : graph->Nodes()) {
+    if (node->IsOp() && node->Op()->Type() == op_type) {
+      ++count;
+    }
+  }
+  return count;
+}
+
+std::unique_ptr<ir::Graph> GetNumNodesOfBeforeAfter(
+    std::unique_ptr<ir::Graph> graph, int* before, int* after,
+    const std::string& pass_type = "seqpool_cvm_concat_fuse_pass") {
+  auto pass = PassRegistry::Instance().Get(pass_type);
+  *before = graph->Nodes().size();
+  graph.reset(pass->Apply(graph.release()));
+  *after = graph->Nodes().size();
+  return graph;
+}
+
+/*
+ * Before fuse:
+ *
+ *    a         b         c
+ *    |         |         |
+ *   op1       op2       op3
+ *   / \       / \       / \
+ *  d  e  n   f  g  n   h  i  n
+ *     |  /      |  /      |  /
+ *    op4       op5       op6
+ *     |         |         |
+ *     j         k         l
+ *      \        |        /
+ *            concat
+ *              |
+ *              m
+ *
+ * Type of op1, op2 and op3 are sequence_pool, with "SUM" pooltype attr.
+ * Type of op4, op5 and op6 are cvm, with use_cvm is true.
+ *
+ * After fuse:
+ *    a    b    c    n
+ *     \   |    |   /
+ * fusion_seqpool_cvm_concat
+ *            |
+ *            m
+ */
+TEST(SeqPoolCVMConcatFusePass, basic) {
+  ProgramDesc prog;
+  for (auto& v :
+       std::vector<std::string>({"a", "b", "c", "d", "e", "f", "g", "h", "i",
+                                 "j", "k", "l", "m", "n"})) {
+    auto* var = prog.MutableBlock(0)->Var(v);
+    var->SetType(proto::VarType::LOD_TENSOR);
+  }
+
+  SetOp(&prog, "sequence_pool", std::vector<std::string>({"a"}),
+        std::vector<std::string>({"d", "e"}));
+  SetOp(&prog, "sequence_pool", std::vector<std::string>({"b"}),
+        std::vector<std::string>({"f", "g"}));
+  SetOp(&prog, "sequence_pool", std::vector<std::string>({"c"}),
+        std::vector<std::string>({"h", "i"}));
+  SetOp(&prog, "cvm", std::vector<std::string>({"e", "n"}),
+        std::vector<std::string>({"j"}));
+  SetOp(&prog, "cvm", std::vector<std::string>({"g", "n"}),
+        std::vector<std::string>({"k"}));
+  SetOp(&prog, "cvm", std::vector<std::string>({"i", "n"}),
+        std::vector<std::string>({"l"}));
+  SetOp(&prog, "concat", std::vector<std::string>({"j", "k", "l"}),
+        std::vector<std::string>({"m"}));
+
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+  int before, after;
+  graph = GetNumNodesOfBeforeAfter(std::move(graph), &before, &after);
+  // Remove 16 Nodes: op1, op2, op3, op4, op5, op6, d, e, f, g, h, i, j, k, l,
+  // concat_op
+  // Add 1 Node: fusion_seqpool_cvm_concat
+  EXPECT_EQ(after, before - 15);
+  EXPECT_EQ(CountOpType(graph.get()), 1);
+}
+
+/*
+ * Before fuse:
+ *      a           b
+ *      |         /   \
+ *     op1  k   op2  k  op3
+ *     / \  |   / \  |   |
+ *    c   d |  e   f |   g
+ *        | /      | /
+ *       op4      op5
+ *        |        |
+ *        h        i
+ *         \      /
+ *          concat
+ *            |
+ *            j
+ * Type of op1 and op2 are sequence_pool, with "SUM" pooltype attr.
+ * Type of op4 and op5 are cvm, with use_cvm is true.
+ *
+ * After fuse:
+ *     a    k    b
+ *      \   |   / \
+ * fusion_seqpool_cvm_concat   op3
+ *           |                  |
+ *           j                  g
+ */
+TEST(SeqPoolCVMConcatFusePass, advanced) {
+  ProgramDesc prog;
+  for (auto& v : std::vector<std::string>(
+           {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"})) {
+    auto* var = prog.MutableBlock(0)->Var(v);
+    var->SetType(proto::VarType::LOD_TENSOR);
+  }
+
+  SetOp(&prog, "sequence_pool", std::vector<std::string>({"a"}),
+        std::vector<std::string>({"c", "d"}));
+  SetOp(&prog, "sequence_pool", std::vector<std::string>({"b"}),
+        std::vector<std::string>({"e", "f"}));
+  SetOp(&prog, "op3", std::vector<std::string>({"b"}),
+        std::vector<std::string>({"g"}));
+  SetOp(&prog, "cvm", std::vector<std::string>({"d", "k"}),
+        std::vector<std::string>({"h"}));
+  SetOp(&prog, "cvm", std::vector<std::string>({"f", "k"}),
+        std::vector<std::string>({"i"}));
+  SetOp(&prog, "concat", std::vector<std::string>({"h", "i"}),
+        std::vector<std::string>({"j"}));
+
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+  int before, after;
+  graph = GetNumNodesOfBeforeAfter(std::move(graph), &before, &after);
+  // Remove 11 Nodes: op1, op2, op4, op5, c, d, e, f, h, i, concat_op
+  // Add 1 Node: fusion_seqpool_cvm_concat
+  EXPECT_EQ(after, before - 10);
+  EXPECT_EQ(CountOpType(graph.get()), 1);
+}
+
+ProgramDesc BuildProgramDesc(int num_inputs_of_concat) {
+  ProgramDesc prog;
+  auto new_var = [&](const std::string& name) {
+    auto* var = prog.MutableBlock(0)->Var(name);
+    var->SetType(proto::VarType::LOD_TENSOR);
+  };
+  std::vector<std::string> concat_inputs;
+  new_var("cvm_in");
+  for (int i = 0; i < num_inputs_of_concat; ++i) {
+    std::string seqpool_prefix = "seqpool_op_" + std::to_string(i);
+    new_var(seqpool_prefix + "in");
+    new_var(seqpool_prefix + "out");
+    new_var(seqpool_prefix + "out_unused");
+    SetOp(&prog, "sequence_pool",
+          std::vector<std::string>({seqpool_prefix + "in"}),
+          std::vector<std::string>(
+              {seqpool_prefix + "out_unused", seqpool_prefix + "out"}));
+
+    std::string cvm_prefix = "cvm_op_" + std::to_string(i);
+    new_var(cvm_prefix + "out");
+    SetOp(&prog, "cvm",
+          std::vector<std::string>({seqpool_prefix + "out", "cvm_in"}),
+          std::vector<std::string>({cvm_prefix + "out"}));
+
+    concat_inputs.push_back(cvm_prefix + "out");
+  }
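+  // Each iteration above adds one independent sequence_pool -> cvm chain;
+  // all chains share the single "cvm_in" variable, and every cvm output
+  // becomes one input of the concat op created below.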
SetOp(&prog, "concat", concat_inputs, + std::vector({"concat_out"})); + return prog; +} + +// test more inputs of concat +TEST(SeqPoolCVMConcatFusePass, more_inputs) { + for (int num : {1, 2, 10}) { + ProgramDesc prog = BuildProgramDesc(num); + std::unique_ptr graph(new ir::Graph(prog)); + int before, after; + graph = GetNumNodesOfBeforeAfter(std::move(graph), &before, &after); + // Remove Nodes: n * (seqpool_op, seqpool_out, out_unused, cvm_op, cvm_out), + // and concat_op + // Add Node: fusion_seqpool_cvm_concat op + EXPECT_EQ(after, before - num * 5); + EXPECT_EQ(CountOpType(graph.get()), 1); + } +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(seqpool_cvm_concat_fuse_pass); diff --git a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc new file mode 100644 index 00000000..61784f8c --- /dev/null +++ b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.cc @@ -0,0 +1,202 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h" + +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass_tester_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * This pass is to simplify the Grpah, it may contains: + * - replace comlicated op with basic op + * - remove some unnecessary op + * + * In the current implementation, it supports: + * - remove dropout_op (upscale_in_train) or + * replace dropout_op with scale_op (downgrade_in_infer) when is_test is true + */ +void SimplifyWithBasicOpsPass::ApplyImpl(Graph* graph) const { + VLOG(3) << "Simplify the Graph with basic ops."; + std::unordered_set del_node_set; + for (Node* n : graph->Nodes()) { + if (n->IsOp() && n->Op()) { + if (n->Op()->Type() == "dropout") { + SimplifyDropout(graph, n, &del_node_set); + } + } + } + + GraphSafeRemoveNodes(graph, del_node_set); +} + +bool SimplifyWithBasicOpsPass::SimplifyDropout( + Graph* graph, Node* n, + std::unordered_set* del_node_set) const { + OpDesc* dropout_op_desc = n->Op(); + bool is_test = false; + // In the model used in test_analyzer_bert, the is_test's AttrType of + // dropout_op is INT. + if (dropout_op_desc->HasAttr("is_test")) { + if (dropout_op_desc->GetAttrType("is_test") == proto::AttrType::BOOLEAN) { + is_test = boost::get(dropout_op_desc->GetAttr("is_test")); + } else if (dropout_op_desc->GetAttrType("is_test") == + proto::AttrType::INT) { + is_test = boost::get(dropout_op_desc->GetAttr("is_test")) == 0 + ? false + : true; + } + } + + if (!is_test) { + return false; + } + + Node* dropout_x = GetInputVar(n, dropout_op_desc->Input("X")[0]); + Node* dropout_out = GetOutputVar(n, dropout_op_desc->Output("Out")[0]); + + bool upscale_in_train = false; + // Once the dropout_implementation's AttrType is BOOLEAN, but now is STRING. 
+  if (dropout_op_desc->HasAttr("dropout_implementation")) {
+    if (dropout_op_desc->GetAttrType("dropout_implementation") ==
+        proto::AttrType::BOOLEAN) {
+      upscale_in_train =
+          boost::get<bool>(dropout_op_desc->GetAttr("dropout_implementation"));
+    } else if (dropout_op_desc->GetAttrType("dropout_implementation") ==
+               proto::AttrType::STRING) {
+      upscale_in_train = boost::get<std::string>(dropout_op_desc->GetAttr(
+                             "dropout_implementation")) == "upscale_in_train";
+    }
+  }
+
+  if (upscale_in_train) {
+    // dropout_op can be deleted.
+    // dropout_x -> dropout_op -> dropout_out -> next_op -> next_out
+    //                        |
+    //                       \|/
+    // dropout_x -> next_op -> next_out
+    // Check whether dropout_x is some next_op's output.
+    bool dropout_x_is_reused_as_output = false;
+    for (auto* next_op : dropout_out->outputs) {
+      for (auto* next_out : next_op->outputs) {
+        if (next_out == dropout_x ||
+            next_out->Var()->Name() == dropout_x->Var()->Name()) {
+          dropout_x_is_reused_as_output = true;
+          break;
+        }
+      }
+      if (dropout_x_is_reused_as_output) {
+        break;
+      }
+    }
+    if (dropout_x_is_reused_as_output) {
+      VarDesc new_var_desc(*dropout_x->Var());
+      new_var_desc.SetName("simplify_with_basic_ops_" + dropout_x->Name());
+      auto* new_var_node = graph->CreateVarNode(&new_var_desc);
+      for (auto* out_op : dropout_x->outputs) {
+        if (out_op != n) {
+          ReplaceInputVar(out_op, dropout_x, new_var_node);
+        }
+      }
+      for (auto* in_op : dropout_x->inputs) {
+        ReplaceOutputVar(in_op, dropout_x, new_var_node);
+      }
+      dropout_x = new_var_node;
+    }
+    for (auto* next_op : dropout_out->outputs) {
+      ReplaceInputVar(next_op, dropout_out, dropout_x);
+    }
+
+    del_node_set->insert(dropout_out);
+  } else {
+    // Use a scale_op to replace the dropout_op.
+    // dropout_x -> dropout_op -> dropout_out -> next_op -> next_out
+    //                        |
+    //                       \|/
+    // dropout_x -> scale_op -> dropout_out -> next_op -> next_out
+    float scale =
+        1.0f - boost::get<float>(dropout_op_desc->GetAttr("dropout_prob"));
+
+    framework::OpDesc new_op_desc;
+    new_op_desc.SetType("scale");
+    new_op_desc.SetInput("X", {dropout_x->Name()});
+    new_op_desc.SetOutput("Out", {dropout_out->Name()});
+    new_op_desc.SetAttr("scale", scale);
+    new_op_desc.SetAttr("bias", static_cast<float>(0));
+    new_op_desc.SetAttr("bias_after_scale", true);
+
+    auto* scale_op_node = graph->CreateOpNode(&new_op_desc);
+    IR_NODE_LINK_TO(dropout_x, scale_op_node);
+    IR_NODE_LINK_TO(scale_op_node, dropout_out);
+  }
+
+  del_node_set->insert(n);
+  return true;
+}
+
+Node* SimplifyWithBasicOpsPass::GetInputVar(Node* n,
+                                            const std::string& name) const {
+  for (auto* in : n->inputs) {
+    if (in->Name() == name) {
+      return in;
+    }
+  }
+  return nullptr;
+}
+
+Node* SimplifyWithBasicOpsPass::GetOutputVar(Node* n,
+                                             const std::string& name) const {
+  for (auto* out : n->outputs) {
+    if (out->Name() == name) {
+      return out;
+    }
+  }
+  return nullptr;
+}
+
+void SimplifyWithBasicOpsPass::ReplaceInputVar(Node* op, Node* old_var,
+                                               Node* new_var) const {
+  if (op->IsOp() && op->Op()) {
+    new_var->outputs.push_back(op);
+    for (size_t i = 0; i < op->inputs.size(); ++i) {
+      if (op->inputs[i] == old_var) {
+        op->inputs[i] = new_var;
+        op->Op()->RenameInput(old_var->Name(), new_var->Name());
+      }
+    }
+  }
+}
+
+void SimplifyWithBasicOpsPass::ReplaceOutputVar(Node* op, Node* old_var,
+                                                Node* new_var) const {
+  if (op->IsOp() && op->Op()) {
+    new_var->inputs.push_back(op);
+    for (size_t i = 0; i < op->outputs.size(); ++i) {
+      if (op->outputs[i] == old_var) {
+        op->outputs[i] = new_var;
+        op->Op()->RenameOutput(old_var->Name(), new_var->Name());
+      }
+    }
+  }
+}
+
+}  // namespace ir
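The dropout rewrite above reduces to a small piece of inference-time arithmetic. As a minimal editorial sketch (not part of the patch; the function name is ours):

    // Inference-time factor that dropout applies to its input, which
    // SimplifyDropout must preserve: with "upscale_in_train" the factor is 1,
    // so dropout_op can be deleted outright; with "downgrade_in_infer" it is
    // (1 - dropout_prob), hence the scale_op created above.
    inline float DropoutInferenceScale(const std::string& impl,
                                       float dropout_prob) {
      return impl == "upscale_in_train" ? 1.0f : 1.0f - dropout_prob;
    }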
+} // namespace framework +} // namespace paddle + +REGISTER_PASS(simplify_with_basic_ops_pass, + paddle::framework::ir::SimplifyWithBasicOpsPass); diff --git a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h new file mode 100644 index 00000000..f5185622 --- /dev/null +++ b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h @@ -0,0 +1,42 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class SimplifyWithBasicOpsPass : public Pass { + protected: + void ApplyImpl(Graph* graph) const override; + + private: + bool SimplifyDropout(Graph* graph, Node* n, + std::unordered_set* del_node_set) const; + + Node* GetInputVar(Node* n, const std::string& name) const; + Node* GetOutputVar(Node* n, const std::string& name) const; + + void ReplaceInputVar(Node* op, Node* old_var, Node* new_var) const; + void ReplaceOutputVar(Node* op, Node* old_var, Node* new_var) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/simplify_with_basic_ops_pass_tester.cc b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass_tester.cc new file mode 100644 index 00000000..7fb67df4 --- /dev/null +++ b/paddle/fluid/framework/ir/simplify_with_basic_ops_pass_tester.cc @@ -0,0 +1,77 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/ir/simplify_with_basic_ops_pass.h" + +#include +#include "paddle/fluid/framework/ir/pass_tester_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +TEST(SimplifyWithBasicOpsPass, dropout) { + for (std::string dropout_implementation : + {"downgrade_in_infer", "upscale_in_train"}) { + for (auto inplace : {false, true}) { + if (dropout_implementation == "downgrade_in_infer" && inplace == true) { + continue; + } + + LOG(INFO) << "dropout_implementation: " << dropout_implementation + << ", inplace: " << inplace; + Layers layers; + // (x, y) -> mul -> tmp_0 + // (tmp_0) -> dropout -> (tmp_1) + // (tmp_1, z) -> elementwise_add -> (tmp_2) + // or + // (tmp_1, z) -> elementwise_add -> (tmp_0) + auto* x = layers.data("x"); + auto* y = layers.data("y"); + auto* z = layers.data("z"); + auto* mul_out = layers.mul(x, y); + auto* dropout_out = layers.dropout(mul_out, 0.5f, dropout_implementation); + if (inplace) { + layers.elementwise_add(dropout_out, z, mul_out); + } else { + layers.elementwise_add(dropout_out, z); + } + + std::unique_ptr graph(new Graph(layers.main_program())); + auto pass = PassRegistry::Instance().Get("simplify_with_basic_ops_pass"); + int num_dropout_nodes_before = GetNumOpNodes(graph, "dropout"); + int num_scale_nodes_before = GetNumOpNodes(graph, "scale"); + VLOG(3) << DebugString(graph); + + graph.reset(pass->Apply(graph.release())); + int num_dropout_nodes_after = GetNumOpNodes(graph, "dropout"); + int num_scale_nodes_after = GetNumOpNodes(graph, "scale"); + VLOG(3) << DebugString(graph); + + PADDLE_ENFORCE_EQ(num_dropout_nodes_after, 0UL); + if (dropout_implementation == "downgrade_in_infer") { + PADDLE_ENFORCE_EQ(num_dropout_nodes_before, + num_scale_nodes_after - num_scale_nodes_before); + } else { + PADDLE_ENFORCE_EQ(num_scale_nodes_after - num_scale_nodes_before, 0UL); + } + } + } +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(simplify_with_basic_ops_pass); diff --git a/paddle/fluid/framework/ir/sync_batch_norm_pass.cc b/paddle/fluid/framework/ir/sync_batch_norm_pass.cc index 25207ffc..2077304b 100644 --- a/paddle/fluid/framework/ir/sync_batch_norm_pass.cc +++ b/paddle/fluid/framework/ir/sync_batch_norm_pass.cc @@ -26,7 +26,7 @@ class SyncBatchNormPass : public Pass { void ApplyImpl(ir::Graph *graph) const override { VLOG(3) << "Use synchronous batch norm"; for (const Node *n : graph->Nodes()) { - if (n->IsOp()) { + if (n->IsOp() && n->Op()) { auto *op = n->Op(); if (op->Type() == "batch_norm") { op->SetType("sync_batch_norm"); diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index 9883a194..ca820068 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -26,9 +26,6 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/recordio/scanner.h" -#include "paddle/fluid/recordio/writer.h" - namespace paddle { namespace framework { @@ -275,36 +272,6 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor, TensorFromStream(is, static_cast(tensor), dev_ctx); } -void WriteToRecordIO(recordio::Writer *writer, - const std::vector &tensor, - const platform::DeviceContext &dev_ctx) { - std::stringstream buffer; - size_t sz = tensor.size(); - buffer.write(reinterpret_cast(&sz), sizeof(uint32_t)); - for (auto &each : tensor) { - SerializeToStream(buffer, each, dev_ctx); - } - writer->Write(buffer.str()); -} - -bool ReadFromRecordIO(recordio::Scanner *scanner, - const platform::DeviceContext &dev_ctx, - std::vector *result_ptr) { - if (!scanner->HasNext()) { - return false; - } - std::istringstream sin(scanner->Next()); - uint32_t sz; - sin.read(reinterpret_cast(&sz), sizeof(uint32_t)); - auto &result = *result_ptr; - result.resize(sz); - for (uint32_t i = 0; i < sz; ++i) { - DeserializeFromStream(sin, &result[i], dev_ctx); - } - - return true; -} - std::vector LoDTensor::SplitLoDTensor( const std::vector places) const { check_memory_size(); @@ -316,6 +283,21 @@ std::vector LoDTensor::SplitLoDTensor( std::vector results; results.reserve(result_size); + // if result_size(batch_size) is 0, just return #places.size() copys of empty + // tensors. + if (result_size == 0) { + for (size_t i = 0; i < places.size(); ++i) { + LoDTensor dst; + dst.Resize(dims()); + dst.mutable_data(places[i], type()); + if (!lod().empty()) { + dst.set_lod(lod()); + } + results.emplace_back(dst); + } + return results; + } + int step_width = static_cast(batch_size / result_size); for (size_t i = 0; i < result_size; ++i) { int begin = static_cast(i * step_width); @@ -359,17 +341,28 @@ void LoDTensor::MergeLoDTensor( PADDLE_ENFORCE(!lod_tensors.empty()); framework::DDim new_dim = lod_tensors[0]->dims(); - auto new_type = lod_tensors[0]->type(); + proto::VarType::Type new_type = proto::VarType::FP32; framework::DataLayout new_layout = lod_tensors[0]->layout(); + for (auto *t : lod_tensors) { + if (t->numel() && t->IsInitialized()) { + new_dim = t->dims(); + new_type = t->type(); + new_layout = t->layout(); + break; + } + } + LoD new_lod = lod_tensors[0]->lod(); + for (size_t i = 1; i < lod_tensors.size(); ++i) { auto *t = lod_tensors[i]; - PADDLE_ENFORCE_EQ(new_type, t->type()); - PADDLE_ENFORCE_EQ(new_layout, t->layout()); - - PADDLE_ENFORCE_EQ(framework::product(new_dim) / new_dim[0], - framework::product(t->dims()) / t->dims()[0]); - new_dim[0] += t->dims()[0]; + if (t->numel() && t->IsInitialized()) { + PADDLE_ENFORCE_EQ(new_type, t->type()); + PADDLE_ENFORCE_EQ(new_layout, t->layout()); + PADDLE_ENFORCE_EQ(framework::product(new_dim) / new_dim[0], + framework::product(t->dims()) / t->dims()[0]); + new_dim[0] += t->dims()[0]; + } auto &lod = t->lod(); PADDLE_ENFORCE_EQ(new_lod.size(), lod.size()); @@ -389,6 +382,9 @@ void LoDTensor::MergeLoDTensor( int begin = 0; for (auto *src : lod_tensors) { int end = begin + src->dims()[0]; + if (end == begin) { + continue; + } auto dst = Slice(begin, end); framework::TensorCopy(*src, dst_place, &dst); begin = end; diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h index 5e20ba7c..ef487533 100644 --- a/paddle/fluid/framework/lod_tensor.h +++ b/paddle/fluid/framework/lod_tensor.h @@ -32,12 +32,6 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/place.h" namespace paddle { - -namespace recordio { -class Writer; -class Scanner; -} - namespace framework { /* @@ -216,14 +210,6 @@ void SerializeToStream(std::ostream& os, const LoDTensor& tensor, void DeserializeFromStream(std::istream& is, LoDTensor* tensor, const platform::DeviceContext& dev_ctx); -extern void WriteToRecordIO(recordio::Writer* writer, - const std::vector& tensor, - const platform::DeviceContext& dev_ctx); - -extern bool ReadFromRecordIO(recordio::Scanner* scanner, - const platform::DeviceContext& dev_ctx, - std::vector* result_ptr); - /* * Convert between length-based LoD and offset-based LoD. * The implementation of LoDTensor class use offset-based LoD. diff --git a/paddle/fluid/framework/lod_tensor_test.cc b/paddle/fluid/framework/lod_tensor_test.cc index d1554113..c93c3f26 100644 --- a/paddle/fluid/framework/lod_tensor_test.cc +++ b/paddle/fluid/framework/lod_tensor_test.cc @@ -20,9 +20,6 @@ #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/recordio/scanner.h" -#include "paddle/fluid/recordio/writer.h" - namespace paddle { namespace framework { @@ -158,6 +155,26 @@ TEST(LoD, SplitLoDTensor) { EXPECT_EQ(lods[1].lod(), lod1); } +TEST(LoD, SplitLoDTensorWithZeroBatchSize) { + LoD lod; + lod.push_back(std::vector({0})); + + platform::CPUPlace place; + LoDTensor lod_tensor; + lod_tensor.Resize({0, 5}); + lod_tensor.mutable_data(place); + lod_tensor.set_lod(lod); + + std::vector places{platform::CPUPlace(), + platform::CPUPlace()}; + LoD lod_res; + lod_res.push_back(std::vector({0})); + + auto lods = lod_tensor.SplitLoDTensor(places); + EXPECT_EQ(lods[0].lod(), lod_res); + EXPECT_EQ(lods[1].lod(), lod_res); +} + TEST(LoD, MergeLoDTensor) { LoD lod; lod.push_back(std::vector({0, 2, 4, 5, 6})); @@ -188,7 +205,15 @@ TEST(LoD, MergeLoDTensor) { dst_ptr[i] = i; } - std::vector lods{&lod_tensor0, &lod_tensor1}; + LoDTensor lod_tensor2; + LoD lod2; + lod2.push_back(std::vector({0})); + lod2.push_back(std::vector({0})); + lod_tensor2.set_lod(lod2); + lod_tensor2.Resize({0}); + dst_ptr = lod_tensor2.mutable_data(place); + + std::vector lods{&lod_tensor0, &lod_tensor1, &lod_tensor2}; LoDTensor lod_tensor; lod_tensor.MergeLoDTensor(lods, place); @@ -281,52 +306,5 @@ TEST(LoD, ConvertToOffsetBasedLoD) { EXPECT_EQ(offset_lod, expected); } -template -static void TestRecordIO() { - LoDTensor tensor; - T* tmp = tensor.mutable_data(make_ddim({4, 5}), platform::CPUPlace()); - for (int i = 0; i < 20; ++i) { - tmp[i] = static_cast(i); - } - - std::stringstream* stream = new std::stringstream(); - auto& ctx = - *platform::DeviceContextPool::Instance().Get(platform::CPUPlace()); - { - recordio::Writer writer(stream, recordio::Compressor::kSnappy); - WriteToRecordIO(&writer, {tensor, tensor}, ctx); - WriteToRecordIO(&writer, {tensor, tensor}, ctx); - writer.Flush(); - } - - auto assert_tensor_ok = [](const LoDTensor& tensor) { - for (int i = 0; i < 20; ++i) { - ASSERT_EQ(tensor.data()[i], static_cast(i)); - } - }; - - { - std::unique_ptr stream_ptr(stream); - recordio::Scanner scanner(std::move(stream_ptr)); - std::vector tensors; - ASSERT_TRUE(ReadFromRecordIO(&scanner, ctx, &tensors)); - ASSERT_EQ(tensors.size(), static_cast(2)); - assert_tensor_ok(tensors[0]); - assert_tensor_ok(tensors[1]); - ASSERT_TRUE(ReadFromRecordIO(&scanner, ctx, &tensors)); - ASSERT_EQ(tensors.size(), static_cast(2)); - assert_tensor_ok(tensors[0]); - assert_tensor_ok(tensors[1]); - } -} - -TEST(LoDTensor, RecordIO) { - TestRecordIO(); - TestRecordIO(); - 
TestRecordIO(); - TestRecordIO(); - TestRecordIO(); -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/lod_tensor_test.cu b/paddle/fluid/framework/lod_tensor_test.cu index b9950627..7d6ba984 100644 --- a/paddle/fluid/framework/lod_tensor_test.cu +++ b/paddle/fluid/framework/lod_tensor_test.cu @@ -18,7 +18,6 @@ #include "gtest/gtest.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/framework/multi_trainer.cc b/paddle/fluid/framework/multi_trainer.cc index 8cbf2efa..be25672b 100644 --- a/paddle/fluid/framework/multi_trainer.cc +++ b/paddle/fluid/framework/multi_trainer.cc @@ -24,6 +24,11 @@ namespace framework { void MultiTrainer::Initialize(const TrainerDesc& trainer_desc, Dataset* dataset) { thread_num_ = trainer_desc.thread_num(); + for (int i = 0; i < trainer_desc.downpour_param().stat_var_names_size(); + i++) { + need_merge_var_names_.push_back( + trainer_desc.downpour_param().stat_var_names(i)); + } SetDataset(dataset); // get filelist from trainer_desc here const std::vector readers = @@ -50,6 +55,7 @@ void MultiTrainer::InitTrainerEnv(const ProgramDesc& main_program, const platform::Place& place) { for (int i = 0; i < thread_num_; ++i) { workers_[i]->SetPlace(place); + workers_[i]->SetReaderPlace(place); workers_[i]->SetRootScope(root_scope_); workers_[i]->CreateDeviceResource(main_program); // Program workers_[i]->BindingDataFeedMemory(); diff --git a/paddle/fluid/framework/no_need_buffer_vars_inference.h b/paddle/fluid/framework/no_need_buffer_vars_inference.h index 2c933659..a6357561 100644 --- a/paddle/fluid/framework/no_need_buffer_vars_inference.h +++ b/paddle/fluid/framework/no_need_buffer_vars_inference.h @@ -45,15 +45,16 @@ class NoNeedBufferVarsInference { const AttributeMap &attrs_; }; -#define DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(class_type, ...) \ - class class_type : public ::paddle::framework::NoNeedBufferVarsInference { \ - public: \ - using ::paddle::framework::NoNeedBufferVarsInference:: \ - NoNeedBufferVarsInference; \ - \ - std::unordered_set operator()() const override { \ - return {__VA_ARGS__}; \ - } \ +#define DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(class_type, ...) \ + class class_type final \ + : public ::paddle::framework::NoNeedBufferVarsInference { \ + public: \ + using ::paddle::framework::NoNeedBufferVarsInference:: \ + NoNeedBufferVarsInference; \ + \ + std::unordered_set operator()() const final { \ + return {__VA_ARGS__}; \ + } \ } } // namespace framework diff --git a/paddle/fluid/framework/op_call_stack.cc b/paddle/fluid/framework/op_call_stack.cc new file mode 100644 index 00000000..cf3b7188 --- /dev/null +++ b/paddle/fluid/framework/op_call_stack.cc @@ -0,0 +1,47 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_call_stack.h" +#include +#include +#include "paddle/fluid/framework/attribute.h" +#include "paddle/fluid/framework/op_proto_maker.h" + +namespace paddle { +namespace framework { + +void InsertCallStackInfo(const std::string &type, const AttributeMap &attrs, + platform::EnforceNotMet *exception) { + if (attrs.count("sub_block") != 0) { + return; + } + auto &callstack = boost::get>( + attrs.at(OpProtoAndCheckerMaker::OpCreationCallstackAttrName())); + + if (callstack.empty()) { + return; + } + std::ostringstream sout; + sout << "Invoke operator " << type << " error.\n"; + sout << "Python Call stacks: \n"; + for (auto &line : callstack) { + sout << line; + } + sout << "C++ Call stacks: \n"; + sout << exception->err_str_; + exception->err_str_ = sout.str(); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/revision.h b/paddle/fluid/framework/op_call_stack.h similarity index 64% rename from paddle/fluid/framework/revision.h rename to paddle/fluid/framework/op_call_stack.h index 11588ea5..4408601a 100644 --- a/paddle/fluid/framework/revision.h +++ b/paddle/fluid/framework/op_call_stack.h @@ -1,10 +1,10 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -12,12 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include - #pragma once +#include +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/platform/enforce.h" + namespace paddle { namespace framework { -const std::string GetPaddleRevision(); +void InsertCallStackInfo(const std::string &type, const AttributeMap &attrs, + platform::EnforceNotMet *exception); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/op_compatible_info.cc b/paddle/fluid/framework/op_compatible_info.cc new file mode 100644 index 00000000..bf2f85e6 --- /dev/null +++ b/paddle/fluid/framework/op_compatible_info.cc @@ -0,0 +1,104 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
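A minimal usage sketch for the compatibility map defined below (editorial illustration only; `prog_version` is an assumed name for the version string read from a saved model):

    // Check whether an op recorded by some Paddle release is safe to run here.
    OpCompatibleMap comp_map;
    comp_map.InitOpCompatibleMap();
    std::string prog_version = "1.6.0";  // assumed: taken from the model file
    if (comp_map.IsRequireMiniVersion("slice", prog_version) !=
        OpCompatibleType::compatible) {
      LOG(WARNING) << "op 'slice' from this model may behave differently";
    }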
+ +#include "paddle/fluid/framework/op_compatible_info.h" +#include +#include +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/string/string_helper.h" + +namespace paddle { +namespace framework { + +inline std::vector ConvertStr2Int(const std::string& str_text) { + auto vec_text = string::split_string(str_text, "."); + PADDLE_ENFORCE((vec_text.size() == 2 || vec_text.size() == 3), + "Input[%s] is not a right version format [1.6 or 1.6.0]", + str_text); + + std::vector vec_res; + vec_res.reserve(3); + for (auto& val : vec_text) { + vec_res.emplace_back(atoi(val.c_str())); + } + + if (vec_res.size() == 2) { + vec_res.emplace_back(0); + } + + return vec_res; +} + +/* first version >= second version return true */ + +inline bool CompareVersion(const std::string& str_first, + const std::string& str_second) { + auto vec_first_version = ConvertStr2Int(str_first); + auto vec_second_version = ConvertStr2Int(str_second); + + // first version id + PADDLE_ENFORCE_EQ( + vec_first_version.size(), vec_second_version.size(), + "version information size not equal, first is [%d] second is [%d]", + vec_first_version.size(), vec_second_version.size()); + + for (size_t i = 0; i < vec_first_version.size() - 1; ++i) { + if (vec_first_version[i] != vec_second_version[i]) { + return vec_first_version[i] > vec_second_version[i]; + } + } + return vec_first_version[2] >= vec_second_version[2]; +} + +void OpCompatibleMap::InitOpCompatibleMap() { + op_compatible_map_["sequence_pad"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; + op_compatible_map_["sequence_unpad"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; + + op_compatible_map_["reshape2"] = {"1.6.0", OpCompatibleType::DEFIN_NOT}; + op_compatible_map_["slice"] = {"1.6.0", OpCompatibleType::possible}; + op_compatible_map_["expand"] = {"1.6.0", OpCompatibleType::possible}; + + op_compatible_map_["layer_norm"] = {"1.6.0", OpCompatibleType::bug_fix}; +} + +CompatibleInfo OpCompatibleMap::GetOpCompatibleInfo(std::string op_name) { + auto it = op_compatible_map_.find(op_name); + if (it != op_compatible_map_.end()) { + return it->second; + } else { + return {default_required_version_, OpCompatibleType::DEFIN_NOT}; + } +} + +OpCompatibleType OpCompatibleMap::IsRequireMiniVersion( + std::string op_name, std::string str_current_version) { + auto it = op_compatible_map_.find(op_name); + if (it != op_compatible_map_.end()) { + if (CompareVersion(str_current_version, it->second.required_version_)) { + return OpCompatibleType::compatible; + } else { + return it->second.compatible_type_; + } + + } else { + if (CompareVersion(str_current_version, default_required_version_)) { + return OpCompatibleType::compatible; + } else { + return OpCompatibleType::DEFIN_NOT; + } + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_compatible_info.h b/paddle/fluid/framework/op_compatible_info.h new file mode 100644 index 00000000..03d47c82 --- /dev/null +++ b/paddle/fluid/framework/op_compatible_info.h @@ -0,0 +1,69 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#pragma once + +namespace paddle { +namespace framework { + +enum class OpCompatibleType { + compatible = 0, // support previous version + DEFIN_NOT = 1, // definitely can't support previous version + possible = 2, // possible can support previous version, not sure + bug_fix = 3, // bug fix, can't support previous version + precision_change = 4 // precision change, may cause difference +}; + +struct CompatibleInfo { + CompatibleInfo(std::string required_version, OpCompatibleType compatible_type) + : required_version_(required_version), + compatible_type_(compatible_type) {} + CompatibleInfo() {} + + // op required version, previous version not support + std::string required_version_; + OpCompatibleType compatible_type_; +}; + +class OpCompatibleMap { + public: + OpCompatibleMap() : default_required_version_("1.5.0") {} + void InitOpCompatibleMap(); + + CompatibleInfo GetOpCompatibleInfo(std::string op_name); + + /* IsRequireMiniVersion + * return type OpCompatibleType */ + + OpCompatibleType IsRequireMiniVersion(std::string op_name, + std::string current_version); + + void SerializeToStr(std::string& str) {} /* NOLINT */ + void UnSerialize(const std::string& str) {} + + const std::string& GetDefaultRequiredVersion() { + return default_required_version_; + } + + private: + std::map op_compatible_map_; + + std::string default_required_version_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_compatible_info_test.cc b/paddle/fluid/framework/op_compatible_info_test.cc new file mode 100644 index 00000000..2a50a830 --- /dev/null +++ b/paddle/fluid/framework/op_compatible_info_test.cc @@ -0,0 +1,58 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/op_compatible_info.h" +#include +#include "gtest/gtest.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace framework { +TEST(test_op_compatible_info, test_op_compatible) { + auto comp_map = OpCompatibleMap(); + comp_map.InitOpCompatibleMap(); + + auto default_req_version = comp_map.GetDefaultRequiredVersion(); + + auto seq_pad = comp_map.GetOpCompatibleInfo("sequence_pad"); + auto reshape = comp_map.GetOpCompatibleInfo("reshape"); + auto layer_norm = comp_map.GetOpCompatibleInfo("layer_norm"); + + auto deafult_info = comp_map.GetOpCompatibleInfo("layer_xx"); + + auto comp_1 = comp_map.IsRequireMiniVersion("sequence_pad", "1.5.0"); + ASSERT_EQ(comp_1, OpCompatibleType::DEFIN_NOT); + auto comp_2 = comp_map.IsRequireMiniVersion("sequence_pad", "1.6.0"); + ASSERT_EQ(comp_2, OpCompatibleType::compatible); + auto comp_3 = comp_map.IsRequireMiniVersion("sequence_pad", "1.6.1"); + ASSERT_EQ(comp_3, OpCompatibleType::compatible); + auto comp_6 = comp_map.IsRequireMiniVersion("sequence_pad", "1.7.0"); + ASSERT_EQ(comp_6, OpCompatibleType::compatible); + auto comp_7 = comp_map.IsRequireMiniVersion("sequence_pad", "0.7.0"); + ASSERT_EQ(comp_7, OpCompatibleType::DEFIN_NOT); + auto comp_8 = comp_map.IsRequireMiniVersion("sequence_pad", "2.0.0"); + ASSERT_EQ(comp_8, OpCompatibleType::compatible); + + ASSERT_EQ(comp_map.IsRequireMiniVersion("unkop", "2.0.0"), + OpCompatibleType::compatible); + ASSERT_EQ(comp_map.IsRequireMiniVersion("unkop", "0.7.0"), + OpCompatibleType::DEFIN_NOT); + + ASSERT_EQ(comp_map.IsRequireMiniVersion("slice", "0.7.0"), + OpCompatibleType::possible); + ASSERT_EQ(comp_map.IsRequireMiniVersion("slice", "1.6.0"), + OpCompatibleType::compatible); +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 1ea93b76..a36e3605 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -18,8 +18,10 @@ limitations under the License. 
*/ #include // NOLINT #include #include +#include #include "glog/logging.h" #include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_call_stack.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" @@ -679,26 +681,33 @@ void OpDesc::CheckAttrs() { } void OpDesc::InferShape(const BlockDesc &block) const { - VLOG(3) << "CompileTime infer shape on " << Type(); - InitInferShapeFuncs(); - auto &infer_shape = OpInfoMap::Instance().Get(this->Type()).infer_shape_; - PADDLE_ENFORCE(static_cast(infer_shape), - "%s's infer_shape has not been registered", this->Type()); - CompileTimeInferShapeContext ctx(*this, block); - if (VLOG_IS_ON(10)) { - std::ostringstream sout; - auto inames = this->InputArgumentNames(); - sout << " From ["; - std::copy(inames.begin(), inames.end(), - std::ostream_iterator(sout, ", ")); - sout << "] to ["; - auto onames = this->OutputArgumentNames(); - std::copy(onames.begin(), onames.end(), - std::ostream_iterator(sout, ", ")); - sout << "]"; - VLOG(10) << sout.str(); - } - infer_shape(&ctx); + try { + VLOG(3) << "CompileTime infer shape on " << Type(); + InitInferShapeFuncs(); + auto &infer_shape = OpInfoMap::Instance().Get(this->Type()).infer_shape_; + PADDLE_ENFORCE(static_cast(infer_shape), + "%s's infer_shape has not been registered", this->Type()); + CompileTimeInferShapeContext ctx(*this, block); + if (VLOG_IS_ON(10)) { + std::ostringstream sout; + auto inames = this->InputArgumentNames(); + sout << " From ["; + std::copy(inames.begin(), inames.end(), + std::ostream_iterator(sout, ", ")); + sout << "] to ["; + auto onames = this->OutputArgumentNames(); + std::copy(onames.begin(), onames.end(), + std::ostream_iterator(sout, ", ")); + sout << "]"; + VLOG(10) << sout.str(); + } + infer_shape(&ctx); + } catch (platform::EnforceNotMet exception) { + framework::InsertCallStackInfo(Type(), attrs_, &exception); + throw std::move(exception); + } catch (...) 
{ + std::rethrow_exception(std::current_exception()); + } } void OpDesc::InferVarType(BlockDesc *block) const { @@ -807,7 +816,7 @@ void CompileTimeInferShapeContext::SetRepeatedDims( auto var = block_.FindVarRecursive(name); PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name); std::vector> dim_vec(dims.size()); - std::transform(dims.begin(), dims.end(), dim_vec.begin(), vectorize); + std::transform(dims.begin(), dims.end(), dim_vec.begin(), vectorize<>); var->SetShapes(dim_vec); } diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index dedaf243..2f6fb9e2 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -80,6 +80,15 @@ class OpDesc { Attribute GetAttr(const std::string &name) const; + template + T GetAttrIfExists(const std::string &name) const { + T result{}; + if (HasAttr(name)) { + result = boost::get(GetAttr(name)); + } + return result; + } + const proto::OpProto::Attr &GetProtoAttr(const std::string &name) const; Attribute GetNullableAttr(const std::string &name) const; diff --git a/paddle/fluid/framework/op_info.h b/paddle/fluid/framework/op_info.h index daa72769..765ca361 100644 --- a/paddle/fluid/framework/op_info.h +++ b/paddle/fluid/framework/op_info.h @@ -52,24 +52,41 @@ struct OpInfo { } const proto::OpProto& Proto() const { - PADDLE_ENFORCE_NOT_NULL(proto_, "Operator Proto has not been registered"); + PADDLE_ENFORCE_NOT_NULL(proto_, "Operator's Proto has not been registered"); PADDLE_ENFORCE(proto_->IsInitialized(), - "Operator Proto must be initialized in op info"); + "Operator's Proto must be initialized in op info"); return *proto_; } const OpCreator& Creator() const { PADDLE_ENFORCE_NOT_NULL(creator_, - "Operator Creator has not been registered"); + "Operator's Creator has not been registered"); return creator_; } const GradOpMakerFN& GradOpMaker() const { - PADDLE_ENFORCE_NOT_NULL(grad_op_maker_, - "Operator GradOpMaker has not been registered."); + // Normally, proto_ should not be null, except some special operators, such + // as LeaklyReluDoubleGrad op. + std::string type = proto_ ? proto_->type() : "unknown"; + PADDLE_ENFORCE_NOT_NULL( + grad_op_maker_, + "Operator %s's GradOpMaker has not been " + "registered.\nPlease check whether %s_op has " + "grad_op.\nIf not, please set stop_gradient to True " + "for its input and output variables using var.stop_gradient=True.", + type.c_str(), type.c_str()); return grad_op_maker_; } + // some op has no grad_op_maker, add check before use GradOpMaker() + bool HasGradOpMaker() const { + return grad_op_maker_ != nullptr ? true : false; + } + + bool HasInferInplace() const { + return infer_inplace_ != nullptr ? true : false; + } + const OpAttrChecker* Checker() const { return checker_; } const InferNoNeedBufferVarsFN& NoNeedBufferVarsInferer() const { diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index a53a81c2..3f14f47f 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include #include +#include #include #include #include @@ -53,8 +54,9 @@ class Registrar { template struct OperatorRegistrar : public Registrar { explicit OperatorRegistrar(const char* op_type) { - PADDLE_ENFORCE(!OpInfoMap::Instance().Has(op_type), - "'%s' is registered more than once.", op_type); + if (OpInfoMap::Instance().Has(op_type)) { + PADDLE_THROW("'%s' is registered more than once.", op_type); + } static_assert(sizeof...(ARGS) != 0, "OperatorRegistrar should be invoked at least by OpClass"); OpInfo info; @@ -206,7 +208,8 @@ struct OpKernelRegistrarFunctorExIsType()) { const LoDTensor& tensor = var->Get(); - if (UNLIKELY(!tensor.IsInitialized())) { - return DDim({-1}); - } return tensor.dims(); } else if (var->IsType()) { if (get_actual_dim) { @@ -186,28 +182,9 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { } else { RunImpl(scope, place); } - VLOG(3) << place << " " << DebugStringEx(&scope); } catch (platform::EnforceNotMet exception) { - if (Attrs().count("sub_block") != 0) { - throw std::move(exception); - } - - auto& callstack = Attr>( - OpProtoAndCheckerMaker::OpCreationCallstackAttrName()); - - if (callstack.empty()) { - throw std::move(exception); - } - std::ostringstream sout; - sout << "Invoke operator " << Type() << " error.\n"; - sout << "Python Callstacks: \n"; - for (auto& line : callstack) { - sout << line; - } - sout << "C++ Callstacks: \n"; - sout << exception.err_str_; - exception.err_str_ = sout.str(); + framework::InsertCallStackInfo(Type(), Attrs(), &exception); throw std::move(exception); } catch (...) { std::rethrow_exception(std::current_exception()); @@ -671,7 +648,7 @@ class RuntimeInferShapeContext : public InferShapeContext { Variable* out_var = out_it->second.at(j); PADDLE_ENFORCE(out_var->IsType(), "The %d-th output of Output(%s) must be LoDTensor.", j, out); - auto in_tensor = in_var->Get(); + auto& in_tensor = in_var->Get(); auto* out_tensor = out_var->GetMutable(); out_tensor->set_lod(in_tensor.lod()); diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 07e7abd5..5899a14f 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -35,6 +35,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/variant.h" @@ -248,6 +249,8 @@ class ExecutionContext { return op_.Attr(name); } + bool HasAttr(const std::string& name) const { return op_.HasAttr(name); } + bool HasInput(const std::string& name) const; bool HasOutput(const std::string& name) const; @@ -339,7 +342,7 @@ class ExecutionContext { #ifdef PADDLE_WITH_CUDA const inline platform::CUDADeviceContext& cuda_device_context() const { - PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace())); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(device_context_.GetPlace()), true); return *reinterpret_cast( &device_context_); } @@ -358,9 +361,7 @@ class ExecutionContext { template Tensor AllocateTmpTensor(const framework::DDim& dim, const DevContext& dev_ctx) const { - auto tmp_allocation_ptr = platform::DeviceTemporaryAllocator::Instance() - .Get(dev_ctx) - .Allocate(product(dim) * sizeof(T)); + auto tmp_allocation_ptr = memory::Alloc(dev_ctx, product(dim) * sizeof(T)); auto& deleter = tmp_allocation_ptr.get_deleter(); auto* allocation_ptr = tmp_allocation_ptr.release(); auto shared_allocation = std::shared_ptr( @@ -462,7 +463,8 @@ class OperatorWithKernel : public OperatorBase { std::vector* GetKernelConfig(const OpKernelType& key) const; - protected: + // change this to public so that in dygraph mode we can call it to check if we + // need transform data virtual OpKernelType GetKernelTypeForVar( const std::string& var_name, const Tensor& tensor, const OpKernelType& expected_kernel_type) const; diff --git a/paddle/fluid/framework/operator_kernel_configs.h b/paddle/fluid/framework/operator_kernel_configs.h index a350b895..5c5a7423 100644 --- a/paddle/fluid/framework/operator_kernel_configs.h +++ b/paddle/fluid/framework/operator_kernel_configs.h @@ -81,6 +81,8 @@ TAlgorithm framework::AlgorithmsCache::GetAlgorithm( seed ^= hashFn(static_cast(algorithmFlags)) + 0x9e3779b9 + (seed << 6) + (seed >> 2) + 5; + VLOG(10) << "seed:" << seed << ", hash_.size:" << hash_.size(); + if (seed == 0) return gen_func(); if (hash_.find(seed) == hash_.end()) { diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index e45b5925..a12b4c87 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -32,6 +32,8 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" #include "paddle/fluid/platform/profiler.h" +DECLARE_bool(use_ngraph); + #ifdef WITH_GPERFTOOLS #include "gperftools/profiler.h" #endif @@ -82,6 +84,29 @@ class ParallelExecutorPrivate { inline bool HasGarbageCollectors() const { return !gcs_.empty(); } + /** + * NOTE(zengjinle): the feeded variables of users should not be reused, + * because users may feed them into another network. Changing the feeded + * variables that users can visit may cause calculation wrong, which is + * a very subtle bug when traning networks. However, these variables + * can be garbage collected. + * + * ParallelExecutor provides 2 methods to feed variables: + * + * - FeedTensorsIntoLocalScopes: this method would share memory of feeded + * variables, so we have to skip these. 
+   *
+   * - FeedAndSplitTensorIntoLocalScopes: this method would copy the data of
+   *                                      fed variables, so we do not need to
+   *                                      skip them.
+   */
+  inline void SetSkipMemoryReuse(size_t scope_idx, const std::string &name) {
+    auto iter = mem_opt_var_infos_[scope_idx].find(name);
+    if (iter != mem_opt_var_infos_[scope_idx].end()) {
+      iter->second->SetSkipMemoryReuse(true);
+    }
+  }
+
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
   void InitNCCLCtxs(framework::Scope *scope, const BuildStrategy &bst) {
     VLOG(1) << "nccl comm num:" << bst.nccl_comm_num_ << ", nranks:" << nranks_
@@ -233,6 +258,13 @@ class ParallelExecutorPrivate {
 };
 
 ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
+  if (FLAGS_use_ngraph) {
+    LOG_FIRST_N(WARNING, 1)
+        << "FLAGS_use_ngraph=True, memory optimization strategy is "
+           "disabled in ParallelExecutor";
+    return graph;
+  }
+
   std::vector<ir::LastLiveOpsOfVars> last_live_ops_of_vars;
 
   auto ref_cnt_pass = ir::PassRegistry::Instance().Get("reference_count_pass");
@@ -250,11 +282,42 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
     VLOG(10) << "Start to apply buffer_shared_inplace_pass";
     graph = inplace_pass->Apply(graph);
     VLOG(10) << "buffer_shared_inplace_pass Applied";
-  }
-
-  // TODO(zjl): refactor MemoryOptimizePass as well!!!
-
-  if (GetEagerDeletionThreshold() < 0) {
+    LOG(INFO) << "Inplace strategy is enabled, when "
+                 "build_strategy.enable_inplace = True";
+  }
+
+  /**
+   * NOTE(zengjinle): If BuildStrategy.memory_optimize = None in Python,
+   * set BuildStrategy.memory_optimize according to whether gc is enabled.
+   * If gc is enabled, BuildStrategy.memory_optimize = False.
+   * If gc is disabled, BuildStrategy.memory_optimize = True.
+   * This is because gc+memory_optimize is worse than gc only.
+   *
+   * As an option, users can forcibly enable BuildStrategy.memory_optimize
+   * by setting it to True, and forcibly disable it by setting it to False.
+ */ + bool is_gc_enabled = (GetEagerDeletionThreshold() >= 0); + if (!build_strategy_.memory_optimize_) { + build_strategy_.memory_optimize_ = !is_gc_enabled; + } + + if (build_strategy_.memory_optimize_.get()) { + auto cross_op_memory_reuse_pass = ir::PassRegistry::Instance().Get( + "buffer_shared_cross_op_memory_reuse_pass"); + cross_op_memory_reuse_pass->SetNotOwned(ir::kMemOptVarInfoMapList, + &mem_opt_var_infos_); + cross_op_memory_reuse_pass->SetNotOwned(ir::kLastLiveOpsOfVars, + &last_live_ops_of_vars); + cross_op_memory_reuse_pass->SetNotOwned(ir::kUseCuda, &use_cuda_); + VLOG(10) << "Start to apply buffer_shared_cross_op_memory_reuse_pass"; + graph = cross_op_memory_reuse_pass->Apply(graph); + VLOG(10) << "buffer_shared_cross_op_memory_reuse_pass Applied"; + LOG(INFO) << "Cross op memory reuse strategy is enabled, when " + "build_strategy.memory_optimize = True or garbage collection " + "strategy is disabled, which is not recommended"; + } + + if (!is_gc_enabled) { return graph; } size_t max_memory_size = static_cast(GetEagerDeletionThreshold()); @@ -302,6 +365,9 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { eager_deletion_pass->SetNotOwned(ir::kAllPlaces, &places_); graph = eager_deletion_pass->Apply(graph); VLOG(10) << "EagerDeletionPass Applied"; + LOG(INFO) << "Garbage collection strategy is enabled, when " + << "FLAGS_eager_delete_tensor_gb = " + << (static_cast(GetEagerDeletionThreshold()) / (1 << 30)); } return graph; } @@ -499,21 +565,6 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, } } - // If the loss_var_name is given, the number of graph should be only one. - if (loss_var_name.size()) { - size_t graph_num = ir::GraphNum(*graph); - if (graph_num > 1) { - LOG(WARNING) - << "The number of graph should be only one, " - "but the current graph has " - << ir::GraphNum(*graph) - << " sub_graphs. If you want to see the nodes of the " - "sub_graphs, you should use 'FLAGS_print_sub_graph_dir' " - "to specify the output dir. NOTES: if you not do training, " - "please don't pass loss_var_name."; - } - } - std::unordered_map scope_map; for (auto *scope : member_->local_scopes_) { auto &local_exec_scope = scope->NewScope(); @@ -654,8 +705,8 @@ void ParallelExecutor::BCastParamsToDevices( } } -void ParallelExecutor::Run(const std::vector &fetch_tensors, - const std::string &fetched_var_name) { +FeedFetchList ParallelExecutor::Run( + const std::vector &fetch_tensors) { VLOG(3) << "enter ParallelExecutor Run"; #ifdef WITH_GPERFTOOLS if (gProfileStarted) { @@ -670,8 +721,7 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, VLOG(3) << "ParallelExecutor begin to run member_->executor_->Run"; auto fetch_data = member_->executor_->Run(fetch_tensors); - *member_->global_scope_->Var(fetched_var_name)->GetMutable() = - fetch_data; + return fetch_data; } void ParallelExecutor::FeedTensorsIntoLocalScopes( @@ -682,6 +732,9 @@ void ParallelExecutor::FeedTensorsIntoLocalScopes( auto &map = tensors[i]; for (auto &pair : map) { bool is_persistable = member_->IsPersistable(pair.first); + if (!is_persistable) { + member_->SetSkipMemoryReuse(i, pair.first); + } auto *feed_scope = is_persistable ? 
member_->local_scopes_[i] : member_->local_exec_scopes_[i]; auto *feed_var = feed_scope->Var(pair.first); @@ -695,15 +748,19 @@ void ParallelExecutor::FeedTensorsIntoLocalScopes( void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( const std::unordered_map &tensors) { + size_t num_places = member_->places_.size(); for (auto &pair : tensors) { + bool is_persistable = member_->IsPersistable(pair.first); + VLOG(3) << "Split " << (is_persistable ? "persistable" : "non-persistable") + << " data (" << pair.first << "), dim:" << pair.second.dims() + << ", place: " << pair.second.place(); auto lod_tensors = pair.second.SplitLoDTensor(member_->places_); - if (member_->places_.size() != lod_tensors.size()) { - bool is_cpu_place = platform::is_cpu_place(member_->places_.front()); + bool is_cpu_place = platform::is_cpu_place(member_->places_.front()); + if (!is_persistable && num_places != lod_tensors.size()) { auto error_info = string::Sprintf( - "The number(%d) of samples of " - "current batch is less than the count(%d) of " - "devices(%s), currently, it is not allowed. ", - lod_tensors.size(), member_->places_.size(), + "The number(%d) of samples[%s] of the current batch is less than the " + "count(%d) of devices(%s); currently, this is not allowed. ", + lod_tensors.size(), pair.first, num_places, (is_cpu_place ? "CPU" : "GPU")); if (is_cpu_place) { error_info += @@ -711,10 +768,35 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( "to determine the number of devices you need."; } PADDLE_THROW(error_info); + } else if (is_persistable) { + if (lod_tensors.size() == 1) { + lod_tensors.reserve(num_places); + auto &tensor = lod_tensors.front(); + PADDLE_ENFORCE_EQ(tensor.dims(), pair.second.dims(), + "The dim doesn't match."); + PADDLE_ENFORCE_EQ(tensor.place(), member_->places_.at(0), + "The place doesn't match."); + for (size_t i = 1; i < num_places; ++i) { + lod_tensors.emplace_back(); + auto &tmp = lod_tensors.back(); + framework::TensorCopy(pair.second, member_->places_.at(i), &tmp); + } + } + if (lod_tensors.size() != num_places) { + auto error_info = string::Sprintf( + "The number(%d) of samples[%s] of the current batch does not match " + "the count(%d) of devices(%s). Because %s is a persistable " + "variable, you can feed just one sample; in that case, the input " + "sample will be copied into %d copies and sent to the different " + "places separately. If you need different places to hold different " + "values, you should feed %d samples.", + lod_tensors.size(), pair.first, num_places, + (is_cpu_place ? "CPU" : "GPU"), pair.first, num_places, num_places); + PADDLE_THROW(error_info); + } } - bool is_persistable = member_->IsPersistable(pair.first); - for (size_t j = 0; j < member_->places_.size(); ++j) { + for (size_t j = 0; j < num_places; ++j) { auto *feed_scope = is_persistable ? member_->local_scopes_[j] : member_->local_exec_scopes_[j]; auto *feed_var = feed_scope->Var(pair.first); @@ -780,3 +862,4 @@ bool ParallelExecutor::EnableParallelGraphExecution( USE_PASS(reference_count_pass); USE_PASS(eager_deletion_pass); USE_PASS(buffer_shared_inplace_pass); +USE_PASS(buffer_shared_cross_op_memory_reuse_pass); diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 1ac800c9..00ac5e13 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -25,6 +25,7 @@ limitations under the License.
*/ #include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" @@ -74,8 +75,7 @@ class ParallelExecutor { void FeedAndSplitTensorIntoLocalScopes( const std::unordered_map &tensors); - void Run(const std::vector &fetch_tensors, - const std::string &fetched_var_name); + FeedFetchList Run(const std::vector &fetch_tensors); private: // broadcast the parameters from the 0th device. diff --git a/paddle/fluid/framework/pipeline_trainer.cc b/paddle/fluid/framework/pipeline_trainer.cc index 916359ab..3617a8f1 100644 --- a/paddle/fluid/framework/pipeline_trainer.cc +++ b/paddle/fluid/framework/pipeline_trainer.cc @@ -101,6 +101,7 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc, this_worker->SetPipelineNum(pipeline_num_); if (i == 0) { this_worker->SetDataFeed(readers[reader_index++]); + this_worker->SetReaderPlace(place); } this_worker->SetPlace(place); this_worker->Initialize(trainer_desc); diff --git a/paddle/fluid/framework/prune.cc b/paddle/fluid/framework/prune.cc index 0afcd85f..c58cb8ad 100644 --- a/paddle/fluid/framework/prune.cc +++ b/paddle/fluid/framework/prune.cc @@ -17,19 +17,44 @@ limitations under the License. */ #include #include +#include #include #include #include +#include #include +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/program_desc.h" + namespace paddle { namespace framework { const char kFeedOpType[] = "feed"; const char kFetchOpType[] = "fetch"; -bool HasDependentVar(const proto::OpDesc& op_desc, - const std::set& dependent_vars) { +const char kRecurrent[] = "recurrent"; +const char kStates[] = "states"; +const char kExStates[] = "ex_states"; + +bool HasDependentInputVar( + const proto::OpDesc& op_desc, + const std::unordered_set& dependent_vars) { + for (auto& var : op_desc.inputs()) { + for (auto& argu : var.arguments()) { + if (dependent_vars.count(argu) != 0) { + return true; + } + } + } + return false; +} + +bool HasDependentOutputVar( + const proto::OpDesc& op_desc, + const std::unordered_set& dependent_vars) { for (auto& var : op_desc.outputs()) { for (auto& argu : var.arguments()) { if (dependent_vars.count(argu) != 0) { @@ -47,6 +72,14 @@ bool IsTarget(const proto::OpDesc& op_desc) { return false; } +bool HasTrueTarget(const proto::OpDesc& op_desc) { + return op_desc.has_is_target() && op_desc.is_target(); +} + +bool HasFalseTarget(const proto::OpDesc& op_desc) { + return op_desc.has_is_target() && !op_desc.is_target(); +} + int GetSubBlockIndex(const proto::OpDesc& op_desc) { for (auto& attr : op_desc.attrs()) { if (attr.type() == proto::AttrType::BLOCK) { @@ -61,6 +94,24 @@ bool HasSubBlock(const proto::OpDesc& op_desc) { return GetSubBlockIndex(op_desc) > 0; } +void AppendOpInputVarNames(const proto::OpDesc& op_desc, + std::unordered_set* vars_set) { + for (auto& var : op_desc.inputs()) { + for (auto& arg : var.arguments()) { + vars_set->emplace(arg); + } + } +} + +void AppendOpOutputVarNames(const proto::OpDesc& op_desc, + std::unordered_set* vars_set) { + for (auto& var : op_desc.outputs()) { + for (auto& arg : var.arguments()) { + vars_set->emplace(arg); + } + } +} + // block_id is the idx of the current block in the 
input desc // parent_block_id is the idx of the parent of the current block // in the output desc, -1 means the current block is global block @@ -68,7 +119,8 @@ bool HasSubBlock(const proto::OpDesc& op_desc) { // the child block to help pruning void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output, int block_id, int parent_block_id, - std::set* dependent_vars) { + std::unordered_set* dependent_vars, + const std::set feed_var_names) { auto& block = input.blocks(block_id); auto& ops = block.ops(); @@ -90,11 +142,13 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output, std::vector should_run; for (auto op_iter = ops.rbegin(); op_iter != ops.rend(); ++op_iter) { auto& op_desc = *op_iter; - if (IsTarget(op_desc) || HasDependentVar(op_desc, *dependent_vars)) { + if (IsTarget(op_desc) || HasDependentOutputVar(op_desc, *dependent_vars)) { // insert its input to the dependency graph for (auto& var : op_desc.inputs()) { for (auto& argu : var.arguments()) { - dependent_vars->insert(argu); + if (feed_var_names.count(argu) == 0) { + dependent_vars->insert(argu); + } } } should_run.push_back(true); @@ -123,22 +177,41 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output, auto* op = op_field->Add(); *op = input.blocks(block_id).ops(i); if (HasSubBlock(*op)) { + VLOG(2) << "Pruning op which has sub block: " << op->type(); // create sub_block_dependent_vars here to help prune the sub block - std::set sub_block_dependent_vars; + std::unordered_set sub_block_dependent_vars; for (auto& var : op->inputs()) { for (auto& argu : var.arguments()) { - sub_block_dependent_vars.insert(argu); + if (feed_var_names.count(argu) == 0) { + sub_block_dependent_vars.insert(argu); + } } } for (auto& var : op->outputs()) { for (auto& argu : var.arguments()) { - sub_block_dependent_vars.insert(argu); + if (feed_var_names.count(argu) == 0) { + sub_block_dependent_vars.insert(argu); + } + } + } + + // Recurrent op's states are also dependent vars + if (op->type() == kRecurrent) { + auto& attributes = op->attrs(); + for (auto& attr : attributes) { + if (attr.name() == kStates || attr.name() == kExStates) { + for (auto& argu : attr.strings()) { + if (feed_var_names.count(argu) == 0) { + sub_block_dependent_vars.insert(argu); + } + } + } } } // GetSubBlockIndex(*op) is the idx of the sub_block in the input desc // output_block_id is the idx of the current block in the output desc prune_impl(input, output, GetSubBlockIndex(*op), output_block_id, - &sub_block_dependent_vars); + &sub_block_dependent_vars, feed_var_names); } } } @@ -178,10 +251,142 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output, } // TODO(fengjiayi): Prune() could be inplaced to avoid unnecessary copies -void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output) { - std::set dependent_vars; +void Prune(const proto::ProgramDesc& input, + const std::set& feed_var_names, + proto::ProgramDesc* output) { + std::unordered_set dependent_vars; output->clear_blocks(); - prune_impl(input, output, 0, -1, &dependent_vars); + prune_impl(input, output, 0, -1, &dependent_vars, feed_var_names); } + +void CloneWholeBlock(proto::ProgramDesc* input, proto::ProgramDesc* output, + int block_id, int parent_block_id) { + auto* block_field = output->mutable_blocks(); + *block_field->Add() = input->blocks(block_id); + int output_block_id = output->blocks_size() - 1; + auto* output_block = output->mutable_blocks(output_block_id); + output_block->set_idx(output_block_id); + 
output_block->set_parent_idx(parent_block_id); +} + +void PruneBackwardImpl(proto::ProgramDesc* input, proto::ProgramDesc* output, + int block_id, int parent_block_id) { + // Step 1. Copy the current input block to output + CloneWholeBlock(input, output, block_id, parent_block_id); + int output_block_id = output->blocks_size() - 1; + auto* output_block = output->mutable_blocks(output_block_id); + + // Step 2. Mark forward ops on main branch + auto* ops = input->mutable_blocks(block_id)->mutable_ops(); + std::unordered_set op_input_vars; + std::unordered_set op_output_vars; + for (auto op_iter = ops->rbegin(); op_iter != ops->rend(); ++op_iter) { + auto& op_desc = *op_iter; + if (HasTrueTarget(op_desc) || + HasDependentOutputVar(op_desc, op_input_vars)) { + op_desc.set_is_target(true); + AppendOpInputVarNames(op_desc, &op_input_vars); + AppendOpOutputVarNames(op_desc, &op_output_vars); + } + } + + // Step 3. Mark backward & optimize ops on main branch + std::unordered_set gradop_input_vars; + std::unordered_set gradop_output_vars; + for (auto op_iter = ops->begin(); op_iter != ops->end(); ++op_iter) { + auto& op_desc = *op_iter; + if (HasFalseTarget(op_desc) || + HasDependentInputVar(op_desc, gradop_output_vars)) { + op_desc.set_is_target(false); + AppendOpInputVarNames(op_desc, &gradop_input_vars); + AppendOpOutputVarNames(op_desc, &gradop_output_vars); + } + } + + // Step 4. Mark ops that need to be reserved on the sub-branch + for (auto op_iter = ops->rbegin(); op_iter != ops->rend(); ++op_iter) { + auto& op_desc = *op_iter; + if (!op_desc.has_is_target()) { + if (HasDependentOutputVar(op_desc, gradop_input_vars)) { + op_desc.set_is_target(false); + AppendOpInputVarNames(op_desc, &gradop_input_vars); + } else { + op_desc.set_is_target(true); + AppendOpInputVarNames(op_desc, &op_input_vars); + AppendOpOutputVarNames(op_desc, &op_output_vars); + } + } + } + + // Step 5. Copy the forward ops to new ProgramDesc + // Note: The proto::ProgramDesc doesn't have an interface + // to remove ops and vars + auto* op_field = output_block->mutable_ops(); + op_field->Clear(); + for (auto op_iter = ops->begin(); op_iter != ops->end(); ++op_iter) { + if (IsTarget(*op_iter)) { + auto* op = op_field->Add(); + *op = *op_iter; + if (HasSubBlock(*op)) { + CloneWholeBlock(input, output, GetSubBlockIndex(*op), output_block_id); + } + } + } + + // Step 6. Copy the forward vars to new ProgramDesc + // construct a map of all vars before clearing + auto* var_field = output_block->mutable_vars(); + std::unordered_map var_map; + for (const auto& var : *var_field) { + var_map[var.name()] = var; + } + std::unordered_set var_names; + var_names.insert(op_input_vars.begin(), op_input_vars.end()); + var_names.insert(op_output_vars.begin(), op_output_vars.end()); + var_field->Clear(); + for (const auto& name : var_names) { + *var_field->Add() = var_map[name]; + } +} + +std::unique_ptr PruneBackward( + const framework::ProgramDesc& origin) { + // Copy the original ProgramDesc; origin can't be changed + framework::ProgramDesc origin_clone(origin); + + // Step 1. Update loss op's role & set loss op to be target + // The loss op's op_role is (kForward | kLoss) + // The input ProgramDesc should have a loss operator.
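+ //
+ // A usage sketch of this function (hypothetical caller, for illustration
+ // only; `train_program` is assumed to hold both forward and backward
+ // parts):
+ //
+ //   std::unique_ptr<framework::ProgramDesc> forward_only =
+ //       PruneBackward(train_program);
+ //   // forward_only now keeps just the forward ops/vars marked as targets.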
+ auto ops = origin_clone.Block(0).AllOps(); + bool has_loss_op = false; + for (auto op : ops) { + int op_role = + boost::get(op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); + if (op_role == (static_cast(OpRole::kForward) | + static_cast(OpRole::kLoss))) { + op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), + static_cast(OpRole::kForward)); + op->SetIsTarget(true); + has_loss_op = true; + } else if (op_role == (static_cast(OpRole::kBackward) | + static_cast(OpRole::kLoss))) { + op->SetIsTarget(false); + break; + } + } + PADDLE_ENFORCE_EQ(has_loss_op, true, + "The program to be pruned with its backward part " + "should have a loss operator."); + + // Step 2. Prune backward + proto::ProgramDesc pruned_desc; + pruned_desc.clear_blocks(); + PruneBackwardImpl(origin_clone.Proto(), &pruned_desc, 0, -1); + + // Step 3. Construct new framework::ProgramDesc + return std::unique_ptr( + new framework::ProgramDesc(pruned_desc)); +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/prune.h b/paddle/fluid/framework/prune.h index 1be7cd25..f710106a 100644 --- a/paddle/fluid/framework/prune.h +++ b/paddle/fluid/framework/prune.h @@ -14,13 +14,22 @@ limitations under the License. */ #pragma once +#include +#include +#include #include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { namespace framework { -void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output); +void Prune(const proto::ProgramDesc& input, + const std::set& feed_var_names, + proto::ProgramDesc* output); + +std::unique_ptr PruneBackward( + const framework::ProgramDesc& origin); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/prune_test.cc b/paddle/fluid/framework/prune_test.cc index 8af7d2d5..eb5c241a 100644 --- a/paddle/fluid/framework/prune_test.cc +++ b/paddle/fluid/framework/prune_test.cc @@ -15,7 +15,9 @@ limitations under the License.
 */ #include "paddle/fluid/framework/prune.h" #include +#include #include +#include #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/operator.h" @@ -58,13 +60,14 @@ TEST(Prune, one_operator) { f::proto::ProgramDesc *pdesc = program.Proto(); f::proto::ProgramDesc pruned; + std::set feed_var_names = {}; + f::Prune(*pdesc, feed_var_names, &pruned); + EXPECT_EQ(pruned.blocks(0).ops_size(), 0); - f::Prune(*pdesc, &pruned); - PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 0); - + feed_var_names.insert("a"); pdesc->mutable_blocks(0)->mutable_ops(0)->set_is_target(true); - f::Prune(*pdesc, &pruned); - PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 1); + f::Prune(*pdesc, feed_var_names, &pruned); + EXPECT_EQ(pruned.blocks(0).ops_size(), 1); } TEST(Prune, forward) { @@ -81,12 +84,12 @@ TEST(Prune, forward) { block); f::proto::ProgramDesc *pdesc = program.Proto(); - + std::set feed_var_names = {"a"}; for (int i = 0; i < pdesc->blocks(0).ops_size(); ++i) { f::proto::ProgramDesc pruned; pdesc->mutable_blocks(0)->mutable_ops(i)->set_is_target(true); - f::Prune(*pdesc, &pruned); - PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), i + 1); + f::Prune(*pdesc, feed_var_names, &pruned); + EXPECT_EQ(pruned.blocks(0).ops_size(), i + 1); } } @@ -107,8 +110,9 @@ TEST(Prune, multi_input_op) { pdesc->mutable_blocks(0)->mutable_ops(3)->set_is_target(true); f::proto::ProgramDesc pruned; - f::Prune(*pdesc, &pruned); - PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 4); + std::set feed_var_names = {"a0", "a1", "a2"}; + f::Prune(*pdesc, feed_var_names, &pruned); + EXPECT_EQ(pruned.blocks(0).ops_size(), 4); } TEST(Prune, multi_output_op) { @@ -126,8 +130,9 @@ TEST(Prune, multi_output_op) { pdesc->mutable_blocks(0)->mutable_ops(2)->set_is_target(true); f::proto::ProgramDesc pruned; - f::Prune(*pdesc, &pruned); - PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 2); + std::set feed_var_names = {"a"}; + f::Prune(*pdesc, feed_var_names, &pruned); + EXPECT_EQ(pruned.blocks(0).ops_size(), 2); } TEST(Prune, multi_target) { @@ -146,6 +151,37 @@ TEST(Prune, multi_target) { pdesc->mutable_blocks(0)->mutable_ops(2)->set_is_target(true); f::proto::ProgramDesc pruned; - f::Prune(*pdesc, &pruned); - PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 3); + std::set feed_var_names = {"a"}; + f::Prune(*pdesc, feed_var_names, &pruned); + EXPECT_EQ(pruned.blocks(0).ops_size(), 3); +} + +TEST(Prune, recurrent_op) { + f::ProgramDesc program; + f::BlockDesc *block = program.MutableBlock(0); + f::BlockDesc *sub_block = program.AppendBlock(*block); + AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, + f::AttributeMap{}, block); + + std::vector state_var_name(1, "y"); + AddOp("recurrent", {{"input", {"b", "c"}}}, {{"output", {"b1", "c1"}}}, + {{"ex_states", state_var_name}, + {"states", state_var_name}, + {"sub_block", sub_block}}, + block); + + EXPECT_TRUE(sub_block != nullptr); + AddOp("rnn_memory_helper", {{"input", {"x"}}}, {{"output", {"y"}}}, + f::AttributeMap{}, sub_block); + + f::proto::ProgramDesc *pdesc = program.Proto(); + pdesc->mutable_blocks(0)->mutable_ops(1)->set_is_target(true); + + f::proto::ProgramDesc pruned; + std::set feed_var_names = {"a"}; + + f::Prune(*pdesc, feed_var_names, &pruned); + EXPECT_EQ(pruned.blocks_size(), 2); + EXPECT_EQ(pruned.blocks(0).ops_size(), 2); + EXPECT_EQ(pruned.blocks(1).ops_size(), 1); } diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index 9de29632..d3e2f33d 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h
@@ -141,9 +141,12 @@ class Scope { DISABLE_COPY_AND_ASSIGN(Scope); +#ifndef PADDLE_ON_INFERENCE + private: mutable RWLock kids_lock_; mutable RWLock vars_lock_; +#endif }; // Generate some debug string about the inherience structure of scope, quite diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index 565b7d9d..7b39c535 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -40,7 +40,9 @@ void* Tensor::mutable_data(platform::Place place, proto::VarType::Type type, PADDLE_ENFORCE_GE(numel(), 0, "When calling this method, the Tensor's numel must be " "equal or larger than zero. " - "Please check Tensor::Resize has been called first."); + "Please check Tensor::dims, or Tensor::Resize has been " + "called first. The Tensor's shape is [", + dims(), "] now"); size_t size = numel() * SizeOfType(type); if (requested_size) { PADDLE_ENFORCE_GE(requested_size, size); @@ -49,6 +51,8 @@ void* Tensor::mutable_data(platform::Place place, proto::VarType::Type type, /* some versions of boost::variant don't have operator!= */ if (holder_ == nullptr || !(holder_->place() == place) || holder_->size() < size + offset_) { + // Reset holder first before re-allocate to save memory + holder_.reset(); holder_ = memory::AllocShared(place, size); offset_ = 0; } @@ -57,8 +61,8 @@ void* Tensor::mutable_data(platform::Place place, proto::VarType::Type type, } void* Tensor::mutable_data(platform::Place place, size_t requested_size) { - PADDLE_ENFORCE(this->holder_ != nullptr, - "Cannot invoke mutable data if current hold nothing."); + PADDLE_ENFORCE_NOT_NULL( + this->holder_, "Cannot invoke mutable data if current hold nothing."); return mutable_data(place, type_, requested_size); } diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index 238af9ba..f94c0c53 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -57,23 +57,28 @@ TEST(Tensor, MutableData) { // initialization p1 = src_tensor.mutable_data(framework::make_ddim({1, 2, 3}), platform::CPUPlace()); + auto p1_holder = src_tensor.Holder(); EXPECT_NE(p1, nullptr); // set src_tensor a new dim with large size // momery is supposed to be re-allocated p2 = src_tensor.mutable_data(framework::make_ddim({3, 4}), platform::CPUPlace()); EXPECT_NE(p2, nullptr); - EXPECT_NE(p1, p2); + auto p2_holder1 = src_tensor.Holder(); + EXPECT_NE(p1_holder.get(), p2_holder1.get()); // set src_tensor a new dim with same size // momery block is supposed to be unchanged p1 = src_tensor.mutable_data(framework::make_ddim({2, 2, 3}), platform::CPUPlace()); - EXPECT_EQ(p1, p2); + auto p2_holder2 = src_tensor.Holder(); + EXPECT_EQ(p2_holder1.get(), p2_holder2.get()); // set src_tensor a new dim with smaller size // momery block is supposed to be unchanged p2 = src_tensor.mutable_data(framework::make_ddim({2, 2}), platform::CPUPlace()); + auto p2_holder3 = src_tensor.Holder(); EXPECT_EQ(p1, p2); + EXPECT_EQ(p2_holder2.get(), p2_holder3.get()); float* p3 = nullptr; float* p4 = nullptr; @@ -82,14 +87,18 @@ TEST(Tensor, MutableData) { auto* tmp = src_tensor.mutable_data(framework::make_ddim({2, 2}), platform::CPUPlace()); p3 = reinterpret_cast(tmp); + auto p3_holder1 = src_tensor.Holder(); EXPECT_EQ(p1, p3); + EXPECT_EQ(p2_holder3.get(), p3_holder1.get()); // set src_tensor a different type but bigger size. // memory block is supposed to be changed. 
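+  // (Holder() exposes the tensor's underlying allocation; comparing holder
+  // pointers is more reliable than comparing raw data pointers, which the
+  // allocator may hand back at the same address after a reallocation.)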
auto* tmp2 = src_tensor.mutable_data( framework::make_ddim({2, 2, 3}), platform::CPUPlace()); + auto p3_holder2 = src_tensor.Holder(); p4 = reinterpret_cast(tmp2); EXPECT_NE(p1, p4); + EXPECT_NE(p3_holder1.get(), p3_holder2.get()); } // Not sure if it's desired, but currently, Tensor type can be changed. { @@ -113,13 +122,15 @@ TEST(Tensor, MutableData) { // initialization p1 = src_tensor.mutable_data(framework::make_ddim({1, 2, 3}), platform::CUDAPlace()); + auto p1_holder = src_tensor.Holder(); EXPECT_NE(p1, nullptr); // set src_tensor a new dim with large size // momery is supposed to be re-allocated p2 = src_tensor.mutable_data(framework::make_ddim({3, 1024}), platform::CUDAPlace()); + auto p2_holder = src_tensor.Holder(); EXPECT_NE(p2, nullptr); - EXPECT_NE(p1, p2); + EXPECT_NE(p1_holder.get(), p2_holder.get()); // set src_tensor a new dim with same size // momery block is supposed to be unchanged p1 = src_tensor.mutable_data(framework::make_ddim({2, 2, 3}), diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 33ef3b91..fb6cc1f2 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -53,7 +53,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, auto src_gpu_place = boost::get(src_place); auto dst_cpu_place = boost::get(dst_place); auto ctx_place = ctx.GetPlace(); - PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx_place), true); auto ctx_gpu_place = boost::get(ctx_place); PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place); auto stream = @@ -64,7 +64,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, auto src_cpu_place = boost::get(src_place); auto dst_gpu_place = boost::get(dst_place); auto ctx_place = ctx.GetPlace(); - PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx_place), true); auto ctx_gpu_place = boost::get(ctx_place); PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place); auto stream = @@ -75,7 +75,7 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, auto src_gpu_place = boost::get(src_place); auto dst_gpu_place = boost::get(dst_place); auto ctx_place = ctx.GetPlace(); - PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx_place), true); auto stream = reinterpret_cast(ctx).stream(); if (platform::is_same_place(src_place, dst_place)) { @@ -99,6 +99,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, PADDLE_THROW("ctx is not belong to dst_gpu_place or src_gpu_place."); } } + } else { + PADDLE_THROW("Copy from %s to %s is not supported.", src_place, dst_place); } #endif } @@ -166,6 +168,8 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, auto dst_gpu_place = boost::get(dst_place); memory::Copy(dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size, nullptr); + } else { + PADDLE_THROW("Copy from %s to %s is not supported.", src_place, dst_place); } #endif } diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index e382f920..cab72e29 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -19,7 +19,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/temporary_allocator.h" namespace paddle { namespace framework { @@ -146,7 +145,7 @@ void TensorToVector(const Tensor& src, std::vector* dst) { dst->resize(src.numel()); auto dst_ptr = static_cast(dst->data()); - PADDLE_ENFORCE(platform::is_cpu_place(src.place())); + PADDLE_ENFORCE_EQ(platform::is_cpu_place(src.place()), true); memory::Copy(dst_place, dst_ptr, boost::get(src.place()), src_ptr, size); diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc index d34f826c..7f7f426d 100644 --- a/paddle/fluid/framework/threadpool.cc +++ b/paddle/fluid/framework/threadpool.cc @@ -13,6 +13,8 @@ limitations under the License. */ #include "paddle/fluid/framework/threadpool.h" +#include +#include #include "gflags/gflags.h" #include "paddle/fluid/platform/enforce.h" @@ -20,8 +22,7 @@ DEFINE_int32(io_threadpool_size, 100, "number of threads used for doing IO, default 100"); -DEFINE_int32(dist_threadpool_size, 0, - "number of threads used for distributed executed."); +DECLARE_int32(dist_threadpool_size); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index 5fe296ff..170ceb50 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -76,6 +76,7 @@ class MultiTrainer : public TrainerBase { std::vector threads_; std::vector readers_; std::vector> workers_; + std::vector need_merge_var_names_; }; class DistMultiTrainer : public MultiTrainer { @@ -86,9 +87,23 @@ class DistMultiTrainer : public MultiTrainer { virtual void InitOtherEnv(const ProgramDesc& main_program); virtual void Run(); virtual void Finalize(); + template + void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor); + virtual void FinalizeDumpEnv(); + virtual void InitDumpEnv(); + virtual void DumpWork(); protected: std::shared_ptr pull_dense_worker_; + std::thread dump_thread_; + std::shared_ptr fp_; + std::shared_ptr> queue_; + + bool need_dump_field_; + std::string dump_fields_path_; + std::string dump_converter_; + std::vector dump_fields_; + int mpi_rank_; }; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) diff --git a/paddle/fluid/framework/trainer_desc.proto b/paddle/fluid/framework/trainer_desc.proto index 4910fb74..2724be65 100644 --- a/paddle/fluid/framework/trainer_desc.proto +++ b/paddle/fluid/framework/trainer_desc.proto @@ -33,6 +33,12 @@ message TrainerDesc { optional bool debug = 6 [ default = false ]; optional FetchConfig fetch_config = 7; optional bool use_cvm = 8 [ default = false ]; + optional bool dump_slot = 9 [ default = false ]; + optional float scale_datanorm = 10 [ default = -1 ]; + optional int32 mpi_rank = 11 [ default = -1 ]; + optional string dump_fields_path = 12; + repeated string dump_fields = 13; + optional string dump_converter = 14; // device worker parameters optional HogwildWorkerParameter hogwild_param = 101; @@ -41,6 +47,8 @@ message TrainerDesc { optional SectionWorkerParameter section_param = 104; // datafeed desc optional DataFeedDesc data_desc = 201; + // adjust ins weight + optional AdjustInsWeightConfig adjust_ins_weight_config = 301; } message HogwildWorkerParameter { repeated string skip_ops = 1; } @@ -52,6 +60,7 @@ message DownpourWorkerParameter { repeated ProgramConfig program_config = 4; optional bool push_sparse = 5 [ default = true ]; optional bool push_dense = 6 
[ default = true ]; + repeated string stat_var_names = 7; } message SectionWorkerParameter { @@ -86,6 +95,14 @@ message FetchConfig { optional Method method = 4 [ default = PRINT ]; } +message AdjustInsWeightConfig { + optional bool need_adjust = 1 [ default = false ]; + optional string nid_slot = 2 [ default = "" ]; + optional float nid_adjw_threshold = 3 [ default = 0.0 ]; + optional float nid_adjw_ratio = 4 [ default = 0.0 ]; + optional string ins_weight_slot = 5 [ default = "" ]; +} + message ProgramConfig { required string program_id = 1; repeated int32 push_sparse_table_id = 2; diff --git a/paddle/fluid/framework/transfer_scope_cache.cc b/paddle/fluid/framework/transfer_scope_cache.cc index e1326f88..2b138280 100644 --- a/paddle/fluid/framework/transfer_scope_cache.cc +++ b/paddle/fluid/framework/transfer_scope_cache.cc @@ -17,61 +17,12 @@ namespace paddle { namespace framework { -#ifdef PADDLE_WITH_MKLDNN -using transfer_data_cache_map = std::unordered_map; -using transfer_scope_cache_map = std::unordered_set; -static std::unordered_map - static_transfer_data_caches; -static std::unordered_map - static_transfer_scope_caches; -#endif - std::unordered_map& global_transfer_data_cache() { -#ifdef PADDLE_WITH_MKLDNN - size_t sid = platform::get_cur_mkldnn_session_id(); - - // if there is specific mkldnn tid setting from user. - if (sid != platform::kMKLDNNSessionID_Default) { - sid = std::hash()(std::this_thread::get_id()); - - static std::mutex acquire_barrier; - std::lock_guard block_until_finish_this_job(acquire_barrier); - - auto map_it = static_transfer_data_caches.find(sid); - if (map_it == static_transfer_data_caches.end()) { - auto* x = new transfer_data_cache_map; - static_transfer_data_caches[sid] = x; - return *x; - } else { - return *static_transfer_data_caches[sid]; - } - } -#endif thread_local auto* x = new std::unordered_map; return *x; } std::unordered_set& global_transfer_scope_cache() { -#ifdef PADDLE_WITH_MKLDNN - size_t sid = platform::get_cur_mkldnn_session_id(); - - // if there is specific mkldnn session id setting from user. 
- if (sid != platform::kMKLDNNSessionID_Default) { - sid = std::hash()(std::this_thread::get_id()); - - static std::mutex acquire_barrier; - std::lock_guard block_until_finish_this_job(acquire_barrier); - - auto map_it = static_transfer_scope_caches.find(sid); - if (map_it == static_transfer_scope_caches.end()) { - auto* x = new transfer_scope_cache_map; - static_transfer_scope_caches[sid] = x; - return *x; - } else { - return *static_transfer_scope_caches[sid]; - } - } -#endif thread_local auto* x = new std::unordered_set; return *x; } diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 73c629fd..5ba7c32d 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -1,10 +1,11 @@ cc_library(imperative_flag SRCS flags.cc DEPS gflags) -if(WITH_PYTHON) -cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind profiler imperative_flag) -cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind profiler) -cc_library(engine SRCS engine.cc) +cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform) +cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry) +cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows var_type_traits layer) +cc_library(tracer SRCS tracer.cc DEPS layer engine) +cc_library(engine SRCS engine.cc DEPS layer gradient_accumulator) cc_library(imperative_profiler SRCS profiler.cc) cc_library(nccl_context SRCS nccl_context.cc DEPS device_context) -cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context) -endif() + +add_subdirectory(tests) diff --git a/paddle/fluid/imperative/backward_strategy.h b/paddle/fluid/imperative/backward_strategy.h index 9ff07d6d..0f04d6db 100644 --- a/paddle/fluid/imperative/backward_strategy.h +++ b/paddle/fluid/imperative/backward_strategy.h @@ -16,17 +16,12 @@ // Created by Jiabin on 2019-04-25. 
// #pragma once -#ifndef PADDLE_BACKWARDSTRATEGY_H -#define PADDLE_BACKWARDSTRATEGY_H - -#endif // PADDLE_BACKWARDSTRATEGY_H namespace paddle { namespace imperative { namespace detail { -class BackwardStrategy { - public: +struct BackwardStrategy { /* DyGraph now support two kinds of backward strategy, one is sorted sum * gradient, another is sum gradient once they are created */ // TODO(jiabin): add more Strategy when we support diff --git a/paddle/fluid/imperative/engine.cc b/paddle/fluid/imperative/engine.cc index de7ab0e5..3a41bafb 100644 --- a/paddle/fluid/imperative/engine.cc +++ b/paddle/fluid/imperative/engine.cc @@ -14,40 +14,241 @@ #include "paddle/fluid/imperative/engine.h" -#include // NOLINT +#include +#include +#include +#include +#include +#include #include - -#include "glog/logging.h" +#include "paddle/fluid/imperative/gradient_accumulator.h" +#include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/imperative/tracer.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace imperative { -static std::once_flag init_engine; -static Engine* engine; +void Engine::RunOp(paddle::imperative::OpBase* op, + const paddle::imperative::NameVarBaseMap& ins, + const paddle::imperative::NameVarBaseMap& outs, + const paddle::platform::Place& place) { + platform::RecordEvent event(op->Type()); + + op->Run(ins, outs); +} + +void BasicEngine::Init(VarBase* var, const detail::BackwardStrategy& strategy) { + backward_strategy_ = strategy; + const std::vector ops = var->GradVarBase()->GradOps(); + var->ClearGradOps(); -class DummyEngine : public Engine { - public: - void Enqueue(Runnable* runnable) override { - queued_runnables_.push_back(runnable); + if (ops.empty()) { + VLOG(3) << "Skip auto grad since there is no grad op for var: " + << var->Name(); + return; + } else { + bool valid = false; + for (const auto& op : ops) { + if (op) { + valid = true; + } + } + if (!valid) { + VLOG(3) << "Skip auto grad since all grad ops of start VarBase are nullptr"; + return; + } } + init_ops_ = ops; + platform::RecordEvent record_event("Imperative Backward"); + VLOG(3) << "start backward"; - size_t Size() const override { return queued_runnables_.size(); } + PADDLE_ENFORCE_EQ(var->HasGradVar(), true, + "Grad variable does not exist for variable %s", var->Name()); - void Sync() override { - for (Runnable* l : queued_runnables_) { - LOG(INFO) << "running " << reinterpret_cast(l); + auto& fwd_var = var->Var().Get(); + auto* grad_var = + var->GradVarBase()->MutableVar()->GetMutable(); + VLOG(6) << "init loss grad:" << var->GradVarBase()->Name() + << " as stop_gradient false"; + var->GradVarBase()->InnerSetOverridedStopGradient(false); + var->GradVarBase()->SetGradGenerated(true); + auto* dev_ctx = platform::DeviceContextPool::Instance().Get(fwd_var.place()); + grad_var->Resize(fwd_var.dims()); + grad_var->mutable_data(fwd_var.place(), fwd_var.type()); + operators::math::set_constant(*dev_ctx, grad_var, 1.0); +} + +void BasicEngine::CheckBackwardInputs(OpBase* op) { + for (auto& pair : op->GetInsMap()) { + for (auto& var : pair.second) { + if (var && IsGrad(var.get())) { + // if grad var has OverridedStopGradient skip this Op + if (!var->GradGenerated()) { + VLOG(6) << "Set ungenerated Grad: " << var->Name() << " as zero"; + auto* dev_ctx = + platform::DeviceContextPool::Instance().Get(op->place()); + auto* tensor = var->MutableVar()->GetMutable(); + tensor->mutable_data(op->place(), var->DataType()); + 
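// Zero-fill the ungenerated grad so that the accumulation below
+ // treats a missing gradient as zeros rather than uninitialized memory.
+ 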
operators::math::set_constant(*dev_ctx, tensor, 0.0); + } else { + continue; + } + } } - queued_runnables_.clear(); } +} - private: - std::vector queued_runnables_; -}; +void BasicEngine::SetBackwardOutputs(paddle::imperative::OpBase* op) { + for (auto& pair : op->GetOutsMap()) { + for (auto& var : pair.second) { + if (var) { + // Set backward outputs' generate_grad to true + var->SetGradGenerated(true); + VLOG(6) << "Set backward output: " << var->Name() + << "'s SetGeneratedGrad as True"; + } + } + } +} +void BasicEngine::PrepareGradAccumulators(OpBase* op) { + for (const auto& pair : op->GetOutsMap()) { + for (const auto& var : pair.second) { + if (!var) continue; + + auto& accumulator = accumulators_[var.get()]; + if (!accumulator) { + if (backward_strategy_.sorted_sum_gradient_) { + accumulator.reset(new SortedGradientAccumulator(var.get())); + } else { + accumulator.reset(new EagerGradientAccumulator(var.get())); + } + } + + accumulator->IncreaseRefCnt(); -Engine* GetEngine() { - std::call_once(init_engine, []() { engine = new DummyEngine(); }); - return engine; + VLOG(3) << "Prepare to accumulate variable grad " << var->Name() + << " with reference count " << accumulator->RefCnt(); + } + } } +void BasicEngine::PrepareDeps() { + PADDLE_ENFORCE_EQ(op_deps_.empty(), true, "Op deps must be empty before PrepareDeps runs"); + PADDLE_ENFORCE_EQ(accumulators_.empty(), true, + "Accumulators must be empty before PrepareDeps runs"); + + std::queue q; + std::unordered_set visited; + for (const auto& init_op : init_ops_) { + q.push(init_op); + visited.insert(init_op); + } + + while (!q.empty()) { + auto* cur_op = q.front(); + q.pop(); + VLOG(3) << "Checking grads of op " << cur_op->Type(); + + CheckBackwardInputs(cur_op); + + SetBackwardOutputs(cur_op); + + PrepareGradAccumulators(cur_op); + + auto& grad_pending_ops = cur_op->GradPendingOps(); + for (auto* grad_pending_op : grad_pending_ops) { + PADDLE_ENFORCE_NOT_NULL(grad_pending_op); + ++op_deps_[grad_pending_op]; + if (visited.count(grad_pending_op) == 0) { + visited.insert(grad_pending_op); + q.push(grad_pending_op); + } + } + } +} + +void BasicEngine::SumGradient(OpBase* op, std::shared_ptr src, + VarBase* dst) { + auto iter = accumulators_.find(dst); + PADDLE_ENFORCE_EQ(iter != accumulators_.end(), true, + "Cannot find gradient of variable %s", dst->Name()); + iter->second->Add(std::move(src), op->id()); +} +void BasicEngine::Execute() { + PrepareDeps(); + // Start executing the computation graph + std::queue q; + for (const auto& init_op : init_ops_) { + q.push(init_op); + } + while (!q.empty()) { + OpBase* cur_op = q.front(); + q.pop(); + + // Step 1: Run Backward + auto& bwd_ins = cur_op->GetInsMap(); + auto& bwd_outs = cur_op->GetOutsMap(); + + NameVarBaseMap tmp_outs; + // A var may correspond to several grad vars in one op + std::unordered_map>> var_map; + size_t counter = 0; + for (auto& bwd_out : bwd_outs) { + auto& tmp_var_list = tmp_outs[bwd_out.first]; + tmp_var_list.reserve(bwd_out.second.size()); + for (auto& var : bwd_out.second) { + auto tmp_var = std::make_shared( + false, "Gtmp@" + std::to_string(counter++)); // Do not need grad + tmp_var_list.emplace_back(tmp_var); + if (var) { + var_map[var.get()].emplace_back(std::move(tmp_var)); + var->ClearGradOps(); + } + } + } + + VLOG(3) << "Start to execute grad op " << cur_op->Type(); + RunOp(cur_op, bwd_ins, tmp_outs, cur_op->place()); + // Step 2: Sum Gradient + { + platform::RecordEvent record_event("merge_grads"); + for (auto& var_pair : var_map) { + auto* dst_var = var_pair.first; + if (dst_var ==
nullptr) continue; + for (auto& src_var : var_pair.second) { + VLOG(3) << "Sum gradient of variable " << dst_var->Name() + << " after op " << cur_op->Type(); + SumGradient(cur_op, std::move(src_var), dst_var); + } + } + } + + // Step 3: Collect ready ops + for (auto* grad_pending_op : cur_op->GradPendingOps()) { + PADDLE_ENFORCE_NOT_NULL(grad_pending_op); + auto iter = op_deps_.find(grad_pending_op); + if (iter == op_deps_.end()) { + continue; + } + + VLOG(3) << "Found grad_pending op of " << cur_op->Type(); + // An op is ready to go once its dep count comes to zero + + if (--(iter->second) == 0) { + q.push(grad_pending_op); + VLOG(3) << "Push grad_pending op " << grad_pending_op->Type() + << " into queue"; + } + } + + // Step 4: Delete op to collect unused variables + VLOG(3) << "Remove op after op " << cur_op->Type() << " runs"; + RemoveOp(cur_op); + } + VLOG(3) << "Clean properties of BasicEngine"; + CleanEngine(); +} } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/engine.h b/paddle/fluid/imperative/engine.h index a1dfa5bd..a2680045 100644 --- a/paddle/fluid/imperative/engine.h +++ b/paddle/fluid/imperative/engine.h @@ -16,24 +16,92 @@ #include #include +#include +#include +#include +#include +#include +#include "paddle/fluid/imperative/backward_strategy.h" +#include "paddle/fluid/imperative/gradient_accumulator.h" +#include "paddle/fluid/imperative/layer.h" namespace paddle { namespace imperative { -struct Runnable {}; - +// There seems to be no need for Engine to be a +// singleton; we can have multiple engines to run +// multiple graphs. For future use we may expose an interface +// to Python to support this. class Engine { public: - virtual ~Engine() {} + virtual ~Engine() = default; + virtual void Execute() = 0; + virtual void Init(VarBase* var, const detail::BackwardStrategy& strategy) = 0; + virtual void RunOp(imperative::OpBase* op, const NameVarBaseMap& ins, + const NameVarBaseMap& outs, const platform::Place& place); + + virtual void RemoveOp(OpBase* op) { + PADDLE_ENFORCE_NOT_NULL(op, "Cannot remove null op"); + auto iter = grad_ops_.find(op); + PADDLE_ENFORCE_EQ(iter != grad_ops_.end(), true, "Op is not inside tracer"); + grad_ops_.erase(iter); + } + + void InsertOp(OpBase* op, std::shared_ptr op_shared) { + grad_ops_[op] = std::move(op_shared); + } + + void InsertGradVar(VarBase* grad) { grad_vars_.emplace(grad); } - virtual void Enqueue(Runnable* runnable) = 0; + bool IsGrad(VarBase* var) { return grad_vars_.count(var) > 0; } - virtual size_t Size() const = 0; + void Clear() { + grad_ops_.clear(); + grad_vars_.clear(); + } - virtual void Sync() = 0; + private: + std::unordered_map> + grad_ops_; // keeps grad ops alive until they are removed + std::unordered_set grad_vars_; }; -Engine* GetEngine(); +class BasicEngine : public Engine { + public: + BasicEngine() = default; + + void Init(VarBase* var, const detail::BackwardStrategy& strategy) override; + + ~BasicEngine() override = default; + + void Execute() override; + + private: + void PrepareDeps(); + + void CheckBackwardInputs(OpBase* op); + + void SetBackwardOutputs(OpBase* op); + + void PrepareGradAccumulators(OpBase* op); + + void SumGradient(OpBase* op, std::shared_ptr src, VarBase* dst); + + // TODO(jiabin): maybe we can optimize the performance of the engine by + // caching the result + void CleanEngine() { + init_ops_.clear(); + op_deps_.clear(); + accumulators_.clear(); + Clear(); + } + + std::vector init_ops_; + detail::BackwardStrategy backward_strategy_; + std::unordered_map op_deps_; + std::unordered_map> + 
accumulators_; +}; } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc new file mode 100644 index 00000000..509415a3 --- /dev/null +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -0,0 +1,175 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/imperative/gradient_accumulator.h" +#include +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace imperative { + +template +class TensorAddFunctor : public boost::static_visitor<> { + public: + TensorAddFunctor(int64_t numel, const T* x, T* y) + : numel_(numel), x_(x), y_(y) {} + + void operator()(const platform::CPUPlace& place) { + platform::CPUDeviceContext* ctx = dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)); + auto blas = operators::math::GetBlas(*ctx); + blas.AXPY(numel_, 1., x_, y_); + } + +#ifdef PADDLE_WITH_CUDA + void operator()(const platform::CUDAPlace& place) { + platform::CUDADeviceContext* ctx = + dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)); + auto blas = operators::math::GetBlas(*ctx); + blas.AXPY(numel_, 1., x_, y_); + } +#else + void operator()(const platform::CUDAPlace& place) { + PADDLE_THROW("Do NOT support gradient merge in place %s", place); + } +#endif + + // there is NO blas in CUDAPinnedPlace + void operator()(const platform::CUDAPinnedPlace& place) { + PADDLE_THROW("Do NOT support gradient merge in place %s", place); + } + + private: + int64_t numel_; + const T* x_; + T* y_; +}; + +void TensorAdd(const framework::Variable& src, framework::Variable* dst) { + auto* dst_tensor = dst->GetMutable(); + auto& src_tensor = src.Get(); + + auto numel = src_tensor.numel(); + + // FIXME(minqiyang): loss_grad op will pass a zero grad of label + // ugly fix for it + if (numel == 0) { + return; + } + + PADDLE_ENFORCE_EQ(dst_tensor->numel() == numel, true, + "dst_numel %d vs. 
src_numel %d", dst_tensor->numel(), + numel); + + auto data_type = src_tensor.type(); + auto place = src_tensor.place(); + +#define PADDLE_TENSOR_ADD_MACRO(cpp_type) \ + if (data_type == framework::DataTypeTrait::DataType()) { \ + TensorAddFunctor func( \ + numel, src_tensor.data(), \ + dst_tensor->mutable_data(place)); \ + boost::apply_visitor(func, place); \ + return; \ + } + + PADDLE_TENSOR_ADD_MACRO(float); + PADDLE_TENSOR_ADD_MACRO(double); + +#undef PADDLE_TENSOR_ADD_MACRO + + PADDLE_THROW("Not supported data type %s for AddTo", + framework::DataTypeToString(data_type)); +} + +void EagerGradientAccumulator::Add(std::shared_ptr var, + size_t trace_id) { + auto* dst_var = var_->MutableVar(); + auto place = var->Var().Get().place(); + if (!var_->OverridedStopGradient()) { + VLOG(3) << "Sum Gradient for: " << var_->Name(); + if (cur_cnt_ == 0) { + *dst_var = std::move(*(var->MutableVar())); + } else { + TensorAdd(var->Var(), dst_var); + } + } else { + if (!var_->Var().IsInitialized() || + !var_->Var().Get().IsInitialized()) { + VLOG(6) << "Set StopGradient Grad: " << var->Name() << " as zero"; + auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place); + auto* tensor = var_->MutableVar()->GetMutable(); + tensor->mutable_data(place, var->DataType()); + operators::math::set_constant(*dev_ctx, tensor, 0.0); + } + } + ++cur_cnt_; +} + +void SortedGradientAccumulator::Add(std::shared_ptr var, + size_t trace_id) { + auto* dst_var = var_->MutableVar(); + auto place = var->Var().Get().place(); + if (!var_->OverridedStopGradient()) { + if (ref_cnt_ == 1) { + *dst_var = std::move(*(var->MutableVar())); + } else { + if (tmp_grad_vars_.empty()) { + tmp_grad_vars_.reserve(ref_cnt_); + } + + tmp_grad_vars_.emplace_back(std::move(var), trace_id); + + if (tmp_grad_vars_.size() != ref_cnt_) { + return; + } + + std::sort(tmp_grad_vars_.begin(), tmp_grad_vars_.end(), + [](const std::pair, size_t>& p1, + const std::pair, size_t>& p2) { + return p1.second > p2.second; + }); + + *dst_var = std::move(*(tmp_grad_vars_[0].first->MutableVar())); + for (size_t i = 1; i < tmp_grad_vars_.size(); ++i) { + TensorAdd(tmp_grad_vars_[i].first->Var(), dst_var); + } + + tmp_grad_vars_.clear(); + } + } else { + if (!var_->Var().IsInitialized() || + !var_->Var().Get().IsInitialized()) { + VLOG(6) << "Set StopGradient Grad: " << var->Name() << " as zero"; + auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place); + auto* tensor = var_->MutableVar()->GetMutable(); + tensor->mutable_data(place, var->DataType()); + operators::math::set_constant(*dev_ctx, tensor, 0.0); + } + // looks like tmp_grad_vars will not have any member but just in case + tmp_grad_vars_.clear(); + } +} + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/gradient_accumulator.h b/paddle/fluid/imperative/gradient_accumulator.h new file mode 100644 index 00000000..d4980496 --- /dev/null +++ b/paddle/fluid/imperative/gradient_accumulator.h @@ -0,0 +1,63 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "paddle/fluid/imperative/layer.h" + +namespace paddle { +namespace imperative { + +class GradientAccumulator { + public: + explicit GradientAccumulator(VarBase* var) : var_(var) {} + + virtual void Add(std::shared_ptr var, size_t trace_id) = 0; + + virtual ~GradientAccumulator() = default; + + inline void IncreaseRefCnt() { ++ref_cnt_; } + + inline size_t RefCnt() const { return ref_cnt_; } + + protected: + VarBase* var_; + size_t ref_cnt_{0}; +}; + +class EagerGradientAccumulator : public GradientAccumulator { + public: + using GradientAccumulator::GradientAccumulator; + + void Add(std::shared_ptr var, size_t trace_id) override; + + private: + size_t cur_cnt_{0}; +}; + +class SortedGradientAccumulator : public GradientAccumulator { + public: + using GradientAccumulator::GradientAccumulator; + + void Add(std::shared_ptr var, size_t trace_id) override; + + private: + std::vector, size_t>> tmp_grad_vars_; +}; + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index fb22d334..8a5db26d 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,27 +13,21 @@ // limitations under the License. #include "paddle/fluid/imperative/layer.h" - #include -#include -#include -#include -#include -#include +#include #include - -#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/imperative/prepared_operator.h" +#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" -#include "paddle/fluid/string/printf.h" namespace paddle { namespace imperative { +using framework::Variable; void ThreadSafeNameSet::Insert(const std::string& name) { std::lock_guard guard(mtx_); set_.insert(name); @@ -42,7 +36,7 @@ void ThreadSafeNameSet::Insert(const std::string& name) { void ThreadSafeNameSet::Remove(const std::string& name) { std::lock_guard guard(mtx_); auto iter = set_.find(name); - PADDLE_ENFORCE(iter != set_.end(), "%s does not exist", name); + PADDLE_ENFORCE_EQ(iter != set_.end(), true, "%s does not exist", name); set_.erase(iter); } @@ -55,222 +49,161 @@ ThreadSafeNameSet VarBase::name_set_; std::vector VarBase::AliveVarNames() { return name_set_.Names(); } -using framework::Variable; - -namespace detail { - -template -class TensorAddToFunctor : public boost::static_visitor<> { - public: - TensorAddToFunctor(int64_t numel, const T* x, T* y) - : numel_(numel), x_(x), y_(y) {} - - void operator()(const platform::CPUPlace& place) { - platform::CPUDeviceContext* ctx = dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)); - auto blas = operators::math::GetBlas(*ctx); - blas.AXPY(numel_, 1., x_, y_); +static framework::VariableNameMap CreateVarNameMap( + const 
framework::OpInfo& op_info, const std::string& op_type, + const NameVarBaseMap& varbase_map, bool is_input) { + if (op_info.proto_ == nullptr) { + return {}; } -#ifdef PADDLE_WITH_CUDA - void operator()(const platform::CUDAPlace& place) { - platform::CUDADeviceContext* ctx = - dynamic_cast( - platform::DeviceContextPool::Instance().Get(place)); - auto blas = operators::math::GetBlas(*ctx); - blas.AXPY(numel_, 1., x_, y_); - } -#else - void operator()(const platform::CUDAPlace& place) { - PADDLE_THROW("Do NOT support gradient merge in place %s", place); + framework::VariableNameMap result; + + for (auto& var : + is_input ? op_info.Proto().inputs() : op_info.Proto().outputs()) { + auto it = varbase_map.find(var.name()); + if (it == varbase_map.end()) { + PADDLE_ENFORCE_EQ( + var.dispensable(), true, + "Var: %s not dispensable and there are no such var in inputs", + var.name()); + result[var.name()] = {}; + } else { + auto& var_vector = it->second; + std::vector args; + args.reserve(var_vector.size()); + for (auto& var_base : var_vector) { + args.emplace_back(var_base->Name()); + } + result[var.name()] = std::move(args); + } } -#endif + return result; +} - // there is NO blas in CUDAPinnedPlace - void operator()(const platform::CUDAPinnedPlace& place) { - PADDLE_THROW("Do NOT support gradient merge in place %s", place); +static framework::RuntimeContext PrepareRuntimeContext( + const NameVarBaseMap& ins, const NameVarBaseMap& outs) { + framework::VariableValueMap inputs, outputs; + for (auto& in_pair : ins) { + auto& in_ctx = inputs[in_pair.first]; + in_ctx.reserve(in_pair.second.size()); + for (auto& in_var : in_pair.second) { + in_ctx.emplace_back(in_var->MutableVar()); + } } - private: - int64_t numel_; - const T* x_; - T* y_; -}; - -} // namespace detail - -void AddTo(std::shared_ptr src, std::shared_ptr dst, - platform::Place place, GradientRef* grad_ref) { - PADDLE_ENFORCE(grad_ref->find(dst.get()) != grad_ref->end(), - "gradient %s are not found in grad_ref", dst->Name()); - if ((*grad_ref)[dst.get()].second) { - PADDLE_ENFORCE(src->IsInitialize(), "Using uninitialized VarBase"); - dst->var_ = std::move(src->var_); - (*grad_ref)[dst.get()].second = false; - if (!dst->IsInitialize()) { - dst->SetInitialize(true); - } - return; - } else { - framework::Tensor* dst_tensor = - dst->var_->GetMutable(); - framework::Tensor* src_tensor = - src->var_->GetMutable(); - - // FIXME(minqiyang): loss_grad op will pass a zero grad of label - // ugly fix for it - if (src_tensor->numel() == 0) { - return; + for (auto& out_pair : outs) { + auto& out_ctx = outputs[out_pair.first]; + out_ctx.reserve(out_pair.second.size()); + for (auto& out_var : out_pair.second) { + out_ctx.emplace_back(out_var->MutableVar()); } + } + return framework::RuntimeContext(std::move(inputs), std::move(outputs)); +} + +static std::string DebugString( + const std::string& name, + const std::vector>& vars) { + std::stringstream ss; + ss << name << "{"; - PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), - "dst_numel %lld vs. 
src_numel %lld", dst_tensor->numel(), - src_tensor->numel()); + for (size_t i = 0; i < vars.size(); ++i) { + if (i > 0) ss << ", "; - detail::TensorAddToFunctor func( - src_tensor->numel(), src_tensor->data(), - dst_tensor->mutable_data(place)); - boost::apply_visitor(func, place); + if (vars[i] == nullptr) { + ss << "NULL"; + continue; + } + ss << vars[i]->Name() << "["; + auto& var = vars[i]->Var(); + if (!var.IsInitialized()) { + ss << "NOT_INITED_VAR"; + } else if (var.IsType()) { + auto& tensor = var.Get(); + ss << "LoDTensor<"; + if (tensor.IsInitialized()) { + ss << framework::DataTypeToString(tensor.type()) << ", "; + ss << tensor.place() << ", "; + ss << "(" << tensor.dims() << ")"; + } else { + ss << "NOT_INITED"; + } + ss << ">"; + } else { + ss << "UNRESOLVED_TYPE"; + } + ss << "]"; } -} -void ZeroGrads(const std::shared_ptr vb, - const platform::Place& place) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto* dev_ctx = pool.Get(place); - auto grad_t = vb->var_->GetMutable(); - operators::math::set_constant(*dev_ctx, grad_t, 0.0); + ss << "}"; + return ss.str(); } -void AddGradBySort(BackwardSumMap* bck_map, - std::shared_ptr target, - GradientRef* grad_ref) { - PADDLE_ENFORCE(bck_map->find(target.get()) != bck_map->end(), - "Can't find %s in backward grad map", target->Name()); - std::pair>>>& - current = bck_map->at(target.get()); - std::sort(current.second.begin(), current.second.end(), - [](const std::pair>& a, - const std::pair>& b) { - return a.first > b.first; - }); - for (auto& var_pair : current.second) { - VLOG(10) << "add origin_grad: " << target->Name(); - VLOG(10) << "added grad: " << var_pair.second->Name() - << " trace id is: " << var_pair.first; - AddTo(var_pair.second, target, current.first, grad_ref); - var_pair.second.reset(); +std::string LayerDebugString(const std::string& op_type, + const NameVarBaseMap& ins, + const NameVarBaseMap& outs) { + std::stringstream ss; + ss << "Op(" << op_type << "): "; + + ss << "Inputs: "; + + size_t i = 0; + for (auto& pair : ins) { + if (i > 0) ss << ", "; + ss << DebugString(pair.first, pair.second); + ++i; } -} -class Autograd { - public: - Autograd() {} + ss << ", Outputs: "; + i = 0; + for (auto& pair : outs) { + if (i > 0) ss << ", "; + ss << DebugString(pair.first, pair.second); + ++i; + } + return ss.str(); +} - void RunBackward(VarBase* var, const detail::BackwardStrategy& bck_stratedy) { - if (var->IsStopGradient()) { +void VarBase::AddGradOps(const std::weak_ptr& op) { + if (op.lock() == nullptr) { + return; + } + for (const auto& cur_op : grad_ops_) { + if (cur_op.lock() == op.lock()) { return; } - VLOG(2) << "start autograd"; - BackwardSumMap bck_map; - std::deque ready; - ready.push_back(var->PreOp()); - - std::map dep_counts = - ComputeDepCounts(var->PreOp(), bck_stratedy, &grad_ref); - - while (!ready.empty()) { - OpBase* ready_op = ready.front(); - ready.pop_front(); - std::vector grads_outputs = - ready_op->ApplyGrad(&bck_map, &grad_ref, bck_stratedy); - - for (const auto& map : grads_outputs) { - for (auto it = map.rbegin(); it != map.rend(); ++it) { - const std::vector>& grad_outs = it->second; - for (size_t i = 0; i < grad_outs.size(); ++i) { - if (!grad_outs[i] || grad_outs[i]->IsStopGradient()) continue; - OpBase* pre_op = grad_outs[i]->PreOp(); - if (!pre_op) continue; - dep_counts[pre_op] -= 1; - PADDLE_ENFORCE(dep_counts[pre_op] >= 0); - bool pre_op_ready = dep_counts[pre_op] == 0; - if (pre_op_ready) { - ready.push_back(pre_op); - } - } - } - } - - 
ready_op->InvokeBackwardHooks(); - } } + grad_ops_.emplace_back(op); +} - private: - std::map ComputeDepCounts( - OpBase* op, const detail::BackwardStrategy& bck_stratedy, - GradientRef* grad_ref) { - if (bck_stratedy.sorted_sum_gradient_) { - PADDLE_ENFORCE_NOT_NULL(grad_ref, - "grad_ref should not be null when " - "using sorted grad backward strategy"); - } - std::map ret; - - std::deque queue; - queue.push_back(op); - std::unordered_set visited; - visited.insert(op); - while (!queue.empty()) { - OpBase* candidate = queue.front(); - queue.pop_front(); - for (const auto& map : candidate->grad_output_vars_) { - for (const auto& it : map) { - for (const auto& vb : it.second) { - if (bck_stratedy.sorted_sum_gradient_) { - ++(*grad_ref)[vb.get()].first; - } - // init the state of the grad_ - (*grad_ref)[vb.get()].second = true; - } - } - } - for (auto it : candidate->pre_ops_) { - for (OpBase* pre_op : it.second) { - if (!pre_op) continue; - VLOG(2) << "op dep " << candidate->Type() << " trace id " - << candidate->trace_id_ << " <---- " << it.first << " <---- " - << pre_op->Type() << " trace id " << pre_op->trace_id_; - if (visited.find(pre_op) == visited.end()) { - visited.insert(pre_op); - queue.push_back(pre_op); - } - ret[pre_op] += 1; - } - } +void VarBase::ClearGradient() { + if (grad_var_) { + auto* grad_t = grad_var_->var_.GetMutable(); + if (grad_t->IsInitialized()) { + auto* dev_ctx = + platform::DeviceContextPool::Instance().Get(grad_t->place()); + operators::math::set_constant(*dev_ctx, grad_t, 0.0); } - return ret; } +} - GradientRef grad_ref; -}; - -std::unique_ptr VarBase::NewVarBase(const platform::Place& dst_place, +std::shared_ptr VarBase::NewVarBase(const platform::Place& dst_place, const bool blocking) const { - PADDLE_ENFORCE(var_->IsInitialized(), - "Variable must be initialized when getting numpy tensor"); - - // TODO(minqiyang): change this after move unique_name generator to CXX - const framework::LoDTensor& self_tensor = var_->Get(); - std::unique_ptr new_var(new VarBase( - "Itmp", self_tensor.type(), self_tensor.dims(), dst_place, true, false)); - framework::LoDTensor* tensor = - new_var->var_->GetMutable(); - tensor->set_lod(var_->Get().lod()); - - const auto& src_tensor = var_->Get(); - framework::TensorCopy(src_tensor, dst_place, tensor); + PADDLE_ENFORCE_EQ(var_.IsInitialized() && var_.IsType(), + true, + "Variable must be initialized and type of LoDTensor when " + "getting numpy tensor"); + + auto& src_tensor = var_.Get(); + + // TODO(Jiabin): change this after move unique_name generator to CXX + auto new_var = std::make_shared( + false, "Itmp" + std::to_string(copied_counter_++)); + + auto* dst_tensor = new_var->var_.GetMutable(); + dst_tensor->set_lod(src_tensor.lod()); + + framework::TensorCopy(src_tensor, dst_place, dst_tensor); if (blocking) { platform::DeviceContextPool::Instance().Get(dst_place)->Wait(); auto src_place = src_tensor.place(); @@ -285,184 +218,66 @@ std::unique_ptr VarBase::NewVarBase(const platform::Place& dst_place, return new_var; } +// create OpBase from optype +OpBase::OpBase(size_t id, const std::string& type, const NameVarBaseMap& ins, + const NameVarBaseMap& outs, framework::AttributeMap attrs, + const platform::Place& place) + : id_(id), place_(place) { + const auto& info = framework::OpInfoMap::Instance().Get(type); + + // Step 1: Run forward + if (info.Checker() != nullptr) { + info.Checker()->Check(&attrs); + } -framework::LoDTensor& VarBase::GradValue() { - VLOG(3) << "get var grad " << Name(); - PADDLE_ENFORCE_NOT_NULL(grads_, 
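// Sketch of the intended call site for ClearGradient above (assumed, not
// from the patch): after an optimizer step the accumulated gradient is
// zeroed in place rather than freed, so the buffer is reused by the next
// backward pass.
static void SketchClearGradient(const std::shared_ptr<VarBase>& var) {
  if (var->HasGradVar()) {
    var->ClearGradient();  // set_constant(..., 0.0) on the grad LoDTensor
  }
}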
- "Could not get grad value from no grad variable"); - return *(grads_->var_->GetMutable()); + auto input_name_map = CreateVarNameMap(info, type, ins, true); + auto output_name_map = CreateVarNameMap(info, type, outs, false); + op_ = framework::OpRegistry::CreateOp(type, std::move(input_name_map), + std::move(output_name_map), + std::move(attrs)); + VLOG(3) << "Construct Op: " << type << std::endl; } -std::vector OpBase::ApplyGrad( - BackwardSumMap* bck_map, GradientRef* grad_ref, - const detail::BackwardStrategy& bck_stratedy) { - PADDLE_ENFORCE(!grad_op_descs_.empty(), "%s has no backward implementation", - Type()); - VLOG(3) << "apply op grad: " << Type(); - std::vector tmp_grad_outputs; - const size_t grad_op_count = grad_op_descs_.size(); - - tmp_grad_outputs.resize(grad_op_count); - for (size_t k = 0; k < grad_op_count; ++k) { - framework::OpDesc* grad_op_desc = grad_op_descs_[k]; - platform::RecordEvent record_event(grad_op_desc->Type()); - auto& grad_output_variable_map = grad_output_vars_[k]; - VLOG(3) << "apply grad op " << grad_op_desc->Type(); - - // Allocate tmp grad output variable - for (const auto& it : grad_output_variable_map) { - auto& outputs = tmp_grad_outputs[k][it.first]; - outputs.reserve(it.second.size()); - for (const std::shared_ptr& origin_grad_var_base : - it.second) { - // Allocate a new variable - std::shared_ptr tmp_grad_var_base(new VarBase( - string::Sprintf("%s@IGrad", origin_grad_var_base->Name()), - origin_grad_var_base->DataType(), origin_grad_var_base->Dims(), - place_, true, false)); - outputs.emplace_back(std::move(tmp_grad_var_base)); - } - } - - // No need to do compile time infer shape here. - // grad_op_desc_->InferShape(*block_); - // grad_op_desc->InferVarType(block_); - - std::unique_ptr opbase = - framework::OpRegistry::CreateOp(*grad_op_desc); - - auto& info = framework::OpInfoMap::Instance().Get(grad_op_desc->Type()); - if (info.infer_var_type_) { - RuntimeInferVarTypeContext infer_var_type_ctx( - &grad_input_vars_[k], &tmp_grad_outputs[k], &(opbase->Attrs())); - info.infer_var_type_(&infer_var_type_ctx); - } - - framework::OperatorWithKernel* op_kernel = - dynamic_cast(opbase.get()); - PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); - - // Run grad op - framework::VariableValueMap grad_invars_map; - framework::VariableValueMap grad_outvars_map; - - for (const auto& it : grad_input_vars_[k]) { - auto& grad_invars = grad_invars_map[it.first]; - grad_invars.reserve(it.second.size()); - for (const std::shared_ptr& grad_inp : it.second) { - PADDLE_ENFORCE_NOT_NULL(grad_inp->var_, "op %s input %s nullptr", - grad_op_desc->Type(), grad_inp->Name()); - if (!grad_inp->IsInitialize()) { - grad_inp->InitBuffer(); - ZeroGrads(grad_inp, place_); - } - const std::shared_ptr& const_grad_inp = grad_inp; - grad_invars.emplace_back(const_grad_inp->var_.get()); - } - } - - for (const auto& it : tmp_grad_outputs[k]) { - auto& grad_outvars = grad_outvars_map[it.first]; - grad_outvars.reserve(it.second.size()); - for (const std::shared_ptr& grad_out : it.second) { - PADDLE_ENFORCE_NOT_NULL(grad_out->var_, "op %s output %s nullptr", - grad_op_desc->Type(), grad_out->Name()); - - grad_outvars.emplace_back(grad_out->var_.get()); - } - } +// create OpBase from opdesc +OpBase::OpBase(size_t id, const framework::OpDesc& op_desc, + const platform::Place& place) + : id_(id), op_(framework::OpRegistry::CreateOp(op_desc)), place_(place) { + VLOG(3) << "Construct Op: " << op_desc.Type() << std::endl; +} - framework::RuntimeContext ctx(grad_invars_map, 
grad_outvars_map); - framework::Scope scope; - PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place_); - p.op.RuntimeInferShape(scope, place_, ctx); - p.func( - framework::ExecutionContext(p.op, scope, *p.dev_ctx, p.ctx, nullptr)); +void OpBase::Run(const NameVarBaseMap& ins, const NameVarBaseMap& outs) { + auto* op_kernel = dynamic_cast(op_.get()); + PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); + auto& info = op_->Info(); + if (info.infer_var_type_) { + RuntimeInferVarTypeContext infer_var_type_ctx(ins, &outs, op_->Attrs()); + info.infer_var_type_(&infer_var_type_ctx); } - platform::RecordEvent record_event("merge_grads"); - // Add tmp grad outputs to original grad vars - for (size_t k = 0; k < grad_output_vars_.size(); ++k) { - for (const auto& it : grad_output_vars_[k]) { - auto& outputs = tmp_grad_outputs[k][it.first]; - const auto& origin_outputs = it.second; - PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size()); - - for (size_t i = 0; i < outputs.size(); ++i) { - // track outputs used by sum - if (bck_stratedy.sorted_sum_gradient_) { - if (bck_map->find(origin_outputs[i].get()) != bck_map->end()) { - VLOG(10) << "add sub grad to " << origin_outputs[i]->Name(); - bck_map->at(origin_outputs[i].get()) - .second.emplace_back( - std::pair>( - this->trace_id_, std::move(outputs[i]))); - } else { - VLOG(10) << "insert new map for " << origin_outputs[i]->Name(); - std::pair>>> - tmp(place_, - {std::make_pair(this->trace_id_, std::move(outputs[i]))}); - bck_map->insert(std::make_pair(origin_outputs[i].get(), tmp)); - } - - PADDLE_ENFORCE( - grad_ref->find(origin_outputs[i].get()) != grad_ref->end(), - "Can't find %s in grad_reference count map", - origin_outputs[i]->Name()); - PADDLE_ENFORCE(grad_ref->at(origin_outputs[i].get()).first >= 1, - "Backward error when calculate grad reference"); - if (grad_ref->at(origin_outputs[i].get()).first > 1) { - VLOG(10) << "remove ref for " << origin_outputs[i]->Name(); - grad_ref->at(origin_outputs[i].get()).first--; - } else { - VLOG(10) << "Add grad for: " << origin_outputs[i]->Name(); - AddGradBySort(bck_map, origin_outputs[i], grad_ref); - grad_ref->at(origin_outputs[i].get()).first--; - } - } else { - VLOG(10) << "AddTo Called with orig_grad is: " - << origin_outputs[i]->name_ << " Grad to be added is " - << outputs[i]->name_; - AddTo(outputs[i], origin_outputs[i], place_, grad_ref); - outputs[i].reset(); - } - } + // Initialize output var type + for (auto& var_pair : outs) { + for (auto& var : var_pair.second) { + InitializeVariable(var->MutableVar(), var->Type()); } } - return grad_output_vars_; -} + VLOG(3) << "Running Op " << Type(); + VLOG(5) << LayerDebugString(Type(), ins, outs); + auto runtime_ctx = PrepareRuntimeContext(ins, outs); -void OpBase::InvokeBackwardHooks() { - VLOG(3) << "call backward hooks, hooks num: " << backward_hooks_.size(); - - // call backward hooks - for (py::object& callable : backward_hooks_) { - callable(this); - } -} + VLOG(6) << "start preparing op: " << Type(); + auto prepared_op = PreparedOp::Prepare(runtime_ctx, *op_kernel, place(), ins); -void OpBase::RegisterBackwardHooks(const py::object& callable) { - VLOG(3) << "Register backward hooks " << trace_id_; + VLOG(6) << "finish preparing op: " << Type(); + prepared_op.Run(); - // TODO(minqiyang): check the callable format - backward_hooks_.push_back(callable); + VLOG(4) << LayerDebugString(Type(), ins, outs); } -void VarBase::RunBackward(const detail::BackwardStrategy& bck_stratedy) { - if (!pre_op_) return; - platform::RecordEvent 
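// The forward path of OpBase::Run above, in order: (1) infer_var_type_ via
// RuntimeInferVarTypeContext, (2) InitializeVariable on every output,
// (3) PrepareRuntimeContext plus PreparedOp::Prepare for kernel selection,
// (4) prepared_op.Run() to launch the kernel. A driving sketch (the Tracer
// normally owns this call):
static void SketchRunOp(const std::shared_ptr<OpBase>& op,
                        const NameVarBaseMap& ins,
                        const NameVarBaseMap& outs) {
  op->Run(ins, outs);  // throws if the op type has no registered kernel
}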
record_event("Imperative Backward"); - VLOG(3) << "start backward"; - grads_->InitBuffer(); - auto grads_t = grads_->var_->GetMutable(); - operators::math::set_constant( - *(platform::DeviceContextPool::Instance().Get( - var_->GetMutable()->place())), - grads_t, 1.0); - - Autograd().RunBackward(this, bck_stratedy); +void OpBase::ClearBackwardTrace() { + grad_pending_ops_.clear(); + ins_.clear(); + outs_.clear(); } } // namespace imperative diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 2fbedd82..4ef22c97 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,8 +13,10 @@ // limitations under the License. #pragma once - +#include +#include #include +#include #include // NOLINT #include // NOLINT #include // NOLINT @@ -22,94 +24,19 @@ #include // NOLINT #include // NOLINT #include -#include // NOLINT - -// clang-format off -#include "paddle/fluid/framework/python_headers.h" -// clang-format on - +#include #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/framework/var_type_inference.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/imperative/backward_strategy.h" -#include "paddle/fluid/imperative/type_defs.h" +#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/imperative/flags.h" +#include "paddle/fluid/imperative/type_defs.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/macros.h" namespace paddle { namespace imperative { -class VarBase; - -namespace py = ::pybind11; - -class PreparedOp { - public: - PreparedOp(const framework::OperatorBase& op, - const framework::RuntimeContext& ctx, - framework::OperatorWithKernel::OpKernelFunc func, - platform::DeviceContext* dev_ctx, - std::vector* kernel_configs) - : op(op), - ctx(ctx), - func(func), - dev_ctx(dev_ctx), - kernel_configs(kernel_configs) {} - - static PreparedOp Prepare(const framework::RuntimeContext& ctx, - const framework::OperatorWithKernel& op, - const platform::Place& place) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto* dev_ctx = pool.Get(place); - - // check if op[type] has kernel registered. 
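// Ownership sketch for ClearBackwardTrace above: OpBase keeps
// shared_ptr<VarBase> alive through ins_/outs_ while VarBase only holds
// weak_ptr<OpBase> in grad_ops_, so dropping ins_/outs_ after autograd has
// consumed the op is what actually releases the forward buffers.
static void SketchReleaseTrace(const std::shared_ptr<OpBase>& op) {
  op->ClearBackwardTrace();  // clears grad_pending_ops_, ins_ and outs_
}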
- auto& all_op_kernels = op.AllOpKernels(); - auto kernels_iter = all_op_kernels.find(op.Type()); - if (kernels_iter == all_op_kernels.end()) { - PADDLE_THROW( - "There are no kernels which are registered in the %s operator.", - op.Type()); - } - - framework::OperatorWithKernel::OpKernelMap& kernels = kernels_iter->second; - - auto expected_kernel_key = - op.GetExpectedKernelType(framework::ExecutionContext( - op, framework::Scope(), *dev_ctx, ctx, nullptr)); - VLOG(3) << "expected_kernel_key:" << expected_kernel_key; - - auto kernel_iter = kernels.find(expected_kernel_key); -#ifdef PADDLE_WITH_MKLDNN - // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set - if (kernel_iter == kernels.end() && - expected_kernel_key.library_type_ == framework::LibraryType::kMKLDNN) { - VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one"; - expected_kernel_key.library_type_ = framework::LibraryType::kPlain; - expected_kernel_key.data_layout_ = framework::DataLayout::kAnyLayout; - kernel_iter = kernels.find(expected_kernel_key); - } -#endif - if (kernel_iter == kernels.end()) { - PADDLE_THROW("op %s does not have kernel for %s", op.Type(), - KernelTypeToString(expected_kernel_key)); - } - std::vector* kernel_configs = - op.GetKernelConfig(expected_kernel_key); - return PreparedOp(op, ctx, kernel_iter->second, dev_ctx, kernel_configs); - } - - inline platform::DeviceContext* GetDeviceContext() const { return dev_ctx; } - - const framework::OperatorBase& op; - const framework::RuntimeContext& ctx; - framework::OperatorWithKernel::OpKernelFunc func; - platform::DeviceContext* dev_ctx; - std::vector* kernel_configs; -}; - class OpBase; class ThreadSafeNameSet { @@ -125,290 +52,150 @@ class ThreadSafeNameSet { mutable std::mutex mtx_; }; -/* The wrapper for Variable which holds a Variable and a VarBase of its - * gradient. This object should be managed totally by Python intepreter. - * - * Nearly all interface should be implemented in C++. 
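// Debug sketch (assumes imperative debug mode from flags.h is enabled):
// every live VarBase registers its name in the ThreadSafeNameSet above, so
// dumping the survivors after a step is a cheap way to localize leaks.
static void SketchDumpAliveVars() {
  for (const auto& name : VarBase::AliveVarNames()) {
    VLOG(0) << "still alive: " << name;
  }
}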
- */ class VarBase { + DISABLE_COPY_AND_ASSIGN(VarBase); + public: static std::vector AliveVarNames(); - - // Internal interface, create VarBase from exist variable - VarBase(const std::string& name, std::unique_ptr var, - VarBase* grad, bool stop_gradient) - : VarBase(name, var->Get().type(), - var->Get().dims(), - var->Get().place(), nullptr, grad, - stop_gradient, false, true) { - var_ = std::move(var); - } - - // Python interface - VarBase(const std::string& name, const framework::proto::VarType::Type dtype, - const std::vector& shape, const platform::Place& place, - bool stop_gradient, bool persistable) - : VarBase(name, dtype, framework::make_ddim(shape), place, stop_gradient, - persistable) {} - - // Internal interface, create VarBase from with ddim - VarBase(const std::string& name, const framework::proto::VarType::Type dtype, - const framework::DDim& shape, const platform::Place& place, - bool stop_gradient, bool persistable) - : VarBase(name, dtype, shape, place, nullptr, nullptr, stop_gradient, - persistable, true) {} - - // Grad used constructor - VarBase(const std::string& name, const framework::proto::VarType::Type dtype, - const std::vector& shape, const platform::Place& place, - bool stop_gradient, bool persistable, bool need_initialize) - : VarBase(name, dtype, framework::make_ddim(shape), place, nullptr, - nullptr, stop_gradient, persistable, need_initialize) {} - - private: - // TODO(minqiyang): need support SelectedRows - VarBase(const std::string& name, framework::proto::VarType::Type dtype, - const framework::DDim& shape, const platform::Place& place, - std::unique_ptr var, VarBase* grad, - bool stop_gradient, bool persistable, bool need_initialize) + explicit VarBase(bool has_grad, const std::string& name) : name_(name), - type_(framework::proto::VarType::LOD_TENSOR), - place_(place), - var_(std::move(var)), - grads_(grad), - dtype_(dtype), - stop_gradient_(stop_gradient), - persistable_(persistable), - pre_op_(nullptr), - pre_op_out_name_(), - pre_op_out_idx_(-1) { - if (!var_) { - var_.reset(new framework::Variable()); - } - - auto tensor = var_->GetMutable(); - tensor->Resize(shape); - if (need_initialize) { - tensor->mutable_data(place, dtype); - is_initialized_ = true; - VLOG(8) << "initialized varbase: " << name_ << " type: " << dtype - << " place: " << place; - } else { - is_initialized_ = false; - VLOG(8) << "not initialized varbase: " << name_; - } - VLOG(8) << "create varbase: " << name_ << " type: " << dtype - << " place: " << place << "Stop gradient: " << stop_gradient_; - + grad_var_(has_grad ? 
new VarBase(false, GradVarName()) : nullptr) { if (IsDebugEnabled()) { + VLOG(10) << "Construct VarBase: " << name; name_set_.Insert(name_); } } - public: - virtual ~VarBase() { - pre_op_ = nullptr; - pre_op_out_idx_ = -1; - VLOG(8) << "destruct varbase: " << name_; + explicit VarBase(const std::string& name) : VarBase(true, name) {} + + ~VarBase() { + VLOG(10) << "Destruct VarBase: " << name_; if (IsDebugEnabled()) { name_set_.Remove(name_); } } - inline void SetName(const std::string& name) { name_ = name; } - inline std::string Name() const { return name_; } - inline bool IsInitialize() const { return is_initialized_; } - inline void SetInitialize(bool inited) { is_initialized_ = inited; } - inline std::vector Shape() const { - if (var_->IsInitialized()) { - return framework::vectorize(var_->Get().dims()); - } else { - return {}; - } - } + const framework::Variable& Var() const { return var_; } - inline framework::DDim Dims() const { - return var_->Get().dims(); - } + framework::Variable* MutableVar() { return &var_; } - // data type. e.g.. FP32 - inline void SetDataType(framework::proto::VarType::Type type) { - auto tensor = var_->GetMutable(); - tensor->mutable_data(tensor->place(), type); - } - inline framework::proto::VarType::Type DataType() const { return dtype_; } + bool HasGradVar() const { return grad_var_ != nullptr; } - // tensor type. e.g.. LoDTensor - inline void SetType(framework::proto::VarType::Type type) { type_ = type; } - inline framework::proto::VarType::Type Type() const { return type_; } + const std::shared_ptr& GradVarBase() const { return grad_var_; } - inline void SetStopGradient(bool stop_gradient) { - stop_gradient_ = stop_gradient; - if (grads_) { - grads_->stop_gradient_ = stop_gradient; - } + const framework::Variable& GradVar() const { + PADDLE_ENFORCE_NOT_NULL(grad_var_, "Gradient of %s does not exist", name_); + return grad_var_->var_; } - inline bool IsStopGradient() const { return stop_gradient_; } - - inline void SetPersistable(bool persistable) { persistable_ = persistable; } - inline bool IsPersistable() const { return persistable_; } - inline void SetPreOp(OpBase* op) { pre_op_ = op; } - inline platform::Place GetPlace() { return place_; } - inline OpBase* PreOp() const { return pre_op_; } - inline int PreOpOutIdx() const { return pre_op_out_idx_; } - - void RunBackward(const detail::BackwardStrategy& bck_stratedy); - inline void ResetPreOp(OpBase* op) { - if (op == pre_op_) { - // clear pre_op info when op equals to var's pre_op - pre_op_ = nullptr; - pre_op_out_idx_ = -1; - } + framework::Variable* MutableGradVar() { + PADDLE_ENFORCE_NOT_NULL(grad_var_, "Gradient of %s does not exist", name_); + return &(grad_var_->var_); } - void InitBuffer() { - if (!is_initialized_) { - var_->GetMutable()->mutable_data(place_, dtype_); - is_initialized_ = true; - VLOG(8) << "initialized varbase: " << name_ << " type: " << dtype_ - << " place: " << place_; + // This is used for python api + void SetOverridedStopGradient(bool stop_gradient) { + if (stop_gradient) { + overrided_stop_gradient_ = 1; } else { - VLOG(8) << "var: " << name_ << " has already been initialized "; + overrided_stop_gradient_ = 0; } - } - - void TrackPreOp(OpBase* pre_op, const std::string& pre_op_out_name, - int pre_op_out_idx, bool pre_op_stop_gradient) { - pre_op_ = pre_op; - pre_op_out_name_ = pre_op_out_name; - pre_op_out_idx_ = pre_op_out_idx; - if (pre_op_stop_gradient) { - stop_gradient_ = pre_op_stop_gradient; + if (grad_var_) { + grad_var_->SetOverridedStopGradient(stop_gradient); 
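// Pairing sketch for the constructor above: has_grad == true eagerly
// creates the companion gradient VarBase under framework::GradVarName,
// i.e. "w" pairs with "w@GRAD", and the grad var itself carries no grad.
static void SketchGradPairing() {
  VarBase w(true, "w");
  VLOG(1) << w.GradVarBase()->Name();  // "w@GRAD"
}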
} } - - void ClearGradient() { - VLOG(1) << "clear gradient of " << Name(); - if (grads_ && grads_->var_ && grads_->var_->IsInitialized()) { - auto grads_t = grads_->var_->GetMutable(); - operators::math::set_constant( - *(platform::DeviceContextPool::Instance().Get( - grads_->var_->Get().place())), - grads_t, 0.0); + // This is used for python api + bool OverridedStopGradient() const { + if (overrided_stop_gradient_ == 0) { + return false; + } else { + return true; } } - framework::LoDTensor& GradValue(); + // This is used inside C++ + int InnerOverridedStopGradient() const { return overrided_stop_gradient_; } - std::unique_ptr NewVarBase(const platform::Place& dst_place, - const bool blocking) const; + bool GradGenerated() const { return grad_generated_; } - inline std::string GradName() const { - return string::Sprintf("%s@IGrad", Name()); + void SetGradGenerated(bool generated) { grad_generated_ = generated; } + // This is used inside C++ + void InnerSetOverridedStopGradient(bool stop_gradient) { + if (overrided_stop_gradient_ == -1) { + overrided_stop_gradient_ = static_cast(stop_gradient); + if (grad_var_) { + grad_var_->InnerSetOverridedStopGradient(stop_gradient); + } + } else { + VLOG(6) << "Ignore Stop gradient conversion for Var: " << Name() + << "Set value is: " << overrided_stop_gradient_; + } } - std::string name_; - framework::proto::VarType::Type type_; - platform::Place place_; + void SetPersistable(bool persistable) { persistable_ = persistable; } - std::unique_ptr var_; - std::shared_ptr grads_; + bool Persistable() const { return persistable_; } - private: - framework::proto::VarType::Type dtype_; - bool stop_gradient_; - bool persistable_; - bool is_initialized_; - OpBase* pre_op_; - std::string pre_op_out_name_; - int pre_op_out_idx_; - - // A private flag to check memory leak - static ThreadSafeNameSet name_set_; -}; + void AddGradOps(const std::weak_ptr& op); -/* The wrapper for OpDesc which holds a OpDesc and a OpDesc of its - * gradient. This object should be managed totally by Python intepreter. 
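// Semantics sketch for the tri-state above: -1 means unset (the framework
// may decide), while 0/1 are explicit user choices that
// InnerSetOverridedStopGradient must not overwrite.
static void SketchStopGradient() {
  VarBase v(true, "v");
  v.InnerSetOverridedStopGradient(true);  // applies: state was -1
  v.SetOverridedStopGradient(false);      // user override wins: state -> 0
  v.InnerSetOverridedStopGradient(true);  // ignored: state already set
}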
- */ -class PYBIND11_HIDDEN OpBase { - public: - OpBase(const std::string& type) - : type_(type), - trace_id_(-1), - place_(platform::CPUPlace()), - backward_hooks_() {} - - virtual ~OpBase() { - for (const auto& it : outputs_ref) { - auto vb = it.lock(); - if (vb) { - VLOG(3) << "Op reset by" << vb->name_; - vb->ResetPreOp(this); - } - } - // TODO(minqiyang): remove op_desc from block_desc in tracer - // release resource - for (framework::OpDesc* desc : grad_op_descs_) { - delete desc; + std::vector GradOps() { + std::vector rlt; + // TODO(jiabin): use better data structure to remove nullptr when we find it + for (const auto& wk_ptr : grad_ops_) { + OpBase* tmp_op = wk_ptr.lock().get(); + if (tmp_op) rlt.emplace_back(tmp_op); } + return rlt; } + void ClearGradOps() { grad_ops_.clear(); } - std::vector ApplyGrad( - BackwardSumMap* bck_map, GradientRef* grad_ref, - const detail::BackwardStrategy& bck_stratedy); - - inline std::string Type() const { return type_; } - inline std::string GradOpType(size_t index) const { - PADDLE_ENFORCE_NOT_NULL(grad_op_descs_[index]); - return grad_op_descs_[index]->Type(); - } - - void RegisterBackwardHooks(const py::object& callable); - - void InvokeBackwardHooks(); - - void TrackPreOp( - const std::string& inp_name, - const std::vector>& inputs) { - auto& pre_ops_list = pre_ops_[inp_name]; - pre_ops_list.reserve(inputs.size()); - auto& pre_ops_out_idx_list = pre_ops_out_idx_[inp_name]; - for (std::shared_ptr inp_var : inputs) { - if (inp_var->PreOp() && !inp_var->IsStopGradient()) { - VLOG(3) << "add pre op " << inp_var->PreOp()->Type() << " in slot " - << inp_name; - pre_ops_list.emplace_back(inp_var->PreOp()); - pre_ops_out_idx_list.push_back(inp_var->PreOpOutIdx()); - } else { - VLOG(3) << "no pre op in slot " << inp_name - << " input var stop_gradient: " << inp_var->IsStopGradient(); - pre_ops_list.emplace_back(nullptr); - // pre_ops_out_idx_list.push_back(-1); - } + const std::string& Name() const { return name_; } + + void SetName(const std::string& name) { + name_ = name; + if (grad_var_) { + grad_var_->SetName(GradVarName()); } } - std::string type_; - int trace_id_; + std::string GradVarName() { return framework::GradVarName(name_); } - // Note: each fwd op corresponds to a vector of bwd ops. - std::vector grad_op_descs_; + void SetType(framework::proto::VarType::Type type) { type_ = type; } - platform::Place place_; + framework::proto::VarType::Type Type() const { return type_; } + + void SetDataType(framework::proto::VarType::Type data_type) { + data_type_ = data_type; + if (grad_var_) { + grad_var_->SetDataType(data_type_); + } + } - OpBasePtrMap pre_ops_; - std::map> pre_ops_out_idx_; + framework::proto::VarType::Type DataType() const { return data_type_; } - VarBaseWeakPtrList outputs_ref; - // Inputs to a vector of bwd ops. - std::vector grad_input_vars_; - // Outputs to a vector of bwd ops. 
- std::vector grad_output_vars_; + void ClearGradient(); - std::vector backward_hooks_; + std::shared_ptr NewVarBase(const platform::Place& dst_place, + const bool blocking) const; - framework::AttributeMap attrs_; + private: + framework::Variable var_; + std::string name_; + std::shared_ptr grad_var_; + mutable size_t copied_counter_ = 0; + + // grad_op indicates which grad_op will this var be used as input + std::vector> grad_ops_; + // add this property for users may set stop_gradient themselves and this + // should override the + // frameworks setting (-1) unset, (1) true, (0) false + int overrided_stop_gradient_{-1}; + bool grad_generated_{false}; + bool persistable_{false}; + + framework::proto::VarType::Type type_{framework::proto::VarType::LOD_TENSOR}; + framework::proto::VarType::Type data_type_{framework::proto::VarType::FP32}; + static ThreadSafeNameSet name_set_; }; class Layer { @@ -417,18 +204,16 @@ class Layer { virtual std::vector> Forward( const std::vector>& inputs) { - std::vector> vars; - return vars; + return {}; } }; // infer var type context for imperative mode -class PYBIND11_HIDDEN RuntimeInferVarTypeContext - : public framework::InferVarTypeContext { +class RuntimeInferVarTypeContext : public framework::InferVarTypeContext { public: - RuntimeInferVarTypeContext(const imperative::VarBasePtrMap* inputs, - imperative::VarBasePtrMap* outputs, - const framework::AttributeMap* attrs_map) + RuntimeInferVarTypeContext(const NameVarBaseMap& inputs, + const NameVarBaseMap* outputs, + const framework::AttributeMap& attrs_map) : InferVarTypeContext(nullptr, nullptr), inputs_(inputs), outputs_(outputs), @@ -436,19 +221,19 @@ class PYBIND11_HIDDEN RuntimeInferVarTypeContext input_names_(), output_names_(), var_set_() { - input_names_.reserve(inputs_->size()); - for (auto& it : *inputs_) { - for (std::shared_ptr var : it.second) { + input_names_.reserve(inputs_.size()); + for (auto& it : inputs_) { + for (auto& var : it.second) { input_names_[it.first].emplace_back(var->Name()); - var_set_[var->Name()] = var; + var_set_[var->Name()] = var.get(); } } output_names_.reserve(outputs_->size()); for (auto& it : *outputs_) { - for (std::shared_ptr var : it.second) { + for (auto& var : it.second) { output_names_[it.first].emplace_back(var->Name()); - var_set_[var->Name()] = var; + var_set_[var->Name()] = var.get(); } } } @@ -456,8 +241,10 @@ class PYBIND11_HIDDEN RuntimeInferVarTypeContext virtual ~RuntimeInferVarTypeContext() {} framework::Attribute GetAttr(const std::string& name) const override { - PADDLE_ENFORCE_NOT_NULL(attrs_); - return attrs_->at(name); + auto iter = attrs_.find(name); + PADDLE_ENFORCE_EQ(iter != attrs_.end(), true, "Cannot find attribute %s", + name); + return iter->second; } bool HasVar(const std::string& name) const override { @@ -465,8 +252,7 @@ class PYBIND11_HIDDEN RuntimeInferVarTypeContext } bool HasInput(const std::string& name) const override { - PADDLE_ENFORCE_NOT_NULL(inputs_); - return inputs_->count(name) > 0; + return inputs_.count(name) > 0; } bool HasOutput(const std::string& name) const override { @@ -476,17 +262,26 @@ class PYBIND11_HIDDEN RuntimeInferVarTypeContext const std::vector& Input( const std::string& name) const override { - return input_names_.at(name); + auto iter = input_names_.find(name); + PADDLE_ENFORCE_EQ(iter != input_names_.end(), true, "Cannot find input %s", + name); + return iter->second; } const std::vector& Output( const std::string& name) const override { - return output_names_.at(name); + auto iter = 
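// Usage sketch, matching how OpBase::Run drives it: this context adapts the
// static-graph InferVarTypeContext interface to NameVarBaseMaps so that
// op-level infer_var_type_ functions run unchanged in dygraph.
static void SketchInferVarType(const framework::OpInfo& info,
                               const NameVarBaseMap& ins,
                               NameVarBaseMap* outs,
                               const framework::AttributeMap& attrs) {
  RuntimeInferVarTypeContext ctx(ins, outs, attrs);
  if (info.infer_var_type_) {
    info.infer_var_type_(&ctx);  // may SetType/SetDataType on the outputs
  }
}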
output_names_.find(name); + PADDLE_ENFORCE_EQ(iter != output_names_.end(), true, + "Cannot find output %s", name); + return iter->second; } framework::proto::VarType::Type GetType( const std::string& name) const override { - return var_set_.at(name)->Type(); + auto iter = var_set_.find(name); + PADDLE_ENFORCE_EQ(iter != var_set_.end(), true, + "Cannot find var %s in GetType", name); + return iter->second->Type(); } void SetType(const std::string& name, @@ -500,7 +295,10 @@ class PYBIND11_HIDDEN RuntimeInferVarTypeContext framework::proto::VarType::Type GetDataType( const std::string& name) const override { - return var_set_.at(name)->DataType(); + auto iter = var_set_.find(name); + PADDLE_ENFORCE_EQ(iter != var_set_.end(), true, + "Cannot find var %s in GetDataType", name); + return iter->second->DataType(); } void SetDataType(const std::string& name, @@ -537,13 +335,97 @@ class PYBIND11_HIDDEN RuntimeInferVarTypeContext } private: - const imperative::VarBasePtrMap* inputs_; - imperative::VarBasePtrMap* outputs_; - const framework::AttributeMap* attrs_; + const NameVarBaseMap& inputs_; + const NameVarBaseMap* outputs_; + const framework::AttributeMap& attrs_; std::unordered_map> input_names_; std::unordered_map> output_names_; - std::unordered_map> - var_set_; + std::unordered_map var_set_; +}; + +// TODO(zjl): to support py_func layer +class OpBase : public std::enable_shared_from_this { + DISABLE_COPY_AND_ASSIGN(OpBase); + + public: + ~OpBase() { VLOG(3) << "Destruct Op: " << Type() << std::endl; } + + // Developer should not rely on this method to create OpBase. + // OpBase should be created in Tracer and managed by Tracer totally. + template + static std::shared_ptr Create(Args&&... args) { + return std::shared_ptr(new OpBase(std::forward(args)...)); + } + + size_t id() const { return id_; } + + const std::string& Type() const { return op_->Type(); } + + void Run(const NameVarBaseMap& ins, const NameVarBaseMap& outs); + + const framework::VariableNameMap& InputNameMap() const { + return op_->Inputs(); + } + + const framework::VariableNameMap& OutputNameMap() const { + return op_->Outputs(); + } + + const framework::AttributeMap& Attrs() const { return op_->Attrs(); } + const framework::OpInfo& Info() const { return op_->Info(); } + + void ClearBackwardTrace(); + + const std::vector& GradPendingOps() const { + return grad_pending_ops_; + } + + void InsertGradPendingOps(OpBase* op) { grad_pending_ops_.emplace_back(op); } + + void SortGradPendingOps() { + std::sort(grad_pending_ops_.begin(), grad_pending_ops_.end(), + [](OpBase* op1, OpBase* op2) { return op1->id() > op2->id(); }); + } + NameVarBaseMap* GetMutableOutsMap() { return &outs_; } + NameVarBaseMap* GetMutableInsMap() { return &ins_; } + const NameVarBaseMap& GetInsMap() { return ins_; } + const NameVarBaseMap& GetOutsMap() { return outs_; } + const platform::Place& place() const { return place_; } + + // TODO(jiabin) prepare for backward hook + void RegisterBackwardHooks(const std::function& func) { + backward_hooks_.emplace_back(func); + } + + void InvokeBackwardHooks() { + for (const auto& func : backward_hooks_) { + func(); + VLOG(5) << "Invoke Backward Hook for: " << Type() << std::endl; + } + } + + private: + OpBase(size_t id, const std::string& type, const NameVarBaseMap& ins, + const NameVarBaseMap& outs, framework::AttributeMap attrs, + const platform::Place& place); + + OpBase(size_t id, const framework::OpDesc& op_desc, + const platform::Place& place); + + size_t id_; + + std::unique_ptr op_; + + std::vector> 
backward_hooks_;
+  platform::Place place_;
+
+  // No need for std::weak_ptr here: an op is bound to a certain Tracer and
+  // would not be used by a Tracer that did not create it.
+  std::vector<OpBase*> grad_pending_ops_;
+
+  // This part is only used for backward
+  NameVarBaseMap ins_;
+  NameVarBaseMap outs_;
 };
 
 }  // namespace imperative
diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc
index d9630bd6..ab612b2f 100644
--- a/paddle/fluid/imperative/nccl_context.cc
+++ b/paddle/fluid/imperative/nccl_context.cc
@@ -82,11 +82,18 @@ void NCCLParallelContext::SendNCCLID(const std::string &ep,
   if (inet_pton(AF_INET, host.c_str(), &serv_addr.sin_addr) <= 0)
     PADDLE_THROW("invalied address: %s", ep);
 
+  int try_times = 0;
   while (true) {
     if (connect(sock, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) {
       VLOG(0) << "worker: " << ep
-              << " is not ready, will retry after 3 seconds...";
+              << (try_times < 5 ? " is not ready, will retry after 3 seconds..."
+                                : " is not ready. Maybe some process is "
+                                  "occupying the GPUs of this node now, and "
+                                  "you should kill those processes manually. "
+                                  "Will retry after 3 seconds...");
+      std::this_thread::sleep_for(std::chrono::seconds(3));
+      ++try_times;
       continue;
     }
     VLOG(3) << "sending the ncclUniqueId to " << ep;
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
new file mode 100644
index 00000000..6f8ee92b
--- /dev/null
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -0,0 +1,123 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "paddle/fluid/imperative/prepared_operator.h" +#include + +namespace paddle { +namespace imperative { + +const framework::Tensor* GetTensorFromVar(const framework::Variable& var) { + if (var.IsType()) { + return &(var.Get()); + } else if (var.IsType()) { + return &(var.Get().value()); + } else { + return nullptr; + } +} + +void PreparedOp::PrepareData( + const platform::Place& place, const NameVarBaseMap& ins, + const framework::OperatorWithKernel& op, + const framework::OpKernelType& expected_kernel_key) { + for (const auto& name_pair : ins) { + for (const auto& var_base : name_pair.second) { + const auto* tensor = GetTensorFromVar(var_base->Var()); + if (tensor && tensor->IsInitialized()) { + auto tmp_place = tensor->place(); + // TODO(jiabin): Support transform data layout when we Verify it on more + // tests + if (!(tmp_place == place)) { + auto kernel_type_for_var = op.GetKernelTypeForVar( + name_pair.first, *tensor, expected_kernel_key); + if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) { + continue; + } else { + VLOG(3) << "Transform Variable " << var_base->Name() << " from " + << kernel_type_for_var << " to " << expected_kernel_key; + framework::Tensor out; + TransformData(expected_kernel_key, kernel_type_for_var, *tensor, + &out); + SetTensorToVariable(var_base->Var(), out, var_base->MutableVar()); + } + } + } + } + } +} + +PreparedOp::PreparedOp(const framework::OperatorBase& op, + const framework::RuntimeContext& ctx, + framework::OperatorWithKernel::OpKernelFunc func, + platform::DeviceContext* dev_ctx, + std::vector* kernel_configs) + : op_(op), + ctx_(ctx), + func_(std::move(func)), + dev_ctx_(dev_ctx), + kernel_configs_(kernel_configs) {} + +PreparedOp PreparedOp::Prepare(const framework::RuntimeContext& ctx, + const framework::OperatorWithKernel& op, + platform::Place place, + const NameVarBaseMap& ins) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(place); + + // check if op[type] has kernel registered. 
+ auto& all_op_kernels = op.AllOpKernels(); + auto kernels_iter = all_op_kernels.find(op.Type()); + if (kernels_iter == all_op_kernels.end()) { + PADDLE_THROW( + "There are no kernels which are registered in the %s operator.", + op.Type()); + } + + auto& kernels = kernels_iter->second; + + auto expected_kernel_key = + op.GetExpectedKernelType(framework::ExecutionContext( + op, framework::Scope(), *dev_ctx, ctx, nullptr)); + VLOG(3) << "expected_kernel_key:" << expected_kernel_key; + + auto kernel_iter = kernels.find(expected_kernel_key); + // TODO(jiabin): Add operator.cc's line 1000 part back when we need that case + if (kernel_iter == kernels.end()) { + PADDLE_THROW("op %s does not have kernel for %s", op.Type(), + KernelTypeToString(expected_kernel_key)); + } + std::vector* kernel_configs = + op.GetKernelConfig(expected_kernel_key); + + if (!(expected_kernel_key.place_ == place)) { + dev_ctx = pool.Get(expected_kernel_key.place_); + place = dev_ctx->GetPlace(); + } + + PrepareData(place, ins, op, expected_kernel_key); + return PreparedOp(op, ctx, kernel_iter->second, dev_ctx, kernel_configs); +} + +void PreparedOp::Run() { + // TODO(zjl): remove scope in dygraph + framework::Scope scope; + op_.RuntimeInferShape(scope, dev_ctx_->GetPlace(), ctx_); + VLOG(6) << "Finish Runtime infer shape"; + func_(framework::ExecutionContext(op_, scope, *dev_ctx_, ctx_, + kernel_configs_)); +} + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h new file mode 100644 index 00000000..886311f8 --- /dev/null +++ b/paddle/fluid/imperative/prepared_operator.h @@ -0,0 +1,62 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
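// Lifecycle sketch for the class declared below, following the .cc above:
// Prepare() picks the kernel for the expected key, may fall back to the
// kernel's preferred place, and runs PrepareData to transform mismatched
// inputs; Run() then does RuntimeInferShape plus the kernel launch. The
// usage pattern, as OpBase::Run shows it:
//   auto prepared = PreparedOp::Prepare(ctx, *op_kernel, place, ins);
//   prepared.Run();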
+ +#pragma once +#include +#include +#include +#include +#include "paddle/fluid/framework/data_transform.h" +#include "paddle/fluid/framework/op_kernel_type.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/imperative/type_defs.h" + +namespace paddle { +namespace imperative { + +const framework::Tensor* GetTensorFromVar(const framework::Variable& var); + +class PreparedOp { + public: + static PreparedOp Prepare(const framework::RuntimeContext& ctx, + const framework::OperatorWithKernel& op, + platform::Place place, const NameVarBaseMap& ins); + + inline platform::DeviceContext* GetDeviceContext() const { return dev_ctx_; } + + void Run(); + + static void PrepareData(const platform::Place& place, + const NameVarBaseMap& ins, + const framework::OperatorWithKernel& op, + const framework::OpKernelType& expected_kernel_key); + + private: + PreparedOp(const framework::OperatorBase& op, + const framework::RuntimeContext& ctx, + framework::OperatorWithKernel::OpKernelFunc func, + platform::DeviceContext* dev_ctx, + std::vector* kernel_configs); + + private: + const framework::OperatorBase& op_; + const framework::RuntimeContext& ctx_; + framework::OperatorWithKernel::OpKernelFunc func_; + platform::DeviceContext* dev_ctx_; + std::vector* kernel_configs_; +}; + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/tests/CMakeLists.txt b/paddle/fluid/imperative/tests/CMakeLists.txt new file mode 100644 index 00000000..f32f0a17 --- /dev/null +++ b/paddle/fluid/imperative/tests/CMakeLists.txt @@ -0,0 +1,5 @@ +cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context) +cc_test(test_gradient_accmulator SRCS test_gradient_accmulator.cc DEPS gradient_accumulator memcpy) +cc_test(test_layer SRCS test_layer.cc DEPS layer proto_desc operator op_registry variable_helper mul_op memcpy) +cc_test(test_prepare_op SRCS test_prepare_op.cc DEPS prepared_operator op_info split_op layer concat_and_split assign_op place) +cc_test(test_tracer SRCS test_tracer.cc DEPS tracer layer proto_desc operator op_registry variable_helper mul_op memcpy) diff --git a/paddle/fluid/imperative/nccl_context_test.cc b/paddle/fluid/imperative/tests/nccl_context_test.cc similarity index 100% rename from paddle/fluid/imperative/nccl_context_test.cc rename to paddle/fluid/imperative/tests/nccl_context_test.cc diff --git a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc new file mode 100644 index 00000000..29a51733 --- /dev/null +++ b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc @@ -0,0 +1,121 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
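// Contract pinned down by the tests in this new file (sketch assumes the
// gradient_accumulator, lod_tensor and place headers are included):
// TensorAdd(src, &dst) performs dst += src elementwise, and both variables
// must hold initialized LoDTensors with identical numel, dtype and place.
static void SketchTensorAdd() {
  paddle::framework::Variable src, dst;
  auto* s = src.GetMutable<paddle::framework::LoDTensor>();
  auto* d = dst.GetMutable<paddle::framework::LoDTensor>();
  s->Resize(paddle::framework::make_ddim({2}));
  d->Resize(paddle::framework::make_ddim({2}));
  auto* sp = s->mutable_data<float>(paddle::platform::CPUPlace());
  auto* dp = d->mutable_data<float>(paddle::platform::CPUPlace());
  sp[0] = 1.f; sp[1] = 2.f;
  dp[0] = 3.f; dp[1] = 4.f;
  paddle::imperative::TensorAdd(src, &dst);  // dst now holds {4.f, 6.f}
}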
+ +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/imperative/gradient_accumulator.h" +#include "paddle/fluid/memory/memcpy.h" + +namespace imperative = paddle::imperative; +namespace platform = paddle::platform; +namespace framework = paddle::framework; +namespace paddle { +namespace imperative { + +void TensorAdd(const framework::Variable& src, framework::Variable* dst); + +#if defined(PADDLE_WITH_CUDA) +template +int TensorGPUAddTest(platform::CUDAPlace place, T t1, T t2) { + framework::Variable var1; + framework::Variable var2; + std::vector src_data(10, t1); + std::vector dst_data(10, t2); + std::vector result; + platform::CPUPlace src_place; + for (unsigned int i = 0; i < 10; i++) { + result.emplace_back(src_data[i] + dst_data[i]); + } + std::vector dims = {2, 5}; + auto* src = var1.GetMutable(); + auto* dst = var2.GetMutable(); + src->Resize(framework::make_ddim(dims)); + dst->Resize(framework::make_ddim(dims)); + auto* src_mutable = src->mutable_data(place); + auto* dst_mutable = dst->mutable_data(place); + paddle::memory::Copy(place, src_mutable, src_place, src_data.data(), + sizeof(T) * src_data.size(), 0); + paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(), + sizeof(T) * dst_data.size(), 0); + imperative::TensorAdd(var1, &var2); + framework::LoDTensor rlt; + platform::CPUPlace rlt_place; + framework::TensorCopySync(*dst, rlt_place, &rlt); + + for (unsigned int i = 0; i < rlt.numel(); i++) { + if (rlt.data()[i] != result[i]) return 1; + } + return 0; +} +#endif + +template +int TensorCPUAddTest(platform::CPUPlace place, T t1, T t2) { + framework::Variable var1; + framework::Variable var2; + std::vector src_data(10, t1); + std::vector dst_data(10, t2); + std::vector result; + platform::CPUPlace src_place; + for (unsigned int i = 0; i < 10; i++) { + result.emplace_back(src_data[i] + dst_data[i]); + } + std::vector dims = {2, 5}; + auto* src = var1.GetMutable(); + auto* dst = var2.GetMutable(); + src->Resize(framework::make_ddim(dims)); + dst->Resize(framework::make_ddim(dims)); + auto* src_mutable = src->mutable_data(place); + auto* dst_mutable = dst->mutable_data(place); + paddle::memory::Copy(place, src_mutable, src_place, src_data.data(), + sizeof(T) * src_data.size()); + paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(), + sizeof(T) * dst_data.size()); + imperative::TensorAdd(var1, &var2); + framework::LoDTensor rlt; + platform::CPUPlace rlt_place; + framework::TensorCopySync(*dst, rlt_place, &rlt); + + for (unsigned int i = 0; i < rlt.numel(); i++) { + if (rlt.data()[i] != result[i]) return 1; + } + return 0; +} + +TEST(test_add_functor, add_functor) { +#if defined(PADDLE_WITH_CUDA) + platform::CUDAPlace gpu_place(0); +#endif + platform::CPUPlace cpu_place; + + int cpu_res = 1; + cpu_res = TensorCPUAddTest(cpu_place, 1.0, 0.0); + EXPECT_EQ(cpu_res, 0); + cpu_res = TensorCPUAddTest(cpu_place, static_cast(1.0), + static_cast(2.0)); + EXPECT_EQ(cpu_res, 0); +#if defined(PADDLE_WITH_CUDA) + int gpu_res = 1; + gpu_res = TensorGPUAddTest(gpu_place, 1.0, 0.0); + EXPECT_EQ(gpu_res, 0); + gpu_res = TensorGPUAddTest(gpu_place, static_cast(1.0), + static_cast(2.0)); + EXPECT_EQ(gpu_res, 0); +#endif +} + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/tests/test_layer.cc b/paddle/fluid/imperative/tests/test_layer.cc new file mode 100644 index 00000000..c92d0fd6 --- /dev/null +++ b/paddle/fluid/imperative/tests/test_layer.cc @@ -0,0 +1,154 @@ +// 
Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +// Created by Jiabin on 2019-08-16. +// + +#include +#include +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/imperative/layer.h" + +namespace imperative = paddle::imperative; +namespace platform = paddle::platform; +namespace framework = paddle::framework; + +namespace paddle { +namespace imperative { + +using vb_vector = std::vector>; + +using var_pair = std::pair; + +TEST(test_layer, test_runtime_context) { + std::shared_ptr vin( + new imperative::VarBase(false, "vin")); + std::shared_ptr vout( + new imperative::VarBase(false, "vout")); + var_pair in_pair = var_pair("X", vb_vector(1, vin)); + var_pair out_pair = var_pair("Out", vb_vector(1, vout)); + imperative::NameVarBaseMap ins = {in_pair}; + imperative::NameVarBaseMap outs = {out_pair}; + framework::AttributeMap attrs; + auto* ctx = new imperative::RuntimeInferVarTypeContext(ins, &outs, attrs); + ASSERT_TRUE(ctx->HasVar("vin")); + ASSERT_TRUE(ctx->HasInput("X")); + ASSERT_TRUE(ctx->HasOutput("Out")); + + ASSERT_ANY_THROW(ctx->GetDataTypes("vin")); + std::vector NullType; + ASSERT_ANY_THROW(ctx->SetDataTypes("vin", NullType)); + ASSERT_ANY_THROW(ctx->GetShape("vin")); + ASSERT_ANY_THROW(ctx->GetLoDLevel("vin")); + ASSERT_ANY_THROW(ctx->SetLoDLevel("vin", 2)); +} + +std::string LayerDebugString(const std::string& op_type, + const NameVarBaseMap& ins, + const NameVarBaseMap& outs); + +TEST(test_layer, test_debug_string_test_debug_Test) { + std::shared_ptr vin( + new imperative::VarBase(false, "vin")); + std::shared_ptr vin_error( + new imperative::VarBase(false, "vin_error")); + std::shared_ptr vout( + new imperative::VarBase(false, "vout")); + std::shared_ptr vout_error( + new imperative::VarBase(false, "vout_error")); + vin_error->MutableVar()->GetMutable(); + vout->MutableVar()->GetMutable(); + vout_error->MutableVar()->GetMutable(); + var_pair in_pair = var_pair("X", vb_vector(1, vin)); + vb_vector vb_in_error = {vin_error, nullptr}; + var_pair vin_error_pair = var_pair("X", vb_in_error); + var_pair out_pair = var_pair("Out", vb_vector(1, vout)); + var_pair vout_error_pair = var_pair("Out2", vb_vector(1, vout_error)); + imperative::NameVarBaseMap ins = {in_pair}; + imperative::NameVarBaseMap ins_error = {vin_error_pair}; + imperative::NameVarBaseMap outs = {out_pair}; + imperative::NameVarBaseMap outs_error = {vout_error_pair}; + ASSERT_NO_FATAL_FAILURE(LayerDebugString("test_op", ins, outs)); + std::string res = LayerDebugString("test_op", ins, outs_error); + ASSERT_TRUE(res.find("UNRESOLVED_TYPE") != std::string::npos); + std::string res2 = LayerDebugString("test_op", ins_error, outs_error); + VLOG(3) << res2; + ASSERT_TRUE(res2.find("NOT_INITED") != std::string::npos); + ASSERT_TRUE(res2.find("NULL") != std::string::npos); +} + +TEST(test_layer, test_clear_backward_info) { + std::shared_ptr vin( + new imperative::VarBase(false, "vin")); + std::shared_ptr vout( + new 
imperative::VarBase(false, "vout")); + framework::OpDesc desc; + platform::CPUPlace place; + var_pair x_pair = var_pair("X", vb_vector(1, vin)); + var_pair y_pair = var_pair("Y", vb_vector(1, vin)); + var_pair out_pair = var_pair("Out", vb_vector(1, vout)); + imperative::NameVarBaseMap ins = {x_pair, y_pair}; + imperative::NameVarBaseMap outs = {out_pair}; + framework::AttributeMap concat_att_map; + concat_att_map["axis"] = 1; + std::shared_ptr op( + OpBase::Create(0, "mul", ins, outs, concat_att_map, place)); + std::shared_ptr preceding_op( + OpBase::Create(0, "mul", ins, outs, concat_att_map, place)); + op->InsertGradPendingOps(preceding_op.get()); + *(op->GetMutableInsMap()) = ins; + *(op->GetMutableOutsMap()) = outs; + ASSERT_GT(op->GetInsMap().size(), 0); + ASSERT_GT(op->GetOutsMap().size(), 0); + ASSERT_GT(op->GradPendingOps().size(), 0); + + op->ClearBackwardTrace(); + + ASSERT_EQ(op->GetInsMap().size(), 0); + ASSERT_EQ(op->GetOutsMap().size(), 0); + ASSERT_EQ(op->GradPendingOps().size(), 0); +} + +TEST(test_layer, test_varbase_basic) { + platform::CPUPlace place; + std::shared_ptr vin( + new imperative::VarBase(false, "vin")); + vin->MutableVar()->GetMutable()->mutable_data( + place); + std::shared_ptr vout(vin->NewVarBase(place, false)); + ASSERT_EQ(vout->Name(), "Itmp0"); + + std::shared_ptr vin_with_grad( + new imperative::VarBase(true, "vin")); + ASSERT_ANY_THROW(vin->MutableGradVar()); + ASSERT_NO_THROW(ASSERT_TRUE(dynamic_cast( + vin_with_grad->MutableGradVar()) != 0)); + ASSERT_TRUE( + dynamic_cast(vin_with_grad->MutableGradVar()) != 0); + vin_with_grad->SetOverridedStopGradient(false); + ASSERT_FALSE(vin_with_grad->OverridedStopGradient()); + ASSERT_NO_FATAL_FAILURE(vin_with_grad->SetPersistable(true)); + ASSERT_FALSE(vin_with_grad->OverridedStopGradient()); + ASSERT_NO_FATAL_FAILURE(vin_with_grad->SetName("new_name")); + ASSERT_EQ(vin_with_grad->Name(), "new_name"); +} +// TODO(jiabin): Add more ut here for layer + +} // namespace imperative +} // namespace paddle + +USE_OP(mul); diff --git a/paddle/fluid/imperative/tests/test_prepare_op.cc b/paddle/fluid/imperative/tests/test_prepare_op.cc new file mode 100644 index 00000000..1a30868d --- /dev/null +++ b/paddle/fluid/imperative/tests/test_prepare_op.cc @@ -0,0 +1,216 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +// Created by Jiabin on 2019-08-19. 
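// As exercised in test_layer above: NewVarBase deep-copies the tensor to
// dst_place and names the copy "Itmp<counter>"; with blocking == true it
// waits on the device contexts so the copy is complete on return. Sketch:
static void SketchCopyToCPU(const paddle::imperative::VarBase& v) {
  auto copy = v.NewVarBase(paddle::platform::CPUPlace(), true /*blocking*/);
  VLOG(1) << "copied to " << copy->Name();  // e.g. "Itmp0"
}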
diff --git a/paddle/fluid/imperative/tests/test_prepare_op.cc b/paddle/fluid/imperative/tests/test_prepare_op.cc
new file mode 100644
index 00000000..1a30868d
--- /dev/null
+++ b/paddle/fluid/imperative/tests/test_prepare_op.cc
@@ -0,0 +1,216 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//
+// Created by Jiabin on 2019-08-19.
+//
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/op_info.h"
+#include "paddle/fluid/imperative/prepared_operator.h"
+#include "paddle/fluid/imperative/type_defs.h"
+
+namespace imperative = paddle::imperative;
+namespace platform = paddle::platform;
+namespace framework = paddle::framework;
+
+namespace paddle {
+namespace imperative {
+
+static framework::RuntimeContext PrepareRuntimeContext(
+    const NameVarBaseMap& ins, const NameVarBaseMap& outs) {
+  framework::VariableValueMap inputs, outputs;
+  for (auto& in_pair : ins) {
+    auto& in_ctx = inputs[in_pair.first];
+    in_ctx.reserve(in_pair.second.size());
+    for (auto& in_var : in_pair.second) {
+      in_ctx.emplace_back(in_var->MutableVar());
+    }
+  }
+
+  for (auto& out_pair : outs) {
+    auto& out_ctx = outputs[out_pair.first];
+    out_ctx.reserve(out_pair.second.size());
+    for (auto& out_var : out_pair.second) {
+      out_ctx.emplace_back(out_var->MutableVar());
+    }
+  }
+  return framework::RuntimeContext(std::move(inputs), std::move(outputs));
+}
+
+static framework::VariableNameMap CreateVarNameMap(
+    const framework::OpInfo& op_info, const std::string& op_type,
+    const NameVarBaseMap& varbase_map, bool is_input) {
+  if (op_info.proto_ == nullptr) {
+    return {};
+  }
+
+  framework::VariableNameMap result;
+
+  for (auto& var :
+       is_input ? op_info.Proto().inputs() : op_info.Proto().outputs()) {
+    auto it = varbase_map.find(var.name());
+    if (it == varbase_map.end()) {
+      PADDLE_ENFORCE_EQ(
+          var.dispensable(), true,
+          "Var: %s not dispensable and there are no such var in inputs",
+          var.name());
+      result[var.name()] = {};
+    } else {
+      auto& var_vector = it->second;
+      std::vector<std::string> args;
+      args.reserve(var_vector.size());
+      for (auto& var_base : var_vector) {
+        args.emplace_back(var_base->Name());
+      }
+      result[var.name()] = std::move(args);
+    }
+  }
+  return result;
+}
+
+using vb_vector = std::vector<std::shared_ptr<imperative::VarBase>>;
+
+using var_pair = std::pair<std::string, vb_vector>;
+
+TEST(test_prepare_op, test_prepare_op) {
+  std::shared_ptr<imperative::VarBase> vin(
+      new imperative::VarBase(false, "vin"));
+  std::shared_ptr<imperative::VarBase> vout(
+      new imperative::VarBase(false, "vout"));
+  framework::OpDesc desc;
+  platform::CPUPlace place;
+  vin->MutableVar()->GetMutable<framework::LoDTensor>()->mutable_data<float>(
+      place);
+  var_pair x_pair = var_pair("X", vb_vector(1, vin));
+  var_pair out_pair = var_pair("Out", vb_vector(1, vout));
+  imperative::NameVarBaseMap ins = {x_pair};
+  imperative::NameVarBaseMap outs = {out_pair};
+  framework::AttributeMap split_attr_map;
+  const auto& info = framework::OpInfoMap::Instance().Get("split");
+  framework::VariableNameMap var_in_map =
+      CreateVarNameMap(info, "split", ins, true);
+  framework::VariableNameMap var_out_map =
+      CreateVarNameMap(info, "split", outs, false);
+  framework::OperatorWithKernel op("split", var_in_map, var_out_map,
+                                   split_attr_map);
+  framework::RuntimeContext ctx = PrepareRuntimeContext(ins, outs);
+  ASSERT_NO_FATAL_FAILURE(PreparedOp preparedOp =
+                              PreparedOp::Prepare(ctx, op, place, ins));
+}
+
+const framework::Tensor* GetTensorFromVar(const framework::Variable& var);
+
+TEST(test_prepare_op, test_get_tensor_from_var) {
+  std::shared_ptr<imperative::VarBase> vout_error(
+      new imperative::VarBase(false, "vout_error"));
+  vout_error->MutableVar()->GetMutable<framework::SelectedRows>();
+  auto* ts = GetTensorFromVar(*vout_error->MutableVar());
+  ASSERT_TRUE(ts != nullptr);
+}
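PrepareRuntimeContext above only rewires pointers; no tensor data is copied. A self-contained mirror of the flattening, with simplified stand-in types in place of VarBase and framework::Variable:

#include <map>
#include <memory>
#include <string>
#include <vector>

struct Variable {};
struct VarBase {
  Variable var;
  Variable* MutableVar() { return &var; }
};
using NameVarBaseMap =
    std::map<std::string, std::vector<std::shared_ptr<VarBase>>>;
using VariableValueMap = std::map<std::string, std::vector<Variable*>>;

// Flatten the imperative name -> VarBase map into the name -> Variable* map
// that a static-graph RuntimeContext expects.
VariableValueMap Flatten(const NameVarBaseMap& vars) {
  VariableValueMap result;
  for (const auto& pair : vars) {
    auto& slot = result[pair.first];
    slot.reserve(pair.second.size());
    for (const auto& vb : pair.second) {
      slot.emplace_back(vb->MutableVar());
    }
  }
  return result;
}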
+#if defined(PADDLE_WITH_CUDA)
+TEST(test_prepare_op, test_prepare_data) {
+  std::shared_ptr<imperative::VarBase> vin(
+      new imperative::VarBase(false, "vin"));
+  std::shared_ptr<imperative::VarBase> vout(
+      new imperative::VarBase(false, "vout"));
+
+  framework::OpDesc desc;
+  platform::CPUPlace cpu_place;
+  platform::CUDAPlace gpu_place(0);
+  std::vector<float> src_data(10, 2.0);
+  std::vector<int64_t> dims = {2, 5};
+
+  // prepare a CPU-only input
+  auto* vin_tensor = vin->MutableVar()->GetMutable<framework::LoDTensor>();
+  vin_tensor->Resize(framework::make_ddim(dims));
+  auto* vin_mutable_tensor = vin_tensor->mutable_data<float>(cpu_place);
+  paddle::memory::Copy(cpu_place, vin_mutable_tensor, cpu_place,
+                       src_data.data(), sizeof(float) * src_data.size());
+
+  var_pair x_pair = var_pair("X", vb_vector(1, vin));
+  var_pair out_pair = var_pair("Out", vb_vector(1, vout));
+  imperative::NameVarBaseMap ins = {x_pair};
+  imperative::NameVarBaseMap outs = {out_pair};
+  framework::AttributeMap assign_attr_map;
+  const auto& info = framework::OpInfoMap::Instance().Get("assign");
+  framework::VariableNameMap var_in_map =
+      CreateVarNameMap(info, "assign", ins, true);
+  framework::VariableNameMap var_out_map =
+      CreateVarNameMap(info, "assign", outs, false);
+  framework::OperatorWithKernel assign_op("assign", var_in_map, var_out_map,
+                                          assign_attr_map);
+  framework::RuntimeContext ctx = PrepareRuntimeContext(ins, outs);
+
+  // test if it can be transformed to GPU place
+  PreparedOp prepared_op = PreparedOp::Prepare(ctx, assign_op, gpu_place, ins);
+  for (const auto& name_pair : ins) {
+    for (const auto& vb : name_pair.second) {
+      ASSERT_TRUE(platform::is_same_place(
+          vb->Var().Get<framework::LoDTensor>().place(), gpu_place));
+    }
+  }
+}
+#endif
+
+TEST(test_prepare_op, test_prepare_data_same_place) {
+  std::shared_ptr<imperative::VarBase> vin(
+      new imperative::VarBase(false, "vin"));
+  std::shared_ptr<imperative::VarBase> vout(
+      new imperative::VarBase(false, "vout"));
+
+  framework::OpDesc desc;
+  platform::CPUPlace cpu_place;
+  std::vector<float> src_data(10, 2.0);
+  std::vector<int64_t> dims = {2, 5};
+
+  // prepare a CPU-only input
+  auto* vin_tensor = vin->MutableVar()->GetMutable<framework::LoDTensor>();
+  vin_tensor->Resize(framework::make_ddim(dims));
+  auto* vin_mutable_tensor = vin_tensor->mutable_data<float>(cpu_place);
+  paddle::memory::Copy(cpu_place, vin_mutable_tensor, cpu_place,
+                       src_data.data(), sizeof(float) * src_data.size());
+
+  var_pair x_pair = var_pair("X", vb_vector(1, vin));
+  var_pair out_pair = var_pair("Out", vb_vector(1, vout));
+  imperative::NameVarBaseMap ins = {x_pair};
+  imperative::NameVarBaseMap outs = {out_pair};
+  framework::AttributeMap assign_attr_map;
+  const auto& info = framework::OpInfoMap::Instance().Get("assign");
+  framework::VariableNameMap var_in_map =
+      CreateVarNameMap(info, "assign", ins, true);
+  framework::VariableNameMap var_out_map =
+      CreateVarNameMap(info, "assign", outs, false);
+  framework::OperatorWithKernel assign_op("assign", var_in_map, var_out_map,
+                                          assign_attr_map);
+  framework::RuntimeContext ctx = PrepareRuntimeContext(ins, outs);
+
+  // test that the data is never transferred to a GPU place
+  PreparedOp prepared_op = PreparedOp::Prepare(ctx, assign_op, cpu_place, ins);
+  for (const auto& name_pair : ins) {
+    for (const auto& vb : name_pair.second) {
+      ASSERT_TRUE(platform::is_same_place(
+          vb->Var().Get<framework::LoDTensor>().place(), cpu_place));
+    }
+  }
+}
+}  // namespace imperative
+}  // namespace paddle
+
+USE_OP(split);
+USE_OP(assign);
diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc
new file mode 100644
index 00000000..f112b9fc
--- /dev/null
+++ b/paddle/fluid/imperative/tests/test_tracer.cc
@@ -0,0 +1,193 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +// Created by Jiabin on 2019-08-16. +// + +#include +#include +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/imperative/tracer.h" +#include "paddle/fluid/memory/memcpy.h" + +namespace imperative = paddle::imperative; +namespace platform = paddle::platform; +namespace framework = paddle::framework; + +namespace paddle { +namespace imperative { + +using vb_vector = std::vector>; + +using var_pair = std::pair; + +TEST(test_tracer, test_trace_op) { + // Doing an mul + imperative::Tracer tracer; + std::shared_ptr x_in( + new imperative::VarBase(true, "x_in")); + std::shared_ptr y_in( + new imperative::VarBase(true, "y_in")); + std::shared_ptr vout( + new imperative::VarBase(true, "vout")); + platform::CPUPlace place; + std::vector src_data(10, 2.0); + std::vector dims1 = {2, 5}; + std::vector dims2 = {5, 2}; + + auto* x_in_tensor = x_in->MutableVar()->GetMutable(); + auto* y_in_tensor = y_in->MutableVar()->GetMutable(); + x_in_tensor->Resize(framework::make_ddim(dims1)); + auto* mutable_x = x_in_tensor->mutable_data(place); + paddle::memory::Copy(place, mutable_x, place, src_data.data(), + sizeof(float) * src_data.size()); + y_in_tensor->Resize(framework::make_ddim(dims2)); + auto* mutable_y = y_in_tensor->mutable_data(place); + paddle::memory::Copy(place, mutable_y, place, src_data.data(), + sizeof(float) * src_data.size()); + + var_pair x_pair = var_pair("X", vb_vector(1, x_in)); + var_pair y_pair = var_pair("Y", vb_vector(1, y_in)); + var_pair out_pair = var_pair("Out", vb_vector(1, vout)); + imperative::NameVarBaseMap ins = {x_pair, y_pair}; + imperative::NameVarBaseMap outs = {out_pair}; + framework::AttributeMap mul_attr_map; + mul_attr_map["use_mkldnn"] = false; + tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); + const auto& out_tensor = vout->Var().Get(); + for (size_t i = 0; i < vout->Var().Get().numel(); i++) { + ASSERT_EQ(out_tensor.data()[i], 20.0); + } +} + +TEST(test_tracer, test_track_backward_output) { + // Doing an mul + imperative::Tracer tracer; + std::shared_ptr x_in( + new imperative::VarBase(true, "x_in")); + std::shared_ptr y_in( + new imperative::VarBase(false, "y_in")); + x_in->SetOverridedStopGradient(false); + std::shared_ptr vout( + new imperative::VarBase(true, "vout")); + platform::CPUPlace place; + std::vector src_data(10, 2.0); + std::vector dims1 = {2, 5}; + std::vector dims2 = {5, 2}; + + auto* x_in_tensor = x_in->MutableVar()->GetMutable(); + auto* y_in_tensor = y_in->MutableVar()->GetMutable(); + x_in_tensor->Resize(framework::make_ddim(dims1)); + auto* mutable_x = x_in_tensor->mutable_data(place); + paddle::memory::Copy(place, mutable_x, place, src_data.data(), + sizeof(float) * src_data.size()); + y_in_tensor->Resize(framework::make_ddim(dims2)); + auto* mutable_y = y_in_tensor->mutable_data(place); + paddle::memory::Copy(place, mutable_y, place, src_data.data(), + sizeof(float) * src_data.size()); + + var_pair x_pair = var_pair("X", vb_vector(1, x_in)); + 
var_pair y_pair = var_pair("Y", vb_vector(1, y_in)); + var_pair out_pair = var_pair("Out", vb_vector(1, vout)); + imperative::NameVarBaseMap ins = {x_pair, y_pair}; + imperative::NameVarBaseMap outs = {out_pair}; + framework::AttributeMap mul_attr_map; + mul_attr_map["use_mkldnn"] = false; + ASSERT_ANY_THROW(tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true)); +} + +TEST(test_tracer, test_track_backward_input) { + // Doing an mul + imperative::Tracer tracer; + std::shared_ptr x_in( + new imperative::VarBase(true, "x_in")); + std::shared_ptr y_in( + new imperative::VarBase(true, "y_in")); + std::shared_ptr vout( + new imperative::VarBase(false, "vout")); + platform::CPUPlace place; + x_in->SetOverridedStopGradient(false); + std::vector src_data(10, 2.0); + std::vector dims1 = {2, 5}; + std::vector dims2 = {5, 2}; + + auto* x_in_tensor = x_in->MutableVar()->GetMutable(); + auto* y_in_tensor = y_in->MutableVar()->GetMutable(); + x_in_tensor->Resize(framework::make_ddim(dims1)); + auto* mutable_x = x_in_tensor->mutable_data(place); + paddle::memory::Copy(place, mutable_x, place, src_data.data(), + sizeof(float) * src_data.size()); + y_in_tensor->Resize(framework::make_ddim(dims2)); + auto* mutable_y = y_in_tensor->mutable_data(place); + paddle::memory::Copy(place, mutable_y, place, src_data.data(), + sizeof(float) * src_data.size()); + + var_pair x_pair = var_pair("X", vb_vector(1, x_in)); + var_pair y_pair = var_pair("Y", vb_vector(1, y_in)); + var_pair out_pair = var_pair("Out", vb_vector(1, vout)); + imperative::NameVarBaseMap ins = {x_pair, y_pair}; + imperative::NameVarBaseMap outs = {out_pair}; + framework::AttributeMap mul_attr_map; + mul_attr_map["use_mkldnn"] = false; + ASSERT_ANY_THROW(tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true)); +} +#if defined(PADDLE_WITH_CUDA) +TEST(test_tracer, test_trace_op_with_multi_device_inputs) { + // Doing an mul + imperative::Tracer tracer; + std::shared_ptr x_in( + new imperative::VarBase(true, "x_in")); + std::shared_ptr y_in( + new imperative::VarBase(true, "y_in")); + std::shared_ptr vout( + new imperative::VarBase(true, "vout")); + platform::CPUPlace place; + platform::CUDAPlace gpu_place(0); + std::vector src_data(10, 2.0); + std::vector dims1 = {2, 5}; + std::vector dims2 = {5, 2}; + + auto* x_in_tensor = x_in->MutableVar()->GetMutable(); + auto* y_in_tensor = y_in->MutableVar()->GetMutable(); + x_in_tensor->Resize(framework::make_ddim(dims1)); + auto* mutable_x = x_in_tensor->mutable_data(place); + paddle::memory::Copy(place, mutable_x, place, src_data.data(), + sizeof(float) * src_data.size()); + y_in_tensor->Resize(framework::make_ddim(dims2)); + auto* mutable_y = y_in_tensor->mutable_data(gpu_place); + paddle::memory::Copy(gpu_place, mutable_y, place, src_data.data(), + sizeof(float) * src_data.size(), 0); + var_pair x_pair = var_pair("X", vb_vector(1, x_in)); + var_pair y_pair = var_pair("Y", vb_vector(1, y_in)); + var_pair out_pair = var_pair("Out", vb_vector(1, vout)); + imperative::NameVarBaseMap ins = {x_pair, y_pair}; + imperative::NameVarBaseMap outs = {out_pair}; + framework::AttributeMap mul_attr_map; + mul_attr_map["use_mkldnn"] = false; + tracer.TraceOp("mul", ins, outs, mul_attr_map, gpu_place, true); + framework::LoDTensor rlt; + framework::TensorCopySync(vout->Var().Get(), place, + &rlt); + for (size_t i = 0; i < rlt.numel(); i++) { + ASSERT_EQ(rlt.data()[i], 20.0); + } +} +#endif +} // namespace imperative +} // namespace paddle + +USE_OP(mul); diff --git a/paddle/fluid/imperative/tracer.cc 
b/paddle/fluid/imperative/tracer.cc index 682bea7d..0fff6b8c 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -11,282 +11,244 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - #include "paddle/fluid/imperative/tracer.h" - -#include -#include -#include #include #include - -#include "paddle/fluid/framework/var_type_inference.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { namespace imperative { -void CreateGradOp(const framework::OpDesc& op_desc, - const std::unordered_set& no_grad_set, - const std::vector& grad_sub_block, - std::vector* grad_op_descs, - std::unordered_map* grad_to_var) { - PADDLE_ENFORCE(grad_op_descs->empty()); - const framework::OpInfo& op_info = - framework::OpInfoMap::Instance().Get(op_desc.Type()); - if (!op_info.grad_op_maker_) return; - - std::vector> descs = - op_info.GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block); - for (auto& desc : descs) { - grad_op_descs->emplace_back(desc.release()); - } -} - -void CreateNoBuffuerGrad(std::shared_ptr var, - platform::DeviceContext* dev_ctx) { - PADDLE_ENFORCE_NOT_NULL(var, "Could not get valid var base"); - PADDLE_ENFORCE_NOT_NULL(dev_ctx, - "Could not get valid device from forward op"); - - if (var->grads_ == nullptr) { - auto& var_t = var->var_->Get(); - var->grads_ = std::shared_ptr( - new VarBase(var->GradName(), framework::proto::VarType::FP32, - framework::vectorize(var_t.dims()), dev_ctx->GetPlace(), - var->IsStopGradient(), false, false)); +static std::vector> CreateGradOpDescs( + const framework::OpInfo& op_info, const framework::OpDesc& op_desc, + const std::unordered_set& no_grad_set, + const std::vector& grad_sub_block, + std::unordered_map* grad_to_var) { + if (op_info.grad_op_maker_) { + return op_info.grad_op_maker_(op_desc, no_grad_set, grad_to_var, + grad_sub_block); + } else { + return {}; } } -platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs) { - platform::Place result = place; - for (const auto& it : inputs) { - for (const std::shared_ptr& var : it.second) { - platform::Place tmp_place = - var->var_->Get().place(); - if (!platform::is_same_place(tmp_place, result)) { - PADDLE_THROW( - "Input variable should keep in the same place: %s, but get place: " - "%s of input %s instead", - result, tmp_place, it.first); - } +static void PassStopGradient(const NameVarBaseMap& outs, bool generate_grad) { + for (const auto& name_pair : outs) { + for (const auto& vb : name_pair.second) { + VLOG(6) << "Set output: " << vb->Name() << "'s OverridedStopGradient as " + << generate_grad; + vb->InnerSetOverridedStopGradient(generate_grad); } } - - return result; } -framework::VariableNameMap CreateInputVarNameMap( - const OpBase* op, const VarBasePtrMap& varbase_map) { - framework::VariableNameMap result; - - auto& info_map = framework::OpInfoMap::Instance(); - auto* op_info = info_map.GetNullable(op->Type()); - if (op_info == nullptr || op_info->proto_ == 
nullptr) { - return result; +void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, + const NameVarBaseMap& outs, framework::AttributeMap attrs, + const platform::Place& place, bool trace_backward) { + platform::RecordEvent event(type); + VLOG(1) << "Trace Op: " << type; + size_t op_id = GenerateUniqueId(); + auto op = OpBase::Create(op_id, type, ins, outs, std::move(attrs), place); + op->Run(ins, outs); + + if (ComputeRequiredGrad(ins, outs, trace_backward)) { + TraceBackward(op, framework::OpDesc(op->Type(), op->InputNameMap(), + op->OutputNameMap(), op->Attrs()), + ins, outs); + } else { + VLOG(3) << "No Grad to track for Op: " << type; } - - for (auto& in : op_info->Proto().inputs()) { - auto it = varbase_map.find(in.name()); - if (it == varbase_map.end()) { - PADDLE_ENFORCE(in.dispensable()); - result[in.name()] = {}; - } else { - auto var_vector = it->second; - std::vector args; - args.reserve(var_vector.size()); - for (std::shared_ptr var_base : var_vector) { - args.emplace_back(var_base->Name()); - } - result[in.name()] = args; - } - } - return result; } -framework::VariableNameMap CreateOutputVarNameMap( - const OpBase* op, const VarBasePtrMap& varbase_map) { - framework::VariableNameMap result; - - auto& info_map = framework::OpInfoMap::Instance(); - auto* op_info = info_map.GetNullable(op->Type()); - if (op_info == nullptr || op_info->proto_ == nullptr) { - return result; - } - - for (auto& out : op_info->Proto().outputs()) { - auto it = varbase_map.find(out.name()); - if (it == varbase_map.end()) { - PADDLE_ENFORCE(out.dispensable()); - result[out.name()] = {}; - } else { - auto var_vector = it->second; - std::vector args; - args.reserve(var_vector.size()); - for (const std::shared_ptr& var_base : var_vector) { - args.emplace_back(var_base->Name()); +bool Tracer::ComputeRequiredGrad(const NameVarBaseMap& ins, + const NameVarBaseMap& outs, + bool trace_backward) { + if (!trace_backward) return false; + + for (const auto& name_pair : ins) { + for (const auto& var_base : name_pair.second) { + if (!var_base->OverridedStopGradient()) { + VLOG(6) << "Find out input: " << var_base->Name() + << "'s GeneratedGrad is True"; + PassStopGradient(outs, var_base->OverridedStopGradient()); + return true; } - result[out.name()] = args; } } - return result; + return false; } -Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {} +void Tracer::TraceBackward(const std::shared_ptr& fwd_op, + const framework::OpDesc& fwd_op_desc, + const NameVarBaseMap& ins, + const NameVarBaseMap& outs) { + // grad_to_var is a map of framework::GradVarName(in_var_name/out_var_name) -> + // in_var_name/out_var_name + std::unordered_map grad_to_var; -void Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, - VarBasePtrMap* outputs, framework::AttributeMap attrs_map, - const platform::Place expected_place, - const bool stop_gradient) { - platform::RecordEvent record_event(op->type_); - framework::VariableValueMap invars_map; - framework::VariableValueMap outvars_map; + // Get grad_op_desc using fwd_op_desc + std::vector> grad_op_descs_ = + CreateGradOpDescs(fwd_op->Info(), fwd_op_desc, {}, {}, &grad_to_var); - // Construct input_vars_map and output_vars_map - std::map> current_vars_map; - for (auto it : inputs) { - auto& invars = invars_map[it.first]; - invars.reserve(it.second.size()); - for (std::shared_ptr inp : it.second) { - PADDLE_ENFORCE_NOT_NULL(inp->var_, "op %s input %s nullptr", op->Type(), - inp->Name()); + // Create grad_ops using grad_op_descs - 
invars.emplace_back(inp->var_.get()); - if (!stop_gradient) { - current_vars_map[inp->Name()] = inp; - } - VLOG(3) << "input var name: " << inp->Name() - << " inited: " << inp->var_->IsInitialized() - << " stop_grad: " << inp->IsStopGradient(); - } - op->TrackPreOp(it.first, it.second); - } + size_t grad_op_num = grad_op_descs_.size(); - for (const auto& it : *outputs) { - auto& outvars = outvars_map[it.first]; - const std::vector>& outputs_tmp = - it.second; - outvars.reserve(outputs_tmp.size()); - for (size_t i = 0U; i < outputs_tmp.size(); ++i) { - // Add weak_ptr to track outputs - op->outputs_ref.emplace_back(outputs_tmp[i]); - std::shared_ptr out = outputs_tmp[i]; - outvars.emplace_back(out->var_.get()); - out->TrackPreOp(op, it.first, i, stop_gradient); - if (!stop_gradient) { - current_vars_map[out->Name()] = out; - } + VLOG(3) << "Create " << grad_op_num << " grad op desc(s) to op " + << fwd_op->Type(); - VLOG(3) << "output var name: " << out->Name() - << " inited: " << out->var_->IsInitialized() - << " stop_grad: " << out->IsStopGradient(); - } + if (grad_op_num == 0) { + return; } - - // Check attrs and create op - framework::VariableNameMap invars_name_map = - CreateInputVarNameMap(op, inputs); - framework::VariableNameMap outvars_name_map = - CreateOutputVarNameMap(op, *outputs); - - auto& info = framework::OpInfoMap::Instance().Get(op->Type()); - if (info.Checker() != nullptr) { - info.Checker()->Check(&attrs_map); + // Build a map to record var_name -> std::shared_ptr*, + // so that we can find suitable var in grad op descs + std::unordered_map*> name_to_var; + for (auto& pair : ins) { + for (auto& var : pair.second) { + auto& var_ptr = name_to_var[var->Name()]; + PADDLE_ENFORCE_EQ(var_ptr == nullptr || var_ptr->get() == var.get(), true, + "There are different variables with same name %s", + var->Name()); + var_ptr = &var; + } } - std::unique_ptr op_base = - framework::OpRegistry::CreateOp(op->Type(), invars_name_map, - outvars_name_map, attrs_map); - - if (info.infer_var_type_) { - RuntimeInferVarTypeContext infer_var_type_ctx(&inputs, outputs, &attrs_map); - info.infer_var_type_(&infer_var_type_ctx); + for (auto& pair : outs) { + for (auto& var : pair.second) { + auto& var_ptr = name_to_var[var->Name()]; + PADDLE_ENFORCE_EQ(var_ptr == nullptr || var_ptr->get() == var.get(), true, + "There are different variables with same name %s", + var->Name()); + var_ptr = &var; + } } - // TODO(minqiyang): Support infer var type in imperative mode - // Run forward op - VLOG(3) << "tracer running " << op->Type(); - framework::RuntimeContext ctx(invars_map, outvars_map); - - // TODO(panyx0718): Cache p. 
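The new ComputeRequiredGrad/PassStopGradient pair added in this hunk decides whether TraceBackward runs at all. A self-contained sketch of that propagation rule (Var and VarMap are simplified stand-ins for VarBase and NameVarBaseMap):

#include <map>
#include <string>
#include <vector>

struct Var {
  bool stop_gradient = true;  // mirrors OverridedStopGradient()
};
using VarMap = std::map<std::string, std::vector<Var*>>;

// If any input still requires a gradient, every output is marked as
// gradient-generating and backward tracing proceeds; otherwise it is skipped.
bool ComputeRequiredGradSketch(const VarMap& ins, VarMap* outs) {
  for (const auto& name_pair : ins) {
    for (const Var* in : name_pair.second) {
      if (!in->stop_gradient) {
        for (auto& out_pair : *outs) {
          for (Var* out : out_pair.second) out->stop_gradient = false;
        }
        return true;
      }
    }
  }
  return false;
}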
- framework::OperatorWithKernel* op_kernel = - dynamic_cast(op_base.get()); - PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); - - framework::Scope scope; - op->place_ = GetExpectedPlace(expected_place, inputs); - - PreparedOp prepared_op = PreparedOp::Prepare(ctx, *op_kernel, op->place_); - prepared_op.op.RuntimeInferShape(scope, op->place_, ctx); - prepared_op.func( - framework::ExecutionContext(prepared_op.op, scope, *prepared_op.dev_ctx, - prepared_op.ctx, prepared_op.kernel_configs)); - - if (!stop_gradient) { - VLOG(5) << "start construct backward op"; - - // construct grad op descs - op->attrs_ = attrs_map; - std::unique_ptr fwd_op_desc(new framework::OpDesc( - op->Type(), invars_name_map, outvars_name_map, attrs_map)); - std::unique_ptr> grad_to_var( - new std::unordered_map()); - // NOTE(minqiyang): We don't support control flow op in imperative now - // Add grad_block_ when we want to support it - CreateGradOp(*fwd_op_desc, {}, {}, &op->grad_op_descs_, grad_to_var.get()); - - VLOG(5) << "create grad op desc: " << op->grad_op_descs_[0]->Type(); - - const size_t grad_op_count = op->grad_op_descs_.size(); - - op->grad_input_vars_.resize(grad_op_count); - op->grad_output_vars_.resize(grad_op_count); - - for (size_t i = 0; i < grad_op_count; ++i) { - framework::OpDesc* grad_op_desc = op->grad_op_descs_[i]; - for (auto it : grad_op_desc->Inputs()) { - auto& grad_in_vars = op->grad_input_vars_[i][it.first]; - grad_in_vars.reserve(it.second.size()); - for (const std::string& grad_invar : it.second) { - auto var_it = grad_to_var->find(grad_invar); - if (var_it == grad_to_var->end()) { - auto fwd_var_it = current_vars_map.find(grad_invar); - PADDLE_ENFORCE(fwd_var_it != current_vars_map.end()); - // Forward inputs or outputs. - grad_in_vars.emplace_back(fwd_var_it->second); - } else { - std::shared_ptr var = - current_vars_map[var_it->second]; - CreateNoBuffuerGrad(var, prepared_op.GetDeviceContext()); - // Douts. 
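TraceBackward keys everything on grad_to_var, which maps a gradient variable name back to its forward variable. Paddle derives gradient names by appending the "@GRAD" suffix (framework::GradVarName); a sketch of the lookup a backward input goes through:

#include <string>
#include <unordered_map>

// Equivalent of framework::GradVarName: forward name plus "@GRAD".
std::string GradVarNameSketch(const std::string& name) {
  return name + "@GRAD";
}

// A backward input found in grad_to_var is the gradient of some forward var
// ("x@GRAD" -> "x"); otherwise it is a forward var reused by the grad op.
bool IsGradOfForwardVar(
    const std::string& bwd_in_name,
    const std::unordered_map<std::string, std::string>& grad_to_var,
    std::string* fwd_name) {
  auto it = grad_to_var.find(bwd_in_name);
  if (it == grad_to_var.end()) return false;
  *fwd_name = it->second;
  return true;
}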
- var->grads_->SetPreOp(var->PreOp()); - grad_in_vars.emplace_back(var->grads_); - } + // Build backward ins and outs + + for (size_t i = 0; i < grad_op_num; i++) { + // Step1: build grad op and add them to engine + + // Use trace id to decide the order of gradient sum in sorted sum mode + size_t trace_id = fwd_op->id(); + std::shared_ptr grad_op = + OpBase::Create(trace_id, (*(grad_op_descs_[i].get())), fwd_op->place()); + + // this OpBase* is just used to manage op's life time + engine_->InsertOp(grad_op.get(), grad_op); + + std::unordered_set visited_preceding_ops; + // Step2 : prepare grad_in vars and bind them with grad_op, + // set inputs' grad_op as current grad_op + for (const auto& grad_ins : grad_op_descs_[i]->Inputs()) { + if (grad_ins.second.empty()) continue; + auto& bwd_in = (*grad_op->GetMutableInsMap())[grad_ins.first]; + bwd_in.reserve(grad_ins.second.size()); + + for (auto& grad_in_var_name : grad_ins.second) { + auto iter = grad_to_var.find(grad_in_var_name); + + if (iter != grad_to_var.end()) { + // If it is a grad var, find its coresponding forward var + auto& fwd_var_name = iter->second; + auto fwd_var_iter = name_to_var.find(fwd_var_name); + PADDLE_ENFORCE_EQ(fwd_var_iter != name_to_var.end(), true, + "Cannot find forward variable named %s", + fwd_var_name); + const auto& tmp = (*(fwd_var_iter->second))->GradVarBase(); + PADDLE_ENFORCE_NOT_NULL( + tmp.get(), + "Grad of %s should " + "not be NULL when we Track_Backward Input of %s", + (*(fwd_var_iter->second))->Name(), grad_op->Type()); + // Create grad_in's dim in tensor for Grad Dependency compute + auto* tensor = tmp->MutableVar()->GetMutable(); + tensor->Resize((*(fwd_var_iter->second)) + ->Var() + .Get() + .dims()); + // Add Grad Op for grad_in + tmp->AddGradOps(grad_op); + VLOG(3) << "Add Grad Op " << grad_op->Type() << " for :" + << (*(fwd_var_iter->second))->GradVarBase()->Name(); + // Add Grad var input to engine set + engine_->InsertGradVar(tmp.get()); + VLOG(3) << "Add Grad: " << tmp->Name() << " in to Engine"; + bwd_in.emplace_back((*(fwd_var_iter->second))->GradVarBase()); + } else { + // If it is a forward var, just add it + auto fwd_var_iter = name_to_var.find(grad_in_var_name); + PADDLE_ENFORCE_EQ(fwd_var_iter != name_to_var.end(), true, + "Cannot find forward variable named %s", + grad_in_var_name); + bwd_in.emplace_back(*(fwd_var_iter->second)); } + VLOG(3) << "Set backward input from fwd var" << grad_ins.first << " of " + << grad_op->Type() << " to be " + << (bwd_in.back() ? 
bwd_in.back()->Name() : "nullptr"); } + } - for (auto it : grad_op_desc->Outputs()) { - auto& grad_out_vars = op->grad_output_vars_[i][it.first]; - for (const std::string& grad_outvar : it.second) { - auto var_it = grad_to_var->find(grad_outvar); - PADDLE_ENFORCE(var_it != grad_to_var->end(), - "Could not found the grad op output var, should this " - "operator %s's stop gradient be True", - op->Type()); - - std::shared_ptr var = - current_vars_map[var_it->second]; - CreateNoBuffuerGrad(var, prepared_op.GetDeviceContext()); - var->grads_->SetPreOp(var->PreOp()); - grad_out_vars.push_back(var->grads_); - VLOG(3) << "grads output var name: " << var->name_; + // Step3: prepare grad_out vars and using their grad_ops to set current + // grad_op's preceding op + for (auto& grad_outs : grad_op_descs_[i]->Outputs()) { + if (grad_outs.second.empty()) continue; + auto& bwd_out = (*grad_op->GetMutableOutsMap())[grad_outs.first]; + bwd_out.reserve(grad_outs.second.size()); + + for (auto& grad_out_var_name : grad_outs.second) { + auto iter = grad_to_var.find(grad_out_var_name); + PADDLE_ENFORCE_EQ(iter != grad_to_var.end(), true, + "Cannot find output of input grad %s in op %s", + grad_out_var_name, fwd_op->Type()); + auto fwd_var_iter = name_to_var.find(iter->second); + PADDLE_ENFORCE_EQ(fwd_var_iter != name_to_var.end(), true, + "Cannot find forward variable named %s", + iter->second); + const auto& tmp = (*(fwd_var_iter->second))->GradVarBase(); + + PADDLE_ENFORCE_NOT_NULL(tmp.get(), + "Grad output: %s of op: %s should not be NULL", + (tmp->Name(), grad_op->Type())); + + if ((!tmp->OverridedStopGradient()) || (grad_outs.second.size() > 1)) { + VLOG(3) << "Set backward output " << grad_outs.first << " of " + << grad_op->Type() << " to be " << tmp->Name() + << ". Its Overrided Stop_Gradient is: False"; + bwd_out.emplace_back(tmp); + auto grad_pending_ops = + (*(fwd_var_iter->second))->GradVarBase()->GradOps(); + if (VLOG_IS_ON(3) && !grad_pending_ops.empty()) { + VLOG(3) << "Add grad_pending Op of :" + << (*(fwd_var_iter->second))->GradVarBase()->Name() + << " It's grad_pending Op are: "; + for (const auto& op : grad_pending_ops) { + VLOG(3) << op->Type(); + } + } + if (!grad_pending_ops.empty()) { + for (const auto& op : grad_pending_ops) { + PADDLE_ENFORCE_NOT_NULL(op, + "No nullptr should be grad_pending op"); + if (visited_preceding_ops.count(op) == 0) { + visited_preceding_ops.insert(op); + grad_op->InsertGradPendingOps(op); + } + } + } else { + VLOG(5) << "Hit leaf VarBase" + << (*(fwd_var_iter->second))->GradVarBase()->Name(); + } + } else { + VLOG(3) << "Skip backward output " << grad_outs.first << " of " + << grad_op->Type() << " Named: " << tmp->Name() + << ", since its Overrided Stop_Gradient is: True"; } } } + // To ensure numeric stability as static graph + grad_op->SortGradPendingOps(); } } + } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 02d90227..9c24b65e 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
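Step3 above ends with SortGradPendingOps, added "to ensure numeric stability as static graph". A hedged sketch of the dedup-then-sort bookkeeping (GradOp is a stand-in; the real sort key and direction live in OpBase, so the ordering below is an assumption):

#include <algorithm>
#include <cstddef>
#include <unordered_set>
#include <vector>

// Hypothetical stand-in for a grad op carrying the trace id used as sort key.
struct GradOp {
  std::size_t id;
};

// Each preceding grad op is recorded once (visited_preceding_ops above).
void InsertPendingOnce(std::vector<GradOp*>* pending,
                       std::unordered_set<GradOp*>* visited, GradOp* op) {
  if (visited->insert(op).second) {
    pending->push_back(op);
  }
}

// Sorting the pending list makes gradient accumulation order deterministic
// across runs, matching the behavior of the static graph.
void SortGradPendingOpsSketch(std::vector<GradOp*>* pending) {
  std::sort(pending->begin(), pending->end(),
            [](const GradOp* a, const GradOp* b) { return a->id < b->id; });
}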
@@ -14,46 +14,48 @@ #pragma once -#include -#include +#include +#include // NOLINT +#include #include #include -#include #include - -#include "paddle/fluid/framework/op_desc.h" -#include "paddle/fluid/framework/op_registry.h" +#include "ThreadPool.h" #include "paddle/fluid/imperative/engine.h" #include "paddle/fluid/imperative/layer.h" -#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/macros.h" namespace paddle { namespace imperative { -void CreateGradOp(const framework::OpDesc& op_desc, - const std::unordered_set& no_grad_set, - const std::vector& grad_sub_block, - framework::OpDesc** grad_op_desc, - std::unordered_map* grad_to_var); - -platform::Place GetExpectedPlace(platform::Place place, VarBasePtrMap inputs); - class Tracer { + DISABLE_COPY_AND_ASSIGN(Tracer); + public: - explicit Tracer(framework::BlockDesc* root_block); + Tracer() : engine_(new BasicEngine()) {} - virtual ~Tracer() {} + ~Tracer() = default; - void Trace(OpBase* op, const VarBasePtrMap& inputs, - VarBasePtrMap* outputs, // NOLINT - framework::AttributeMap attrs_map, - const platform::Place expected_place, - const bool stop_gradient = false); + void TraceOp(const std::string& type, const NameVarBaseMap& ins, + const NameVarBaseMap& outs, framework::AttributeMap attrs, + const platform::Place& place, bool trace_bacward); + + bool ComputeRequiredGrad(const NameVarBaseMap& ins, + const NameVarBaseMap& outs, bool trace_backward); + + void TraceBackward(const std::shared_ptr& fwd_op, + const framework::OpDesc& fwd_op_desc, + const NameVarBaseMap& ins, const NameVarBaseMap& outs); + Engine* GetDefaultEngine() const { return engine_.get(); } private: - platform::Place GetPlace(const VarBasePtrMap& inputs); + static size_t GenerateUniqueId() { + static std::atomic id{0}; + return id.fetch_add(1); + } - framework::BlockDesc* root_block_; + private: + std::unique_ptr engine_; }; } // namespace imperative diff --git a/paddle/fluid/imperative/type_defs.h b/paddle/fluid/imperative/type_defs.h index fab8c2e6..615b1b08 100644 --- a/paddle/fluid/imperative/type_defs.h +++ b/paddle/fluid/imperative/type_defs.h @@ -17,8 +17,6 @@ limitations under the License. 
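The rewritten Tracer drops the per-block constructor in favor of a process-wide id source. The patch's GenerateUniqueId reads as below, with the stripped template argument restored as an assumption:

#include <atomic>
#include <cstddef>

// Every traced op receives a unique, monotonically increasing id; the same
// id is later reused as the trace id that orders gradient accumulation.
static std::size_t GenerateUniqueId() {
  static std::atomic<std::size_t> id{0};  // likely std::atomic<size_t> in the patch
  return id.fetch_add(1);
}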
*/ #include #include #include -#include -#include #include namespace paddle { @@ -26,18 +24,10 @@ namespace imperative { class VarBase; class OpBase; +class Tracer; -typedef std::map>> - VarBasePtrMap; -typedef std::vector> VarBaseWeakPtrList; -typedef std::map> OpBasePtrMap; -typedef std::unordered_map< - const VarBase*, - std::pair>>>> - BackwardSumMap; // var_grad -> {place, {id -> var_grad@rename}} -typedef std::unordered_map> GradientRef; -// var_grad -> {ref_times, is_first_to_be_accumulate} +using NameVarBaseMap = + std::map>>; } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 83d91afa..d1db924e 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -47,7 +47,7 @@ if (ANAKIN_FOUND) set(ANAKIN_SHARED_INFERENCE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/api/api_anakin_engine.cc) endif() set(SHARED_INFERENCE_SRCS - io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc + io.cc ${CMAKE_CURRENT_SOURCE_DIR}/../framework/data_feed.cc ${CMAKE_CURRENT_SOURCE_DIR}/../framework/data_set.cc ${CMAKE_CURRENT_SOURCE_DIR}/../framework/data_feed_factory.cc ${CMAKE_CURRENT_SOURCE_DIR}/../framework/dataset_factory.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc ${mkldnn_quantizer_src} ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc diff --git a/paddle/fluid/inference/anakin/CMakeLists.txt b/paddle/fluid/inference/anakin/CMakeLists.txt index 9ffe7047..8292af22 100644 --- a/paddle/fluid/inference/anakin/CMakeLists.txt +++ b/paddle/fluid/inference/anakin/CMakeLists.txt @@ -1,5 +1,5 @@ cc_library(anakin_engine SRCS engine.cc DEPS framework_proto boost) -cc_library(anakin_op_teller SRCS op_teller.cc DEPS framework_proto boost) +cc_library(anakin_op_teller SRCS op_teller.cc DEPS framework_proto device_context boost) target_link_libraries(anakin_engine anakin anakin_saber_common) cc_test(test_anakin_engine SRCS test_anakin_engine.cc DEPS anakin_engine) add_subdirectory(convert) diff --git a/paddle/fluid/inference/anakin/convert/conv2d.cc b/paddle/fluid/inference/anakin/convert/conv2d.cc index 26f78efa..7904d407 100644 --- a/paddle/fluid/inference/anakin/convert/conv2d.cc +++ b/paddle/fluid/inference/anakin/convert/conv2d.cc @@ -41,7 +41,7 @@ void Conv2dOpConverter::operator()( auto *filter_v = scope.FindVar(op_desc.Input("Filter").front()); PADDLE_ENFORCE_NOT_NULL(filter_v); auto weight_tensor = tensor_from_var(*filter_v, platform::CPUPlace()); - auto weight_shape = framework::vectorize2int(weight_tensor->dims()); + auto weight_shape = framework::vectorize(weight_tensor->dims()); PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL); diff --git a/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc b/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc index f2e6003a..4d7a502d 100644 --- a/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc +++ b/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc @@ -43,7 +43,7 @@ void Conv2dFusionOpConverter::operator()( PADDLE_ENFORCE_NOT_NULL(filter_v); auto weight_tensor = tensor_from_var(*filter_v, platform::CPUPlace()); - auto weight_shape = framework::vectorize2int(weight_tensor->dims()); + auto weight_shape = framework::vectorize(weight_tensor->dims()); auto *b_v = scope.FindVar(op_desc.Input("Bias").front()); PADDLE_ENFORCE_NOT_NULL(b_v); @@ -99,7 +99,7 @@ void 
Conv2dFusionOpConverter::operator()( this->engine_->AddTensorScale(input_name, in_scale / int8_range); } else { auto weight_tensor = tensor_from_var(*filter_v, platform::CPUPlace()); - auto weight_shape = framework::vectorize2int(weight_tensor->dims()); + auto weight_shape = framework::vectorize(weight_tensor->dims()); auto *weight1 = pblock_from_tensor( *weight_tensor, weight_shape, this->engine_); this->engine_->AddOpAttr(op_name, "weight_1", *weight1); diff --git a/paddle/fluid/inference/anakin/convert/fc.cc b/paddle/fluid/inference/anakin/convert/fc.cc index b64d0b84..265d318a 100644 --- a/paddle/fluid/inference/anakin/convert/fc.cc +++ b/paddle/fluid/inference/anakin/convert/fc.cc @@ -43,7 +43,7 @@ void FcBaseOpConverter::operator()( auto *y_v = scope.FindVar(op_desc.Input(w_name).front()); PADDLE_ENFORCE_NOT_NULL(y_v); auto weight_tensor = tensor_from_var(*y_v, platform::CPUPlace()); - auto weight_shape = framework::vectorize2int(weight_tensor->dims()); + auto weight_shape = framework::vectorize(weight_tensor->dims()); int out_dim = weight_shape[1]; const int w_m = weight_shape[0]; diff --git a/paddle/fluid/inference/anakin/convert/helper.h b/paddle/fluid/inference/anakin/convert/helper.h index 7b0fb211..6ba8fd6c 100644 --- a/paddle/fluid/inference/anakin/convert/helper.h +++ b/paddle/fluid/inference/anakin/convert/helper.h @@ -86,7 +86,7 @@ template PBlock* pblock_from_var(const framework::Variable& var, AnakinEngine* engine) { auto tensor = tensor_from_var(var, platform::CPUPlace()); - auto shape = framework::vectorize2int(tensor->dims()); + auto shape = framework::vectorize(tensor->dims()); return pblock_from_tensor(*tensor, shape, engine); } diff --git a/paddle/fluid/inference/anakin/convert/op_converter.h b/paddle/fluid/inference/anakin/convert/op_converter.h index 1058e744..9692f427 100644 --- a/paddle/fluid/inference/anakin/convert/op_converter.h +++ b/paddle/fluid/inference/anakin/convert/op_converter.h @@ -219,7 +219,7 @@ template class AnakinOpConverter<::anakin::saber::X86, #define USE_ANAKIN_CONVERTER_BASE(op_type__, place_type__, precision_type__) \ extern int Touch_anakin_##op_type__##_##place_type__##_##precision_type__(); \ int use_converter_anakin_##op_type__##_##place_type__##_##precision_type__ \ - __attribute__((unused)) = \ + UNUSED = \ Touch_anakin_##op_type__##_##place_type__##_##precision_type__(); #if defined(PADDLE_WITH_CUDA) && defined(ANAKIN_X86_PLACE) diff --git a/paddle/fluid/inference/anakin/convert/ut_helper.h b/paddle/fluid/inference/anakin/convert/ut_helper.h index 92441f25..f2b56a25 100644 --- a/paddle/fluid/inference/anakin/convert/ut_helper.h +++ b/paddle/fluid/inference/anakin/convert/ut_helper.h @@ -136,7 +136,7 @@ class AnakinConvertValidation { if (parameters_.count(input)) continue; auto& t = inference::analysis::GetFromScope(*scope_, input); - auto t_shape = framework::vectorize2int(t.dims()); + auto t_shape = framework::vectorize(t.dims()); while (t_shape.size() < 4) { t_shape.push_back(1); } diff --git a/paddle/fluid/inference/anakin/engine.cc b/paddle/fluid/inference/anakin/engine.cc index 13f16c4c..d57952db 100644 --- a/paddle/fluid/inference/anakin/engine.cc +++ b/paddle/fluid/inference/anakin/engine.cc @@ -86,7 +86,7 @@ void AnakinEngine::BindInput( auto *tensor = input.second; auto *data = tensor->data(); - auto fluid_input_shape = framework::vectorize2int(tensor->dims()); + auto fluid_input_shape = framework::vectorize(tensor->dims()); while (fluid_input_shape.size() < 4) { fluid_input_shape.push_back(1); } diff --git 
a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index d82a063d..71fdb557 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -31,6 +31,9 @@ void Analyzer::RunAnalysis(Argument *argument) { "analsis_passes is not valid in the argument."); for (auto &pass : argument->analysis_passes()) { string::PrettyLogH1("--- Running analysis [%s]", pass); + if (!argument->enable_analysis_optim() && pass == "ir_analysis_pass") + continue; + auto *ptr = PassRegistry::Global().Retreive(pass); PADDLE_ENFORCE_NOT_NULL(ptr, "no analysis pass called %s", pass); ptr->Run(argument); diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index c814ce45..489345da 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -30,7 +30,7 @@ using namespace framework; // NOLINT TEST(Analyzer, analysis_without_tensorrt) { Argument argument; argument.SetModelDir(FLAGS_inference_model_dir); - argument.SetIrAnalysisPasses({"infer_clean_graph_pass"}); + argument.SetEnableAnalysisOptim(false); argument.SetUseGPU(false); argument.SetAnalysisPasses({"ir_graph_build_pass", "ir_analysis_pass", "ir_params_sync_among_devices_pass"}); @@ -41,10 +41,10 @@ TEST(Analyzer, analysis_without_tensorrt) { TEST(Analyzer, analysis_with_tensorrt) { Argument argument; + argument.SetEnableAnalysisOptim(false); argument.SetTensorRtMaxBatchSize(3); argument.SetTensorRtWorkspaceSize(1 << 20); argument.SetModelDir(FLAGS_inference_model_dir); - argument.SetIrAnalysisPasses({"infer_clean_graph_pass"}); argument.SetUseGPU(false); argument.SetAnalysisPasses({"ir_graph_build_pass", "ir_analysis_pass", "ir_params_sync_among_devices_pass"}); diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 3fcf579c..42858655 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -62,6 +62,9 @@ struct Argument { using anakin_max_shape_t = std::map>; bool Has(const std::string& key) const { return valid_fields_.count(key); } + // If we set the model using config.SetModelBuffer, + // the model and parameter will occupy additional CPU resources. + // Use this interface to release these resources. void PartiallyRelease() { if (Has("model_program_path")) { if (Has("model_from_memory") && model_from_memory()) { @@ -130,6 +133,7 @@ struct Argument { DECL_ARGUMENT_FIELD(model_params_path, ModelParamsPath, std::string); DECL_ARGUMENT_FIELD(model_from_memory, ModelFromMemory, bool); DECL_ARGUMENT_FIELD(optim_cache_dir, OptimCacheDir, std::string); + DECL_ARGUMENT_FIELD(enable_analysis_optim, EnableAnalysisOptim, bool); // The overall graph to work on. DECL_ARGUMENT_UNIQUE_FIELD(main_graph, MainGraph, framework::ir::Graph); @@ -192,9 +196,7 @@ struct Argument { // Memory optimized related. DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool); - DECL_ARGUMENT_FIELD(static_memory_optim, StaticMemoryOptim, bool); - DECL_ARGUMENT_FIELD(static_memory_optim_force_update, - StaticMemoryOptimForceUpdate, bool); + // Indicate which kind of sort algorithm is used for operators, the memory // optimization relays on the sort algorithm. 
DECL_ARGUMENT_FIELD(memory_optim_sort_kind, MemoryOptimSortKind, int); diff --git a/paddle/fluid/inference/analysis/helper.cc b/paddle/fluid/inference/analysis/helper.cc index 008608c1..368ef2e5 100644 --- a/paddle/fluid/inference/analysis/helper.cc +++ b/paddle/fluid/inference/analysis/helper.cc @@ -75,6 +75,18 @@ void SetAttr>(framework::proto::OpDesc *op, } } +template <> +void SetAttr>(framework::proto::OpDesc *op, + const std::string &name, + const std::vector &data) { + auto *attr = op->add_attrs(); + attr->set_name(name); + attr->set_type(paddle::framework::proto::AttrType::LONGS); + for (const auto i : data) { + attr->add_longs(i); + } +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 2dae5137..3fa907b4 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -64,6 +64,9 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("mkldnn_enabled_op_types", new std::unordered_set( argument->mkldnn_enabled_op_types())); + } else if (pass_name == "cudnn_placement_pass") { + pass->Set("cudnn_enabled_op_types", + new std::unordered_set()); #ifdef PADDLE_WITH_MKLDNN } else if (pass_name == "cpu_quantize_placement_pass") { pass->Set("quantize_enabled_op_types", @@ -84,13 +87,15 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("program", new framework::ProgramDesc *(&argument->main_program())); - bool enable_int8 = argument->tensorrt_precision_mode() == - AnalysisConfig::Precision::kInt8; + auto precision_mode = argument->tensorrt_precision_mode(); + bool enable_int8 = precision_mode == AnalysisConfig::Precision::kInt8; pass->Set("predictor_id", new int(argument->predictor_id())); bool use_calib_mode = argument->tensorrt_use_calib_mode(); pass->Set("enable_int8", new bool(enable_int8)); pass->Set("use_calib_mode", new bool(use_calib_mode)); + pass->Set("precision_mode", + new AnalysisConfig::Precision(precision_mode)); bool use_static_engine = argument->tensorrt_use_static_engine(); bool model_from_memory = argument->model_from_memory(); diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc index 67033582..064f947a 100644 --- a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc @@ -21,6 +21,8 @@ limitations under the License. 
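The helper.cc hunk above adds a SetAttr specialization for std::vector<int64_t>. A simplified mirror with stand-in proto types (AttrProto and OpDescProto are hypothetical; the real code uses framework::proto::OpDesc and AttrType::LONGS):

#include <cstdint>
#include <string>
#include <utility>
#include <vector>

struct AttrProto {
  std::string name, type;
  std::vector<std::int64_t> longs;
};
struct OpDescProto {
  std::vector<AttrProto> attrs;
};

// Proto attributes carry typed repeated fields, so an int64 list is stored
// by tagging the attribute LONGS and appending each element.
void SetLongsAttr(OpDescProto* op, const std::string& name,
                  const std::vector<std::int64_t>& data) {
  AttrProto attr;
  attr.name = name;
  attr.type = "LONGS";
  for (auto v : data) attr.longs.push_back(v);  // mirrors attr->add_longs(i)
  op->attrs.push_back(std::move(attr));
}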
*/ #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/node.h" +DECLARE_bool(use_ngraph); + namespace paddle { namespace inference { namespace analysis { @@ -398,6 +400,11 @@ void RemoveIntermediateOutputInSubgraph(const std::vector &subgraph, } } + // In use for ngraph subgraph pass for parallel executor, + // this will remove all nodes, bypass this and let ngraph + // subgraph pass to process outputs + if (FLAGS_use_ngraph && valid_output.size() == 0) return; + outputs->assign(valid_output.begin(), valid_output.end()); } diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index ce8f57c0..bd2f79a1 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -41,7 +41,8 @@ void analysis::TensorRtSubgraphPass::ApplyImpl( }; SubGraphFuser fuser(graph, teller, - Get("min_subgraph_size") /*min subgraph size*/); + Get("min_subgraph_size") /*min subgraph size*/, + "tensorrt_engine"); fuser(); std::vector graph_param_names = @@ -102,7 +103,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( // const framework::BlockDesc& main_block = program_desc->Block(0); framework::BlockDesc *new_block = program_desc->AppendBlock(main_block); - // An fake block desc. + // A fake block desc. framework::proto::BlockDesc block_proto; framework::BlockDesc block_desc(nullptr, &block_proto); block_desc.Proto()->set_parent_idx(-1); @@ -118,20 +119,27 @@ void TensorRtSubgraphPass::CreateTensorRTOp( } // Then, we will use the input_names_with_id and output_names_with_id to - // generate the eigine key. + // generate the engine key. // So, We use set instead of unordered_set here to ensure that the engine key // is unique. std::set input_names; std::set input_names_with_id; std::vector params; + // if we delete fluid copy of params shared by more than 1 ops, there will be + // problem, so we filter them out. + std::vector params_not_shared; - // The node->inputs containes input tensors and parameters. + // The node->inputs contains input tensors and parameters. 
for (auto *x : node->inputs) { input_names.insert(x->Name()); input_names_with_id.insert(x->Name() + std::to_string(x->id())); if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) { params.push_back(x->Name()); } + if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0 && + x->outputs.size() <= 1) { + params_not_shared.push_back(x->Name()); + } } std::set output_names; @@ -149,6 +157,9 @@ void TensorRtSubgraphPass::CreateTensorRTOp( graph_var_map[node->Name()] = node; } } + auto precision_mode = Get("precision_mode"); + bool enable_fp16 = false; + if (precision_mode == AnalysisConfig::Precision::kHalf) enable_fp16 = true; auto enable_int8 = Get("enable_int8"); auto use_calib_mode = Get("use_calib_mode"); auto &subgraph_nodes = *Agent(node).subgraph(); @@ -190,13 +201,21 @@ void TensorRtSubgraphPass::CreateTensorRTOp( "Ys", std::vector(output_names.begin(), output_names.end())); op_desc->SetBlockAttr("sub_block", new_block); - SetAttr(op_desc->Proto(), "subgraph", - block_desc.Proto()->SerializeAsString()); - SetAttr(op_desc->Proto(), "max_batch_size", Get("max_batch_size")); - SetAttr(op_desc->Proto(), "workspace_size", Get("workspace_size")); - SetAttr(op_desc->Proto(), "gpu_id", Get("gpu_device_id")); - SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping); - SetAttr(op_desc->Proto(), "parameters", params); + op_desc->SetAttr("subgraph", block_desc.Proto()->SerializeAsString()); + op_desc->SetAttr("max_batch_size", Get("max_batch_size")); + op_desc->SetAttr("workspace_size", Get("workspace_size")); + op_desc->SetAttr("gpu_id", Get("gpu_device_id")); + op_desc->SetAttr("output_name_mapping", output_mapping); + op_desc->SetAttr("parameters", params); + + // we record all inputs' shapes in attr to check if they are consistent + // with the real inputs' shapes retrieved from scope when trt runs. 
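+  // (The attribute name is derived per input as var->Name() + "_shape"; at
+  // runtime the engine can compare the recorded static shape against the
+  // tensor actually found in the scope and fail fast on a mismatch.)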
+ for (auto *x : node->inputs) { + if (x->IsVar() && x->Var()) { + framework::VarDesc *var = x->Var(); + SetAttr(op_desc->Proto(), var->Name() + "_shape", var->GetShape()); + } + } auto use_static_engine = Get("use_static_engine"); // TODO(NHZlX) @@ -213,15 +232,16 @@ void TensorRtSubgraphPass::CreateTensorRTOp( calibration_data = GetTrtCalibTableData( Get("model_opt_cache_dir"), engine_key, enable_int8); } - SetAttr(op_desc->Proto(), "calibration_data", calibration_data); + op_desc->SetAttr("calibration_data", calibration_data); + op_desc->SetAttr("enable_int8", enable_int8); + op_desc->SetAttr("enable_fp16", enable_fp16); + op_desc->SetAttr("use_calib_mode", use_calib_mode); + op_desc->SetAttr("engine_key", engine_key); + op_desc->SetAttr("predictor_id", predictor_id); - SetAttr(op_desc->Proto(), "enable_int8", enable_int8); - SetAttr(op_desc->Proto(), "use_calib_mode", use_calib_mode); - SetAttr(op_desc->Proto(), "engine_key", engine_key); - SetAttr(op_desc->Proto(), "predictor_id", predictor_id); std::string trt_engine_serialized_data = ""; - SetAttr(op_desc->Proto(), "engine_serialized_data", - trt_engine_serialized_data); + op_desc->SetAttr("engine_serialized_data", trt_engine_serialized_data); + op_desc->Flush(); std::unique_ptr calibrator; if (enable_int8 && calibration_data.size() != 0) { @@ -237,14 +257,14 @@ void TensorRtSubgraphPass::CreateTensorRTOp( return; } - std::copy(params.begin(), params.end(), + std::copy(params_not_shared.begin(), params_not_shared.end(), std::back_inserter(*repetitive_params)); tensorrt::TensorRTEngine *trt_engine = inference::Singleton::Global() .Create(engine_key + std::to_string(predictor_id), Get("max_batch_size"), Get("workspace_size"), - enable_int8, calibrator.get(), Get("gpu_device_id")); + precision_mode, calibrator.get(), Get("gpu_device_id")); bool need_serialize = (use_static_engine && !load_from_memory); if (need_serialize) { diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h index f530a5a0..b6b67ce8 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h @@ -21,6 +21,7 @@ #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h" +#include "paddle/fluid/inference/api/paddle_analysis_config.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt index 860dc309..1c878d66 100644 --- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt @@ -5,6 +5,7 @@ cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_p cc_library(ir_graph_to_program_pass SRCS ir_graph_to_program_pass.cc DEPS analysis_pass graph_to_program_pass) cc_library(adjust_cudnn_workspace_size_pass SRCS adjust_cudnn_workspace_size_pass.cc DEPS analysis_pass graph_to_program_pass) cc_library(inference_op_replace_pass SRCS inference_op_replace_pass.cc DEPS analysis_pass graph_to_program_pass) +cc_library(ir_graph_clean_pass SRCS ir_graph_clean_pass.cc DEPS analysis_pass) cc_library(analysis_passes SRCS passes.cc DEPS ir_graph_build_pass @@ -14,6 +15,7 @@ cc_library(analysis_passes SRCS passes.cc DEPS memory_optim_pass inference_op_replace_pass ir_graph_to_program_pass + ir_graph_clean_pass 
) set(analysis_deps ${analysis_deps} diff --git a/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.cc b/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.cc index ef7d13da..86ced982 100644 --- a/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.cc +++ b/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.cc @@ -20,9 +20,9 @@ namespace inference { namespace analysis { void InferenceOpReplacePass::RunImpl(Argument* argument) { - if (!argument->use_gpu()) return; std::unordered_map replaced_map{ {"conditional_block", "conditional_block_infer"}, + {"merge_lod_tensor", "merge_lod_tensor_infer"}, }; auto& graph = argument->main_graph(); diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.cc new file mode 100644 index 00000000..1f888a28 --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.cc @@ -0,0 +1,47 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h" +#include +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/node.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void IrInferCleanGraphPass::RunImpl(Argument* argument) { + auto& graph = argument->main_graph(); + auto is_valid_node = [](framework::ir::Node* x) { + return x && IsControlDepVar(*x) && x->IsVar() && !x->Var(); + }; + + std::unordered_set invalid_nodes; + int valid_op = 0; + for (auto* node : graph.Nodes()) { + PADDLE_ENFORCE_NOT_NULL(node); + if (is_valid_node(node)) { + invalid_nodes.insert(node); + } else if (node->IsOp()) { + ++valid_op; + } + } + + GraphSafeRemoveNodes(&graph, invalid_nodes); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/recordio/scanner.h b/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h similarity index 58% rename from paddle/fluid/recordio/scanner.h rename to paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h index 0d885dd8..a9d58aa2 100644 --- a/paddle/fluid/recordio/scanner.h +++ b/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -14,30 +14,21 @@ #pragma once -#include -#include #include - -#include "paddle/fluid/recordio/chunk.h" +#include +#include "paddle/fluid/inference/analysis/analysis_pass.h" namespace paddle { -namespace recordio { +namespace inference { +namespace analysis { -class Scanner { +class IrInferCleanGraphPass : public AnalysisPass { public: - explicit Scanner(std::unique_ptr&& stream); - - explicit Scanner(const std::string& filename); - - void Reset(); + void RunImpl(Argument *argument) override; - std::string Next(); - - bool HasNext() const; - - private: - std::unique_ptr stream_; - ChunkParser parser_; + std::string repr() const override { return "ir_graph_clean_pass"; } }; -} // namespace recordio + +} // namespace analysis +} // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc index c894acfd..6fbf8803 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc @@ -101,6 +101,16 @@ int DataTypeToSpace(framework::proto::VarType_Type type) { return sizeof(int32_t); case framework::proto::VarType_Type_INT64: return sizeof(int64_t); + case framework::proto::VarType_Type_INT16: + return sizeof(int16_t); + case framework::proto::VarType_Type_FP16: + return sizeof(int16_t); + case framework::proto::VarType_Type_FP64: + return sizeof(double); + case framework::proto::VarType_Type_UINT8: + return sizeof(unsigned char); + case framework::proto::VarType_Type_INT8: + return sizeof(int8_t); default: PADDLE_THROW("Unknown data type"); } @@ -109,10 +119,16 @@ int DataTypeToSpace(framework::proto::VarType_Type type) { void MemoryOptimizePass::CollectVarMemorySize( space_table_t* space_table) const { const int fake_batch_size = 1; + auto valid_var = [&](framework::ir::Node* node) -> bool { - std::set invalid_op = {"while", "conditional_block", + std::set invalid_op = {"while", + "conditional_block", "tensorrt_engine", - "conditional_block_infer"}; + "conditional_block_infer", + "merge_lod_tensor_infer", + "merge_lod_tensor", + "equal", + "lod_reset"}; for (auto* tmp : node->inputs) { CHECK(tmp->IsOp()); std::string op_type = tmp->Op()->Type(); @@ -209,264 +225,6 @@ void MakeSimpleReusePlan( } } -// Collect the memory size of the tensors. -void MemoryOptimizePass::CollectVarMemorySize( - const std::unordered_map& batch_var_ave_dim, - std::unordered_map* tensor_nodes, - space_table_t* space_table) const { - // Collect tensors from graph. - for (auto* node : graph_->Nodes()) { - if (node->IsVar() && - node->Var()->GetType() == - framework::proto::VarType::Type::VarType_Type_LOD_TENSOR) { - // Parameters will not be reused. - if (node->Var()->Persistable()) continue; - (*tensor_nodes)[node->Name()] = node; - (*space_table)[node->Name()] = - DataTypeToSpace(node->Var()->GetDataType()) * - batch_var_ave_dim.at(node->Name()); - } - } -} - -// Find a sutable (big enough but smallest to avoid memory waste). -// -// Args: -// @tensor_nodes: the tensor nodes in the ir::Graph. -// @free_existing_tensors: the allocated tensor and are free. -// @space_table: the memory space of tensors. -// @tensor2use: the tensor that requires memory. -// -// Returns: -// true if found some existing tensor to reuse. -// false if no sutable tensor to reuse, one need to allocate a new tensor for -// this requirement. -// The suitable tensor for reuse is one that is approximately equal to the -// memory demand. 
-bool FindSuitableTensorToReuse( - const std::string& tensor, int space_required, - const std::unordered_map& tensor_nodes, - std::unordered_set* free_existing_tensors, - const space_table_t& space_table, - const std::vector>& var_clusters, - std::string* tensor2use) __SHOULD_USE_RESULT__; - -bool FindSuitableTensorToReuse( - const std::string& tensor, int space_required, - const std::unordered_map& tensor_nodes, - std::unordered_set* free_existing_tensors, - const space_table_t& space_table, - const std::vector>& var_clusters, - std::string* tensor2use) { - std::pair best_fit; - best_fit.second = std::numeric_limits::max(); - VLOG(5) << "Split Tensors to " << var_clusters.size() << " clusters"; - - // find the cluster this var belongs to. - const std::unordered_set* cluster = nullptr; - for (const auto& c : var_clusters) { - if (c.count(tensor)) { - cluster = &c; - break; - } - } - PADDLE_ENFORCE_NOT_NULL(cluster, - "something wrong in memory optimization, the " - "variable %s not in the clusters.", - tensor); - - for (auto& candidate : *free_existing_tensors) { - // This is not a temporary tensor. - if (!space_table.count(candidate)) continue; - // Not in the same cluster. - if (!cluster->count(candidate)) continue; - - size_t space = space_table.at(candidate); - PADDLE_ENFORCE( - space <= std::numeric_limits::type>::max(), - "space overload"); - size_t space_diff = - std::abs((std::make_signed::type)space - space_required); - if (space_diff < best_fit.second) { - best_fit.first = candidate; - best_fit.second = space_diff; - } - } - - if (best_fit.second < std::numeric_limits::max()) { - *tensor2use = best_fit.first; - return true; - } - return false; -} - -// Allocate new tensor instead of reusing the existing one. -void AllocateNewTensor( - const std::string& name, size_t space_required, - const std::unordered_map& tensor_nodes, - std::unordered_set* free_existing_tensors, - space_table_t* space_table, - std::unordered_map* reuse_table) { - // The newly born tensor is free to be used. - free_existing_tensors->insert(name); - // Register the space it has. - PADDLE_ENFORCE(space_table->count(name)); - space_table->at(name) = std::max(space_table->at(name), space_required); - // The allocated new tensor use the memory of itself. - (*reuse_table)[name] = name; -} - -// Free a tensor and make it resuable. -// @tensor: the tensor to free. -// @free_existing_tensors: the free and allocated tensors. -// @reuse_table: a map from a fake tensor to the existing allocated tensor. -void FreeATensor(const std::string& tensor, - std::unordered_set* free_existing_tensors, - std::unordered_map* reuse_table) { - if (tensor == "feed" || tensor == "fetch") return; - // the really allocated tensor. - const auto& free_tensor = reuse_table->at(tensor); - - free_existing_tensors->insert(free_tensor); -} - -// Reuse a free existing tensor. -void ReuseATensor(const std::string& tensor, const std::string& tensor2reuse, - size_t memory_size, - std::unordered_set* free_existing_tensors, - std::unordered_map* reuse_table, - space_table_t* reused_space_table) { - auto it = free_existing_tensors->find(tensor2reuse); - PADDLE_ENFORCE(it != free_existing_tensors->end()); - free_existing_tensors->erase(it); - (*reuse_table)[tensor] = tensor2reuse; - // Update the memory size of a reused tensor, the memory will grow if the - // required memory is larger. - (*reused_space_table)[tensor2reuse] = - std::max(reused_space_table->at(tensor2reuse), memory_size); -} - -// Calculate the memory usage. 
-void EvaluateMemoryUsage( - const std::unordered_map& reuse_table, - const space_table_t& space_table, - const std::unordered_map& var_batch_ave_size, - size_t* allocated, size_t* saved) { - *allocated = 0; - *saved = 0; - - for (auto elem : reuse_table) { - if (elem.first == elem.second) { - *allocated += space_table.at(elem.first); - VLOG(4) << elem.first << " <-> " << elem.second << " " - << space_table.at(elem.first) << " " - << space_table.at(elem.second); - } else { - *saved += space_table.at(elem.first); - VLOG(4) << "reuse " << elem.first << " -> " << elem.second; - } - } - VLOG(4) << "allocated " << *allocated; - VLOG(4) << "saved " << *saved; -} - -// Return saved ratio. -void MemoryOptimizePass::MakeReusePlan( - const std::vector>& var_clusters, - const std::unordered_map& var_batch_ave_size, - const space_table_t& space_table, - std::unordered_map* reuse_table, int sort_kind, - MemoryAllocation* memory_allocation) const { - // Clear the existing plan. - reuse_table->clear(); - - // The `space_table` stores the real memory size for each tensor. - // The `reused_space_table` stores the maximum memory size required by a - // tensor during the memory reusing, the small tensor might be reused by a - // larger tensor, and the memory size of the small one will grow. - auto reused_space_table = space_table; - - std::unordered_map life_cycles; - std::unordered_map tensor_nodes; - // The allocated tensors whose memory can be reused, they will live across the - // program execution. - std::unordered_set existing_tensors; - // The existing tensor that has been allocated, and is also free to reuse. - std::unordered_set free_existing_tensors; - - CollectLifeCycle(&life_cycles, sort_kind); - - for (int age = 0; age < max_lifecycle_; ++age) { - std::unordered_set born_tensors; - std::unordered_set dead_tensors; - // Gather the dead and born tensors. - for (auto elem_it = life_cycles.begin(); elem_it != life_cycles.end(); - elem_it++) { - if (elem_it->second.first == -1) { - continue; - } - const auto& tensor = elem_it->first; - const auto& lifecycle = elem_it->second; - VLOG(4) << "process " << tensor << " reuse " << lifecycle.first << "->" - << lifecycle.second; - - // Collect newly born tensors. - if (lifecycle.first == age) { - born_tensors.insert(tensor); - } - // Collect dead tensors whose memory can be reused. - else if (lifecycle.second < age) { // NOLINT - dead_tensors.insert(tensor); - // remove to avoid duplicate process. - elem_it->second.first = -1; // avoid duplicate search - } - } - - // Reuse the dead tensors for born tensors - for (const auto& tensor : born_tensors) { - // Skip the feed and fetch tensor for that they share data with others. - std::string tensor2reuse; - if (!space_table.count(tensor)) continue; - size_t space_required = space_table.at(tensor); - if (FindSuitableTensorToReuse(tensor, space_required, tensor_nodes, - &free_existing_tensors, reused_space_table, - var_clusters, &tensor2reuse)) { - if (tensor != tensor2reuse) { - VLOG(4) << tensor << " -> " << tensor2reuse; - } - ReuseATensor(tensor, tensor2reuse, space_required, - &free_existing_tensors, reuse_table, &reused_space_table); - } else { - VLOG(4) << "allocate " << tensor; - AllocateNewTensor(tensor, space_required, tensor_nodes, - &free_existing_tensors, &reused_space_table, - reuse_table); - ReuseATensor(tensor, tensor, space_required, &free_existing_tensors, - reuse_table, &reused_space_table); - } - } - - for (const auto& tensor : dead_tensors) { - // free its memory. 
- FreeATensor(tensor, &free_existing_tensors, reuse_table); - } - } - - EvaluateMemoryUsage(*reuse_table, reused_space_table, var_batch_ave_size, - &(memory_allocation->allocated), - &(memory_allocation->saved)); - memory_allocation->sort_kind = sort_kind; -} - -void BuildVarNodeTable(Graph* graph, - std::unordered_map* var_node_table) { - for (auto* node : graph->Nodes()) { - if (node->IsVar()) { - (*var_node_table)[node->Name()] = node; - } - } -} - // NOTE The optimized opdesc doesn't match ir::Graph. void UpdateOpDescsByReuse( Graph* graph, @@ -535,311 +293,35 @@ void UpdateOpDescsByReuse( } } -void MemoryOptimizePass::PerformReusePlan( - const std::unordered_map& reuse_table, - int sort_kind, std::unordered_set* vars2remove) const { - std::unordered_map var_node_table; - BuildVarNodeTable(graph_, &var_node_table); - UpdateOpDescsByReuse(graph_, reuse_table, sort_kind); - - for (auto& item : reuse_table) { - if (item.first != item.second) { - vars2remove->insert(item.first); - } - } - VLOG(2) << "to remove vars " << vars2remove->size(); -} - -std::vector split(const std::string& line, char delim) { - std::vector res; - std::string field; - std::stringstream line_stream(line); - while (std::getline(line_stream, field, delim)) { - res.emplace_back(field); - } - return res; -} - -// Deserialize the batch var shapes from the cache file. -std::vector>> DeseralizeBatchVarShapes( - const std::string& path) { - std::ifstream file(path); - PADDLE_ENFORCE(file.is_open(), "failed to open %s to read cache", path); - std::string line; - std::vector>> batch_shapes; - - while (std::getline(file, line)) { - std::map> batch; - for (const auto& var_info : split(line, ';')) { - auto fields = split(var_info, ':'); - PADDLE_ENFORCE_EQ(fields.size(), 2UL); - auto var_name = fields.front(); - auto shape_str = split(fields[1], ','); - std::vector shape; - for (const auto& v : shape_str) shape.push_back(std::stoi(v)); - batch[var_name] = shape; - } - batch_shapes.push_back(batch); - } - return batch_shapes; -} - -// Replace the -1 in shape to a real number to fake the shape. -std::vector>> FakeBatchVarShapes( - const framework::ProgramDesc& program) { - std::vector>> res; - res.emplace_back(); - auto& record = res.front(); - const int fake_batch_size = 3; - for (auto* var : program.Block(0).AllVars()) { - if (var->GetType() == - framework::proto::VarType::Type::VarType_Type_LOD_TENSOR) { - auto shape = var->GetShape(); - for (auto& v : shape) { - if (v < 0) v = fake_batch_size; - } - record[var->Name()].assign(shape.begin(), shape.end()); - } - } - return res; -} - -// Calculate the average dim of each tensor from the batch shape cache. -std::unordered_map GetBatchAverageSize( - const std::vector>>& batches) { - std::unordered_map var2size; - // The average size of the batches for each variable. - int num_batch = 0; - for (const auto& batch : batches) { - num_batch++; - for (const auto& item : batch) { - int dim = std::accumulate(item.second.begin(), item.second.end(), 1, - [](int a, int b) { return a * b; }); - var2size[item.first] += dim; - } - } - - for (auto& item : var2size) { - item.second /= num_batch; - } - - return var2size; -} - -// Analysis the batch shapes loading from the cache file. -// By splitting the variables to different clusters by analyzing their batch -// size, we can pre-schedule the changes of difference LoDTensor when different -// length of input sequences is entered. -// This should works fine for the models operating on sentences. 
-std::vector> AnalysisBatchShapesByBatchSize( - const std::vector>>& batches) { - // collect the batch size of each shape and combine to a stringstream in - // converient to generate a hash. - std::unordered_map var_batchsize_hashes; - for (auto& batch : batches) { - for (auto& ele : batch) { - PADDLE_ENFORCE(!ele.second.empty()); - int batch_size = ele.second.front(); - // TODO(Superjomn) might consume large memory here, use combine hash. - var_batchsize_hashes[ele.first] << batch_size; - } - } - - // Split to sets by batch size sequences. - std::unordered_map> - shape_sets; - for (auto& ele : var_batchsize_hashes) { - auto hash = std::hash()(ele.second.str()); - shape_sets[hash].insert(ele.first); - } - std::vector> res; - for (auto& ele : shape_sets) { - res.emplace_back(std::move(ele.second)); - } - - VLOG(3) << "Cluster by batch_size and get " << res.size() << " clusters"; - return res; -} - -// Analysis the batch shapes loading from the cache file, and split them to -// different clusters by their size. -// This should works fine for the overall models. -std::vector> AnalysisBatchShapesBySimilarSize( - const space_table_t& space_table, - const std::vector>>& batches, - int interval = 200000) { - PADDLE_ENFORCE_GT(interval, 0); - // cluster to different clusters. - size_t max_size = 0; - for (auto& item : space_table) { - max_size = std::max(item.second, max_size); - } - VLOG(4) << "tensor max size " << max_size; - - std::vector> res; - - // cluster by intervals. - for (size_t interval_size = 0; interval_size <= max_size; - interval_size += interval) { - std::unordered_set cluster; - for (auto& item : space_table) { - if (interval_size <= item.second && - interval_size + interval > item.second) { - cluster.insert(item.first); - } - } - if (!cluster.empty()) { - res.push_back(cluster); - } - } - - VLOG(3) << "Cluster by interval and get " << res.size() << " cluster"; - return res; -} - std::string MemoryOptimizePass::repr() const { return "memory optimize pass"; } -std::pair GetRange( - const std::unordered_map& ave_size) { - auto res = std::make_pair(std::numeric_limits::max(), - std::numeric_limits::min()); - for (auto& item : ave_size) { - res.first = std::min(item.second, res.first); - res.second = std::max(item.second, res.second); - } - return res; -} - void MemoryOptimizePass::RunImpl(Argument* argument) { - // When force update, should not optimize memory. - if (!argument->enable_memory_optim() || - argument->static_memory_optim_force_update()) - return; + // Memory optimization. + // We will perform the following operation: + // 1. Collect all var's lifetime. + // 2. Make reuse plan: the vars can be reused if there is no overlap(on + // lifetime) between + // them. + // The final plan is a mapping table in which the key represents the original + // name of var and the value in the table represents the current name of var. + // 3. Perform reuse plan: Replace all var's name in the model according to the + // mapping table. + if (!argument->enable_memory_optim()) return; graph_ = argument->main_graph_ptr(); - auto path = GetMemoryCachePath( - argument->model_dir_valid() ? argument->model_dir() : "", - argument->model_program_path_valid() ? 
argument->model_program_path() - : ""); - VLOG(3) << "Load memory cache from " << path; - std::vector>> batches; - - if (!(argument->static_memory_optim() && inference::IsFileExists(path))) { - string::PrettyLogInfo("--- Performing dynamic memory optimize"); - // batches = FakeBatchVarShapes(argument->main_program()); - int sort_kind = 0; - std::unordered_map lifecycles; - space_table_t space_table; - std::unordered_map node2cluster; - std::unordered_map cluster_size; - - CollectLifeCycle(&lifecycles, sort_kind); - CollectVarMemorySize(&space_table); - MakeSimpleReusePlan(lifecycles, space_table, &node2cluster, &cluster_size); - UpdateOpDescsByReuse(graph_, node2cluster, sort_kind); - return; - - } else { - string::PrettyLogInfo("--- Performing static memory optimize"); - batches = DeseralizeBatchVarShapes(path); - } - auto var_batch_ave_size = GetBatchAverageSize(batches); - - // Get min and max memory size. - const auto range = GetRange(var_batch_ave_size); - const int cluster_size = std::max( - static_cast((range.second - range.first) / 100 /*cluster num*/), - 1024); - const int cluster_size1 = std::max( - static_cast((range.second - range.first) / 1000 /*cluster num*/), - 1024); - - std::unordered_map tensor_nodes; + int sort_kind = 0; + std::unordered_map lifecycles; space_table_t space_table; - CollectVarMemorySize(var_batch_ave_size, &tensor_nodes, &space_table); - - std::unordered_map reuse_table; - double max_saving_ratio = 0.; - - std::vector> strategies; - - for (int sort_kind = 0; sort_kind < 2; sort_kind++) { - if (argument->static_memory_optim()) { - // This strategy only make scene in static memory optimize. - strategies.emplace_back([&, sort_kind] { - auto clustered_vars_by_batch_size = - AnalysisBatchShapesByBatchSize(batches); - MemoryAllocation allocation; - MakeReusePlan(clustered_vars_by_batch_size, var_batch_ave_size, - space_table, &reuse_table, sort_kind, &allocation); - return allocation; - }); - } - - strategies.emplace_back([&, sort_kind] { - auto clustered_vars_by_ave_size = - AnalysisBatchShapesBySimilarSize(space_table, batches, cluster_size); - MemoryAllocation allocation; - MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, space_table, - &reuse_table, sort_kind, &allocation); - return allocation; - }); - - strategies.emplace_back([&, sort_kind] { - auto clustered_vars_by_ave_size = - AnalysisBatchShapesBySimilarSize(space_table, batches, cluster_size1); - MemoryAllocation allocation; - MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, space_table, - &reuse_table, sort_kind, &allocation); - return allocation; - }); - - strategies.emplace_back([&, sort_kind] { - auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize( - space_table, batches, - std::numeric_limits::max()); // no intervals - MemoryAllocation allocation; - MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, space_table, - &reuse_table, sort_kind, &allocation); - return allocation; - }); - } - - std::function* best_strategy{nullptr}; - - // Try all strategies to get the best result. 
- for (auto& strategy : strategies) { - auto allocation = strategy(); - string::PrettyLogDetail("--- get strategy saving %f memory for workspace", - allocation.GetSavingRatio()); - if (allocation.GetSavingRatio() > max_saving_ratio) { - max_saving_ratio = allocation.GetSavingRatio(); - best_strategy = &strategy; - } - } - if (!best_strategy) { - LOG(ERROR) << "This model makes poor memory optimize, skip memory optimize"; - return; - } - auto memory_allocation = (*best_strategy)(); - - string::PrettyLogInfo( - "--- Saved %.2f%s memory for workspace(temporary variables)", - memory_allocation.GetSavingRatio() * 100, "%"); - - argument->main_graph().Set(framework::ir::kGraphToProgramVarsToRemove, - new std::unordered_set); - auto& vars2remove = - argument->main_graph().Get>( - framework::ir::kGraphToProgramVarsToRemove); - - PerformReusePlan(reuse_table, memory_allocation.sort_kind, &vars2remove); - argument->SetMemoryOptimSortKind(memory_allocation.sort_kind); + std::unordered_map node2cluster; + std::unordered_map cluster_size; + + CollectLifeCycle(&lifecycles, sort_kind); + CollectVarMemorySize(&space_table); + MakeSimpleReusePlan(lifecycles, space_table, &node2cluster, &cluster_size); + UpdateOpDescsByReuse(graph_, node2cluster, sort_kind); + return; } -float MemoryOptimizePass::MemoryAllocation::GetSavingRatio() const { - return (saved / 1024.) / (allocated / 1024. + saved / 1024.); -} } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h index 5a907303..77da5d40 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h @@ -25,45 +25,22 @@ namespace paddle { namespace inference { namespace analysis { -/* - * Memory optimization pass for inference with pre-analysis of memory usage - * without GC. - * Different from training, the inference memory reuse strategies doesn't - * include GC for that overhead is too much when batch size equals one. - * - * The inference memory reuse tries to pre-determine the tensor reusing strategy - * without runtime overhead. - * - * To improve the strategy's performance, a warm-up running is introduced: - * - Before officially deploy the inference program, one should warm it up and - * generate some runtime cache, - * - Run the inference program with several batches of data, it will persist - * some runtime information about memory of tensors to disk, we call the - * information the memory reusing cache, - * - With the memory reusing cache, user can deploy the inference to a - * service, before running the model, the inference program will load the - * memory cache, analysis it and generate the best memory reusing strategy, - * and adjust the execution of the network. - * - * With the warm-up and memory reusing cache design, the memory reusing - * algorithm can analysis the real memory consume of the tensors, even with the - * flexible LoDTensor and special shape changing operators such as - * sequence-pooling. - */ +/* Memory optimization. +* We will perform the following operation: +* 1. Collect all var's lifetime. +* 2. Make reuse plan: the vars can be reused if there is no overlap(on lifetime) +* between +* them. +* The final plan is a mapping table in which the key represents the original +* name of var and the value in the table represents the current name of var. +* 3. 
Perform reuse plan: Replace all var's name in the model according to the
+* mapping table.
+*/
 class MemoryOptimizePass : public AnalysisPass {
  public:
   using space_table_t = std::unordered_map<std::string, size_t>;
   using lifecycle_t = std::pair<int, int>;
 
-  struct MemoryAllocation {
-    size_t allocated;  // allocated memory in byte.
-    size_t saved;      // saved memory in byte.
-    int sort_kind;     // the kind of the corresponding sorting algorithm.
-
-    // Get the memory saving ratio of temporary variables.
-    float GetSavingRatio() const;
-  };
-
   virtual ~MemoryOptimizePass() = default;
 
  protected:
@@ -76,23 +53,6 @@ class MemoryOptimizePass : public AnalysisPass {
 
   void CollectVarMemorySize(space_table_t *space_table) const;
 
-  void CollectVarMemorySize(
-      const std::unordered_map<std::string, size_t> &batch_var_ave_dim,
-      std::unordered_map<std::string, framework::ir::Node *> *tensor_nodes,
-      space_table_t *space_table) const;
-
-  // Returns percentage of saved memory.
-  void MakeReusePlan(
-      const std::vector<std::unordered_set<std::string>> &var_clusters,
-      const std::unordered_map<std::string, size_t> &var_batch_ave_size,
-      const space_table_t &space_table,
-      std::unordered_map<std::string, std::string> *reuse_table, int sort_kind,
-      MemoryAllocation *memory_allocation) const;
-
-  void PerformReusePlan(
-      const std::unordered_map<std::string, std::string> &reuse_table,
-      int sort_kind, std::unordered_set<std::string> *vars2remove) const;
-
  public:
   std::string repr() const override;
 
@@ -101,12 +61,6 @@ class MemoryOptimizePass : public AnalysisPass {
   mutable int max_lifecycle_{-1};
 };
 
-static std::string GetMemoryCachePath(const std::string &model_path,
-                                      const std::string &prog_path) {
-  auto path = model_path.empty() ? prog_path : model_path;
-  return path + ".memory_cache";
-}
-
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/passes/passes.cc b/paddle/fluid/inference/analysis/passes/passes.cc
index 97debcec..ca0b25c2 100644
--- a/paddle/fluid/inference/analysis/passes/passes.cc
+++ b/paddle/fluid/inference/analysis/passes/passes.cc
@@ -17,6 +17,7 @@
 #include "paddle/fluid/inference/analysis/passes/inference_op_replace_pass.h"
 #include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
 #include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h"
+#include "paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h"
 #include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h"
 #include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h"
 #include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
@@ -32,6 +33,8 @@ PassRegistry::PassRegistry() {
                   std::unique_ptr<AnalysisPass>(new IrAnalysisPass));
   passes_.emplace("ir_graph_build_pass",
                   std::unique_ptr<AnalysisPass>(new IrGraphBuildPass));
+  passes_.emplace("ir_graph_clean_pass",
+                  std::unique_ptr<AnalysisPass>(new IrInferCleanGraphPass));
   passes_.emplace("memory_optimize_pass",
                   std::unique_ptr<AnalysisPass>(new MemoryOptimizePass));
   passes_.emplace(
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index 7c697c81..344d12dd 100755
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -70,9 +70,9 @@ cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_
 
 if(ANAKIN_FOUND)
     # Do not turn warnings into errors.
set_source_files_properties(api.cc api_anakin_engine.cc PROPERTIES COMPILE_FLAGS "-Wno-error") - cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS boost xxhash) + cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS boost xxhash framework_proto eigen3) target_link_libraries(inference_anakin_api anakin anakin_saber_common) - cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS boost xxhash) + cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS boost xxhash framework_proto eigen3) target_link_libraries(inference_anakin_api_shared anakin anakin_saber_common) function(anakin_target target_name) target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 0ea26000..ace260c7 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -94,14 +94,13 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { prog_file_ = std::move(other.prog_file_); params_file_ = std::move(other.params_file_); - // Gpu related. + // GPU related. CP_MEMBER(use_gpu_); + CP_MEMBER(use_cudnn_); CP_MEMBER(device_id_); CP_MEMBER(memory_pool_init_size_mb_); CP_MEMBER(enable_memory_optim_); - CP_MEMBER(static_memory_optim_); - CP_MEMBER(static_memory_optim_force_update_); // TensorRT related. CP_MEMBER(use_tensorrt_); CP_MEMBER(tensorrt_workspace_size_); @@ -129,6 +128,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(anakin_passes_filter_); CP_MEMBER(anakin_ops_filter_); + // profile related. + CP_MEMBER(with_profile_); + // Ir related. CP_MEMBER(enable_ir_optim_); CP_MEMBER(use_feed_fetch_ops_); @@ -152,6 +154,17 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { Update(); } +void AnalysisConfig::EnableCUDNN() { +#ifdef PADDLE_WITH_CUDA + use_cudnn_ = use_gpu_; +#else + LOG(ERROR) << "Please compile with CUDA first to use cuDNN"; + use_cudnn_ = false; +#endif + + Update(); +} + void AnalysisConfig::EnableMKLDNN() { #ifdef PADDLE_WITH_MKLDNN use_mkldnn_ = true; @@ -261,6 +274,15 @@ void AnalysisConfig::Update() { pass_builder()->AppendPass(pass); } } + if (use_gpu() && use_cudnn_) { +#ifdef PADDLE_WITH_CUDA + if (!enable_ir_optim_) { + LOG(ERROR) << "EnableCUDNN() only works when IR optimization is enabled."; + } else { + pass_builder()->EnableCUDNN(); + } +#endif + } if (use_ngraph_) { if (!enable_ir_optim_) { @@ -347,8 +369,6 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << tensorrt_min_subgraph_size_; ss << enable_memory_optim_; - ss << static_memory_optim_; - ss << static_memory_optim_force_update_; ss << use_ngraph_; @@ -360,6 +380,8 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << use_mkldnn_quantizer_; ss << model_from_memory_; + ss << with_profile_; + ss << enable_ir_optim_; ss << use_feed_fetch_ops_; ss << ir_debug_; @@ -394,12 +416,8 @@ float AnalysisConfig::fraction_of_gpu_memory_for_pool() const { #endif } -void AnalysisConfig::EnableMemoryOptim(bool static_optim, - bool force_update_static_cache) { +void AnalysisConfig::EnableMemoryOptim() { enable_memory_optim_ = true; - static_memory_optim_ = static_optim; - static_memory_optim_force_update_ = force_update_static_cache; - Update(); } @@ -434,6 +452,12 @@ void AnalysisConfig::SwitchIrDebug(int x) { ir_debug_ = x; Update(); } + +void AnalysisConfig::EnableProfile() { + with_profile_ = true; + Update(); +} + 
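A minimal usage sketch for the reworked AnalysisConfig surface above, covering EnableCUDNN(), the now argument-less EnableMemoryOptim(), and EnableProfile(). This is illustrative only; the model directory is hypothetical:

    #include "paddle/fluid/inference/api/paddle_analysis_config.h"
    #include "paddle/fluid/inference/api/paddle_api.h"

    int main() {
      paddle::AnalysisConfig config;
      config.SetModel("./mobilenet");          // hypothetical model directory
      config.EnableUseGpu(100 /*pool MB*/, 0 /*device id*/);
      config.EnableCUDNN();        // effective only with a CUDA build, GPU on, IR optim enabled
      config.EnableMemoryOptim();  // static-optim arguments are removed by this patch
      config.EnableProfile();      // replaces the old process-wide --profile gflag
      auto predictor = paddle::CreatePaddlePredictor(config);
      return predictor != nullptr ? 0 : 1;
    }

Since with_profile_ is copied into the predictor, profiling is now a per-config decision rather than a global flag.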
void AnalysisConfig::EnableAnakinEngine( int max_batch_size, std::map> max_input_shape, int min_subgraph_size, AnalysisConfig::Precision precision_mode, diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 7650b2e9..d47bde32 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -52,8 +52,6 @@ #include "paddle/fluid/inference/anakin/convert/op_converter.h" #endif -DECLARE_bool(profile); - namespace paddle { using inference::Singleton; @@ -79,12 +77,14 @@ bool AnalysisPredictor::Init( const std::shared_ptr &parent_scope, const std::shared_ptr &program) { VLOG(3) << "Predictor::init()"; - if (FLAGS_profile) { - LOG(WARNING) << "Profiler is actived, might affect the performance"; - LOG(INFO) << "You can turn off by set gflags '-profile false'"; + if (config_.with_profile_) { + LOG(WARNING) << "Profiler is activated, which might affect the performance"; auto tracking_device = config_.use_gpu() ? platform::ProfilerState::kAll : platform::ProfilerState::kCPU; platform::EnableProfiler(tracking_device); + } else { + LOG(INFO) << "Profiler is deactivated, and no profiling report will be " + "generated."; } // no matter with or without MKLDNN @@ -135,7 +135,6 @@ bool AnalysisPredictor::PrepareProgram( const std::shared_ptr &program) { if (!program) { if (!LoadProgramDesc()) return false; - // If not cloned, the parameters should be loaded. // If config_.ir_optim() is True, parameters is loaded in // OptimizeInferenceProgram(), but other persistable variables @@ -145,17 +144,10 @@ bool AnalysisPredictor::PrepareProgram( // So in both case, create persistable variables at first. executor_->CreateVariables(*inference_program_, 0, true, sub_scope_); - // Optimize the program, and load parameters and modify them in the - // scope_. - // This will change the scope_ address. - if (config_.ir_optim()) { - status_ir_optim_enabled_ = true; - OptimizeInferenceProgram(); - } else { - // Load parameters - LOG(INFO) << "load parameters "; - LoadParameters(); - } + // if enable_ir_optim_ is false, + // the analysis pass(op fuse, graph analysis, trt subgraph, mkldnn etc) will + // not be executed. + OptimizeInferenceProgram(); } else { // If the program is passed from external, no need to optimize it, this // logic is used in the clone scenario. @@ -249,11 +241,6 @@ bool AnalysisPredictor::Run(const std::vector &inputs, return false; } - // Collect variable shapes for memory optimization. - if (need_collect_var_shapes_for_memory_optim()) { - CollectVarShapes(); - } - VLOG(3) << "predict cost: " << timer.toc() << "ms"; // All the containers in the scope will be hold in inference, but the @@ -396,10 +383,8 @@ bool AnalysisPredictor::GetFetch(std::vector *outputs, void AnalysisPredictor::PrepareArgument() { argument_.SetUseGPU(config_.use_gpu()); argument_.SetGPUDeviceId(config_.gpu_device_id()); + argument_.SetEnableAnalysisOptim(config_.enable_ir_optim_); argument_.SetEnableMemoryOptim(config_.enable_memory_optim()); - argument_.SetStaticMemoryOptim(config_.static_memory_optim_); - argument_.SetStaticMemoryOptimForceUpdate( - config_.static_memory_optim_force_update_); argument_.SetModelFromMemory(config_.model_from_memory_); // Analyze inference_program argument_.SetUseAnakin(config_.anakin_engine_enabled()); @@ -467,8 +452,6 @@ void AnalysisPredictor::PrepareArgument() { // NOTE All the members in AnalysisConfig should be copied to Argument. 
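With the PrepareProgram() change above, OptimizeInferenceProgram() is always invoked and enable_ir_optim_ only decides whether the analysis passes inside it do any work. A sketch of toggling this from user code, with a hypothetical model path:

    #include "paddle/fluid/inference/api/paddle_analysis_config.h"
    #include "paddle/fluid/inference/api/paddle_api.h"

    int main() {
      paddle::AnalysisConfig config;
      config.SetModel("./model");   // hypothetical
      config.SwitchIrOptim(false);  // op fusion / TRT subgraph / MKL-DNN analysis skipped,
                                    // parameters are still loaded through the same pipeline
      auto predictor = paddle::CreatePaddlePredictor(config);
      return 0;
    }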
void AnalysisPredictor::OptimizeInferenceProgram() { - status_program_optimized_ = true; - PrepareArgument(); Analyzer().Run(&argument_); @@ -481,7 +464,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { // when the predictor settings are complete, we release these stores. argument_.PartiallyRelease(); config_.PartiallyRelease(); - LOG(INFO) << "== optimize end =="; + LOG(INFO) << "======= optimize end ======="; } template <> @@ -507,7 +490,7 @@ std::unique_ptr CreatePaddlePredictor< } if (fraction_of_gpu_memory >= 0.0f || fraction_of_gpu_memory <= 0.95f) { - flags.push_back("dummpy"); + flags.push_back("dummy"); std::string flag = "--fraction_of_gpu_memory_to_use=" + std::to_string(fraction_of_gpu_memory); flags.push_back(flag); @@ -585,6 +568,18 @@ std::vector AnalysisPredictor::GetInputNames() { return input_names; } +std::map> +AnalysisPredictor::GetInputTensorShape() { + std::map> input_shapes; + std::vector names = GetInputNames(); + for (std::string name : names) { + auto *var = inference_program_->Block(0).FindVar(name); + PADDLE_ENFORCE_NOT_NULL(var, "input %s does not exist.", name); + input_shapes[name] = var->GetShape(); + } + return input_shapes; +} + std::vector AnalysisPredictor::GetOutputNames() { std::vector output_names; for (auto &item : idx2fetches_) { @@ -801,7 +796,7 @@ AnalysisPredictor::~AnalysisPredictor() { SaveTrtCalibToDisk(); } #endif - if (FLAGS_profile) { + if (config_.with_profile_) { platform::DisableProfiler(platform::EventSortingKey::kTotal, "./profile.log"); } @@ -815,13 +810,6 @@ AnalysisPredictor::~AnalysisPredictor() { mkldnn_quantizer_ = nullptr; } #endif - - // TODO(Superjomn) deduce the directory path. - std::string out_path = inference::analysis::GetMemoryCachePath( - config_.model_dir(), config_.prog_file()); - if (need_collect_var_shapes_for_memory_optim()) { - SerializeBatchVarShapes(out_path); - } } std::unique_ptr AnalysisPredictor::Clone() { @@ -831,66 +819,6 @@ std::unique_ptr AnalysisPredictor::Clone() { return std::unique_ptr(x); } -void AnalysisPredictor::CollectVarShapes() { - VLOG(4) << "Collecting var shapes"; - if (batch_var_shapes_.size() >= max_shape_collect_count_) return; - std::map> var_shapes; - for (auto var_name : inference_program_->Block(0).LocalVarNames()) { - auto *var = sub_scope_->FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL(var); - if (var->Type() == framework::VarTypeTrait::kId || - var->Type() == framework::VarTypeTrait::kId) { - auto &tensor = var->Get(); - auto shape = framework::vectorize(tensor.dims()); - var_shapes[var_name].assign(shape.begin(), shape.end()); - } - } - batch_var_shapes_.push_back(var_shapes); - LOG_FIRST_N(INFO, 1) << "Collected " << batch_var_shapes_.size() - << " batch of var shapes for analysis"; -} - -void AnalysisPredictor::SerializeBatchVarShapes(const std::string &path) { - LOG(INFO) << "serialize batch var shapes to " << path; - std::ofstream file(path); - if (!file.is_open()) { - LOG(ERROR) << "failed to serialize the var shapes to " << path; - return; - } - - // The sirialized data format: - // :dim0,dim1,dim2,; - for (auto &batch : batch_var_shapes_) { - for (auto &ele : batch) { - file << ele.first << ":"; - for (size_t i = 0; i < ele.second.size() - 1; i++) { - file << ele.second[i] << ","; - } - file << ele.second.back() << ";"; - } - file << "\n"; - } -} - -bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() { - if (need_collect_var_shapes_ >= 0) return need_collect_var_shapes_; - bool need = false; - // check if the cache exists - if 
(!config_.enable_memory_optim()) { - need = false; - } else if (config_.static_memory_optim_ && - !inference::IsFileExists(inference::analysis::GetMemoryCachePath( - config_.model_dir(), config_.prog_file()))) { - need = true; - } else if (config_.static_memory_optim_ && - config_.static_memory_optim_force_update_) { - need = true; - } - - need_collect_var_shapes_ = need ? 1 : 0; - return need; -} - std::string AnalysisPredictor::GetSerializedProgram() const { return inference_program_->Proto()->SerializeAsString(); } @@ -968,6 +896,8 @@ USE_TRT_CONVERTER(split); USE_TRT_CONVERTER(prelu); USE_TRT_CONVERTER(conv2d_transpose); USE_TRT_CONVERTER(leaky_relu); +USE_TRT_CONVERTER(shuffle_channel); +USE_TRT_CONVERTER(swish); #endif #if PADDLE_WITH_ANAKIN diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 7a366b10..33a2e623 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -65,6 +65,8 @@ class AnalysisPredictor : public PaddlePredictor { std::unique_ptr GetOutputTensor( const std::string &name) override; + std::map> GetInputTensorShape() override; + bool ZeroCopyRun() override; void CreateFeedFetchVar(framework::Scope *scope); @@ -89,11 +91,6 @@ class AnalysisPredictor : public PaddlePredictor { void SaveOptimModel(const std::string &dir); protected: - // For memory optimization. - bool need_collect_var_shapes_for_memory_optim(); - void CollectVarShapes(); - void SerializeBatchVarShapes(const std::string &path); - bool PrepareProgram(const std::shared_ptr &program); bool PrepareScope(const std::shared_ptr &parent_scope); bool CreateExecutor(); @@ -178,10 +175,8 @@ class AnalysisPredictor : public PaddlePredictor { private: // Some status here that help to determine the status inside the predictor. - bool status_program_optimized_{false}; bool status_is_cloned_{false}; bool status_use_gpu_{false}; - bool status_ir_optim_enabled_{false}; }; } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 44b1b807..e990b2c7 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -44,7 +44,6 @@ TEST(AnalysisPredictor, analysis_off) { ASSERT_EQ(predictor->scope_->parent(), nullptr); ASSERT_EQ(predictor->sub_scope_->parent(), predictor->scope_.get()); // ir is turned off, so program shouldn't be optimized. - ASSERT_FALSE(predictor->status_program_optimized_); LOG(INFO) << "scope parameters " << predictor->scope_->LocalVarNames().size(); // 2. Dummy Input Data @@ -76,8 +75,6 @@ TEST(AnalysisPredictor, analysis_on) { ASSERT_TRUE(predictor->sub_scope_); ASSERT_EQ(predictor->scope_->parent(), nullptr); ASSERT_EQ(predictor->sub_scope_->parent(), predictor->scope_.get()); - // ir is turned on, so program should be optimized. - ASSERT_TRUE(predictor->status_program_optimized_); // 2. 
Dummy Input Data int64_t data[4] = {1, 2, 3, 4}; PaddleTensor tensor; diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc index fc2d7b48..ec659f1c 100644 --- a/paddle/fluid/inference/api/api.cc +++ b/paddle/fluid/inference/api/api.cc @@ -54,8 +54,15 @@ PaddleBuf &PaddleBuf::operator=(const PaddleBuf &other) { memory_owned_ = other.memory_owned_; } else { Resize(other.length()); - PADDLE_ENFORCE(!(other.length() > 0 && other.data() == nullptr)); - memcpy(data_, other.data(), other.length()); + // if other.length() == 0 or other.data() == nullptr, then the memcpy + // behavior is undefined + if (other.length() && other.data()) + memcpy(data_, other.data(), other.length()); + else if (other.length()) + PADDLE_THROW( + "Invalid argument, null pointer data with length %u is passed", + other.length()); + length_ = other.length(); memory_owned_ = true; } diff --git a/paddle/fluid/inference/api/api_anakin_engine.cc b/paddle/fluid/inference/api/api_anakin_engine.cc index e38531a4..4c51c239 100644 --- a/paddle/fluid/inference/api/api_anakin_engine.cc +++ b/paddle/fluid/inference/api/api_anakin_engine.cc @@ -42,6 +42,7 @@ void PaddleInferenceAnakinPredictor::InitEnv() { template void PaddleInferenceAnakinPredictor::InitNet() { std::unique_lock lock(this->mutex_); + delete this->executor_p_; this->executor_p_ = new anakin::Net(*this->graph_p_, true); } template @@ -89,7 +90,7 @@ void PaddleInferenceAnakinPredictor::InitPredictor() { this->InitNet(); } template -void PaddleInferenceAnakinPredictor::Predict() { +void PaddleInferenceAnakinPredictor::Predict(int batch_size) { anakin::TargetWrapper::device_sync(); this->executor_p_->prediction(); anakin::TargetWrapper::device_sync(); @@ -99,7 +100,7 @@ bool PaddleInferenceAnakinPredictor::Run( const std::vector &inputs, std::vector *output_data, int batch_size) { if (this->config_.re_allocable) { - return this->RunImpl(inputs, output_data); + return this->RunImpl(inputs, output_data, batch_size); } else { // Run inputs data that exceeds batch size in batches. // 1. Reassign the batch size. 
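The PaddleBuf assignment fix above avoids calling memcpy with a null source; a minimal sketch of the invariant it preserves, using only the public PaddleBuf API:

    #include <cassert>
    #include "paddle/fluid/inference/api/paddle_api.h"

    int main() {
      paddle::PaddleBuf empty;     // length() == 0, data() == nullptr
      paddle::PaddleBuf buf(16);   // owns 16 bytes
      buf = empty;                 // exercises the memcpy(dst, nullptr, 0) case the patch guards
      assert(buf.length() == 0);
      return 0;
    }

A non-empty source with a null data pointer is now reported via PADDLE_THROW instead of being copied.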
@@ -194,7 +195,7 @@ bool PaddleInferenceAnakinPredictor::Run( template bool PaddleInferenceAnakinPredictor::RunImpl( const std::vector &inputs, - std::vector *output_data) { + std::vector *output_data, int batch_size) { anakin::TargetWrapper::set_device(this->config_.device_id); for (const auto &input : inputs) { if (input.dtype != PaddleDType::FLOAT32) { @@ -207,12 +208,12 @@ bool PaddleInferenceAnakinPredictor::RunImpl( LOG(FATAL) << " input " << input.name << "'s shape size should be equal to that of net"; } +#ifndef ANAKIN_MLU_PLACE int sum = 1; for_each(input.shape.begin(), input.shape.end(), [&](int n) { sum *= n; }); if (sum > net_shape.count()) { if (this->config_.re_allocable) { this->graph_p_->Reshape(input.name, input.shape); - delete this->executor_p_; this->InitNet(); d_tensor_p = this->executor_p_->get_in(input.name); } else { @@ -221,6 +222,7 @@ bool PaddleInferenceAnakinPredictor::RunImpl( "memory."; } } +#endif std::vector tmp_shape; for (auto s : input.shape) { tmp_shape.push_back(s); @@ -229,8 +231,9 @@ bool PaddleInferenceAnakinPredictor::RunImpl( anakin::saber::Tensor::Host_type> h_tensor(data, typename anakin::DefaultHostType::Host_type(), 0, tmp_shape); +#ifndef ANAKIN_MLU_PLACE d_tensor_p->reshape(tmp_shape); - +#endif if (input.lod.size() > 0) { if (input.lod.size() > 1) { LOG(FATAL) << " input lod first dim should <=1, but you set " @@ -246,9 +249,9 @@ bool PaddleInferenceAnakinPredictor::RunImpl( } d_tensor_p->copy_from(h_tensor); } - this->Predict(); + this->Predict(batch_size); if (output_data->empty()) { - LOG(FATAL) << "At least one output should be set with tensors' names."; + LOG(FATAL) << "The output param in the Run function is incorrect."; } for (auto &output : *output_data) { if (std::find(this->output_names_.begin(), this->output_names_.end(), @@ -256,14 +259,18 @@ bool PaddleInferenceAnakinPredictor::RunImpl( LOG(FATAL) << output.name << " is not in the outputs of the graph."; } auto *d_tensor_p = this->executor_p_->get_out(output.name); - output.shape = d_tensor_p->valid_shape(); - if (output.data.length() < d_tensor_p->valid_size() * sizeof(float)) { - output.data.Resize(d_tensor_p->valid_size() * sizeof(float)); + auto tmp_shape = d_tensor_p->valid_shape(); +#ifdef ANAKIN_MLU_PLACE + tmp_shape.set_num(batch_size); +#endif + output.shape = tmp_shape; + if (output.data.length() < tmp_shape.count() * sizeof(float)) { + output.data.Resize(tmp_shape.count() * sizeof(float)); } auto *data = static_cast(output.data.data()); anakin::saber::Tensor::Host_type> h_tensor(data, typename anakin::DefaultHostType::Host_type(), 0, - d_tensor_p->valid_shape()); + tmp_shape); h_tensor.copy_from(*d_tensor_p); } return true; @@ -317,6 +324,8 @@ void PaddleInferenceAnakinMLUPredictor::SetContext() { this->config_.compute_stream_id); this->ctx_p_->set_model_parallel(this->config_.model_parallel); this->ctx_p_->set_fusion(this->config_.op_fuse); + this->ctx_p_->enable_batch_changable(); + this->ctx_p_->enable_channel_duplicate(); } template void PaddleInferenceAnakinMLUPredictor::OptimizeGraph() { @@ -327,14 +336,13 @@ void PaddleInferenceAnakinMLUPredictor::OptimizeGraph() { template void PaddleInferenceAnakinMLUPredictor::InitNet() { std::unique_lock lock(this->mutex_); + delete this->executor_p_; this->executor_p_ = new anakin::Net(); this->executor_p_->fusion_init(*this->graph_p_, this->ctx_p_, true); } template -void PaddleInferenceAnakinMLUPredictor::Predict() { - anakin::TargetWrapper::device_sync(); - this->executor_p_->fusion_prediction(); - 
anakin::TargetWrapper::device_sync(); +void PaddleInferenceAnakinMLUPredictor::Predict(int batch_size) { + this->executor_p_->fusion_prediction(batch_size); } #endif @@ -353,14 +361,13 @@ void PaddleInferenceAnakinBMPredictor::OptimizeGraph() { template void PaddleInferenceAnakinBMPredictor::InitNet() { std::unique_lock lock(this->mutex_); + delete this->executor_p_; this->executor_p_ = new anakin::Net(); this->executor_p_->fusion_init(*this->graph_p_, this->ctx_p_, true); } template -void PaddleInferenceAnakinBMPredictor::Predict() { - anakin::TargetWrapper::device_sync(); +void PaddleInferenceAnakinBMPredictor::Predict(int batch_size) { this->executor_p_->fusion_prediction(); - anakin::TargetWrapper::device_sync(); } #endif diff --git a/paddle/fluid/inference/api/api_anakin_engine.h b/paddle/fluid/inference/api/api_anakin_engine.h index 88d3325b..97fc0061 100644 --- a/paddle/fluid/inference/api/api_anakin_engine.h +++ b/paddle/fluid/inference/api/api_anakin_engine.h @@ -73,7 +73,7 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor { virtual void OptimizeGraph(); virtual void InitNet(); virtual void SetContext(); - virtual void Predict(); + virtual void Predict(int batch_size); virtual std::unique_ptr New(); static std::mutex mutex_; AnakinConfig config_; @@ -85,7 +85,7 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor { private: bool RunImpl(const std::vector& inputs, - std::vector* output_data); + std::vector* output_data, int batch_size = -1); static std::once_flag init_anakin_; }; @@ -103,7 +103,7 @@ class PaddleInferenceAnakinMLUPredictor final void SetContext() override; void OptimizeGraph() override; void InitNet() override; - void Predict() override; + void Predict(int batch_size) override; }; #endif @@ -120,7 +120,7 @@ class PaddleInferenceAnakinBMPredictor final std::unique_ptr New() override; void OptimizeGraph() override; void InitNet() override; - void Predict() override; + void Predict(int batch_size) override; }; #endif } // namespace paddle diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc index 2dc5dda3..c80187ad 100644 --- a/paddle/fluid/inference/api/api_impl_tester.cc +++ b/paddle/fluid/inference/api/api_impl_tester.cc @@ -48,7 +48,7 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) { } else { LOG(FATAL) << "unsupported type."; } - pt.shape = framework::vectorize2int(t->dims()); + pt.shape = framework::vectorize(t->dims()); return pt; } diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index 318658e0..113302b7 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -4,6 +4,9 @@ option(WITH_MKL "Compile demo with MKL/OpenBlas support, default use MKL. option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." OFF) option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." ON) option(USE_TENSORRT "Compile demo with TensorRT." 
OFF) +if(NOT WITH_STATIC_LIB) + add_definitions("-DPADDLE_WITH_SHARED_LIB") +endif() macro(safe_set_static_flag) foreach(flag_var @@ -28,14 +31,10 @@ include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/include") include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/include") include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/include") include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/include") -include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}snappy/include") -include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}snappystream/include") include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}zlib/include") include_directories("${PADDLE_LIB}/third_party/boost") include_directories("${PADDLE_LIB}/third_party/eigen3") -link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}snappy/lib") -link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}snappystream/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}zlib/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/lib") @@ -45,13 +44,15 @@ link_directories("${PADDLE_LIB}/paddle/lib") if (WIN32) add_definitions("/DGOOGLE_GLOG_DLL_DECL=") - set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") - set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") - if (WITH_STATIC_LIB) - safe_set_static_flag() - add_definitions(-DSTATIC_LIB) + if (MSVC_STATIC_CRT) + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") + if (WITH_STATIC_LIB) + safe_set_static_flag() + add_definitions(-DSTATIC_LIB) + endif() endif() else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") @@ -109,7 +110,7 @@ if(WITH_MKL) else() set(MATH_LIB ${PADDLE_LIB_THIRD_PARTY_PATH}openblas/lib/libopenblas${CMAKE_STATIC_LIBRARY_SUFFIX}) if(WIN32) - set(MATH_DLL ${PADDLE_LIB_THIRD_PARTY_PATH}openblas/lib/openblas${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(MATH_DLL ${PADDLE_LIB_THIRD_PARTY_PATH}openblas/lib/openblas${CMAKE_SHARED_LIBRARY_SUFFIX}) endif() endif() @@ -124,12 +125,12 @@ if (NOT WIN32) set(EXTERNAL_LIB "-lrt -ldl -lpthread") set(DEPS ${DEPS} ${MATH_LIB} ${MKLDNN_LIB} ${NGRAPH_LIB} - glog gflags protobuf snappystream snappy z xxhash + glog gflags protobuf z xxhash ${EXTERNAL_LIB}) else() set(DEPS ${DEPS} ${MATH_LIB} ${MKLDNN_LIB} - glog gflags_static libprotobuf snappy zlibstatic xxhash snappystream ${EXTERNAL_LIB}) + glog gflags_static libprotobuf zlibstatic xxhash ${EXTERNAL_LIB}) set(DEPS ${DEPS} libcmt shlwapi.lib) endif(NOT WIN32) @@ -141,6 +142,10 @@ if(WITH_GPU) endif() set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX}) else() + if (USE_TENSORRT) + set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX}) + endif() set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX} ) set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} ) set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX} ) @@ -150,6 +155,14 @@ endif() add_executable(${DEMO_NAME} ${DEMO_NAME}.cc) target_link_libraries(${DEMO_NAME} ${DEPS}) if(WIN32) + if(USE_TENSORRT) + add_custom_command(TARGET ${DEMO_NAME} POST_BUILD + 
COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer${CMAKE_SHARED_LIBRARY_SUFFIX} + ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} + COMMAND ${CMAKE_COMMAND} -E copy ${TENSORRT_LIB_DIR}/nvinfer_plugin${CMAKE_SHARED_LIBRARY_SUFFIX} + ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} + ) + endif() if(WITH_MKL) add_custom_command(TARGET ${DEMO_NAME} POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_PATH}/lib/mkldnn.dll ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} @@ -157,7 +170,7 @@ if(WIN32) ) else() add_custom_command(TARGET ${DEMO_NAME} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${MATH_DLL}/openblas.dll ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} + COMMAND ${CMAKE_COMMAND} -E copy ${MATH_DLL} ${CMAKE_BINARY_DIR}/${CMAKE_BUILD_TYPE} ) endif() endif() diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc index 0d2c418c..b63e8e62 100644 --- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc @@ -30,6 +30,9 @@ DEFINE_string( "path of data; each line is a record, format is " "'\t &shape) { template T *ZeroCopyTensor::mutable_data(PaddlePlace place) { EAGER_GET_TENSOR; + PADDLE_ENFORCE_GT( + tensor->numel(), 0, + "You should call ZeroCopyTensor::Reshape(const std::vector &shape)" + "function before retrieving mutable_data from input tensor."); switch (static_cast(place)) { case static_cast(PaddlePlace::kCPU): { return tensor->mutable_data(platform::CPUPlace()); @@ -83,8 +87,8 @@ PaddleDType ZeroCopyTensor::type() const { return PaddleDType::INT64; } else if (type == framework::proto::VarType::INT32) { return PaddleDType::INT32; - } else { - LOG(ERROR) << "unknown type, only support float32 and int64 now."; + } else if (type == framework::proto::VarType::UINT8) { + return PaddleDType::UINT8; } return PaddleDType::FLOAT32; } @@ -95,7 +99,7 @@ void ZeroCopyTensor::copy_from_cpu(const T *data) { PADDLE_ENFORCE_GE( tensor->numel(), 0, "You should call ZeroCopyTensor::Reshape(const std::vector &shape)" - "function before copy data from cpu."); + "function before copying data from cpu."); size_t ele_size = tensor->numel() * sizeof(T); if (place_ == PaddlePlace::kCPU) { @@ -112,7 +116,7 @@ void ZeroCopyTensor::copy_from_cpu(const T *data) { memory::Copy(gpu_place, static_cast(t_data), platform::CPUPlace(), data, ele_size, dev_ctx->stream()); #else - PADDLE_THROW("Not compile with CUDA, should not reach here."); + PADDLE_THROW("Not compiled with CUDA, should not reach here."); #endif } } @@ -143,9 +147,11 @@ void ZeroCopyTensor::copy_to_cpu(T *data) { template void ZeroCopyTensor::copy_from_cpu(const float *data); template void ZeroCopyTensor::copy_from_cpu(const int64_t *data); template void ZeroCopyTensor::copy_from_cpu(const int32_t *data); +template void ZeroCopyTensor::copy_from_cpu(const uint8_t *data); template void ZeroCopyTensor::copy_to_cpu(float *data); template void ZeroCopyTensor::copy_to_cpu(int64_t *data); template void ZeroCopyTensor::copy_to_cpu(int32_t *data); +template void ZeroCopyTensor::copy_to_cpu(uint8_t *data); template float *ZeroCopyTensor::data(PaddlePlace *place, int *size) const; @@ -153,9 +159,12 @@ template int64_t *ZeroCopyTensor::data(PaddlePlace *place, int *size) const; template int32_t *ZeroCopyTensor::data(PaddlePlace *place, int *size) const; +template uint8_t *ZeroCopyTensor::data(PaddlePlace *place, + int *size) const; template float *ZeroCopyTensor::mutable_data(PaddlePlace place); template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place); template int32_t 
*ZeroCopyTensor::mutable_data(PaddlePlace place); +template uint8_t *ZeroCopyTensor::mutable_data(PaddlePlace place); void *ZeroCopyTensor::FindTensor() const { PADDLE_ENFORCE(!name_.empty(), @@ -172,7 +181,7 @@ void *ZeroCopyTensor::FindTensor() const { std::vector ZeroCopyTensor::shape() const { EAGER_GET_TENSOR; PADDLE_ENFORCE(tensor_, "not found tensor called %s in the scope", name_); - return framework::vectorize2int(tensor->dims()); + return framework::vectorize(tensor->dims()); } void ZeroCopyTensor::SetLoD(const std::vector> &x) { diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index e5820c36..907d35b2 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -317,7 +317,7 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid, double batch_latency, int epoch = 1, const framework::proto::VarType::Type data_type = framework::proto::VarType::FP32) { - PADDLE_ENFORCE(batch_size > 0, "Non-positive batch size."); + PADDLE_ENFORCE_GT(batch_size, 0, "Non-positive batch size."); double sample_latency = batch_latency / batch_size; LOG(INFO) << "====== threads: " << num_threads << ", thread id: " << tid << " ======"; diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc index fea56f01..94c556ce 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc @@ -68,10 +68,10 @@ bool AnalysisPredictor::MkldnnQuantizer::CalculateScales() { if (is_output) { if (op->Type() == "conv2d") { // output of conv2d with relu must be unsigned - is_unsigned = (op->HasAttr("fuse_relu") && - boost::get(op->GetAttr("fuse_relu"))) || - (op->HasAttr("fuse_brelu") && - boost::get(op->GetAttr("fuse_brelu"))); + std::string fuse_activation = + op->GetAttrIfExists("fuse_activation"); + is_unsigned = + (fuse_activation == "relu" || fuse_activation == "relu6"); } else if (op->Type() == "relu") { is_unsigned = true; } else if (op->Type() == "transpose2" || @@ -397,13 +397,14 @@ void AnalysisPredictor::MkldnnQuantizer::PrepareArgument() const { auto* builder = predictor_.config_.pass_builder(); builder->SetPasses({ - "infer_clean_graph_pass", "cpu_quantize_pass", "cpu_quantize_squash_pass", + "cpu_quantize_pass", "cpu_quantize_squash_pass", }); if (predictor_.config_.ir_debug_) builder->TurnOnDebug(); auto passes = builder->AllPasses(); predictor_.argument_.SetIrAnalysisPasses(passes); predictor_.argument_.SetAnalysisPasses( - {"ir_analysis_pass", "memory_optimize_pass", "ir_graph_to_program_pass"}); + {"ir_graph_clean_pass", "ir_analysis_pass", "memory_optimize_pass", + "ir_graph_to_program_pass"}); predictor_.argument_.SetQuantVarScales(scales_); } diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 83143be0..7764a498 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -46,6 +46,7 @@ struct AnalysisConfig { enum class Precision { kFloat32 = 0, kInt8, + kHalf, }; /** Set model with a directory. @@ -100,6 +101,13 @@ struct AnalysisConfig { */ float fraction_of_gpu_memory_for_pool() const; + /** Turn on CUDNN + */ + void EnableCUDNN(); + /** A boolean state telling whether to use cuDNN. + */ + bool cudnn_enabled() const { return use_cudnn_; } + /** \brief Control whether to perform IR graph optimization. * * If turned off, the AnalysisConfig will act just like a NativeConfig. 
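A sketch of the zero-copy input path that the new uint8_t instantiations above enable; the input name and shape are illustrative, and the predictor is assumed to have been created with feed/fetch ops switched off:

    #include <cstdint>
    #include <vector>
    #include "paddle/fluid/inference/api/paddle_api.h"

    void FeedUint8Image(paddle::PaddlePredictor *predictor) {
      auto input = predictor->GetInputTensor("image");  // illustrative input name
      input->Reshape({1, 3, 224, 224});                 // Reshape must precede copy_from_cpu
      std::vector<uint8_t> pixels(1 * 3 * 224 * 224, 0);
      input->copy_from_cpu(pixels.data());              // now instantiated for uint8_t
      predictor->ZeroCopyRun();
    }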
@@ -236,10 +244,19 @@ struct AnalysisConfig { /** Turn on memory optimize * NOTE still in development, will release latter. */ - void EnableMemoryOptim(bool static_optim = false, - bool force_update_static_cache = false); + void EnableMemoryOptim(); /** Tell whether the memory optimization is activated. */ bool enable_memory_optim() const; + + /** \brief Turn on profiling report. + * + * If not turned on, no profiling report will be generateed. + */ + void EnableProfile(); + /** A boolean state telling whether the profiler is activated. + */ + bool profile_enabled() const { return with_profile_; } + void SetInValid() const { is_valid_ = false; } bool is_valid() const { return is_valid_; } @@ -268,6 +285,8 @@ struct AnalysisConfig { int device_id_{0}; uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB. + bool use_cudnn_{false}; + // TensorRT related. bool use_tensorrt_{false}; // For workspace_size, refer it from here: @@ -289,8 +308,6 @@ struct AnalysisConfig { // memory reuse related. bool enable_memory_optim_{false}; - bool static_memory_optim_{false}; - bool static_memory_optim_force_update_{false}; bool use_ngraph_{false}; bool use_mkldnn_{false}; @@ -306,6 +323,8 @@ struct AnalysisConfig { int cpu_math_library_num_threads_{1}; + bool with_profile_{false}; + // A runtime cache, shouldn't be transferred to others. std::string serialized_info_cache_; diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index 87f40f09..8c0adfcb 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -23,6 +23,7 @@ */ #include +#include #include #include #include @@ -37,6 +38,7 @@ enum PaddleDType { FLOAT32, INT64, INT32, + UINT8, // TODO(Superjomn) support more data types if needed. }; @@ -149,8 +151,8 @@ class ZeroCopyTensor { /** Get the memory in CPU or GPU with specific data type, should Reshape first * to tell the data size. - * Once can directly call this data to feed the data. - * This is for write the input tensor. + * One can directly call this data to feed the data. + * This is for writing the input tensor. */ template T* mutable_data(PaddlePlace place); @@ -220,6 +222,12 @@ class PaddlePredictor { */ virtual std::vector GetInputNames() { return {}; } + /** \brief Get input shapes of the model + */ + virtual std::map> GetInputTensorShape() { + return {}; + } + /** \brief Get output names of the model */ virtual std::vector GetOutputNames() { return {}; } diff --git a/paddle/fluid/inference/api/paddle_inference_pass.h b/paddle/fluid/inference/api/paddle_inference_pass.h deleted file mode 100644 index 64628d8e..00000000 --- a/paddle/fluid/inference/api/paddle_inference_pass.h +++ /dev/null @@ -1,33 +0,0 @@ -// Generated by the paddle/fluid/framework/ir/CMakeLists.txt. DO NOT EDIT! 
- -#pragma once -#include "paddle/fluid/framework/ir/pass.h" -USE_PASS(graph_to_program_pass); -USE_PASS(graph_viz_pass); -USE_PASS(lock_free_optimize_pass); -USE_PASS(fc_fuse_pass); -USE_PASS(attention_lstm_fuse_pass); -USE_PASS(infer_clean_graph_pass); -USE_PASS(fc_lstm_fuse_pass); -USE_PASS(embedding_fc_lstm_fuse_pass); -USE_PASS(fc_gru_fuse_pass); -USE_PASS(seq_concat_fc_fuse_pass); -USE_PASS(multi_batch_merge_pass); -USE_PASS(conv_bn_fuse_pass); -USE_PASS(seqconv_eltadd_relu_fuse_pass); -USE_PASS(seqpool_concat_fuse_pass); -USE_PASS(repeated_fc_relu_fuse_pass); -USE_PASS(squared_mat_sub_fuse_pass); -USE_PASS(is_test_pass); -USE_PASS(conv_elementwise_add_act_fuse_pass); -USE_PASS(conv_elementwise_add2_act_fuse_pass); -USE_PASS(conv_elementwise_add_fuse_pass); -USE_PASS(conv_affine_channel_fuse_pass); -USE_PASS(transpose_flatten_concat_fuse_pass); -USE_PASS(identity_scale_op_clean_pass); -USE_PASS(sync_batch_norm_pass); -USE_PASS(runtime_context_cache_pass); -USE_PASS(quant_conv2d_dequant_fuse_pass); -USE_PASS(fillconstant_elementwisemul_fuse); -USE_PASS(shuffle_channel_detect_pass); -USE_PASS(delete_quant_dequant_op_pass); diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index bc2c0914..e81a8428 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -71,9 +71,9 @@ void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) { void PaddlePassBuilder::ClearPasses() { passes_.clear(); } const std::vector kTRTSubgraphPasses({ - "infer_clean_graph_pass", // - "conv_affine_channel_fuse_pass", // + "conv_affine_channel_fuse_pass", // "conv_eltwiseadd_affine_channel_fuse_pass", // + "shuffle_channel_detect_pass", // "quant_conv2d_dequant_fuse_pass", // "delete_quant_dequant_op_pass", // // "fc_fuse_pass", // @@ -90,7 +90,6 @@ const std::vector kTRTSubgraphPasses({ // The following passes works for Anakin sub-graph engine. const std::vector kAnakinSubgraphPasses({ - "infer_clean_graph_pass", // "quant_conv2d_dequant_fuse_pass", // "simplify_anakin_priorbox_detection_out_pass", // "fillconstant_elementwisemul_fuse", // @@ -104,8 +103,11 @@ const std::vector kAnakinSubgraphPasses({ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { passes_.assign({ - "infer_clean_graph_pass", // - // "identity_scale_op_clean_pass", // + // "identity_scale_op_clean_pass", // + "is_test_pass", // + "simplify_with_basic_ops_pass", // + "fc_fuse_pass", // + "fc_elementwise_layernorm_fuse_pass", // "conv_affine_channel_fuse_pass", // "conv_eltwiseadd_affine_channel_fuse_pass", // "conv_bn_fuse_pass", // @@ -125,6 +127,13 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { use_gpu_ = true; } +void GpuPassStrategy::EnableCUDNN() { + if (!use_cudnn_) { + passes_.insert(passes_.begin(), "cudnn_placement_pass"); + } + use_cudnn_ = true; +} + void GpuPassStrategy::EnableMKLDNN() { LOG(ERROR) << "GPU not support MKLDNN yet"; } @@ -140,10 +149,11 @@ void GpuPassStrategy::EnableNgraph() { CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) { // NOTE the large fusions should be located in the front, so that they will // not be damaged by smaller ones. 
- passes_.assign({"infer_clean_graph_pass", // + passes_.assign({"simplify_with_basic_ops_pass", // "attention_lstm_fuse_pass", // "seqconv_eltadd_relu_fuse_pass", // // "seqpool_concat_fuse_pass", // + "seqpool_cvm_concat_fuse_pass", // // "embedding_fc_lstm_fuse_pass", // "fc_lstm_fuse_pass", // "mul_lstm_fuse_pass", // @@ -163,6 +173,8 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) { use_gpu_ = false; } +void CpuPassStrategy::EnableCUDNN() { LOG(ERROR) << "CPU not support cuDNN"; } + void CpuPassStrategy::EnableMKLDNN() { // TODO(Superjomn) Consider the way to mix CPU with GPU. #ifdef PADDLE_WITH_MKLDNN @@ -178,8 +190,9 @@ void CpuPassStrategy::EnableMKLDNN() { "conv3d_bias_mkldnn_fuse_pass", // "conv_elementwise_add_mkldnn_fuse_pass", "conv_concat_relu_mkldnn_fuse_pass", - "conv_relu_mkldnn_fuse_pass", // - "conv_brelu_mkldnn_fuse_pass", // + "conv_relu_mkldnn_fuse_pass", // + "conv_leaky_relu_mkldnn_fuse_pass", // + "conv_relu6_mkldnn_fuse_pass", // // Disabled due to topology-dependent speed-up // "fc_mkldnn_pass" })) { diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 62b7ab30..69bc5cd7 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -72,7 +72,7 @@ class PaddlePassBuilder { protected: std::vector analysis_passes_{ - {"ir_graph_build_pass", "ir_analysis_pass", + {"ir_graph_build_pass", "ir_graph_clean_pass", "ir_analysis_pass", "ir_params_sync_among_devices_pass", "adjust_cudnn_workspace_size_pass", "inference_op_replace_pass"}}; std::vector passes_; @@ -85,6 +85,10 @@ class PassStrategy : public PaddlePassBuilder { explicit PassStrategy(const std::vector &passes) : PaddlePassBuilder(passes) {} + /** Enable the use of cuDNN kernel + */ + virtual void EnableCUDNN() {} + /** The MKLDNN control exists in both CPU and GPU mode, because there can be * still some CPU kernels running in CPU mode. */ @@ -124,6 +128,7 @@ class CpuPassStrategy : public PassStrategy { virtual ~CpuPassStrategy() = default; + void EnableCUDNN() override; void EnableNgraph() override; void EnableMKLDNN() override; void EnableMkldnnQuantizer() override; @@ -142,13 +147,18 @@ class GpuPassStrategy : public PassStrategy { explicit GpuPassStrategy(const GpuPassStrategy &other) : PassStrategy(other.AllPasses()) { use_gpu_ = true; + use_cudnn_ = other.use_cudnn_; } + void EnableCUDNN() override; void EnableNgraph() override; void EnableMKLDNN() override; void EnableMkldnnQuantizer() override; virtual ~GpuPassStrategy() = default; + + protected: + bool use_cudnn_{false}; }; extern const std::vector kTRTSubgraphPasses; diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index 4647fa03..8b379457 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -22,7 +22,7 @@ limitations under the License. 
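With `infer_clean_graph_pass` dropped from every strategy list (graph cleaning now runs as `ir_graph_clean_pass` among the analysis passes above), per-model tuning goes through the remaining IR passes. A hedged sketch, assuming the AppendPass/DeletePass helpers on PaddlePassBuilder; the pass names are taken from the lists in this patch:

```cpp
AnalysisConfig config;
config.SetModel("/path/to/model");                       // hypothetical path
auto *pass_builder = config.pass_builder();
pass_builder->DeletePass("fc_fuse_pass");                // drop one fusion pass
pass_builder->AppendPass("runtime_context_cache_pass");  // append another IR pass
```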
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/version.h" #include "paddle/fluid/platform/cpu_helper.h" -//#include "paddle/fluid/pybind/pybind.h" +#include "paddle/fluid/pybind/pybind.h" DEFINE_string(devices, "", "The devices to be used which is joined by comma."); DEFINE_bool(init_p2p, false, "Whether to init p2p."); diff --git a/paddle/fluid/inference/paddle_fluid.map b/paddle/fluid/inference/paddle_fluid.map index 7e5cae04..05935701 100644 --- a/paddle/fluid/inference/paddle_fluid.map +++ b/paddle/fluid/inference/paddle_fluid.map @@ -1,7 +1,8 @@ { global: *paddle*; - *Pass*; + *Pass*; + *profile*; local: *; }; diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt index d82b88a7..4f3da10f 100644 --- a/paddle/fluid/inference/tensorrt/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt @@ -1,5 +1,5 @@ nv_library(tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context boost) -nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto boost) +nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto device_context boost) nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader) nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine) add_subdirectory(plugin) diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 854007ce..b63b75f7 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -3,6 +3,7 @@ nv_library(tensorrt_converter SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc pad_op.cc split_op.cc prelu_op.cc leaky_relu_op.cc +shuffle_channel_op.cc swish_op.cc DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS @@ -42,3 +43,9 @@ nv_test(test_op_converter SRCS test_op_converter.cc DEPS # prelu_op) #nv_test(test_trt_leaky_relu_op SRCS test_leaky_relu_op.cc leaky_relu_op.cc # DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine activation_op) + +#nv_test(test_shuffle_channel_op SRCS test_shuffle_channel_op.cc shuffle_channel_op.cc +# DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine shuffle_channel_op) + +#nv_test(test_swish_op SRCS test_swish_op.cc swish_op.cc +# DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine activation_op tensorrt_plugin) diff --git a/paddle/fluid/inference/tensorrt/convert/activation_op.cc b/paddle/fluid/inference/tensorrt/convert/activation_op.cc index 5c2454fa..18de4486 100644 --- a/paddle/fluid/inference/tensorrt/convert/activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/activation_op.cc @@ -42,11 +42,20 @@ class ActivationOpConverter : public OpConverter { nvinfer1::IActivationLayer* layer = TRT_ENGINE_ADD_LAYER( engine_, Activation, *const_cast(input_tensor), op_pair->second); + +#if IS_TRT_VERSION_GE(5130) + // max(alpha, min(beta, x)) + if (op_type_ == "relu6") { + layer->setAlpha(0.); + layer->setBeta(6.); + } +#endif + auto output_name = op_desc.Output("Out")[0]; RreplenishLayerAndOutput(layer, op_type_, {output_name}, test_mode); if (op_desc.HasAttr("out_scale")) { -#if IS_TRT_VERSION_GE(5000) +#if IS_TRT_VERSION_GE(5130) float out_scale = 
boost::get(op_desc.GetAttr("out_scale")); engine_->SetTensorDynamicRange(layer->getOutput(0), out_scale); #endif @@ -63,6 +72,9 @@ const std::unordered_map {"relu", nvinfer1::ActivationType::kRELU}, {"sigmoid", nvinfer1::ActivationType::kSIGMOID}, {"tanh", nvinfer1::ActivationType::kTANH}, +#if IS_TRT_VERSION_GE(5130) + {"relu6", nvinfer1::ActivationType::kCLIP}, +#endif }; class ReluOpConverter : public ActivationOpConverter { @@ -80,6 +92,11 @@ class TanhOpConverter : public ActivationOpConverter { TanhOpConverter() { op_type_ = "tanh"; } }; +class Relu6OpConverter : public ActivationOpConverter { + public: + Relu6OpConverter() { op_type_ = "relu6"; } +}; + } // namespace tensorrt } // namespace inference } // namespace paddle @@ -87,3 +104,4 @@ class TanhOpConverter : public ActivationOpConverter { REGISTER_TRT_OP_CONVERTER(relu, ReluOpConverter); REGISTER_TRT_OP_CONVERTER(sigmoid, SigmoidOpConverter); REGISTER_TRT_OP_CONVERTER(tanh, TanhOpConverter); +REGISTER_TRT_OP_CONVERTER(relu6, Relu6OpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc index d9488684..25f0d866 100644 --- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc @@ -116,11 +116,10 @@ class BatchNormOpConverter : public OpConverter { scale_weights.get(), power_weights.get()); auto output_name = op_desc.Output("Y").front(); - engine_->weight_map[op_desc.Input("Bias").front()] = - std::move(combile_bias_tensor); - engine_->weight_map[op_desc.Input("Scale").front()] = - std::move(combile_scale_tensor); - + engine_->SetWeights(op_desc.Input("Bias").front(), + std::move(combile_bias_tensor)); + engine_->SetWeights(op_desc.Input("Scale").front(), + std::move(combile_scale_tensor)); RreplenishLayerAndOutput(layer, "pool2d", {output_name}, test_mode); } }; diff --git a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc index 71177e5e..cd28c6d9 100644 --- a/paddle/fluid/inference/tensorrt/convert/dropout_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/dropout_op.cc @@ -31,6 +31,20 @@ class DropoutOpConverter : public OpConverter { auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); float dropout_prob = boost::get(op_desc.GetAttr("dropout_prob")); + std::string downgrade_in_infer = ""; + if (op_desc.HasAttr("dropout_implementation")) { + downgrade_in_infer = + boost::get(op_desc.GetAttr("dropout_implementation")); + } + + if (!downgrade_in_infer.empty() && + downgrade_in_infer == "upscale_in_train") { + auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input1); + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "dropout", {output_name}, test_mode); + return; + } + platform::CPUPlace cpu_place; std::unique_ptr weight_tensor( new framework::LoDTensor()); @@ -52,8 +66,8 @@ class DropoutOpConverter : public OpConverter { nvinfer1::ScaleMode::kUNIFORM, shift_weights.get(), scale_weights.get(), power_weights.get()); - engine_->weight_map[op_desc.Output("Out").front() + "_dropout"] = - std::move(weight_tensor); + engine_->SetWeights(op_desc.Output("Out").front() + "_dropout", + std::move(weight_tensor)); auto output_name = op_desc.Output("Out")[0]; RreplenishLayerAndOutput(layer, "dropout", {output_name}, test_mode); diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index a888b080..c61dd753 100644 
--- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -61,7 +61,7 @@ class ElementwiseWeightOpConverter : public OpConverter { auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE; - std::vector dims_y = framework::vectorize2int(Y_t->dims()); + std::vector dims_y = framework::vectorize(Y_t->dims()); if (static_cast(dims_y.size()) == dims_x.nbDims + 1) { if (dims_y[0] == 1) dims_y.erase(dims_y.begin()); } diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index fb7b89b1..ea108d6a 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -72,7 +72,7 @@ class FcOpConverter : public OpConverter { PADDLE_ENFORCE_NOT_NULL(Y_v); auto* Y_t = Y_v->GetMutable(); // This may trigger a GPU->CPU copy, because TRT's weight can only be - // assigned from CPU memory, that can't be avoided. + // assigned from CPU memory, which can't be avoided. float* weight_data = nullptr; bool enable_int8 = boost::get(op_desc.HasAttr("enable_int8")); if (enable_int8) { @@ -131,7 +131,7 @@ class FcOpConverter : public OpConverter { *const_cast(X), n_output, tmp_weight.get(), bias.get()); - engine_->weight_map[op_desc.Input(w_name).front()] = std::move(tmp); + engine_->SetWeights(op_desc.Input(w_name).front(), std::move(tmp)); auto output_name = op_desc.Output("Out").front(); RreplenishLayerAndOutput(layer, "fc", {output_name}, test_mode); diff --git a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc index 7753fda0..f3c71400 100644 --- a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc @@ -35,7 +35,14 @@ class LeakyReluOpConverter : public OpConverter { PADDLE_ENFORCE(output_num == 1); // Get attrs float alpha = boost::get(op_desc.GetAttr("alpha")); + nvinfer1::ILayer* output_layer = nullptr; +#if IS_TRT_VERSION_GE(5100) + nvinfer1::IActivationLayer* layer = TRT_ENGINE_ADD_LAYER( + engine_, Activation, *input, nvinfer1::ActivationType::kLEAKY_RELU); + layer->setAlpha(alpha); + output_layer = layer; +#else platform::CPUPlace place; std::unique_ptr alpha_tensor( new framework::LoDTensor()); @@ -65,7 +72,7 @@ class LeakyReluOpConverter : public OpConverter { nvinfer1::ScaleMode::kUNIFORM, shift.get(), sub_scale.get(), power.get()); PADDLE_ENFORCE(nullptr != scale_relu_layer); - auto* output_layer = + output_layer = TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *(scale_layer->getOutput(0)), *(scale_relu_layer->getOutput(0)), nvinfer1::ElementWiseOperation::kSUM); @@ -74,8 +81,8 @@ class LeakyReluOpConverter : public OpConverter { std::string alpha_name = op_desc.Output("Out")[0] + "_alpha"; PADDLE_ENFORCE(engine_->weight_map.find(alpha_name) == engine_->weight_map.end()); - engine_->weight_map[alpha_name] = std::move(alpha_tensor); - + engine_->SetWeights(alpha_name, std::move(alpha_tensor)); +#endif auto output_name = op_desc.Output("Out")[0]; RreplenishLayerAndOutput(output_layer, "leaky_relu", {output_name}, test_mode); diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index f89b0d7e..3a2deae3 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -225,7 +225,7 @@ class OpConverter { return 0; \ } -#define USE_TRT_CONVERTER(op_type__) \ - extern 
int TouchConverterRegister_##op_type__(); \ - static int use_op_converter_trt_##op_type__ __attribute__((unused)) = \ +#define USE_TRT_CONVERTER(op_type__) \ + extern int TouchConverterRegister_##op_type__(); \ + static int use_op_converter_trt_##op_type__ UNUSED = \ TouchConverterRegister_##op_type__(); diff --git a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc index 01bcd03e..d327a743 100644 --- a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc @@ -55,8 +55,8 @@ class PReluOpConverter : public OpConverter { nvinfer1::IPluginLayer* layer = engine_->AddPlugin(&input, input_num, plugin); // keep alpha tensor to avoid release it's memory - engine_->weight_map[op_desc.Input("Alpha")[0]] = - std::move(alpha_tensor_temp); + engine_->SetWeights(op_desc.Input("Alpha")[0], + std::move(alpha_tensor_temp)); auto output_name = op_desc.Output("Out")[0]; RreplenishLayerAndOutput(layer, "prelu", {output_name}, test_mode); diff --git a/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc b/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc new file mode 100644 index 00000000..0f891e0f --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
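The `__attribute__((unused))` to `UNUSED` change above keeps the usual touch-registration idiom intact: a static dummy initializer forces the converter's translation unit to be linked in. The pattern distilled, with a concrete op name substituted for the macro parameter:

```cpp
// registration TU: the side effect of registering the converter runs once.
int TouchConverterRegister_relu() { /* register converter here */ return 0; }

// user TU: referencing the symbol pins the registration object file at link
// time; UNUSED (a portability macro over __attribute__((unused)) where
// supported) silences the -Wunused-variable warning on the dummy.
extern int TouchConverterRegister_relu();
static int use_op_converter_trt_relu UNUSED = TouchConverterRegister_relu();
```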
*/ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * ConcatOp + */ +class ShuffleChannelOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + auto* input = engine_->GetITensor(op_desc.Input("X")[0]); + auto input_dims = input->getDimensions(); + PADDLE_ENFORCE(input_dims.nbDims == 3); + int c = input_dims.d[0]; + int h = input_dims.d[1]; + int w = input_dims.d[2]; + int group = boost::get(op_desc.GetAttr("group")); + + auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); + nvinfer1::Dims4 reshape_dim(group, c / group, h, w); + layer->setReshapeDimensions(reshape_dim); + layer->setSecondTranspose({1, 0, 2, 3}); + auto* output = layer->getOutput(0); + + auto* reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *output); + nvinfer1::DimsCHW reshape_dim2(c, h, w); + reshape_layer->setReshapeDimensions(reshape_dim2); + + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(reshape_layer, "concat", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(shuffle_channel, ShuffleChannelOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/swish_op.cc b/paddle/fluid/inference/tensorrt/convert/swish_op.cc new file mode 100644 index 00000000..42f2008a --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/swish_op.cc @@ -0,0 +1,53 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
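The ShuffleChannelOpConverter above encodes channel shuffle as reshape to (group, c/group, h, w), transpose of the first two axes, then reshape back to (c, h, w); the `ConcatOp` comment and the "concat" layer label look like leftovers from the converter this file was copied from. A plain CPU reference of the same permutation, a sketch for checking outputs with CHW layout assumed:

```cpp
#include <cstring>

// out[oc] = in[(oc % group) * (c / group) + oc / group] for every (h, w).
void ShuffleChannelRef(const float* in, float* out, int c, int h, int w,
                       int group) {
  const int k = c / group;  // channels per group
  const int hw = h * w;
  for (int oc = 0; oc < c; ++oc) {
    const int ic = (oc % group) * k + oc / group;
    std::memcpy(out + oc * hw, in + ic * hw, hw * sizeof(float));
  }
}
```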
*/ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class SwishOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(4) << "convert fluid swish op to tensorrt layer"; + + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + int input_num = op_desc.Input("X").size(); + PADDLE_ENFORCE(input_num == 1); + auto* input = engine_->GetITensor(op_desc.Input("X")[0]); + // Get output + size_t output_num = op_desc.Output("Out").size(); + PADDLE_ENFORCE(output_num == 1); + // Get attrs + float beta = boost::get(op_desc.GetAttr("beta")); + + plugin::SwishPlugin* plugin = new plugin::SwishPlugin(beta); + + nvinfer1::IPluginLayer* layer = + engine_->AddPlugin(&input, input_num, plugin); + + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "swish", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(swish, SwishOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc index dd3dfb0b..f2dc5ba1 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc @@ -46,6 +46,8 @@ TEST(SigmoidOpConverter, main) { test_activation("sigmoid"); } TEST(TanhOpConverter, main) { test_activation("tanh"); } +TEST(Relu6OpConverter, main) { test_activation("relu6"); } + } // namespace tensorrt } // namespace inference } // namespace paddle @@ -53,3 +55,4 @@ TEST(TanhOpConverter, main) { test_activation("tanh"); } USE_OP(relu); USE_OP(sigmoid); USE_OP(tanh); +USE_OP(relu6); diff --git a/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc index 6b8e621b..81e905b9 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_dropout_op.cc @@ -34,6 +34,7 @@ TEST(DropoutOpConverter, main) { framework::OpDesc desc; int is_test = 1; float dropout_prob = 0.4; + std::string dropout_implementation = "upscale_in_train"; desc.SetType("dropout"); desc.SetInput("X", {"dropout-X"}); @@ -42,6 +43,8 @@ TEST(DropoutOpConverter, main) { desc.SetAttr("is_test", is_test); desc.SetAttr("dropout_prob", dropout_prob); + desc.SetAttr("dropout_implementation", dropout_implementation); + LOG(INFO) << "set OP"; validator.SetOp(*desc.Proto()); LOG(INFO) << "execute"; diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc index c5a41322..52655663 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc @@ -14,7 +14,7 @@ limitations under the License. 
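The `dropout_implementation` attribute added to the dropout test above exercises the converter change earlier in this patch: with `upscale_in_train`, training already rescales activations by 1/(1 - p), so inference is the identity (hence the bare Shuffle layer); with the default `downgrade_in_infer`, inference scales by (1 - p). A scalar sketch of the two inference-time behaviors:

```cpp
#include <string>

// Inference-time dropout, per element.
float DropoutInfer(float x, float dropout_prob, const std::string& impl) {
  if (impl == "upscale_in_train") return x;  // identity at inference time
  return x * (1.0f - dropout_prob);          // downgrade_in_infer
}
```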
*/ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" -#include +#include // NOLINT #include "paddle/fluid/framework/program_desc.h" namespace paddle { @@ -27,10 +27,8 @@ TEST(OpConverter, ConvertBlock) { auto* conv2d_op = block->AppendOp(); // init trt engine - cudaStream_t stream_; std::unique_ptr engine_; - PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); - engine_.reset(new TensorRTEngine(5, 1 << 15, stream_)); + engine_.reset(new TensorRTEngine(5, 1 << 15)); engine_->InitNetwork(); engine_->DeclareInput("conv2d-X", nvinfer1::DataType::kFLOAT, diff --git a/paddle/fluid/inference/tensorrt/convert/test_shuffle_channel_op.cc b/paddle/fluid/inference/tensorrt/convert/test_shuffle_channel_op.cc new file mode 100644 index 00000000..e3cc5273 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_shuffle_channel_op.cc @@ -0,0 +1,48 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(leaky_relu_op, test_leaky_relu) { + std::unordered_set parameters; + framework::Scope scope; + TRTConvertValidation validator(10, parameters, scope, 1000); + validator.DeclInputVar("sc_input", nvinfer1::DimsCHW(4, 2, 2)); + validator.DeclOutputVar("sc_out", nvinfer1::DimsCHW(4, 2, 2)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("shuffle_channel"); + desc.SetInput("X", {"sc_input"}); + desc.SetOutput("Out", {"sc_out"}); + int group = 2; + desc.SetAttr("group", group); + + validator.SetOp(*desc.Proto()); + + validator.Execute(1); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +// USE_OP(leaky_relu); +USE_OP(shuffle_channel); diff --git a/paddle/fluid/inference/tensorrt/convert/test_swish_op.cc b/paddle/fluid/inference/tensorrt/convert/test_swish_op.cc new file mode 100644 index 00000000..c15c79bb --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_swish_op.cc @@ -0,0 +1,47 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
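The swish test that follows drives SwishPlugin, which computes f(x) = x * sigmoid(beta * x); beta comes from the op attribute set in the test. A scalar reference of the same formula, matching the CUDA kernel added later in this patch:

```cpp
#include <cmath>

// swish(x) = x * sigmoid(beta * x) = x / (1 + exp(-beta * x))
float SwishRef(float x, float beta) {
  return x / (1.0f + std::exp(-beta * x));
}
```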
*/ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(swish_op, test_swish) { + std::unordered_set parameters; + framework::Scope scope; + TRTConvertValidation validator(10, parameters, scope, 1000); + validator.DeclInputVar("sw_input", nvinfer1::DimsCHW(3, 2, 2)); + validator.DeclOutputVar("sw_out", nvinfer1::DimsCHW(3, 2, 2)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("swish"); + desc.SetInput("X", {"sw_input"}); + desc.SetOutput("Out", {"sw_out"}); + + desc.SetAttr("beta", 2.0f); + + validator.SetOp(*desc.Proto()); + + validator.Execute(1); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(swish); diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index 388d83d8..97affafb 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -80,8 +80,7 @@ class TRTConvertValidation { if_add_batch_(if_add_batch), max_batch_size_(max_batch_size) { PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); - engine_.reset( - new TensorRTEngine(max_batch_size, workspace_size, false, nullptr, 0)); + engine_.reset(new TensorRTEngine(max_batch_size, workspace_size)); engine_->InitNetwork(); } diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index c5ac6f38..f806069b 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -35,8 +35,15 @@ void TensorRTEngine::Build(const DescType &paddle_model) { void TensorRTEngine::Execute(int batch_size, std::vector *buffers, cudaStream_t stream) { freshDeviceId(); + const std::thread::id tid = std::this_thread::get_id(); batch_size_ = batch_size; - infer_context_->enqueue(batch_size, buffers->data(), stream, nullptr); + if (infer_context_.find(tid) == infer_context_.end()) { + PADDLE_ENFORCE_NOT_NULL( + infer_engine_, + "You should build engine first and then set the context."); + infer_context_[tid].reset(infer_engine_->createExecutionContext()); + } + infer_context_[tid]->enqueue(batch_size, buffers->data(), stream, nullptr); cudaStreamSynchronize(stream); SetRuntimeBatch(batch_size); } @@ -51,7 +58,25 @@ void TensorRTEngine::FreezeNetwork() { // build engine. infer_builder_->setMaxBatchSize(max_batch_); infer_builder_->setMaxWorkspaceSize(max_workspace_); - if (enable_int8_) { + bool enable_fp16 = (precision_ == AnalysisConfig::Precision::kHalf); +#if IS_TRT_VERSION_GE(5000) + if (enable_fp16) { + bool support_fp16 = infer_builder_->platformHasFastFp16(); + infer_builder_->setFp16Mode(support_fp16); + if (!support_fp16) { + LOG(INFO) << "You specify FP16 mode, but the hardware do not support " + "FP16 speed up, use FP32 instead."; + } + } +#else + if (enable_fp16) + LOG(INFO) << "Using FP16 in Paddle-TRT must ensure that the version of TRT " + "is at least 5." 
+ "So, use FP32 to run."; +#endif + bool enable_int8 = (precision_ == AnalysisConfig::Precision::kInt8); + + if (enable_int8) { infer_builder_->setInt8Mode(true); if (calibrator_) { infer_builder_->setInt8Calibrator(calibrator_); @@ -91,8 +116,6 @@ void TensorRTEngine::FreezeNetwork() { infer_engine_.reset(infer_builder_->buildCudaEngine(*infer_network_)); PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!"); - - infer_context_.reset(infer_engine_->createExecutionContext()); } nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name, @@ -123,8 +146,8 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset, PADDLE_ENFORCE(!output->isNetworkInput()); infer_network_->markOutput(*output); PADDLE_ENFORCE(output->isNetworkOutput()); - // output buffers' size can only be decided latter, set zero here to mark this - // and will reset latter. + // output buffers' size can only be decided later, set zero here to mark this + // and will reset later. buffer_sizes_[name] = 0; } @@ -141,8 +164,8 @@ void TensorRTEngine::DeclareOutput(const std::string &name) { output->setName(name.c_str()); PADDLE_ENFORCE(!output->isNetworkInput()); infer_network_->markOutput(*output); - // output buffers' size can only be decided latter, set zero here to mark this - // and will reset latter. + // output buffers' size can only be decided later, set zero here to mark this + // and will reset later. buffer_sizes_[name] = 0; } @@ -167,20 +190,26 @@ float *TensorRTEngine::GetWeightCPUData(const std::string &name, framework::Tensor *weight_tensor, bool enable_int8, const std::vector &scale) { + static int name_suffix_counter = 0; + std::string name_suffix = std::to_string(name_suffix_counter); + std::string name_with_suffix = name + name_suffix; auto w_dims = weight_tensor->dims(); platform::CPUPlace cpu_place; - PADDLE_ENFORCE(!weight_map.count(name), - "During TRT Op converter: We set weight %s with the same name " - "twice into the weight_map", - name); - weight_map[name].reset(new framework::Tensor()); - weight_map[name]->Resize(weight_tensor->dims()); - TensorCopySync(*weight_tensor, cpu_place, weight_map[name].get()); - float *weight_data = weight_map[name]->mutable_data(cpu_place); + PADDLE_ENFORCE_EQ( + weight_map.count(name_with_suffix), 0, + "During TRT Op converter: We set weight %s with the same name " + "twice into the weight_map", + name_with_suffix); + weight_map[name_with_suffix].reset(new framework::Tensor()); + weight_map[name_with_suffix]->Resize(weight_tensor->dims()); + TensorCopySync(*weight_tensor, cpu_place, weight_map[name_with_suffix].get()); + float *weight_data = + weight_map[name_with_suffix]->mutable_data(cpu_place); + name_suffix_counter += 1; if (enable_int8) { // when the op is fc, scale's size should be 1 - // when the op is conv, the scale's size should be w_dims[0] + // when the op is conv, scale's size should be w_dims[0] bool valid_scale_size = (scale.size() == 1 || scale.size() == static_cast(w_dims[0])); PADDLE_ENFORCE(valid_scale_size, "TRT int8 quant: invalid scale size"); diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 80af463d..c1d95003 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -15,13 +15,16 @@ limitations under the License. 
*/ #pragma once #include +#include #include #include #include #include +#include #include #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/inference/engine.h" #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" @@ -38,7 +41,7 @@ class TRTInt8Calibrator; * TensorRT Engine. * * There are two alternative ways to use it, one is to build from a paddle - * protobuf model, another way is to manully construct the network. + * protobuf model, another way is to manually construct the network. */ class TensorRTEngine { using DescType = ::paddle::framework::proto::BlockDesc; @@ -61,12 +64,14 @@ class TensorRTEngine { nvinfer1::Weights w_; }; - TensorRTEngine(int max_batch, int max_workspace, bool enable_int8 = false, - TRTInt8Calibrator* calibrator = nullptr, int device_id = 0, - nvinfer1::ILogger& logger = NaiveLogger::Global()) + TensorRTEngine( + int max_batch, int max_workspace, + AnalysisConfig::Precision precision = AnalysisConfig::Precision::kFloat32, + TRTInt8Calibrator* calibrator = nullptr, int device_id = 0, + nvinfer1::ILogger& logger = NaiveLogger::Global()) : max_batch_(max_batch), max_workspace_(max_workspace), - enable_int8_(enable_int8), + precision_(precision), calibrator_(calibrator), device_id_(device_id), logger_(logger) {} @@ -86,11 +91,11 @@ class TensorRTEngine { infer_builder_.reset(createInferBuilder(&logger_)); infer_network_.reset(infer_builder_->createNetwork()); } - // After finishing adding ops, freeze this network and creates the executation + // After finishing adding ops, freeze this network and creates the execution // environment. void FreezeNetwork(); - // Add an input and set its name, data type and dimention. + // Add an input and set its name, data type and dimension. nvinfer1::ITensor* DeclareInput(const std::string& name, nvinfer1::DataType dtype, const nvinfer1::Dims& dim); @@ -125,7 +130,6 @@ class TensorRTEngine { &inference::Singleton::Global())); PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed when deserialize engine info.!"); - infer_context_.reset(infer_engine_->createExecutionContext()); } void SetRuntimeBatch(size_t batch_size); @@ -149,6 +153,16 @@ class TensorRTEngine { std::unordered_map> weight_map; + // When setting weight_map, a self-increasing suffix is needed for the names + // so as to avoid repeatedly setting weights with the same name. + void SetWeights(std::string w_name, + std::unique_ptr w_tensor) { + static int suffix_counter = 0; + std::string suffix = std::to_string(suffix_counter); + weight_map[w_name + suffix] = std::move(w_tensor); + suffix_counter += 1; + } + void ClearWeights() { for (auto& weight_pair : weight_map) { weight_pair.second.reset(nullptr); @@ -168,7 +182,7 @@ class TensorRTEngine { // the max memory size the engine uses int max_workspace_; - bool enable_int8_; + AnalysisConfig::Precision precision_; TRTInt8Calibrator* calibrator_; // batch size of the current data, will be updated each Executation. 
int batch_size_{-1}; @@ -197,7 +211,8 @@ class TensorRTEngine { infer_ptr infer_builder_; infer_ptr infer_network_; infer_ptr infer_engine_; - infer_ptr infer_context_; + std::unordered_map> + infer_context_; infer_ptr ihost_memory_; std::unordered_map quant_dynamic_range_; }; // class TensorRTEngine @@ -206,7 +221,7 @@ class TensorRTEngine { ((NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 + \ NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD) >= version) -// Add an layer__ into engine__ with args ARGS. +// Add a layer__ into engine__ with args ARGS. // For example: // // Reference @@ -216,8 +231,8 @@ class TensorRTEngine { // TensorRT has too many layers, so that is not wise to add member functions for // them, and an macro like this is more extensible when underlying TensorRT // library add new layer supports. -#define TRT_ENGINE_ADD_LAYER(engine__, layer__, ARGS...) \ - engine__->network()->add##layer__(ARGS); +#define TRT_ENGINE_ADD_LAYER(engine__, layer__, ...) \ + engine__->network()->add##layer__(__VA_ARGS__); class TRTEngineManager { public: @@ -231,12 +246,12 @@ class TRTEngineManager { return engines_.at(name).get(); } - TensorRTEngine* Create(std::string name, int max_batch, int max_workspace, - bool enable_int8 = false, - TRTInt8Calibrator* calibrator = nullptr, - int device_id = 0, - nvinfer1::ILogger& logger = NaiveLogger::Global()) { - auto* p = new TensorRTEngine(max_batch, max_workspace, enable_int8, + TensorRTEngine* Create( + std::string name, int max_batch, int max_workspace, + AnalysisConfig::Precision precision = AnalysisConfig::Precision::kFloat32, + TRTInt8Calibrator* calibrator = nullptr, int device_id = 0, + nvinfer1::ILogger& logger = NaiveLogger::Global()) { + auto* p = new TensorRTEngine(max_batch, max_workspace, precision, calibrator, device_id, logger); engines_[name].reset(p); return p; diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 170ca40d..292f5e1d 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -20,7 +20,11 @@ namespace tensorrt { // Just tell by the op_types. 
struct SimpleOpTypeSetTeller : public Teller { - SimpleOpTypeSetTeller() {} + SimpleOpTypeSetTeller() { +#if IS_TRT_VERSION_GE(5130) + teller_set.insert("relu6"); +#endif + } bool operator()(const std::string& op_type, const framework::OpDesc& desc) override { @@ -28,11 +32,27 @@ struct SimpleOpTypeSetTeller : public Teller { } private: - std::unordered_set teller_set{ - {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", - "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad", - "elementwise_add", "elementwise_mul", "dropout", "prelu", - "conv2d_transpose", "leaky_relu", "fc"}}; + std::unordered_set teller_set{{"mul", + "conv2d", + "pool2d", + "relu", + "softmax", + "sigmoid", + "depthwise_conv2d", + "batch_norm", + "concat", + "tanh", + "pad", + "elementwise_add", + "elementwise_mul", + "dropout", + "prelu", + "conv2d_transpose", + "leaky_relu", + "fc", + "shuffle_channel", + "swish", + "split"}}; }; bool OpTeller::Tell(const std::string& op_type, const framework::OpDesc& desc) { diff --git a/paddle/fluid/inference/tensorrt/op_teller.h b/paddle/fluid/inference/tensorrt/op_teller.h index 3363d77a..7ff1d474 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.h +++ b/paddle/fluid/inference/tensorrt/op_teller.h @@ -18,6 +18,7 @@ #include #include #include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/inference/tensorrt/engine.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index 709aa103..d01c5c82 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -1,5 +1,5 @@ nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu prelu_op_plugin.cu trt_plugin_factory.cc - avg_pool_op_plugin.cu + avg_pool_op_plugin.cu swish_op_plugin.cu DEPS enforce tensorrt_engine prelu) diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu index b8a044fe..84f938ee 100644 --- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu @@ -34,6 +34,7 @@ int PReluPlugin::initialize() { cudaMalloc(&p_gpu_weight_, sizeof(float) * weight_.size()); cudaMemcpy(p_gpu_weight_, weight_.data(), weight_.size() * sizeof(float), cudaMemcpyHostToDevice); + return 0; } nvinfer1::Dims PReluPlugin::getOutputDimensions(int index, diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu index b5503c3b..9e927ed6 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -27,50 +27,20 @@ SplitPlugin* CreateSplitPluginDeserialize(const void* buffer, size_t length) { } REGISTER_TRT_PLUGIN("split_plugin", CreateSplitPluginDeserialize); -// copied from operators::math::SplitFunctor template -__global__ void SplitKernel(const T* input_data, const int in_row, - const int in_col, const int* out_cols, - int out_cols_size, T** outputs_data) { - int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - int curr_segment = 0; - int curr_offset = out_cols[0]; - for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) { - int curr_col_offset = out_cols[curr_segment + 1]; - while (curr_col_offset <= tid_x) { - curr_offset = curr_col_offset; - ++curr_segment; - curr_col_offset = out_cols[curr_segment + 1]; - } - - int 
local_col = tid_x - curr_offset; - int segment_width = curr_col_offset - curr_offset; - T* output_ptr = outputs_data[curr_segment]; - if (output_ptr != nullptr) { - int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) - output_ptr[tid_y * segment_width + local_col] = - input_data[tid_y * in_col + tid_x]; - } - } -} - -template -__global__ void SplitKernel(const T* input_data, const int in_row, - const int in_col, const int fixed_out_col, - T** outputs_data) { - int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) { - int split = tid_x / fixed_out_col; - int in_offset = tid_x - split * fixed_out_col; - T* output_ptr = outputs_data[split]; - if (output_ptr != nullptr) { - int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) - output_ptr[tid_y * fixed_out_col + in_offset] = - input_data[tid_y * in_col + tid_x]; +__device__ int upper_bound(T const* vals, int n, T const& key) { + int i = 0; + while (n > 0) { + int m = n / 2; + int j = i + m; + if (!(key < vals[j])) { + i = j + 1; + n -= m + 1; + } else { + n = m; } } + return i; } nvinfer1::Dims SplitPlugin::getOutputDimensions( @@ -101,80 +71,60 @@ int SplitPlugin::initialize() { if (output_length_[i] != output_length_[0]) { same_shape_ = false; } - segment_offsets.push_back(segment_offsets.back() + - output_length_[i] * inner_cols_); + segment_offsets.push_back(segment_offsets.back() + output_length_[i]); } - inner_cols_ *= dims.d[axis_]; + axis_shape_ = dims.d[axis_]; d_segment_offsets_ = segment_offsets; segment_offsets_ = std::move(segment_offsets); d_output_ptrs_.resize(this->getNbOutputs(), nullptr); return 0; } +// The following part of the code refers to onnx-tensorrt +// https://github.com/onnx/onnx-tensorrt/blob/master/Split.cu template -inline void Split(cudaStream_t stream, const bool same_shape, - const int outer_rows, const int inner_cols, - const std::vector& segment_offsets, - const int* d_segment_offsets, const T* input, T** outputs) { - const int kThreadsPerBlock = 1024; - const int kMaxBlocks = 65535; - int block_cols = kThreadsPerBlock; - if (inner_cols < kThreadsPerBlock) { // block_cols is aligned by 32. 
- block_cols = ((inner_cols + 31) >> 5) << 5; - } - int block_rows = kThreadsPerBlock / block_cols; - dim3 block_size = dim3(block_cols, block_rows, 1); - - int grid_cols = - std::min((inner_cols + block_cols - 1) / block_cols, kMaxBlocks); - int grid_rows = - std::min(kMaxBlocks / grid_cols, std::max(outer_rows / block_rows, 1)); - dim3 grid_size = dim3(grid_cols, grid_rows, 1); - - if (same_shape) { - SplitKernel<<>>( - input, outer_rows, inner_cols, segment_offsets[1], outputs); - } else { - SplitKernel<<>>( - input, outer_rows, inner_cols, d_segment_offsets, - static_cast(segment_offsets.size()), outputs); +__global__ void split_kernel(int nsegment, + int const* __restrict__ segment_offsets, + T const* __restrict__ idata, T* const* odatas, + int inner_cols, int axis_shape, int outer_rows) { + int x0 = threadIdx.x + blockIdx.x * blockDim.x; + int src_y0 = threadIdx.y + blockIdx.y * blockDim.y; + int z0 = threadIdx.z + blockIdx.z * blockDim.z; + for (int z = z0; z < outer_rows; z += blockDim.z * gridDim.z) { + for (int src_y = src_y0; src_y < axis_shape; + src_y += blockDim.y * gridDim.y) { + for (int x = x0; x < inner_cols; x += blockDim.x * gridDim.x) { + int segment = upper_bound(segment_offsets, nsegment, src_y) - 1; + int dst_y = src_y - segment_offsets[segment]; + int dst_ny = segment_offsets[segment + 1] - segment_offsets[segment]; + odatas[segment][x + inner_cols * (dst_y + dst_ny * z)] = + idata[x + inner_cols * (src_y + axis_shape * z)]; + } + } } } int SplitPlugin::enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) { + const int* d_segment_offsets_ptr = + thrust::raw_pointer_cast(&d_segment_offsets_[0]); float const* input_ptr = reinterpret_cast(inputs[0]); - if (((batchSize == 1 && axis_ == 0) || axis_ == -1) && - this->getNbOutputs() < 10) { - float** output_ptrs = reinterpret_cast(outputs); - int data_type_size = (this->getDataType() == nvinfer1::DataType::kFLOAT) - ? 
sizeof(float) - : sizeof(__half); - for (int i = 0; i < this->getNbOutputs(); ++i) { - PADDLE_ENFORCE( - cudaMemcpyAsync( - output_ptrs[i], input_ptr + segment_offsets_[i], - (segment_offsets_[i + 1] - segment_offsets_[i]) * data_type_size, - cudaMemcpyDeviceToDevice, stream) == cudaSuccess); - } - } else { - outer_rows_ *= batchSize; - const int* d_segment_offsets_ptr = - thrust::raw_pointer_cast(&d_segment_offsets_[0]); - float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs_[0]); - PADDLE_ENFORCE(cudaMemcpyAsync(output_ptrs, outputs, - this->getNbOutputs() * sizeof(float*), - cudaMemcpyHostToDevice, - stream) == cudaSuccess); - if (this->getDataType() == nvinfer1::DataType::kFLOAT) { - Split(stream, same_shape_, outer_rows_, inner_cols_, segment_offsets_, - d_segment_offsets_ptr, input_ptr, output_ptrs); - } else { - Split(stream, same_shape_, outer_rows_, inner_cols_, segment_offsets_, - d_segment_offsets_ptr, (__half*)input_ptr, // NOLINT - (__half**)output_ptrs); // NOLINT - } - } + float* const* h_odatas = reinterpret_cast(outputs); + float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs_[0]); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpyAsync( + output_ptrs, h_odatas, d_output_ptrs_.size() * sizeof(float*), + cudaMemcpyHostToDevice, stream)); + + int outer_rows = outer_rows_ * batchSize; + + dim3 block(32, 16); + dim3 grid(std::min((inner_cols_ - 1) / block.x + 1, 65535u), + std::min((axis_shape_ - 1) / block.y + 1, 65535u), + std::min((outer_rows_ - 1) / block.z + 1, 65535u)); + + split_kernel<<>>( + d_segment_offsets_.size(), d_segment_offsets_ptr, input_ptr, output_ptrs, + inner_cols_, axis_shape_, outer_rows); return cudaGetLastError() != cudaSuccess; } diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h index cbb72590..b2a7bc3b 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h @@ -66,6 +66,7 @@ class SplitPlugin : public PluginTensorRT { int axis_; int outer_rows_; int inner_cols_; + int axis_shape_; bool same_shape_; std::vector output_length_; std::vector segment_offsets_; diff --git a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu new file mode 100644 index 00000000..864ca5f0 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu @@ -0,0 +1,76 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
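The rewritten split kernel above (adapted from onnx-tensorrt, as its comment notes) locates each row's destination with a device-side binary search: the output segment holding axis index y is upper_bound(offsets, n, y) - 1. A host-side equivalent using the standard library pins down the semantics:

```cpp
#include <algorithm>
#include <vector>

// offsets are prefix sums of the per-output lengths along the split axis,
// e.g. lengths {2, 3, 1} -> offsets {0, 2, 5, 6}.
int SegmentOf(const std::vector<int>& offsets, int y) {
  auto it = std::upper_bound(offsets.begin(), offsets.end(), y);
  return static_cast<int>(it - offsets.begin()) - 1;
}
// SegmentOf({0, 2, 5, 6}, 4) == 1: axis rows 2..4 land in the second output.
```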
+ +#include +#include +#include +#include "glog/logging.h" +#include "paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +SwishPlugin *CreateSwishPluginDeserialize(const void *buffer, size_t length) { + return new SwishPlugin(buffer, length); +} +REGISTER_TRT_PLUGIN("swish_plugin", CreateSwishPluginDeserialize); + +int SwishPlugin::initialize() { return 0; } + +nvinfer1::Dims SwishPlugin::getOutputDimensions(int index, + const nvinfer1::Dims *inputDims, + int nbInputs) { + assert(nbInputs == 1); + assert(index < this->getNbOutputs()); + nvinfer1::Dims const &input_dims = inputDims[0]; + nvinfer1::Dims output_dims = input_dims; + return output_dims; +} +__global__ void swish_kernel(int num, const float *input, float *output, + float beta) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < num) { +#if __CUDA_ARCH__ >= 350 + output[index] = + __ldg(input + index) / (1.0f + expf(-beta * __ldg(input + index))); +#else + output[index] = input[index] / (1.0f + expf(-beta * input[index])); +#endif + } +} + +int SwishPlugin::enqueue(int batch_size, const void *const *inputs, + void **outputs, void *workspace, cudaStream_t stream) { + // input dims is CHW. + const auto &input_dims = this->getInputDims(0); + const float *input = reinterpret_cast(inputs[0]); + float *output = reinterpret_cast(outputs)[0]; + int num = batch_size; + for (int i = 0; i < input_dims.nbDims; i++) { + num *= input_dims.d[i]; + } + int threads = 1024; + int blocks = (num + threads - 1) / threads; + swish_kernel<<>>(num, input, output, beta_); + + return cudaGetLastError() != cudaSuccess; +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h new file mode 100644 index 00000000..6c3cd038 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h @@ -0,0 +1,72 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +class SwishPlugin : public PluginTensorRT { + private: + float beta_; + + protected: + size_t getSerializationSize() override { + return getBaseSerializationSize() + SerializedSize(beta_); + } + + // TRT will call this func when we need to serialize the configuration of + // tensorrt. + // It should not be called by users. 
+ void serialize(void *buffer) override { + SerializeValue(&buffer, getPluginType()); + serializeBase(buffer); + SerializeValue(&buffer, beta_); + } + + public: + explicit SwishPlugin(const float beta) : beta_(beta) {} + + // It was used for tensorrt deserialization. + // It should not be called by users. + SwishPlugin(void const *serialData, size_t serialLength) { + deserializeBase(serialData, serialLength); + DeserializeValue(&serialData, &serialLength, &beta_); + } + ~SwishPlugin() {} + int initialize() override; + + SwishPlugin *clone() const override { return new SwishPlugin(beta_); } + + const char *getPluginType() const override { return "swish_plugin"; } + int getNbOutputs() const override { return 1; } + nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs, + int nbInputDims) override; + int enqueue(int batchSize, const void *const *inputs, void **outputs, + void *workspace, cudaStream_t stream) override; +}; + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h index 139c7559..ed825801 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h @@ -68,7 +68,7 @@ class TrtPluginRegistrar { #define REGISTER_TRT_PLUGIN_UNIQ(ctr, name, deserialize_func) \ static paddle::inference::tensorrt::plugin::TrtPluginRegistrar \ - trt_plugin_registrar##ctr __attribute__((unused)) = \ + trt_plugin_registrar##ctr UNUSED = \ paddle::inference::tensorrt::plugin::TrtPluginRegistrar( \ name, deserialize_func) diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 083e1bc5..e064d01b 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -43,6 +43,17 @@ function(inference_analysis_api_int8_test_run TARGET_NAME test_binary model_dir --iterations=2) endfunction() +function(inference_analysis_api_object_dection_int8_test_run TARGET_NAME test_binary model_dir data_path) + inference_analysis_test_run(${TARGET_NAME} + COMMAND ${test_binary} + ARGS --infer_model=${model_dir}/model + --infer_data=${data_path} + --warmup_batch_size=10 + --batch_size=300 + --paddle_num_threads=${CPU_NUM_THREADS_ON_CI} + --iterations=1) +endfunction() + function(inference_analysis_api_test_with_fake_data_build TARGET_NAME filename) inference_analysis_test_build(${TARGET_NAME} SRCS ${filename} EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}) @@ -113,11 +124,6 @@ set(LAC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lac") download_model_and_data(${LAC_INSTALL_DIR} "lac_model.tar.gz" "lac_data.txt.tar.gz") inference_analysis_api_test(test_analyzer_lac ${LAC_INSTALL_DIR} analyzer_lac_tester.cc) -# MM DNN -set(MM_DNN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mm_dnn") -download_model_and_data(${MM_DNN_INSTALL_DIR} "MM_DNN_model.tar.gz" "MM_DNN_data.txt.tar.gz") -inference_analysis_api_test(test_analyzer_mm_dnn ${MM_DNN_INSTALL_DIR} analyzer_mm_dnn_tester.cc) - # Pyramid DNN set(PYRAMID_DNN_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/pyramid_dnn") download_model_and_data(${PYRAMID_DNN_INSTALL_DIR} "PyramidDNN_model.tar.gz" "PyramidDNN_data.txt.tar.gz") @@ -232,12 +238,15 @@ if(WITH_MKLDNN) inference_analysis_api_int8_test_run(test_analyzer_int8_googlenet ${INT8_IMG_CLASS_TEST_APP} ${INT8_GOOGLENET_MODEL_DIR} ${IMAGENET_DATA_PATH}) ### Object detection 
models - set(PASCALVOC_DATA_PATH "${INT8_DATA_DIR}/pascalvoc_val_head_100.bin") + set(PASCALVOC_DATA_PATH "${INT8_DATA_DIR}/pascalvoc_val_head_300.bin") set(INT8_OBJ_DETECT_TEST_APP "test_analyzer_int8_object_detection") set(INT8_OBJ_DETECT_TEST_APP_SRC "analyzer_int8_object_detection_tester.cc") # download dataset if necessary - download_int8_data(${INT8_DATA_DIR} "pascalvoc_val_head_100.tar.gz") + download_int8_data(${INT8_DATA_DIR} "pascalvoc_val_head_300.tar.gz") + + # download small demo set of pascalvoc for testing local userdata preprocessing + download_int8_data(${INT8_DATA_DIR} "pascalvoc_small.tar.gz") # build test binary to be used in subsequent tests inference_analysis_api_int8_test_build(${INT8_OBJ_DETECT_TEST_APP} ${INT8_OBJ_DETECT_TEST_APP_SRC}) @@ -245,7 +254,7 @@ if(WITH_MKLDNN) # mobilenet-ssd int8 set(INT8_MOBILENET_SSD_MODEL_DIR "${INT8_DATA_DIR}/mobilenet-ssd") download_int8_data(${INT8_MOBILENET_SSD_MODEL_DIR} "mobilenet_ssd_int8_model.tar.gz" ) - inference_analysis_api_int8_test_run(test_analyzer_int8_mobilenet_ssd ${INT8_OBJ_DETECT_TEST_APP} ${INT8_MOBILENET_SSD_MODEL_DIR} ${PASCALVOC_DATA_PATH}) + inference_analysis_api_object_dection_int8_test_run(test_analyzer_int8_mobilenet_ssd ${INT8_OBJ_DETECT_TEST_APP} ${INT8_MOBILENET_SSD_MODEL_DIR} ${PASCALVOC_DATA_PATH}) endif() @@ -255,17 +264,23 @@ download_model_and_data(${BERT_INSTALL_DIR} "bert_emb128_model.tar.gz" "bert_dat inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR} analyzer_bert_tester.cc) if(WITH_GPU AND TENSORRT_FOUND) - set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt") + set(TRT_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/trt_models") if (NOT EXISTS ${TRT_MODEL_INSTALL_DIR}) - inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_test_models.tar.gz") + inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "trt_inference_test_models.tar.gz") endif() inference_analysis_test(trt_mobilenet_test SRCS trt_mobilenet_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models) + ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) inference_analysis_test(trt_resnet50_test SRCS trt_resnet50_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models) + ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) inference_analysis_test(trt_resnext_test SRCS trt_resnext_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_test_models) + ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) + inference_analysis_test(trt_fc_prelu_test SRCS trt_fc_prelu_test.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) + inference_analysis_test(trt_cascade_rcnn_test SRCS trt_cascade_rcnn_test.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) endif() diff --git a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc index 406c028a..f679e122 100644 --- a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc @@ -230,7 +230,7 @@ TEST(Analyzer_bert, compare_determine) { inputs); } -void verify_transfer_scope_cache(bool is_static = false) { +TEST(Analyzer_bert, transfer_scope_cache) { AnalysisConfig 
config; SetConfig(&config); @@ -251,11 +251,6 @@ void verify_transfer_scope_cache(bool is_static = false) { threads.emplace_back([&, i]() { std::getline(fin, line); ParseLine(line, &input); -#ifdef PADDLE_WITH_MKLDNN - // Use static method to handle transfer_scope_cache() - // TODO(intel) explicit session id setting will be deprecated. - if (is_static) platform::set_cur_mkldnn_session_id(1); -#endif predictor->Run(input, &output, FLAGS_batch_size); global_transfer_scope_cache.insert( &paddle::framework::global_transfer_scope_cache()); @@ -266,31 +261,12 @@ void verify_transfer_scope_cache(bool is_static = false) { threads.clear(); std::vector().swap(input); } -#ifdef PADDLE_WITH_MKLDNN - if (is_static) { - // Use static method to do transfer_scope_cache() instead of thread_local - // so paddle::framework::global_transfer_data_cache() should be 1 - PADDLE_ENFORCE(global_transfer_scope_cache.size(), 1); - PADDLE_ENFORCE(global_transfer_data_cache.size(), 1); - } else { -#endif - // Since paddle::framework::global_transfer_scope_cache() and - // paddle::framework::global_transfer_data_cache() are thread_local, - // their pointer should be different among different thread id. - PADDLE_ENFORCE(global_transfer_scope_cache.size(), threads_num); - PADDLE_ENFORCE(global_transfer_data_cache.size(), threads_num); -#ifdef PADDLE_WITH_MKLDNN - } -#endif + // Since paddle::framework::global_transfer_scope_cache() and + // paddle::framework::global_transfer_data_cache() are thread_local, + // their pointer should be different among different thread id. + PADDLE_ENFORCE(global_transfer_scope_cache.size(), threads_num); + PADDLE_ENFORCE(global_transfer_data_cache.size(), threads_num); } -TEST(Analyzer_bert, threadlocal_transfer_scope_cache) { - verify_transfer_scope_cache(); -} -#ifdef PADDLE_WITH_MKLDNN -TEST(Analyzer_bert, static_transfer_scope_cache) { - verify_transfer_scope_cache(true); -} -#endif } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc index 83bf99ec..78c87b6d 100644 --- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc @@ -262,33 +262,6 @@ void compare(bool use_mkldnn = false) { reinterpret_cast(&cfg), input_slots_all); } -// Compare result of NativeConfig and AnalysisConfig with memory optimization. -TEST(Analyzer_dam, compare_with_static_memory_optim) { - // The small dam will core in CI, but works in local. - if (FLAGS_max_turn_num == 9) { - AnalysisConfig cfg, cfg1; - DataRecord data(FLAGS_infer_data, FLAGS_batch_size); - - std::vector> input_slots_all; - SetInput(&input_slots_all); - // Run the first time to force to update memory cache - SetConfig(&cfg); - cfg.EnableMemoryOptim(true, true /*force update*/); - - CompareNativeAndAnalysis( - reinterpret_cast(&cfg), - input_slots_all); - - // Run second time to use the memory cache and perform memory optimization. - SetConfig(&cfg1); - cfg1.EnableMemoryOptim(true, false /*do not force update*/); - - CompareNativeAndAnalysis( - reinterpret_cast(&cfg1), - input_slots_all); - } -} - TEST(Analyzer_dam, compare_with_dynamic_memory_optim) { // The small dam will core in CI, but works in local. 
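
An aside on GetWarmupData() above: it refuses to assemble a warmup set larger than the test data it can draw from. A minimal standalone sketch of that guard, with hypothetical names, not the tester's actual helper:

```cpp
#include <cstddef>
#include <stdexcept>
#include <string>

// Mirrors the PADDLE_ENFORCE_LE check in GetWarmupData(): the requested
// number of warmup images must fit into iterations * batch_size.
void CheckWarmupSize(int num_images, size_t iterations, size_t batch_size) {
  if (static_cast<size_t>(num_images) > iterations * batch_size) {
    throw std::invalid_argument(
        "The requested quantization warmup data size " +
        std::to_string(num_images) + " is bigger than all test data size.");
  }
}
```
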
if (FLAGS_max_turn_num == 9) { diff --git a/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc b/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc index ccb50d40..72da7c48 100644 --- a/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_int8_object_detection_tester.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/inference/tests/api/tester_helper.h" +// setting iterations to 0 means processing the whole dataset namespace paddle { namespace inference { namespace analysis { @@ -143,8 +144,8 @@ std::shared_ptr> GetWarmupData( int32_t num_images = FLAGS_warmup_batch_size) { int test_data_batch_size = test_data[0][0].shape[0]; auto iterations = test_data.size(); - PADDLE_ENFORCE( - static_cast(num_images) <= iterations * test_data_batch_size, + PADDLE_ENFORCE_LE( + static_cast(num_images), iterations * test_data_batch_size, "The requested quantization warmup data size " + std::to_string(num_images) + " is bigger than all test data size."); @@ -234,8 +235,8 @@ std::shared_ptr> GetWarmupData( static_cast(difficult.data.data()) + objects_accum); objects_accum = objects_accum + objects_remain; } - PADDLE_ENFORCE( - static_cast(num_objects) == static_cast(objects_accum), + PADDLE_ENFORCE_EQ( + static_cast(num_objects), static_cast(objects_accum), "The requested num of objects " + std::to_string(num_objects) + " is the same as objects_accum."); @@ -273,7 +274,8 @@ TEST(Analyzer_int8_mobilenet_ssd, quantization) { q_cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data); q_cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(FLAGS_warmup_batch_size); - CompareQuantizedAndAnalysis(&cfg, &q_cfg, input_slots_all); + // 0 is avg_cost, 1 is top1_acc, 2 is top5_acc or mAP + CompareQuantizedAndAnalysis(&cfg, &q_cfg, input_slots_all, 2); } } // namespace analysis diff --git a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc deleted file mode 100644 index 70478d69..00000000 --- a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc +++ /dev/null @@ -1,268 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/inference/tests/api/tester_helper.h" - -namespace paddle { -namespace inference { - -struct DataRecord { - std::vector> query, title; - std::vector lod1, lod2; - size_t batch_iter{0}, batch_size{1}, num_samples; // total number of samples - DataRecord() = default; - explicit DataRecord(const std::string &path, int batch_size = 1) - : batch_size(batch_size) { - Load(path); - } - DataRecord NextBatch() { - DataRecord data; - size_t batch_end = batch_iter + batch_size; - // NOTE skip the final batch, if no enough data is provided. 
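
The NOTE above is the whole contract of the removed NextBatch(): when fewer than batch_size samples remain, it returns an empty batch rather than a partial one. A generic sketch of the pattern, assuming a flat sample vector without the LoD bookkeeping of the real DataRecord:

```cpp
#include <cstddef>
#include <vector>

// Advance a cursor by batch_size; return an empty batch when not enough
// samples remain, so the trailing partial batch is skipped.
template <typename T>
std::vector<T> NextBatch(const std::vector<T> &samples, std::size_t *cursor,
                         std::size_t batch_size) {
  std::vector<T> batch;
  std::size_t batch_end = *cursor + batch_size;
  if (batch_end <= samples.size()) {
    batch.assign(samples.begin() + *cursor, samples.begin() + batch_end);
  }
  *cursor = batch_end;
  return batch;
}
```
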
- if (batch_end <= query.size()) { - GetInputPerBatch(query, &data.query, &data.lod1, batch_iter, batch_end); - GetInputPerBatch(title, &data.title, &data.lod2, batch_iter, batch_end); - } - batch_iter += batch_size; - return data; - } - void Load(const std::string &path) { - std::ifstream file(path); - std::string line; - int num_lines = 0; - while (std::getline(file, line)) { - num_lines++; - std::vector data; - split(line, '\t', &data); - // load query data - std::vector query_data; - split_to_int64(data[0], ' ', &query_data); - // load title data - std::vector title_data; - split_to_int64(data[1], ' ', &title_data); - query.push_back(std::move(query_data)); - title.push_back(std::move(title_data)); - } - num_samples = num_lines; - } -}; - -void PrepareInputs(std::vector *input_slots, DataRecord *data, - int batch_size) { - PaddleTensor lod_query_tensor, lod_title_tensor; - lod_query_tensor.name = "left"; - lod_title_tensor.name = "right"; - auto one_batch = data->NextBatch(); - // assign data - TensorAssignData(&lod_query_tensor, one_batch.query, one_batch.lod1); - TensorAssignData(&lod_title_tensor, one_batch.title, one_batch.lod2); - // Set inputs. - input_slots->assign({lod_query_tensor, lod_title_tensor}); - for (auto &tensor : *input_slots) { - tensor.dtype = PaddleDType::INT64; - } -} - -void SetConfig(AnalysisConfig *cfg) { - cfg->SetModel(FLAGS_infer_model); - cfg->DisableGpu(); - cfg->SwitchSpecifyInputNames(); - cfg->SwitchIrOptim(); -} - -void SetInput(std::vector> *inputs) { - DataRecord data(FLAGS_infer_data, FLAGS_batch_size); - std::vector input_slots; - int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1; - LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size; - for (int bid = 0; bid < epoch; ++bid) { - PrepareInputs(&input_slots, &data, FLAGS_batch_size); - (*inputs).emplace_back(input_slots); - } -} - -// Easy for profiling independently. -void profile(bool use_mkldnn = false) { - AnalysisConfig cfg; - SetConfig(&cfg); - std::vector> outputs; - - if (use_mkldnn) { - cfg.EnableMKLDNN(); - cfg.pass_builder()->AppendPass("fc_mkldnn_pass"); - } - - std::vector> input_slots_all; - SetInput(&input_slots_all); - TestPrediction(reinterpret_cast(&cfg), - input_slots_all, &outputs, FLAGS_num_threads); - - if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { - PADDLE_ENFORCE_GT(outputs.size(), 0); - PADDLE_ENFORCE_EQ(outputs.back().size(), 2UL); - for (auto &output : outputs.back()) { - size_t size = GetSize(output); - PADDLE_ENFORCE_GT(size, 0); - float *result = static_cast(output.data.data()); - // output is probability, which is in (-1, 1). 
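
For reference, DataRecord::Load() above splits each input line on '\t' into a query and a title field, each holding space-separated int64 ids. A self-contained sketch of that parsing; the removed tester used its own split() and split_to_int64() helpers instead:

```cpp
#include <cstdint>
#include <sstream>
#include <string>
#include <vector>

// Split a "query_ids<TAB>title_ids" line into two int64 id vectors.
// Hypothetical helper, shown only to illustrate the data layout.
void ParseQueryTitleLine(const std::string &line, std::vector<int64_t> *query,
                         std::vector<int64_t> *title) {
  size_t tab = line.find('\t');
  std::istringstream qs(line.substr(0, tab));
  std::istringstream ts(tab == std::string::npos ? "" : line.substr(tab + 1));
  for (int64_t id; qs >> id;) query->push_back(id);
  for (int64_t id; ts >> id;) title->push_back(id);
}
```
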
- for (size_t i = 0; i < size; i++) { - EXPECT_GT(result[i], -1); - EXPECT_LT(result[i], 1); - } - } - } -} - -TEST(Analyzer_MM_DNN, profile) { profile(); } -#ifdef PADDLE_WITH_MKLDNN -TEST(Analyzer_MM_DNN, profile_mkldnn) { profile(true /* use_mkldnn */); } -#endif - -// Check the fuse status -TEST(Analyzer_MM_DNN, fuse_statis) { - AnalysisConfig cfg; - SetConfig(&cfg); - - int num_ops; - auto predictor = CreatePaddlePredictor(cfg); - auto fuse_statis = GetFuseStatis( - static_cast(predictor.get()), &num_ops); -} - -// Compare result of NativeConfig and AnalysisConfig -void compare(bool use_mkldnn = false) { - AnalysisConfig cfg; - SetConfig(&cfg); - - if (use_mkldnn) { - cfg.EnableMKLDNN(); - cfg.pass_builder()->AppendPass("fc_mkldnn_pass"); - } - - std::vector> input_slots_all; - SetInput(&input_slots_all); - CompareNativeAndAnalysis( - reinterpret_cast(&cfg), input_slots_all); -} - -TEST(Analyzer_MM_DNN, compare) { compare(); } -#ifdef PADDLE_WITH_MKLDNN -TEST(Analyzer_MM_DNN, compare_mkldnn) { compare(true /* use_mkldnn */); } -#endif - -// Compare Deterministic result -TEST(Analyzer_MM_DNN, compare_determine) { - AnalysisConfig cfg; - SetConfig(&cfg); - - std::vector> input_slots_all; - SetInput(&input_slots_all); - CompareDeterministic(reinterpret_cast(&cfg), - input_slots_all); -} - -#ifdef PADDLE_WITH_MKLDNN -void TestMkldnnCacheClear(int mkldnn_input_shape_cache_capacity, - std::vector> *outputs) { - AnalysisConfig config; - SetConfig(&config); - config.EnableMKLDNN(); - config.SetMkldnnCacheCapacity(mkldnn_input_shape_cache_capacity); - - std::vector input; - auto predictor = CreatePaddlePredictor(config); - - int sample_num = 10; - DataRecord data(FLAGS_infer_data, FLAGS_batch_size); - outputs->resize(sample_num); - - for (int i = 0; i < sample_num; i++) { - PrepareInputs(&input, &data, FLAGS_batch_size); - predictor->Run(input, &(*outputs)[i], 1); - } -} - -TEST(Analyzer_MM_DNN, mkldnn_cache_clear) { - std::vector> outputs, cache_outputs; - // 0 means do not use cache clear strategy. - TestMkldnnCacheClear(0, &outputs); - // 4 means use cache clear strategy, and the - // mkldnn_input_shape_cache_capacity is 4. - TestMkldnnCacheClear(4, &cache_outputs); - // compare the result. - for (size_t i = 0; i < outputs.size(); i++) { - CompareResult(outputs[i], cache_outputs[i]); - } -} - -void TestMkldnnShapeBlobSize(int mkldnn_input_shape_cache_capacity) { - AnalysisConfig config; - SetConfig(&config); - config.EnableMKLDNN(); - config.SwitchUseFeedFetchOps(false); - // Since AnalysisPredictor::Run() will reset cur_mkldnn_session_id to default - // before its finished, we use AnalysisPredictor::ZeroCopyRun() here to check - // the mkldnn_shape_blob_size. 
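
The removed TestMkldnnShapeBlobSize() below keys the MKL-DNN shape-blob cache on a string concatenated from every input dimension. A sketch of that key construction, with an illustrative name:

```cpp
#include <string>
#include <vector>

// Flatten all input shapes into a '-'-separated string, the form used to
// key the shape-blob cache when cache clearing is enabled.
std::string MakeShapeCacheKey(const std::vector<std::vector<int>> &shapes) {
  std::string key;
  for (const auto &shape : shapes)
    for (int d : shape) key += std::to_string(d) + "-";
  return key;  // e.g. "1-3-300-300-" for one NCHW input
}
```
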
- if (mkldnn_input_shape_cache_capacity > 0) { - platform::set_cur_mkldnn_session_id( - platform::kMKLDNNSessionID_CacheClearing); - platform::set_cur_input_shape_cache_capacity( - mkldnn_input_shape_cache_capacity); - } - - std::vector input; - auto predictor = CreatePaddlePredictor(config); - - int sample_num = 10; - DataRecord data(FLAGS_infer_data, FLAGS_batch_size); - - auto &pool = platform::DeviceContextPool::Instance(); - auto *dev_ctx = dynamic_cast( - pool.Get(platform::CPUPlace())); - // clear before test - dev_ctx->ResetBlobMap(); - - for (int i = 0; i < sample_num; i++) { - PrepareInputs(&input, &data, FLAGS_batch_size); - ConvertPaddleTensorToZeroCopyTensor(predictor.get(), input); - if (mkldnn_input_shape_cache_capacity > 0) { - std::stringstream ss; - for (size_t i = 0; i < input.size(); i++) { - for (size_t j = 0; j < input[i].shape.size(); ++j) { - ss << input[i].shape[j] << "-"; - } - } - platform::set_cur_input_shape_str(ss.str()); - } - predictor->ZeroCopyRun(); - } - if (mkldnn_input_shape_cache_capacity > 0) { - PADDLE_ENFORCE_EQ(dev_ctx->GetShapeBlobSize(), - mkldnn_input_shape_cache_capacity); - } else { - PADDLE_ENFORCE_EQ(dev_ctx->GetShapeBlobSize(), 1UL); - } -} - -TEST(Analyzer_MM_DNN, mkldnn_shape_blob_size) { - // 0 means do not use cache clear strategy. - TestMkldnnShapeBlobSize(0); - // 4 means use cache clear strategy, and the - // mkldnn_input_shape_cache_capacity is 4. - TestMkldnnShapeBlobSize(4); -} -#endif - -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc index 5ee848c3..e3f8b835 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc @@ -146,7 +146,7 @@ TEST(Analyzer_seq_conv1, fuse_statis) { ASSERT_TRUE(fuse_statis.count("seqconv_eltadd_relu_fuse")); EXPECT_EQ(fuse_statis.at("fc_fuse"), 2); EXPECT_EQ(fuse_statis.at("seqconv_eltadd_relu_fuse"), 6); - EXPECT_EQ(num_ops, 32); + EXPECT_EQ(num_ops, 31); } // Compare result of NativeConfig and AnalysisConfig diff --git a/paddle/fluid/inference/tests/api/full_pascalvoc_test_preprocess.py b/paddle/fluid/inference/tests/api/full_pascalvoc_test_preprocess.py index 2ca8e582..d703a129 100644 --- a/paddle/fluid/inference/tests/api/full_pascalvoc_test_preprocess.py +++ b/paddle/fluid/inference/tests/api/full_pascalvoc_test_preprocess.py @@ -11,7 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
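
The preprocess() routine touched below resizes to 300x300, transposes HWC to CHW, reorders RGB to BGR, subtracts the per-channel mean and scales by 0.007843 (about 1/127.5). A C++ sketch of the normalization step alone, assuming the buffer already holds CHW float data; names are illustrative:

```cpp
#include <cstddef>
#include <vector>

// Per-channel mean subtraction followed by the fixed scale used in
// full_pascalvoc_test_preprocess.py. mean_value has one entry per channel,
// e.g. {127.5f, 127.5f, 127.5f}.
void NormalizeCHW(std::vector<float> *img, int channels, int height, int width,
                  const float *mean_value) {
  const float scale = 0.007843f;
  for (int c = 0; c < channels; ++c) {
    const std::size_t offset = static_cast<std::size_t>(c) * height * width;
    for (int i = 0; i < height * width; ++i) {
      (*img)[offset + i] = ((*img)[offset + i] - mean_value[c]) * scale;
    }
  }
}
```
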
-import xml.etree.ElementTree as ET + +import xml.etree.ElementTree from PIL import Image import numpy as np import os @@ -21,6 +22,7 @@ import tarfile import StringIO import hashlib import tarfile +import argparse DATA_URL = "http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar" DATA_DIR = os.path.expanduser("~/.cache/paddle/dataset/pascalvoc/") @@ -28,8 +30,8 @@ TAR_FILE = "VOCtest_06-Nov-2007.tar" TAR_PATH = os.path.join(DATA_DIR, TAR_FILE) RESIZE_H = 300 RESIZE_W = 300 -mean_value = [127.5, 127.5, 127.5] -ap_version = '11point' +MEAN_VALUE = [127.5, 127.5, 127.5] +AP_VERSION = '11point' DATA_OUT = 'pascalvoc_full.bin' DATA_OUT_PATH = os.path.join(DATA_DIR, DATA_OUT) BIN_TARGETHASH = "f6546cadc42f5ff13178b84ed29b740b" @@ -40,10 +42,8 @@ BIN_FULLSIZE = 5348678856 def preprocess(img): img_width, img_height = img.size - img = img.resize((RESIZE_W, RESIZE_H), Image.ANTIALIAS) img = np.array(img) - # HWC to CHW if len(img.shape) == 3: img = np.swapaxes(img, 1, 2) @@ -51,12 +51,92 @@ def preprocess(img): # RBG to BGR img = img[[2, 1, 0], :, :] img = img.astype('float32') - img_mean = np.array(mean_value)[:, np.newaxis, np.newaxis].astype('float32') + img_mean = np.array(MEAN_VALUE)[:, np.newaxis, np.newaxis].astype('float32') img -= img_mean img = img * 0.007843 return img +def convert_pascalvoc_local2bin(args): + data_dir = os.path.expanduser(args.data_dir) + label_fpath = os.path.join(data_dir, args.label_file) + flabel = open(label_fpath) + label_list = [line.strip() for line in flabel] + + img_annotation_list_path = os.path.join(data_dir, args.img_annotation_list) + flist = open(img_annotation_list_path) + lines = [line.strip() for line in flist] + + output_file_path = os.path.join(data_dir, args.output_file) + f1 = open(output_file_path, "w+b") + f1.seek(0) + image_nums = len(lines) + f1.write(np.array(image_nums).astype('int64').tobytes()) + + boxes = [] + lbls = [] + difficults = [] + object_nums = [] + + for line in lines: + image_path, label_path = line.split() + image_path = os.path.join(data_dir, image_path) + label_path = os.path.join(data_dir, label_path) + + im = Image.open(image_path) + if im.mode == 'L': + im = im.convert('RGB') + im_width, im_height = im.size + + im = preprocess(im) + np_im = np.array(im) + f1.write(np_im.astype('float32').tobytes()) + + # layout: label | xmin | ymin | xmax | ymax | difficult + bbox_labels = [] + root = xml.etree.ElementTree.parse(label_path).getroot() + + objects = root.findall('object') + objects_size = len(objects) + object_nums.append(objects_size) + + for object in objects: + bbox_sample = [] + # start from 1 + bbox_sample.append( + float(label_list.index(object.find('name').text))) + bbox = object.find('bndbox') + difficult = float(object.find('difficult').text) + bbox_sample.append(float(bbox.find('xmin').text) / im_width) + bbox_sample.append(float(bbox.find('ymin').text) / im_height) + bbox_sample.append(float(bbox.find('xmax').text) / im_width) + bbox_sample.append(float(bbox.find('ymax').text) / im_height) + bbox_sample.append(difficult) + bbox_labels.append(bbox_sample) + + bbox_labels = np.array(bbox_labels) + if len(bbox_labels) == 0: continue + + lbls.extend(bbox_labels[:, 0]) + boxes.extend(bbox_labels[:, 1:5]) + difficults.extend(bbox_labels[:, -1]) + + f1.write(np.array(object_nums).astype('uint64').tobytes()) + f1.write(np.array(lbls).astype('int64').tobytes()) + f1.write(np.array(boxes).astype('float32').tobytes()) + f1.write(np.array(difficults).astype('int64').tobytes()) + f1.close() + + object_nums_sum 
= sum(object_nums) + target_size = 8 + image_nums * 3 * args.resize_h * args.resize_h * 4 + image_nums * 8 + object_nums_sum * ( + 8 + 4 * 4 + 8) + if (os.path.getsize(output_file_path) == target_size): + print("Success! \nThe output binary file can be found at: ", + output_file_path) + else: + print("Conversion failed!") + + def print_processbar(done_percentage): done_filled = done_percentage * '=' empty_filled = (100 - done_percentage) * ' ' @@ -65,7 +145,7 @@ def print_processbar(done_percentage): sys.stdout.flush() -def convert_pascalvoc(tar_path, data_out_path): +def convert_pascalvoc_tar2bin(tar_path, data_out_path): print("Start converting ...\n") images = {} gt_labels = {} @@ -87,12 +167,12 @@ def convert_pascalvoc(tar_path, data_out_path): f_test = tar.extractfile(TEST_LIST_KEY).read() lines = f_test.split('\n') del lines[-1] - line_len = len(lines) - per_percentage = line_len / 100 + image_nums = len(lines) + per_percentage = image_nums / 100 f1 = open(data_out_path, "w+b") f1.seek(0) - f1.write(np.array(line_len).astype('int64').tobytes()) + f1.write(np.array(image_nums).astype('int64').tobytes()) for tarInfo in tar: if tarInfo.isfile(): tmp_filename = tarInfo.name @@ -115,7 +195,7 @@ def convert_pascalvoc(tar_path, data_out_path): # layout: label | xmin | ymin | xmax | ymax | difficult bbox_labels = [] - root = ET.fromstring(gt_labels[name_prefix]) + root = xml.etree.ElementTree.fromstring(gt_labels[name_prefix]) objects = root.findall('object') objects_size = len(objects) @@ -179,9 +259,48 @@ def run_convert(): retry = retry + 1 else: download_pascalvoc(DATA_URL, DATA_DIR, TAR_TARGETHASH, TAR_PATH) - convert_pascalvoc(TAR_PATH, DATA_OUT_PATH) - print("Success! \nThe binary file can be found at %s\n" % DATA_OUT_PATH) + convert_pascalvoc_tar2bin(TAR_PATH, DATA_OUT_PATH) + print("Success!\nThe binary file can be found at %s\n" % DATA_OUT_PATH) + + +def main_pascalvoc_preprocess(args): + parser = argparse.ArgumentParser( + description="Convert the full pascalvoc val set or local data to binary file." 
+ ) + parser.add_argument( + '--choice', choices=['local', 'VOC_test_2007'], required=True) + parser.add_argument( + "--data_dir", + default="/home/li/AIPG-Paddle/paddle/build/third_party/inference_demo/int8v2/pascalvoc_small", + type=str, + help="Dataset root directory") + parser.add_argument( + "--img_annotation_list", + type=str, + default="test_100.txt", + help="A file containing the image file path and relevant annotation file path" + ) + parser.add_argument( + "--label_file", + type=str, + default="label_list", + help="List the labels in the same sequence as denoted in the annotation file" + ) + parser.add_argument( + "--output_file", + type=str, + default="pascalvoc_small.bin", + help="File path of the output binary file") + parser.add_argument("--resize_h", type=int, default=RESIZE_H) + parser.add_argument("--resize_w", type=int, default=RESIZE_W) + parser.add_argument("--mean_value", type=str, default=MEAN_VALUE) + parser.add_argument("--ap_version", type=str, default=AP_VERSION) + args = parser.parse_args() + if args.choice == 'local': + convert_pascalvoc_local2bin(args) + elif args.choice == 'VOC_test_2007': + run_convert() if __name__ == "__main__": - run_convert() + main_pascalvoc_preprocess(sys.argv) diff --git a/paddle/fluid/inference/tests/api/test_detection_dataset_preprocess.py b/paddle/fluid/inference/tests/api/test_detection_dataset_preprocess.py new file mode 100644 index 00000000..4576d60a --- /dev/null +++ b/paddle/fluid/inference/tests/api/test_detection_dataset_preprocess.py @@ -0,0 +1,35 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
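
On the size check in convert_pascalvoc_local2bin() above: the expected file size is the 8-byte image count, the raw float32 images, one uint64 object count per image, and per object an int64 label, four float32 box coordinates and an int64 difficult flag. Note the script multiplies resize_h by itself, which is exact only for square targets; the sketch below takes height and width separately:

```cpp
#include <cstdint>

// Expected size of the generated binary, matching the target_size formula
// in the Python script (assuming resize_h == resize_w there).
int64_t ExpectedBinSize(int64_t image_nums, int64_t resize_h, int64_t resize_w,
                        int64_t object_nums_sum) {
  return 8                                           // image count (int64)
         + image_nums * 3 * resize_h * resize_w * 4  // float32 image data
         + image_nums * 8                            // per-image object counts
         + object_nums_sum * (8 + 4 * 4 + 8);        // label + bbox + difficult
}
```
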
+ +from __future__ import print_function + +from full_pascalvoc_test_preprocess import main_pascalvoc_preprocess +import numpy as np +import paddle.fluid.core as core +import paddle.fluid as fluid +import unittest +import os + + +class Test_Preprocess(unittest.TestCase): + def test_local_convert(self): + os.system("python full_pascalvoc_test_preprocess.py --choice=local") + + def test_online_convert(self): + os.system( + "python full_pascalvoc_test_preprocess.py --choice=VOC_test_2007") + + +if __name__ == '__main__': + unittest.main() diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 61cf10c3..463fc4b1 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -42,6 +42,8 @@ DEFINE_string(infer_model, "", "model path"); DEFINE_string(infer_data, "", "data file"); DEFINE_string(refer_result, "", "reference result for comparison"); DEFINE_int32(batch_size, 1, "batch size"); +DEFINE_bool(enable_fp32, true, "Enable FP32 type prediction"); +DEFINE_bool(enable_int8, true, "Enable INT8 type prediction"); DEFINE_int32(warmup_batch_size, 100, "batch size for quantization warmup"); // setting iterations to 0 means processing the whole dataset DEFINE_int32(iterations, 0, "number of batches to process"); @@ -128,6 +130,14 @@ void CompareResult(const std::vector &outputs, } break; } + case PaddleDType::UINT8: { + uint8_t *pdata = static_cast(out.data.data()); + uint8_t *pdata_ref = static_cast(ref_out.data.data()); + for (size_t j = 0; j < size; ++j) { + EXPECT_EQ(pdata_ref[j], pdata[j]); + } + break; + } } } } @@ -172,6 +182,15 @@ void CompareResult(const std::vector &outputs, } break; } + case PaddleDType::UINT8: { + uint8_t *pdata = static_cast(out.data.data()); + uint8_t *pdata_ref = ref_out.data(&place, &ref_size); + EXPECT_EQ(size, ref_size); + for (size_t j = 0; j < size; ++j) { + EXPECT_EQ(pdata_ref[j], pdata[j]); + } + break; + } } } } @@ -286,6 +305,8 @@ void ConvertPaddleTensorToZeroCopyTensor( ZeroCopyTensorAssignData(tensor.get(), input.data); } else if (input.dtype == PaddleDType::INT32) { ZeroCopyTensorAssignData(tensor.get(), input.data); + } else if (input.dtype == PaddleDType::UINT8) { + ZeroCopyTensorAssignData(tensor.get(), input.data); } else { LOG(ERROR) << "unsupported feed type " << input.dtype; } @@ -443,64 +464,108 @@ void TestPrediction(const PaddlePredictor::Config *config, } } -void SummarizeAccuracy(float avg_acc1_fp32, float avg_acc1_int8) { +void SummarizeAccuracy(float avg_acc_fp32, float avg_acc_int8, + int compared_idx) { + PADDLE_ENFORCE_LE(compared_idx, 2, + "Compare either top1 accuracy or mAP (top5), the " + "compared_idx is out of range"); + PADDLE_ENFORCE_GE(compared_idx, 1, + "Compare either top1 accuracy or mAP (top5), the " + "compared_idx is out of range"); + std::string prefix = (compared_idx == 1) ? "top1_accuracy " : "mAP "; LOG(INFO) << "--- Accuracy summary --- "; - LOG(INFO) << "Accepted top1 accuracy drop threshold: " - << FLAGS_quantized_accuracy - << ". (condition: (FP32_top1_acc - INT8_top1_acc) <= threshold)"; - LOG(INFO) << "FP32: avg top1 accuracy: " << std::fixed << std::setw(6) - << std::setprecision(4) << avg_acc1_fp32; - LOG(INFO) << "INT8: avg top1 accuracy: " << std::fixed << std::setw(6) - << std::setprecision(4) << avg_acc1_int8; + LOG(INFO) << "Accepted " << prefix + << "drop threshold: " << FLAGS_quantized_accuracy + << ". 
(condition: (FP32_" << prefix << " - INT8_" << prefix + << ") <= threshold)"; + LOG(INFO) << "FP32: avg " << prefix << std::fixed << std::setw(6) + << std::setprecision(4) << avg_acc_fp32; + LOG(INFO) << "INT8: avg " << prefix << std::fixed << std::setw(6) + << std::setprecision(4) << avg_acc_int8; +} + +void SummarizePerformance(const char *title, float sample) { + CHECK_GT(sample, 0.0); + auto throughput = 1000.0 / sample; + LOG(INFO) << title << ": avg fps: " << std::fixed << std::setw(6) + << std::setprecision(4) << throughput << ", avg latency: " << sample + << " ms"; } void SummarizePerformance(float sample_latency_fp32, float sample_latency_int8) { - // sample latency in ms - auto throughput_fp32 = 1000.0 / sample_latency_fp32; - auto throughput_int8 = 1000.0 / sample_latency_int8; - LOG(INFO) << "--- Performance summary --- "; - LOG(INFO) << "FP32: avg fps: " << std::fixed << std::setw(6) - << std::setprecision(4) << throughput_fp32 - << ", avg latency: " << sample_latency_fp32 << " ms"; - LOG(INFO) << "INT8: avg fps: " << std::fixed << std::setw(6) - << std::setprecision(4) << throughput_int8 - << ", avg latency: " << sample_latency_int8 << " ms"; + if (FLAGS_enable_fp32) SummarizePerformance("FP32", sample_latency_fp32); + if (FLAGS_enable_int8) SummarizePerformance("INT8", sample_latency_int8); } -void CompareTopAccuracy( - const std::vector> &output_slots_quant, - const std::vector> &output_slots_ref) { - if (output_slots_quant.size() == 0 || output_slots_ref.size() == 0) +float CompareAccuracyOne( + const std::vector> &output_slots, + int compared_idx) { + if (output_slots.size() == 0) throw std::invalid_argument( - "CompareTopAccuracy: output_slots vector is empty."); - - float total_accs1_quant{0}; - float total_accs1_ref{0}; - for (size_t i = 0; i < output_slots_quant.size(); ++i) { - PADDLE_ENFORCE(output_slots_quant[i].size() >= 2UL); - PADDLE_ENFORCE(output_slots_ref[i].size() >= 2UL); - // second output: acc_top1 - if (output_slots_quant[i][1].lod.size() > 0 || - output_slots_ref[i][1].lod.size() > 0) - throw std::invalid_argument( - "CompareTopAccuracy: top1 accuracy output has nonempty LoD."); - if (output_slots_quant[i][1].dtype != paddle::PaddleDType::FLOAT32 || - output_slots_ref[i][1].dtype != paddle::PaddleDType::FLOAT32) + "CompareAccuracy: output_slots vector is empty."); + + float total_accs{0}; + + for (size_t i = 0; i < output_slots.size(); ++i) { + switch (compared_idx) { + case 1: + PADDLE_ENFORCE_GE( + output_slots[i].size(), 2UL, + "To achieve top 1 accuracy, output_slots_quant[i].size()>=2"); + break; + case 2: + PADDLE_ENFORCE_GE( + output_slots[i].size(), 2UL, + "To achieve top 1 accuracy, output_slots_ref[i].size()>=2"); + break; + default: + throw std::invalid_argument( + "CompareAccuracy: compared_idx is out of range."); + } + + if (output_slots[i][compared_idx].lod.size() > 0) + throw std::invalid_argument("CompareAccuracy: output has nonempty LoD."); + + if (output_slots[i][compared_idx].dtype != paddle::PaddleDType::FLOAT32) throw std::invalid_argument( - "CompareTopAccuracy: top1 accuracy output is of a wrong type."); - total_accs1_quant += - *static_cast(output_slots_quant[i][1].data.data()); - total_accs1_ref += - *static_cast(output_slots_ref[i][1].data.data()); - } - float avg_acc1_quant = total_accs1_quant / output_slots_quant.size(); - float avg_acc1_ref = total_accs1_ref / output_slots_ref.size(); - - SummarizeAccuracy(avg_acc1_ref, avg_acc1_quant); - CHECK_GT(avg_acc1_ref, 0.0); - CHECK_GT(avg_acc1_quant, 0.0); - CHECK_LE(avg_acc1_ref - 
avg_acc1_quant, FLAGS_quantized_accuracy); + "CompareAccuracy: output is of a wrong type."); + + total_accs += + *static_cast(output_slots[i][compared_idx].data.data()); + } + + CHECK_GT(output_slots.size(), 0); + + return total_accs / output_slots.size(); +} + +void CompareAccuracy( + const std::vector> &output_slots_quant, + const std::vector> &output_slots_ref, + int compared_idx) { + if ((FLAGS_enable_fp32 && FLAGS_enable_int8) && + (output_slots_quant.size() == 0 || output_slots_ref.size()) == 0) + throw std::invalid_argument( + "CompareAccuracy: output_slots vector is empty."); + + float avg_acc_quant = 0.0; + float avg_acc_ref = 0.0; + + if (FLAGS_enable_int8) + avg_acc_quant = CompareAccuracyOne(output_slots_quant, compared_idx); + + if (FLAGS_enable_fp32) + avg_acc_ref = CompareAccuracyOne(output_slots_ref, compared_idx); + + SummarizeAccuracy(avg_acc_ref, avg_acc_quant, compared_idx); + + if (FLAGS_enable_fp32) CHECK_GT(avg_acc_ref, 0.0); + + if (FLAGS_enable_int8) CHECK_GT(avg_acc_quant, 0.0); + + if (FLAGS_enable_fp32 && FLAGS_enable_int8) + CHECK_LE(avg_acc_ref - avg_acc_quant, FLAGS_quantized_accuracy); } void CompareDeterministic( @@ -529,14 +594,15 @@ void CompareNativeAndAnalysis( std::vector> native_outputs, analysis_outputs; TestOneThreadPrediction(config, inputs, &native_outputs, false); TestOneThreadPrediction(config, inputs, &analysis_outputs, true); - PADDLE_ENFORCE(native_outputs.size() > 0, "Native output is empty."); - PADDLE_ENFORCE(analysis_outputs.size() > 0, "Analysis output is empty."); + PADDLE_ENFORCE_GT(native_outputs.size(), 0, "Native output is empty."); + PADDLE_ENFORCE_GT(analysis_outputs.size(), 0, "Analysis output is empty."); CompareResult(analysis_outputs.back(), native_outputs.back()); } void CompareQuantizedAndAnalysis( const AnalysisConfig *config, const AnalysisConfig *qconfig, - const std::vector> &inputs) { + const std::vector> &inputs, + const int compared_idx = 1) { PADDLE_ENFORCE_EQ(inputs[0][0].shape[0], FLAGS_batch_size, "Input data has to be packed batch by batch."); LOG(INFO) << "FP32 & INT8 prediction run: batch_size " << FLAGS_batch_size @@ -547,19 +613,25 @@ void CompareQuantizedAndAnalysis( PrintConfig(cfg, true); std::vector> analysis_outputs; float sample_latency_fp32{-1}; - TestOneThreadPrediction(cfg, inputs, &analysis_outputs, true, VarType::FP32, - &sample_latency_fp32); + + if (FLAGS_enable_fp32) { + TestOneThreadPrediction(cfg, inputs, &analysis_outputs, true, VarType::FP32, + &sample_latency_fp32); + } LOG(INFO) << "--- INT8 prediction start ---"; auto *qcfg = reinterpret_cast(qconfig); PrintConfig(qcfg, true); std::vector> quantized_outputs; float sample_latency_int8{-1}; - TestOneThreadPrediction(qcfg, inputs, &quantized_outputs, true, VarType::INT8, - &sample_latency_int8); + if (FLAGS_enable_int8) { + TestOneThreadPrediction(qcfg, inputs, &quantized_outputs, true, + VarType::INT8, &sample_latency_int8); + } SummarizePerformance(sample_latency_fp32, sample_latency_int8); - CompareTopAccuracy(quantized_outputs, analysis_outputs); + + CompareAccuracy(quantized_outputs, analysis_outputs, compared_idx); } void CompareNativeAndAnalysis( diff --git a/paddle/fluid/inference/tests/api/trt_cascade_rcnn_test.cc b/paddle/fluid/inference/tests/api/trt_cascade_rcnn_test.cc new file mode 100644 index 00000000..35be7db5 --- /dev/null +++ b/paddle/fluid/inference/tests/api/trt_cascade_rcnn_test.cc @@ -0,0 +1,62 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "paddle/fluid/inference/tests/api/trt_test_helper.h" + +namespace paddle { +namespace inference { + +TEST(TensorRT, cascade_rcnn) { + std::string model_dir = FLAGS_infer_model + "/cascade_rcnn"; + AnalysisConfig config; + int batch_size = 1; + config.EnableUseGpu(100, 0); + config.SetModel(model_dir + "/model", model_dir + "/params"); + config.SwitchUseFeedFetchOps(false); + config.EnableTensorRtEngine(1 << 30, batch_size, 40, + AnalysisConfig::Precision::kFloat32, false); + + auto predictor = CreatePaddlePredictor(config); + + int channels = 3; + int height = 640; + int width = 640; + int input_num = batch_size * channels * height * width; + float *input = new float[input_num]; + memset(input, 1.0, input_num * sizeof(float)); + + float *im_shape = new float[3]; + im_shape[0] = 3.0; + im_shape[1] = 640.0; + im_shape[2] = 640.0; + + auto input_names = predictor->GetInputNames(); + + auto input_t = predictor->GetInputTensor(input_names[0]); + input_t->Reshape({batch_size, channels, height, width}); + input_t->copy_from_cpu(input); + + auto input_t1 = predictor->GetInputTensor(input_names[1]); + input_t1->Reshape({batch_size, 3}); + input_t1->copy_from_cpu(im_shape); + + ASSERT_TRUE(predictor->ZeroCopyRun()); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/trt_fc_prelu_test.cc b/paddle/fluid/inference/tests/api/trt_fc_prelu_test.cc new file mode 100644 index 00000000..2ee75f90 --- /dev/null +++ b/paddle/fluid/inference/tests/api/trt_fc_prelu_test.cc @@ -0,0 +1,58 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "paddle/fluid/inference/tests/api/trt_test_helper.h" + +namespace paddle { +namespace inference { + +TEST(TensorRT_fc, compare) { + std::string model_dir = FLAGS_infer_model + "/fc_uint8"; + compare(model_dir, /* use_tensorrt */ true); + // Open it when need. 
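
One caveat on the input setup in trt_cascade_rcnn_test.cc above: memset() writes its int argument byte-by-byte, so memset(input, 1.0, input_num * sizeof(float)) fills each float with the bit pattern 0x01010101 rather than the value 1.0f. A sketch of a fill that produces genuine ones:

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// std::fill assigns the float value 1.0f to every element; memset would
// instead write the byte 0x01 into every byte of the buffer.
std::vector<float> MakeOnes(std::size_t n) {
  std::vector<float> buf(n);
  std::fill(buf.begin(), buf.end(), 1.0f);
  return buf;
}
```

The test still exercises the engine end to end; the literal just does not mean what it suggests.
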
+ // profile(model_dir, /* use_analysis */ true, FLAGS_use_tensorrt); +} + +TEST(ZeroCopyTensor, uint8) { + std::string model_dir = FLAGS_infer_model + "/" + "fc_uint8"; + AnalysisConfig config; + config.EnableUseGpu(100, 0); + config.SetModel(model_dir); + config.SwitchUseFeedFetchOps(false); + config.EnableProfile(); + + std::vector> inputs_all; + auto predictor = CreatePaddlePredictor(config); + auto input_names = predictor->GetInputNames(); + auto name2shape = predictor->GetInputTensorShape(); + + int batch_size = 1; + int length = 4; + int input_num = batch_size * length; + uint8_t *input = new uint8_t[input_num]; + memset(input, 1, input_num * sizeof(uint8_t)); + auto input_t = predictor->GetInputTensor(input_names[0]); + input_t->Reshape({batch_size, length}); + input_t->copy_from_cpu(input); + input_t->type(); + + ASSERT_TRUE(predictor->ZeroCopyRun()); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc b/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc index 14539a9d..1dbdcccf 100644 --- a/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc +++ b/paddle/fluid/inference/tests/api/trt_mobilenet_test.cc @@ -32,6 +32,7 @@ TEST(AnalysisPredictor, use_gpu) { std::string model_dir = FLAGS_infer_model + "/" + "mobilenet"; AnalysisConfig config; config.EnableUseGpu(100, 0); + config.EnableCUDNN(); config.SetModel(model_dir); config.pass_builder()->TurnOnDebug(); diff --git a/paddle/fluid/inference/tests/api/trt_resnet50_test.cc b/paddle/fluid/inference/tests/api/trt_resnet50_test.cc index 7dfcbb0d..9f70a58a 100644 --- a/paddle/fluid/inference/tests/api/trt_resnet50_test.cc +++ b/paddle/fluid/inference/tests/api/trt_resnet50_test.cc @@ -23,7 +23,7 @@ namespace inference { TEST(resnet50, compare_continuous_input) { std::string model_dir = FLAGS_infer_model + "/resnet50"; - compare_continuous_input(model_dir, true); + compare_continuous_input(model_dir, /* use_tensorrt */ true); } } // namespace inference diff --git a/paddle/fluid/inference/tests/api/trt_test_helper.h b/paddle/fluid/inference/tests/api/trt_test_helper.h index 0233cad0..ee3ba63b 100644 --- a/paddle/fluid/inference/tests/api/trt_test_helper.h +++ b/paddle/fluid/inference/tests/api/trt_test_helper.h @@ -63,6 +63,7 @@ void SetConfig(AnalysisConfig* config, std::string model_dir, config->pass_builder()->DeletePass("fc_fuse_pass"); config->pass_builder()->TurnOnDebug(); } else { + config->EnableCUDNN(); config->SwitchIrOptim(); } } diff --git a/paddle/fluid/memory/CMakeLists.txt b/paddle/fluid/memory/CMakeLists.txt index 7eb663ea..ce24f5a4 100644 --- a/paddle/fluid/memory/CMakeLists.txt +++ b/paddle/fluid/memory/CMakeLists.txt @@ -1,12 +1,27 @@ add_subdirectory(detail) add_subdirectory(allocation) -cc_library(malloc SRCS malloc.cc DEPS place enforce allocator_facade profiler) + +if (WITH_MKLDNN) + set(MKLDNN_CTX_DEPS mkldnn) +else () + set(MKLDNN_CTX_DEPS) +endif() + +cc_library(malloc SRCS malloc.cc DEPS + place enforce allocator_facade profiler ${MKLDNN_CTX_DEPS}) cc_library(memcpy SRCS memcpy.cc DEPS place) cc_library(memory DEPS malloc memcpy) + +if (WITH_GPU) + nv_test(malloc_test + SRCS malloc_test.cu + DEPS device_context malloc) +endif() + #if (WITH_GPU) # nv_test(pinned_memory_test SRCS pinned_memory_test.cu DEPS place memory) #endif() diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 888c214e..ffae6e64 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ 
b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -6,6 +6,12 @@ cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator) cc_library(naive_best_fit_allocator SRCS naive_best_fit_allocator.cc DEPS allocator buddy_allocator profiler) cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS locked_allocator buffered_allocator cpu_allocator best_fit_allocator) +if (WITH_MKLDNN) + set(MKLDNN_CTX_DEPS mkldnn) +else () + set(MKLDNN_CTX_DEPS) +endif() + if (WITH_GPU) nv_library(cuda_allocator SRCS cuda_allocator.cc DEPS allocator cuda_device_guard) endif() @@ -46,6 +52,10 @@ cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy) cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator locked_allocator cpu_allocator) if (WITH_TESTING) + if (WITH_GPU) + target_link_libraries(retry_allocator_test cuda_allocator) + endif() + set_tests_properties(retry_allocator_test PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") endif() diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h index 5d7c9bde..379c8d00 100644 --- a/paddle/fluid/memory/allocation/allocator.h +++ b/paddle/fluid/memory/allocation/allocator.h @@ -19,6 +19,7 @@ #include #include #include "paddle/fluid/framework/inlined_vector.h" +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" namespace paddle { @@ -26,14 +27,14 @@ namespace memory { namespace allocation { // Exception when `Alloc`/`AllocShared` failed -class BadAlloc : public std::exception { - public: - inline explicit BadAlloc(std::string msg) : msg_(std::move(msg)) {} +struct BadAlloc : public std::exception { + inline explicit BadAlloc(std::string err_msg, const char* file, int line) + : err_str_(platform::GetTraceBackString(std::move(err_msg), file, line)) { + } - inline const char* what() const noexcept override { return msg_.c_str(); } + const char* what() const noexcept override { return err_str_.c_str(); } - private: - std::string msg_; + std::string err_str_; }; class Allocator; diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 77b95f71..220b50b1 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -37,7 +37,7 @@ #endif DEFINE_int64( - gpu_allocator_retry_time, 0, + gpu_allocator_retry_time, 10000, "The retry time (milliseconds) when allocator fails " "to allocate memory. 
No retry if this value is not greater than 0"); @@ -80,6 +80,12 @@ class AllocatorFacadePrivate { } } InitZeroSizeAllocators(); + + if (FLAGS_gpu_allocator_retry_time > 0) { + WrapCUDARetryAllocator(FLAGS_gpu_allocator_retry_time); + } + + CheckAllocThreadSafe(); } inline const std::shared_ptr& GetAllocator( @@ -118,6 +124,8 @@ class AllocatorFacadePrivate { public: explicit ZeroSizeAllocator(platform::Place place) : place_(place) {} + bool IsAllocThreadSafe() const override { return true; } + protected: Allocation* AllocateImpl(size_t size) override { return new Allocation(nullptr, 0, place_); @@ -145,6 +153,25 @@ class AllocatorFacadePrivate { } } + void CheckAllocThreadSafe() const { + for (auto& pair : allocators_) { + PADDLE_ENFORCE_EQ(pair.second->IsAllocThreadSafe(), true); + } + + for (auto& pair : zero_size_allocators_) { + PADDLE_ENFORCE_EQ(pair.second->IsAllocThreadSafe(), true); + } + } + + void WrapCUDARetryAllocator(size_t retry_time) { + PADDLE_ENFORCE_GT(retry_time, 0, "Retry time must be larger than 0"); + for (auto& pair : allocators_) { + if (platform::is_gpu_place(pair.first)) { + pair.second = std::make_shared(pair.second, retry_time); + } + } + } + private: std::map> allocators_; std::map> zero_size_allocators_; diff --git a/paddle/fluid/memory/allocation/allocator_strategy.cc b/paddle/fluid/memory/allocation/allocator_strategy.cc index 4e45cc4d..19b13806 100644 --- a/paddle/fluid/memory/allocation/allocator_strategy.cc +++ b/paddle/fluid/memory/allocation/allocator_strategy.cc @@ -17,11 +17,7 @@ #include "glog/logging.h" #include "paddle/fluid/platform/enforce.h" -DEFINE_string(allocator_strategy, "naive_best_fit", - "The allocation strategy. naive_best_fit means the original best " - "fit allocator of Fluid. " - "auto_growth means the experimental auto-growth allocator. 
" - "Enum in [naive_best_fit, auto_growth]."); +DECLARE_string(allocator_strategy); namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc index 0d3b11f7..9ce4fd07 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc @@ -57,6 +57,7 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) { block_it->is_free_ = false; } } else { + FreeIdleChunks(); size_t realloc_size = std::max(size, chunk_size_); try { @@ -119,6 +120,20 @@ void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) { delete allocation; } +void AutoGrowthBestFitAllocator::FreeIdleChunks() { + for (auto chunk_it = chunks_.begin(); chunk_it != chunks_.end();) { + auto &blocks = chunk_it->blocks_; + if (blocks.size() == 1 && blocks.begin()->is_free_) { + auto &block = *blocks.begin(); + VLOG(2) << "Free chunk with size " << block.size_; + free_blocks_.erase(std::make_pair(block.size_, block.ptr_)); + chunk_it = chunks_.erase(chunk_it); + } else { + ++chunk_it; + } + } +} + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h index a31dd7cf..27257883 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h @@ -27,7 +27,7 @@ namespace allocation { class AutoGrowthBestFitAllocator : public Allocator { public: - explicit AutoGrowthBestFitAllocator( + AutoGrowthBestFitAllocator( const std::shared_ptr &underlying_allocator, size_t alignment, size_t chunk_size = 0); @@ -39,6 +39,8 @@ class AutoGrowthBestFitAllocator : public Allocator { void FreeImpl(Allocation *allocation) override; private: + void FreeIdleChunks(); + template using List = std::list; diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.cc b/paddle/fluid/memory/allocation/best_fit_allocator.cc index 72ee4e54..126464f0 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc @@ -150,8 +150,8 @@ Allocation* BestFitAllocator::AllocateImpl(size_t size) { } } if (UNLIKELY(highest_set_bit == free_chunks_.size())) { - throw BadAlloc(string::Sprintf( - "Cannot allocate %d, All fragments size is %d", size, FreeSize())); + PADDLE_THROW_BAD_ALLOC("Cannot allocate %d, All fragments size is %d", size, + FreeSize()); } auto chunk_it = SplitChunk(size, highest_set_bit, map_it); return new BestFitAllocation(this, chunk_it); diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc index 349c71ce..2ba3b6d0 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_allocator.cc @@ -17,6 +17,7 @@ #include #include #include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/gpu_info.h" namespace paddle { @@ -36,9 +37,10 @@ Allocation* CUDAAllocator::AllocateImpl(size_t size) { void* ptr; auto status = cudaMalloc(&ptr, size); if (UNLIKELY(status != cudaSuccess)) { - throw BadAlloc(string::Sprintf( - "Cannot allocate %d on GPU %d, cuda status %d, %s", size, place_.device, - status, cudaGetErrorString(status))); + PADDLE_ENFORCE_NE(cudaGetLastError(), 
cudaSuccess); + PADDLE_THROW_BAD_ALLOC("Cannot allocate %d on GPU %d, cuda status %d, %s", + size, place_.device, status, + cudaGetErrorString(status)); } return new Allocation(ptr, size, platform::Place(place_)); } diff --git a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h new file mode 100644 index 00000000..1f8ad370 --- /dev/null +++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h @@ -0,0 +1,167 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { + +namespace platform { +class CUDADeviceContext; +} // namespace platform + +namespace memory { +namespace allocation { + +/** + * CUDADeviceContextAllocation is a wrapper of the underbeneath allocation. + * CUDADeviceContextAllocation adds a CUDA stream callback for the underbeneath + * allocation so that CUDADeviceContextAllocation can be used in a CUDA stream + * which deletes allocation in the callback. + */ +class CUDADeviceContextAllocation : public Allocation { + public: + explicit CUDADeviceContextAllocation(AllocationPtr allocation) + : Allocation(allocation->ptr(), allocation->size(), allocation->place()), + underlying_allocation_(std::move(allocation)) {} + + ~CUDADeviceContextAllocation() { + PADDLE_ENFORCE_NOT_NULL( + dev_ctx_, "Didn't set device context for CUDADeviceContextAllocation"); + auto *p_allocation = underlying_allocation_.release(); + VLOG(4) << "Adding callback to delete CUDADeviceContextAllocation at " + << p_allocation; + dev_ctx_->AddStreamCallback([p_allocation] { + VLOG(4) << "Delete CUDADeviceContextAllocation at " << p_allocation; + AllocationDeleter()(p_allocation); + }); + } + + void SetCUDADeviceContext(const platform::CUDADeviceContext *dev_ctx) { + dev_ctx_ = dev_ctx; + } + + private: + AllocationPtr underlying_allocation_; + const platform::CUDADeviceContext *dev_ctx_{nullptr}; +}; + +/** + * CUDADeviceContextAllocator will allocate a CUDADeviceContextAllocation + * after waiting for a self-created event on the default stream. 
It does so to + * let the non-default stream be able to allocate GPU memory which will be + * released by stream callback + */ +class CUDADeviceContextAllocator : public Allocator { + public: + explicit CUDADeviceContextAllocator(platform::CUDAPlace place, + cudaStream_t default_stream) + : place_(place), default_stream_(default_stream) { + platform::CUDADeviceGuard guard(place_.device); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaEventCreate(&event_, cudaEventDisableTiming), + "Create event failed in CUDADeviceContextAllocator"); + } + + ~CUDADeviceContextAllocator() { + if (event_) { + platform::CUDADeviceGuard guard(place_.device); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaEventDestroy(event_), + "Destory event failed in CUDADeviceContextAllocator destroctor"); + } + } + + protected: + Allocation *AllocateImpl(size_t size) override { + PADDLE_ENFORCE_NOT_NULL( + default_stream_, + "Didn't set default stream for CUDADeviceContextAllocator"); + platform::CUDADeviceGuard guard(place_.device); + auto allocation = + new CUDADeviceContextAllocation(memory::Alloc(place_, size)); + // Wait for the event on stream + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaEventRecord(event_, default_stream_), + "Failed to record event in CUDADeviceContextAllocator"); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaStreamWaitEvent(default_stream_, event_, 0), + "Failed to wait event in CUDADeviceContextAllocator"); + return allocation; + } + + void FreeImpl(Allocation *allocation) override { delete allocation; } + + private: + platform::CUDAPlace place_; + cudaEvent_t event_{nullptr}; + cudaStream_t default_stream_{nullptr}; +}; + +/** + * CUDADeviceContextAllocatorPool is a singletion stores mapping from + * CUDAPlace(s) to std::shared_ptr. When a + * CUDADeviceContext's compute stream isn't default stream, it can call this + * class to allocate GPU memory which will be released by a callback after + * stream execution. 
+ */ +class CUDADeviceContextAllocatorPool { + public: + static CUDADeviceContextAllocatorPool &Instance() { + static CUDADeviceContextAllocatorPool pool; + return pool; + } + + AllocationPtr Alloc(const platform::CUDADeviceContext &dev_ctx, size_t size) { + auto iter = + allocators_.find(boost::get(dev_ctx.GetPlace())); + PADDLE_ENFORCE_EQ(iter != allocators_.end(), true, + "CUDADeviceContextAllocatorPool initialization error"); + auto &allocator = iter->second; + AllocationPtr allocation = allocator->Allocate(size); + static_cast(allocation.get()) + ->SetCUDADeviceContext(&dev_ctx); + return allocation; + } + + private: + CUDADeviceContextAllocatorPool() { + std::vector devices = platform::GetSelectedDevices(); + for (int i : devices) { + auto place = platform::CUDAPlace(i); + auto compute_stream = + platform::DeviceContextPool::Instance().GetByPlace(place)->stream(); + auto allocator = std::shared_ptr( + new CUDADeviceContextAllocator(place, compute_stream)); + allocators_.insert(make_pair(place, allocator)); + } + } + + std::map> + allocators_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 2e4e7162..24df3ce3 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -297,13 +297,18 @@ namespace allocation { Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) { void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_); - return new Allocation(ptr, size, place_); + auto *tmp_alloc = new Allocation(ptr, size, place_); + platform::MemEvenRecorder::Instance().PushMemRecord( + static_cast(tmp_alloc), place_, size); + return tmp_alloc; } void NaiveBestFitAllocator::FreeImpl(Allocation *allocation) { boost::apply_visitor( legacy::FreeVisitor(allocation->ptr(), allocation->size()), allocation->place()); + platform::MemEvenRecorder::Instance().PopMemRecord( + static_cast(allocation), place_); delete allocation; } diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h index 5b376e6c..913d5830 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h @@ -28,6 +28,8 @@ class NaiveBestFitAllocator : public Allocator { public: explicit NaiveBestFitAllocator(const platform::Place &p) : place_(p) {} + bool IsAllocThreadSafe() const override { return true; } + protected: Allocation *AllocateImpl(size_t size) override; void FreeImpl(Allocation *allocation) override; diff --git a/paddle/fluid/memory/allocation/retry_allocator.cc b/paddle/fluid/memory/allocation/retry_allocator.cc index bf14ed5d..ae6af532 100644 --- a/paddle/fluid/memory/allocation/retry_allocator.cc +++ b/paddle/fluid/memory/allocation/retry_allocator.cc @@ -13,14 +13,40 @@ // limitations under the License. 
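
CUDADeviceContextAllocatorPool::Instance() above is a Meyers singleton: C++11 guarantees a function-local static is initialized exactly once, even under concurrent first use. A minimal sketch of the pattern with illustrative types:

```cpp
// Function-local static singleton, as used by the allocator pool above.
class AllocatorPool {
 public:
  static AllocatorPool &Instance() {
    static AllocatorPool pool;  // constructed once, thread-safely, on first use
    return pool;
  }
  AllocatorPool(const AllocatorPool &) = delete;
  AllocatorPool &operator=(const AllocatorPool &) = delete;

 private:
  AllocatorPool() = default;
};
```
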
#include "paddle/fluid/memory/allocation/retry_allocator.h" + namespace paddle { namespace memory { namespace allocation { +class WaitedAllocateSizeGuard { + public: + WaitedAllocateSizeGuard(std::atomic* waited_size, + size_t requested_size) + : waited_size_(waited_size), requested_size_(requested_size) { + waited_size_->fetch_add(requested_size_, + std::memory_order::memory_order_relaxed); + } + + ~WaitedAllocateSizeGuard() { + waited_size_->fetch_sub(requested_size_, + std::memory_order::memory_order_relaxed); + } + + private: + std::atomic* waited_size_; + size_t requested_size_; +}; + void RetryAllocator::FreeImpl(Allocation* allocation) { // Delete underlying allocation first. + size_t size = allocation->size(); underlying_allocator_->Free(allocation); - cv_.notify_all(); + if (UNLIKELY(waited_allocate_size_)) { + VLOG(10) << "Free " << size << " bytes and notify all waited threads, " + "where waited_allocate_size_ = " + << waited_allocate_size_; + cv_.notify_all(); + } } Allocation* RetryAllocator::AllocateImpl(size_t size) { @@ -31,29 +57,38 @@ Allocation* RetryAllocator::AllocateImpl(size_t size) { // But it would add lock even when allocation success at the first time try { return alloc_func(); - } catch (BadAlloc& bad_alloc) { + } catch (BadAlloc&) { { + WaitedAllocateSizeGuard guard(&waited_allocate_size_, size); + VLOG(10) << "Allocation failed when allocating " << size + << " bytes, waited_allocate_size_ = " << waited_allocate_size_; // We can just write allocation retry inside the predicate function of - // wait_until - // But it needs to acquire the lock when executing predicate function - // For better performance, we use loop here + // wait_until. But it needs to acquire the lock when executing predicate + // function. For better performance, we use loop here auto end_time = std::chrono::high_resolution_clock::now() + retry_time_; auto wait_until = [&, this] { std::unique_lock lock(mutex_); return cv_.wait_until(lock, end_time); }; + + size_t retry_time = 0; while (wait_until() != std::cv_status::timeout) { try { return alloc_func(); - } catch (BadAlloc& ex) { - bad_alloc = ex; + } catch (BadAlloc&) { + // do nothing when it is not timeout + ++retry_time; + VLOG(10) << "Allocation failed when retrying " << retry_time + << " times when allocating " << size + << " bytes. Wait still."; } catch (...) { throw; } } - - throw; // rethrow the original exception or throw the internal bad_alloc } + VLOG(10) << "Allocation failed because of timeout when allocating " << size + << " bytes."; + return alloc_func(); // If timeout, try last allocation request. } catch (...) 
diff --git a/paddle/fluid/memory/allocation/retry_allocator.h b/paddle/fluid/memory/allocation/retry_allocator.h
index 7840a834..7c218e25 100644
--- a/paddle/fluid/memory/allocation/retry_allocator.h
+++ b/paddle/fluid/memory/allocation/retry_allocator.h
@@ -14,12 +14,14 @@
 
 #pragma once
 
+#include <atomic>              // NOLINT
 #include <chrono>              // NOLINT
 #include <condition_variable>  // NOLINT
 #include <memory>
 #include <mutex>  // NOLINT
 #include <utility>
 #include "paddle/fluid/memory/allocation/allocator.h"
+#include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace memory {
@@ -48,9 +50,7 @@ class RetryAllocator : public Allocator {
   std::mutex mutex_;
   std::condition_variable cv_;
 
-  // For debug, We can add an atomic integer to record how many memory sizes are
-  // waited to allocate
-  // std::atomic<size_t> waited_allocate_size_{0};
+  std::atomic<size_t> waited_allocate_size_{0};
 };
 
 }  // namespace allocation
diff --git a/paddle/fluid/memory/allocation/retry_allocator_test.cc b/paddle/fluid/memory/allocation/retry_allocator_test.cc
index 4ac08d44..11a8dfdc 100644
--- a/paddle/fluid/memory/allocation/retry_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/retry_allocator_test.cc
@@ -17,12 +17,16 @@
 #include <chrono>              // NOLINT
 #include <condition_variable>  // NOLINT
 #include <mutex>               // NOLINT
-#include <thread>  // NOLINT
+#include <string>
+#include <thread>  // NOLINT
 #include <vector>
 #include "gtest/gtest.h"
 #include "paddle/fluid/memory/allocation/best_fit_allocator.h"
 #include "paddle/fluid/memory/allocation/cpu_allocator.h"
 #include "paddle/fluid/memory/allocation/locked_allocator.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/memory/allocation/cuda_allocator.h"
+#endif
 
 namespace paddle {
 namespace memory {
@@ -39,7 +43,7 @@ TEST(RetryAllocator, RetryAllocator) {
   std::unique_ptr<Allocator> locked_allocator(
       new LockedAllocator(std::move(best_fit_allocator)));
 
-  size_t thread_num = 8;
+  size_t thread_num = 4;
   size_t sleep_time = 40;
   size_t extra_time = 10;
 
@@ -93,6 +97,51 @@ TEST(RetryAllocator, RetryAllocator) {
   }
 }
 
+class DummyAllocator : public Allocator {
+ public:
+  bool IsAllocThreadSafe() const override { return true; }
+
+ protected:
+  Allocation *AllocateImpl(size_t size) override {
+    PADDLE_THROW_BAD_ALLOC("Always BadAlloc");
+  }
+
+  void FreeImpl(Allocation *) override {}
+};
+
+TEST(RetryAllocator, RetryAllocatorLastAllocFailure) {
+  size_t retry_ms = 10;
+  {
+    RetryAllocator allocator(std::make_shared<DummyAllocator>(), retry_ms);
+    try {
+      auto allocation = allocator.Allocate(100);
+      ASSERT_TRUE(false);
+      allocation.reset();
+    } catch (BadAlloc &ex) {
+      ASSERT_TRUE(std::string(ex.what()).find("Always BadAlloc") !=
+                  std::string::npos);
+    }
+  }
+
+#ifdef PADDLE_WITH_CUDA
+  {
+    platform::CUDAPlace p(0);
+    RetryAllocator allocator(std::make_shared<CUDAAllocator>(p), retry_ms);
+    size_t allocate_size = (static_cast<size_t>(1) << 40);  // Very large number
+    try {
+      auto allocation = allocator.Allocate(allocate_size);
+      ASSERT_TRUE(false);
+      allocation.reset();
+    } catch (BadAlloc &ex) {
+      ASSERT_TRUE(std::string(ex.what()).find(
+                      "Cannot allocate " + std::to_string(allocate_size) +
+                      " on GPU " + std::to_string(p.device)) !=
+                  std::string::npos);
+    }
+  }
+#endif
+}
+
 }  // namespace allocation
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/detail/CMakeLists.txt b/paddle/fluid/memory/detail/CMakeLists.txt
index a555b6b2..e1c9a4f0 100644
--- a/paddle/fluid/memory/detail/CMakeLists.txt
+++ b/paddle/fluid/memory/detail/CMakeLists.txt
@@ -1,9 +1,9 @@
-cc_library(memory_block SRCS memory_block.cc memory_block_desc.cc meta_cache.cc)
+cc_library(memory_block SRCS memory_block.cc memory_block_desc.cc meta_cache.cc DEPS place)
 if(${WITH_GPU})
-    nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info)
+    nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info place)
 else(${WITH_GPU})
-    cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info)
+    cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info place)
 endif(${WITH_GPU})
 
 cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator)
diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc
index edd6ea4a..3e4af0a4 100644
--- a/paddle/fluid/memory/detail/buddy_allocator.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator.cc
@@ -19,9 +19,9 @@ limitations under the License. */
 
 #include "glog/logging.h"
 
-DEFINE_bool(free_idle_memory, false,
-            "If it is true, Paddle will try to free idle memory trunks during "
-            "running time.");
+#ifdef PADDLE_WITH_CUDA
+DECLARE_uint64(reallocate_gpu_memory_in_mb);
+#endif
 
 namespace paddle {
 namespace memory {
@@ -160,15 +160,6 @@ void BuddyAllocator::Free(void* p) {
           << block->total_size(cache_) << ")";
 
   pool_.insert(
       IndexSizeAddress(block->index(cache_), block->total_size(cache_), block));
-
-  if (FLAGS_free_idle_memory) {
-    // Clean up if existing too much free memory
-    // Prefer freeing fallback allocation first
-    CleanIdleFallBackAlloc();
-
-    // Free normal allocation
-    CleanIdleNormalAlloc();
-  }
 }
 
 size_t BuddyAllocator::Used() { return total_used_; }
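The RefillPool hunk below chooses how many bytes to request from the system allocator on each refill. Restated as standalone C++ (the function and parameter names are illustrative, not the Paddle API):

#include <algorithm>
#include <cstddef>

// The first GPU refill uses the configured initial size, later refills use
// the (possibly cached) reallocation size; an oversized request always wins.
size_t RefillBytes(bool first_refill, size_t init_bytes, size_t realloc_bytes,
                   size_t request_bytes) {
  size_t base = first_refill ? init_bytes : realloc_bytes;
  return std::max(base, request_bytes);
}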
@@ -200,8 +191,9 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
     // Compute the allocation size for gpu for the first allocation.
     allocate_bytes = std::max(platform::GpuInitAllocSize(), request_bytes);
   } else {
-    // Reallocation size
-    if (realloc_size_ == 0) {
+    // Compute the re-allocation size; it is cached only when the user sets
+    // FLAGS_reallocate_gpu_memory_in_mb to a fixed value.
+    if (realloc_size_ == 0 || FLAGS_reallocate_gpu_memory_in_mb == 0ul) {
       realloc_size_ = platform::GpuReallocSize();
     }
     allocate_bytes = std::max(realloc_size_, request_bytes);
@@ -220,12 +212,6 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool(
   static_cast<MemoryBlock*>(p)->init(&cache_, MemoryBlock::FREE_CHUNK, index,
                                      allocate_bytes, nullptr, nullptr);
 
-  // gpu fallback allocation
-  if (system_allocator_->UseGpu() &&
-      static_cast<MemoryBlock*>(p)->index(cache_) == 1) {
-    fallback_alloc_count_++;
-  }
-
   total_free_ += allocate_bytes;
 
   // dump the block into pool
@@ -283,70 +269,6 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
   return block;
 }
 
-void BuddyAllocator::CleanIdleFallBackAlloc() {
-  // If fallback allocation does not exist, return directly
-  if (!fallback_alloc_count_) return;
-
-  for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
-    // If free memory block less than max_chunk_size_, return directly
-    if (std::get<1>(*pool) < max_chunk_size_) return;
-
-    MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
-
-    // If no GPU fallback allocator, return
-    if (!system_allocator_->UseGpu() || block->index(cache_) == 0) {
-      return;
-    }
-
-    VLOG(10) << "Return block " << block << " to fallback allocator.";
-
-    system_allocator_->Free(block, block->size(cache_), block->index(cache_));
-    cache_.invalidate(block);
-
-    pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
-
-    total_free_ -= block->size(cache_);
-    fallback_alloc_count_--;
-
-    // If no fall allocation exists, return directly
-    if (!fallback_alloc_count_) return;
-  }
-}
-
-void BuddyAllocator::CleanIdleNormalAlloc() {
-  auto shall_free_alloc = [&]() -> bool {
-    // free all fallback allocations
-    if (fallback_alloc_count_ > 0) {
-      return true;
-    }
-    // keep 2x overhead if we haven't fallen back
-    if ((total_used_ + max_chunk_size_) * 2 < total_free_) {
-      return true;
-    }
-    return false;
-  };
-
-  if (!shall_free_alloc()) return;
-
-  for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
-    // If free memory block less than max_chunk_size_, return directly
-    if (std::get<1>(*pool) < max_chunk_size_) return;
-
-    MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
-
-    VLOG(10) << "Return block " << block << " to base allocator.";
-
-    system_allocator_->Free(block, block->size(cache_), block->index(cache_));
-    cache_.invalidate(block);
-
-    pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
-
-    total_free_ -= block->size(cache_);
-
-    if (!shall_free_alloc()) return;
-  }
-}
-
 }  // namespace detail
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h
index bdc8cca4..791f8b56 100644
--- a/paddle/fluid/memory/detail/buddy_allocator.h
+++ b/paddle/fluid/memory/detail/buddy_allocator.h
@@ -23,7 +23,6 @@ limitations under the License. */
 
 #include "paddle/fluid/memory/detail/memory_block.h"
 #include "paddle/fluid/memory/detail/system_allocator.h"
-#include "paddle/fluid/platform/assert.h"
 #include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/gpu_info.h"
 
@@ -76,12 +75,6 @@ class BuddyAllocator {
   /*! \brief Find the existing chunk which used to allocation */
   PoolSet::iterator FindExistChunk(size_t size);
 
-  /*! \brief Clean idle fallback allocation */
-  void CleanIdleFallBackAlloc();
-
-  /*! \brief Clean idle normal allocation */
-  void CleanIdleNormalAlloc();
-
 private:
   size_t total_used_ = 0;  // the total size of used memory
   size_t total_free_ = 0;  // the total size of free memory
@@ -99,9 +92,6 @@ class BuddyAllocator {
    */
   PoolSet pool_;
 
-  /*! Record fallback allocation count for auto-scaling */
-  size_t fallback_alloc_count_ = 0;
-
 private:
   /*! Unify the metadata format between GPU and CPU allocations */
   MetadataCache cache_;
diff --git a/paddle/fluid/memory/detail/buddy_allocator_test.cc b/paddle/fluid/memory/detail/buddy_allocator_test.cc
index 1edc9f20..dce48ac9 100644
--- a/paddle/fluid/memory/detail/buddy_allocator_test.cc
+++ b/paddle/fluid/memory/detail/buddy_allocator_test.cc
@@ -22,6 +22,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/gpu_info.h"
 
 #ifdef PADDLE_WITH_CUDA
+#include <cuda_runtime.h>
+
 DECLARE_double(fraction_of_gpu_memory_to_use);
 DECLARE_uint64(initial_gpu_memory_in_mb);
 DECLARE_uint64(reallocate_gpu_memory_in_mb);
@@ -31,9 +33,11 @@ namespace paddle {
 namespace memory {
 namespace detail {
 
-constexpr static int test_gpu_id = 0;
+constexpr static int TEST_GPU_ID = 0;
 
-void TestBuddyAllocator(BuddyAllocator* allocator, size_t size_bytes) {
+int* TestBuddyAllocator(BuddyAllocator* allocator, size_t size_bytes,
+                        bool use_system_allocator = false,
+                        bool free_ptr = true) {
   bool freed = false;
   size_t used_bytes = allocator->Used();
 
@@ -41,19 +45,25 @@ void TestBuddyAllocator(BuddyAllocator* allocator, size_t size_bytes) {
   void* p = allocator->Alloc(size_bytes);
 
   EXPECT_NE(p, nullptr);
+
 #ifdef PADDLE_WITH_CUDA
-  if (size_bytes < platform::GpuMaxChunkSize()) {
+  if (size_bytes < allocator->GetMaxChunkSize()) {
 #else
-  if (size_bytes < platform::CpuMaxChunkSize()) {
+  if (size_bytes < allocator->GetMaxChunkSize()) {
 #endif
     // Not allocate from SystemAllocator
+    EXPECT_FALSE(use_system_allocator);
     EXPECT_GE(allocator->Used(), used_bytes + size_bytes);
   } else {
     // Allocate from SystemAllocator doesn't count in Used()
+    EXPECT_TRUE(use_system_allocator);
     EXPECT_EQ(allocator->Used(), used_bytes);
   }
 
   int* intp = static_cast<int*>(p);
+  if (!free_ptr) {
+    return intp;
+  }
   std::shared_ptr<int> ptr(intp, [&](void* p) {
     allocator->Free(intp);
     freed = true;
@@ -64,20 +74,30 @@ void TestBuddyAllocator(BuddyAllocator* allocator, size_t size_bytes) {
 
   EXPECT_EQ(used_bytes, allocator->Used());
   EXPECT_TRUE(freed);
+  return nullptr;
 }
 
 #ifdef PADDLE_WITH_CUDA
 TEST(BuddyAllocator, GpuFraction) {
+  // In a 16 GB machine, the pool size will be about 160 MB
   FLAGS_fraction_of_gpu_memory_to_use = 0.01;
+  FLAGS_initial_gpu_memory_in_mb = 0;
+  FLAGS_reallocate_gpu_memory_in_mb = 0;
 
   BuddyAllocator buddy_allocator(
-      std::unique_ptr<SystemAllocator>(new GPUAllocator(test_gpu_id)),
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
       platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
 
+  // Less than pool size
   TestBuddyAllocator(&buddy_allocator, 10);
   TestBuddyAllocator(&buddy_allocator, 10 << 10);
   TestBuddyAllocator(&buddy_allocator, 10 << 20);
-  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30));
+
+  // Greater than max chunk size
+  TestBuddyAllocator(&buddy_allocator, 499 << 20,
+                     /* use_system_allocator = */ true);
+  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30),
+                     /* use_system_allocator = */ true);
 }
 
 TEST(BuddyAllocator, InitRealloc) {
@@ -87,19 +107,19 @@
   EXPECT_EQ(platform::GpuMaxChunkSize(), static_cast<size_t>(100 << 20));
 
   BuddyAllocator buddy_allocator(
-      std::unique_ptr<SystemAllocator>(new GPUAllocator(test_gpu_id)),
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
       platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
 
   // Less then initial size and reallocate size
   TestBuddyAllocator(&buddy_allocator, 10 << 20);
   // Between initial size and reallocate size and not exceed pool
   TestBuddyAllocator(&buddy_allocator, 80 << 20);
-  // Less then reallocate size and exceed pool
-  TestBuddyAllocator(&buddy_allocator, 40 << 20);
-  // Greater then reallocate size and exceed pool
-  TestBuddyAllocator(&buddy_allocator, 80 << 20);
-  // Greater then initial size and reallocate size
-  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30));
+  TestBuddyAllocator(&buddy_allocator, 99 << 20);
+  // Greater than max chunk size
+  TestBuddyAllocator(&buddy_allocator, 101 << 20,
+                     /* use_system_allocator = */ true);
+  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30),
+                     /* use_system_allocator = */ true);
 }
 
 TEST(BuddyAllocator, ReallocSizeGreaterThanInit) {
@@ -109,23 +129,112 @@
   EXPECT_EQ(platform::GpuMaxChunkSize(), static_cast<size_t>(10 << 20));
 
   BuddyAllocator buddy_allocator(
-      std::unique_ptr<SystemAllocator>(new GPUAllocator(test_gpu_id)),
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
       platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
 
-  // Less then initial size and reallocate size
+  // Less than initial size and reallocate size
   TestBuddyAllocator(&buddy_allocator, 1 << 20);
-  // Between initial size and reallocate size and not exceed pool
-  TestBuddyAllocator(&buddy_allocator, 3 << 20);
-  // Less then initial size and exceed pool
-  TestBuddyAllocator(&buddy_allocator, 3 << 20);
-  // Less then reallocate size and not exceed pool (now pool is 15 MB, used 7
-  // MB)
-  TestBuddyAllocator(&buddy_allocator, 7 << 20);
-  // Less then reallocate size and exceed pool
+  // Between initial size and reallocate size and exceed pool
+  TestBuddyAllocator(&buddy_allocator, 6 << 20);
   TestBuddyAllocator(&buddy_allocator, 8 << 20);
-  // Greater then initial size and reallocate size
-  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30));
+  TestBuddyAllocator(&buddy_allocator, 9 << 20);
+  // Greater than max chunk size
+  TestBuddyAllocator(&buddy_allocator, 11 << 20,
+                     /* use_system_allocator = */ true);
+  TestBuddyAllocator(&buddy_allocator, 2 * static_cast<size_t>(1 << 30),
+                     /* use_system_allocator = */ true);
+}
+
+TEST(BuddyAllocator, FractionRefillPool) {
+  FLAGS_fraction_of_gpu_memory_to_use = 0.6;
+  FLAGS_initial_gpu_memory_in_mb = 0;
+  FLAGS_reallocate_gpu_memory_in_mb = 0;
+
+  size_t max_chunk_size = platform::GpuMaxChunkSize();
+  BuddyAllocator buddy_allocator(
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
+      platform::GpuMinChunkSize(), max_chunk_size);
+
+  // Less than pool size
+  int* p0 = TestBuddyAllocator(&buddy_allocator, max_chunk_size - 1000,
+                               /* use_system_allocator = */ false,
+                               /* free_ptr = */ false);
+  // Max chunk size should stay the same during allocation
+  EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());
+
+  size_t alloc =
+      platform::GpuAvailableMemToAlloc() * FLAGS_fraction_of_gpu_memory_to_use;
+  // Exceeding the pool triggers refilling with a fraction of the available
+  // GPU memory, so we should be able to alloc 60% of the remaining GPU
+  int* p1 = TestBuddyAllocator(&buddy_allocator, alloc,
+                               /* use_system_allocator = */ false,
+                               /* free_ptr = */ false);
+  // Max chunk size should stay the same during allocation
+  EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());
+
+  alloc =
+      platform::GpuAvailableMemToAlloc() * FLAGS_fraction_of_gpu_memory_to_use;
+  // Exceeding the pool again triggers another refill with a fraction of the
+  // available GPU memory, so we should still be able to alloc 60% of the
+  // remaining GPU
+  TestBuddyAllocator(&buddy_allocator, alloc,
+                     /* use_system_allocator = */ false);
+  // Max chunk size should stay the same during allocation
+  EXPECT_EQ(max_chunk_size, buddy_allocator.GetMaxChunkSize());
+
+  buddy_allocator.Free(p0);
+  buddy_allocator.Free(p1);
+}
+
+TEST(BuddyAllocator, AllocFromAvailable) {
+  FLAGS_fraction_of_gpu_memory_to_use = 0.7;
+  FLAGS_initial_gpu_memory_in_mb = 0;
+  FLAGS_reallocate_gpu_memory_in_mb = 0;
+
+  size_t total = 0, available = 0;
+  platform::SetDeviceId(TEST_GPU_ID);
+  platform::GpuMemoryUsage(&available, &total);
+
+  // Take half of available GPU
+  void* p;
+  cudaError_t result = cudaMalloc(&p, available >> 1);
+  EXPECT_TRUE(result == cudaSuccess);
+
+  // BuddyAllocator should be able to alloc the remaining GPU
+  BuddyAllocator buddy_allocator(
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
+      platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
+
+  TestBuddyAllocator(&buddy_allocator, 10);
+  TestBuddyAllocator(&buddy_allocator, 10 << 10);
+  TestBuddyAllocator(&buddy_allocator, 10 << 20);
+  TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(1 << 30));
+
+  if (p) {
+    EXPECT_TRUE(cudaFree(p) == cudaSuccess);
+  }
+}
+
+TEST(BuddyAllocator, AllocFromAvailableWhenFractionIsOne) {
+  FLAGS_fraction_of_gpu_memory_to_use = 1.0;
+  FLAGS_initial_gpu_memory_in_mb = 0;
+  FLAGS_reallocate_gpu_memory_in_mb = 0;
+
+  void* p = nullptr;
+  EXPECT_TRUE(cudaMalloc(&p, static_cast<size_t>(3) << 30) == cudaSuccess);
+
+  // BuddyAllocator should be able to alloc the remaining GPU
+  BuddyAllocator buddy_allocator(
+      std::unique_ptr<SystemAllocator>(new GPUAllocator(TEST_GPU_ID)),
+      platform::GpuMinChunkSize(), platform::GpuMaxChunkSize());
+
+  TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(1) << 30);
+  TestBuddyAllocator(&buddy_allocator, static_cast<size_t>(2) << 30);
+
+  if (p) {
+    EXPECT_TRUE(cudaFree(p) == cudaSuccess);
+  }
+}
+
 #endif
 
 }  // namespace detail
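The FractionRefillPool and AllocFromAvailable tests above assume fraction mode (both *_in_mb flags left at zero), where each refill is sized from the memory still available on the device at that moment. A sketch of that sizing rule, with illustrative names rather than the Paddle API:

#include <algorithm>
#include <cstddef>

// Each refill grabs fraction * (currently available GPU memory), so after one
// pool has been mostly consumed, the next refill is computed against what
// remains; a request larger than that still wins.
size_t NextRefillBytes(size_t available_bytes, double fraction,
                       size_t request_bytes) {
  auto by_fraction = static_cast<size_t>(available_bytes * fraction);
  return std::max(by_fraction, request_bytes);
}

This is why the tests can allocate 60% of the remaining GPU twice in a row: the second call is 60% of a smaller "available" figure, not of the original total.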
diff --git a/paddle/fluid/memory/detail/memory_block.cc b/paddle/fluid/memory/detail/memory_block.cc
index f34b922b..15e2e856 100644
--- a/paddle/fluid/memory/detail/memory_block.cc
+++ b/paddle/fluid/memory/detail/memory_block.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/memory/detail/memory_block.h"
-#include "paddle/fluid/platform/assert.h"
+#include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace memory {
@@ -61,7 +61,7 @@ MemoryBlock* MemoryBlock::right_buddy(const MetadataCache& cache) const {
 
 void MemoryBlock::split(MetadataCache* cache, size_t size) {
   // make sure the split fits
-  PADDLE_ASSERT(total_size(*cache) >= size);
+  PADDLE_ENFORCE_GE(total_size(*cache), size);
 
   // bail out if there is no room for another partition
   if (total_size(*cache) - size <= sizeof(MemoryBlock::Desc)) {
@@ -102,8 +102,8 @@ void MemoryBlock::split(MetadataCache* cache, size_t size) {
 
 void MemoryBlock::merge(MetadataCache* cache, MemoryBlock* right_buddy) {
   // only free blocks can be merged
-  PADDLE_ASSERT(type(*cache) == FREE_CHUNK);
-  PADDLE_ASSERT(right_buddy->type(*cache) == FREE_CHUNK);
+  PADDLE_ENFORCE_EQ(type(*cache), FREE_CHUNK);
+  PADDLE_ENFORCE_EQ(right_buddy->type(*cache), FREE_CHUNK);
 
   auto metadata = cache->load(this);
 
@@ -129,8 +129,8 @@ void MemoryBlock::merge(MetadataCache* cache, MemoryBlock* right_buddy) {
 
 void MemoryBlock::mark_as_free(MetadataCache* cache) {
   // check for double free or corruption
-  PADDLE_ASSERT(type(*cache) != FREE_CHUNK);
-  PADDLE_ASSERT(type(*cache) != INVALID_CHUNK);
+  PADDLE_ENFORCE_NE(type(*cache), FREE_CHUNK);
+  PADDLE_ENFORCE_NE(type(*cache), INVALID_CHUNK);
 
   set_type(cache, FREE_CHUNK);
 }
diff --git a/paddle/fluid/memory/detail/meta_cache.cc b/paddle/fluid/memory/detail/meta_cache.cc
index b86e4f38..f04b0c80 100644
--- a/paddle/fluid/memory/detail/meta_cache.cc
+++ b/paddle/fluid/memory/detail/meta_cache.cc
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #include "glog/logging.h"
 #include "paddle/fluid/memory/detail/memory_block.h"
-#include "paddle/fluid/platform/assert.h"
+#include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace memory {
@@ -25,12 +25,12 @@ MetadataCache::MetadataCache(bool uses_gpu) : uses_gpu_(uses_gpu) {}
 MemoryBlock::Desc MetadataCache::load(const MemoryBlock* block) const {
   if (uses_gpu_) {
     auto existing_desc = cache_.find(block);
-    PADDLE_ASSERT(existing_desc->second.check_guards());
+    PADDLE_ENFORCE_EQ(existing_desc->second.check_guards(), true);
     return existing_desc->second;
   } else {
     auto* desc = reinterpret_cast<const MemoryBlock::Desc*>(block);
     VLOG(10) << "Load MemoryBlock::Desc type=" << desc->type;
-    PADDLE_ASSERT(desc->check_guards());
+    PADDLE_ENFORCE_EQ(desc->check_guards(), true);
     return *reinterpret_cast<const MemoryBlock::Desc*>(block);
   }
 }
diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc
index b0f48c45..55011179 100644
--- a/paddle/fluid/memory/detail/system_allocator.cc
+++ b/paddle/fluid/memory/detail/system_allocator.cc
@@ -23,9 +23,11 @@ limitations under the License.
*/ #endif #include // for malloc and free #include // for std::max +#include +#include #include "gflags/gflags.h" -#include "paddle/fluid/platform/assert.h" +#include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/gpu_info.h" @@ -56,7 +58,7 @@ void* AlignedMalloc(size_t size) { PADDLE_ENFORCE_EQ(posix_memalign(&p, alignment, size), 0, "Alloc %ld error!", size); #endif - PADDLE_ENFORCE(p, "Fail to allocate CPU memory: size = %d .", size); + PADDLE_ENFORCE_NOT_NULL(p, "Fail to allocate CPU memory: size = %d .", size); return p; } @@ -118,33 +120,35 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { gpu_alloc_size_ += size; return p; } else { - LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0 - << " MB GPU memory. Please shrink " - "FLAGS_fraction_of_gpu_memory_to_use or " - "FLAGS_initial_gpu_memory_in_mb or " - "FLAGS_reallocate_gpu_memory_in_mb" - "environment variable to a lower value. " - << "Current FLAGS_fraction_of_gpu_memory_to_use value is " - << FLAGS_fraction_of_gpu_memory_to_use - << ". Current FLAGS_initial_gpu_memory_in_mb value is " - << FLAGS_initial_gpu_memory_in_mb - << ". Current FLAGS_reallocate_gpu_memory_in_mb value is " - << FLAGS_reallocate_gpu_memory_in_mb; - return nullptr; + PADDLE_ENFORCE_NE(cudaGetLastError(), cudaSuccess); + + size_t avail, total; + platform::GpuMemoryUsage(&avail, &total); + + PADDLE_THROW_BAD_ALLOC( + "\n\nOut of memory error on GPU %d. " + "Cannot allocate %s memory on GPU %d, " + "available memory is only %s.\n\n" + "Please check whether there is any other process using GPU %d.\n" + "1. If yes, please stop them, or start PaddlePaddle on another GPU.\n" + "2. If no, please try one of the following suggestions:\n" + " 1) Decrease the batch size of your model.\n" + " 2) FLAGS_fraction_of_gpu_memory_to_use is %.2lf now, " + "please set it to a higher value but less than 1.0.\n" + " The command is " + "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.\n\n", + gpu_id_, string::HumanReadableSize(size), gpu_id_, + string::HumanReadableSize(avail), gpu_id_, + FLAGS_fraction_of_gpu_memory_to_use); } } void GPUAllocator::Free(void* p, size_t size, size_t index) { cudaError_t err; - if (index == 0) { - PADDLE_ASSERT(gpu_alloc_size_ >= size); - gpu_alloc_size_ -= size; - err = cudaFree(p); - } else { - PADDLE_ASSERT(fallback_alloc_size_ >= size); - fallback_alloc_size_ -= size; - err = cudaFreeHost(p); - } + PADDLE_ENFORCE_EQ(index, 0); + PADDLE_ENFORCE_GE(gpu_alloc_size_, size); + gpu_alloc_size_ -= size; + err = cudaFree(p); // Purposefully allow cudaErrorCudartUnloading, because // that is returned if you ever call cudaFree after the @@ -194,9 +198,9 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) { void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) { cudaError_t err; - PADDLE_ASSERT(index == 1); + PADDLE_ENFORCE_EQ(index, 1); - PADDLE_ASSERT(cuda_pinnd_alloc_size_ >= size); + PADDLE_ENFORCE_GE(cuda_pinnd_alloc_size_, size); cuda_pinnd_alloc_size_ -= size; err = cudaFreeHost(p); diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h index a0386a2d..42f0f23e 100644 --- a/paddle/fluid/memory/detail/system_allocator.h +++ b/paddle/fluid/memory/detail/system_allocator.h @@ -52,7 +52,6 @@ class GPUAllocator : public SystemAllocator { private: size_t gpu_alloc_size_ = 0; - size_t fallback_alloc_size_ = 0; int gpu_id_; }; diff --git 
a/paddle/fluid/memory/detail/system_allocator_test.cc b/paddle/fluid/memory/detail/system_allocator_test.cc
index 26826014..34bb40d5 100644
--- a/paddle/fluid/memory/detail/system_allocator_test.cc
+++ b/paddle/fluid/memory/detail/system_allocator_test.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 
 #include "gflags/gflags.h"
 #include "gtest/gtest.h"
+#include "paddle/fluid/memory/allocation/allocator.h"
 
 DECLARE_bool(use_pinned_memory);
 
@@ -62,4 +63,22 @@ TEST(GPUAllocator, Alloc) {
   TestAllocator(&a, 2048);
   TestAllocator(&a, 0);
 }
+
+TEST(CUDAPinnedAllocator, Alloc) {
+  paddle::memory::detail::CUDAPinnedAllocator a;
+  TestAllocator(&a, 2048);
+  TestAllocator(&a, 0);
+}
+
+TEST(GPUAllocator, AllocFailure) {
+  paddle::memory::detail::GPUAllocator allocator(0);
+  size_t index;
+  size_t alloc_size = (static_cast<size_t>(1) << 40);  // Very large number
+  try {
+    allocator.Alloc(&index, alloc_size);
+    ASSERT_TRUE(false);
+  } catch (paddle::memory::allocation::BadAlloc&) {
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetLastError());
+  }
+}
 #endif
diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc
index 5884433a..e01f0305 100644
--- a/paddle/fluid/memory/malloc.cc
+++ b/paddle/fluid/memory/malloc.cc
@@ -18,14 +18,16 @@ limitations under the License. */
 #include "paddle/fluid/memory/allocation/allocator_facade.h"
 #include "paddle/fluid/memory/allocation/allocator_strategy.h"
 #include "paddle/fluid/platform/place.h"
+
 namespace paddle {
 namespace memory {
-std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
+
+std::shared_ptr<Allocation> AllocShared(const platform::Place &place,
                                         size_t size) {
   return allocation::AllocatorFacade::Instance().AllocShared(place, size);
 }
 
-AllocationPtr Alloc(const platform::Place& place, size_t size) {
+AllocationPtr Alloc(const platform::Place &place, size_t size) {
   return allocation::AllocatorFacade::Instance().Alloc(place, size);
 }
 
diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h
index 6731203f..9ba572ac 100644
--- a/paddle/fluid/memory/malloc.h
+++ b/paddle/fluid/memory/malloc.h
@@ -18,7 +18,13 @@ limitations under the License. */
 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/platform/place.h"
 namespace paddle {
+
+namespace platform {
+class DeviceContext;
+}  // platform
+
 namespace memory {
+
 using allocation::Allocation;
 using allocation::Allocator;
 using allocation::AllocationPtr;
@@ -28,5 +34,7 @@ extern std::shared_ptr<Allocation> AllocShared(const platform::Place& place,
 
 extern AllocationPtr Alloc(const platform::Place& place, size_t size);
 
+extern AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size);
+
 }  // namespace memory
 }  // namespace paddle
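The malloc_test.cu file added below exercises allocations whose release must be deferred until the owning stream has drained. The CUDA-level mechanism behind that contract looks roughly like this; the helper names are illustrative, and host memory is used so the sketch stays self-contained:

#include <cuda_runtime.h>
#include <cstdlib>

// Invoked on a driver thread once everything queued on the stream before the
// callback has completed; callbacks must not call back into the CUDA API.
static void CUDART_CB FreeWhenDone(cudaStream_t, cudaError_t, void* buf) {
  std::free(buf);
}

// Defer freeing `buf` until all work currently queued on `stream` finishes.
inline cudaError_t DeferredFree(void* buf, cudaStream_t stream) {
  return cudaStreamAddCallback(stream, FreeWhenDone, buf, 0);
}

The tests then only need to check that kernels launched before the owning AllocationPtr was reset still produce correct output after cudaDeviceSynchronize().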
diff --git a/paddle/fluid/memory/malloc_test.cu b/paddle/fluid/memory/malloc_test.cu
new file mode 100644
index 00000000..89853e15
--- /dev/null
+++ b/paddle/fluid/memory/malloc_test.cu
@@ -0,0 +1,137 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <thread>  // NOLINT
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/platform/device_context.h"
+
+namespace paddle {
+namespace memory {
+
+const int NUM_STREAMS = 8;
+const int N = 2;
+const float DELTA = 1e-1;
+
+using CudaDevCtxVec = std::vector<std::unique_ptr<platform::CUDADeviceContext>>;
+
+__global__ void kernel(float *x, int n) {
+  int tid = threadIdx.x + blockIdx.x * blockDim.x;
+  for (int i = tid; i < n; i += blockDim.x * gridDim.x) {
+    x[i] = 3.14159 * i;
+  }
+}
+
+void CheckKernelOutput(float *x, int n) {
+  auto host_x = std::unique_ptr<float[]>(new float[n]);
+  for (int i = 0; i < n; ++i) {
+    EXPECT_TRUE(cudaSuccess == cudaMemcpy(host_x.get(), x, n * sizeof(float),
+                                          cudaMemcpyDeviceToHost));
+    EXPECT_GE(host_x[i] + DELTA, 3.14159f * i);
+    EXPECT_LE(host_x[i] - DELTA, 3.14159f * i);
+  }
+}
+
+void MultiStreamCompute(float **data, float **second_data,
+                        const platform::CUDADeviceContext &ctx) {
+  // multi-streams
+  AllocationPtr allocation_ptr = Alloc(ctx, N * sizeof(float));
+  EXPECT_GE(allocation_ptr->size(), N * sizeof(float));
+  *data = reinterpret_cast<float *>(allocation_ptr->ptr());
+  kernel<<<1, 64, 0, ctx.stream()>>>(*data, N);
+
+  // allocate and compute on same stream again
+  allocation_ptr = Alloc(ctx, N * sizeof(float));
+  EXPECT_GE(allocation_ptr->size(), N * sizeof(float));
+  *second_data = reinterpret_cast<float *>(allocation_ptr->ptr());
+  kernel<<<1, 64, 0, ctx.stream()>>>(*second_data, N);
+}
+
+TEST(Malloc, CUDADeviceContextMultiStream) {
+  auto place = platform::CUDAPlace(0);
+  EXPECT_TRUE(cudaSuccess == cudaSetDevice(0));
+
+  AllocationPtr main_stream_alloc_ptr = Alloc(place, N * sizeof(float));
+  EXPECT_GE(main_stream_alloc_ptr->size(), N * sizeof(float));
+  float *main_stream_data =
+      reinterpret_cast<float *>(main_stream_alloc_ptr->ptr());
+
+  float *data[NUM_STREAMS];
+  float *second_data[NUM_STREAMS];
+  CudaDevCtxVec dev_ctx;
+
+  // default stream
+  kernel<<<1, 64>>>(main_stream_data, N);
+  main_stream_alloc_ptr.reset();
+
+  for (int i = 0; i < NUM_STREAMS; ++i) {
+    dev_ctx.push_back(std::unique_ptr<platform::CUDADeviceContext>(
+        new platform::CUDADeviceContext(place)));
+    MultiStreamCompute(&data[i], &second_data[i], *dev_ctx[i]);
+  }
+
+  EXPECT_TRUE(cudaSuccess == cudaDeviceSynchronize());
+  for (int i = 0; i < NUM_STREAMS; ++i) {
+    CheckKernelOutput(data[i], N);
+    CheckKernelOutput(second_data[i], N);
+  }
+}
+
+TEST(Malloc, CUDADeviceContextMultiThreadMultiStream) {
+  auto place = platform::CUDAPlace(0);
+  EXPECT_TRUE(cudaSuccess == cudaSetDevice(0));
+
+  AllocationPtr main_stream_alloc_ptr = Alloc(place, N * sizeof(float));
+  EXPECT_GE(main_stream_alloc_ptr->size(), N * sizeof(float));
+  float *main_stream_data =
+      reinterpret_cast<float *>(main_stream_alloc_ptr->ptr());
+
+  float *data[NUM_STREAMS];
+  float *second_data[NUM_STREAMS];
+  CudaDevCtxVec dev_ctx;
+  std::vector<std::thread> threads;
+
+  // default stream
+  kernel<<<1, 64>>>(main_stream_data, N);
+  main_stream_alloc_ptr.reset();
+
+  for (int i = 0; i < NUM_STREAMS; ++i) {
+    dev_ctx.push_back(std::unique_ptr<platform::CUDADeviceContext>(
+        new platform::CUDADeviceContext(place)));
+    threads.push_back(std::thread(MultiStreamCompute, &data[i], &second_data[i],
+                                  std::cref(*dev_ctx[i])));
+  }
+
+  for (int i = 0; i < NUM_STREAMS; ++i) {
+    threads[i].join();
+  }
+
+  EXPECT_TRUE(cudaSuccess == cudaDeviceSynchronize());
+  for (int i = 0; i < NUM_STREAMS; ++i) {
+    CheckKernelOutput(data[i], N);
+    CheckKernelOutput(second_data[i], N);
+  }
+}
+
+TEST(Malloc, AllocZero) {
+  auto place = platform::CUDAPlace(0);
+  AllocationPtr allocation_ptr = Alloc(place, 0);
+ 
EXPECT_GE(allocation_ptr->size(), 0); +} +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/op_use_default_grad_op_maker.spec b/paddle/fluid/op_use_default_grad_op_maker.spec index 4ec0a35b..389a174c 100644 --- a/paddle/fluid/op_use_default_grad_op_maker.spec +++ b/paddle/fluid/op_use_default_grad_op_maker.spec @@ -1,22 +1,13 @@ -attention_lstm conv_shift cos_sim -dequantize fc flatten fsp -fused_embedding_fc_lstm fused_embedding_seq_pool -fusion_gru -fusion_lstm -fusion_repeated_fc_relu -fusion_seqconv_eltadd_relu -fusion_seqexpand_concat_fc -fusion_seqpool_concat -fusion_squared_mat_sub gru lrn lstm_unit +match_matrix_tensor max_pool2d_with_index max_pool3d_with_index maxout @@ -25,13 +16,10 @@ nce pool2d pool3d prelu -quantize rank_loss reduce_max reduce_min reduce_prod -reduce_sum -requantize reshape rnn_memory_helper sequence_softmax @@ -41,3 +29,4 @@ tensor_array_to_tensor transpose unpool unsqueeze +var_conv_2d diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 98ff3ea1..f99cbc87 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -48,8 +48,14 @@ if (WITH_DISTRIBUTE) SET(OP_PREFETCH_DEPS ${OP_PREFETCH_DEPS} parameter_prefetch) endif() +SET(OP_ONLY_MKL "") +if (NOT WITH_MKL) + SET(OP_ONLY_MKL ${OP_ONLY_MKL} match_matrix_tensor_op) + SET(OP_ONLY_MKL ${OP_ONLY_MKL} var_conv_2d_op) +endif() + register_operators(EXCLUDES py_func_op warpctc_op dgc_op conv_fusion_op - sync_batch_norm_op deformable_conv_op DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS}) + sync_batch_norm_op ${OP_ONLY_MKL} DEPS ${OP_HEADER_DEPS} ${OP_PREFETCH_DEPS}) if (WITH_GPU) # warpctc_op needs cudnn 7 above @@ -67,8 +73,6 @@ if (WITH_GPU) op_library(sync_batch_norm_op) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sync_batch_norm);\n") endif() - op_library(deformable_conv_op) - file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(deformable_conv);\n") else() op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) endif() @@ -84,7 +88,8 @@ endif() set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col) -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper) if (WITH_GPU) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu) endif() @@ -109,6 +114,11 @@ cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor) +if (WITH_GPU) + nv_test(test_leaky_relu_grad_grad_functor SRCS test_leaky_relu_grad_grad_functor.cc test_leaky_relu_grad_grad_functor.cu DEPS tensor device_context eigen3) +else() + cc_test(test_leaky_relu_grad_grad_functor SRCS test_leaky_relu_grad_grad_functor.cc DEPS tensor device_context eigen3) +endif() if 
(WITH_PYTHON) cc_library(py_func_op SRCS py_func_op.cc DEPS op_registry python pybind) diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 943c6f80..f6848a80 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -24,6 +24,8 @@ limitations under the License. */ #include "paddle/fluid/platform/cudnn_helper.h" #endif +DECLARE_bool(use_mkldnn); + namespace paddle { namespace operators { @@ -34,20 +36,6 @@ static constexpr bool CanInplaceAct() { return GradFunctor::FwdDeps() == kDepOut || GradFunctor::FwdDeps() == kNoDeps; } -std::unique_ptr> GetInplaceOpSet() { - std::unique_ptr> ret( - new std::unordered_set()); -#define INSERT_INTO_INPLACE_OP_SET(op_type, __omitted, fwd_functor, \ - bwd_functor) \ - if (CanInplaceAct>()) { \ - ret->insert(#op_type); \ - } - - FOR_EACH_ACTIVATION_OP(INSERT_INTO_INPLACE_OP_SET); -#undef INSERT_INTO_INPLACE_OP_SET - return ret; -} - #define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT) \ class OP_NAME##OpMaker \ : public ::paddle::framework::OpProtoAndCheckerMaker { \ @@ -84,8 +72,10 @@ class ActivationGradOpDescMaker : public framework::SingleGradOpDescMaker { op->SetOutput(framework::GradVarName("X"), InputGrad("X")); op->SetAttrMap(Attrs()); - if (static_cast(kDepValue) & - static_cast(ActBwdOpFwdDeps::kDepX)) { + if ((static_cast(kDepValue) & + static_cast(ActBwdOpFwdDeps::kDepX)) || + FLAGS_use_mkldnn || (op->HasAttr("use_mkldnn") && + boost::get(op->GetAttr("use_mkldnn")))) { op->SetInput("X", Input("X")); } @@ -363,6 +353,13 @@ class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "Input of LeakyRelu operator"); AddOutput("Out", "Output of LeakyRelu operator"); AddAttr("alpha", "The small negative slope").SetDefault(0.02f); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr("is_test", + "(bool, default false) Set to true for inference only, false " + "for training. Some layers may run faster when this is true.") + .SetDefault(false); AddComment(R"DOC( LeakyRelu Activation Operator. @@ -486,6 +483,11 @@ class PowOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", "Input of Pow operator"); + AddInput("FactorTensor", + "(Tensor, optional). If provided, pow will use this" + "The shape of FactorTensor MUST BE [1]." + "it has higher priority than attr(factor).") + .AsDispensable(); AddOutput("Out", "Output of Pow operator"); AddAttr("factor", "The exponential factor of Pow").SetDefault(1.0f); AddComment(R"DOC( @@ -570,7 +572,33 @@ class SwishOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Swish Activation Operator. -$$out = \\frac{x}{1 + e^{- \beta x}}$$ +$$out = \\frac{x}{1 + e^{- \beta \ x}}$$ + +)DOC"); + } +}; + +class HardSwishOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Input of HardSwish operator"); + AddOutput("Out", "Output of HardSwish operator"); + AddAttr("threshold", "The threshold parameter of HardSwish operator") + .SetDefault(6.0f); + AddAttr("scale", "The scale parameter of HardSwish operator") + .SetDefault(6.0f); + AddAttr("offset", "The offset parameter of HardSwish operator") + .SetDefault(3.0f); + AddComment(R"DOC( +HardSwish Activation Operator. + +The hard version of swish(https://arxiv.org/pdf/1905.02244.pdf). + +$out = \frac{x * (min(max(0, x+offset), threshold))}{scale}$ + +The threshold and scale should be positive. 
The offset can be either positive or negative. +The default parameters are set according to the above reference. +It is recommended to use the defaults for this activation. )DOC"); } @@ -693,8 +721,8 @@ class LeakyReluDoubleGradMaker std::unique_ptr<::paddle::framework::OpDesc> Apply() const override { auto* op = new ::paddle::framework::OpDesc(); op->SetType("leaky_relu_grad_grad"); - // input1: X - op->SetInput("X", Input("X")); + // input1: Out + op->SetInput("Out", Input("Out")); // X@GRAD@GRAD: ddx op->SetInput("DDX", OutputGrad(framework::GradVarName("X"))); op->SetAttrMap(Attrs()); @@ -751,14 +779,81 @@ class SquareDoubleGradMaker } }; -class ActivationGradOpInplaceInference : public framework::InplaceOpInference { +DECLARE_INPLACE_OP_INFERER(ActivationGradOpInplaceInference, + {framework::GradVarName("Out"), + framework::GradVarName("X")}); +DECLARE_INPLACE_OP_INFERER(ActivationDoubleGradOpInplaceInference, + {"DDX", "DDOut"}); + +class PowGradOpDescMaker : public framework::SingleGradOpDescMaker { public: - std::unordered_map operator()( - const framework::OpDesc& op_desc, bool use_cuda) const override { - return {{framework::GradVarName("Out"), framework::GradVarName("X")}}; + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("pow_grad"); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetInput("FactorTensor", Input("FactorTensor")); + op->SetAttrMap(Attrs()); + + return op; + } +}; +class PowOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + ctx->ShareDim("X", /*->*/ "Out"); + ctx->ShareLoD("X", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return GetKernelType(ctx, *this, "X"); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + if (var_name == "FactorTensor") { + return expected_kernel_type; + } + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); } }; +class PowOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + auto out_grad_name = framework::GradVarName("Out"); + ctx->ShareDim(out_grad_name, framework::GradVarName("X")); + ctx->ShareLoD(out_grad_name, framework::GradVarName("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return GetKernelType(ctx, *this, framework::GradVarName("Out")); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + if (var_name == "FactorTensor") { + return expected_kernel_type; + } + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } +}; } // namespace operators } // namespace paddle @@ -803,7 +898,8 @@ REGISTER_OPERATOR(relu_grad, ops::ActivationOpGrad, ops::ReluDoubleGradMaker); REGISTER_OPERATOR( 
relu_grad_grad, - ops::ActivationOpDoubleGrad2::FwdDeps()>); + ops::ActivationOpDoubleGrad2::FwdDeps()>, + ops::ActivationDoubleGradOpInplaceInference); REGISTER_ACTIVATION_CPU_KERNEL(relu, Relu, ReluFunctor, ReluGradFunctor); @@ -828,7 +924,8 @@ REGISTER_OPERATOR(leaky_relu_grad, ops::ActivationOpGrad, ops::LeakyReluDoubleGradMaker); REGISTER_OPERATOR( leaky_relu_grad_grad, - ops::ActivationOpDoubleGrad2::FwdDeps()>); + ops::ActivationOpDoubleGrad2::FwdDeps()>, + ops::ActivationDoubleGradOpInplaceInference); REGISTER_ACTIVATION_CPU_KERNEL(leaky_relu, LeakyRelu, LeakyReluFunctor, LeakyReluGradFunctor); @@ -852,7 +949,9 @@ REGISTER_OPERATOR(sqrt_grad, ops::ActivationOpGrad, ops::SqrtDoubleGradMaker); REGISTER_OPERATOR( sqrt_grad_grad, - ops::ActivationOpDoubleGrad::FwdDeps()>); + ops::ActivationOpDoubleGrad::FwdDeps()>, + ops::ActivationDoubleGradOpInplaceInference); + REGISTER_ACTIVATION_CPU_KERNEL(sqrt, Sqrt, SqrtFunctor, SqrtGradFunctor); REGISTER_OP_CPU_KERNEL( sqrt_grad_grad, ops::SqrtDoubleGradKernel::FwdDeps()>); + ops::ActivationOpDoubleGrad::FwdDeps()>, + ops::ActivationDoubleGradOpInplaceInference); REGISTER_ACTIVATION_CPU_KERNEL(square, Square, SquareFunctor, SquareGradFunctor); @@ -888,3 +988,22 @@ REGISTER_OP_CPU_KERNEL( ops::SquareDoubleGradKernel>); /* ========================================================================== */ + +/* ========================== pow register ============================ */ + +REGISTER_OPERATOR( + pow, ops::PowOp, ops::PowOpMaker, ops::ActivationOpInferVarType, + ops::PowGradOpDescMaker, + std::conditional>(), + ::paddle::framework::SingleOpInplaceInToOut, void>::type); +REGISTER_OPERATOR(pow_grad, ops::PowOpGrad, + ops::ActivationGradOpInplaceInference); + +REGISTER_OP_CPU_KERNEL( + pow, ops::PowKernel>, + ops::PowKernel>); +REGISTER_OP_CPU_KERNEL( + pow_grad, + ops::PowGradKernel>, + ops::PowGradKernel>); +/* ========================================================================== */ diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index 25514186..ac03b898 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -86,3 +86,17 @@ REGISTER_OP_CUDA_KERNEL( ops::SquareDoubleGradKernel>); /* ========================================================================== */ + +/* ========================== pow register ============================ */ + +REGISTER_OP_CUDA_KERNEL( + pow, ops::PowKernel>, + ops::PowKernel>, + ops::PowKernel>); +REGISTER_OP_CUDA_KERNEL( + pow_grad, + ops::PowGradKernel>, + ops::PowGradKernel>, + ops::PowGradKernel>); +/* ========================================================================== */ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index b516fc8a..ea19dcd3 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -49,21 +49,6 @@ enum ActBwdOpFwdDeps { kDepXOut = 0x03 }; -std::unique_ptr> GetInplaceOpSet(); - -static bool IsInplace(const std::string& op) { - static auto InplaceOpSet = GetInplaceOpSet(); - bool inplace = InplaceOpSet->count(op); - // for op_grad - const int kGradSuffixLen = 4; - if (op.size() > kGradSuffixLen && - op.compare(op.size() - kGradSuffixLen - 1, kGradSuffixLen, "grad")) { - inplace = - InplaceOpSet->count(op.substr(0, op.size() - (kGradSuffixLen + 1))); - } - return inplace; -} - /* The following operator can be used to process SelectedRows, because the * output of those operator for zero is zero too. 
*/ @@ -363,17 +348,64 @@ struct GeluFunctor : public BaseActivationFunctor { } }; +// gelu_grad(x) = dout * (0.5 * (1 + erf(x / sqrt(2))) + 0.5 * 2 / sqrt(pie) / +// sqrt(2) * x * exp (-0.5 * sqrt(x))) +// gelu_grad(x) = dout * (0.5 + 0.5 * erf(x * M_SQRT1_2) + (0.5 * M_2_SQRTPI * +// M_SQRT1_2) * x * exp (-0.5 * sqrt(x))) template struct GeluGradFunctor : BaseActivationFunctor { template void operator()(Device d, X x, Out out, dOut dout, dX dx) const { +#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ + !defined(__OSX__) && !defined(PADDLE_WITH_CUDA) + auto x_data = x.data(); + auto dx_data = dx.data(); + int n = std::min(x.size(), dx.size()); + + std::memset(dx_data, 0, n * sizeof(T)); + + // First(dx_data) = erf(x * M_SQRT1_2) + math::CBlas::AXPY(n, static_cast(M_SQRT1_2), x_data, 1, dx_data, 1); + math::CBlas::VMERF(n, dx_data, dx_data, VML_LA); + + // Second = 0.5 * M_2_SQRTPI * M_SQRT1_2 * x * exp (-0.5 * sqrt(x)) + auto second = static_cast(std::malloc(n * sizeof(T))); + std::memset(second, 0, n * sizeof(T)); + + math::CBlas::VSQUARE(n, x_data, second); + for (int i = 0; i < n; i++) { + second[i] *= static_cast(-0.5); + } + math::CBlas::VEXP(n, second, second); + math::CBlas::VMUL(n, x_data, second, second); + T tmp = static_cast(0.5) * static_cast(M_SQRT1_2) * + static_cast(M_2_SQRTPI); + for (int i = 0; i < n; i++) { + second[i] *= tmp; + } + + // Sum = 0.5 * First + Second + math::CBlas::AXPY(n, static_cast(0.5), dx_data, 1, second, 1); + + // 0.5 + Sum + for (int i = 0; i < n; i++) { + second[i] += static_cast(0.5); + } + + // * dout + auto dout_data = dout.data(); + math::CBlas::VMUL(n, dout_data, second, dx_data); + + std::free(second); +#else auto first = static_cast(0.5) * (static_cast(1) + ((x * static_cast(M_SQRT1_2)).erf())); auto second = static_cast(0.5 * M_2_SQRTPI * M_SQRT1_2) * x * (-static_cast(0.5) * x.square()).exp(); dx.device(d) = dout * (first + second); +#endif } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } @@ -431,8 +463,8 @@ struct HardShrinkFunctor : public BaseActivationFunctor { } template void operator()(Device d, X x, Out out) const { - auto temp1 = (x < static_cast(threshold * -1)).template cast().eval(); - auto temp2 = (x > static_cast(threshold)).template cast().eval(); + auto temp1 = (x < static_cast(threshold * -1)).template cast(); + auto temp2 = (x > static_cast(threshold)).template cast(); out.device(d) = x * (temp1 + temp2); } }; @@ -448,8 +480,8 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto temp1 = (x < static_cast(threshold * -1)).template cast().eval(); - auto temp2 = (x > static_cast(threshold)).template cast().eval(); + auto temp1 = (x < static_cast(threshold * -1)).template cast(); + auto temp2 = (x > static_cast(threshold)).template cast(); dx.device(d) = dout * (temp1 + temp2).template cast(); } @@ -468,8 +500,8 @@ struct SoftShrinkFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out) const { auto lambdaT = static_cast(lambda); - auto temp1 = (x > lambdaT).template cast().eval(); - auto temp2 = (x < -lambdaT).template cast().eval(); + auto temp1 = (x > lambdaT).template cast(); + auto temp2 = (x < -lambdaT).template cast(); out.device(d) = temp1 * (x - lambdaT) + temp2 * (x + lambdaT); } }; @@ -484,8 +516,8 @@ struct SoftShrinkGradFunctor : public BaseActivationFunctor { typename dX> void operator()(Device d, X x, Out out, dOut dout, dX dx) const { auto 
lambdaT = static_cast(lambda); - auto temp1 = (x > lambdaT).template cast().eval(); - auto temp2 = (x < -lambdaT).template cast().eval(); + auto temp1 = (x > lambdaT).template cast(); + auto temp2 = (x < -lambdaT).template cast(); dx.device(d) = dout * (temp1 + temp2).template cast(); } @@ -887,6 +919,51 @@ struct Relu6GradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; +// HardSwish = min(max(0, x+3), 6) * x / 6 +template +struct HardSwishFunctor : public BaseActivationFunctor { + float threshold; + float scale; + float offset; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}, {"scale", &scale}, {"offset", &offset}}; + } + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = (x + static_cast(offset)) + .cwiseMax(static_cast(0)) + .cwiseMin(static_cast(threshold)) * + x / static_cast(scale); + } +}; + +template +struct HardSwishGradFunctor : public BaseActivationFunctor { + float threshold; + float scale; + float offset; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"threshold", &threshold}, {"scale", &scale}, {"offset", &offset}}; + } + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto tmp = ((x + static_cast(offset)) < static_cast(threshold)) + .template cast(); + dx.device(d) = + dout * + (((x + static_cast(offset)) > static_cast(0)).template cast() * + (static_cast(2) * x + static_cast(offset)) / + static_cast(scale) * tmp + + static_cast(1) * (static_cast(1) - tmp)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + // softplus(x) = log(1 + exp(x)) // When x is a very large positive number, exp(x) may explode to inf, // Using trick below for numerical stability @@ -966,7 +1043,7 @@ struct SoftReluGradFunctor : public BaseActivationFunctor { typename dX> void operator()(Device d, X x, Out out, dOut dout, dX dx) const { auto tmp = static_cast(threshold); - auto temp = ((out > -tmp) * (out < tmp)).template cast().eval(); + auto temp = ((out > -tmp) * (out < tmp)).template cast(); dx.device(d) = dout * (static_cast(1) - (-out).exp()) * temp; } @@ -995,13 +1072,13 @@ struct LeakyReluGradFunctor : public BaseActivationFunctor { template void operator()(Device d, X x, Out out, dOut dout, dX dx) const { - auto temp1 = static_cast(alpha) * - (x < static_cast(0)).template cast().eval(); - auto temp2 = (x >= static_cast(0)).template cast().eval(); + auto temp1 = + static_cast(alpha) * (out <= static_cast(0)).template cast(); + auto temp2 = (out > static_cast(0)).template cast(); dx.device(d) = dout * (temp1 + temp2).template cast(); } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; template @@ -1336,19 +1413,19 @@ struct LeakyReluGradGradFunctor : public BaseActivationFunctor { const framework::Tensor* Out, const framework::Tensor* ddX, framework::Tensor* ddOut, framework::Tensor* dOut, framework::Tensor* dX) const { - auto* d = dev.eigen_device(); - auto ddx = framework::EigenVector::Flatten(detail::Ref(ddX)); - auto x = framework::EigenVector::Flatten(detail::Ref(X)); if (ddOut) { + auto* d = dev.eigen_device(); + auto ddx = framework::EigenVector::Flatten(detail::Ref(ddX)); + auto out = framework::EigenVector::Flatten(detail::Ref(Out)); auto ddout = framework::EigenVector::Flatten(detail::Ref(ddOut)); ddout.device(*d) = ddx * - ((x >= static_cast(0)).template cast().eval() + + ((out > 
static_cast(0)).template cast() + static_cast(alpha) * - (x < static_cast(0)).template cast().eval()) + (out <= static_cast(0)).template cast()) .template cast(); } } - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; template @@ -1360,15 +1437,17 @@ struct SqrtGradGradFunctor : public BaseActivationFunctor { auto* d = dev.eigen_device(); auto ddx = framework::EigenVector::Flatten(detail::Ref(ddX)); auto out = framework::EigenVector::Flatten(detail::Ref(Out)); - if (ddOut) { - auto ddout = framework::EigenVector::Flatten(detail::Ref(ddOut)); - ddout.device(*d) = ddx * static_cast(0.5) / out; - } + // sqrt GradGrad: ddy = 0.5 * ddx / y, dy = -1 * dx * ddx + // calculate dy first, so ddy can inplace ddx if (dOut) { auto dx = framework::EigenVector::Flatten(detail::Ref(dX)); auto dout = framework::EigenVector::Flatten(detail::Ref(dOut)); dout.device(*d) = dx * ddx * static_cast(-1) / out; } + if (ddOut) { + auto ddout = framework::EigenVector::Flatten(detail::Ref(ddOut)); + ddout.device(*d) = ddx * static_cast(0.5) / out; + } } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; @@ -1382,15 +1461,17 @@ struct SquareGradGradFunctor : public BaseActivationFunctor { auto* d = dev.eigen_device(); auto ddx = framework::EigenVector::Flatten(detail::Ref(ddX)); auto x = framework::EigenVector::Flatten(detail::Ref(X)); - if (ddOut) { - auto ddout = framework::EigenVector::Flatten(detail::Ref(ddOut)); - ddout.device(*d) = ddx * static_cast(2) * x; - } + // square GradGrad: ddy=2x*ddx, dx=2dy*ddx + // calculate dx first, so ddy can inplace ddx if (dX) { auto dx = framework::EigenVector::Flatten(detail::Ref(dX)); auto dout = framework::EigenVector::Flatten(detail::Ref(dOut)); dx.device(*d) = ddx * static_cast(2) * dout; } + if (ddOut) { + auto ddout = framework::EigenVector::Flatten(detail::Ref(ddOut)); + ddout.device(*d) = ddx * static_cast(2) * x; + } } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; @@ -1512,6 +1593,97 @@ class SqrtDoubleGradKernel } }; +template +class PowKernel : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* X = nullptr; + framework::Tensor* Out = nullptr; + ExtractActivationTensor(context, &X, &Out); + Out->mutable_data(context.GetPlace()); + + auto x = framework::EigenVector::Flatten(detail::Ref(X)); + auto out = framework::EigenVector::Flatten(detail::Ref(Out)); + auto* place = + context.template device_context().eigen_device(); + Functor functor; + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = context.Attr(attr.first); + } + // get FactorTensor + auto* factor_tensor = context.HasInput("FactorTensor") + ? 
context.Input("FactorTensor") + : nullptr; + if (factor_tensor) { + auto* factor_data = factor_tensor->data(); + framework::Tensor cpu_factor_tensor; + if (platform::is_gpu_place(factor_tensor->place())) { + TensorCopySync(*factor_tensor, platform::CPUPlace(), + &cpu_factor_tensor); + factor_data = cpu_factor_tensor.data(); + } + auto factor = + std::vector(factor_data, factor_data + factor_tensor->numel()); + PADDLE_ENFORCE_EQ(factor.size(), 1, + "The shape of factor(tensor) MUST BE [1]."); + for (auto& attr : attrs) { + *attr.second = factor[0]; + } + } + functor(*place, x, out); + } +}; + +template +class PowGradKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor *X, *Out, *dOut; + framework::Tensor* dX = nullptr; + X = Out = dOut = nullptr; + ExtractActivationGradTensor(context, &X, &Out, &dOut, + &dX); + dX->mutable_data(context.GetPlace()); + auto dout = framework::EigenVector::Flatten(detail::Ref(dOut)); + auto out = framework::EigenVector::Flatten(detail::Ref(Out)); + auto dx = framework::EigenVector::Flatten(detail::Ref(dX)); + auto x = framework::EigenVector::Flatten(detail::Ref(X)); + auto* place = + context.template device_context().eigen_device(); + Functor functor; + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = context.Attr(attr.first); + } + // get FactorTensor + auto* factor_tensor = + context.HasInput("FactorTensor") + ? context.Input("FactorTensor") + : nullptr; + if (factor_tensor) { + auto* factor_data = factor_tensor->data(); + framework::Tensor cpu_factor_tensor; + if (platform::is_gpu_place(factor_tensor->place())) { + TensorCopySync(*factor_tensor, platform::CPUPlace(), + &cpu_factor_tensor); + factor_data = cpu_factor_tensor.data(); + } + auto factor = + std::vector(factor_data, factor_data + factor_tensor->numel()); + PADDLE_ENFORCE_EQ(factor.size(), 1, + "The shape of factor(tensor) MUST BE [1]."); + for (auto& attr : attrs) { + *attr.second = factor[0]; + } + } + functor(*place, x, out, dout, dx); + } +}; } // namespace operators } // namespace paddle @@ -1536,7 +1708,6 @@ class SqrtDoubleGradKernel __macro(log, Log, LogFunctor, LogGradFunctor); \ __macro(brelu, BRelu, BReluFunctor, BReluGradFunctor); \ __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \ - __macro(pow, Pow, PowFunctor, PowGradFunctor); \ __macro(stanh, STanh, STanhFunctor, STanhGradFunctor); \ __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor); \ __macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor); \ @@ -1548,4 +1719,5 @@ class SqrtDoubleGradKernel HardSigmoidGradFunctor); \ __macro(swish, Swish, SwishFunctor, SwishGradFunctor); \ __macro(thresholded_relu, ThresholdedRelu, ThresholdedReluFunctor, \ - ThresholdedReluGradFunctor); + ThresholdedReluGradFunctor); \ + __macro(hard_swish, HardSwish, HardSwishFunctor, HardSwishGradFunctor); diff --git a/paddle/fluid/operators/affine_channel_op.cc b/paddle/fluid/operators/affine_channel_op.cc index da063541..1476cfc2 100644 --- a/paddle/fluid/operators/affine_channel_op.cc +++ b/paddle/fluid/operators/affine_channel_op.cc @@ -238,21 +238,11 @@ class AffineChannelGradKernel : public framework::OpKernel { EigenVectorArrayMap dbias_e(dbias_d, C); if (layout == framework::DataLayout::kNCHW) { - // compute dx - int stride = C * HxW; - if (dx) { - for (int i = 0; i < N; i++) { - ConstEigenArrayMap dy_e(dy_d, HxW, C); - EigenArrayMap 
dx_e(dx_d, HxW, C); - dx_e = dy_e.rowwise() * scale_e.transpose(); - dy_d += stride; - dx_d += stride; - } - } // compute dscale and dbias + int stride = C * HxW; + auto* original_dy_d = dy_d; if (dscale && dbias) { auto* x_d = x->data(); - dy_d = dy->data(); for (int i = 0; i < N; i++) { ConstEigenArrayMap x_e(x_d, HxW, C); ConstEigenArrayMap dy_e(dy_d, HxW, C); @@ -270,14 +260,21 @@ class AffineChannelGradKernel : public framework::OpKernel { dy_d += stride; } } - } else { - int num = N * HxW; - ConstEigenArrayMap dy_e(dy_d, C, num); + // compute dx if (dx) { - EigenArrayMap dx_e(dx_d, C, num); - dx_e = dy_e.colwise() * scale_e; + dy_d = original_dy_d; + for (int i = 0; i < N; i++) { + ConstEigenArrayMap dy_e(dy_d, HxW, C); + EigenArrayMap dx_e(dx_d, HxW, C); + dx_e = dy_e.rowwise() * scale_e.transpose(); + dy_d += stride; + dx_d += stride; + } } + } else { + int num = N * HxW; + ConstEigenArrayMap dy_e(dy_d, C, num); // compute dscale and dbias if (dscale && dbias) { auto* x_d = x->data(); @@ -285,6 +282,12 @@ class AffineChannelGradKernel : public framework::OpKernel { dscale_e = (x_e * dy_e).rowwise().sum(); dbias_e = dy_e.rowwise().sum(); } + + // compute dx + if (dx) { + EigenArrayMap dx_e(dx_d, C, num); + dx_e = dy_e.colwise() * scale_e; + } } } }; @@ -295,10 +298,10 @@ class AffineChannelNoNeedBufferVarsInference using framework::NoNeedBufferVarsInference::NoNeedBufferVarsInference; private: - inline bool HasInput(const std::string& name) const { - auto& inputs = Inputs(); - auto iter = inputs.find(name); - if (iter == inputs.end() || iter->second.empty()) { + inline bool HasOutput(const std::string& name) const { + auto& outputs = Outputs(); + auto iter = outputs.find(name); + if (iter == outputs.end() || iter->second.empty()) { return false; } else { return iter->second[0] != framework::kEmptyVarName; @@ -306,9 +309,9 @@ class AffineChannelNoNeedBufferVarsInference } public: - std::unordered_set operator()() const { - if (!HasInput(framework::GradVarName("Scale")) && - !HasInput(framework::GradVarName("Bias"))) { + std::unordered_set operator()() const override { + if (!HasOutput(framework::GradVarName("Scale")) && + !HasOutput(framework::GradVarName("Bias"))) { return {"X"}; } else { return {}; @@ -316,6 +319,11 @@ class AffineChannelNoNeedBufferVarsInference } }; +DECLARE_INPLACE_OP_INFERER(AffineChannelInplaceInferer, {"X", "Out"}); +DECLARE_INPLACE_OP_INFERER(AffineChannelGradInplaceInferer, + {framework::GradVarName("Out"), + framework::GradVarName("X")}); + } // namespace operators } // namespace paddle @@ -323,9 +331,11 @@ namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; REGISTER_OPERATOR(affine_channel, ops::AffineChannelOp, - ops::AffineChannelOpMaker, ops::AffineChannelGradMaker); + ops::AffineChannelOpMaker, ops::AffineChannelGradMaker, + ops::AffineChannelInplaceInferer); REGISTER_OPERATOR(affine_channel_grad, ops::AffineChannelOpGrad, - ops::AffineChannelNoNeedBufferVarsInference); + ops::AffineChannelNoNeedBufferVarsInference, + ops::AffineChannelGradInplaceInferer); REGISTER_OP_CPU_KERNEL(affine_channel, ops::AffineChannelKernel, ops::AffineChannelKernel); diff --git a/paddle/fluid/operators/affine_channel_op.cu b/paddle/fluid/operators/affine_channel_op.cu index 6bc0a263..5e598071 100644 --- a/paddle/fluid/operators/affine_channel_op.cu +++ b/paddle/fluid/operators/affine_channel_op.cu @@ -151,11 +151,6 @@ class AffineChannelGradCUDAKernel : public framework::OpKernel { int grid1 = (num + block - 1) / block; int grid2 = 
std::min(C, max_blocks); if (layout == framework::DataLayout::kNCHW) { - if (dx) { - KeAffineChannelCUDA<<>>( - dy_d, s_d, nullptr, C, HxW, num, dx_d); - } if (dscale && dbias) { const T* x_d = x->data(); AffineChannelScaleBiasGradientCUDAKernel< @@ -163,12 +158,12 @@ class AffineChannelGradCUDAKernel : public framework::OpKernel { dev_ctx.stream()>>>( dy_d, x_d, N, C, HxW, ds_d, db_d); } - } else { if (dx) { - KeAffineChannelCUDA<<>>( dy_d, s_d, nullptr, C, HxW, num, dx_d); } + } else { if (dscale && dbias) { const T* x_d = x->data(); AffineChannelScaleBiasGradientCUDAKernel< @@ -176,6 +171,12 @@ class AffineChannelGradCUDAKernel : public framework::OpKernel { dev_ctx.stream()>>>( dy_d, x_d, N, C, HxW, ds_d, db_d); } + + if (dx) { + KeAffineChannelCUDA<<>>( + dy_d, s_d, nullptr, C, HxW, num, dx_d); + } } } }; diff --git a/paddle/fluid/operators/argsort_op.cu b/paddle/fluid/operators/argsort_op.cu index 7d5199aa..1a0b3038 100644 --- a/paddle/fluid/operators/argsort_op.cu +++ b/paddle/fluid/operators/argsort_op.cu @@ -16,7 +16,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/argsort_op.h" -#include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/cuda_primitives.h" diff --git a/paddle/fluid/operators/assign_op.cc b/paddle/fluid/operators/assign_op.cc index d9294048..ff423778 100644 --- a/paddle/fluid/operators/assign_op.cc +++ b/paddle/fluid/operators/assign_op.cc @@ -66,27 +66,47 @@ class AssignFunctor { const platform::DeviceContext &dev_ctx_; }; -class AssignOp : public framework::OperatorBase { +class AssignOp : public framework::OperatorWithKernel { public: AssignOp(const std::string &type, const framework::VariableNameMap &inputs, const framework::VariableNameMap &outputs, const framework::AttributeMap &attrs) - : OperatorBase(type, inputs, outputs, attrs) {} + : OperatorWithKernel(type, inputs, outputs, attrs) {} - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - auto *x = scope.FindVar(Input("X")); + void InferShape(framework::InferShapeContext *ctx) const override { + if (ctx->HasInput("X")) { + auto type = ctx->GetInputsVarType("X")[0]; + if (type == framework::proto::VarType::SELECTED_ROWS || + type == framework::proto::VarType::LOD_TENSOR) { + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + if (type == framework::proto::VarType::LOD_TENSOR) { + ctx->ShareLoD("X", /*->*/ "Out"); + } + } + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); + } +}; + +class AssignKernel { + public: + void operator()(const framework::ExecutionContext &ctx) const { + auto *x = ctx.InputVar("X"); if (x == nullptr) { return; } - auto *out = scope.FindVar(Output("Out")); + auto *out = ctx.OutputVar("Out"); PADDLE_ENFORCE( out != nullptr, "The Output(Out) should not be null if the Input(X) is set."); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); + auto &dev_ctx = *pool.Get(ctx.GetPlace()); framework::VisitVarType(*x, AssignFunctor(out, dev_ctx)); } @@ -110,19 +130,6 @@ raise error if the type is not listed above. 
} }; -class AssignInferShape : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *context) const override { - if (context->HasInput("X")) { - auto type = context->GetInputsVarType("X")[0]; - if (type == framework::proto::VarType::SELECTED_ROWS || - type == framework::proto::VarType::LOD_TENSOR) { - context->SetOutputDim("Out", context->GetInputDim("X")); - } - } - } -}; - class AssignGradMaker : public framework::SingleGradOpDescMaker { public: using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; @@ -137,9 +144,20 @@ class AssignGradMaker : public framework::SingleGradOpDescMaker { } }; +DECLARE_INPLACE_OP_INFERER(AssignOpInplaceInferer, {"X", "Out"}); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(assign, ops::AssignOp, ops::AssignGradMaker, - ops::AssignInferShape, ops::AssignOpProtoMaker); + ops::AssignOpProtoMaker, ops::AssignOpInplaceInferer); +REGISTER_OP_CPU_KERNEL_FUNCTOR(assign, float, ops::AssignKernel, double, + ops::AssignKernel, int, ops::AssignKernel, + int64_t, ops::AssignKernel); + +#ifdef PADDLE_WITH_CUDA +REGISTER_OP_CUDA_KERNEL_FUNCTOR(assign, float, ops::AssignKernel, double, + ops::AssignKernel, int, ops::AssignKernel, + int64_t, ops::AssignKernel); +#endif diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index f991bef9..c6d98f1f 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/cpu_vec.h" -#include "paddle/fluid/operators/math/fc_compute.h" +#include "paddle/fluid/operators/math/fc.h" #include "paddle/fluid/platform/cpu_info.h" namespace paddle { @@ -339,10 +339,13 @@ class AttentionLSTMKernel : public framework::OpKernel { T* lstm_x_data = lstm_x->mutable_data(ctx.GetPlace()); T* lstm_out_data = lstm_out->mutable_data(ctx.GetPlace()); + auto blas = math::GetBlas(ctx); + // x(TxM) * fc (Mx1) part of atten_wgt(M+D)x1 - auto blas = math::GetBlas(ctx); - math::FCCompute(blas, total_T, 1, M, x_data, atten_w_data, - atted_x_data, atten_b_data); + auto& dev_ctx = ctx.template device_context(); + math::FCFunctor fc; + fc(dev_ctx, total_T, 1, M, x_data, atten_w_data, atted_x_data, + atten_b_data); const T* cur_atten_x_data = atted_x_data; const T* cur_x_data = x_data; @@ -369,8 +372,7 @@ class AttentionLSTMKernel : public framework::OpKernel { // 1d. softmax vec_softmax(seq_len, fc_out_data, fc_out_data); // mul x(seq_len*M) and sum pool - math::FCCompute(blas, 1, M, seq_len, fc_out_data, - cur_x_data, lstm_x_data); + fc(dev_ctx, 1, M, seq_len, fc_out_data, cur_x_data, lstm_x_data); /// 2. 
compute LSTM step // lstm weight : concat[forget , input , output , tilde] @@ -419,8 +421,7 @@ class AttentionLSTMKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OPERATOR(attention_lstm, ops::AttentionLSTMOp, - ops::AttentionLSTMOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::AttentionLSTMOpMaker); REGISTER_OP_CPU_KERNEL(attention_lstm, ops::AttentionLSTMKernel, ops::AttentionLSTMKernel); diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index f6295337..9a1d724c 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -245,8 +245,8 @@ class BatchNormKernel variance_out->mutable_data(ctx.GetPlace()), C); if ((N * sample_size) == 1) { - LOG(WARNING) << "Only 1 element in normalization dimension, " - << "we skip the batch norm calculation, let y = x."; + // Only 1 element in normalization dimension, + // we skip the batch norm calculation, let y = x. framework::TensorCopy(*x, ctx.GetPlace(), y); return; } @@ -496,6 +496,21 @@ class BatchNormGradKernel int scale_coefff = use_global_stats ? 1 : N * sample_size; const auto scale_inv_var_nhw = scale_arr * inv_var_arr / scale_coefff; + Tensor dy_sum; + dy_sum.Resize({C}); + dy_sum.mutable_data(ctx.GetPlace()); + EigenVectorArrayMap dy_sum_arr(dy_sum.mutable_data(ctx.GetPlace()), + C); + + Tensor dy_mul_x_sub_mean_mul_invstd_sum; + dy_mul_x_sub_mean_mul_invstd_sum.Resize({C}); + dy_mul_x_sub_mean_mul_invstd_sum.mutable_data(ctx.GetPlace()); + EigenVectorArrayMap dy_mul_x_sub_mean_mul_invstd_sum_arr( + dy_mul_x_sub_mean_mul_invstd_sum.mutable_data(ctx.GetPlace()), C); + + dy_sum_arr.setZero(); + dy_mul_x_sub_mean_mul_invstd_sum_arr.setZero(); + switch (data_layout) { case DataLayout::kNCHW: { ConstEigenArrayMap x_arr(x->data(), sample_size, N * C); @@ -504,23 +519,27 @@ class BatchNormGradKernel sample_size, N * C); d_x_arr.setZero(); + for (int nc = 0; nc < N * C; ++nc) { + int c = nc % C; + dy_sum_arr(c) += d_y_arr.col(nc).sum(); + dy_mul_x_sub_mean_mul_invstd_sum_arr(c) += + ((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) * d_y_arr.col(nc)) + .sum(); + } + if (d_scale && d_bias) { - for (int nc = 0; nc < N * C; ++nc) { - int c = nc % C; - d_bias_arr(c) += d_y_arr.col(nc).sum(); - d_scale_arr(c) += ((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) * - d_y_arr.col(nc)) - .sum(); - } + d_bias_arr = dy_sum_arr; + d_scale_arr = dy_mul_x_sub_mean_mul_invstd_sum_arr; } + if (!use_global_stats) { for (int nc = 0; nc < N * C; ++nc) { int c = nc % C; d_x_arr.col(nc) += scale_inv_var_nhw(c) * - (d_y_arr.col(nc) * N * sample_size - d_bias_arr(c) - - (x_arr.col(nc) - mean_arr[c]) * d_scale_arr(c) * - inv_var_arr(c)); + (d_y_arr.col(nc) * N * sample_size - dy_sum_arr(c) - + (x_arr.col(nc) - mean_arr[c]) * + dy_mul_x_sub_mean_mul_invstd_sum_arr(c) * inv_var_arr(c)); } } else { for (int nc = 0; nc < N * C; ++nc) { @@ -537,27 +556,24 @@ class BatchNormGradKernel N * sample_size); d_x_arr.setZero(); - const auto d_y_row_sum = d_y_arr.rowwise().sum(); - const auto x_minus_mean = x_arr.colwise() - mean_arr; - const auto d_y_mul_x_minus_mean_row_sum = - (d_y_arr * x_minus_mean).rowwise().sum(); - const auto inv_var_sqr = inv_var_arr * inv_var_arr; + for (int nhw = 0; nhw < N * sample_size; ++nhw) { + dy_sum_arr += d_y_arr.col(nhw); + dy_mul_x_sub_mean_mul_invstd_sum_arr += + (x_arr.col(nhw) - mean_arr) * inv_var_arr * d_y_arr.col(nhw); + } if (d_scale && d_bias) { - for (int nhw = 0; nhw < N * sample_size; ++nhw) { - d_bias_arr += 
d_y_arr.col(nhw); - d_scale_arr += - (x_arr.col(nhw) - mean_arr) * inv_var_arr * d_y_arr.col(nhw); - } + d_bias_arr = dy_sum_arr; + d_scale_arr = dy_mul_x_sub_mean_mul_invstd_sum_arr; } if (!use_global_stats) { for (int nhw = 0; nhw < N * sample_size; ++nhw) { d_x_arr.col(nhw) += scale_inv_var_nhw * - (d_y_arr.col(nhw) * N * sample_size - d_y_row_sum - - x_minus_mean.col(nhw) * inv_var_sqr * - d_y_mul_x_minus_mean_row_sum); + (d_y_arr.col(nhw) * N * sample_size - dy_sum_arr - + (x_arr.col(nhw) - mean_arr) * + dy_mul_x_sub_mean_mul_invstd_sum_arr * inv_var_arr); } } else { for (int nhw = 0; nhw < N * sample_size; ++nhw) { @@ -598,36 +614,13 @@ std::unique_ptr BatchNormGradMaker::Apply() const { return std::unique_ptr(op); } -class BatchNormInplaceInToOut : public framework::InplaceOpInference { - public: - std::unordered_map operator()( - const framework::OpDesc &op_desc, bool use_cuda) const override { - return {{"Mean", "MeanOut"}, {"Variance", "VarianceOut"}, {"X", "Y"}}; - } -}; - -class BatchNormGradInplaceInToOut : public framework::InplaceOpInference { - public: - std::unordered_map operator()( - const framework::OpDesc &op_desc, bool use_cuda) const override { - // Scale, Bias, SavedMean, SavedVariance shape is [batch_size, C] - return { - {framework::GradVarName("Y"), framework::GradVarName("X")}, - {"SavedMean", framework::GradVarName("Scale")}, - {"SavedVariance", framework::GradVarName("Bias")}, - }; - } -}; - } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, - ops::BatchNormOpInferVarType, ops::BatchNormGradMaker) -// ops::BatchNormInplaceInToOut); -REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp) -// ops::BatchNormGradInplaceInToOut); + ops::BatchNormOpInferVarType, ops::BatchNormGradMaker); +REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp); REGISTER_OP_CPU_KERNEL( batch_norm, ops::BatchNormKernel, diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index a78a6726..95d7f23b 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -23,15 +23,7 @@ limitations under the License. */ #include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/float16.h" -// CUDNN_BATCHNORM_SPATIAL_PERSISTENT in batchnorm. This mode can be faster in -// some tasks because an optimized path may be selected for CUDNN_DATA_FLOAT -// and CUDNN_DATA_HALF data types, compute capability 6.0 or higher. The -// reason we set it to false by default is that this mode may use scaled -// atomic integer reduction that may cause a numerical overflow for certain -// input data range. -DEFINE_bool(cudnn_batchnorm_spatial_persistent, false, - "Whether enable CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode for cudnn " - "batch_norm, default is False."); +DECLARE_bool(cudnn_batchnorm_spatial_persistent); namespace paddle { namespace operators { @@ -160,8 +152,8 @@ class BatchNormKernel functor(dev_ctx, saved_variance, static_cast>(0)); if ((N * H * W * D) == 1) { - LOG(WARNING) << "Only 1 element in normalization dimension, " - << "we skip the batch norm calculation, let y = x."; + // Only 1 element in normalization dimension, + // skip the batch norm calculation, let y = x. framework::TensorCopy(*x, ctx.GetPlace(), y); } else { double this_factor = 1. 
- momentum; @@ -242,6 +234,63 @@ static __global__ void KeBNBackwardData(const T *dy, } } +template +static __global__ void BNBackwardData(const T *dy, + const BatchNormParamType *scale, + const BatchNormParamType *mean, + const T *x, + const BatchNormParamType *variance, + const int C, const int N, const int HxW, + T *dx) { + const int outer_size = C; + const int inner_size = N * HxW; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage dy_storage; + __shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage; + __shared__ BatchNormParamType dy_sum_val; + __shared__ BatchNormParamType dy_x_sub_mean_sum_val; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType inv_var_i = variance[i]; + BatchNormParamType mean_i = mean[i]; + BatchNormParamType dy_sum = static_cast>(0); + BatchNormParamType dy_x_sub_mean_sum = + static_cast>(0); + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType dy_i = + static_cast>(dy[index]); + dy_sum += dy_i; + dy_x_sub_mean_sum += + dy_i * (static_cast>(x[index]) - mean_i); + } + + dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); + dy_x_sub_mean_sum = BlockReduce(dy_x_sub_mean_storage) + .Reduce(dy_x_sub_mean_sum, cub::Sum()); + + if (threadIdx.x == 0) { + dy_sum_val = dy_sum; + dy_x_sub_mean_sum_val = dy_x_sub_mean_sum; + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + dx[index] = + (static_cast>(dy[index]) - + dy_sum_val / static_cast>(inner_size) - + (static_cast>(x[index]) - mean_i) * + dy_x_sub_mean_sum_val * inv_var_i * inv_var_i / inner_size) * + scale[i] * inv_var_i; + } + } +} + template class BatchNormGradKernel : public framework::OpKernel { @@ -290,6 +339,13 @@ class BatchNormGradKernel } auto &dev_ctx = ctx.template device_context(); + const int num = x->numel(); + const int block = 512; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + int grid1 = (num + block - 1) / block; + int grid2 = std::min(C, max_blocks); + if (!use_global_stats) { if ((N * H * W * D) == 1) { framework::TensorCopy(*d_y, ctx.GetPlace(), d_x); @@ -333,21 +389,43 @@ class BatchNormGradKernel const auto *saved_mean = ctx.Input("SavedMean"); const auto *saved_var = ctx.Input("SavedVariance"); - const void *saved_mean_data = + const auto *saved_mean_data = saved_mean->template data>(); - const void *saved_var_data = + const auto *saved_var_data = saved_var->template data>(); - CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward( - dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), - CudnnDataType::kZero(), CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, x->template data(), - data_desc_, d_y->template data(), data_desc_, - d_x->template mutable_data(ctx.GetPlace()), bn_param_desc_, - scale->template data>(), - d_scale->template mutable_data>(ctx.GetPlace()), - d_bias->template mutable_data>(ctx.GetPlace()), - epsilon, saved_mean_data, saved_var_data)); + if (d_scale && d_bias) { + CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward( + dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), + CudnnDataType::kZero(), CudnnDataType::kOne(), + CudnnDataType::kZero(), data_desc_, 
x->template data(), + data_desc_, d_y->template data(), data_desc_, + d_x->template mutable_data(ctx.GetPlace()), bn_param_desc_, + scale->template data>(), + d_scale->template mutable_data>( + ctx.GetPlace()), + d_bias->template mutable_data>( + ctx.GetPlace()), + epsilon, saved_mean_data, saved_var_data)); + } else { + if (data_layout == framework::DataLayout::kNCHW) { + if (d_x) { + BNBackwardData<<< + grid2, block, 0, dev_ctx.stream()>>>( + d_y->data(), scale->data>(), + saved_mean_data, x->data(), saved_var_data, C, N, H * W * D, + d_x->data()); + } + } else { + if (d_x) { + BNBackwardData<<< + grid2, block, 0, dev_ctx.stream()>>>( + d_y->data(), scale->data>(), + saved_mean_data, x->data(), saved_var_data, C, N, H * W * D, + d_x->data()); + } + } + } // clean when exit. CUDNN_ENFORCE( @@ -363,13 +441,6 @@ class BatchNormGradKernel const auto *running_var_data = running_var->template data>(); - const int num = x->numel(); - const int block = 512; - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(max_threads / block, 1); - int grid1 = (num + block - 1) / block; - int grid2 = std::min(C, max_blocks); - if (data_layout == framework::DataLayout::kNCHW) { if (d_x) { KeBNBackwardData<<< diff --git a/paddle/fluid/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h index 6e89d73e..6c7dbe0d 100644 --- a/paddle/fluid/operators/batch_norm_op.h +++ b/paddle/fluid/operators/batch_norm_op.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/norm_utils.h" namespace paddle { namespace operators { @@ -96,26 +97,5 @@ class BatchNormGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext &ctx) const override; }; -inline void ExtractNCWHD(const framework::DDim &dims, - const DataLayout &data_layout, int *N, int *C, int *H, - int *W, int *D) { - *N = dims[0]; - if (dims.size() == 2) { - *C = dims[1]; - *H = 1; - *W = 1; - *D = 1; - } else { - *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; - *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; - *W = dims.size() > 3 - ? (data_layout == DataLayout::kNCHW ? dims[3] : dims[2]) - : 1; - *D = dims.size() > 4 - ? (data_layout == DataLayout::kNCHW ? dims[4] : dims[3]) - : 1; - } -} - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/bpr_loss_op.h b/paddle/fluid/operators/bpr_loss_op.h index f9570e4e..a0166659 100644 --- a/paddle/fluid/operators/bpr_loss_op.h +++ b/paddle/fluid/operators/bpr_loss_op.h @@ -28,7 +28,7 @@ using Tensor = framework::Tensor; template struct TolerableValue { HOSTDEVICE T operator()(const T& x) const { - PADDLE_ASSERT(std::is_floating_point::value); + PADDLE_ENFORCE_EQ(std::is_floating_point::value, true); const T kApproInf = 1e20; if (x == INFINITY) return kApproInf; if (x == -INFINITY) return -kApproInf; diff --git a/paddle/fluid/operators/center_loss_op.cc b/paddle/fluid/operators/center_loss_op.cc new file mode 100644 index 00000000..bf766a05 --- /dev/null +++ b/paddle/fluid/operators/center_loss_op.cc @@ -0,0 +1,157 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/center_loss_op.h"
+#include <memory>
+#include <string>
+
+namespace paddle {
+namespace operators {
+class CenterLossOp : public framework::OperatorWithKernel {
+ public:
+  CenterLossOp(const std::string &type,
+               const framework::VariableNameMap &inputs,
+               const framework::VariableNameMap &outputs,
+               const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of CenterLoss should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+
+    PADDLE_ENFORCE(ctx->HasInput("CenterUpdateRate"),
+                   "Input(CenterUpdateRate) of CenterLoss should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasInput("Label"),
+                   "Input(Label) of CenterLoss should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasInput("Centers"),
+                   "Input(Centers) of CenterLoss should not be null.");
+
+    PADDLE_ENFORCE(
+        ctx->HasOutput("SampleCenterDiff"),
+        "Output(SampleCenterDiff) of CenterLoss should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasOutput("Loss"),
+                   "Output(Loss) of CenterLoss should not be null.");
+
+    PADDLE_ENFORCE(
+        ctx->HasOutput("CentersOut"),
+        "Output(CentersOut) of CenterLoss should not be null; it shares "
+        "data with Centers.");
+
+    ctx->SetOutputDim("SampleCenterDiff",
+                      {x_dims[0], product(x_dims) / x_dims[0]});
+    ctx->SetOutputDim("CentersOut", ctx->GetInputDim("Centers"));
+    ctx->SetOutputDim("Loss", {x_dims[0], 1});
+    ctx->ShareLoD("X", /*->*/ "Loss");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   ctx.device_context());
+  }
+};
+
+class CenterLossOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor) Input tensor of center_loss operator.");
+    AddInput("Label", "(Tensor) Input tensor of center_loss operator.");
+    AddInput("Centers", "(Tensor) Input tensor of center_loss operator.");
+    AddInput("CenterUpdateRate",
+             "(Tensor) Input tensor of center_loss operator.");
+
+    AddOutput("CentersOut", "(Tensor) Output tensor of center_loss operator.");
+    AddOutput("SampleCenterDiff",
+              "(Tensor) Output tensor of center_loss operator.");
+    AddOutput("Loss", "(Tensor) Output tensor of center_loss operator.");
+
+    AddAttr<int>("cluster_num",
+                 "The output cluster num of the center_loss operator.");
+    AddAttr<bool>("need_update", "whether to update the center info.");
+    AddComment(R"DOC(
+**CenterLoss operator**
+Implementation of the center loss function from the paper
+<<A Discriminative Feature Learning Approach for Deep Face Recognition>>.
+The per-sample loss computed here is: loss = 1/2 * (x - y)^2, where x(X) is
+the deep feature (the output of the last hidden layer) and y is the center
+of the class selected by Label.
+)DOC");
+  }
+};
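As a quick reference for the math in this new op, here is a minimal standalone C++ sketch (not part of the patch; the data, sizes, and names are illustrative) of what the center_loss kernels below compute for one mini-batch:

#include <cstdio>
#include <vector>

int main() {
  const int D = 2;  // feature dim
  std::vector<std::vector<double>> x = {{1, 2}, {3, 0}, {2, 2}};
  std::vector<int> label = {0, 1, 0};
  std::vector<std::vector<double>> centers = {{0, 0}, {4, 1}};
  std::vector<std::vector<double>> acc(centers.size(),
                                       std::vector<double>(D, 0.0));
  std::vector<int> count(centers.size(), 1);  // matches the kernel's 1-init
  const double alpha = 0.1;                   // CenterUpdateRate

  for (size_t i = 0; i < x.size(); ++i) {
    double loss = 0;
    for (int d = 0; d < D; ++d) {
      const double diff = x[i][d] - centers[label[i]][d];
      acc[label[i]][d] += diff;  // accumulated per-center diff
      loss += 0.5 * diff * diff;
    }
    ++count[label[i]];
    std::printf("loss[%zu] = %.3f\n", i, loss);
  }
  // Center update step, applied only when need_update is set.
  for (size_t j = 0; j < centers.size(); ++j)
    for (int d = 0; d < D; ++d) centers[j][d] += alpha * acc[j][d] / count[j];
  std::printf("c0 = (%.3f, %.3f)\n", centers[0][0], centers[0][1]);
  return 0;
}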
"Input(Loss) should not be null"); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X) should not be null"); + + auto x_dims = ctx->GetInputDim("X"); + auto x_grad_name = framework::GradVarName("X"); + + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + ctx.Input("SampleCenterDiff")->type(), ctx.device_context()); + } +}; + +class CenterLossOpGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr retv(new framework::OpDesc()); + retv->SetType("center_loss_grad"); + retv->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss")); + retv->SetInput("SampleCenterDiff", Output("SampleCenterDiff")); + retv->SetInput("X", Input("X")); + retv->SetOutput(framework::GradVarName("X"), InputGrad("X")); + + retv->SetAttrMap(Attrs()); + return retv; + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CPUCtx = paddle::platform::CPUDeviceContext; + +REGISTER_OPERATOR(center_loss, ops::CenterLossOp, ops::CenterLossOpMaker, + ops::CenterLossOpGradMaker); + +REGISTER_OPERATOR(center_loss_grad, ops::CenterLossGradOp); + +REGISTER_OP_CPU_KERNEL(center_loss, ops::CenterLossKernel, + ops::CenterLossKernel); + +REGISTER_OP_CPU_KERNEL(center_loss_grad, + ops::CenterLossGradKernel, + ops::CenterLossGradKernel); diff --git a/paddle/fluid/operators/center_loss_op.cu b/paddle/fluid/operators/center_loss_op.cu new file mode 100644 index 00000000..10b65fa2 --- /dev/null +++ b/paddle/fluid/operators/center_loss_op.cu @@ -0,0 +1,146 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+
+#include <vector>
+#include "paddle/fluid/operators/center_loss_op.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/gpu_info.h"
+namespace paddle {
+namespace operators {
+
+using platform::PADDLE_CUDA_NUM_THREADS;
+
+template <typename T, int BlockDimX, int BlockDimY, int GridDimX>
+__global__ void ComputeDifferent(T *centers_diff, const T *X, const T *centers,
+                                 const int64_t *ids, const int64_t N,
+                                 const int64_t K, const int64_t D) {
+  int idx = threadIdx.x;
+  int idy = blockIdx.x + threadIdx.y * GridDimX;
+
+  while (idy < K) {
+    int64_t id = ids[idy];
+    PADDLE_ENFORCE(id >= 0, "received id:", id);
+    PADDLE_ENFORCE(id < N, "received id:", id);
+    T *out = centers_diff + idy * D;
+    const T *x = X + idy * D;
+    const T *cent = centers + id * D;
+    for (int i = idx; i < D; i += BlockDimX) {
+      out[i] = x[i] - cent[i];
+    }
+    idy += BlockDimY * GridDimX;
+  }
+}
+
+template <typename T, int BlockDimX, int BlockDimY, int GridDimX>
+__global__ void UpdateCenters(T *centers, T *centers_diff, const int64_t *ids,
+                              const int64_t N, const int64_t K, const int64_t D,
+                              const T *alpha) {
+  int idx = threadIdx.x;
+  int idy = blockIdx.x + threadIdx.y * GridDimX;
+  while (idy < K) {
+    int count = 1;
+    int64_t id = ids[idy];
+    PADDLE_ENFORCE(id >= 0, "received id:", id);
+    PADDLE_ENFORCE(id < N, "received id:", id);
+
+    for (int i = 0; i < K; i++) {
+      if (ids[i] == id) {
+        count++;
+      }
+    }
+    const T *diff = centers_diff + idy * D;
+    T *cent = centers + id * D;
+    for (int i = idx; i < D; i += BlockDimX) {
+      paddle::platform::CudaAtomicAdd(&cent[i], alpha[0] * diff[i] / count);
+    }
+    idy += BlockDimY * GridDimX;
+  }
+}
+
+template <typename DeviceContext, typename T>
+class CenterLossCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto &device_context = ctx.template device_context<DeviceContext>();
+    auto stream = device_context.stream();
+    auto *X = ctx.Input<Tensor>("X");  // deep feature
+    auto *labels = ctx.Input<Tensor>("Label");
+    auto *centers = ctx.Input<Tensor>("Centers");
+    auto *update_rate = ctx.Input<Tensor>("CenterUpdateRate");
+    int cluster_num = ctx.Attr<int>("cluster_num");
+    auto *lr_center = update_rate->data<T>();
+    bool need_update = static_cast<bool>(ctx.Attr<bool>("need_update"));
+
+    auto x_data = X->data<T>();
+    auto label_data = labels->data<int64_t>();
+
+    auto x_dims = X->dims();
+    int batch_size = x_dims[0];
+    const int deep_feat_dim = x_dims[1];
+
+    auto *centers_diff = ctx.Output<Tensor>("SampleCenterDiff");
+    auto centers_diff_data = centers_diff->mutable_data<T>(ctx.GetPlace());
+
+    auto centers_data = centers->data<T>();
+    auto centers_dim = centers->dims();
+    auto *out_loss = ctx.Output<Tensor>("Loss");
+    auto loss_data = out_loss->mutable_data<T>(ctx.GetPlace());
+
+    auto *centers_out = ctx.Output<Tensor>("CentersOut");
+    auto *centers_out_data = centers_out->mutable_data<T>(ctx.GetPlace());
+
+    auto ctx_place = ctx.GetPlace();
+    if (centers != centers_out) {
+      framework::TensorCopy(
+          *static_cast<const framework::Tensor *>(centers), ctx_place,
+          *platform::DeviceContextPool::Instance().Get(ctx_place),
+          static_cast<framework::Tensor *>(centers_out));
+    }
+
+    int64_t numel = X->numel();
+
+    size_t N = centers->dims()[0];
+    size_t D = centers->dims()[1];
+    size_t K = labels->numel();
+
+    dim3 threads(128, 8);
+    dim3 grids(8, 1);
+
+    ComputeDifferent<T, 128, 8, 8><<<grids, threads, 0, stream>>>(
+        centers_diff_data, x_data, centers_data, label_data, N, K, D);
+
+    auto &place = *ctx.template device_context<DeviceContext>().eigen_device();
+    auto sub_result = EigenMatrix<T>::From(*centers_diff);
+
+    auto sub_res_pow2 = (sub_result * sub_result) / T(2.0);
+    auto z = EigenVector<T>::Flatten(*out_loss);
+    z.device(place) = sub_res_pow2.sum(Eigen::array<int, 1>({{1}}));
+    if (need_update) {
+      UpdateCenters<T, 128, 8, 8><<<grids, threads, 0, stream>>>(
+          centers_out_data,
centers_diff_data, label_data, N, K, D, lr_center); + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using GPUCtx = paddle::platform::CUDADeviceContext; +REGISTER_OP_CUDA_KERNEL(center_loss, ops::CenterLossCUDAKernel, + ops::CenterLossCUDAKernel); + +REGISTER_OP_CUDA_KERNEL(center_loss_grad, + ops::CenterLossGradKernel, + ops::CenterLossGradKernel); diff --git a/paddle/fluid/operators/center_loss_op.h b/paddle/fluid/operators/center_loss_op.h new file mode 100644 index 00000000..f134bd0c --- /dev/null +++ b/paddle/fluid/operators/center_loss_op.h @@ -0,0 +1,155 @@ +/*Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/functors.h" +#include "paddle/fluid/platform/transform.h" +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; +template +using EigenMatrix = framework::EigenMatrix; + +template +struct SubFunctor { + inline HOSTDEVICE T operator()(T a, T b) const { return a - b; } +}; + +template +class CenterLossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *X = ctx.Input("X"); // deep feature + auto *labels = ctx.Input("Label"); + auto *centers = ctx.Input("Centers"); + auto *update_rate = ctx.Input("CenterUpdateRate"); + int cluster_num = ctx.Attr("cluster_num"); + auto *lr_center = update_rate->data(); + T alpha = lr_center[0]; + bool need_update = static_cast(ctx.Attr("need_update")); + + auto x_data = X->data(); + auto label_data = labels->data(); + + auto centers_dim = centers->dims(); + auto centers_data = centers->data(); + + auto x_dims = X->dims(); + int batch_size = x_dims[0]; + int deep_feat_dim = x_dims[1]; + + auto centers_diff = ctx.Output("SampleCenterDiff"); + auto centers_diff_data = centers_diff->mutable_data(ctx.GetPlace()); + auto *out_loss = ctx.Output("Loss"); + + auto *centers_out = ctx.Output("CentersOut"); + auto *centers_out_data = centers_out->mutable_data(ctx.GetPlace()); + + if (centers_out_data != centers_data) { + int size = centers_out->numel() * sizeof(T); + memcpy(centers_out_data, centers_data, size); + } + + std::vector center_update_count(cluster_num, 1); + auto &dev_ctx = ctx.template device_context(); + + auto loss_data = out_loss->mutable_data(ctx.GetPlace()); + + Tensor centers_diffacc; // used to accumulate all diff + auto centers_diffacc_data = + centers_diffacc.mutable_data(centers_dim, ctx.GetPlace()); + int numel = centers_diffacc.numel(); + std::memset(centers_diffacc_data, 0, sizeof(T) * numel); + + auto blas = math::GetBlas(ctx); + int tLabel; + + const T *x_index; + const T *center_index; + T *center_out_index; + T *center_loss_diff_index; + T *acc_index; + 
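+    // Per-sample math implemented by the loop below (sketch):
+    //   diff_i = x_i - centers[label_i]
+    //   loss_i = 0.5 * dot(diff_i, diff_i)
+    // When need_update is set, the accumulated diffs later move each
+    // center: centers[j] += alpha * acc_j / count_j, where count_j starts
+    // at 1 and grows by one for every sample with label j.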
platform::Transform trans; + + for (int i = 0; i < batch_size; ++i) { + tLabel = label_data[i]; + center_update_count[tLabel]++; + x_index = x_data + i * deep_feat_dim; // xi index + center_index = centers_data + tLabel * deep_feat_dim; // center index + center_loss_diff_index = centers_diff_data + i * deep_feat_dim; + trans(dev_ctx, x_index, x_index + deep_feat_dim, center_index, + center_loss_diff_index, SubFunctor()); + + acc_index = centers_diffacc_data + tLabel * deep_feat_dim; + blas.VADD(deep_feat_dim, center_loss_diff_index, acc_index, + acc_index); // accumulate + loss_data[i] = blas.DOT(deep_feat_dim, center_loss_diff_index, + center_loss_diff_index) / + T(2.0); + } + + // update centers data + if (need_update == true) { + for (int i = 0; i < cluster_num; i++) { + acc_index = centers_diffacc_data + i * deep_feat_dim; + center_out_index = centers_out_data + i * deep_feat_dim; + T scale = alpha / center_update_count[i]; + blas.SCAL(deep_feat_dim, scale, acc_index); + blas.VADD(deep_feat_dim, acc_index, center_out_index, center_out_index); + } + } + } +}; + +template +class CenterLossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *in0 = context.Input("SampleCenterDiff"); + auto *in1 = context.Input(framework::GradVarName("Loss")); + auto *x_g = context.Output(framework::GradVarName("X")); + auto sub_result = EigenMatrix::From(*in0); + auto out_grad = EigenMatrix::From(*in1); + + auto x_dims = x_g->dims(); + int cols = x_g->numel() / x_dims[0]; + // calculate gradient + auto grad_mat = + (out_grad.broadcast(Eigen::array({{1, cols}}))) * sub_result; + + // propagate back to input + auto &eigen_place = + *context.template device_context().eigen_device(); + x_g->mutable_data(context.GetPlace()); + // eigen matrix + auto x_grad = + EigenMatrix::From(*x_g, framework::make_ddim({x_dims[0], cols})); + x_grad.device(eigen_place) = grad_mat; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/clip_op.cc b/paddle/fluid/operators/clip_op.cc index 4fc6ae36..d51f676c 100644 --- a/paddle/fluid/operators/clip_op.cc +++ b/paddle/fluid/operators/clip_op.cc @@ -93,13 +93,18 @@ class ClipGradOpDescMaker : public framework::SingleGradOpDescMaker { } }; +DECLARE_INPLACE_OP_INFERER(ClipInplaceInferer, {"X", "Out"}); +DECLARE_INPLACE_OP_INFERER(ClipGradInplaceInferer, + {framework::GradVarName("Out"), + framework::GradVarName("X")}); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(clip, ops::ClipOp, ops::ClipOpMaker, - ops::ClipGradOpDescMaker); -REGISTER_OPERATOR(clip_grad, ops::ClipOpGrad); + ops::ClipGradOpDescMaker, ops::ClipInplaceInferer); +REGISTER_OPERATOR(clip_grad, ops::ClipOpGrad, ops::ClipGradInplaceInferer); REGISTER_OP_CPU_KERNEL( clip, ops::ClipKernel); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/collective/c_allgather_op.cu.cc b/paddle/fluid/operators/collective/c_allgather_op.cu.cc index 330219cd..14e2741e 100644 --- a/paddle/fluid/operators/collective/c_allgather_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allgather_op.cu.cc @@ -35,10 +35,10 @@ class CAllGatherOpCUDAKernel : public framework::OpKernel { int nranks = ctx.Attr("nranks"); int rid = ctx.Attr("ring_id"); - auto comm = platform::NCCLCommContext::Instance().Get(rid); + auto place = ctx.GetPlace(); + auto comm = platform::NCCLCommContext::Instance().Get(rid, place); PADDLE_ENFORCE_EQ(nranks, comm->nranks()); - auto place = 
ctx.GetPlace(); framework::DDim out_dims = in->dims(); out_dims[0] *= nranks; out->mutable_data(out_dims, place); @@ -55,7 +55,7 @@ class CAllGatherOpCUDAKernel : public framework::OpKernel { stream = comm->stream(); } - PADDLE_ENFORCE(platform::dynload::ncclAllGather( + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( send_buff, recv_buff, send_numel, static_cast(dtype), comm->comm(), stream)); #else diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index 1db5f155..02f6210c 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -70,7 +70,7 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel { void* recvbuff = out->mutable_data(place); int rid = ctx.Attr("ring_id"); - auto comm = platform::NCCLCommContext::Instance().Get(rid); + auto comm = platform::NCCLCommContext::Instance().Get(rid, place); cudaStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { @@ -102,7 +102,7 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel { PADDLE_THROW("Invalid reduce type: %d", red_type); } - PADDLE_ENFORCE(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream)); #else PADDLE_THROW("PaddlePaddle should compile with GPU."); diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc index c0f5bbd2..a4433d0b 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc @@ -33,9 +33,9 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel { ncclDataType_t dtype = platform::ToNCCLDataType(x->type()); int rid = ctx.Attr("ring_id"); - auto comm = platform::NCCLCommContext::Instance().Get(rid); - auto place = ctx.GetPlace(); + auto comm = platform::NCCLCommContext::Instance().Get(rid, place); + cudaStream_t stream = nullptr; if (ctx.Attr("use_calc_stream")) { auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); @@ -46,7 +46,7 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel { int root = ctx.Attr("root"); if (root == comm->rank()) { - PADDLE_ENFORCE(platform::dynload::ncclBcast( + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast( reinterpret_cast(const_cast(x->data())), numel, dtype, root, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " invoke Bcast. sent " @@ -59,9 +59,9 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel { static_cast(out)); } } else { - PADDLE_ENFORCE(platform::dynload::ncclBcast(out->mutable_data(place), - numel, dtype, root, - comm->comm(), stream)); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::ncclBcast(out->mutable_data(place), numel, + dtype, root, comm->comm(), stream)); VLOG(3) << "rank " << comm->rank() << " invoke Bcast. recieved " << framework::product(out->dims()); } diff --git a/paddle/fluid/operators/collective/c_comm_init_all_op.cc b/paddle/fluid/operators/collective/c_comm_init_all_op.cc new file mode 100644 index 00000000..758affbd --- /dev/null +++ b/paddle/fluid/operators/collective/c_comm_init_all_op.cc @@ -0,0 +1,93 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#include <nccl.h>
+#endif
+#include <stdint.h>
+#include <ostream>
+#include <string>
+
+#include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_info.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/threadpool.h"
+#include "paddle/fluid/operators/distributed/distributed.h"
+#include "paddle/fluid/operators/distributed/request_handler_impl.h"
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+class CCommInitAllInferShape : public framework::InferShapeBase {
+ public:
+  ~CCommInitAllInferShape() {}
+  void operator()(framework::InferShapeContext* ctx) const override {}
+};
+
+class CCommInitAllOp : public framework::OperatorBase {
+ public:
+  CCommInitAllOp(const std::string& type,
+                 const framework::VariableNameMap& inputs,
+                 const framework::VariableNameMap& outputs,
+                 const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& place) const override {
+    PADDLE_ENFORCE_EQ(is_gpu_place(place), true,
+                      "CCommInitAllOp can run on gpu place only.");
+
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+    std::vector<int> devices = Attr<std::vector<int>>("devices");
+    if (devices.empty()) {
+      devices = platform::GetSelectedDevices();
+    }
+
+    int rid = Attr<int>("ring_id");
+
+    platform::NCCLCommContext::Instance().CreateAllNCCLComms(devices, rid);
+#else
+    PADDLE_THROW("PaddlePaddle should compile with GPU.");
+#endif
+  }
+};
+
+class CCommInitAllOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddComment(R"DOC(
+CCommInitAll operator
+
+Initialize all collective communication contexts.
+)DOC");
+    AddAttr<std::vector<int>>(
+        "devices",
+        "(std::vector<int>) the devices on which the nccl comms are "
+        "initialized")
+        .SetDefault({});
+    AddAttr<int>("ring_id", "(int default 0) user specified ring id")
+        .SetDefault(0);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(c_comm_init_all, ops::CCommInitAllOp,
+                  ops::CCommInitAllInferShape, ops::CCommInitAllOpMaker);
diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc
index 7244aa94..da92b65a 100644
--- a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc
@@ -31,10 +31,10 @@ class CReduceScatterOpCUDAKernel : public
framework::OpKernel { stream = comm->stream(); } - PADDLE_ENFORCE(platform::dynload::ncclReduceScatter( + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduceScatter( send_buff, recv_buff, recv_numel, static_cast(dtype), ncclSum, comm->comm(), stream)); #else diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc index 51703561..320c8507 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc @@ -38,12 +38,13 @@ class CSyncCommStreamOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const platform::Place& place) const override { - PADDLE_ENFORCE(is_gpu_place(place), - "Sync stream op can run on gpu place only for now."); + PADDLE_ENFORCE_EQ(is_gpu_place(place), true, + "Sync stream op can run on gpu place only for now."); #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) int ring_id = Attr("ring_id"); - auto stream = platform::NCCLCommContext::Instance().Get(ring_id)->stream(); + auto stream = + platform::NCCLCommContext::Instance().Get(ring_id, place)->stream(); cudaError_t e_sync = cudaStreamSynchronize(stream); if (e_sync != 0) { LOG(FATAL) << "Fail to sync nccl stream: " << cudaGetErrorString(e_sync); diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index 7f249924..e52d2808 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -31,7 +31,7 @@ class ConcatOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL, - "Inputs(X) of ConcatOp should be empty."); + "Inputs(X) of ConcatOp should not be empty."); PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of ConcatOp should not be null."); diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt index f7281a2d..758f0a65 100644 --- a/paddle/fluid/operators/controlflow/CMakeLists.txt +++ b/paddle/fluid/operators/controlflow/CMakeLists.txt @@ -1,7 +1,10 @@ include(operators) register_operators(DEPS naive_executor) cc_library(op_variant SRCS op_variant.cc DEPS operator proto_desc) +cc_library(conditional_block_op_helper SRCS conditional_block_op_helper.cc DEPS operator op_variant conditional_block_op) cc_library(recurrent_op_helper SRCS recurrent_op_helper.cc DEPS operator op_variant recurrent_op) cc_library(while_op_helper SRCS while_op_helper.cc DEPS operator op_variant) +target_link_libraries(conditional_block_infer_op conditional_block_op) + file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n") diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.cc b/paddle/fluid/operators/controlflow/conditional_block_op.cc index 8358ef75..260b5672 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.cc +++ b/paddle/fluid/operators/controlflow/conditional_block_op.cc @@ -17,6 +17,12 @@ limitations under the License. 
*/ namespace paddle { namespace operators { +const char ConditionalOp::kInputs[] = "Input"; +const char ConditionalOp::kOutputs[] = "Out"; +const char ConditionalOp::kCondition[] = "Cond"; +const char ConditionalOp::kScope[] = "Scope"; +const char ConditionalOp::kSkipEagerDeletionVars[] = "skip_eager_deletion_vars"; + class ConditionalBlockOp : public ConditionalOp { public: ConditionalBlockOp(const std::string &type, @@ -33,20 +39,20 @@ class ConditionalBlockOp : public ConditionalOp { // When is_scalar_condition is True, the conditional variable is a scalar, // whether need to execute the operators in sub-block depends on the // conditional variable (Cond). - auto xs = InputTensors(scope, "Cond"); + auto xs = InputTensors(scope, ConditionalOp::kCondition); need_run = ScalarCondition(xs); } else { // When is_scalar_condition is False, the conditional variable maybe a // vector or tensor, whether need to execute the operators in sub-block // depends on the input variables (Input). - auto xs = InputTensors(scope, "Input"); + auto xs = InputTensors(scope, ConditionalOp::kInputs); need_run = std::all_of( xs.begin(), xs.end(), [](const framework::LoDTensor *t) { return t->numel() != 0; }); } if (need_run) { - auto *scope_var = scope.FindVar(Output("Scope")); + auto *scope_var = scope.FindVar(Output(ConditionalOp::kScope)); PADDLE_ENFORCE(scope_var != nullptr, "Must set scope"); auto *scopes = scope_var->GetMutable>(); scopes->resize(1); @@ -55,7 +61,10 @@ class ConditionalBlockOp : public ConditionalOp { framework::Executor exec(dev_place); auto *block = Attr("sub_block"); - exec.Run(*block->Program(), &cur_scope, block->ID(), false); + auto &skip_vars = + Attr>(ConditionalOp::kSkipEagerDeletionVars); + exec.Run(*block->Program(), &cur_scope, block->ID(), false, true, + skip_vars); } } }; @@ -73,17 +82,17 @@ class ConditionalBlockGradOp : public ConditionalOp { const platform::Place &dev_place) const override { bool need_run; if (Attr("is_scalar_condition")) { - auto xs = this->InputTensors(scope, "Cond"); + auto xs = this->InputTensors(scope, ConditionalOp::kCondition); need_run = ScalarCondition(xs); } else { - auto xs = this->InputTensors(scope, "Input"); + auto xs = this->InputTensors(scope, ConditionalOp::kInputs); need_run = std::all_of( xs.begin(), xs.end(), [](const framework::LoDTensor *t) { return t->numel() != 0; }); } if (need_run) { - auto *scope_var = scope.FindVar(Input("Scope")); + auto *scope_var = scope.FindVar(Input(ConditionalOp::kScope)); PADDLE_ENFORCE(scope_var != nullptr, "Must set scope"); auto &scopes = scope_var->Get>(); framework::Scope &cur_scope = *scopes[0]; @@ -91,10 +100,12 @@ class ConditionalBlockGradOp : public ConditionalOp { framework::Executor exec(dev_place); auto *block = Attr("sub_block"); - const auto &ins = Inputs("Input"); - const auto &d_ins = Outputs(framework::GradVarName("Input")); - const auto &conds = Inputs("Cond"); - const auto &d_conds = Outputs(framework::GradVarName("Cond")); + const auto &ins = Inputs(ConditionalOp::kInputs); + const auto &d_ins = + Outputs(framework::GradVarName(ConditionalOp::kInputs)); + const auto &conds = Inputs(ConditionalOp::kCondition); + const auto &d_conds = + Outputs(framework::GradVarName(ConditionalOp::kCondition)); std::vector ins_conds_grads; ins_conds_grads.reserve(ins.size() + conds.size()); @@ -142,15 +153,17 @@ class ConditionalBlockGradOp : public ConditionalOp { class ConditionalBlockGradInferShape : public framework::InferShapeBase { public: void operator()(framework::InferShapeContext *context) 
const override { - PADDLE_ENFORCE(context->HasInputs("Cond")); - if (context->HasInputs("Input")) { - PADDLE_ENFORCE(context->HasOutputs(framework::GradVarName("Input"))); - context->SetOutputsDim(framework::GradVarName("Input"), - context->GetInputsDim("Input")); + PADDLE_ENFORCE(context->HasInputs(ConditionalOp::kCondition)); + if (context->HasInputs(ConditionalOp::kInputs)) { + PADDLE_ENFORCE( + context->HasOutputs(framework::GradVarName(ConditionalOp::kInputs))); + context->SetOutputsDim(framework::GradVarName(ConditionalOp::kInputs), + context->GetInputsDim(ConditionalOp::kInputs)); } - if (context->HasOutputs(framework::GradVarName("Cond"))) { - context->SetOutputsDim(framework::GradVarName("Cond"), - context->GetInputsDim("Cond")); + if (context->HasOutputs( + framework::GradVarName(ConditionalOp::kCondition))) { + context->SetOutputsDim(framework::GradVarName(ConditionalOp::kCondition), + context->GetInputsDim(ConditionalOp::kCondition)); } } }; @@ -163,15 +176,17 @@ class ConditionalBlockGradMaker : public framework::SingleGradOpDescMaker { std::unique_ptr Apply() const override { auto grad_op = new framework::OpDesc(); grad_op->SetType("conditional_block_grad"); - grad_op->SetInput("Cond", Input("Cond")); - grad_op->SetInput("Input", Input("Input")); - grad_op->SetInput("Out", Output("Out")); - grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); - grad_op->SetInput("Scope", Output("Scope")); - grad_op->SetOutput(framework::GradVarName("Cond"), - InputGrad("Cond", false)); - grad_op->SetOutput(framework::GradVarName("Input"), - InputGrad("Input", false)); + grad_op->SetInput(ConditionalOp::kCondition, + Input(ConditionalOp::kCondition)); + grad_op->SetInput(ConditionalOp::kInputs, Input(ConditionalOp::kInputs)); + grad_op->SetInput(ConditionalOp::kOutputs, Output(ConditionalOp::kOutputs)); + grad_op->SetInput(framework::GradVarName(ConditionalOp::kOutputs), + OutputGrad(ConditionalOp::kOutputs)); + grad_op->SetInput(ConditionalOp::kScope, Output(ConditionalOp::kScope)); + grad_op->SetOutput(framework::GradVarName(ConditionalOp::kCondition), + InputGrad(ConditionalOp::kCondition, false)); + grad_op->SetOutput(framework::GradVarName(ConditionalOp::kInputs), + InputGrad(ConditionalOp::kInputs, false)); grad_op->SetBlockAttr("sub_block", this->grad_block_[0]); grad_op->SetAttr("is_scalar_condition", GetAttr("is_scalar_condition")); return std::unique_ptr(grad_op); diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.h b/paddle/fluid/operators/controlflow/conditional_block_op.h index 9a079c84..9d65c33c 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.h +++ b/paddle/fluid/operators/controlflow/conditional_block_op.h @@ -33,6 +33,12 @@ class ConditionalOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} + static const char kInputs[]; + static const char kOutputs[]; + static const char kCondition[]; + static const char kScope[]; + static const char kSkipEagerDeletionVars[]; + protected: std::vector InputTensors( const framework::Scope &scope, const std::string &in_name) const { @@ -78,13 +84,15 @@ class ConditionalOp : public framework::OperatorBase { class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("Cond", + AddInput(ConditionalOp::kCondition, "The conditional variable of this operator. 
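ConditionalBlockGradMaker above is the standard grad-desc pattern: each forward slot maps to a like-named gradient slot through framework::GradVarName. For reference, the convention in this tree is a fixed suffix; a toy equivalent:

#include <string>

// Toy equivalent of the @GRAD suffix convention applied by GradVarName:
// "Input" -> "Input@GRAD", "Cond" -> "Cond@GRAD", and so on.
inline std::string ToyGradVarName(const std::string& var_name) {
  return var_name + "@GRAD";
}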
If Cond is empty, the " "whole sub-block will not be executed.") .AsDuplicable(); - AddInput("Input", "The input variables of the sub-block.").AsDuplicable(); - AddOutput("Out", "The output variables of the sub-block.").AsDuplicable(); - AddOutput("Scope", + AddInput(ConditionalOp::kInputs, "The input variables of the sub-block.") + .AsDuplicable(); + AddOutput(ConditionalOp::kOutputs, "The output variables of the sub-block.") + .AsDuplicable(); + AddOutput(ConditionalOp::kScope, "(std::vector) The step scope of conditional block. To " "unify the conditional block, rnn and while op, the type of " "scope is std::vector"); @@ -94,6 +102,10 @@ class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker { "The conditional variable (Cond) is used as scalar " "condition.") .SetDefault(false); + AddAttr>(ConditionalOp::kSkipEagerDeletionVars, + "Vars that would not be deleted when " + "garbage collection strategy enables") + .SetDefault(std::vector()); AddComment(R"DOC(Conditional block operator If `is_scalar_condition` is True, the conditional variable (Cond) is a scalar, diff --git a/paddle/fluid/operators/controlflow/conditional_block_op_helper.cc b/paddle/fluid/operators/controlflow/conditional_block_op_helper.cc new file mode 100644 index 00000000..13a00c85 --- /dev/null +++ b/paddle/fluid/operators/controlflow/conditional_block_op_helper.cc @@ -0,0 +1,167 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
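The new helper below pairs every conditional_block_grad op with its forward conditional_block (they share the same Scope variable) and records the variables the grad block still needs in the forward op's skip_eager_deletion_vars attribute. A hypothetical call site, using the signatures declared in the header further below (the real caller lives in the executor setup, not in this file):

#include <memory>
#include <vector>
#include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h"

// Hypothetical wrapper: run once for block 0 before constructing the garbage
// collector, so forward ops keep alive what their grad ops will read.
void PrepareBlockForGC(
    const paddle::framework::ProgramDesc& program, int block_id,
    const std::vector<std::unique_ptr<paddle::framework::OperatorBase>>& ops) {
  paddle::operators::
      PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOp(
          program, block_id, ops);
}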
+ +#include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h" +#include +#include +#include +#include +#include "paddle/fluid/operators/controlflow/op_variant.h" + +namespace paddle { +namespace operators { + +static bool IsMatchedConditionalBlockOpAndConditionalBlockGradOp( + const OpVariant &fwd_op, const OpVariant &bwd_op) { + return fwd_op.Outputs().at(ConditionalOp::kScope) == + bwd_op.Inputs().at(ConditionalOp::kScope); +} + +static void FindAllConditionalBlockAndConditionalBlockGradOp( + const framework::ProgramDesc &program, std::vector *fwd_ops, + std::vector *bwd_ops) { + PADDLE_ENFORCE_GE(fwd_ops->size(), bwd_ops->size()); + + for (size_t i = 1; i < program.Size(); ++i) { + auto &block = program.Block(i); + for (size_t j = 0; j < block.OpSize(); ++j) { + auto *op = block.Op(j); + if (op->Type() == "conditional_block") { + fwd_ops->emplace_back(op); + } else if (op->Type() == "conditional_block_grad") { + bwd_ops->emplace_back(op); + } + } + } + + PADDLE_ENFORCE_GE( + fwd_ops->size(), bwd_ops->size(), + "There are extra conditional_block_grad ops in the graph or program"); +} + +static void SetSkipVarsForConditionalBlockOp(OpVariant *fwd_op, + OpVariant *bwd_op) { + auto *grad_block = bwd_op->Attr("sub_block"); + auto is_skippable_in_fwd = [grad_block](const std::string &var_name) { + return var_name != framework::kEmptyVarName && + !grad_block->HasVar(var_name); + }; + + std::unordered_set forward_skip_vars; + for (auto *op_desc : grad_block->AllOps()) { + for (auto &in_arg_name : op_desc->InputArgumentNames()) { + if (is_skippable_in_fwd(in_arg_name)) { + forward_skip_vars.insert(in_arg_name); + } + } + + for (auto &out_arg_name : op_desc->OutputArgumentNames()) { + if (is_skippable_in_fwd(out_arg_name)) { + forward_skip_vars.insert(out_arg_name); + } + } + } + + auto &fwd_attrs = const_cast(fwd_op->Attrs()); + std::vector skip_vars_vec(forward_skip_vars.begin(), + forward_skip_vars.end()); + VLOG(2) << "Prepare to skip " << skip_vars_vec.size() + << " var(s): " << string::join_strings(skip_vars_vec, ' '); + fwd_attrs[ConditionalOp::kSkipEagerDeletionVars] = std::move(skip_vars_vec); +} + +static void PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOpImpl( + const framework::ProgramDesc &program, std::vector *ifelse_ops, + std::vector *ifelse_grad_ops) { + FindAllConditionalBlockAndConditionalBlockGradOp(program, ifelse_ops, + ifelse_grad_ops); + + VLOG(2) << "Found conditional_block op num: " << ifelse_ops->size() + << ", conditional_block_grad op num: " << ifelse_grad_ops->size(); + + if (ifelse_grad_ops->empty()) { + return; + } + + std::unordered_set ifelse_op_set( + ifelse_ops->begin(), ifelse_ops->end()); + + for (auto &bwd_op : *ifelse_grad_ops) { + const OpVariant *matched_fwd_op = nullptr; + for (auto &fwd_op : ifelse_op_set) { + if (IsMatchedConditionalBlockOpAndConditionalBlockGradOp(fwd_op, + bwd_op)) { + PADDLE_ENFORCE(matched_fwd_op == nullptr, + "Found multiple matched conditional_block ops"); + matched_fwd_op = &fwd_op; + } + } + + PADDLE_ENFORCE_NOT_NULL(matched_fwd_op, + "Cannot find matched forward conditional_block op"); + + SetSkipVarsForConditionalBlockOp(const_cast(matched_fwd_op), + &bwd_op); + ifelse_op_set.erase(*matched_fwd_op); + } +} + +void PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOp( + const framework::ProgramDesc &program, int block_id, + const std::vector> &all_ops) { + // If block_id is not 0, returns + // This is because all conditional_block_ops and conditional_block_grad_ops + // in the whole program 
would be processed when block_id is 0 (i.e. + // when Executor::Run() or ParallelExecutor constructs). + + // What's more, all conditional_block_ops and conditional_block_grad_ops + // must be processed when block_id is zero. If not, conditional_block_op + // may run first and erase variables used in conditional_block_grad_op, + // and in this moment, conditional_block_grad_ops may be not constructed yet. + if (block_id != 0) return; + + std::vector fwd_ops, bwd_ops; + for (auto &op : all_ops) { + if (op->Type() == "conditional_block") { + fwd_ops.emplace_back(op.get()); + } else if (op->Type() == "conditional_block_grad") { + bwd_ops.emplace_back(op.get()); + } + } + + PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOpImpl( + program, &fwd_ops, &bwd_ops); +} + +void PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOp( + const framework::ProgramDesc &program, + const std::vector &ifelse_ops, + const std::vector &ifelse_grad_ops) { + std::vector fwd_ops, bwd_ops; + fwd_ops.reserve(ifelse_ops.size()); + for (auto *op : ifelse_ops) { + fwd_ops.emplace_back(op); + } + + bwd_ops.reserve(ifelse_grad_ops.size()); + for (auto *op : ifelse_grad_ops) { + bwd_ops.emplace_back(op); + } + + PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOpImpl( + program, &fwd_ops, &bwd_ops); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/controlflow/conditional_block_op_helper.h b/paddle/fluid/operators/controlflow/conditional_block_op_helper.h new file mode 100644 index 00000000..f7dfba6f --- /dev/null +++ b/paddle/fluid/operators/controlflow/conditional_block_op_helper.h @@ -0,0 +1,35 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/controlflow/conditional_block_op.h" + +namespace paddle { +namespace operators { + +void PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOp( + const framework::ProgramDesc &program, int block_id, + const std::vector> &all_ops); + +void PrepareSafeEagerDeletionOnConditionalOpAndConditionalGradOp( + const framework::ProgramDesc &program, + const std::vector &ifelse_ops, + const std::vector &ifelse_grad_ops); + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/controlflow/fetch_op.cc b/paddle/fluid/operators/controlflow/fetch_op.cc index 85d36c5c..39fdf07f 100644 --- a/paddle/fluid/operators/controlflow/fetch_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_context.h" @@ -55,7 +56,16 @@ class FetchOp : public framework::OperatorBase { // FIXME(yuyang18): Should we assume the fetch operator always generate // CPU outputs? if (src_item.IsInitialized() && src_item.numel() > 0) { - TensorCopySync(src_item, platform::CPUPlace(), &dst_item); + // Conversion from MKL-DNN to Paddle + if (src_item.layout() == framework::DataLayout::kMKLDNN) { + framework::Tensor out; + framework::innerTransDataLayoutFromMKLDNN( + src_item.layout(), framework::DataLayout::kNCHW, src_item, &out, + platform::CPUPlace()); + TensorCopySync(out, platform::CPUPlace(), &dst_item); + } else { + TensorCopySync(src_item, platform::CPUPlace(), &dst_item); + } } else { // Not copy, if the src tensor is empty. dst_item.clear(); diff --git a/paddle/fluid/operators/controlflow/recurrent_op_helper.cc b/paddle/fluid/operators/controlflow/recurrent_op_helper.cc index 69250866..d2bb6827 100644 --- a/paddle/fluid/operators/controlflow/recurrent_op_helper.cc +++ b/paddle/fluid/operators/controlflow/recurrent_op_helper.cc @@ -65,7 +65,8 @@ static void AddSkipVars(const OpVariant &op, const Container &skip_vars) { // Find all ops and grad ops with given type name. The ops and grad ops // may locate in different blocks so we should traverse all blocks in the // program and find them out -static void FindAllOpAndGradOp(OpAndGradOpPair *op_and_grad_op, +static void FindAllOpAndGradOp(const framework::ProgramDesc &program, + OpAndGradOpPair *op_and_grad_op, const std::string &type_name, const std::string &backward_type_name) { OpVariantSet &ops = op_and_grad_op->first; @@ -74,14 +75,8 @@ static void FindAllOpAndGradOp(OpAndGradOpPair *op_and_grad_op, PADDLE_ENFORCE_GE(ops.size(), grad_ops.size(), "There are extra grad ops in the graph or program"); - if (ops.empty()) return; - - const auto *program = - ops.begin() - ->Attr(RecurrentBase::kStepBlock) - ->Program(); - for (size_t i = 1; i < program->Size(); ++i) { - auto &block = program->Block(i); + for (size_t i = 1; i < program.Size(); ++i) { + auto &block = program.Block(i); for (size_t j = 0; j < block.OpSize(); ++j) { auto *op = block.Op(j); if (op->Type() == type_name) { @@ -201,7 +196,7 @@ static void SetRecurrentOpAndRecurrentGradOpSkipVarAttr( } void PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp( - int block_id, + const framework::ProgramDesc &program, int block_id, const std::vector> &all_ops) { // If block_id is not 0, returns @@ -224,13 +219,13 @@ void PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp( op_pair.second.emplace(op.get()); } } - PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(&op_pair); + PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(program, &op_pair); } void PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp( - OpAndGradOpPair *op_pair) { + const framework::ProgramDesc &program, OpAndGradOpPair *op_pair) { // Find all ops and grad ops at all blocks - FindAllOpAndGradOp(op_pair, "recurrent", "recurrent_grad"); + FindAllOpAndGradOp(program, op_pair, "recurrent", "recurrent_grad"); OpVariantSet &recurrent_ops = op_pair->first; OpVariantSet &recurrent_grad_ops = op_pair->second; diff --git a/paddle/fluid/operators/controlflow/recurrent_op_helper.h b/paddle/fluid/operators/controlflow/recurrent_op_helper.h index b1e6e662..aacca076 100644 --- 
a/paddle/fluid/operators/controlflow/recurrent_op_helper.h +++ b/paddle/fluid/operators/controlflow/recurrent_op_helper.h @@ -37,14 +37,14 @@ using OpAndGradOpPair = std::pair; // recurrent_grad ops at block 0 and the function will find all recurrent and // recurrent_grad ops across blocks. void PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp( - OpAndGradOpPair *op_pair); + const framework::ProgramDesc &program, OpAndGradOpPair *op_pair); // Set vars to skip eager deletion on input recurrent and recurrent_grad for // preparing safe eager deletion. The input block_id must be 0 and caller can // input all ops in the block. The function will find all recurrent and // recurrent_grad ops across blocks. void PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp( - int block_id, + const framework::ProgramDesc &program, int block_id, const std::vector> &all_ops); diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index b3219208..cd11e87c 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -62,7 +62,7 @@ class WhileOp : public framework::OperatorBase { auto step_scopes = scope.FindVar(Output(kStepScopes))->GetMutable(); - + PADDLE_ENFORCE_EQ(step_scopes->size(), 0, "The StepScope should be empty."); PADDLE_ENFORCE(platform::is_cpu_place(cond.place()), "Condition of while op must in CPU memory."); @@ -197,17 +197,22 @@ class WhileGradOp : public framework::OperatorBase { inside_tensor.set_lod(outside_tensor.lod()); inside_tensor.ShareDataWith(outside_tensor); } else if (og_outside.IsType()) { - auto &outside_array = og_outside.Get(); + auto outside_array = + og_outside.GetMutable(); auto &inside_array = detail::Ref(og_inside.GetMutable()); - VLOG(8) << outside_og_name << " size = " << outside_array.size(); - inside_array.resize(outside_array.size()); + inside_array.clear(); + inside_array.resize(outside_array->size()); + VLOG(8) << outside_og_name << " size = " << outside_array->size(); for (size_t j = 0; j < inside_array.size(); ++j) { - VLOG(8) << j << " " << outside_array[j].numel(); - if (outside_array[j].numel() != 0) { - inside_array[j].set_lod(outside_array[j].lod()); - inside_array[j].ShareDataWith(outside_array[j]); + if (!outside_array->at(j).IsInitialized()) { + outside_array->at(j).Resize({0}); + } + VLOG(8) << j << " " << outside_array->at(j).numel(); + if (outside_array->at(j).numel() != 0) { + inside_array[j].set_lod(outside_array->at(j).lod()); + inside_array[j].ShareDataWith(outside_array->at(j)); } else { PADDLE_ENFORCE_EQ(inside_array[j].numel(), 0); } @@ -276,7 +281,7 @@ class WhileGradOp : public framework::OperatorBase { auto &inside_tensor = var->Get(); framework::AttributeMap attrs; attrs["dtype"] = inside_tensor.type(); - attrs["shape"] = framework::vectorize2int(inside_tensor.dims()); + attrs["shape"] = framework::vectorize(inside_tensor.dims()); attrs["value"] = 0.0f; auto var_name = pg_ig_names[param_id]; @@ -300,6 +305,7 @@ class WhileGradOp : public framework::OperatorBase { dev_ctx.Wait(); const_cast(scope).DeleteScope(&cur_scope); } + step_scopes->clear(); } }; diff --git a/paddle/fluid/operators/controlflow/while_op_helper.cc b/paddle/fluid/operators/controlflow/while_op_helper.cc index 009bc579..8f1e3f60 100644 --- a/paddle/fluid/operators/controlflow/while_op_helper.cc +++ b/paddle/fluid/operators/controlflow/while_op_helper.cc @@ -100,16 +100,12 @@ static void ModifyWhileOpAndWhileGradOpAttr(const OpVariant &fwd_op, // Find all 
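The attrs built in the WhileGradOp hunk above (dtype, shape, value) feed a fill_constant op synthesized on the fly: parameters that receive no gradient inside the loop body get zero-filled rather than left dangling. A condensed sketch of that step, adapted from the surrounding code in this tree (a fragment, not a standalone unit):

// Zero-fill a parameter gradient the loop body never produced: synthesize a
// fill_constant op with the parameter's dtype/shape and run it in place.
framework::AttributeMap attrs;
attrs["dtype"] = inside_tensor.type();
attrs["shape"] = framework::vectorize(inside_tensor.dims());
attrs["value"] = 0.0f;
auto zero_op = framework::OpRegistry::CreateOp(
    "fill_constant", framework::VariableNameMap{},
    {{"Out", {var_name}}}, attrs);
zero_op->Run(scope, dev_place);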
while_ops and while_grad_ops in the graph or program // The while_grad_op and while_op may located in different blocks // So we should traverse all blocks in the program and find them out. -static void FindAllWhileAndWhileGradOp(std::vector *while_ops, +static void FindAllWhileAndWhileGradOp(const framework::ProgramDesc &program, + std::vector *while_ops, std::vector *while_grad_ops) { PADDLE_ENFORCE_GE(while_ops->size(), while_grad_ops->size()); - - if (while_ops->empty()) return; - - const auto *program = - while_ops->front().Attr(kStepBlock)->Program(); - for (size_t i = 1; i < program->Size(); ++i) { - auto &block = program->Block(i); + for (size_t i = 1; i < program.Size(); ++i) { + auto &block = program.Block(i); for (size_t j = 0; j < block.OpSize(); ++j) { auto *op = block.Op(j); if (op->Type() == "while") { @@ -125,8 +121,9 @@ static void FindAllWhileAndWhileGradOp(std::vector *while_ops, } static void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl( - std::vector *while_ops, std::vector *while_grad_ops) { - FindAllWhileAndWhileGradOp(while_ops, while_grad_ops); + const framework::ProgramDesc &program, std::vector *while_ops, + std::vector *while_grad_ops) { + FindAllWhileAndWhileGradOp(program, while_ops, while_grad_ops); VLOG(2) << "Found while op num: " << while_ops->size() << ", while grad op num: " << while_grad_ops->size(); @@ -155,7 +152,7 @@ static void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl( } void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( - int block_id, + const framework::ProgramDesc &program, int block_id, const std::vector> &all_ops) { // If block_id is not 0, returns // This is because all while_ops and while_grad_ops in the whole program @@ -176,10 +173,12 @@ void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( bwd_ops.emplace_back(op.get()); } } - PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(&fwd_ops, &bwd_ops); + PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(program, &fwd_ops, + &bwd_ops); } void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( + const framework::ProgramDesc &program, const std::vector &while_ops, const std::vector &while_grad_ops) { std::vector fwd_ops, bwd_ops; @@ -193,7 +192,8 @@ void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( bwd_ops.emplace_back(op); } - PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(&fwd_ops, &bwd_ops); + PrepareSafeEagerDeletionOnWhileOpAndWhileGradOpImpl(program, &fwd_ops, + &bwd_ops); } } // namespace operators diff --git a/paddle/fluid/operators/controlflow/while_op_helper.h b/paddle/fluid/operators/controlflow/while_op_helper.h index 456ba864..e2cfece6 100644 --- a/paddle/fluid/operators/controlflow/while_op_helper.h +++ b/paddle/fluid/operators/controlflow/while_op_helper.h @@ -32,10 +32,11 @@ static constexpr char kOutputs[] = "Out"; static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars"; void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( - int block_id, + const framework::ProgramDesc &program, int block_id, const std::vector> &all_ops); void PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( + const framework::ProgramDesc &program, const std::vector &while_ops, const std::vector &while_grad_ops); diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index 4a5cd326..5f520424 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -22,6 +22,14 @@ limitations under the License. 
*/ namespace paddle { namespace operators { +template +std::ostream& operator<<(std::ostream& out, const std::vector& v) { + out << "["; + for (auto const& tmp : v) out << tmp << ","; + out << "]"; + return out; +} + using framework::AlgorithmsCache; struct ConvArgs { @@ -119,6 +127,11 @@ struct SearchAlgorithm { auto x_dims = framework::vectorize(args.x->dims()); auto w_dims = framework::vectorize(args.w->dims()); + VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t algo_cache_id:" + << algo_cache_id << ", x_dims:" << x_dims + << ", w_dims:" << w_dims << ", args.s" << args.s << ", args.p" + << args.p << ", args.d" << args.d; + algo = algo_cache.GetAlgorithm( x_dims, w_dims, args.s, args.p, args.d, 0, [&]() { int returned_algo_count; @@ -247,6 +260,11 @@ struct SearchAlgorithm { auto x_dims = framework::vectorize(args.x->dims()); auto w_dims = framework::vectorize(args.w->dims()); + VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t algo_cache_id:" + << algo_cache_id << ", x_dims:" << x_dims + << ", w_dims:" << w_dims << ", args.s" << args.s << ", args.p" + << args.p << ", args.d" << args.d; + algo = algo_cache.GetAlgorithm( x_dims, w_dims, args.s, args.p, args.d, 0, [&]() { int returned_algo_count; @@ -368,6 +386,11 @@ struct SearchAlgorithm { auto x_dims = framework::vectorize(args.x->dims()); auto w_dims = framework::vectorize(args.w->dims()); + VLOG(10) << "cudnnConvolutionFwdAlgoPerf_t algo_cache_id:" + << algo_cache_id << ", x_dims:" << x_dims + << ", w_dims:" << w_dims << ", args.s" << args.s << ", args.p" + << args.p << ", args.d" << args.d; + algo = algo_cache.GetAlgorithm( x_dims, w_dims, args.s, args.p, args.d, 0, [&]() { int returned_algo_count; diff --git a/paddle/fluid/operators/conv_cudnn_op.cu.cc b/paddle/fluid/operators/conv_cudnn_op.cu.cc index ec0278e5..6629a203 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_cudnn_op.cu.cc @@ -18,22 +18,14 @@ limitations under the License. */ #include "paddle/fluid/operators/conv_cudnn_helper.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/conv_op.h" -#include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/cudnn_workspace_helper.h" #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/profiler.h" -DEFINE_bool(cudnn_deterministic, false, - "Whether allow using an autotuning algorithm for convolution " - "operator. The autotuning algorithm may be non-deterministic. 
If " - "true, the algorithm is deterministic."); -DEFINE_uint64(conv_workspace_size_limit, - paddle::platform::kDefaultConvWorkspaceSizeLimitMB, - "cuDNN convolution workspace limit in MB unit."); -DEFINE_bool(cudnn_exhaustive_search, false, - "Whether enable exhaustive search for cuDNN convolution or " - "not, default is False."); +DECLARE_bool(cudnn_deterministic); +DECLARE_uint64(conv_workspace_size_limit); +DECLARE_bool(cudnn_exhaustive_search); namespace paddle { namespace operators { @@ -517,4 +509,10 @@ REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace, paddle::operators::CUDNNConvOpKernel); REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace, paddle::operators::CUDNNConvGradOpKernel, - paddle::operators::CUDNNConvGradOpKernel); + paddle::operators::CUDNNConvGradOpKernel, + paddle::operators::CUDNNConvGradOpKernel); +REGISTER_OP_KERNEL( + conv3d_grad_grad, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNConvDoubleGradOpKernel, + paddle::operators::CUDNNConvDoubleGradOpKernel, + paddle::operators::CUDNNConvDoubleGradOpKernel); diff --git a/paddle/fluid/operators/conv_cudnn_op_cache.h b/paddle/fluid/operators/conv_cudnn_op_cache.h index 1158dc2d..de883580 100644 --- a/paddle/fluid/operators/conv_cudnn_op_cache.h +++ b/paddle/fluid/operators/conv_cudnn_op_cache.h @@ -27,10 +27,6 @@ DECLARE_int64(cudnn_exhaustive_search_times); namespace paddle { namespace operators { -static constexpr char kCUDNNFwdAlgoCache[] = "kCUDNNFwdAlgoCache"; -static constexpr char kCUDNNBwdDataAlgoCache[] = "kCUDNNBwdDataAlgoCache"; -static constexpr char kCUDNNBwdFilterAlgoCache[] = "kCUDNNBwdFilterAlgoCache"; - #if CUDNN_VERSION_MIN(6, 0, 5) static constexpr size_t kNUM_CUDNN_FWD_ALGS = CUDNN_CONVOLUTION_FWD_ALGO_COUNT; static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc index d1fa7b9d..566daa66 100644 --- a/paddle/fluid/operators/conv_fusion_op.cu.cc +++ b/paddle/fluid/operators/conv_fusion_op.cu.cc @@ -16,9 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/platform/cudnn_helper.h" -DEFINE_int64(cudnn_exhaustive_search_times, -1, - "Exhaustive search times for cuDNN convolution, " - "default is -1, not exhaustive search"); +DECLARE_int64(cudnn_exhaustive_search_times); namespace paddle { namespace operators { @@ -81,11 +79,11 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { cudnn_conv_desc, groups)); cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, framework::vectorize2int(input->dims())); + layout, framework::vectorize(input->dims())); cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, framework::vectorize2int(output->dims())); + layout, framework::vectorize(output->dims())); cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor( - layout, framework::vectorize2int(filter->dims())); + layout, framework::vectorize(filter->dims())); // Now only support NCHW std::vector bias_dim = {1, static_cast(output->dims()[1]), 1, 1}; cudnnTensorDescriptor_t cudnn_bias_desc = diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index d2036c61..1cfdf7da 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -215,6 +215,14 @@ void Conv2DOpMaker::Make() { AddAttr("fuse_brelu_threshold", "(float, default false 6.0) Only used in mkldnn kernel") .SetDefault(6.0f); + AddAttr("fuse_activation", + "(string, default \"\") Only used in mkldnn kernel") + .SetDefault(""); + AddAttr("fuse_alpha", + "(float, default 0.0) Only used in mkldnn kernel") + .SetDefault(0.0f); + AddAttr("fuse_beta", "(float, default 0.0) Only used in mkldnn kernel") + .SetDefault(0.0f); AddAttr("fuse_residual_connection", "(bool, default false) Only used in mkldnn kernel. Used " "whenever convolution output is as an input to residual " @@ -352,6 +360,14 @@ void Conv3DOpMaker::Make() { .SetDefault(false); AddAttr("fuse_relu", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); + AddAttr("fuse_activation", + "(string, default \"\") Only used in mkldnn kernel") + .SetDefault(""); + AddAttr("fuse_alpha", + "(float, default 0.0) Only used in mkldnn kernel") + .SetDefault(0.0f); + AddAttr("fuse_beta", "(float, default 0.0) Only used in mkldnn kernel") + .SetDefault(0.0f); AddAttr("fuse_residual_connection", "(bool, default false) Only used in mkldnn kernel. Used " "whenever convolution output is as an input to residual " @@ -549,6 +565,40 @@ class Conv2DDoubleGradMaker : public framework::SingleGradOpDescMaker { } }; +/* + * Inputs: I, W, dO, ddI, ddW + * Outputs: ddO, dW, dI + */ +class Conv3DDoubleGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + std::unique_ptr Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType(this->ForwardOpType() + "_grad"); + // I, W, dO, ddI, ddW + op->SetInput("Input", Input("Input")); + op->SetInput("Filter", Input("Filter")); + op->SetInput("DOutput", Input(framework::GradVarName("Output"))); + op->SetInput("DDInput", OutputGrad(framework::GradVarName("Input"))); + op->SetInput("DDFilter", OutputGrad(framework::GradVarName("Filter"))); + + auto ddx = OutputGrad(framework::GradVarName("Input")); + auto ddw = OutputGrad(framework::GradVarName("Filter")); + std::vector empty_str = {}; + + op->SetOutput( + "DDOutput", + ddx.empty() ? 
empty_str : InputGrad(framework::GradVarName("Output"))); + op->SetOutput("DFilter", ddx.empty() ? empty_str : InputGrad("Filter")); + op->SetOutput("DInput", ddw.empty() ? empty_str : InputGrad("Input")); + + op->SetAttrMap(Attrs()); + + return std::unique_ptr(op); + } +}; + void ConvOpDoubleGrad::InferShape(framework::InferShapeContext* ctx) const { auto x_dims = ctx->GetInputDim("Input"); auto w_dims = ctx->GetInputDim("Filter"); @@ -576,8 +626,14 @@ framework::OpKernelType ConvOpDoubleGrad::GetExpectedKernelType( #ifdef PADDLE_WITH_CUDA if (platform::CanCUDNNBeUsed(ctx)) { library_ = framework::LibraryType::kCUDNN; - } else { - PADDLE_THROW("Now ConvDoubleGrad only supports cuDNN."); + } +#endif +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; + layout_ = framework::DataLayout::kMKLDNN; + customized_type_value = kConvMKLDNNFP32; } #endif auto type = framework::OpKernelType(ctx.Input("Input")->type(), @@ -621,7 +677,8 @@ REGISTER_OPERATOR(depthwise_conv2d_grad, ops::ConvOpGrad); REGISTER_OPERATOR(conv3d, ops::ConvOp, ops::Conv3DOpMaker, ops::ConvOpInferVarType, ops::Conv3DGradMaker); -REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad); +REGISTER_OPERATOR(conv3d_grad, ops::ConvOpGrad, ops::Conv3DDoubleGradMaker); +REGISTER_OPERATOR(conv3d_grad_grad, ops::ConvOpDoubleGrad); // depthwise conv kernel // TODO(xingzhaolong): neon kernel for mobile @@ -642,6 +699,10 @@ REGISTER_OP_CPU_KERNEL( conv2d_grad, ops::GemmConvGradKernel, ops::GemmConvGradKernel); +REGISTER_OP_CPU_KERNEL( + conv2d_grad_grad, + ops::GemmConvDoubleGradKernel, + ops::GemmConvDoubleGradKernel); REGISTER_OP_CPU_KERNEL( conv3d, ops::GemmConvKernel, @@ -650,3 +711,7 @@ REGISTER_OP_CPU_KERNEL( conv3d_grad, ops::GemmConvGradKernel, ops::GemmConvGradKernel); +REGISTER_OP_CPU_KERNEL( + conv3d_grad_grad, + ops::GemmConvDoubleGradKernel, + ops::GemmConvDoubleGradKernel); diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index 4df47ef2..aa621529 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -19,6 +19,7 @@ limitations under the License. 
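For orientation, the second-order kernels registered above compute three quantities; the comments inside GemmConvDoubleGradKernel below state them directly. In sketch form, with (*) denoting the conv-as-GEMM contraction over unfolded inputs:

  dX  <- ddW (*) dY              // gemm, then col2im / col2vol
  dW  <- ddX (*) dY              // im2col / vol2col, then gemm
  ddY <- W (*) ddX + ddW (*) X   // two gemm passes, accumulated

That is, the forward-mode derivative of Y = X (*) W plus the induced corrections to the two first-order gradients.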
*/ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/im2col.h" @@ -393,6 +394,218 @@ class GemmConvGradKernel : public framework::OpKernel { } }; +template +class GemmConvDoubleGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, + "It must use CPUPlace."); + const Tensor* X = ctx.Input("Input"); + const Tensor* dY = ctx.Input("DOutput"); + const Tensor* ddX = ctx.Input("DDInput"); + const Tensor* ddW_in = ctx.Input("DDFilter"); + + Tensor* ddY = ctx.Output("DDOutput"); + Tensor* dW = ctx.Output("DFilter"); + Tensor* dX = ctx.Output("DInput"); + Tensor W = detail::Ref(ctx.Input("Filter"), + "Cannot find input Filter(%s) in scope)", + ctx.Inputs("Filter")[0]); + + if (!ddY && !dW && !dX) return; + int groups = ctx.Attr("groups"); + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + + const int batch_size = static_cast(X->dims()[0]); + std::vector filter_shape_vec(framework::vectorize(W.dims())); + std::vector output_shape_vec(framework::vectorize(dY->dims())); + + size_t data_dim = filter_shape_vec.size() - 2; + std::vector col_shape_vec(1 + 2 * data_dim); + // col_shape [in_channel/group, kh, kw, oh, ow] + col_shape_vec[0] = X->dims()[1] / groups; + for (size_t j = 0; j < data_dim; ++j) { + col_shape_vec[j + 1] = filter_shape_vec[j + 2]; + col_shape_vec[j + data_dim + 1] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_shape_vec)); + // col_matrix_shape [in_channel/group * kh * kw, oh * ow] + framework::DDim col_matrix_shape = + framework::flatten_to_2d(col_shape, data_dim + 1); + // input_shape [Cin, H, W] + framework::DDim input_shape = + framework::slice_ddim(X->dims(), 1, X->dims().size()); + // filter_matrix_shape [Cout, Cin * kh * kw] + framework::DDim filter_matrix_shape = {W.dims()[0], + W.numel() / W.dims()[0]}; + + W.Resize(filter_matrix_shape); + framework::DDim output_matrix_shape = { + dY->dims()[1], dY->numel() / (dY->dims()[0] * dY->dims()[1])}; + int in_step = static_cast(X->dims()[1]) / groups; + int out_step = static_cast(dY->dims()[1]) / groups; + + bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations); + Tensor col; + Tensor col_matrix; + if (is_expand) { + col = ctx.AllocateTmpTensor(col_shape, dev_ctx); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } + + math::SetConstant set_zero; + auto blas = math::GetBlas(dev_ctx); + + // dx convolution double grad: gemm + col2im(col2vol) + // dx = ddw * dy ==> dx(N, Cin, H, W), ddw(Cout, Cin, kh, kw), dy(N, Cout, + // oH, oW) + if (dX && ddW_in) { + Tensor ddW; + ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape); + + dX->mutable_data(ctx.GetPlace()); + // if is_expand is false, the operation of set_zero is unnecessary + // because math::matmul will reset dx + if (is_expand) { + set_zero(dev_ctx, dX, static_cast(0)); + } + math::Col2VolFunctor col2vol; + math::Col2ImFunctor col2im; + + for (int i = 0; i < batch_size; i++) { + Tensor dy_batch = dY->Slice(i, i + 1).Resize(output_matrix_shape); + Tensor dx_batch = dX->Slice(i, i + 
1).Resize(input_shape); + for (int g = 0; g < groups; g++) { + // gemm + Tensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step); + Tensor dx_slice = dx_batch.Slice(g * in_step, (g + 1) * in_step); + if (!is_expand) { + col_matrix.ShareDataWith(dx_slice); + col_matrix.Resize(col_matrix_shape); + } + blas.MatMul(ddw_slice, true, dy_slice, false, T(1.0), &col_matrix, + T(0.0)); + + if (is_expand && data_dim == 2U) { + col2im(dev_ctx, col, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &dx_slice); + } else if (is_expand && data_dim == 3U) { + col2vol(dev_ctx, col, dilations, strides, paddings, &dx_slice); + } + } + } + } + + // dw = ddx * dy ==> dw(Cout, Cin, kh, kw), ddx(N, Cin, H, W), dy(N, Cout, + // oH, oW) + // dw convolution double grad: im2col(vol2col) + gemm + if (dW) { + dW->mutable_data(ctx.GetPlace()); + set_zero(dev_ctx, dW, static_cast(0)); + Tensor dW_arr = *dW; + dW_arr.Resize(filter_matrix_shape); + math::Im2ColFunctor im2col; + math::Vol2ColFunctor vol2col; + for (int i = 0; i < batch_size; ++i) { + Tensor dy_batch = dY->Slice(i, i + 1).Resize(output_matrix_shape); + Tensor ddx_batch = ddX->Slice(i, i + 1).Resize(input_shape); + for (int g = 0; g < groups; ++g) { + // im2col + Tensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor ddx_slice = ddx_batch.Slice(g * in_step, (g + 1) * in_step); + if (!is_expand) { + col.ShareDataWith(ddx_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + im2col(dev_ctx, ddx_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col); + } + + Tensor dw_slice = dW_arr.Slice(g * out_step, (g + 1) * out_step); + blas.MatMul(dy_slice, false, col_matrix, true, T(1.0), &dw_slice, + T(1.0)); + } + } + } + + // ddy = w * ddx + x * ddw ==> ddy(N, Cout, oH, oW), x/ddx(N, Cin, H, W), + // w/ddw(Cout, Cin, kh, kw) + // ddy convolution double grad: im2col(vol2col) + gemm + if (ddY) { + ddY->mutable_data(ctx.GetPlace()); + set_zero(dev_ctx, ddY, static_cast(0)); + math::Im2ColFunctor im2col; + math::Vol2ColFunctor vol2col; + for (int i = 0; i < batch_size; ++i) { + Tensor ddx_batch = ddX->Slice(i, i + 1).Resize(input_shape); + Tensor x_batch = X->Slice(i, i + 1).Resize(input_shape); + Tensor ddy_batch = ddY->Slice(i, i + 1).Resize(output_matrix_shape); + for (int g = 0; g < groups; ++g) { + Tensor x_slice = x_batch.Slice(g * in_step, (g + 1) * in_step); + Tensor ddx_slice = ddx_batch.Slice(g * in_step, (g + 1) * in_step); + if (!is_expand) { + col.ShareDataWith(ddx_slice); + col_matrix.ShareDataWith(col); + col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(dev_ctx, ddx_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(dev_ctx, ddx_slice, dilations, strides, paddings, &col); + } + + // gemm + Tensor ddy_slice = ddy_batch.Slice(g * out_step, (g + 1) * out_step); + Tensor w_slice = W.Slice(g * out_step, (g + 1) * out_step); + blas.MatMul(w_slice, false, col_matrix, false, T(1.0), &ddy_slice, + T(0.0)); + + if (ddW_in) { + Tensor ddW; + ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape); + + if (!is_expand) { + col.ShareDataWith(x_slice); + col_matrix.ShareDataWith(col); + 
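// Editor's note on the lowering used throughout this kernel: im2col (or
// vol2col in 3-D) unfolds each input slice into a [Cin/g * kh * kw, oh * ow]
// column matrix, so every convolution term becomes one blas.MatMul per sample
// and group, and col2im / col2vol folds columns back for the dX path. When
// is_expand is false (1x1 filter, unit stride, zero padding) the unfold is an
// identity, so the code just aliases buffers via ShareDataWith + Resize.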
col_matrix.Resize(col_matrix_shape); + } else if (data_dim == 2U) { + // im2col + im2col(dev_ctx, x_slice, dilations, strides, + std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, + &col); + } else if (data_dim == 3U) { + // vol2col + vol2col(dev_ctx, x_slice, dilations, strides, paddings, &col); + } + + // gemm + Tensor ddw_slice = ddW.Slice(g * out_step, (g + 1) * out_step); + blas.MatMul(ddw_slice, false, col_matrix, false, T(1.0), &ddy_slice, + T(1.0)); + } + } + } + } + } +}; + template class DepthwiseConvKernel : public framework::OpKernel { public: diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc index f44094ca..bab6fe24 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/conv_transpose_op.h" -#include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/cudnn_helper.h" namespace paddle { @@ -65,13 +64,13 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { // (N, M, H, W) or (N, M, D, H, W) cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, framework::vectorize2int(input->dims()), groups); + layout, framework::vectorize(input->dims()), groups); // (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w) cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, framework::vectorize2int(output->dims()), groups); + layout, framework::vectorize(output->dims()), groups); // (M, C, K_h, K_w) or (M, C, K_d, K_h, K_w) cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor( - layout, framework::vectorize2int(filter->dims()), groups); + layout, framework::vectorize(filter->dims()), groups); cudnnConvolutionDescriptor_t cudnn_conv_desc = conv_desc.descriptor(paddings, strides, dilations); @@ -149,13 +148,13 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { // Input: (N, M, H, W) or (N, M, D, H, W) cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, framework::vectorize2int(input->dims()), groups); + layout, framework::vectorize(input->dims()), groups); // Output: (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w) cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, framework::vectorize2int(output_grad->dims()), groups); + layout, framework::vectorize(output_grad->dims()), groups); // Filter (M, C, K_h, K_w) or (M, C, K_d K_h, K_w) cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor( - layout, framework::vectorize2int(filter->dims()), groups); + layout, framework::vectorize(filter->dims()), groups); cudnnConvolutionDescriptor_t cudnn_conv_desc = conv_desc.descriptor(paddings, strides, dilations); diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index 01afdd28..e76c57ab 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -170,6 +170,14 @@ void Conv2DTransposeOpMaker::Make() { .SetDefault(false); AddAttr("fuse_relu", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); + AddAttr("fuse_activation", + "(string, default \"\") Only used in mkldnn kernel") + .SetDefault(""); + AddAttr("fuse_alpha", + "(float, default 0.0) Only used in mkldnn kernel") + .SetDefault(0.0f); + AddAttr("fuse_beta", "(float, 
default 0.0) Only used in mkldnn kernel") + .SetDefault(0.0f); AddAttr( "data_format", "(string, default NCHW) Only used in " diff --git a/paddle/fluid/operators/crf_decoding_op.cc b/paddle/fluid/operators/crf_decoding_op.cc index c701e895..2de714e0 100644 --- a/paddle/fluid/operators/crf_decoding_op.cc +++ b/paddle/fluid/operators/crf_decoding_op.cc @@ -19,14 +19,17 @@ namespace operators { class CRFDecodingOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("Emission", - "(LoDTensor, default: LoDTensor). A LoDTensor with shape " - "[N x D] where N is the size of the mini-batch and D is the total " - "tag number. This input is the unscaled emission weight matrix of " - "the linear_chain_crf operator."); + AddInput( + "Emission", + "(Tensor/LoDTensor). For a LoDTensor input, its " + "shape is [N x D] where N is the total sequence length of the " + "mini-batch and D is the total tag number. While for a tensor " + "input, its shape is [B X S X D] with B the batch size and S the " + "sequence length of each sample after padding. This input is the " + "unscaled emission weight matrix of the linear_chain_crf operator."); AddInput( "Transition", - "(Tensor, default: Tensor). A Tensor with shape [(D + 2) x D]. " + "(Tensor). A Tensor with shape [(D + 2) x D]. " "This input is the transition weights learned by the linear_chain_crf " "operator, denoted as w. The 1st row of w are transition weights for " "the start mask. The 2nd row of w are transition weights for the end " @@ -34,15 +37,24 @@ class CRFDecodingOpMaker : public framework::OpProtoAndCheckerMaker { "w. See more details in comments of the linear_chain_crf operator."); AddInput( "Label", - "(LoDTensor, LoDTensor). The ground truth with shape " - "[N x 1]. This input is optional. See more details in the operator's " - "comments.") + "(Tensor/LoDTensor). The ground truth with shape " + "[N x 1] (for LoDTensor) or [B x S] (for Tensor). This input is " + "optional. " + "See more details in the operator's comments.") .AsDispensable(); AddOutput( "ViterbiPath", - "(LoDTensor, LoDTensor). The decoding results. What to " + "(Tensor/LoDTensor). The decoding results. What to " "return changes depending on whether the Input(Label) (the ground " "truth) is given. See more details in the operator's comment."); + AddInput("Length", + "(Tensor). The actual length of each sample before " + "padding with shape [B x 1]. It means the Input(Emission), " + "Input(Label) " + "and Output(ViterbiPath) are common tensors with padding when " + "this input " + "is given.") + .AsDispensable(); AddComment(R"DOC( The crf_decoding operator reads the emission feature weights and the transition feature weights learned by the linear_chain_crf operator. It implements the @@ -55,15 +67,16 @@ The output of this operator changes according to whether Input(Label) is given: 1. Input(Label) is given: This happens in training. This operator is used to co-work with the chunk_eval operator. - When Input(Label) is given, the crf_decoding operator returns a row vector - with shape [N x 1] whose values are fixed to be 0, indicating an incorrect - prediction, or 1 indicating a tag is correctly predicted. Such an output is the - input to chunk_eval operator. + When Input(Label) is given, the crf_decoding operator returns tensor with the + sampe shape as Input(Label) whose values are fixed to be 0, indicating an + incorrect prediction, or 1 indicating a tag is correctly predicted. Such an + output is the input to chunk_eval operator. 2. 
Input(Label) is not given: This is the standard decoding process. -The crf_decoding operator returns a row vector with shape [N x 1] whose values +The crf_decoding operator returns a row vector with shape [N x 1]/[B x S], here +the shape depends on the inputs are LoDTensors or common tensors, whose values range from 0 to maximum tag number - 1, Each element indicates an index of a predicted tag. )DOC"); @@ -75,37 +88,46 @@ class CRFDecodingOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("Emission"), - "Input(Emission) should be not null."); - PADDLE_ENFORCE(ctx->HasInput("Transition"), - "Input(Transition) should be not null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("Emission"), true, + "Input(Emission) should be not null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("Transition"), true, + "Input(Transition) should be not null."); - PADDLE_ENFORCE(ctx->HasOutput("ViterbiPath"), - "Output(ViterbiPath) should be not null."); + PADDLE_ENFORCE_EQ(ctx->HasOutput("ViterbiPath"), true, + "Output(ViterbiPath) should be not null."); auto emission_dims = ctx->GetInputDim("Emission"); - PADDLE_ENFORCE_EQ(emission_dims.size(), 2, - "The Input(Emission) should be a 2-D tensor."); - PADDLE_ENFORCE(emission_dims[0], "An empty mini-batch is not allowed."); + bool has_length = ctx->HasInput("Length"); + + if (has_length) { + PADDLE_ENFORCE_EQ(emission_dims.size(), 3, + "The Input(Emission) should be a 3-D tensor."); + } else { + PADDLE_ENFORCE_EQ(emission_dims.size(), 2, + "The Input(Emission) should be a 2-D tensor."); + } + PADDLE_ENFORCE_NE(emission_dims[0], 0, + "An empty mini-batch is not allowed."); auto transition_dims = ctx->GetInputDim("Transition"); - PADDLE_ENFORCE_EQ(transition_dims.size(), 2, + PADDLE_ENFORCE_EQ(transition_dims.size(), 2UL, "The Input(Transition) should be a 2-D tensor."); PADDLE_ENFORCE_EQ( transition_dims[0] - 2, transition_dims[1], "An invalid dimension for the Input(Transition), which should " "be a 2-D tensor with shape [(D + 2) x D]."); - if (ctx->IsRuntime() || (emission_dims[1] > 0 && transition_dims[1] > 0)) { + if (ctx->IsRuntime() || (emission_dims[emission_dims.size() - 1] > 0 && + transition_dims[transition_dims.size() - 1] > 0)) { PADDLE_ENFORCE_EQ( - emission_dims[1], transition_dims[1], - "The 2nd dimension of the Input(Emission) and the Input(Transition) " + emission_dims[emission_dims.size() - 1], + transition_dims[transition_dims.size() - 1], + "The last dimension of the Input(Emission) and the Input(Transition) " "should be equal to the tag number."); } if (ctx->HasInput("Label")) { auto label_dims = ctx->GetInputDim("Label"); - PADDLE_ENFORCE(label_dims.size() == 2UL && label_dims[1] == 1UL, - "The Input(Label) should be a 2-D tensor with the 2nd " - "dimensions fixed to 1."); + PADDLE_ENFORCE_EQ(label_dims.size(), 2UL, + "The Input(Label) should be a 2-D tensor"); if (ctx->IsRuntime() || (emission_dims[0] > 0 && label_dims[0] > 0)) { PADDLE_ENFORCE_EQ( emission_dims[0], label_dims[0], @@ -115,7 +137,11 @@ class CRFDecodingOp : public framework::OperatorWithKernel { } ctx->ShareLoD("Emission", /*->*/ "ViterbiPath"); - ctx->SetOutputDim("ViterbiPath", {emission_dims[0], 1}); + if (has_length) { + ctx->SetOutputDim("ViterbiPath", {emission_dims[0], emission_dims[1]}); + } else { + ctx->SetOutputDim("ViterbiPath", {emission_dims[0], 1}); + } } protected: diff --git a/paddle/fluid/operators/crf_decoding_op.h 
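The kernel change in crf_decoding_op.h below implements both branches; the padded branch added here is pure block indexing. A condensed sketch (seq_num and length_data as in the kernel; max_seq_len stands for in_dims[1], the padded sequence length):

// After reshaping the emission tensor from [B, S, D] to [B*S, D], sample i
// occupies rows [i * S, i * S + length_i); only those rows are decoded.
for (size_t i = 0; i < seq_num; ++i) {
  if (length_data[i] == 0) continue;  // fully padded sample, nothing to do
  int start_pos = static_cast<int>(i) * max_seq_len;
  int end_pos = start_pos + static_cast<int>(length_data[i]);
  // Decode(emission_rows[start_pos:end_pos], transition,
  //        &path_rows[start_pos:end_pos]);
}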
b/paddle/fluid/operators/crf_decoding_op.h index 13a587dc..74b9cb20 100644 --- a/paddle/fluid/operators/crf_decoding_op.h +++ b/paddle/fluid/operators/crf_decoding_op.h @@ -35,31 +35,59 @@ class CRFDecodingOpKernel : public framework::OpKernel { auto* label = ctx.Input("Label"); auto* decoded_path = ctx.Output("ViterbiPath"); - PADDLE_ENFORCE_EQ(emission_weights->NumLevels(), 1UL, - "The Input(Emission) should be a sequence."); - auto lod = emission_weights->lod(); - PADDLE_ENFORCE(lod.size(), "Input(Emission) must be a sequence."); - const size_t level = 0; - const size_t seq_num = lod[level].size() - 1; - int64_t* path = decoded_path->mutable_data(platform::CPUPlace()); math::SetConstant()( ctx.template device_context(), decoded_path, 0); - for (size_t i = 0; i < seq_num; ++i) { - if (lod[level][i] == lod[level][i + 1]) continue; - int start_pos = static_cast(lod[level][i]); - int end_pos = static_cast(lod[level][i + 1]); - Tensor decoded_path_one_seq = decoded_path->Slice(start_pos, end_pos); - Decode(emission_weights->Slice(start_pos, end_pos), *transition_weights, - &decoded_path_one_seq); - } + bool has_length = ctx.HasInput("Length"); + if (has_length) { + auto* length = ctx.Input("Length"); + const size_t seq_num = length->numel(); + const int64_t* length_data = length->data(); + auto in_dims = emission_weights->dims(); + + auto& dev_ctx = ctx.template device_context(); + framework::Tensor emission_weights_tmp = + ctx.AllocateTmpTensor(emission_weights->dims(), + dev_ctx); + emission_weights_tmp.ShareDataWith(*emission_weights); + emission_weights_tmp.Resize({in_dims[0] * in_dims[1], in_dims[2]}); + + decoded_path->Resize({in_dims[0] * in_dims[1], 1}); + for (size_t i = 0; i < seq_num; ++i) { + if (length_data[i] == 0) continue; + int start_pos = i * in_dims[1]; + int end_pos = start_pos + static_cast(length_data[i]); + Tensor decoded_path_one_seq = decoded_path->Slice(start_pos, end_pos); + Decode(emission_weights_tmp.Slice(start_pos, end_pos), + *transition_weights, &decoded_path_one_seq); + } + decoded_path->Resize({in_dims[0], in_dims[1]}); + } else { + PADDLE_ENFORCE_EQ(emission_weights->NumLevels(), 1UL, + "The Input(Emission) should be a sequence."); + auto lod = emission_weights->lod(); + PADDLE_ENFORCE_GT(lod.size(), 0, "Input(Emission) must be a sequence."); + const size_t level = 0; + const size_t seq_num = lod[level].size() - 1; + + for (size_t i = 0; i < seq_num; ++i) { + if (lod[level][i] == lod[level][i + 1]) continue; + int start_pos = static_cast(lod[level][i]); + int end_pos = static_cast(lod[level][i + 1]); + Tensor decoded_path_one_seq = decoded_path->Slice(start_pos, end_pos); + Decode(emission_weights->Slice(start_pos, end_pos), *transition_weights, + &decoded_path_one_seq); + } + } if (label) { - PADDLE_ENFORCE_EQ(label->NumLevels(), 1UL, - "The Input(Label) should be a sequence."); + if (!has_length) { + PADDLE_ENFORCE_EQ(label->NumLevels(), 1UL, + "The Input(Label) should be a sequence."); + } const int64_t* label_value = label->data(); - size_t batch_size = emission_weights->dims()[0]; - for (size_t i = 0; i < batch_size; ++i) { + size_t numel = label->numel(); + for (size_t i = 0; i < numel; ++i) { path[i] = label_value[i] == path[i] ? 1 : 0; } } diff --git a/paddle/fluid/operators/crop_tensor_op.cc b/paddle/fluid/operators/crop_tensor_op.cc new file mode 100644 index 00000000..9b536e98 --- /dev/null +++ b/paddle/fluid/operators/crop_tensor_op.cc @@ -0,0 +1,300 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/crop_tensor_op.h" +#include +#include +#include + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class CropTensorOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, + "Input(X) of Op(crop_tensor) should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, + "Output(Out) of Op(crop_tensor) should not be null."); + + auto shape = ctx->Attrs().Get>("shape"); + if (ctx->HasInputs("ShapeTensor")) { + // top prority shape + auto inputs_name = ctx->Inputs("ShapeTensor"); + PADDLE_ENFORCE_GT( + inputs_name.size(), 0, + "Input(ShapeTensor)'size of Op(crop_tensor) can't be zero. " + "Please check the Attr(shape)'s size of " + "Op(fluid.layers.crop_tensor)."); + auto out_dims = std::vector(inputs_name.size(), -1); + for (size_t i = 0; i < shape.size(); ++i) { + if (shape[i] != -1) { + out_dims[i] = static_cast(shape[i]); + } + } + ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); + + return; + } + auto x_dim = ctx->GetInputDim("X"); + if (ctx->HasInput("Shape")) { + auto shape_dim = ctx->GetInputDim("Shape"); + PADDLE_ENFORCE_EQ( + shape_dim.size(), 1, + "Input(Shape)'s dimension size of Op(crop_tensor) must be 1. " + "Please check the Attr(shape)'s dimension size of " + "Op(fluid.layers.crop_tensor)."); + PADDLE_ENFORCE_EQ(shape_dim[0], x_dim.size(), + "Input(Shape)'s size of Op(crop_tensor) must be equal " + "to dimension size of input tensor. " + "Please check the Attr(shape)'s size of " + "Op(fluid.layers.crop_tensor)."); + if (ctx->IsRuntime()) { + // If true, set the shape of Output(Out) according to Input(Shape) in + // CropTensorKernel with ExecutionContext. Also check LoD in + // CropTensorKernel. 
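// Editor's note on the precedence this InferShape encodes, highest first:
//   1. ShapeTensor - a list of 1-element tensors, one per output dimension;
//   2. Shape       - a single 1-D tensor holding the whole target shape;
//   3. shape       - the compile-time std::vector<int> attribute fallback.
// Dimensions only known at runtime stay -1 at compile time and are resolved
// inside the kernel.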
+ ctx->ShareLoD("X", /*->*/ "Out"); + } else { + auto out_dims = std::vector(shape_dim[0], -1); + ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); + } + return; + } + PADDLE_ENFORCE_EQ(int64_t(shape.size()), x_dim.size(), + "Attr(shape)'size of Op(crop_tensor) should be equal to " + "dimention size of input tensor."); + std::vector tensor_shape(shape.size()); + for (size_t i = 0; i < shape.size(); ++i) { + tensor_shape[i] = static_cast(shape[i]); + } + ctx->SetOutputDim("Out", framework::make_ddim(tensor_shape)); + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string &var_name, const Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const override { + if (var_name == "ShapeTensor" || var_name == "OffsetsTensor" || + var_name == "Shape" || var_name == "Offsets") { + return expected_kernel_type; + } + + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } +}; + +class CropTensorOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "The input of pad op. " + "The input should be a k-D tensor(k > 0 and k < 7)."); + AddInput("Shape", + "The input used to describe shape of output, which is a " + "1-D vector whose size equals to the rank of input 'X'. The " + "elements data type must be int. It has a higher priority than " + "the shape attribute") + .AsDispensable(); + AddInput("Offsets", + "The input used to describe offsets in runtime, which is a " + "1-D vector whose size equals to the rank of input 'X'. The " + "elements data type must be int. It has a higher priority than " + "the offsets attribute") + .AsDispensable(); + AddInput("ShapeTensor", + "(vector>, optional). If provided, crop_tensor will " + "use this. The shape of the tensor in vector MUST BE [1]. " + "It has the highest priority compare with Input(Shape) and " + "attr(shape).") + .AsDuplicable() + .AsDispensable(); + AddInput("OffsetsTensor", + "(vector>, optional). If provided, crop_tensor will " + "use this. The shape of the tensor in vector MUST BE [1]. " + "It has the highest priority compare with Input(Offsets) and " + "attr(offsets).") + .AsDuplicable() + .AsDispensable(); + AddOutput("Out", + "The output of crop_tensor op, " + "which is of the same dimensions as X."); + AddAttr>("offsets", + "A list describing offsets to be cropped. " + "The size of offsets list should be the same as " + "the dimension size of input X.") + .SetDefault(std::vector()); + AddAttr>("shape", + "A list describing the shape of output. " + "The size of shape list should be the same as " + "the dimension size of input X.") + .SetDefault(std::vector()); + AddComment(R"DOC( +CropTensor Operator. + +Crop input into output, as specified by offsets and shape. + +There are three ways to set the offsets: +1. Input 'OffsetsTensor: It is a tensor list. It should be set as a list that + contains tensor variable in python configure script. + This way is suitable for dynamic offsets. +2. Input 'Offsets': It is a variable and can be output of other operators. + This way is suitable for dynamic offsets. +3. Attribute 'offsets': It will be set in python configure script. This way + is suitable for fixed offsets. + +You CANNOT use these three ways at the same time. 
An exception will be raised +if input 'OffsetsTensor' or 'Offset' is configured and meanwhile the attribute 'offsets' is +not empty. + +There are three ways to set shape: +1. Input 'ShapeTensor': It is a tensor list. It should be set as a list that contains + tensor variable in python configure script. This way is suitable + for dynamic shape. +2. Input 'Shape': It is a Variable and can be output of other operators. This way is suitable + for dynamic shape. +2. Attribute 'shape': crop input X into the shape described by a list. The size of shape + list should be the same as the dimension size of input X. This way is + suitable for fixed shape. + +The input should be a k-D tensor(k > 0 and k < 7). As an example: + +Case 1: +Given + + X = [[0, 1, 2, 0, 0] + [0, 3, 4, 0, 0] + [0, 0, 0, 0, 0]], + +and + + offsets = [0, 1], + +and + + shape = [2, 2], + +we get: + + Out = [[1, 2], + [3, 4]]. + + +Case 2: +Given + + X = [[0, 1, 2, 5, 0] + [0, 3, 4, 6, 0] + [0, 0, 0, 0, 0]], + +and offsets is a list that contains tensor variable, +in runtime offses_var' s value is 1. + + offsets = [0, offsets_var], + +and shape is a list that contains tensor variable, +in runtime dim's value is 2. + + shape = [dim, 3] + +we get: + + Out = [[1, 2, 5], + [3, 4, 6]]. +)DOC"); + } +}; + +class CropTensorOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, + "Input(X) of Op(crop_tensor) should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, + "Input(Out@GRAD) of Op(crop_tensor) should not be null."); + auto x_dims = ctx->GetInputDim("X"); + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.device_context()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string &var_name, const Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const override { + if (var_name == "ShapeTensor" || var_name == "OffsetsTensor" || + var_name == "Shape" || var_name == "Offsets") { + return expected_kernel_type; + } + + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } +}; + +class CropTensorGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("crop_tensor_grad"); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetInput("X", Input("X")); + if (ForwardOp().Inputs().count("OffsetsTensor") > 0) { + op->SetInput("OffsetsTensor", Input("OffsetsTensor")); + } + if (ForwardOp().Inputs().count("Offsets") > 0) { + op->SetInput("Offsets", Input("Offsets")); + } + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(crop_tensor, ops::CropTensorOp, ops::CropTensorOpMaker, + ops::CropTensorGradOpDescMaker); +REGISTER_OPERATOR(crop_tensor_grad, ops::CropTensorOpGrad); 
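Aside (not part of the patch): the three-way precedence that the DOC comment and InferShape above implement is easy to state on its own. A minimal standalone sketch in plain C++, with a hypothetical helper name and std::vector<int> standing in for the framework's tensors; an empty vector here means "input not provided":

#include <vector>

// Illustrative only: mirrors CropTensorOp's resolution order --
// ShapeTensor wins over Shape, which wins over attr(shape).
std::vector<int> ResolveCropShape(const std::vector<int>& shape_tensor_list,
                                  const std::vector<int>& shape_input,
                                  const std::vector<int>& shape_attr) {
  if (!shape_tensor_list.empty()) return shape_tensor_list;  // top priority
  if (!shape_input.empty()) return shape_input;              // runtime Shape
  return shape_attr;                                         // fixed fallback
}

The same order applies to offsets via OffsetsTensor, Offsets, and attr(offsets).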
+REGISTER_OP_CPU_KERNEL(
+    crop_tensor,
+    ops::CropTensorKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::CropTensorKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    crop_tensor_grad,
+    ops::CropTensorGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::CropTensorGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/linear_chain_crf_op.cu b/paddle/fluid/operators/crop_tensor_op.cu
similarity index 53%
rename from paddle/fluid/operators/linear_chain_crf_op.cu
rename to paddle/fluid/operators/crop_tensor_op.cu
index 4f7738e8..9d28d984 100644
--- a/paddle/fluid/operators/linear_chain_crf_op.cu
+++ b/paddle/fluid/operators/crop_tensor_op.cu
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -11,17 +11,14 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
-#include "paddle/fluid/operators/linear_chain_crf_op.h"
+#include "paddle/fluid/operators/crop_tensor_op.h"
 
 namespace ops = paddle::operators;
-
 REGISTER_OP_CUDA_KERNEL(
-    linear_chain_crf,
-    ops::LinearChainCRFOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::LinearChainCRFOpKernel<paddle::platform::CUDADeviceContext, double>);
+    crop_tensor,
+    ops::CropTensorKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::CropTensorKernel<paddle::platform::CUDADeviceContext, double>);
 REGISTER_OP_CUDA_KERNEL(
-    linear_chain_crf_grad,
-    ops::LinearChainCRFGradOpKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::LinearChainCRFGradOpKernel<paddle::platform::CUDADeviceContext, double>);
+    crop_tensor_grad,
+    ops::CropTensorGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::CropTensorGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/crop_tensor_op.h b/paddle/fluid/operators/crop_tensor_op.h
new file mode 100644
index 00000000..42f118d0
--- /dev/null
+++ b/paddle/fluid/operators/crop_tensor_op.h
@@ -0,0 +1,284 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/strided_memcpy.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+using framework::Tensor;
+
+inline std::vector<int> get_new_data(
+    const std::vector<const Tensor*>& list_new_tensor) {
+  // get scalar values from the list of 1-element tensors
+  std::vector<int> vec_new_data;
+  for (size_t i = 0; i < list_new_tensor.size(); ++i) {
+    auto tensor = list_new_tensor[i];
+    PADDLE_ENFORCE_EQ(
+        tensor->dims(), framework::make_ddim({1}),
+        "The tensor's shape in list of Op(crop_tensor) should be [1].");
+    if (platform::is_gpu_place(tensor->place())) {
+      framework::Tensor temp;
+      TensorCopySync(*tensor, platform::CPUPlace(), &temp);
+
+      vec_new_data.push_back(static_cast<int>(*temp.data<int>()));
+    } else {
+      vec_new_data.push_back(static_cast<int>(*tensor->data<int>()));
+    }
+  }
+
+  return vec_new_data;
+}
+
+static framework::DDim ValidateShape(const std::vector<int> shape,
+                                     const framework::DDim& in_dims) {
+  auto in_dim_size = in_dims.size();
+  auto shape_size = shape.size();
+  PADDLE_ENFORCE_EQ(
+      in_dim_size, shape_size,
+      "Input(ShapeTensor)'s dimension size of Op(crop_tensor) should be equal "
+      "to that of input tensor. "
+      "Please check the Attr(shape)'s size of Op(fluid.layers.crop_tensor).");
+  const int64_t unk_dim_val = -1;
+  int unk_dim_idx = -1;
+  std::vector<int64_t> output_shape(shape.size(), 0);
+  for (size_t i = 0; i < shape.size(); ++i) {
+    if (shape[i] == unk_dim_val) {
+      PADDLE_ENFORCE_EQ(unk_dim_idx, -1,
+                        "Only one element of shape can be unknown.");
+      PADDLE_ENFORCE_EQ(i, 0, "Only the first element of shape can be -1.");
+      unk_dim_idx = i;
+    } else {
+      PADDLE_ENFORCE_GT(shape[i], 0,
+                        "Each element of shape must be greater than 0 "
+                        "except the first element.");
+    }
+    output_shape[i] = static_cast<int64_t>(shape[i]);
+  }
+
+  return framework::make_ddim(output_shape);
+}
+
+static std::vector<int> GetShape(const framework::ExecutionContext& ctx) {
+  std::vector<int> res;
+  int rank = ctx.Input<Tensor>("X")->dims().size();
+  auto list_new_shape_tensor = ctx.MultiInput<Tensor>("ShapeTensor");
+  if (list_new_shape_tensor.size() > 0) {
+    // have shape tensor list
+    PADDLE_ENFORCE_EQ(list_new_shape_tensor.size(), rank,
+                      "Input(ShapeTensor)'s length of Op(crop_tensor) should "
+                      "be equal to dimension size of input tensor.");
+    res = get_new_data(list_new_shape_tensor);
+
+    return res;
+  }
+
+  auto* shape_tensor = ctx.HasInput("Shape")
+                           ? ctx.Input<Tensor>("Shape")
+                           : nullptr;
+  if (shape_tensor) {
+    auto* shape_data = shape_tensor->data<int>();
+    framework::Tensor cpu_shape_tensor;
+    if (platform::is_gpu_place(shape_tensor->place())) {
+      TensorCopySync(*shape_tensor, platform::CPUPlace(), &cpu_shape_tensor);
+      shape_data = cpu_shape_tensor.data<int>();
+    }
+    res = std::vector<int>(shape_data, shape_data + shape_tensor->numel());
+  }
+
+  return res;
+}
+
+static std::vector<int> GetOffsets(const framework::ExecutionContext& ctx) {
+  std::vector<int> res;
+  int rank = ctx.Input<Tensor>("X")->dims().size();
+  auto list_new_offsets_tensor =
+      ctx.MultiInput<Tensor>("OffsetsTensor");
+  if (list_new_offsets_tensor.size() > 0) {
+    // have offsets tensor list
+    res = get_new_data(list_new_offsets_tensor);
+
+    return res;
+  }
+
+  if (ctx.HasInput("Offsets")) {
+    PADDLE_ENFORCE_EQ(
+        ctx.Attr<std::vector<int>>("offsets").empty(), true,
+        "Input 'Offsets' and attribute 'offsets' should not be used "
+        "at the same time.");
+    const auto* offsets_tensor = ctx.Input<Tensor>("Offsets");
+    PADDLE_ENFORCE_EQ(offsets_tensor->dims().size(), 1);
+    PADDLE_ENFORCE_EQ(
+        rank, offsets_tensor->dims()[0],
+        "Offsets size should be equal to dimension size of input tensor.");
+    const int* offsets_data;
+    framework::Tensor cpu_tmp_tensor;
+    if (platform::is_cpu_place(offsets_tensor->place())) {
+      offsets_data = offsets_tensor->data<int>();
+    } else {
+      framework::TensorCopySync(*offsets_tensor, platform::CPUPlace(),
+                                &cpu_tmp_tensor);
+      offsets_data = cpu_tmp_tensor.data<int>();
+    }
+    res = std::vector<int>(offsets_data, offsets_data + rank);
+  } else {
+    res = ctx.Attr<std::vector<int>>("offsets");
+    PADDLE_ENFORCE_EQ(
+        rank, static_cast<int>(res.size()),
+        "Offsets size should be equal to dimension size of input tensor.");
+  }
+  return res;
+}
+
+template <typename DeviceContext, typename T, size_t D>
+void CropTensorFunction(const framework::ExecutionContext& context) {
+  auto* x = context.Input<Tensor>("X");
+  auto* out = context.Output<Tensor>("Out");
+  auto x_dims = x->dims();
+  auto out_dims = out->dims();
+
+  // get shape from Input(ShapeTensor) or Input(Shape)
+  std::vector<int> shape = GetShape(context);
+  // out_dims is set by attr(shape) when both shape inputs are absent
+  if (shape.size() == 0) {
+    for (size_t i = 0; i < out_dims.size(); ++i) {
+      shape.push_back(out_dims[i]);
+    }
+  }
+  out_dims = ValidateShape(shape, x->dims());
+  if (out_dims[0] == -1) {
+    out_dims[0] = x->dims()[0];
+  }
+
+  out->mutable_data<T>(out_dims, context.GetPlace());
+  auto x_stride = framework::stride(x->dims());
+  auto offsets = GetOffsets(context);
+  int64_t offset = 0;
+  for (size_t i = 0; i < offsets.size(); ++i) {
+    PADDLE_ENFORCE_LE(
+        offsets[i] + shape[i], x_dims[i],
+        "The sum of the Attr(offsets) and Attr(shape) of Op(crop_tensor) "
+        "should be less than or equal to corresponding input dimension size.");
+    offset += (x_stride[i] * offsets[i]);
+  }
+
+  auto x_tensor = EigenTensor<T, D>::From(*x);
+  auto out_tensor = EigenTensor<T, D>::From(*out);
+  Eigen::array<int, D> e_offsets;
+  Eigen::array<int, D> e_shape;
+  for (size_t i = 0; i < D; ++i) {
+    e_offsets[i] = offsets[i];
+    e_shape[i] = out->dims()[i];
+  }
+  auto& place =
+      *context.template device_context<DeviceContext>().eigen_device();
+  out_tensor.device(place) = x_tensor.slice(e_offsets, e_shape);
+}
+
+template <typename DeviceContext, typename T>
+class CropTensorKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    int rank = context.Input<Tensor>("X")->dims().size();
+    switch (rank) {
+      case 1:
+        CropTensorFunction<DeviceContext, T, 1>(context);
+        break;
+      case 2:
+        CropTensorFunction<DeviceContext, T, 2>(context);
+        break;
+      case 3:
+        CropTensorFunction<DeviceContext, T, 3>(context);
+        break;
+      case 4:
+        CropTensorFunction<DeviceContext, T, 4>(context);
+        break;
+      case 5:
+        CropTensorFunction<DeviceContext, T, 5>(context);
+        break;
+      case 6:
+        CropTensorFunction<DeviceContext, T, 6>(context);
+        break;
+      default:
+        PADDLE_THROW(
+            "CropTensorOp only support tensors with no more than 6 "
+            "dimensions.");
+    }
+  }
+};
+
+template <typename DeviceContext, typename T, size_t D>
+void CropTensorGradFunction(const framework::ExecutionContext& context) {
+  auto* d_x = context.Output<Tensor>(framework::GradVarName("X"));
+  auto* x = context.Input<Tensor>("X");
+  if (d_x != nullptr) {
+    auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
+    d_x->mutable_data<T>(x->dims(), context.GetPlace());
+    auto offsets = GetOffsets(context);
+    Eigen::array<std::pair<int, int>, D> paddings;
+    for (size_t i = 0; i < D; ++i) {
+      paddings[i].first = offsets[i];
+      paddings[i].second = d_x->dims()[i] - d_out->dims()[i] - offsets[i];
+    }
+    auto d_x_tensor = EigenTensor<T, D>::From(*d_x);
+    auto d_out_tensor = EigenTensor<T, D>::From(*d_out);
+    d_x_tensor.device(
+        *context.template device_context<DeviceContext>().eigen_device()) =
+        d_out_tensor.pad(paddings, 0);
+  }
+}
+
+template <typename DeviceContext, typename T>
+class CropTensorGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    size_t rank =
+        context.Input<Tensor>(framework::GradVarName("Out"))->dims().size();
+    switch (rank) {
+      case 1:
+        CropTensorGradFunction<DeviceContext, T, 1>(context);
+        break;
+      case 2:
+        CropTensorGradFunction<DeviceContext, T, 2>(context);
+        break;
+      case 3:
+        CropTensorGradFunction<DeviceContext, T, 3>(context);
+        break;
+      case 4:
+        CropTensorGradFunction<DeviceContext, T, 4>(context);
+        break;
+      case 5:
+        CropTensorGradFunction<DeviceContext, T, 5>(context);
+        break;
+      case 6:
+        CropTensorGradFunction<DeviceContext, T, 6>(context);
+        break;
+      default:
+        PADDLE_THROW(
+            "CropTensorOp only support tensors with no more than 6 "
+            "dimensions.");
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc
index da2c74b0..624b2b9c 100644
--- a/paddle/fluid/operators/cross_entropy_op.cc
+++ b/paddle/fluid/operators/cross_entropy_op.cc
@@ -25,19 +25,21 @@ class CrossEntropyOpBase : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
-    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
+    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, "Input(X) should be not null.");
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Label"), true,
+                      "Input(Label) should be not null.");
 
-    PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null.");
+    PADDLE_ENFORCE_EQ(ctx->HasOutput("Y"), true,
+                      "Output(Y) should be not null.");
 
     auto x_dims = ctx->GetInputDim("X");
     auto label_dims = ctx->GetInputDim("Label");
     int rank = x_dims.size();
-    PADDLE_ENFORCE_EQ(rank, label_dims.size(),
-                      "Input(X) and Input(Label) shall have the same rank.");
+
     bool contain_unknown_dim = framework::contain_unknown_dim(x_dims) ||
                                framework::contain_unknown_dim(label_dims);
     bool check = ctx->IsRuntime() || !contain_unknown_dim;
+
     if (check) {
       PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
                         framework::slice_ddim(label_dims, 0, rank - 1),
@@ -46,19 +48,30 @@ class CrossEntropyOpBase : public framework::OperatorWithKernel {
     }
 
     if (IsSoftLabel(ctx)) {
+      PADDLE_ENFORCE_EQ(
+          rank, label_dims.size(),
+          "If Attr(soft_label) == true, Input(X) and Input(Label) "
+          "shall have the same rank.");
       if (check) {
         PADDLE_ENFORCE_EQ(x_dims[rank - 1], label_dims[rank - 1],
                           "If Attr(soft_label) == true, the last dimension of "
                           "Input(X) and Input(Label) should be equal.");
       }
     } else {
-      PADDLE_ENFORCE_EQ(label_dims[rank - 1], 1UL,
-                        "If Attr(softLabel) == false, the last dimension of "
-                        "Input(Label) should be 1.");
+      if (rank == label_dims.size()) {
+        PADDLE_ENFORCE_EQ(label_dims[rank - 1], 1UL,
+                          "the last dimension of Input(Label) should be 1.");
+      } else {
+        PADDLE_ENFORCE_EQ(
+            rank, label_dims.size() + 1,
+            "The rank of Input(X) should be equal to Input(Label) plus 1.");
+      }
     }
 
-    auto y_dims = x_dims;
-    y_dims[rank - 1] = 1;
+    auto y_dims = label_dims;
+    if (rank == label_dims.size()) {
+      y_dims[rank - 1] = 1;
+    }
     ctx->SetOutputDim("Y", y_dims);
     ctx->ShareLoD("X", /*->*/ "Y");
   }
@@ -82,20 +95,19 @@ class CrossEntropyGradientOpBase : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const {
-    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
-                   "Input(Y@GRAD) shoudl be not null.");
-    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
-                   "Output(X@GRAD) should be not null.");
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Label"), true,
+                      "Input(Label) should be not null.");
+    PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Y")), true,
+                      "Input(Y@GRAD) should be not null.");
+    PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true,
+                      "Output(X@GRAD) should be not null.");
 
     auto x_dims = GetXDim(ctx);
     auto label_dims = ctx->GetInputDim("Label");
     auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y"));
     int rank = x_dims.size();
-    PADDLE_ENFORCE_EQ(dy_dims.size(), rank,
-                      "Input(Y@Grad) and Input(X) should have the same rank.");
-    PADDLE_ENFORCE_EQ(label_dims.size(), rank,
-                      "Input(Label) and Input(X) should have the same rank.");
+    PADDLE_ENFORCE_EQ(dy_dims.size(), label_dims.size(),
+                      "Input(Y@Grad) and Input(Y) should have the same rank.");
 
     bool check = true;
     if ((!ctx->IsRuntime()) && (framework::product(x_dims) <= 0 ||
@@ -104,30 +116,12 @@ class CrossEntropyGradientOpBase : public framework::OperatorWithKernel {
     }
 
     if (check) {
-      PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
-                        framework::slice_ddim(label_dims, 0, rank - 1),
-                        "The Input(X) and Input(Label) should have the same "
-                        "shape except the last dimension.");
       PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
                         framework::slice_ddim(dy_dims, 0, rank - 1),
                         "The Input(X) and Input(Y@Grad) should have the same "
                         "shape except the last dimension.");
     }
 
-    if (IsSoftLabel(ctx)) {
-      if (check) {
-        PADDLE_ENFORCE_EQ(
-            x_dims[rank - 1], label_dims[rank - 1],
-            "When Attr(soft_label) == true, the last dimension of "
-            "Input(X) and Input(Label) should be equal.");
-      }
-    } else {
-      PADDLE_ENFORCE_EQ(label_dims[rank - 1], 1,
-                        "When Attr(soft_label) == false, the last dimension of "
-                        "Input(Label) should be 1.");
-    }
-
-    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
-    PADDLE_ENFORCE_EQ(dy_dims[rank - 1], 1,
-                      "The last dimension of Input(Y@Grad) should be 1.");
+    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
     ctx->ShareLoD(VarNameWithXLoD(), framework::GradVarName("X"));
   }
 
@@ -231,7 +225,7 @@ class CrossEntropyGradientOp : public CrossEntropyGradientOpBase {
   using CrossEntropyGradientOpBase::CrossEntropyGradientOpBase;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
+    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, "Input(X) should be not null.");
     CrossEntropyGradientOpBase::InferShape(ctx);
   }
 };
@@ -260,11 +254,11 @@ class CrossEntropyOp2 : public CrossEntropyOpBase {
   void InferShape(framework::InferShapeContext* ctx) const override {
     CrossEntropyOpBase::InferShape(ctx);
 
-    PADDLE_ENFORCE(ctx->HasOutput("XShape"),
-                   "Output(XShape) should be not null.");
+    PADDLE_ENFORCE_EQ(ctx->HasOutput("XShape"), true,
+                      "Output(XShape) should be not null.");
 
-    PADDLE_ENFORCE(ctx->HasOutput("MatchX"),
-                   "Output(MatchX) should be not null.");
+    PADDLE_ENFORCE_EQ(ctx->HasOutput("MatchX"), true,
+                      "Output(MatchX) should be not null.");
     auto x_dims = ctx->GetInputDim("X");
     auto x_dims_vec = framework::vectorize(x_dims);
     x_dims_vec.push_back(0);
@@ -284,7 +278,8 @@ class CrossEntropyGradientOp2 : public CrossEntropyGradientOpBase {
  public:
   using CrossEntropyGradientOpBase::CrossEntropyGradientOpBase;
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("MatchX"), "Input(MatchX) must exist");
+    PADDLE_ENFORCE_EQ(ctx->HasInput("MatchX"), true,
+                      "Input(MatchX) must exist");
     CrossEntropyGradientOpBase::InferShape(ctx);
   }
 
diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h
index 309ba46c..667135c4 100644
--- a/paddle/fluid/operators/cross_entropy_op.h
+++ b/paddle/fluid/operators/cross_entropy_op.h
@@ -35,9 +35,20 @@ class CrossEntropyOpKernel : public framework::OpKernel<T> {
     y->mutable_data<T>(ctx.GetPlace());
 
     int rank = x->dims().size();
+    auto label_dims = labels->dims();
     Tensor x_2d = framework::ReshapeToMatrix(*x, rank - 1);
-    Tensor labels_2d = framework::ReshapeToMatrix(*labels, rank - 1);
-    Tensor y_2d = framework::ReshapeToMatrix(*y, rank - 1);
+    Tensor labels_2d, y_2d;
+    if (label_dims.size() < rank) {
+      labels_2d.ShareDataWith(*labels);
+      labels_2d.Resize({framework::product(label_dims), 1});
+
+      y_2d.ShareDataWith(*y);
+      y_2d.Resize({framework::product(y->dims()), 1});
+
+    } else {
+      labels_2d = framework::ReshapeToMatrix(*labels, rank - 1);
+      y_2d = framework::ReshapeToMatrix(*y, rank - 1);
+    }
 
     int axis_dim = x->dims()[rank - 1];
     math::CrossEntropyFunctor<DeviceContext, T>()(
@@ -155,8 +166,11 @@ struct HardLabelCrossEntropyForwardFunctor {
   HOSTDEVICE void operator()(int64_t idx) const {
     auto label = label_[idx];
     if (label != ignore_index_) {
-      PADDLE_ASSERT_MSG(label >= 0 && label < feature_size_,
-                        "The label is out of the range.", label);
+      PADDLE_ENFORCE(label >= 0 && label < feature_size_,
+                     "Variable value (label) of "
+                     "OP(fluid.layers.cross_entropy) expected >= 0 "
+                     "and < %ld, but got %ld. Please check label value.",
+                     feature_size_, label);
       auto match_x = x_[idx * feature_size_ + label];
       y_[idx] = -math::TolerableValue<T>()(real_log(match_x));
       match_x_[idx] = match_x;
diff --git a/paddle/fluid/operators/ctc_align_op.cc b/paddle/fluid/operators/ctc_align_op.cc
index e7c472f8..4abe9509 100644
--- a/paddle/fluid/operators/ctc_align_op.cc
+++ b/paddle/fluid/operators/ctc_align_op.cc
@@ -22,15 +22,18 @@ class CTCAlignOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Input"),
-                   "Input of CTCAlignOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Output"),
-                   "Output of CTCAlignOp should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true,
+                      "Input of CTCAlignOp should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->HasOutput("Output"), true,
+                      "Output of CTCAlignOp should not be null.");
 
     auto input_dims = ctx->GetInputDim("Input");
 
     // TODO(wanghaoshuang): it is tricky to set the wrong dimension here.
     ctx->SetOutputDim("Output", input_dims);
+    if (ctx->HasInput("InputLength")) {
+      ctx->SetOutputDim("OutputLength", {input_dims[0], 1});
+    }
   }
 
  protected:
@@ -45,9 +48,19 @@ class CTCAlignOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
     AddInput("Input",
-             "(LodTensor, default: LoDTensor<int>), Its shape is "
+             "2-D Tensor or LodTensor with shape "
              "[Lp, 1], where Lp is the sum of all input sequences' length.");
+    AddInput("InputLength",
+             "2-D Tensor with shape [batch_size, 1]; "
+             "when Input is a padded tensor, InputLength is the length of "
+             "every sequence in Input.")
+        .AsDispensable();
     AddOutput("Output", "(Tensor, default: Tensor<int>), The align result.");
+    AddOutput("OutputLength",
+              "2-D Tensor with shape [batch_size, 1]; "
+              "when Input is a padded tensor, OutputLength is the length of "
+              "every sequence in Output.")
+        .AsDispensable();
     AddAttr<int>("blank",
                  "(int, default: 0), the blank label set in Connectionist "
                  "Temporal Classification (CTC) op.")
@@ -56,6 +69,11 @@ class CTCAlignOpMaker : public framework::OpProtoAndCheckerMaker {
                   "(bool, default: true), whether to "
                   "merge repeated elements between two blanks. ")
         .SetDefault(true);
+    // add attr padding_value for tensor input
+    AddAttr<int>("padding_value",
+                 "(int, default: 0), the padding value "
+                 "used to pad the input tensor. ")
+        .SetDefault(0);
     AddComment(R"DOC(
 CTCAlign op is used to merge repeated elements between two blanks
 and then delete all blanks in sequence.
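Aside (not part of the patch): the per-sequence rule stated in the DOC comment above (merge repeated tokens, then drop blanks) can be sketched without any LoD or padding machinery. A minimal illustration in plain C++ with a hypothetical helper name:

#include <vector>

// Illustrative only: collapses repeats (when merge_repeated is true) and
// removes every `blank` token, as ctc_align does for one sequence.
std::vector<int> CtcAlign(const std::vector<int>& tokens, int blank,
                          bool merge_repeated) {
  std::vector<int> out;
  int prev = -1;
  for (int t : tokens) {
    if (t != blank && !(merge_repeated && t == prev)) out.push_back(t);
    prev = t;
  }
  return out;  // e.g. {0,1,2,2,0,4} with blank=0 becomes {1,2,4}
}

This matches the padded-mode example in the DOC below: the row [0, 1, 2, 2, 0, 4] with blank = 0 and merge_repeated = True yields [1, 2, 4] with OutputLength 3.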
@@ -75,7 +93,29 @@ Then: 6, 7] Output.dims = {8, 1} Output.LoD = [[0, 6, 8]] +or Given: + Input.data = [[0, 1, 2, 2, 0, 4], + [0, 4, 5, 0, 6, 0], + [0, 7, 7, 7, 0, 0]] + InputLength.data = [[6], + [5], + [4]], + Input.dims = {3, 6}, + Input.Lod = [] +And: + blank = 0 + merge_repeated = True + padding_value = 0 +Then: + Output.data = [[1, 2, 4, 0, 0, 0], + [4, 5, 6, 0, 0, 0], + [7, 0, 0, 0, 0, 0]], + OutputLength.data = [[3], + [3], + [1]], + Output.dims = {3, 6}, + Output.Lod = [] )DOC"); } }; diff --git a/paddle/fluid/operators/ctc_align_op.cu b/paddle/fluid/operators/ctc_align_op.cu index bbad74e9..44a7c16f 100644 --- a/paddle/fluid/operators/ctc_align_op.cu +++ b/paddle/fluid/operators/ctc_align_op.cu @@ -42,53 +42,94 @@ __global__ void MergeAndDelCudaKernel(const int64_t num_token, const T* tokens, } } +template +__global__ void PaddingMergeAndDelCudaKernel( + const int64_t num_token, const T* tokens, const T* tokens_length, + const int blank, const int merge_repeated, const int padding_value, + const int64_t batch_size, T* output, T* output_length) { + int ind = blockIdx.x * blockDim.x + threadIdx.x; + if (ind >= batch_size) return; + int output_idx = ind * num_token; + T prev_token = -1; + for (int i = ind * num_token; i < ind * num_token + tokens_length[ind]; i++) { + if ((unsigned)tokens[i] != blank && + !(merge_repeated && tokens[i] == prev_token)) { + output[output_idx] = tokens[i]; + ++output_idx; + } + prev_token = tokens[i]; + } + output_length[ind] = output_idx - ind * num_token; + for (int i = output_idx; i < ind * num_token + num_token; i++) { + output[i] = padding_value; + } +} + template class CTCAlignOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), "It must use CUDAPlace."); - const size_t level = 0; auto* input = ctx.Input("Input"); auto* output = ctx.Output("Output"); - auto input_lod = framework::ToAbsOffset(input->lod()); - - const T* tokens = input->data(); - const int64_t num_tokens = input->dims()[0]; - const size_t num_seq = input_lod[level].size() - 1; - const int blank = ctx.Attr("blank"); const int merge_repeated = static_cast(ctx.Attr("merge_repeated")); - - // prepare a lod to record lod information while merging elements - thrust::device_vector dev_out_lod0(input_lod[level].size()); - size_t* dev_out_lod0_ptr = thrust::raw_pointer_cast(dev_out_lod0.data()); - - // merge elements and delete blank - T* output_data = output->mutable_data({num_tokens, 1}, ctx.GetPlace()); - + const T* tokens = input->data(); auto stream = ctx.cuda_device_context().stream(); - MergeAndDelCudaKernel<<<1, 1, 0, stream>>>( - num_tokens, tokens, num_seq, - input_lod[level].CUDAMutableData(ctx.GetPlace()), blank, merge_repeated, - dev_out_lod0_ptr, output_data); - - // set output lod - std::vector host_out_lod0(dev_out_lod0.begin(), dev_out_lod0.end()); - framework::LoD out_lod; - out_lod.push_back(host_out_lod0); - output->set_lod(out_lod); - - // resize output dims - output->Resize({static_cast(host_out_lod0.back()), 1}); - - if (host_out_lod0.back() == 0) { - output->Resize({1, 1}); - output->mutable_data(ctx.GetPlace()); - math::SetConstant set_constant; - set_constant(ctx.template device_context(), - output, -1); + + // tensor input which has no lod + if (input->lod().empty()) { + const int padding_value = ctx.Attr("padding_value"); + auto input_dims = input->dims(); + T* output_data = output->mutable_data({input_dims[0], input_dims[1]}, + 
ctx.GetPlace()); + auto* input_length = ctx.Input("InputLength"); + const T* input_length_data = input_length->data(); + auto* output_length = ctx.Output("OutputLength"); + T* output_length_data = + output_length->mutable_data({input_dims[0], 1}, ctx.GetPlace()); + PaddingMergeAndDelCudaKernel< + T><<<32, (input_dims[0] + 32 - 1) / 32, 0, stream>>>( + input_dims[1], tokens, input_length_data, blank, merge_repeated, + padding_value, input_dims[0], output_data, output_length_data); + } else { + const size_t level = 0; + auto input_lod = framework::ToAbsOffset(input->lod()); + + const int64_t num_tokens = input->dims()[0]; + const size_t num_seq = input_lod[level].size() - 1; + + // prepare a lod to record lod information while merging elements + thrust::device_vector dev_out_lod0(input_lod[level].size()); + size_t* dev_out_lod0_ptr = thrust::raw_pointer_cast(dev_out_lod0.data()); + + // merge elements and delete blank + T* output_data = output->mutable_data({num_tokens, 1}, ctx.GetPlace()); + + MergeAndDelCudaKernel<<<1, 1, 0, stream>>>( + num_tokens, tokens, num_seq, + input_lod[level].CUDAMutableData(ctx.GetPlace()), blank, + merge_repeated, dev_out_lod0_ptr, output_data); + + // set output lod + std::vector host_out_lod0(dev_out_lod0.begin(), + dev_out_lod0.end()); + framework::LoD out_lod; + out_lod.push_back(host_out_lod0); + output->set_lod(out_lod); + + // resize output dims + output->Resize({static_cast(host_out_lod0.back()), 1}); + + if (host_out_lod0.back() == 0) { + output->Resize({1, 1}); + output->mutable_data(ctx.GetPlace()); + math::SetConstant set_constant; + set_constant(ctx.template device_context(), + output, -1); + } } } }; diff --git a/paddle/fluid/operators/ctc_align_op.h b/paddle/fluid/operators/ctc_align_op.h index 9c5c6f5a..ccf91471 100644 --- a/paddle/fluid/operators/ctc_align_op.h +++ b/paddle/fluid/operators/ctc_align_op.h @@ -31,50 +31,81 @@ class CTCAlignKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* input = ctx.Input("Input"); auto* output = ctx.Output("Output"); - const size_t level = 0; - auto input_lod = framework::ToAbsOffset(input->lod()); - - // check input dims and lod - auto input_dims = input->dims(); - PADDLE_ENFORCE_EQ(input_dims[0], - static_cast(input_lod[level].back()), - "The first dimension of Input(Input) should be equal to " - "the sum of all sequences' lengths."); - - const size_t num_sequences = input_lod[level].size() - 1; size_t blank = static_cast(ctx.Attr("blank")); bool merge_repeated = ctx.Attr("merge_repeated"); - - // merge repeated tokens and delete blank T* output_data = output->mutable_data(ctx.GetPlace()); - size_t output_idx = 0; - std::vector output_lod0(1, 0); + auto input_dims = input->dims(); const T* input_data = input->data(); - for (size_t seq_idx = 0; seq_idx < num_sequences; ++seq_idx) { - T prev_token = -1; - for (size_t i = input_lod[level][seq_idx]; - i < input_lod[level][seq_idx + 1]; ++i) { - if ((unsigned)input_data[i] != blank && - !(merge_repeated && input_data[i] == prev_token)) { - output_data[output_idx] = input_data[i]; - ++output_idx; + + // support tensor input, no lod information + if (input->lod().empty()) { + size_t padding_value = + static_cast(ctx.Attr("padding_value")); + auto* input_length = ctx.Input("InputLength"); + const T* input_length_data = input_length->data(); + + auto* output_length = ctx.Output("OutputLength"); + T* output_length_data = output_length->mutable_data(ctx.GetPlace()); + + for (size_t batch_id = 0; batch_id < 
(unsigned)input_dims[0]; + batch_id++) { + T prev_token = -1; + size_t output_idx = 0; + for (size_t i = 0; i < (unsigned)input_length_data[batch_id]; i++) { + size_t input_ind = batch_id * input_dims[1] + i; + if ((unsigned)input_data[input_ind] != blank && + !(merge_repeated && input_data[input_ind] == prev_token)) { + output_data[batch_id * input_dims[1] + output_idx] = + input_data[input_ind]; + ++output_idx; + } + prev_token = input_data[input_ind]; } - prev_token = input_data[i]; + output_length_data[batch_id] = output_idx; + for (size_t j = output_idx; j < (unsigned)input_dims[1]; j++) + output_data[batch_id * input_dims[1] + j] = padding_value; } - output_lod0.push_back(output_idx); - } + } else { + const size_t level = 0; + auto input_lod = framework::ToAbsOffset(input->lod()); + + // check input dims and lod + PADDLE_ENFORCE_EQ( + input_dims[0], static_cast(input_lod[level].back()), + "The first dimension of Input(Input) should be equal to " + "the sum of all sequences' lengths."); - // set output lod - framework::LoD output_lod; - output_lod.push_back(output_lod0); - output->set_lod(output_lod); - // resize output dims - output->Resize({static_cast(output_lod0.back()), 1}); - // for empty sequence - if (output_lod0.back() == 0) { - output->Resize({1, 1}); - output_data = output->mutable_data(ctx.GetPlace()); - output_data[0] = -1; + const size_t num_sequences = input_lod[level].size() - 1; + + // merge repeated tokens and delete blank + size_t output_idx = 0; + std::vector output_lod0(1, 0); + for (size_t seq_idx = 0; seq_idx < num_sequences; ++seq_idx) { + T prev_token = -1; + for (size_t i = input_lod[level][seq_idx]; + i < input_lod[level][seq_idx + 1]; ++i) { + if ((unsigned)input_data[i] != blank && + !(merge_repeated && input_data[i] == prev_token)) { + output_data[output_idx] = input_data[i]; + ++output_idx; + } + prev_token = input_data[i]; + } + output_lod0.push_back(output_idx); + } + + // set output lod + framework::LoD output_lod; + output_lod.push_back(output_lod0); + output->set_lod(output_lod); + // resize output dims + output->Resize({static_cast(output_lod0.back()), 1}); + // for empty sequence + if (output_lod0.back() == 0) { + output->Resize({1, 1}); + output_data = output->mutable_data(ctx.GetPlace()); + output_data[0] = -1; + } } } }; diff --git a/paddle/fluid/operators/data_norm_op.cc b/paddle/fluid/operators/data_norm_op.cc index a5c76db6..5dc83ac7 100644 --- a/paddle/fluid/operators/data_norm_op.cc +++ b/paddle/fluid/operators/data_norm_op.cc @@ -124,6 +124,9 @@ class DataNormOpMaker : public framework::OpProtoAndCheckerMaker { "'epsilon' should be between 0.0 and 0.001."); }); AddAttr("data_layout", "").SetDefault("NCHW"); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); AddInput("X", "The input tensor"); AddInput("BatchSize", "BatchSize is a 1-dimensional tensor of size C " @@ -224,7 +227,6 @@ class DataNormGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasInput("Scales"), ""); // check output - PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), ""); PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("BatchSize")), ""); PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("BatchSum")), ""); PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("BatchSquareSum")), @@ -237,7 +239,9 @@ class DataNormGradOp : public framework::OperatorWithKernel { (data_layout == DataLayout::kNCHW ? 
x_dims[1] : x_dims[x_dims.size() - 1]); - ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + } ctx->SetOutputDim(framework::GradVarName("BatchSize"), {C}); ctx->SetOutputDim(framework::GradVarName("BatchSum"), {C}); ctx->SetOutputDim(framework::GradVarName("BatchSquareSum"), {C}); @@ -304,7 +308,10 @@ class DataNormGradKernel : x_dims[x_dims.size() - 1]); // init output - auto *d_x = ctx.Output(framework::GradVarName("X")); + Tensor *d_x = nullptr; + if (ctx.HasOutput(framework::GradVarName("X"))) { + d_x = ctx.Output(framework::GradVarName("X")); + } auto *d_batch_size = ctx.Output(framework::GradVarName("BatchSize")); auto *d_batch_sum = ctx.Output(framework::GradVarName("BatchSum")); @@ -331,10 +338,12 @@ class DataNormGradKernel ConstEigenVectorArrayMap means_arr(means->data(), C); ConstEigenArrayMap x_arr(x->data(), C, N); ConstEigenArrayMap d_y_arr(d_y->data(), C, N); - EigenArrayMap d_x_arr(d_x->mutable_data(ctx.GetPlace()), C, N); - d_x_arr.setZero(); - for (int nc = 0; nc < N; ++nc) { - d_x_arr.col(nc) = d_y_arr.col(nc) * scales_arr; + if (d_x != nullptr) { + EigenArrayMap d_x_arr(d_x->mutable_data(ctx.GetPlace()), C, N); + d_x_arr.setZero(); + for (int nc = 0; nc < N; ++nc) { + d_x_arr.col(nc) = d_y_arr.col(nc) * scales_arr; + } } // calculate data sum and squre sum diff --git a/paddle/fluid/operators/deformable_conv_filter.cu.h b/paddle/fluid/operators/deformable_conv_filter.cu.h new file mode 100644 index 00000000..f466d180 --- /dev/null +++ b/paddle/fluid/operators/deformable_conv_filter.cu.h @@ -0,0 +1,37 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Part of the following code in this file refs to +// https://github.com/msracver/Deformable-ConvNets/blob/master/faster_rcnn/operator_cxx/deformable_convolution.cu +// +// Copyright (c) 2017 Microsoft +// Licensed under The Apache-2.0 License [see LICENSE for details] +// \file deformable_psroi_pooling.cu +// \brief +// \author Yi Li, Guodong Zhang, Jifeng Dai + +#pragma once +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" + +template +__global__ void FilterGradAddupCUDAKernel(const int nthreads, const int n, + const int height, const int width, + const T* dweight_3d, T* filter_grad) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (size_t i = index; i < nthreads; i += offset) { + filter_grad[i] = filter_grad[i] + dweight_3d[i]; + } +} diff --git a/paddle/fluid/operators/deformable_conv_func.h b/paddle/fluid/operators/deformable_conv_func.h new file mode 100644 index 00000000..ba1c5044 --- /dev/null +++ b/paddle/fluid/operators/deformable_conv_func.h @@ -0,0 +1,149 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Part of the following code in this file refs to +// https://github.com/msracver/Deformable-ConvNets/blob/master/faster_rcnn/operator_cxx/deformable_convolution.cu +// +// Copyright (c) 2017 Microsoft +// Licensed under The Apache-2.0 License [see LICENSE for details] +// \file deformable_psroi_pooling.cu +// \brief +// \author Yi Li, Guodong Zhang, Jifeng Dai + +#pragma once +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/hostdevice.h" + +template +HOSTDEVICE T DmcnGetGradientWeight(T argmax_h, T argmax_w, const int h, + const int w, const int height, + const int width) { + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || + argmax_w >= width) { + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + T weight = 0; + + weight = (h == argmax_h_low && w == argmax_w_low) + ? (h + 1 - argmax_h) * (w + 1 - argmax_w) + : weight; + weight = (h == argmax_h_low && w == argmax_w_high) + ? (h + 1 - argmax_h) * (argmax_w + 1 - w) + : weight; + weight = (h == argmax_h_high && w == argmax_w_low) + ? (argmax_h + 1 - h) * (w + 1 - argmax_w) + : weight; + weight = (h == argmax_h_high && w == argmax_w_high) + ? (argmax_h + 1 - h) * (argmax_w + 1 - w) + : weight; + + return weight; +} + +template +HOSTDEVICE T DmcnGetCoordinateWeight(T argmax_h, T argmax_w, const int height, + const int width, const T* im_data, + const int data_width, const int bp_dir) { + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || + argmax_w >= width) { + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + T weight = 0; + + if (bp_dir == 0) { + weight += (argmax_h_low >= 0 && argmax_w_low >= 0) + ? -1 * (argmax_w_low + 1 - argmax_w) * + im_data[argmax_h_low * data_width + argmax_w_low] + : 0; + + weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1) + ? -1 * (argmax_w - argmax_w_low) * + im_data[argmax_h_low * data_width + argmax_w_high] + : 0; + + weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0) + ? (argmax_w_low + 1 - argmax_w) * + im_data[argmax_h_high * data_width + argmax_w_low] + : 0; + weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + ? (argmax_w - argmax_w_low) * + im_data[argmax_h_high * data_width + argmax_w_high] + : 0; + } else if (bp_dir == 1) { + weight += (argmax_h_low >= 0 && argmax_w_low >= 0) + ? -1 * (argmax_h_low + 1 - argmax_h) * + im_data[argmax_h_low * data_width + argmax_w_low] + : 0; + weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1) + ? (argmax_h_low + 1 - argmax_h) * + im_data[argmax_h_low * data_width + argmax_w_high] + : 0; + weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0) + ? 
-1 * (argmax_h - argmax_h_low) * + im_data[argmax_h_high * data_width + argmax_w_low] + : 0; + weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + ? (argmax_h - argmax_h_low) * + im_data[argmax_h_high * data_width + argmax_w_high] + : 0; + } + + return weight; +} + +template +HOSTDEVICE T DmcnIm2colBilinear(const T* bottom_data, const int data_width, + const int height, const int width, T h, T w) { + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + T lh = h - h_low; + T lw = w - w_low; + T hh = 1 - lh; + T hw = 1 - lw; + + T v1 = + (h_low >= 0 && w_low >= 0) ? bottom_data[h_low * data_width + w_low] : 0; + T v2 = (h_low >= 0 && w_high <= width - 1) + ? bottom_data[h_low * data_width + w_high] + : 0; + T v3 = (h_high <= height - 1 && w_low >= 0) + ? bottom_data[h_high * data_width + w_low] + : 0; + T v4 = (h_high <= height - 1 && w_high <= width - 1) + ? bottom_data[h_high * data_width + w_high] + : 0; + + T w1 = hh * hw; + T w2 = hh * lw; + T w3 = lh * hw; + T w4 = lh * lw; + + return w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4; +} diff --git a/paddle/fluid/operators/deformable_conv_op.cc b/paddle/fluid/operators/deformable_conv_op.cc index 92a93dc7..01cbec56 100644 --- a/paddle/fluid/operators/deformable_conv_op.cc +++ b/paddle/fluid/operators/deformable_conv_op.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/operators/deformable_conv_op.h" +#include #include "paddle/fluid/operators/conv_op.h" namespace paddle { @@ -197,7 +199,6 @@ class DeformableConvOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(mask_dims[1] / (filter_dims[2] * filter_dims[3]), deformable_groups, "mask filter must divide deformable group size."); - ctx->SetOutputDim("Output", framework::make_ddim(output_shape)); } @@ -274,5 +275,10 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(deformable_conv, ops::DeformableConvOp, ops::DeformableConvOpMaker, ops::DeformableConvGradOpDescMaker); - REGISTER_OPERATOR(deformable_conv_grad, ops::DeformableConvGradOp); + +REGISTER_OP_CPU_KERNEL(deformable_conv, ops::DeformableConvCPUKernel, + ops::DeformableConvCPUKernel); +REGISTER_OP_CPU_KERNEL(deformable_conv_grad, + ops::DeformableConvGradCPUKernel, + ops::DeformableConvGradCPUKernel); diff --git a/paddle/fluid/operators/deformable_conv_op.cu b/paddle/fluid/operators/deformable_conv_op.cu index afcd418f..0a771627 100644 --- a/paddle/fluid/operators/deformable_conv_op.cu +++ b/paddle/fluid/operators/deformable_conv_op.cu @@ -24,6 +24,7 @@ #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/deformable_conv_op.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/cuda_primitives.h" @@ -200,6 +201,36 @@ __device__ T DmcnGetCoordinateWeight(T argmax_h, T argmax_w, const int height, return weight; } +template +__device__ T DmcnIm2colBilinear(const T* bottom_data, const int data_width, + const int height, const int width, T h, T w) { + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + T lh = h - h_low; + T lw = w - w_low; + T hh = 1 - lh, hw = 1 - lw; + + T v1 = 0; + if (h_low >= 0 && w_low >= 0) v1 = bottom_data[h_low * data_width + w_low]; + T v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = bottom_data[h_low * data_width + w_high]; + T v3 = 0; + if (h_high <= 
height - 1 && w_low >= 0) + v3 = bottom_data[h_high * data_width + w_low]; + T v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = bottom_data[h_high * data_width + w_high]; + + T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + template __global__ void ModulatedDeformableCol2imCoordGpuKernel( const int nthreads, const T* data_col, const T* data_im, @@ -315,36 +346,6 @@ inline void ModulatedDeformableCol2imCoord( deformable_groups, col_shape[2], col_shape[3], grad_offset, grad_mask); } -template -__device__ T DmcnIm2colBilinear(const T* bottom_data, const int data_width, - const int height, const int width, T h, T w) { - int h_low = floor(h); - int w_low = floor(w); - int h_high = h_low + 1; - int w_high = w_low + 1; - - T lh = h - h_low; - T lw = w - w_low; - T hh = 1 - lh, hw = 1 - lw; - - T v1 = 0; - if (h_low >= 0 && w_low >= 0) v1 = bottom_data[h_low * data_width + w_low]; - T v2 = 0; - if (h_low >= 0 && w_high <= width - 1) - v2 = bottom_data[h_low * data_width + w_high]; - T v3 = 0; - if (h_high <= height - 1 && w_low >= 0) - v3 = bottom_data[h_high * data_width + w_low]; - T v4 = 0; - if (h_high <= height - 1 && w_high <= width - 1) - v4 = bottom_data[h_high * data_width + w_high]; - - T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; - - T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); - return val; -} - template __global__ void ModulatedDeformableIm2colGpuKernel( const int nthreads, const T* data_im, const T* data_offset, diff --git a/paddle/fluid/operators/deformable_conv_op.h b/paddle/fluid/operators/deformable_conv_op.h new file mode 100644 index 00000000..33a97bf4 --- /dev/null +++ b/paddle/fluid/operators/deformable_conv_op.h @@ -0,0 +1,613 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// Part of the following code in this file refs to +// https://github.com/msracver/Deformable-ConvNets/blob/master/faster_rcnn/operator_cxx/deformable_convolution.cu +// +// Copyright (c) 2017 Microsoft +// Licensed under The Apache-2.0 License [see LICENSE for details] +// \file deformable_psroi_pooling.cu +// \brief +// \author Yi Li, Guodong Zhang, Jifeng Dai + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/deformable_conv_func.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using CPUDeviceContext = platform::CPUDeviceContext; + +template +void ModulatedDeformableCol2imCPUKernel( + const int num_kernels, const T* data_col, const T* data_offset, + const T* data_mask, const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int dilation_h, + const int dilation_w, const int channel_per_deformable_group, + const int batch_size, const int deformable_group, const int height_col, + const int width_col, T* grad_im) { + for (size_t thread = 0; thread < num_kernels; thread++) { + const int j = (thread / width_col / height_col / batch_size) % kernel_w; + const int i = + (thread / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = + thread / width_col / height_col / batch_size / kernel_w / kernel_h; + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = thread % width_col; + int h_out = (thread / width_col) % height_col; + int b = (thread / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const T* data_offset_ptr = data_offset + + (b * deformable_group + deformable_group_index) * + 2 * kernel_h * kernel_w * height_col * + width_col; + const T* data_mask_ptr = data_mask + + (b * deformable_group + deformable_group_index) * + kernel_h * kernel_w * height_col * width_col; + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const int data_mask_hw_ptr = + ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + const T mask = data_mask_ptr[data_mask_hw_ptr]; + const T cur_inv_h_data = h_in + i * dilation_h + offset_h; + const T cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const T cur_top_grad = data_col[thread] * mask; + const int cur_h = static_cast(cur_inv_h_data); + const int cur_w = static_cast(cur_inv_w_data); + for (int dy = -2; dy <= 2; dy++) { + for (int dx = -2; dx <= 2; dx++) { + if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 && + cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) { + int cur_bottom_grad_pos = + ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + T weight = + DmcnGetGradientWeight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, + cur_w + dx, height, width); + + *(grad_im + cur_bottom_grad_pos) = + *(grad_im + cur_bottom_grad_pos) + weight * cur_top_grad; + } + } + } + } +} + +template +static inline void ModulatedDeformableCol2imCPU( + const 
platform::CPUDeviceContext& ctx, const T* data_col, + const T* data_offset, const T* data_mask, + const std::vector im_shape, const std::vector col_shape, + const std::vector kernel_shape, const std::vector pad, + const std::vector stride, const std::vector dilation, + const int deformable_group, T* grad_im) { + int channel_per_deformable_group = im_shape[0] / deformable_group; + int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3]; + + ModulatedDeformableCol2imCPUKernel( + num_kernels, data_col, data_offset, data_mask, im_shape[0], im_shape[1], + im_shape[2], kernel_shape[2], kernel_shape[3], pad[0], pad[1], stride[0], + stride[1], dilation[0], dilation[1], channel_per_deformable_group, + col_shape[1], deformable_group, col_shape[2], col_shape[3], grad_im); +} + +template +void ModulatedDeformableCol2imCoordCPUKernel( + const int num_kernels, const T* data_col, const T* data_im, + const T* data_offset, const T* data_mask, const int channels, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, const int batch_size, + const int offset_channels, const int deformable_group, const int height_col, + const int width_col, T* grad_offset, T* grad_mask) { + for (size_t i = 0; i < num_kernels; i++) { + T val = 0, mval = 0; + const int w = i % width_col; + const int h = (i / width_col) % height_col; + const int c = (i / width_col / height_col) % offset_channels; + const int b = (i / width_col / height_col) / offset_channels; + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const T* data_col_ptr = data_col + + deformable_group_index * + channel_per_deformable_group * batch_size * + width_col * height_col; + const T* data_im_ptr = data_im + + (b * deformable_group + deformable_group_index) * + channel_per_deformable_group / kernel_h / + kernel_w * height * width; + const T* data_offset_ptr = data_offset + + (b * deformable_group + deformable_group_index) * + 2 * kernel_h * kernel_w * height_col * + width_col; + const T* data_mask_ptr = data_mask + + (b * deformable_group + deformable_group_index) * + kernel_h * kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = offset_c / 2; col_c < channel_per_deformable_group; + col_c += col_step) { + const int col_pos = + (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = + (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = + (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = + (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + + w_out); + const int data_mask_hw_ptr = + (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + const T mask = data_mask_ptr[data_mask_hw_ptr]; + T inv_h = h_in + i * dilation_h + offset_h; + T inv_w = w_in + j * dilation_w + offset_w; + if 
(inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) { + inv_h = inv_w = -2; + } else { + mval += data_col_ptr[col_pos] * + DmcnIm2colBilinear(data_im_ptr + cnt * height * width, width, + height, width, inv_h, inv_w); + } + const T weight = DmcnGetCoordinateWeight( + inv_h, inv_w, height, width, data_im_ptr + cnt * height * width, + width, bp_dir); + val += weight * data_col_ptr[col_pos] * mask; + cnt += 1; + } + grad_offset[i] = val; + if (offset_c % 2 == 0) + grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * + kernel_w + + offset_c / 2) * + height_col + + h) * + width_col + + w] = mval; + } +} + +template +static inline void ModulatedDeformableCol2imCoordCPU( + const platform::CPUDeviceContext& ctx, const T* data_col, const T* data_im, + const T* data_offset, const T* data_mask, + const std::vector im_shape, const std::vector col_shape, + const std::vector kernel_shape, const std::vector paddings, + const std::vector strides, const std::vector dilations, + const int deformable_groups, T* grad_offset, T* grad_mask) { + int num_kernels = 2 * kernel_shape[2] * kernel_shape[3] * col_shape[1] * + col_shape[2] * col_shape[3] * deformable_groups; + int channel_per_deformable_group = col_shape[0] / deformable_groups; + + ModulatedDeformableCol2imCoordCPUKernel( + num_kernels, data_col, data_im, data_offset, data_mask, im_shape[0], + im_shape[1], im_shape[2], kernel_shape[2], kernel_shape[3], paddings[0], + paddings[1], strides[0], strides[1], dilations[0], dilations[1], + channel_per_deformable_group, col_shape[1], + 2 * kernel_shape[2] * kernel_shape[3] * deformable_groups, + deformable_groups, col_shape[2], col_shape[3], grad_offset, grad_mask); +} + +template +void ModulatedDeformableIm2colCPUKernel( + const int num_kernels, const T* data_im, const T* data_offset, + const T* data_mask, const int height, const int width, const int kernel_h, + const int kernel_w, const int pad_h, const int pad_w, const int stride_h, + const int stride_w, const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, const int batch_size, + const int num_channels, const int deformable_group, const int height_col, + const int width_col, T* data_col) { + for (size_t i = 0; i < num_kernels; i++) { + const int w_col = i % width_col; + const int h_col = (i / width_col) % height_col; + const int b_col = (i / width_col) / height_col % batch_size; + const int c_im = (i / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + + T* data_col_ptr = + data_col + + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + const T* data_im_ptr = + data_im + (b_col * num_channels + c_im) * height * width; + const T* data_offset_ptr = + data_offset + + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * + kernel_w * height_col * width_col; + const T* data_mask_ptr = + data_mask + + (b_col * deformable_group + deformable_group_index) * kernel_h * + kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + + w_col; + const int data_mask_hw_ptr = + ((i * kernel_w + j) * height_col + 
h_col) * width_col + w_col; + + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + const T mask = data_mask_ptr[data_mask_hw_ptr]; + T val = static_cast(0); + const T h_im = h_in + i * dilation_h + offset_h; + const T w_im = w_in + j * dilation_w + offset_w; + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { + val = + DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val * mask; + data_col_ptr += batch_size * height_col * width_col; + } + } + } +} + +template +static inline void ModulatedDeformableIm2colCPU( + const platform::CPUDeviceContext& ctx, const T* data_im, + const T* data_offset, const T* data_mask, + const std::vector im_shape, const std::vector col_shape, + const std::vector filter_shape, const std::vector paddings, + const std::vector strides, const std::vector dilations, + const int deformable_groups, T* data_col) { + int channel_per_deformable_group = im_shape[0] / deformable_groups; + int num_kernels = im_shape[0] * col_shape[1] * col_shape[2] * col_shape[3]; + + // get outputs of im2col with offset by bilinear interpolation + ModulatedDeformableIm2colCPUKernel( + num_kernels, data_im, data_offset, data_mask, im_shape[1], im_shape[2], + filter_shape[2], filter_shape[3], paddings[0], paddings[1], strides[0], + strides[1], dilations[0], dilations[1], channel_per_deformable_group, + col_shape[1], im_shape[0], deformable_groups, col_shape[2], col_shape[3], + data_col); +} + +template +void FilterGradAddupCPUKernel(const int nthreads, const int n, const int height, + const int width, const T* dweight_3d, + T* filter_grad) { + for (size_t i = 0; i < nthreads; i++) { + filter_grad[i] = filter_grad[i] + dweight_3d[i]; + } +} + +template +class DeformableConvCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* offset = ctx.Input("Offset"); + auto* mask = ctx.Input("Mask"); + Tensor filter = *ctx.Input("Filter"); + Tensor* output = ctx.Output("Output"); + output->mutable_data(ctx.GetPlace()); + + auto& dev_ctx = ctx.template device_context(); + + const int groups = ctx.Attr("groups"); + const int deformable_groups = ctx.Attr("deformable_groups"); + const int im2col_step = ctx.Attr("im2col_step"); + const std::vector strides = ctx.Attr>("strides"); + const std::vector paddings = ctx.Attr>("paddings"); + const std::vector dilations = ctx.Attr>("dilations"); + + const int batch_size = static_cast(input->dims()[0]); + + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + std::vector output_shape_vec(framework::vectorize(output->dims())); + + // col_shape_vec: {c_i * k_h * k_w, im2col_step, o_h, o_w} + std::vector col_buffer_shape_vec(filter_shape_vec.size()); + col_buffer_shape_vec[0] = + input->dims()[1] * filter.dims()[2] * filter.dims()[3]; + col_buffer_shape_vec[1] = im2col_step; + for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) { + col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_buffer_shape_vec)); + std::vector output_buffer_shape_vec(1); + output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] * + output_shape_vec[2] * output_shape_vec[3]; + framework::DDim output_shape(framework::make_ddim(output_buffer_shape_vec)); + Tensor col_buffer; + Tensor output_buffer; + col_buffer = ctx.AllocateTmpTensor(col_shape, dev_ctx); + output_buffer = + 
ctx.AllocateTmpTensor(output_shape, dev_ctx); + int64_t M = output_shape_vec[1] / groups; + int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3]; + int64_t K = + input->dims()[1] * filter_shape_vec[2] * filter_shape_vec[3] / groups; + + Tensor weight_3d; + weight_3d.ShareDataWith(filter).Resize( + framework::make_ddim({groups, M, K})); + Tensor col_buffer_3d; + col_buffer_3d.ShareDataWith(col_buffer) + .Resize(framework::make_ddim({groups, K, N})); + Tensor output_4d; + output_4d.ShareDataWith(output_buffer) + .Resize(framework::make_ddim({batch_size / im2col_step, groups, M, N})); + output_4d.mutable_data(ctx.GetPlace()); + framework::DDim input_shape = + framework::slice_ddim(input->dims(), 1, input->dims().size()); + std::vector input_shape_vec = framework::vectorize(input_shape); + int input_dim = input->numel() / input->dims()[0]; + int input_offset_dim = offset->numel() / offset->dims()[0]; + int input_mask_dim = mask->numel() / mask->dims()[0]; + auto blas = math::GetBlas(dev_ctx); + const T* input_ptr = input->data(); + const T* offset_ptr = offset->data(); + const T* mask_ptr = mask->data(); + col_buffer.mutable_data(ctx.GetPlace()); + T* col_buffer_ptr = col_buffer.data(); + for (int i = 0; i < batch_size / im2col_step; ++i) { + ModulatedDeformableIm2colCPU( + dev_ctx, input_ptr + i * im2col_step * input_dim, + offset_ptr + i * im2col_step * input_offset_dim, + mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec, + col_buffer_shape_vec, filter_shape_vec, paddings, strides, dilations, + deformable_groups, col_buffer_ptr); + Tensor output_3d = output_4d.Slice(i, i + 1).Resize( + framework::slice_ddim(output_4d.dims(), 1, output_4d.dims().size())); + // get the product of pixel and weight + for (int g = 0; g < groups; ++g) { + Tensor weight_3d_slice = + weight_3d.Slice(g, g + 1).Resize(framework::slice_ddim( + weight_3d.dims(), 1, weight_3d.dims().size())); + Tensor col_buffer_3d_slice = + col_buffer_3d.Slice(g, g + 1).Resize(framework::slice_ddim( + col_buffer_3d.dims(), 1, col_buffer_3d.dims().size())); + Tensor output_3d_slice = + output_3d.Slice(g, g + 1).Resize(framework::slice_ddim( + output_3d.dims(), 1, output_3d.dims().size())); + blas.MatMul(weight_3d_slice, false, col_buffer_3d_slice, false, T(1.0), + &output_3d_slice, T(0.0)); + } + } + output->ShareDataWith(output_buffer) + .Resize(framework::make_ddim(output_shape_vec)); + } +}; + +template +class DeformableConvGradCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor* output_grad = + ctx.Input(framework::GradVarName("Output")); + Tensor* input_grad = ctx.Output(framework::GradVarName("Input")); + Tensor* filter_grad = ctx.Output(framework::GradVarName("Filter")); + Tensor* offset_grad = ctx.Output(framework::GradVarName("Offset")); + Tensor* mask_grad = ctx.Output(framework::GradVarName("Mask")); + + const Tensor* input = ctx.Input("Input"); + Tensor offset = *ctx.Input("Offset"); + Tensor mask = *ctx.Input("Mask"); + Tensor filter = *ctx.Input("Filter"); + if (!input_grad && !filter_grad && !offset_grad && !mask_grad) return; + + int groups = ctx.Attr("groups"); + int deformable_groups = ctx.Attr("deformable_groups"); + int im2col_step = ctx.Attr("im2col_step"); + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + + auto& dev_ctx = ctx.template device_context(); + const int batch_size = static_cast(input->dims()[0]); + 
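+    // Editorial sketch of the backward dataflow (inferred from the code
+    // below, not stated in the original sources). For every im2col_step
+    // slice of the batch the gradient is computed in three stages:
+    //   1. dcol = W^T * dout                       (per-group MatMul)
+    //   2. dcol --col2im--> dinput, --col2im_coord--> doffset / dmask
+    //   3. col = im2col(input); dW += dout * col^T (per-group MatMul)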
+ framework::DDim input_shape = + framework::slice_ddim(input->dims(), 1, input->dims().size()); + std::vector input_shape_vec = framework::vectorize(input_shape); + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + std::vector output_shape_vec( + framework::vectorize(output_grad->dims())); + + std::vector col_buffer_shape_vec(filter_shape_vec.size()); + col_buffer_shape_vec[0] = + input->dims()[1] * filter.dims()[2] * filter.dims()[3]; + col_buffer_shape_vec[1] = im2col_step; + for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) { + col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_buffer_shape_vec)); + std::vector output_buffer_shape_vec(1); + output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] * + output_shape_vec[2] * output_shape_vec[3]; + framework::DDim output_shape(framework::make_ddim(output_buffer_shape_vec)); + Tensor col_buffer; + Tensor output_buffer; + col_buffer = ctx.AllocateTmpTensor(col_shape, dev_ctx); + output_buffer = + ctx.AllocateTmpTensor(output_shape, dev_ctx); + + output_buffer.ShareDataWith(*output_grad); + + int64_t M = + input_shape_vec[0] / groups * filter_shape_vec[2] * filter_shape_vec[3]; + int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3]; + int64_t K = output_shape_vec[1] / groups; + + framework::DDim weight_3d_shape = {groups, K, M}; + framework::DDim out_grad_4d_shape = {batch_size / im2col_step, groups, K, + N}; + framework::DDim col_buffer_3d_shape = {groups, M, N}; + framework::DDim filter_grad_shape = {groups, K, M}; + + Tensor weight_3d; + weight_3d.ShareDataWith(filter).Resize(weight_3d_shape); + Tensor out_grad_4d; + out_grad_4d.ShareDataWith(output_buffer).Resize(out_grad_4d_shape); + Tensor col_buffer_3d; + col_buffer_3d.ShareDataWith(col_buffer).Resize(col_buffer_3d_shape); + + math::SetConstant set_zero; + auto blas = math::GetBlas(dev_ctx); + + col_buffer.mutable_data(ctx.GetPlace()); + col_buffer_3d.mutable_data(ctx.GetPlace()); + out_grad_4d.mutable_data(ctx.GetPlace()); + + int input_dim = input->numel() / input->dims()[0]; + int input_offset_dim = offset.numel() / offset.dims()[0]; + int input_mask_dim = mask.numel() / mask.dims()[0]; + + if (filter_grad) { + filter_grad->mutable_data(ctx.GetPlace()); + filter_grad->Resize(filter_grad_shape); + set_zero(dev_ctx, filter_grad, static_cast(0)); + } + + if (input_grad) { + input_grad->mutable_data(ctx.GetPlace()); + set_zero(dev_ctx, input_grad, static_cast(0)); + } + + if (offset_grad && mask_grad) { + offset_grad->mutable_data(ctx.GetPlace()); + mask_grad->mutable_data(ctx.GetPlace()); + set_zero(dev_ctx, offset_grad, static_cast(0)); + set_zero(dev_ctx, mask_grad, static_cast(0)); + } + + for (int i = 0; i < batch_size / im2col_step; ++i) { + Tensor out_grad_3d = + out_grad_4d.Slice(i, i + 1).Resize(framework::slice_ddim( + out_grad_4d.dims(), 1, out_grad_4d.dims().size())); + for (int g = 0; g < groups; ++g) { + Tensor weight_3d_slice = + weight_3d.Slice(g, g + 1).Resize(framework::slice_ddim( + weight_3d.dims(), 1, weight_3d.dims().size())); + Tensor out_grad_3d_slice = + out_grad_3d.Slice(g, g + 1).Resize(framework::slice_ddim( + out_grad_3d.dims(), 1, out_grad_3d.dims().size())); + Tensor col_buffer_3d_slice = + col_buffer_3d.Slice(g, g + 1).Resize(framework::slice_ddim( + col_buffer_3d.dims(), 1, col_buffer_3d.dims().size())); + + blas.MatMul(weight_3d_slice, true, out_grad_3d_slice, false, T(1.0), + &col_buffer_3d_slice, T(0.0)); + } + col_buffer.Resize(col_shape); + + 
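+      // col_buffer now holds dL/d(col), the gradient w.r.t. the im2col
+      // output of this im2col_step slice; the col2im calls below scatter
+      // it back into the input/offset/mask gradients.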
+      T* col_buffer_ptr = col_buffer.data<T>();
+      const T* input_ptr = input->data<T>();
+      const T* offset_ptr = offset.data<T>();
+      const T* mask_ptr = mask.data<T>();
+
+      if (mask_grad && offset_grad) {
+        T* offset_grad_ptr = offset_grad->data<T>();
+        T* mask_grad_ptr = mask_grad->data<T>();
+        // get grad of offset and mask
+        ModulatedDeformableCol2imCoordCPU(
+            ctx.template device_context<platform::CPUDeviceContext>(),
+            col_buffer_ptr, input_ptr + i * im2col_step * input_dim,
+            offset_ptr + i * im2col_step * input_offset_dim,
+            mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec,
+            col_buffer_shape_vec, filter_shape_vec, paddings, strides,
+            dilations, deformable_groups,
+            offset_grad_ptr + i * im2col_step * input_offset_dim,
+            mask_grad_ptr + i * im2col_step * input_mask_dim);
+      }
+      if (input_grad) {
+        T* input_grad_ptr = input_grad->data<T>();
+        // get grad of input
+        ModulatedDeformableCol2imCPU(
+            ctx.template device_context<platform::CPUDeviceContext>(),
+            col_buffer_ptr, offset_ptr + i * im2col_step * input_offset_dim,
+            mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec,
+            col_buffer_shape_vec, filter_shape_vec, paddings, strides,
+            dilations, deformable_groups,
+            input_grad_ptr + i * im2col_step * input_dim);
+        input_grad->Resize(input->dims());
+      }
+
+      ModulatedDeformableIm2colCPU(
+          ctx.template device_context<platform::CPUDeviceContext>(),
+          input_ptr + i * im2col_step * input_dim,
+          offset_ptr + i * im2col_step * input_offset_dim,
+          mask_ptr + i * im2col_step * input_mask_dim, input_shape_vec,
+          col_buffer_shape_vec, filter_shape_vec, paddings, strides, dilations,
+          deformable_groups, col_buffer_ptr);
+
+      col_buffer_3d.Resize(col_buffer_3d_shape);
+
+      if (filter_grad) {
+        Tensor dweight_3d;
+        dweight_3d = ctx.AllocateTmpTensor<T, platform::CPUDeviceContext>(
+            filter_grad_shape, dev_ctx);
+        for (int g = 0; g < groups; ++g) {
+          Tensor out_grad_3d_slice =
+              out_grad_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
+                  out_grad_3d.dims(), 1, out_grad_3d.dims().size()));
+          Tensor col_buffer_3d_slice =
+              col_buffer_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
+                  col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
+          Tensor dweight_3d_slice =
+              dweight_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
+                  dweight_3d.dims(), 1, dweight_3d.dims().size()));
+
+          blas.MatMul(out_grad_3d_slice, false, col_buffer_3d_slice, true,
+                      T(1.0), &dweight_3d_slice, T(0.0));
+        }
+        // update grad of weights
+        FilterGradAddupCPUKernel(dweight_3d.numel(), groups, K, M,
+                                 dweight_3d.data<T>(),
+                                 filter_grad->data<T>());
+      }
+    }
+    if (filter_grad) {
+      filter_grad->Resize(filter.dims());
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/deformable_conv_v1_op.cc b/paddle/fluid/operators/deformable_conv_v1_op.cc
new file mode 100644
index 00000000..6129e296
--- /dev/null
+++ b/paddle/fluid/operators/deformable_conv_v1_op.cc
@@ -0,0 +1,272 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
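+//
+// NOTE (editorial, not from the original authors): deformable_conv_v1 below
+// is, as far as the code shows, the unmodulated variant of deformable
+// convolution: unlike the deformable_conv kernels above, it takes no Mask
+// input, so bilinearly sampled values are not re-weighted by a learned mask.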
+
+#include "paddle/fluid/operators/deformable_conv_v1_op.h"
+#include <memory>
+#include "paddle/fluid/operators/conv_op.h"
+
+namespace paddle {
+namespace operators {
+class DeformableConvV1OpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Input",
+             "(Tensor) The input of deformable conv op. "
+             "The shape of input is "
+             "[N, channel_in, H, W]");
+    AddInput("Offset",
+             "(Tensor) The input offset. "
+             "The shape of the offset is "
+             "[N, deformable_groups * kernel_w * kernel_h * 2, H, W]");
+    AddInput("Filter",
+             "(Tensor) The input filter. "
+             "The shape of the weight is "
+             "[num_filters, channel_in, kernel_h, kernel_w].");
+    AddOutput("Output",
+              "(Tensor) The output. "
+              "The shape of the output tensor is "
+              "[N, num_filters, out_height, out_width].");
+    AddAttr<std::vector<int>>("strides",
+                              "(vector<int> default:{1, 1}), the "
+                              "strides(h_stride, w_stride) of "
+                              "convolution operator.")
+        .SetDefault({1, 1});
+    AddAttr<std::vector<int>>("paddings",
+                              "(vector<int> default:{0, 0}), the "
+                              "paddings(h_pad, w_pad) of "
+                              "convolution operator.")
+        .SetDefault({0, 0});
+    AddAttr<std::vector<int>>("dilations",
+                              "(vector<int> default:{1, 1}), the "
+                              "dilations(h_dilation, w_dilation) of "
+                              "convolution operator.")
+        .SetDefault({1, 1});
+    AddAttr<int>(
+        "groups",
+        "(int default:1), the number of groups of the convolution operator. "
+        "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: "
+        "when group=2, the first half of the filters is only connected to the "
+        "first half of the input channels, while the second half of the "
+        "filters is only connected to the second half of the input channels.")
+        .SetDefault(1);
+    AddAttr<int>("deformable_groups",
+                 "(int default:1), the number of the deformable groups.")
+        .SetDefault(1);
+    AddAttr<int>("im2col_step",
+                 "the maximum number of images processed in one "
+                 "im2col computation")
+        .SetDefault(64);
+    AddComment(R"DOC(
+**Deformable Convolution v1 Operator**
+
+Deformable Convolution is a variant of convolution in which each sampling
+location of the kernel is shifted by a learned offset.
+
+1. Get the offset of each sampling position from extra convolution layers,
+   whose number of output channels is twice the kernel size
+   (2 * kernel_h * kernel_w per deformable group).
+
+2. Add the offset to the regular sampling position to get a new, fractional
+   location, whose value is computed by bilinear interpolation over the four
+   nearest pixels.
+
+3. Multiply the sampled values with the filter weights and accumulate them to
+   get the result.
+
+Compute 2-D deformable convolution on 4-D input.
+
+Given input image x and output feature map y, the deformable convolution
+operation can be expressed as follows:
+
+$$
+y(p) = \\sum_{k=1}^{K}{w_k * x(p + p_k + \\Delta p_k)}
+$$
+
+where $$\\Delta p_k$$ is the learnable offset for the k-th sampling location.
+
+Refer to https://arxiv.org/abs/1703.06211
+
+Example:
+    Input:
+         Input shape: $(N, C_{in}, H_{in}, W_{in})$
+         Filter shape: $(C_{out}, C_{in}, H_f, W_f)$
+         Offset shape: $(N, 2 * deformable_groups * H_f * W_f, H_{out}, W_{out})$
+    Output:
+         Output shape: $(N, C_{out}, H_{out}, W_{out})$
+         where $H_{out}, W_{out}$ must equal the height and width of the
+         offset map, respectively.
+  Where
+$$
+  H_{out} = \frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\
+  W_{out} = \frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
+$$
+)DOC");
+  }
+};
+
+class DeformableConvV1Op : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true,
+                      "Input(Input) of DeformableConvV1Op "
+                      "should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Offset"), true,
+                      "Input(Offset) of DeformableConvV1Op "
+                      "should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Filter"), true,
+                      "Input(Filter) of DeformableConvV1Op "
+                      "should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->HasOutput("Output"), true,
+                      "Output(Output) of DeformableConvV1Op "
+                      "should not be null.");
+
+    auto in_dims = ctx->GetInputDim("Input");
+    auto filter_dims = ctx->GetInputDim("Filter");
+    auto offset_dims = ctx->GetInputDim("Offset");
+
+    std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+    std::vector<int> dilations =
+        ctx->Attrs().Get<std::vector<int>>("dilations");
+    int groups = ctx->Attrs().Get<int>("groups");
+    int deformable_groups = ctx->Attrs().Get<int>("deformable_groups");
+    int im2col_step = ctx->Attrs().Get<int>("im2col_step");
+
+    PADDLE_ENFORCE_EQ(in_dims.size(), 4,
+                      "Conv input should be a 4-D tensor, got %u",
+                      in_dims.size());
+    PADDLE_ENFORCE_EQ(
+        in_dims.size(), filter_dims.size(),
+        "Conv input dimension and filter dimension should be the same.");
+    PADDLE_ENFORCE_EQ(
+        in_dims.size() - strides.size(), 2U,
+        "Conv input dimension and strides dimension should be consistent.");
+    PADDLE_ENFORCE_EQ(paddings.size(), strides.size(),
+                      "Conv paddings dimension and Conv strides dimension "
+                      "should be the same.");
+
+    PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[1] * groups,
+                      "The number of input channels should be equal to filter "
+                      "channels * groups.");
+    PADDLE_ENFORCE_EQ(
+        filter_dims[0] % groups, 0,
+        "The number of output channels should be divisible by groups.");
+    PADDLE_ENFORCE_EQ(filter_dims[0] % deformable_groups, 0,
+                      "The number of output channels should be divisible by "
+                      "deformable groups.");
+
+    if (in_dims[0] > im2col_step) {
+      PADDLE_ENFORCE_EQ(in_dims[0] % im2col_step, 0U,
+                        "Input batch size must be smaller than, or divisible "
+                        "by, im2col_step.");
+    }
+
+    for (size_t i = 0; i < strides.size(); ++i) {
+      PADDLE_ENFORCE_GT(strides[i], 0U, "stride %d size incorrect", i);
+    }
+    for (size_t i = 0; i < dilations.size(); ++i) {
+      PADDLE_ENFORCE_GT(dilations[i], 0U, "dilation %d size incorrect", i);
+    }
+
+    std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
+    for (size_t i = 0; i < strides.size(); ++i) {
+      output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                            dilations[i], paddings[i],
+                                            strides[i]));
+    }
+    PADDLE_ENFORCE_EQ(output_shape[1] % deformable_groups, 0U,
+                      "The number of output channels must be divisible by "
+                      "the deformable group size.");
+    PADDLE_ENFORCE_EQ(output_shape[2], offset_dims[2],
+                      "output height must equal to offset map height.");
+    PADDLE_ENFORCE_EQ(output_shape[3], offset_dims[3],
+                      "output width must equal to offset map width.");
+    PADDLE_ENFORCE_EQ(offset_dims[1] % (filter_dims[2] * filter_dims[3]), 0U,
+                      "offset channels must be divisible by kernel_h * "
+                      "kernel_w.");
+    PADDLE_ENFORCE_EQ(offset_dims[1] / (2 * filter_dims[2] * filter_dims[3]),
+                      deformable_groups,
+                      "offset channels must equal 2 * kernel_h * kernel_w * "
+                      "deformable_groups.");
+
+    ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
+                                   ctx.device_context());
+  }
+};
+
+class DeformableConvV1GradOpDescMaker
+    : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+
+    op->SetType("deformable_conv_v1_grad");
+    op->SetInput("Input", Input("Input"));
+    op->SetInput("Filter", Input("Filter"));
+    op->SetInput("Offset", Input("Offset"));
+    op->SetInput(framework::GradVarName("Output"), OutputGrad("Output"));
+
+    op->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
+    op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter"));
+    op->SetOutput(framework::GradVarName("Offset"), InputGrad("Offset"));
+
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
+class DeformableConvV1GradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    auto in_dims = ctx->GetInputDim("Input");
+    auto filter_dims = ctx->GetInputDim("Filter");
+    auto offset_dims = ctx->GetInputDim("Offset");
+
+    PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Output")), true,
+                      "the gradient of Output must not be null");
+    if (ctx->HasOutput(framework::GradVarName("Input"))) {
+      ctx->SetOutputDim(framework::GradVarName("Input"), in_dims);
+    }
+    if (ctx->HasOutput(framework::GradVarName("Filter"))) {
+      ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims);
+    }
+    if (ctx->HasOutput(framework::GradVarName("Offset"))) {
+      ctx->SetOutputDim(framework::GradVarName("Offset"), offset_dims);
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
+                                   ctx.device_context());
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(deformable_conv_v1, ops::DeformableConvV1Op,
+                  ops::DeformableConvV1OpMaker,
+                  ops::DeformableConvV1GradOpDescMaker);
+REGISTER_OPERATOR(deformable_conv_v1_grad, ops::DeformableConvV1GradOp);
+
+REGISTER_OP_CPU_KERNEL(deformable_conv_v1,
+                       ops::DeformableConvV1CPUKernel<float>);
+REGISTER_OP_CPU_KERNEL(deformable_conv_v1_grad,
+                       ops::DeformableConvV1GradCPUKernel<float>);
diff --git a/paddle/fluid/operators/deformable_conv_v1_op.cu b/paddle/fluid/operators/deformable_conv_v1_op.cu
new file mode 100644
index 00000000..a865766f
--- /dev/null
+++ b/paddle/fluid/operators/deformable_conv_v1_op.cu
@@ -0,0 +1,609 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// +// Part of the following code in this file refs to +// https://github.com/msracver/Deformable-ConvNets/blob/master/faster_rcnn/operator_cxx/deformable_convolution.cu +// +// Copyright (c) 2017 Microsoft +// Licensed under The Apache-2.0 License [see LICENSE for details] +// \file deformable_psroi_pooling.cu +// \brief +// \author Yi Li, Guodong Zhang, Jifeng Dai + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/deformable_conv_filter.cu.h" +#include "paddle/fluid/operators/deformable_conv_func.h" +#include "paddle/fluid/operators/deformable_conv_v1_op.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using CUDADeviceContext = paddle::platform::CUDADeviceContext; + +static constexpr int kNumCUDAThread = 512; +static constexpr int kNumMaximumNumBlock = 4096; + +static inline int NumBlock(const int N) { + return std::min((N + kNumCUDAThread - 1) / kNumCUDAThread, + kNumMaximumNumBlock); +} + +template +__global__ void DeformableCol2imCUDAKernel( + const int nthreads, const T* data_col, const T* data_offset, + const int channels, const int height, const int width, const int kernel_h, + const int kernel_w, const int pad_h, const int pad_w, const int stride_h, + const int stride_w, const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, const int batch_size, + const int deformable_group, const int height_col, const int width_col, + T* grad_im) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (size_t thread = index; thread < nthreads; thread += offset) { + const int j = (thread / width_col / height_col / batch_size) % kernel_w; + const int i = + (thread / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = + thread / width_col / height_col / batch_size / kernel_w / kernel_h; + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = thread % width_col; + int h_out = (thread / width_col) % height_col; + int b = (thread / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const T* data_offset_ptr = data_offset + + (b * deformable_group + deformable_group_index) * + 2 * kernel_h * kernel_w * height_col * + width_col; + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + const T cur_inv_h_data = h_in + i * dilation_h + offset_h; + const T cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const T cur_top_grad = data_col[thread]; + const int cur_h = static_cast(cur_inv_h_data); + const int cur_w = static_cast(cur_inv_w_data); + for (int dy = -2; dy <= 2; dy++) { + for (int dx = -2; dx <= 2; dx++) { + if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 && + cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) { + int cur_bottom_grad_pos = + ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + T weight = + DmcnGetGradientWeight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, + cur_w + dx, height, 
width); + + atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + } + } + } + } +} + +template +inline void DeformableCol2im(const platform::CUDADeviceContext& ctx, + const T* data_col, const T* data_offset, + const std::vector im_shape, + const std::vector col_shape, + const std::vector kernel_shape, + const std::vector pad, + const std::vector stride, + const std::vector dilation, + const int deformable_group, T* grad_im) { + int channel_per_deformable_group = im_shape[0] / deformable_group; + int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3]; + int blocks = NumBlock(num_kernels); + int threads = kNumCUDAThread; + + DeformableCol2imCUDAKernel<<< + blocks, threads, 0, + reinterpret_cast(ctx).stream()>>>( + num_kernels, data_col, data_offset, im_shape[0], im_shape[1], im_shape[2], + kernel_shape[2], kernel_shape[3], pad[0], pad[1], stride[0], stride[1], + dilation[0], dilation[1], channel_per_deformable_group, col_shape[1], + deformable_group, col_shape[2], col_shape[3], grad_im); +} + +template +__global__ void DeformableCol2imCoordCUDAKernel( + const int nthreads, const T* data_col, const T* data_im, + const T* data_offset, const int channels, const int height, const int width, + const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int dilation_h, + const int dilation_w, const int channel_per_deformable_group, + const int batch_size, const int offset_channels, const int deformable_group, + const int height_col, const int width_col, T* grad_offset) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (size_t i = index; i < nthreads; i += offset) { + T val = 0, mval = 0; + const int w = i % width_col; + const int h = (i / width_col) % height_col; + const int c = (i / width_col / height_col) % offset_channels; + const int b = (i / width_col / height_col) / offset_channels; + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const T* data_col_ptr = data_col + + deformable_group_index * + channel_per_deformable_group * batch_size * + width_col * height_col; + const T* data_im_ptr = data_im + + (b * deformable_group + deformable_group_index) * + channel_per_deformable_group / kernel_h / + kernel_w * height * width; + const T* data_offset_ptr = data_offset + + (b * deformable_group + deformable_group_index) * + 2 * kernel_h * kernel_w * height_col * + width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = offset_c / 2; col_c < channel_per_deformable_group; + col_c += col_step) { + const int col_pos = + (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = + (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = + (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = + (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + + w_out); + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + T inv_h = h_in + i * dilation_h + offset_h; + T inv_w = w_in + j * 
dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) { + inv_h = inv_w = -2; + } else { + mval += data_col_ptr[col_pos] * + DmcnIm2colBilinear(data_im_ptr + cnt * height * width, width, + height, width, inv_h, inv_w); + } + const T weight = DmcnGetCoordinateWeight( + inv_h, inv_w, height, width, data_im_ptr + cnt * height * width, + width, bp_dir); + val += weight * data_col_ptr[col_pos]; + cnt += 1; + } + grad_offset[i] = val; + } +} + +template +inline void DeformableCol2imCoord( + const platform::CUDADeviceContext& ctx, const T* data_col, const T* data_im, + const T* data_offset, const std::vector im_shape, + const std::vector col_shape, + const std::vector kernel_shape, const std::vector paddings, + const std::vector strides, const std::vector dilations, + const int deformable_groups, T* grad_offset) { + int num_kernels = 2 * kernel_shape[2] * kernel_shape[3] * col_shape[1] * + col_shape[2] * col_shape[3] * deformable_groups; + int channel_per_deformable_group = col_shape[0] / deformable_groups; + int blocks = NumBlock(num_kernels); + int threads = kNumCUDAThread; + + DeformableCol2imCoordCUDAKernel<<< + blocks, threads, 0, + reinterpret_cast(ctx).stream()>>>( + num_kernels, data_col, data_im, data_offset, im_shape[0], im_shape[1], + im_shape[2], kernel_shape[2], kernel_shape[3], paddings[0], paddings[1], + strides[0], strides[1], dilations[0], dilations[1], + channel_per_deformable_group, col_shape[1], + 2 * kernel_shape[2] * kernel_shape[3] * deformable_groups, + deformable_groups, col_shape[2], col_shape[3], grad_offset); +} + +template +__global__ void DeformableIm2colCUDAKernel( + const int nthreads, const T* data_im, const T* data_offset, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, const int batch_size, + const int num_channels, const int deformable_group, const int height_col, + const int width_col, T* data_col) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (size_t i = index; i < nthreads; i += offset) { + const int w_col = i % width_col; + const int h_col = (i / width_col) % height_col; + const int b_col = (i / width_col) / height_col % batch_size; + const int c_im = (i / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + + T* data_col_ptr = + data_col + + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + const T* data_im_ptr = + data_im + (b_col * num_channels + c_im) * height * width; + const T* data_offset_ptr = + data_offset + + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * + kernel_w * height_col * width_col; + + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + + w_col; + + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + T val = static_cast(0); + const T h_im = h_in + i * dilation_h + offset_h; + const T w_im = w_in + j * dilation_w + offset_w; + 
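+        // Editorial note: bilinear sampling at the fractional location
+        // (h_im, w_im); locations outside the image contribute 0.
+        // Illustrative numbers only: at h_im = 1.3, w_im = 2.7 the four
+        // neighbours are blended with weights 0.7*0.3 (1,2), 0.7*0.7 (1,3),
+        // 0.3*0.3 (2,2) and 0.3*0.7 (2,3), assuming DmcnIm2colBilinear
+        // implements standard bilinear interpolation.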
if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { + val = + DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val; + data_col_ptr += batch_size * height_col * width_col; + } + } + } +} + +template +inline void DeformableIm2col(const platform::CUDADeviceContext& ctx, + const T* data_im, const T* data_offset, + const std::vector im_shape, + const std::vector col_shape, + const std::vector filter_shape, + const std::vector paddings, + const std::vector strides, + const std::vector dilations, + const int deformable_groups, T* data_col) { + int channel_per_deformable_group = im_shape[0] / deformable_groups; + int num_kernels = im_shape[0] * col_shape[1] * col_shape[2] * col_shape[3]; + + int blocks = NumBlock(num_kernels); + int threads = kNumCUDAThread; + + // get outputs of im2col with offset by bilinear interpolation + DeformableIm2colCUDAKernel<<< + blocks, threads, 0, + reinterpret_cast(ctx).stream()>>>( + num_kernels, data_im, data_offset, im_shape[1], im_shape[2], + filter_shape[2], filter_shape[3], paddings[0], paddings[1], strides[0], + strides[1], dilations[0], dilations[1], channel_per_deformable_group, + col_shape[1], im_shape[0], deformable_groups, col_shape[2], col_shape[3], + data_col); +} + +template +class DeformableConvV1CUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor* input = ctx.Input("Input"); + const Tensor offset = *ctx.Input("Offset"); + Tensor filter = *ctx.Input("Filter"); + Tensor* output = ctx.Output("Output"); + output->mutable_data(ctx.GetPlace()); + + auto& dev_ctx = ctx.template device_context(); + + const int groups = ctx.Attr("groups"); + const int deformable_groups = ctx.Attr("deformable_groups"); + const int im2col_step = ctx.Attr("im2col_step"); + const std::vector strides = ctx.Attr>("strides"); + const std::vector paddings = ctx.Attr>("paddings"); + const std::vector dilations = ctx.Attr>("dilations"); + + const int batch_size = static_cast(input->dims()[0]); + + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + std::vector output_shape_vec(framework::vectorize(output->dims())); + + // col_shape_vec: {c_i * k_h * k_w, im2col_step, o_h, o_w} + std::vector col_buffer_shape_vec(filter_shape_vec.size()); + col_buffer_shape_vec[0] = + input->dims()[1] * filter.dims()[2] * filter.dims()[3]; + col_buffer_shape_vec[1] = im2col_step; + for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) { + col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_buffer_shape_vec)); + std::vector output_buffer_shape_vec(1); + output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] * + output_shape_vec[2] * output_shape_vec[3]; + framework::DDim output_shape(framework::make_ddim(output_buffer_shape_vec)); + Tensor col_buffer; + Tensor output_buffer; + col_buffer = + ctx.AllocateTmpTensor(col_shape, dev_ctx); + output_buffer = + ctx.AllocateTmpTensor(output_shape, dev_ctx); + + int64_t M = output_shape_vec[1] / groups; + int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3]; + int64_t K = + input->dims()[1] * filter_shape_vec[2] * filter_shape_vec[3] / groups; + + Tensor weight_3d; + weight_3d.ShareDataWith(filter).Resize( + framework::make_ddim({groups, M, K})); + Tensor col_buffer_3d; + col_buffer_3d.ShareDataWith(col_buffer) + .Resize(framework::make_ddim({groups, K, N})); + Tensor output_4d; + output_4d.ShareDataWith(output_buffer) + 
.Resize(framework::make_ddim({batch_size / im2col_step, groups, M, N})); + output_4d.mutable_data(ctx.GetPlace()); + framework::DDim input_shape = + framework::slice_ddim(input->dims(), 1, input->dims().size()); + std::vector input_shape_vec = framework::vectorize(input_shape); + + int input_dim = input->numel() / input->dims()[0]; + int input_offset_dim = offset.numel() / offset.dims()[0]; + + auto blas = math::GetBlas(dev_ctx); + + const T* input_ptr = input->data(); + const T* offset_ptr = offset.data(); + col_buffer.mutable_data(ctx.GetPlace()); + T* col_buffer_ptr = col_buffer.data(); + + for (int i = 0; i < batch_size / im2col_step; ++i) { + DeformableIm2col(dev_ctx, input_ptr + i * im2col_step * input_dim, + offset_ptr + i * im2col_step * input_offset_dim, + input_shape_vec, col_buffer_shape_vec, filter_shape_vec, + paddings, strides, dilations, deformable_groups, + col_buffer_ptr); + + Tensor output_3d = output_4d.Slice(i, i + 1).Resize( + framework::slice_ddim(output_4d.dims(), 1, output_4d.dims().size())); + // get the product of pixel and weight + for (int g = 0; g < groups; ++g) { + Tensor weight_3d_slice = + weight_3d.Slice(g, g + 1).Resize(framework::slice_ddim( + weight_3d.dims(), 1, weight_3d.dims().size())); + Tensor col_buffer_3d_slice = + col_buffer_3d.Slice(g, g + 1).Resize(framework::slice_ddim( + col_buffer_3d.dims(), 1, col_buffer_3d.dims().size())); + Tensor output_3d_slice = + output_3d.Slice(g, g + 1).Resize(framework::slice_ddim( + output_3d.dims(), 1, output_3d.dims().size())); + + blas.MatMul(weight_3d_slice, false, col_buffer_3d_slice, false, T(1.0), + &output_3d_slice, T(0.0)); + } + } + output->ShareDataWith(output_buffer) + .Resize(framework::make_ddim(output_shape_vec)); + } +}; + +template +class DeformableConvV1GradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor* output_grad = + ctx.Input(framework::GradVarName("Output")); + Tensor* input_grad = ctx.Output(framework::GradVarName("Input")); + Tensor* filter_grad = ctx.Output(framework::GradVarName("Filter")); + Tensor* offset_grad = ctx.Output(framework::GradVarName("Offset")); + + const Tensor* input = ctx.Input("Input"); + Tensor offset = *ctx.Input("Offset"); + Tensor filter = *ctx.Input("Filter"); + if (!input_grad && !filter_grad && !offset_grad) return; + + int groups = ctx.Attr("groups"); + int deformable_groups = ctx.Attr("deformable_groups"); + int im2col_step = ctx.Attr("im2col_step"); + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + + auto& dev_ctx = ctx.template device_context(); + const int batch_size = static_cast(input->dims()[0]); + + framework::DDim input_shape = + framework::slice_ddim(input->dims(), 1, input->dims().size()); + std::vector input_shape_vec = framework::vectorize(input_shape); + std::vector filter_shape_vec(framework::vectorize(filter.dims())); + std::vector output_shape_vec( + framework::vectorize(output_grad->dims())); + + std::vector col_buffer_shape_vec(filter_shape_vec.size()); + col_buffer_shape_vec[0] = + input->dims()[1] * filter.dims()[2] * filter.dims()[3]; + col_buffer_shape_vec[1] = im2col_step; + for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) { + col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2]; + } + framework::DDim col_shape(framework::make_ddim(col_buffer_shape_vec)); + std::vector output_buffer_shape_vec(1); + output_buffer_shape_vec[0] = batch_size * 
output_shape_vec[1] * + output_shape_vec[2] * output_shape_vec[3]; + framework::DDim output_shape(framework::make_ddim(output_buffer_shape_vec)); + Tensor col_buffer; + Tensor output_buffer; + col_buffer = + ctx.AllocateTmpTensor(col_shape, dev_ctx); + output_buffer = + ctx.AllocateTmpTensor(output_shape, dev_ctx); + + output_buffer.ShareDataWith(*output_grad); + + int64_t M = + input_shape_vec[0] / groups * filter_shape_vec[2] * filter_shape_vec[3]; + int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3]; + int64_t K = output_shape_vec[1] / groups; + + framework::DDim weight_3d_shape = {groups, K, M}; + framework::DDim out_grad_4d_shape = {batch_size / im2col_step, groups, K, + N}; + framework::DDim col_buffer_3d_shape = {groups, M, N}; + framework::DDim filter_grad_shape = {groups, K, M}; + + Tensor weight_3d; + weight_3d.ShareDataWith(filter).Resize(weight_3d_shape); + Tensor out_grad_4d; + out_grad_4d.ShareDataWith(output_buffer).Resize(out_grad_4d_shape); + Tensor col_buffer_3d; + col_buffer_3d.ShareDataWith(col_buffer).Resize(col_buffer_3d_shape); + + math::SetConstant set_zero; + auto blas = math::GetBlas(dev_ctx); + + col_buffer.mutable_data(ctx.GetPlace()); + col_buffer_3d.mutable_data(ctx.GetPlace()); + out_grad_4d.mutable_data(ctx.GetPlace()); + + int input_dim = input->numel() / input->dims()[0]; + int input_offset_dim = offset.numel() / offset.dims()[0]; + + if (filter_grad) { + filter_grad->mutable_data(ctx.GetPlace()); + filter_grad->Resize(filter_grad_shape); + set_zero(dev_ctx, filter_grad, static_cast(0)); + } + + if (input_grad) { + input_grad->mutable_data(ctx.GetPlace()); + set_zero(dev_ctx, input_grad, static_cast(0)); + } + + if (offset_grad) { + offset_grad->mutable_data(ctx.GetPlace()); + set_zero(dev_ctx, offset_grad, static_cast(0)); + } + + for (int i = 0; i < batch_size / im2col_step; ++i) { + Tensor out_grad_3d = + out_grad_4d.Slice(i, i + 1).Resize(framework::slice_ddim( + out_grad_4d.dims(), 1, out_grad_4d.dims().size())); + for (int g = 0; g < groups; ++g) { + Tensor weight_3d_slice = + weight_3d.Slice(g, g + 1).Resize(framework::slice_ddim( + weight_3d.dims(), 1, weight_3d.dims().size())); + Tensor out_grad_3d_slice = + out_grad_3d.Slice(g, g + 1).Resize(framework::slice_ddim( + out_grad_3d.dims(), 1, out_grad_3d.dims().size())); + Tensor col_buffer_3d_slice = + col_buffer_3d.Slice(g, g + 1).Resize(framework::slice_ddim( + col_buffer_3d.dims(), 1, col_buffer_3d.dims().size())); + + blas.MatMul(weight_3d_slice, true, out_grad_3d_slice, false, T(1.0), + &col_buffer_3d_slice, T(0.0)); + } + col_buffer.Resize(col_shape); + + T* col_buffer_ptr = col_buffer.data(); + const T* input_ptr = input->data(); + const T* offset_ptr = offset.data(); + + if (offset_grad) { + T* offset_grad_ptr = offset_grad->data(); + // get grad of offset + DeformableCol2imCoord( + dev_ctx, col_buffer_ptr, input_ptr + i * im2col_step * input_dim, + offset_ptr + i * im2col_step * input_offset_dim, input_shape_vec, + col_buffer_shape_vec, filter_shape_vec, paddings, strides, + dilations, deformable_groups, + offset_grad_ptr + i * im2col_step * input_offset_dim); + } + if (input_grad) { + T* input_grad_ptr = input_grad->data(); + // get grad of input + DeformableCol2im(dev_ctx, col_buffer_ptr, + offset_ptr + i * im2col_step * input_offset_dim, + input_shape_vec, col_buffer_shape_vec, + filter_shape_vec, paddings, strides, dilations, + deformable_groups, + input_grad_ptr + i * im2col_step * input_dim); + input_grad->Resize(input->dims()); + } + + DeformableIm2col(dev_ctx, 
input_ptr + i * im2col_step * input_dim, + offset_ptr + i * im2col_step * input_offset_dim, + input_shape_vec, col_buffer_shape_vec, filter_shape_vec, + paddings, strides, dilations, deformable_groups, + col_buffer_ptr); + + col_buffer_3d.Resize(col_buffer_3d_shape); + + if (filter_grad) { + Tensor dweight_3d; + dweight_3d = ctx.AllocateTmpTensor( + filter_grad_shape, dev_ctx); + for (int g = 0; g < groups; ++g) { + Tensor out_grad_3d_slice = + out_grad_3d.Slice(g, g + 1).Resize(framework::slice_ddim( + out_grad_3d.dims(), 1, out_grad_3d.dims().size())); + Tensor col_buffer_3d_slice = + col_buffer_3d.Slice(g, g + 1).Resize(framework::slice_ddim( + col_buffer_3d.dims(), 1, col_buffer_3d.dims().size())); + Tensor dweight_3d_slice = + dweight_3d.Slice(g, g + 1).Resize(framework::slice_ddim( + dweight_3d.dims(), 1, dweight_3d.dims().size())); + + blas.MatMul(out_grad_3d_slice, false, col_buffer_3d_slice, true, + T(1.0), &dweight_3d_slice, T(0.0)); + } + FilterGradAddupCUDAKernel<<>>( + dweight_3d.numel(), groups, K, M, dweight_3d.data(), + filter_grad->data()); + } + } + if (filter_grad) { + filter_grad->Resize(filter.dims()); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL(deformable_conv_v1, + ops::DeformableConvV1CUDAKernel); +REGISTER_OP_CUDA_KERNEL(deformable_conv_v1_grad, + ops::DeformableConvV1GradCUDAKernel); diff --git a/paddle/fluid/operators/deformable_conv_v1_op.h b/paddle/fluid/operators/deformable_conv_v1_op.h new file mode 100644 index 00000000..89dc10cf --- /dev/null +++ b/paddle/fluid/operators/deformable_conv_v1_op.h @@ -0,0 +1,564 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// Part of the following code in this file refs to +// https://github.com/msracver/Deformable-ConvNets/blob/master/faster_rcnn/operator_cxx/deformable_convolution.cu +// +// Copyright (c) 2017 Microsoft +// Licensed under The Apache-2.0 License [see LICENSE for details] +// \file deformable_psroi_pooling.cu +// \brief +// \author Yi Li, Guodong Zhang, Jifeng Dai + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/deformable_conv_func.h" +#include "paddle/fluid/operators/deformable_conv_op.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using CPUDeviceContext = platform::CPUDeviceContext; + +template +void DeformableCol2imCPUKernel( + const int num_kernels, const T* data_col, const T* data_offset, + const int channels, const int height, const int width, const int kernel_h, + const int kernel_w, const int pad_h, const int pad_w, const int stride_h, + const int stride_w, const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, const int batch_size, + const int deformable_group, const int height_col, const int width_col, + T* grad_im) { + for (size_t thread = 0; thread < num_kernels; thread++) { + const int j = (thread / width_col / height_col / batch_size) % kernel_w; + const int i = + (thread / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = + thread / width_col / height_col / batch_size / kernel_w / kernel_h; + + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = thread % width_col; + int h_out = (thread / width_col) % height_col; + int b = (thread / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const T* data_offset_ptr = data_offset + + (b * deformable_group + deformable_group_index) * + 2 * kernel_h * kernel_w * height_col * + width_col; + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + const T cur_inv_h_data = h_in + i * dilation_h + offset_h; + const T cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const T cur_top_grad = data_col[thread]; + const int cur_h = static_cast(cur_inv_h_data); + const int cur_w = static_cast(cur_inv_w_data); + for (int dy = -2; dy <= 2; dy++) { + for (int dx = -2; dx <= 2; dx++) { + if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 && + cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) { + int cur_bottom_grad_pos = + ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + T weight = + DmcnGetGradientWeight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, + cur_w + dx, height, width); + + *(grad_im + cur_bottom_grad_pos) = + *(grad_im + cur_bottom_grad_pos) + weight * cur_top_grad; + } + } + } + } +} + +template +inline void DeformableCol2imCPU(const platform::CPUDeviceContext& ctx, + const T* data_col, const T* data_offset, + const std::vector im_shape, + const std::vector col_shape, + const std::vector kernel_shape, + const std::vector pad, + const std::vector stride, + const std::vector dilation, + const int deformable_group, T* 
grad_im) {
+  int channel_per_deformable_group = im_shape[0] / deformable_group;
+  int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
+
+  DeformableCol2imCPUKernel(
+      num_kernels, data_col, data_offset, im_shape[0], im_shape[1], im_shape[2],
+      kernel_shape[2], kernel_shape[3], pad[0], pad[1], stride[0], stride[1],
+      dilation[0], dilation[1], channel_per_deformable_group, col_shape[1],
+      deformable_group, col_shape[2], col_shape[3], grad_im);
+}
+
+template <typename T>
+void DeformableCol2imCoordCPUKernel(
+    const int num_kernels, const T* data_col, const T* data_im,
+    const T* data_offset, const int channels, const int height, const int width,
+    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w, const int dilation_h,
+    const int dilation_w, const int channel_per_deformable_group,
+    const int batch_size, const int offset_channels, const int deformable_group,
+    const int height_col, const int width_col, T* grad_offset) {
+  for (size_t i = 0; i < num_kernels; i++) {
+    T val = 0, mval = 0;
+    const int w = i % width_col;
+    const int h = (i / width_col) % height_col;
+    const int c = (i / width_col / height_col) % offset_channels;
+    const int b = (i / width_col / height_col) / offset_channels;
+
+    const int deformable_group_index = c / (2 * kernel_h * kernel_w);
+    const int col_step = kernel_h * kernel_w;
+    int cnt = 0;
+    const T* data_col_ptr = data_col +
+                            deformable_group_index *
+                                channel_per_deformable_group * batch_size *
+                                width_col * height_col;
+    const T* data_im_ptr = data_im +
+                           (b * deformable_group + deformable_group_index) *
+                               channel_per_deformable_group / kernel_h /
+                               kernel_w * height * width;
+    const T* data_offset_ptr = data_offset +
+                               (b * deformable_group + deformable_group_index) *
+                                   2 * kernel_h * kernel_w * height_col *
+                                   width_col;
+
+    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
+
+    for (int col_c = offset_c / 2; col_c < channel_per_deformable_group;
+         col_c += col_step) {
+      const int col_pos =
+          (((col_c * batch_size + b) * height_col) + h) * width_col + w;
+      const int bp_dir = offset_c % 2;
+
+      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
+      int i =
+          (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
+      int w_out = col_pos % width_col;
+      int h_out = (col_pos / width_col) % height_col;
+      int w_in = w_out * stride_w - pad_w;
+      int h_in = h_out * stride_h - pad_h;
+      const int data_offset_h_ptr =
+          (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
+      const int data_offset_w_ptr =
+          (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
+           w_out);
+      const T offset_h = data_offset_ptr[data_offset_h_ptr];
+      const T offset_w = data_offset_ptr[data_offset_w_ptr];
+      T inv_h = h_in + i * dilation_h + offset_h;
+      T inv_w = w_in + j * dilation_w + offset_w;
+      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
+        inv_h = inv_w = -2;
+      } else {
+        mval += data_col_ptr[col_pos] *
+                DmcnIm2colBilinear(data_im_ptr + cnt * height * width, width,
+                                   height, width, inv_h, inv_w);
+      }
+      const T weight = DmcnGetCoordinateWeight(
+          inv_h, inv_w, height, width, data_im_ptr + cnt * height * width,
+          width, bp_dir);
+      val += weight * data_col_ptr[col_pos];
+      cnt += 1;
+    }
+    grad_offset[i] = val;
+  }
+}
+
+template <typename T>
+inline void DeformableCol2imCoordCPU(
+    const platform::CPUDeviceContext& ctx, const T* data_col, const T* data_im,
+    const T* data_offset, const std::vector<int64_t> im_shape,
+    const std::vector<int64_t> col_shape,
+    const std::vector<int64_t> kernel_shape, const std::vector<int> paddings,
+    const std::vector<int> strides, const std::vector<int> dilations,
+    const int deformable_groups, T* grad_offset) {
+  int num_kernels = 2 * kernel_shape[2] * kernel_shape[3] * col_shape[1] *
+                    col_shape[2] * col_shape[3] * deformable_groups;
+  int channel_per_deformable_group = col_shape[0] / deformable_groups;
+
+  DeformableCol2imCoordCPUKernel(
+      num_kernels, data_col, data_im, data_offset, im_shape[0], im_shape[1],
+      im_shape[2], kernel_shape[2], kernel_shape[3], paddings[0], paddings[1],
+      strides[0], strides[1], dilations[0], dilations[1],
+      channel_per_deformable_group, col_shape[1],
+      2 * kernel_shape[2] * kernel_shape[3] * deformable_groups,
+      deformable_groups, col_shape[2], col_shape[3], grad_offset);
+}
+
+template <typename T>
+void DeformableIm2colCPUKernel(
+    const int num_kernels, const T* data_im, const T* data_offset,
+    const int height, const int width, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+    const int dilation_h, const int dilation_w,
+    const int channel_per_deformable_group, const int batch_size,
+    const int num_channels, const int deformable_group, const int height_col,
+    const int width_col, T* data_col) {
+  for (size_t i = 0; i < num_kernels; i++) {
+    const int w_col = i % width_col;
+    const int h_col = (i / width_col) % height_col;
+    const int b_col = (i / width_col) / height_col % batch_size;
+    const int c_im = (i / width_col / height_col) / batch_size;
+    const int c_col = c_im * kernel_h * kernel_w;
+
+    const int deformable_group_index = c_im / channel_per_deformable_group;
+
+    const int h_in = h_col * stride_h - pad_h;
+    const int w_in = w_col * stride_w - pad_w;
+
+    T* data_col_ptr =
+        data_col +
+        ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
+    const T* data_im_ptr =
+        data_im + (b_col * num_channels + c_im) * height * width;
+    const T* data_offset_ptr =
+        data_offset +
+        (b_col * deformable_group + deformable_group_index) * 2 * kernel_h *
+            kernel_w * height_col * width_col;
+
+    for (int i = 0; i < kernel_h; ++i) {
+      for (int j = 0; j < kernel_w; ++j) {
+        const int data_offset_h_ptr =
+            ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
+        const int data_offset_w_ptr =
+            ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
+            w_col;
+
+        const T offset_h = data_offset_ptr[data_offset_h_ptr];
+        const T offset_w = data_offset_ptr[data_offset_w_ptr];
+        T val = static_cast<T>(0);
+        const T h_im = h_in + i * dilation_h + offset_h;
+        const T w_im = w_in + j * dilation_w + offset_w;
+        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) {
+          val =
+              DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im);
+        }
+        *data_col_ptr = val;
+        data_col_ptr += batch_size * height_col * width_col;
+      }
+    }
+  }
+}
+
+template <typename T>
+inline void DeformableIm2colCPU(const platform::CPUDeviceContext& ctx,
+                                const T* data_im, const T* data_offset,
+                                const std::vector<int64_t> im_shape,
+                                const std::vector<int64_t> col_shape,
+                                const std::vector<int64_t> filter_shape,
+                                const std::vector<int> paddings,
+                                const std::vector<int> strides,
+                                const std::vector<int> dilations,
+                                const int deformable_groups, T* data_col) {
+  int channel_per_deformable_group = im_shape[0] / deformable_groups;
+  int num_kernels = im_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
+
+  // get outputs of im2col with offset by bilinear interpolation
+  DeformableIm2colCPUKernel(
+      num_kernels, data_im, data_offset, im_shape[1], im_shape[2],
+      filter_shape[2], filter_shape[3], paddings[0], paddings[1], strides[0],
+      strides[1], dilations[0], dilations[1], channel_per_deformable_group,
+      col_shape[1], im_shape[0], deformable_groups, col_shape[2], col_shape[3],
+      data_col);
+}
+
+template <typename T>
+class DeformableConvV1CPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<Tensor>("Input");
+    auto* offset = ctx.Input<Tensor>("Offset");
+    Tensor filter = *ctx.Input<Tensor>("Filter");
+    Tensor* output = ctx.Output<Tensor>("Output");
+    output->mutable_data<T>(ctx.GetPlace());
+
+    auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
+
+    const int groups = ctx.Attr<int>("groups");
+    const int deformable_groups = ctx.Attr<int>("deformable_groups");
+    const int im2col_step = ctx.Attr<int>("im2col_step");
+    const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
+    const std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
+    const std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+
+    const int batch_size = static_cast<int>(input->dims()[0]);
+
+    std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
+    std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
+
+    // col_shape_vec: {c_i * k_h * k_w, im2col_step, o_h, o_w}
+    std::vector<int64_t> col_buffer_shape_vec(filter_shape_vec.size());
+    col_buffer_shape_vec[0] =
+        input->dims()[1] * filter.dims()[2] * filter.dims()[3];
+    col_buffer_shape_vec[1] = im2col_step;
+    for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) {
+      col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2];
+    }
+    framework::DDim col_shape(framework::make_ddim(col_buffer_shape_vec));
+    std::vector<int64_t> output_buffer_shape_vec(1);
+    output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] *
+                                 output_shape_vec[2] * output_shape_vec[3];
+    framework::DDim output_shape(framework::make_ddim(output_buffer_shape_vec));
+    Tensor col_buffer;
+    Tensor output_buffer;
+    col_buffer =
+        ctx.AllocateTmpTensor<T, platform::CPUDeviceContext>(col_shape, dev_ctx);
+    output_buffer = ctx.AllocateTmpTensor<T, platform::CPUDeviceContext>(
+        output_shape, dev_ctx);
+    int64_t M = output_shape_vec[1] / groups;
+    int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3];
+    int64_t K =
+        input->dims()[1] * filter_shape_vec[2] * filter_shape_vec[3] / groups;
+
+    Tensor weight_3d;
+    weight_3d.ShareDataWith(filter).Resize(
+        framework::make_ddim({groups, M, K}));
+    Tensor col_buffer_3d;
+    col_buffer_3d.ShareDataWith(col_buffer)
+        .Resize(framework::make_ddim({groups, K, N}));
+    Tensor output_4d;
+    output_4d.ShareDataWith(output_buffer)
+        .Resize(framework::make_ddim({batch_size / im2col_step, groups, M, N}));
+    output_4d.mutable_data<T>(ctx.GetPlace());
+    framework::DDim input_shape =
+        framework::slice_ddim(input->dims(), 1, input->dims().size());
+    std::vector<int64_t> input_shape_vec = framework::vectorize(input_shape);
+    int input_dim = input->numel() / input->dims()[0];
+    int input_offset_dim = offset->numel() / offset->dims()[0];
+    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(dev_ctx);
+    const T* input_ptr = input->data<T>();
+    const T* offset_ptr = offset->data<T>();
+    col_buffer.mutable_data<T>(ctx.GetPlace());
+    T* col_buffer_ptr = col_buffer.data<T>();
+    for (int i = 0; i < batch_size / im2col_step; ++i) {
+      DeformableIm2colCPU(dev_ctx, input_ptr + i * im2col_step * input_dim,
+                          offset_ptr + i * im2col_step * input_offset_dim,
+                          input_shape_vec, col_buffer_shape_vec,
+                          filter_shape_vec, paddings, strides, dilations,
+                          deformable_groups, col_buffer_ptr);
+      Tensor output_3d = output_4d.Slice(i, i + 1).Resize(
+          framework::slice_ddim(output_4d.dims(), 1, output_4d.dims().size()));
+      // get the product of pixel and weight
+      for (int g = 0; g < groups; ++g) {
+        Tensor weight_3d_slice =
+            weight_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
+                weight_3d.dims(), 1, weight_3d.dims().size()));
+        Tensor col_buffer_3d_slice =
+            col_buffer_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
+                col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
+        Tensor output_3d_slice =
+            output_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
+                output_3d.dims(), 1, output_3d.dims().size()));
+        blas.MatMul(weight_3d_slice, false, col_buffer_3d_slice, false, T(1.0),
+                    &output_3d_slice, T(0.0));
+      }
+    }
+    output->ShareDataWith(output_buffer)
+        .Resize(framework::make_ddim(output_shape_vec));
+  }
+};
+
+template <typename T>
+class DeformableConvV1GradCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const Tensor* output_grad =
+        ctx.Input<Tensor>(framework::GradVarName("Output"));
+    Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
+    Tensor* filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
+    Tensor* offset_grad = ctx.Output<Tensor>(framework::GradVarName("Offset"));
+
+    const Tensor* input = ctx.Input<Tensor>("Input");
+    Tensor offset = *ctx.Input<Tensor>("Offset");
+    Tensor filter = *ctx.Input<Tensor>("Filter");
+    if (!input_grad && !filter_grad && !offset_grad) return;
+
+    int groups = ctx.Attr<int>("groups");
+    int deformable_groups = ctx.Attr<int>("deformable_groups");
+    int im2col_step = ctx.Attr<int>("im2col_step");
+    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+
+    auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
+    const int batch_size = static_cast<int>(input->dims()[0]);
+
+    framework::DDim input_shape =
+        framework::slice_ddim(input->dims(), 1, input->dims().size());
+    std::vector<int64_t> input_shape_vec = framework::vectorize(input_shape);
+    std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
+    std::vector<int64_t> output_shape_vec(
+        framework::vectorize(output_grad->dims()));
+
+    std::vector<int64_t> col_buffer_shape_vec(filter_shape_vec.size());
+    col_buffer_shape_vec[0] =
+        input->dims()[1] * filter.dims()[2] * filter.dims()[3];
+    col_buffer_shape_vec[1] = im2col_step;
+    for (size_t j = 0; j < filter_shape_vec.size() - 2; ++j) {
+      col_buffer_shape_vec[j + 2] = output_shape_vec[j + 2];
+    }
+    framework::DDim col_shape(framework::make_ddim(col_buffer_shape_vec));
+    std::vector<int64_t> output_buffer_shape_vec(1);
+    output_buffer_shape_vec[0] = batch_size * output_shape_vec[1] *
+                                 output_shape_vec[2] * output_shape_vec[3];
+    framework::DDim output_shape(framework::make_ddim(output_buffer_shape_vec));
+    Tensor col_buffer;
+    Tensor output_buffer;
+    col_buffer =
+        ctx.AllocateTmpTensor<T, platform::CPUDeviceContext>(col_shape, dev_ctx);
+    output_buffer = ctx.AllocateTmpTensor<T, platform::CPUDeviceContext>(
+        output_shape, dev_ctx);
+
+    output_buffer.ShareDataWith(*output_grad);
+
+    int64_t M =
+        input_shape_vec[0] / groups * filter_shape_vec[2] * filter_shape_vec[3];
+    int64_t N = im2col_step * output_shape_vec[2] * output_shape_vec[3];
+    int64_t K = output_shape_vec[1] / groups;
+
+    framework::DDim weight_3d_shape = {groups, K, M};
+    framework::DDim out_grad_4d_shape = {batch_size / im2col_step, groups, K,
+                                         N};
+    framework::DDim col_buffer_3d_shape = {groups, M, N};
+    framework::DDim filter_grad_shape = {groups, K, M};
+
+    Tensor weight_3d;
+    weight_3d.ShareDataWith(filter).Resize(weight_3d_shape);
+    Tensor out_grad_4d;
+    out_grad_4d.ShareDataWith(output_buffer).Resize(out_grad_4d_shape);
+    Tensor col_buffer_3d;
+    col_buffer_3d.ShareDataWith(col_buffer).Resize(col_buffer_3d_shape);
+
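+    // Backward pass outline for each im2col_step-sized batch slice:
+    //   1. col_buffer = weight^T * out_grad, computed per group with MatMul;
+    //   2. the offset gradient is scattered back from col_buffer with
+    //      DeformableCol2imCoordCPU and the input gradient with
+    //      DeformableCol2imCPU;
+    //   3. im2col is recomputed on the forward input, and the filter
+    //      gradient is accumulated as out_grad * col_buffer^T followed by
+    //      FilterGradAddupCPUKernel.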
+    math::SetConstant<platform::CPUDeviceContext, T> set_zero;
+    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(dev_ctx);
+
+    col_buffer.mutable_data<T>(ctx.GetPlace());
+    col_buffer_3d.mutable_data<T>(ctx.GetPlace());
+    out_grad_4d.mutable_data<T>(ctx.GetPlace());
+
+    int input_dim = input->numel() / input->dims()[0];
+    int input_offset_dim = offset.numel() / offset.dims()[0];
+
+    if (filter_grad) {
+      filter_grad->mutable_data<T>(ctx.GetPlace());
+      filter_grad->Resize(filter_grad_shape);
+      set_zero(dev_ctx, filter_grad, static_cast<T>(0));
+    }
+
+    if (input_grad) {
+      input_grad->mutable_data<T>(ctx.GetPlace());
+      set_zero(dev_ctx, input_grad, static_cast<T>(0));
+    }
+
+    if (offset_grad) {
+      offset_grad->mutable_data<T>(ctx.GetPlace());
+      set_zero(dev_ctx, offset_grad, static_cast<T>(0));
+    }
+
+    for (int i = 0; i < batch_size / im2col_step; ++i) {
+      Tensor out_grad_3d =
+          out_grad_4d.Slice(i, i + 1).Resize(framework::slice_ddim(
+              out_grad_4d.dims(), 1, out_grad_4d.dims().size()));
+      for (int g = 0; g < groups; ++g) {
+        Tensor weight_3d_slice =
+            weight_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
+                weight_3d.dims(), 1, weight_3d.dims().size()));
+        Tensor out_grad_3d_slice =
+            out_grad_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
+                out_grad_3d.dims(), 1, out_grad_3d.dims().size()));
+        Tensor col_buffer_3d_slice =
+            col_buffer_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
+                col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
+
+        blas.MatMul(weight_3d_slice, true, out_grad_3d_slice, false, T(1.0),
+                    &col_buffer_3d_slice, T(0.0));
+      }
+      col_buffer.Resize(col_shape);
+
+      T* col_buffer_ptr = col_buffer.data<T>();
+      const T* input_ptr = input->data<T>();
+      const T* offset_ptr = offset.data<T>();
+
+      if (offset_grad) {
+        T* offset_grad_ptr = offset_grad->data<T>();
+        // get grad of offset
+        DeformableCol2imCoordCPU(
+            dev_ctx, col_buffer_ptr, input_ptr + i * im2col_step * input_dim,
+            offset_ptr + i * im2col_step * input_offset_dim, input_shape_vec,
+            col_buffer_shape_vec, filter_shape_vec, paddings, strides,
+            dilations, deformable_groups,
+            offset_grad_ptr + i * im2col_step * input_offset_dim);
+      }
+      if (input_grad) {
+        T* input_grad_ptr = input_grad->data<T>();
+        // get grad of input
+        DeformableCol2imCPU(dev_ctx, col_buffer_ptr,
+                            offset_ptr + i * im2col_step * input_offset_dim,
+                            input_shape_vec, col_buffer_shape_vec,
+                            filter_shape_vec, paddings, strides, dilations,
+                            deformable_groups,
+                            input_grad_ptr + i * im2col_step * input_dim);
+        input_grad->Resize(input->dims());
+      }
+
+      DeformableIm2colCPU(dev_ctx, input_ptr + i * im2col_step * input_dim,
+                          offset_ptr + i * im2col_step * input_offset_dim,
+                          input_shape_vec, col_buffer_shape_vec,
+                          filter_shape_vec, paddings, strides, dilations,
+                          deformable_groups, col_buffer_ptr);
+
+      col_buffer_3d.Resize(col_buffer_3d_shape);
+
+      if (filter_grad) {
+        Tensor dweight_3d;
+        dweight_3d = ctx.AllocateTmpTensor<T, platform::CPUDeviceContext>(
+            filter_grad_shape, dev_ctx);
+        for (int g = 0; g < groups; ++g) {
+          Tensor out_grad_3d_slice =
+              out_grad_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
+                  out_grad_3d.dims(), 1, out_grad_3d.dims().size()));
+          Tensor col_buffer_3d_slice =
+              col_buffer_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
+                  col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
+          Tensor dweight_3d_slice =
+              dweight_3d.Slice(g, g + 1).Resize(framework::slice_ddim(
+                  dweight_3d.dims(), 1, dweight_3d.dims().size()));
+
+          blas.MatMul(out_grad_3d_slice, false, col_buffer_3d_slice, true,
+                      T(1.0), &dweight_3d_slice, T(0.0));
+        }
+        // update grad of weights
+        FilterGradAddupCPUKernel(dweight_3d.numel(), groups, K, M,
+                                 dweight_3d.data<T>(),
filter_grad->data()); + } + } + if (filter_grad) { + filter_grad->Resize(filter.dims()); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cu b/paddle/fluid/operators/deformable_psroi_pooling_op.cu index c38e9553..4bf04167 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cu +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cu @@ -28,6 +28,7 @@ #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/deformable_psroi_pooling_op.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" @@ -231,10 +232,8 @@ class DeformablePSROIPoolCUDAKernel : public framework::OpKernel { } auto& dev_ctx = ctx.cuda_device_context(); - auto& allocator = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); int bytes = roi_batch_id_list.numel() * sizeof(int); - auto roi_ptr = allocator.Allocate(bytes); + auto roi_ptr = memory::Alloc(dev_ctx, bytes); int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); const auto gplace = boost::get(ctx.GetPlace()); memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, @@ -499,10 +498,8 @@ class DeformablePSROIPoolGradCUDAKernel : public framework::OpKernel { } } - auto& allocator = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); int bytes = roi_batch_id_list.numel() * sizeof(int); - auto roi_ptr = allocator.Allocate(bytes); + auto roi_ptr = memory::Alloc(dev_ctx, bytes); int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); const auto gplace = boost::get(ctx.GetPlace()); memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, diff --git a/paddle/fluid/operators/dequantize_op.cc b/paddle/fluid/operators/dequantize_op.cc index 38159f84..97f49dbc 100644 --- a/paddle/fluid/operators/dequantize_op.cc +++ b/paddle/fluid/operators/dequantize_op.cc @@ -41,5 +41,4 @@ void DeQuantOpMaker::Make() { namespace ops = paddle::operators; -REGISTER_OPERATOR(dequantize, ops::DeQuantOp, ops::DeQuantOpMaker, - paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(dequantize, ops::DeQuantOp, ops::DeQuantOpMaker); diff --git a/paddle/fluid/operators/detail/safe_ref.h b/paddle/fluid/operators/detail/safe_ref.h index 8660bc21..c56329d9 100644 --- a/paddle/fluid/operators/detail/safe_ref.h +++ b/paddle/fluid/operators/detail/safe_ref.h @@ -25,7 +25,7 @@ namespace detail { */ template inline T& Ref(T* ptr, ARGS&&... args) { - PADDLE_ENFORCE(ptr != nullptr, ::paddle::string::Sprintf(args...)); + PADDLE_ENFORCE_NOT_NULL(ptr, ::paddle::string::Sprintf(args...)); return *ptr; } diff --git a/paddle/fluid/operators/detection/box_coder_op.cu b/paddle/fluid/operators/detection/box_coder_op.cu index 19a5bb90..b3dd142d 100644 --- a/paddle/fluid/operators/detection/box_coder_op.cu +++ b/paddle/fluid/operators/detection/box_coder_op.cu @@ -11,7 +11,7 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/detection/box_coder_op.h" #include "paddle/fluid/platform/cuda_primitives.h" @@ -174,10 +174,8 @@ class BoxCoderCUDAKernel : public framework::OpKernel { int grid = (row * col + block - 1) / block; auto& device_ctx = context.cuda_device_context(); - auto& allocator = - platform::DeviceTemporaryAllocator::Instance().Get(device_ctx); int bytes = var_size * sizeof(float); - auto dev_var = allocator.Allocate(bytes); + auto dev_var = memory::Alloc(device_ctx, bytes); float* dev_var_data = reinterpret_cast(dev_var->ptr()); auto cplace = platform::CPUPlace(); const auto gplace = boost::get(context.GetPlace()); diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc index 945d575a..976aa317 100644 --- a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc +++ b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc @@ -57,17 +57,19 @@ class BoxDecoderAndAssignOp : public framework::OperatorWithKernel { "The rank of Input of TargetBox must be 2"); PADDLE_ENFORCE_EQ(box_score_dims.size(), 2, "The rank of Input of BoxScore must be 2"); - PADDLE_ENFORCE_EQ(prior_box_dims[0], target_box_dims[0], - "The first dim of prior_box and target_box is roi nums " - "and should be same!"); - PADDLE_ENFORCE_EQ(prior_box_dims[0], box_score_dims[0], - "The first dim of prior_box and box_score is roi nums " - "and should be same!"); - PADDLE_ENFORCE_EQ(target_box_dims[1], box_score_dims[1] * prior_box_dims[1], - "The shape of target_box is [N, classnum * 4], The shape " - "of box_score is [N, classnum], The shape of prior_box " - "is [N, 4]"); - + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ(prior_box_dims[0], target_box_dims[0], + "The first dim of prior_box and target_box is roi nums " + "and should be same!"); + PADDLE_ENFORCE_EQ(prior_box_dims[0], box_score_dims[0], + "The first dim of prior_box and box_score is roi nums " + "and should be same!"); + PADDLE_ENFORCE_EQ( + target_box_dims[1], box_score_dims[1] * prior_box_dims[1], + "The shape of target_box is [N, classnum * 4], The shape " + "of box_score is [N, classnum], The shape of prior_box " + "is [N, 4]"); + } ctx->SetOutputDim("DecodeBox", framework::make_ddim({target_box_dims[0], target_box_dims[1]})); ctx->ShareLoD("TargetBox", /*->*/ "DecodeBox"); diff --git a/paddle/fluid/operators/detection/generate_mask_labels_op.cc b/paddle/fluid/operators/detection/generate_mask_labels_op.cc index 38eafa5f..0d77c7f3 100644 --- a/paddle/fluid/operators/detection/generate_mask_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_mask_labels_op.cc @@ -305,10 +305,10 @@ class GenerateMaskLabelsKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(gt_segms->lod()[0].size() - 1, n); int mask_dim = num_classes * resolution * resolution; - - mask_rois->mutable_data({rois->numel(), kBoxDim}, ctx.GetPlace()); - roi_has_mask_int32->mutable_data({rois->numel(), 1}, ctx.GetPlace()); - mask_int32->mutable_data({rois->numel(), mask_dim}, ctx.GetPlace()); + int roi_num = rois->lod().back()[n]; + mask_rois->mutable_data({roi_num, kBoxDim}, ctx.GetPlace()); + roi_has_mask_int32->mutable_data({roi_num, 1}, ctx.GetPlace()); + mask_int32->mutable_data({roi_num, mask_dim}, ctx.GetPlace()); framework::LoD lod; std::vector lod0(1, 0); diff --git a/paddle/fluid/operators/detection/gpc.cc b/paddle/fluid/operators/detection/gpc.cc index 
f46aaf7d..b46d231d 100644 --- a/paddle/fluid/operators/detection/gpc.cc +++ b/paddle/fluid/operators/detection/gpc.cc @@ -532,6 +532,7 @@ static int count_contours(polygon_node *polygon) { } static void add_left(polygon_node *p, double x, double y) { + PADDLE_ENFORCE_NOT_NULL(p); vertex_node *nv = NULL; /* Create a new vertex node and set its fields */ @@ -587,6 +588,7 @@ static void add_right(polygon_node *p, double x, double y) { } static void merge_right(polygon_node *p, polygon_node *q, polygon_node *list) { + PADDLE_ENFORCE_NOT_NULL(p); polygon_node *target = NULL; /* Label contour as external */ @@ -662,6 +664,7 @@ void add_vertex(vertex_node **t, double x, double y) { } void gpc_vertex_create(edge_node *e, int p, int s, double x, double y) { + PADDLE_ENFORCE_NOT_NULL(e); add_vertex(&(e->outp[p]->v[s]), x, y); e->outp[p]->active++; } @@ -1014,6 +1017,7 @@ void gpc_polygon_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip, e0 = aet; e1 = aet; /* Set up bundle fields of first edge */ + PADDLE_ENFORCE_NOT_NULL(aet); aet->bundle[ABOVE][aet->type] = (aet->top.y != yb); aet->bundle[ABOVE][!aet->type] = 0; aet->bstate[ABOVE] = UNBUNDLED; @@ -1646,6 +1650,7 @@ void gpc_tristrip_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip, e1 = aet; /* Set up bundle fields of first edge */ + PADDLE_ENFORCE_NOT_NULL(aet); aet->bundle[ABOVE][aet->type] = (aet->top.y != yb); aet->bundle[ABOVE][!aet->type] = 0; aet->bstate[ABOVE] = UNBUNDLED; @@ -1782,7 +1787,7 @@ void gpc_tristrip_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip, } new_tristrip(&tlist, cf, cf->xb, yb); } - edge->outp[ABOVE] = cf->outp[ABOVE]; + if (cf) edge->outp[ABOVE] = cf->outp[ABOVE]; gpc_vertex_create(edge, ABOVE, RIGHT, xb, yb); break; case ILI: diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc index 8abc8b89..f5b9be14 100644 --- a/paddle/fluid/operators/detection/multiclass_nms_op.cc +++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc @@ -328,7 +328,8 @@ class MultiClassNMSKernel : public framework::OpKernel { void MultiClassOutput(const platform::DeviceContext& ctx, const Tensor& scores, const Tensor& bboxes, const std::map>& selected_indices, - const int scores_size, Tensor* outs) const { + const int scores_size, Tensor* outs, + int* oindices = nullptr, const int offset = 0) const { int64_t class_num = scores.dims()[1]; int64_t predict_dim = scores.dims()[1]; int64_t box_size = bboxes.dims()[1]; @@ -358,9 +359,15 @@ class MultiClassNMSKernel : public framework::OpKernel { if (scores_size == 3) { bdata = bboxes_data + idx * box_size; odata[count * out_dim + 1] = sdata[idx]; // score + if (oindices != nullptr) { + oindices[count] = offset + idx; + } } else { bdata = bbox.data() + idx * box_size; odata[count * out_dim + 1] = *(scores_data + idx * class_num + label); + if (oindices != nullptr) { + oindices[count] = offset + idx * class_num + label; + } } // xmin, ymin, xmax, ymax or multi-points coordinates std::memcpy(odata + count * out_dim + 2, bdata, box_size * sizeof(T)); @@ -373,7 +380,8 @@ class MultiClassNMSKernel : public framework::OpKernel { auto* boxes = ctx.Input("BBoxes"); auto* scores = ctx.Input("Scores"); auto* outs = ctx.Output("Out"); - + bool return_index = ctx.HasOutput("Index") ? 
true : false;
+    auto index = ctx.Output<LoDTensor>("Index");
     auto score_dims = scores->dims();
     auto score_size = score_dims.size();
     auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
@@ -406,35 +414,55 @@
     int num_kept = batch_starts.back();
     if (num_kept == 0) {
-      T* od = outs->mutable_data<T>({1, 1}, ctx.GetPlace());
-      od[0] = -1;
-      batch_starts = {0, 1};
+      if (return_index) {
+        outs->mutable_data<T>({0, out_dim}, ctx.GetPlace());
+        index->mutable_data<int>({0, 1}, ctx.GetPlace());
+      } else {
+        T* od = outs->mutable_data<T>({1, 1}, ctx.GetPlace());
+        od[0] = -1;
+        batch_starts = {0, 1};
+      }
     } else {
       outs->mutable_data<T>({num_kept, out_dim}, ctx.GetPlace());
+      int offset = 0;
+      int* oindices = nullptr;
       for (int i = 0; i < n; ++i) {
         if (score_size == 3) {
           scores_slice = scores->Slice(i, i + 1);
           boxes_slice = boxes->Slice(i, i + 1);
           scores_slice.Resize({score_dims[1], score_dims[2]});
           boxes_slice.Resize({score_dims[2], box_dim});
+          if (return_index) {
+            offset = i * score_dims[2];
+          }
         } else {
           auto boxes_lod = boxes->lod().back();
           scores_slice = scores->Slice(boxes_lod[i], boxes_lod[i + 1]);
           boxes_slice = boxes->Slice(boxes_lod[i], boxes_lod[i + 1]);
+          if (return_index) {
+            offset = boxes_lod[i] * score_dims[1];
+          }
         }
         int64_t s = batch_starts[i];
         int64_t e = batch_starts[i + 1];
         if (e > s) {
           Tensor out = outs->Slice(s, e);
+          if (return_index) {
+            int* output_idx =
+                index->mutable_data<int>({num_kept, 1}, ctx.GetPlace());
+            oindices = output_idx + s;
+          }
           MultiClassOutput(dev_ctx, scores_slice, boxes_slice, all_indices[i],
-                           score_dims.size(), &out);
+                           score_dims.size(), &out, oindices, offset);
         }
       }
     }
     framework::LoD lod;
     lod.emplace_back(batch_starts);
-
+    if (return_index) {
+      index->set_lod(lod);
+    }
     outs->set_lod(lod);
   }
 };
@@ -519,13 +547,45 @@ This operator supports multi-class and batched inputs. It applies NMS
 independently for each class. The output is a 2-D LoDTensor; for each image,
 the offsets in the first dimension of the LoDTensor are called LoD, and the
 number of offsets is N + 1, where N is the batch size. If LoD[i + 1] - LoD[i] == 0,
-means there is no detected bbox for this image. If there is no detected boxes
-for all images, all the elements in LoD are set to {1}, and the Out only
-contains one value which is -1.
+means there is no detected bbox for this image.
 )DOC");
   }
 };
+class MultiClassNMS2Op : public MultiClassNMSOp {
+ public:
+  MultiClassNMS2Op(const std::string& type,
+                   const framework::VariableNameMap& inputs,
+                   const framework::VariableNameMap& outputs,
+                   const framework::AttributeMap& attrs)
+      : MultiClassNMSOp(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    MultiClassNMSOp::InferShape(ctx);
+
+    auto box_dims = ctx->GetInputDim("BBoxes");
+    auto score_dims = ctx->GetInputDim("Scores");
+    auto score_size = score_dims.size();
+    if (score_size == 3) {
+      ctx->SetOutputDim("Index", {box_dims[1], 1});
+    } else {
+      ctx->SetOutputDim("Index", {-1, 1});
+    }
+  }
+};
+
+class MultiClassNMS2OpMaker : public MultiClassNMSOpMaker {
+ public:
+  void Make() override {
+    MultiClassNMSOpMaker::Make();
+    AddOutput("Index",
+              "(LoDTensor) A 2-D LoDTensor with shape [No, 1] represents the "
+              "index of selected bbox.
The index is the absolute index cross " + "batches.") + .AsIntermediate(); + } +}; + } // namespace operators } // namespace paddle @@ -535,3 +595,8 @@ REGISTER_OPERATOR(multiclass_nms, ops::MultiClassNMSOp, paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL(multiclass_nms, ops::MultiClassNMSKernel, ops::MultiClassNMSKernel); +REGISTER_OPERATOR(multiclass_nms2, ops::MultiClassNMS2Op, + ops::MultiClassNMS2OpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(multiclass_nms2, ops::MultiClassNMSKernel, + ops::MultiClassNMSKernel); diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc index 6628dde5..ce10de40 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc @@ -128,11 +128,11 @@ void get_transform_matrix(const int transformed_width, T estimated_width = (len1 + len3) / 2.0; // Get the normalized height and normalized width - int normalized_height = transformed_height; + int normalized_height = std::max(2, transformed_height); int normalized_width = std::round(estimated_width * (normalized_height - 1) / estimated_height) + 1; - normalized_width = std::min(normalized_width, transformed_width); + normalized_width = std::max(2, std::min(normalized_width, transformed_width)); T dx1 = x1 - x2; T dx2 = x3 - x2; @@ -141,9 +141,9 @@ void get_transform_matrix(const int transformed_width, T dy2 = y3 - y2; T dy3 = y0 - y1 + y2 - y3; - matrix[6] = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1) / + matrix[6] = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1 + 1e-5) / (normalized_width - 1); - matrix[7] = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1) / + matrix[7] = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1 + 1e-5) / (normalized_height - 1); matrix[8] = 1; diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu index 19df68fa..8c9ca946 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu @@ -242,10 +242,10 @@ __device__ void get_transform_matrix(const int transformed_width, T estimated_width = (len1 + len3) / 2.0; // Get the normalized height and normalized width - int normalized_height = transformed_height; + int normalized_height = max(2, transformed_height); int normalized_width = round(estimated_width * (normalized_height - 1) / estimated_height) + 1; - normalized_width = min(normalized_width, transformed_width); + normalized_width = max(2, min(normalized_width, transformed_width)); T dx1 = x1 - x2; T dx2 = x3 - x2; @@ -254,9 +254,9 @@ __device__ void get_transform_matrix(const int transformed_width, T dy2 = y3 - y2; T dy3 = y0 - y1 + y2 - y3; - matrix[6] = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1) / + matrix[6] = (dx3 * dy2 - dx2 * dy3) / (dx1 * dy2 - dx2 * dy1 + 1e-5) / (normalized_width - 1); - matrix[7] = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1) / + matrix[7] = (dx1 * dy3 - dx3 * dy1) / (dx1 * dy2 - dx2 * dy1 + 1e-5) / (normalized_height - 1); matrix[8] = 1; diff --git a/paddle/fluid/operators/detection/target_assign_op.h b/paddle/fluid/operators/detection/target_assign_op.h index 7f989dfc..691e3276 100644 --- a/paddle/fluid/operators/detection/target_assign_op.h +++ b/paddle/fluid/operators/detection/target_assign_op.h @@ -14,7 +14,6 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/for_range.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu index 5a882958..08ea62bc 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cu +++ b/paddle/fluid/operators/detection/yolo_box_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/detection/yolo_box_op.h" #include "paddle/fluid/operators/math/math_function.h" @@ -84,10 +85,8 @@ class YoloBoxOpCUDAKernel : public framework::OpKernel { int input_size = downsample_ratio * h; auto& dev_ctx = ctx.cuda_device_context(); - auto& allocator = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); int bytes = sizeof(int) * anchors.size(); - auto anchors_ptr = allocator.Allocate(sizeof(int) * anchors.size()); + auto anchors_ptr = memory::Alloc(dev_ctx, sizeof(int) * anchors.size()); int* anchors_data = reinterpret_cast(anchors_ptr->ptr()); const auto gplace = boost::get(ctx.GetPlace()); const auto cplace = platform::CPUPlace(); diff --git a/paddle/fluid/operators/dgc_op.h b/paddle/fluid/operators/dgc_op.h index 8d1683bd..1285daae 100644 --- a/paddle/fluid/operators/dgc_op.h +++ b/paddle/fluid/operators/dgc_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include "dgc/dgc.h" #include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" namespace paddle { @@ -23,14 +24,14 @@ namespace operators { inline float get_period_sparcity(const std::vector& sparsity, float cur_step, float rampup_steps) { - PADDLE_ENFORCE(static_cast(cur_step) >= 0); + PADDLE_ENFORCE_GE(static_cast(cur_step), 0); size_t idx = static_cast(cur_step * sparsity.size() / rampup_steps); if (idx >= sparsity.size()) { return 0.999; } - PADDLE_ENFORCE(idx < sparsity.size()); + PADDLE_ENFORCE_LT(idx, sparsity.size()); return sparsity[idx]; } @@ -63,7 +64,8 @@ class DGCOpKernel : public framework::OpKernel { float ratio = 1 - get_period_sparcity(sparsity, static_cast(*current_step), rampup_step); - PADDLE_ENFORCE(ratio > 0.0 && ratio < 1.0); + PADDLE_ENFORCE_GE(ratio, 0.0); + PADDLE_ENFORCE_LT(ratio, 1.0); int k = static_cast(g->numel() * ratio); VLOG(10) << "m:" << m << ", use_nesterov:" << use_nesterov @@ -111,9 +113,7 @@ class DGCOpKernel : public framework::OpKernel { framework::DDim{2 * k}, ctx.GetPlace()); int buf_size = paddle::communication::dgc::get_buffer_size(k); - auto& allocator = platform::DeviceTemporaryAllocator::Instance().Get( - ctx.GetPlace(), dev_ctx.stream()); - auto tmp_ious_data = allocator.Allocate(buf_size); + auto tmp_ious_data = memory::Alloc(dev_ctx, buf_size); void* buf = reinterpret_cast(tmp_ious_data->ptr()); if (!paddle::communication::dgc::k_select( diff --git a/paddle/fluid/operators/distributed/collective_server_test.cc b/paddle/fluid/operators/distributed/collective_server_test.cc index 90f2f9fd..be8c7a7d 100644 --- a/paddle/fluid/operators/distributed/collective_server_test.cc +++ b/paddle/fluid/operators/distributed/collective_server_test.cc @@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ +#include #include +#include #include #include // NOLINT @@ -98,6 +100,9 @@ void Gather(const std::vector& vars, } TEST(CollectiveServer, GPU) { + setenv("http_proxy", "", 1); + setenv("https_proxy", "", 1); + platform::CUDAPlace place; platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& ctx = *pool.Get(place); diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index af277d69..683d4ca9 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -26,18 +26,17 @@ limitations under the License. */ #include "paddle/fluid/operators/distributed/parameter_recv.h" #include "paddle/fluid/operators/distributed/parameter_send.h" +DECLARE_int32(communicator_max_merge_var_num); +DECLARE_int32(communicator_send_queue_size); + DEFINE_bool(communicator_independent_recv_thread, true, "use an independent to recv vars from parameter server"); -DEFINE_int32(communicator_send_queue_size, 20, - "queue size to recv gradient before send"); DEFINE_int32(communicator_min_send_grad_num_before_recv, 20, "max grad num to send before recv parameters"); DEFINE_int32(communicator_thread_pool_size, 5, "thread num to do send or recv"); DEFINE_int32(communicator_send_wait_times, 5, "times that send thread will wait if merge num does not reach " "max_merge_var_num"); -DEFINE_int32(communicator_max_merge_var_num, 20, - "max var num to merge and send"); DEFINE_bool(communicator_fake_rpc, false, "fake mode does not really send any thing"); DEFINE_bool(communicator_merge_sparse_grad, true, @@ -77,14 +76,26 @@ Communicator::Communicator(const RpcCtxMap &send_varname_to_ctx, VLOG(0) << "communicator_fake_rpc: " << FLAGS_communicator_fake_rpc; VLOG(0) << "communicator_merge_sparse_grad: " << FLAGS_communicator_merge_sparse_grad; - send_scope_.reset(new Scope()); - for (auto &iter : send_varname_to_ctx_) { - send_varname_to_queue_[iter.first] = - std::make_shared>>( - FLAGS_communicator_send_queue_size); + + if (send_varname_to_ctx.size() == 0) { + VLOG(0) << "nothing need to be send, will not start send_thread"; + } else { + send_scope_.reset(new Scope()); + for (auto &iter : send_varname_to_ctx_) { + send_varname_to_queue_[iter.first] = + std::make_shared>>( + FLAGS_communicator_send_queue_size); + } + send_threadpool_.reset( + new ::ThreadPool(FLAGS_communicator_thread_pool_size)); + } + + if (recv_varname_to_ctx.size() == 0) { + VLOG(0) << "nothing need to be received, will not start recv_thread"; + } else { + recv_threadpool_.reset( + new ::ThreadPool(FLAGS_communicator_thread_pool_size)); } - send_threadpool_.reset(new ::ThreadPool(FLAGS_communicator_thread_pool_size)); - recv_threadpool_.reset(new ::ThreadPool(FLAGS_communicator_thread_pool_size)); } Communicator::~Communicator() { @@ -161,18 +172,28 @@ void Communicator::SendThread() { task_f.wait(); } auto after_run_send_graph = GetCurrentUS(); - auto send_graph_use_time = after_run_send_graph - before_run_send_graph; - if (send_graph_use_time > 100) { - VLOG(1) << "run send graph use time " - << after_run_send_graph - before_run_send_graph; - } - if (!FLAGS_communicator_independent_recv_thread) { - RecvAll(); - } + + VLOG(3) << "run send graph use time " + << after_run_send_graph - before_run_send_graph; + RecvNonIndependent(); } VLOG(0) << "communicator stopped, send thread exit"; } +void 
Communicator::RecvNonIndependent() {
+  if (FLAGS_communicator_independent_recv_thread) {
+    return;
+  }
+
+  auto grad_num = grad_num_.load();
+  if (grad_num > 0) {
+    RecvAll();
+    grad_num_.store(0);
+  } else {
+    std::this_thread::sleep_for(std::chrono::milliseconds(10));
+  }
+}
+
 void Communicator::RecvAll() {
   VLOG(3) << "parallel run recv graph";
   if (!running_) return;
diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h
index 6db02fc8..b3079f51 100644
--- a/paddle/fluid/operators/distributed/communicator.h
+++ b/paddle/fluid/operators/distributed/communicator.h
@@ -134,6 +134,8 @@ inline void MergeVars(const std::string& var_name,
       auto in = EigenVector<float>::Flatten(in_t);
       result.device(*cpu_ctx.eigen_device()) = result + in;
     }
+    result.device(*cpu_ctx.eigen_device()) =
+        result / static_cast<float>(vars.size());
   } else if (var0->IsType<framework::SelectedRows>()) {
     auto& slr0 = var0->Get<framework::SelectedRows>();
     auto* out_slr = out_var->GetMutable<framework::SelectedRows>();
@@ -144,10 +146,10 @@
     for (auto& var : vars) {
       inputs.push_back(&var->Get<framework::SelectedRows>());
     }
-    math::scatter::MergeAdd<paddle::platform::CPUDeviceContext, float>
-        merge_add;
     auto dev_ctx = paddle::platform::CPUDeviceContext();
-    merge_add(dev_ctx, inputs, out_slr, false);
+    math::scatter::MergeAverage<paddle::platform::CPUDeviceContext, float>
+        merge_average;
+    merge_average(dev_ctx, inputs, out_slr);
     VLOG(3) << "merge " << var_name
             << " SelectedRows height: " << slr0.height()
             << " dims: " << slr0.value().dims();
   } else {
@@ -175,6 +177,7 @@ class Communicator {
  private:
   // recv all parameter
   void RecvAll();
+  void RecvNonIndependent();
   void SendThread();
   void RecvThread();
diff --git a/paddle/fluid/operators/distributed/communicator_test.cc b/paddle/fluid/operators/distributed/communicator_test.cc
index 5294ac33..66e36d01 100644
--- a/paddle/fluid/operators/distributed/communicator_test.cc
+++ b/paddle/fluid/operators/distributed/communicator_test.cc
@@ -42,6 +42,7 @@ TEST(communicator, merge_lod_tensors) {
     }
     out_value += static_cast<float>(i);
   }
+  out_value = out_value / 10.0;
   const std::string out_name = "Out";
   std::unique_ptr<framework::Scope> scope;
   scope.reset(new framework::Scope());
@@ -95,7 +96,7 @@ TEST(communicator, merge_selected_rows) {
   std::vector<float> out_values;
   out_values.reserve(10);
   for (auto i = 0; i < 10; ++i) {
-    out_values.push_back(static_cast<float>(i * (10 - i)));
+    out_values.push_back(static_cast<float>((i * (10 - i)) / 10.0));
   }
   for (auto i = 0; i < out_slr.rows().size(); ++i) {
     ASSERT_EQ(out_slr.rows()[i], i);
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.cc b/paddle/fluid/operators/distributed/grpc/grpc_client.cc
index 8504110c..053fe202 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc/grpc_client.cc
@@ -73,36 +73,53 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
   const std::string var_name_val = var_name;
   const framework::Scope* p_scope = &scope;
   const auto ch = GetChannel(ep_val);
-  SendProcessor* s = new SendProcessor(ch);
   const std::string method = kSendRPC;
-  VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope));
-  s->Prepare(h, time_out);
-  framework::AsyncIO([var_name_val, p_scope, p_ctx, s, method, h, this] {
-    auto* var = p_scope->FindVar(var_name_val);
+  int retry_times_ = 0;
+
+  while (true) {
+    SendProcessor* s = new SendProcessor(ch);
+    VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope));
+    s->Prepare(h, time_out);
-    ::grpc::ByteBuffer req;
-    SerializeToByteBuffer(var_name_val, var, *p_ctx, &req, "", trainer_id_);
+
framework::AsyncIO([var_name_val, p_scope, p_ctx, s, method, h, this] { + auto* var = p_scope->FindVar(var_name_val); - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; + ::grpc::ByteBuffer req; + SerializeToByteBuffer(var_name_val, var, *p_ctx, &req, "", trainer_id_); - // stub context - s->response_call_back_ = nullptr; + VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - platform::RecordRPCEvent record_event(method); + // stub context + s->response_call_back_ = nullptr; - auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); + platform::RecordRPCEvent record_event(method); - if (UNLIKELY(platform::IsProfileEnabled())) { + auto call = s->stub_g_.PrepareUnaryCall( + s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, + &cq_); + call->StartCall(); + call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); + + if (UNLIKELY(platform::IsProfileEnabled())) { + h->Wait(); + } + }); + req_count_++; + + if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) { h->Wait(); + if (h->should_retry) { + VLOG(3) << "rpc call failed, retry times " << retry_times_; + retry_times_++; + std::random_device rd; + std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5)); + continue; + } } - }); - req_count_++; - return h; + return h; + } } void ProcGetResponse(const VarHandle& var_h, @@ -169,42 +186,57 @@ VarHandlePtr GRPCClient::_AsyncGetVar( const std::string table_name_val = table_name; const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); - GetProcessor* s = new GetProcessor(ch); - VarHandlePtr h(new VarHandle(ep, method, out_varname_val, p_ctx, p_scope)); - s->Prepare(h, time_out); + int retry_times_ = 0; + + while (true) { + GetProcessor* s = new GetProcessor(ch); - framework::AsyncIO([var_name_val, out_varname_val, table_name_val, s, method, - p_ctx, h, rpc_path, this] { - // prepare input - sendrecv::VariableMessage req; - req.set_varname(var_name_val); - req.set_out_varname(out_varname_val); - req.set_trainer_id(trainer_id_); - req.set_table_name(table_name_val); - ::grpc::ByteBuffer buf; - RequestToByteBuffer(req, &buf); + VarHandlePtr h(new VarHandle(ep, method, out_varname_val, p_ctx, p_scope)); + s->Prepare(h, time_out); - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; + framework::AsyncIO([var_name_val, out_varname_val, table_name_val, s, + method, p_ctx, h, rpc_path, this] { + // prepare input + sendrecv::VariableMessage req; + req.set_varname(var_name_val); + req.set_out_varname(out_varname_val); + req.set_trainer_id(trainer_id_); + req.set_table_name(table_name_val); + ::grpc::ByteBuffer buf; + RequestToByteBuffer(req, &buf); - // stub context - s->response_call_back_ = ProcGetResponse; + VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - platform::RecordRPCEvent record_event(method); + // stub context + s->response_call_back_ = ProcGetResponse; - auto call = - s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); + platform::RecordRPCEvent record_event(method); + + auto call = + s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_); + call->StartCall(); + call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); + + if (UNLIKELY(platform::IsProfileEnabled())) { + h->Wait(); + } + }); + req_count_++; - if (UNLIKELY(platform::IsProfileEnabled())) { + 
if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) { h->Wait(); + if (h->should_retry) { + VLOG(3) << "rpc call failed, retry times " << retry_times_; + retry_times_++; + std::random_device rd; + std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5)); + continue; + } } - }); - - req_count_++; - return h; + return h; + } } VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep, @@ -221,41 +253,55 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep, const std::string table_name_val = table_name; const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); - GetProcessor* s = new GetProcessor(ch); const std::string method = kPrefetchRPC; + int retry_times_ = 0; - VarHandlePtr h(new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); - s->Prepare(h, time_out); + while (true) { + GetProcessor* s = new GetProcessor(ch); + VarHandlePtr h(new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); + s->Prepare(h, time_out); - framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, - s, method, h, table_name_val, this] { - auto* var = p_scope->FindVar(in_var_name_val); + framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, + p_ctx, s, method, h, table_name_val, this] { + auto* var = p_scope->FindVar(in_var_name_val); - ::grpc::ByteBuffer req; - SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val, - 0, table_name_val); + ::grpc::ByteBuffer req; + SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, + out_var_name_val, 0, table_name_val); - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; + VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - // stub context - s->response_call_back_ = ProcGetResponse; + // stub context + s->response_call_back_ = ProcGetResponse; - platform::RecordRPCEvent record_event(method); + platform::RecordRPCEvent record_event(method); - auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req, - &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, static_cast(s)); + auto call = s->stub_g_.PrepareUnaryCall( + s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req, + &cq_); + call->StartCall(); + call->Finish(&s->reply_, &s->status_, static_cast(s)); - if (UNLIKELY(platform::IsProfileEnabled())) { + if (UNLIKELY(platform::IsProfileEnabled())) { + h->Wait(); + } + }); + req_count_++; + + if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) { h->Wait(); + if (h->should_retry) { + VLOG(3) << "rpc call failed, retry times " << retry_times_; + retry_times_++; + std::random_device rd; + std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5)); + continue; + } } - }); - req_count_++; - return h; + return h; + } } VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep, @@ -420,6 +466,14 @@ void GRPCClient::Proceed() { ok_ = false; } c->Finish(false); + } else if (c->status_.error_code() == grpc::StatusCode::UNAVAILABLE) { + VLOG(3) << c->GetVarHandlePtr()->String() + << " meets grpc error, error_code:" << c->status_.error_code() + << " error_message:" << c->status_.error_message() + << " error_details:" << c->status_.error_details() + << " should retry!"; + c->GetVarHandlePtr()->should_retry = true; + c->Finish(false); } else { LOG(FATAL) << c->GetVarHandlePtr()->String() << " meets grpc error, error_code:" << c->status_.error_code() @@ -449,7 +503,7 @@ void GRPCClient::Proceed() { // destructed at this 
moment. if (FLAGS_v >= 3) { std::string msg("GRPCClient Proceed end"); - fwrite(msg.c_str(), msg.length(), 1, stdout); + fwrite(msg.c_str(), msg.length(), 1, stderr); } } diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc index 0e8d877e..c8b8561d 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include #include #include #include +#include #include #include "paddle/fluid/operators/distributed/parameter_prefetch.h" @@ -78,45 +80,64 @@ static void SplitIdsIntoMultipleVarsBySection( } } -static void MergeMultipleVarsIntoOneBySection( - const std::string& id_name, const std::vector& ids_vector, - const std::string& out_name, const std::vector& out_var_names, - const std::vector& height_section, - const std::vector>& splited_ids, - const framework::ExecutionContext& context, framework::Scope* scope, - platform::DeviceContext* actual_ctx) { - PADDLE_ENFORCE_EQ(out_var_names.size(), height_section.size(), ""); +typedef std::vector> TableAndEndpoints; - auto cpu_place = platform::CPUPlace(); +void prefetch_core( + const std::vector& ids, const TableAndEndpoints& tables, + const std::vector& height_sections, + const framework::ExecutionContext& context, const framework::Scope& scope, + std::unordered_map>* recved_vec_map) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& actual_ctx = *pool.Get(context.GetPlace()); - auto abs_sections = ToAbsoluteSection(height_section); - std::unordered_map> id_to_offset; - for (size_t i = 0; i < ids_vector.size(); ++i) { - id_to_offset[ids_vector[i]].push_back(i); + std::unique_ptr local_scope = scope.NewTmpScope(); + + std::vector in_var_names; + std::vector out_var_names; + for (size_t i = 0; i < tables.size(); ++i) { + in_var_names.push_back("prefetch_send@" + tables[i].second); + out_var_names.push_back("prefetch_recv@" + tables[i].second); } - auto& id_tensor = scope->FindVar(id_name)->Get(); - auto* out_tensor = - scope->FindVar(out_name)->GetMutable(); + auto splited_ids = SplitIds(ids, height_sections); + SplitIdsIntoMultipleVarsBySection(in_var_names, height_sections, splited_ids, + local_scope.get()); + + // create output var in local scope + for (auto& name : out_var_names) { + local_scope->Var(name)->GetMutable(); + } - PADDLE_ENFORCE_GT( - out_tensor->numel(), 0, - "When calling this method, the LoDTensor's numel must larger than zero. 
" - "Please check LoDTensor::Resize has been called first."); + distributed::RPCClient* rpc_client = + distributed::RPCClient::GetInstance( + context.Attr("trainer_id")); - auto* out_tensor_data = out_tensor->mutable_data(id_tensor.place()); + std::vector rets; + for (size_t i = 0; i < in_var_names.size(); i++) { + if (NeedSend(*local_scope.get(), in_var_names[i])) { + VLOG(3) << "sending " << in_var_names[i] << " to " << tables[i].second + << " to get " << out_var_names[i] << " back"; + rets.push_back(rpc_client->AsyncPrefetchVar( + tables[i].second, actual_ctx, *local_scope.get(), in_var_names[i], + out_var_names[i], tables[i].first)); + } else { + VLOG(3) << "don't send no-initialied variable: " << out_var_names[i]; + } + } - bool is_on_cpu_place = true; - if (!platform::is_cpu_place(id_tensor.place())) { - is_on_cpu_place = false; + for (size_t i = 0; i < rets.size(); i++) { + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); } + PADDLE_ENFORCE_EQ(out_var_names.size(), height_sections.size(), ""); + + auto abs_sections = ToAbsoluteSection(height_sections); for (size_t section_idx = 0; section_idx < out_var_names.size(); ++section_idx) { auto& ids_in_this_section = splited_ids[section_idx]; if (!ids_in_this_section.empty()) { - auto& prefetch_out_var = - scope->Var(out_var_names[section_idx])->Get(); + auto& prefetch_out_var = local_scope->Var(out_var_names[section_idx]) + ->Get(); const auto* out_var_data = prefetch_out_var.data(); auto& dims = prefetch_out_var.dims(); @@ -128,26 +149,9 @@ static void MergeMultipleVarsIntoOneBySection( for (int64_t i = 0; i < dims[0]; ++i) { auto id = ids_in_this_section[i]; auto origin_id = id + abs_sections[section_idx]; - auto& offsets = id_to_offset[origin_id]; - for (auto& offset : offsets) { - // should support GPU tensor - if (is_on_cpu_place) { - memory::Copy(cpu_place, out_tensor_data + offset * row_numel, - cpu_place, out_var_data + i * row_numel, - sizeof(float) * row_numel); - } else { -#ifndef PADDLE_WITH_CUDA - PADDLE_THROW("paddle is not compiled with CUDA!"); -#else - auto stream = - static_cast(actual_ctx)->stream(); - memory::Copy(boost::get(id_tensor.place()), - out_tensor_data + offset * row_numel, cpu_place, - out_var_data + i * row_numel, - sizeof(float) * row_numel, stream); -#endif - } - } + std::vector vecs(row_numel); + std::copy_n(out_var_data + i * row_numel, row_numel, vecs.begin()); + (*recved_vec_map)[origin_id] = vecs; } } else { VLOG(3) << "ids in this section is empty"; @@ -156,84 +160,107 @@ static void MergeMultipleVarsIntoOneBySection( } void prefetch(const std::string& id_name, const std::string& out_name, + const std::string& persistable_var_name, const bool backfill, const std::vector& table_names, - const std::vector& epmap, + const std::vector& endpoints, const std::vector& height_sections, const framework::ExecutionContext& context, const framework::Scope& scope) { - std::unique_ptr local_scope = scope.NewTmpScope(); - - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& cpu_ctx = *pool.Get(platform::CPUPlace()); - auto& actual_ctx = *pool.Get(context.GetPlace()); - - distributed::RPCClient* rpc_client = - distributed::RPCClient::GetInstance( - context.Attr("trainer_id")); + prefetchs({id_name}, {out_name}, persistable_var_name, backfill, table_names, + endpoints, height_sections, context, scope); +} - std::vector in_var_names; - std::vector out_var_names; - for (size_t i = 0; i < epmap.size(); ++i) { - in_var_names.push_back(id_name + "@" + epmap[i]); - 
out_var_names.push_back(out_name + "@" + epmap[i]);
+void prefetchs(const std::vector<std::string>& id_var_names,
+               const std::vector<std::string>& out_var_names,
+               const std::string& persistable_var_name, const bool backfill,
+               const std::vector<std::string>& table_names,
+               const std::vector<std::string>& endpoints,
+               const std::vector<int64_t>& height_sections,
+               const framework::ExecutionContext& context,
+               const framework::Scope& scope) {
+  PADDLE_ENFORCE_GT(id_var_names.size(), 0, "");
+  PADDLE_ENFORCE_EQ(id_var_names.size(), out_var_names.size(), "");
+  PADDLE_ENFORCE_EQ(table_names.size(), endpoints.size(), "");
+  PADDLE_ENFORCE_EQ(table_names.size(), height_sections.size(), "");
+
+  auto* reconstruct_var =
+      scope.FindVar(persistable_var_name)->GetMutable<framework::LoDTensor>();
+  const auto vec_dim_1 = reconstruct_var->dims()[1];
+
+  const auto place =
+      scope.FindVar(id_var_names[0])->Get<framework::LoDTensor>().place();
+
+  if (!platform::is_cpu_place(place)) {
+    PADDLE_THROW("multi prefetch only supports CPU currently");
+  }
+
+  std::vector<std::vector<int64_t>> ids_group;
+  std::vector<int64_t> ids_union;
+  std::vector<framework::LoD> ids_lods;
+  TableAndEndpoints tables;
+
+  for (auto& id_name : id_var_names) {
+    auto& id_tensor = scope.FindVar(id_name)->Get<framework::LoDTensor>();
+    auto* id_data = id_tensor.data<int64_t>();
+    std::vector<int64_t> ids;
+
+    for (int64_t i = 0; i < id_tensor.numel(); ++i) {
+      ids.push_back(id_data[i]);
+      ids_union.push_back(id_data[i]);
+    }
+    ids_group.push_back(ids);
+    ids_lods.push_back(id_tensor.lod());
+  }
+
+  std::unordered_set<int64_t> s(ids_union.begin(), ids_union.end());
+  ids_union.assign(s.begin(), s.end());
+
+  for (int i = 0; i < table_names.size(); i++) {
+    tables.push_back(std::make_pair(table_names[i], endpoints[i]));
+  }
+
+  std::unordered_map<int64_t, std::vector<float>> recved_vec_map;
+  prefetch_core(ids_union, tables, height_sections, context, scope,
+                &recved_vec_map);
+
+  auto padding_idx = distributed::kNoPadding;
+
+  if (context.HasAttr("padding_idx")) {
+    padding_idx = context.Attr<int64_t>("padding_idx");
+  }
+
+  // copy vectors to out vars
+  for (int i = 0; i < out_var_names.size(); i++) {
+    auto& ids = ids_group[i];
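+    // Each output LoDTensor below is resized to [ids.size(), vec_dim_1] and
+    // filled row by row from recved_vec_map; when the op carries a
+    // "padding_idx" attribute, rows whose id equals padding_idx are
+    // zero-filled instead of copied from the prefetched table.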
+ auto* out_t = + scope.FindVar(out_var_names[i])->GetMutable(); + out_t->Resize( + framework::make_ddim({static_cast(ids.size()), vec_dim_1})); + out_t->set_lod(ids_lods[i]); + + auto* out_d = out_t->mutable_data(place); + + for (int idx = 0; idx < ids.size(); idx++) { + const auto& id = ids[idx]; + + if (padding_idx != distributed::kNoPadding && id == padding_idx) { + memset(out_d + idx * vec_dim_1, 0, sizeof(float) * vec_dim_1); + } else { + std::copy_n(recved_vec_map[id].begin(), vec_dim_1, + out_d + idx * vec_dim_1); + } + } } - MergeMultipleVarsIntoOneBySection(id_name, ids_vector, out_name, - out_var_names, height_sections, splited_ids, - context, local_scope.get(), &actual_ctx); + if (backfill) { + VLOG(3) << "backfill persistable var's id with vecs"; + + auto* reconstruct_d = reconstruct_var->data(); + for (auto& id : ids_union) { + std::copy(recved_vec_map[id].begin(), recved_vec_map[id].end(), + reconstruct_d + id * vec_dim_1); + } + } } }; // namespace distributed diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.h b/paddle/fluid/operators/distributed/parameter_prefetch.h index 0429ec44..a531c87f 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.h +++ b/paddle/fluid/operators/distributed/parameter_prefetch.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include "paddle/fluid/framework/operator.h" @@ -23,61 +24,25 @@ namespace paddle { namespace operators { namespace distributed { +constexpr int64_t kNoPadding = -1; + +void prefetchs(const std::vector& id_var_names, + const std::vector& out_var_names, + const std::string& persistable_var_name, const bool backfill, + const std::vector& table_names, + const std::vector& endpoints, + const std::vector& height_sections, + const framework::ExecutionContext& context, + const framework::Scope& scope); + void prefetch(const std::string& id_name, const std::string& out_name, + const std::string& persistable_var_name, const bool backfill, const std::vector& table_names, - const std::vector& epmap, + const std::vector& endpoints, const std::vector& height_sections, const framework::ExecutionContext& context, const framework::Scope& scope); -template -void prefetch_with_reconstruct(const std::string& id_name, - const std::string& out_name, - const std::vector& table_names, - const std::vector& epmap, - const std::vector& height_sections, - const framework::ExecutionContext& context, - const framework::Scope& scope, - framework::LoDTensor* original) { - prefetch(id_name, out_name, table_names, epmap, height_sections, context, - scope); - auto& out = scope.FindVar(out_name)->Get(); - auto& ids = scope.FindVar(id_name)->Get(); - auto* original_value = original->data(); - auto* out_value = out.data(); - size_t original_width = original->numel() / original->dims()[0]; - - bool is_on_cpu_place = true; - if (!platform::is_cpu_place(ids.place())) { - is_on_cpu_place = false; - } - if (is_on_cpu_place) { - for (int64_t i = 0; i < ids.numel(); i++) { - const T* out_rows = out_value + original_width * i; - T* original_row = - original_value + original_width * ids.data()[i]; - std::memcpy(original_row, out_rows, original_width * sizeof(T)); - } - } else { -#ifndef PADDLE_WITH_CUDA - PADDLE_THROW("paddle is not compiled with CUDA!"); -#else - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& actual_ctx = *pool.Get(context.GetPlace()); - for (int64_t i = 0; i < ids.numel(); i++) { - const T* out_rows = out_value + original_width * i; - T* original_row = - original_value + 
original_width * ids.data()[i]; - auto stream = - static_cast(&actual_ctx)->stream(); - memory::Copy(boost::get(ids.place()), original_row, - platform::CPUPlace(), out_rows, original_width * sizeof(T), - stream); - } -#endif - } -} - }; // namespace distributed }; // namespace operators }; // namespace paddle diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h index de8f3018..22083d92 100644 --- a/paddle/fluid/operators/distributed/request_handler.h +++ b/paddle/fluid/operators/distributed/request_handler.h @@ -85,6 +85,8 @@ class VarHandle { virtual ~VarHandle() {} public: + bool should_retry = false; + bool Wait() { int ret = kDefaultState; { diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc index 876b764a..c2368ab1 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -116,42 +116,7 @@ bool RequestGetHandler::Handle(const std::string& varname, VLOG(3) << "copying " << varname << " to " << param_bak_name; framework::TensorCopy(t_orig, dev_ctx_->GetPlace(), t); } - if (AsyncSparseParamUpdateRecorder::GetInstance()->HasParam(varname) && - !table_name.empty()) { - std::vector updated_rows; - AsyncSparseParamUpdateRecorder::GetInstance()->GetAndClear( - varname, trainer_id, &updated_rows); - if (VLOG_IS_ON(3)) { - std::ostringstream sstream; - sstream << "["; - for (auto& row_id : updated_rows) { - sstream << row_id << ", "; - } - sstream << "]"; - VLOG(3) << "updated_rows size: " << updated_rows.size() << " " - << sstream.str(); - } - auto& origin_tensor = - scope_->FindVar(varname)->Get(); - auto* origin_tensor_data = origin_tensor.data(); - auto& dims = origin_tensor.dims(); - *outvar = scope->Var(); - auto* out_slr = (*outvar)->GetMutable(); - out_slr->set_rows(updated_rows); - out_slr->set_height(dims[0]); - auto out_dims = framework::make_ddim( - {static_cast(updated_rows.size()), dims[1]}); - auto* data = out_slr->mutable_value()->mutable_data( - out_dims, origin_tensor.place()); - auto width = dims[1]; - for (auto i = 0; i < updated_rows.size(); ++i) { - PADDLE_ENFORCE_LT(updated_rows[i], dims[0]); - memcpy(data + i * width, origin_tensor_data + updated_rows[i] * width, - sizeof(float) * width); - } - } else { - *outvar = scope_->FindVar(varname); - } + *outvar = scope_->FindVar(varname); } } return true; diff --git a/paddle/fluid/operators/distributed/rpc_client.cc b/paddle/fluid/operators/distributed/rpc_client.cc index 390e9af0..57ce5487 100644 --- a/paddle/fluid/operators/distributed/rpc_client.cc +++ b/paddle/fluid/operators/distributed/rpc_client.cc @@ -17,6 +17,7 @@ // default to 3min to avoid temprary network failures. 
DEFINE_int32(rpc_deadline, 180000, "deadline timeouts for rpc"); +DEFINE_int32(rpc_retry_times, 3, "retry times for rpc"); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h index d4be2c28..d0b971e0 100644 --- a/paddle/fluid/operators/distributed/rpc_client.h +++ b/paddle/fluid/operators/distributed/rpc_client.h @@ -25,6 +25,7 @@ #include "paddle/fluid/operators/distributed/request_handler.h" DECLARE_int32(rpc_deadline); +DECLARE_int32(rpc_retry_times); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/distributed/rpc_server_test.cc b/paddle/fluid/operators/distributed/rpc_server_test.cc index 089ea623..45e97d96 100644 --- a/paddle/fluid/operators/distributed/rpc_server_test.cc +++ b/paddle/fluid/operators/distributed/rpc_server_test.cc @@ -12,9 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include +#include #include #include // NOLINT +#include #include "gtest/gtest.h" #include "paddle/fluid/framework/block_desc.h" @@ -122,6 +125,8 @@ void StartServer(const std::string& rpc_name) { } TEST(PREFETCH, CPU) { + setenv("http_proxy", "", 1); + setenv("https_proxy", "", 1); g_req_handler.reset(new distributed::RequestPrefetchHandler(true)); g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1)); distributed::RPCClient* client = @@ -162,6 +167,8 @@ TEST(PREFETCH, CPU) { } TEST(COMPLETE, CPU) { + setenv("http_proxy", "", 1); + setenv("https_proxy", "", 1); g_req_handler.reset(new distributed::RequestSendHandler(true)); g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 2)); distributed::RPCClient* client = diff --git a/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc b/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc new file mode 100644 index 00000000..3e354791 --- /dev/null +++ b/paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc @@ -0,0 +1,166 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/distributed/parameter_prefetch.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+class DistributedLookupTableOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInputs("Ids"),
+                   "Input(Ids) of LookupTableOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("W"),
+                   "Input(W) of LookupTableOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutputs("Outputs"),
+                   "Output(Outputs) of LookupTableOp should not be null.");
+
+    auto ids_dims = ctx->GetInputsDim("Ids");
+    auto table_dims = ctx->GetInputDim("W");
+
+    PADDLE_ENFORCE_EQ(table_dims.size(), 2,
+                      "Only a 2-D 'Embedding' table is supported.");
+
+    for (auto &ids_dim : ids_dims) {
+      PADDLE_ENFORCE_EQ(ids_dim.size(), 2,
+                        "The dimension of the 'Ids' tensor must be 2.");
+      PADDLE_ENFORCE_EQ(ids_dim[1], 1,
+                        "The last dimension of the 'Ids' tensor must be 1.");
+    }
+
+    auto lookup_tables =
+        ctx->Attrs().Get>("table_names");
+    auto height_sections =
+        ctx->Attrs().Get>("height_sections");
+    auto endpoints = ctx->Attrs().Get>("endpoints");
+
+    PADDLE_ENFORCE(lookup_tables.size() == height_sections.size() &&
+                       lookup_tables.size() == endpoints.size() &&
+                       lookup_tables.size() != 0,
+                   "Attrs table_names/height_sections/endpoints must have "
+                   "the same size and cannot be 0.");
+
+    auto outputs_dims = std::vector();
+
+    for (auto &ids_dim : ids_dims) {
+      outputs_dims.push_back(
+          framework::make_ddim({ids_dim[0], table_dims[1]}));
+    }
+
+    ctx->SetOutputsDim("Outputs", outputs_dims);
+    ctx->ShareLoD("Ids", /*->*/ "Outputs");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W"));
+    return framework::OpKernelType(data_type, ctx.device_context());
+  }
+};
+
+template
+class DistributedLookupTableKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto ids_vars = context.MultiInputVar("Ids");
+    auto emb_vars = context.MultiOutput("Embeddings");
+
+    auto id_names = context.Inputs("Ids");
+    auto embedding_name = context.Inputs("W").front();
+    auto out_names = context.Outputs("Outputs");
+
+    auto lookup_tables = context.Attr>("table_names");
+    auto height_sections =
+        context.Attr>("height_sections");
+    auto endpoints = context.Attr>("endpoints");
+
+    operators::distributed::prefetchs(
+        id_names, out_names, embedding_name, false, lookup_tables, endpoints,
+        height_sections, context, context.scope());
+  }
+};
+
+class DistributedLookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Ids",
+             "(LoDTensor) Ids's type should be LoDTensor. "
+             "The ids to be looked up in W.")
+        .AsDuplicable();
+
+    AddInput("W",
+             "(Tensor) The input represents embedding tensors, "
+             "which is a learnable parameter.");
+
+    AddOutput("Outputs",
+              "(LoDTensor) The lookup results, which have the same type as W.")
+        .AsDuplicable();
+
+    AddAttr>(
+        "table_names",
+        "(string vector, such as emb_block0, emb_block1) "
+        "The names of the lookup table blocks, in the order of the input "
+        "variables, used to map each id to the right pserver.")
+        .SetDefault({""});
+
+    AddAttr>("height_sections",
+                "Height for each output SelectedRows.")
+        .SetDefault(std::vector({}));
+
+    AddAttr>(
+        "endpoints",
+        "(string vector, default 127.0.0.1:6164) "
+        "Server endpoints in the order of input variables for mapping.")
+        .SetDefault({"127.0.0.1:6164"});
+
+    AddAttr("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0);
+
+    AddAttr("padding_idx",
+            "(int64, default -1) "
+            "If the value is -1, it has no effect on the lookup. "
+            "Otherwise, output rows whose id in Ids equals this value "
+            "are padded with zeros instead of being looked up.")
+        .SetDefault(distributed::kNoPadding);
+
+    AddComment(R"DOC(
+Lookup Table Prefetch Operator.
+
+This operator performs lookups on the parameter W; the looked-up rows are
+then concatenated into the output tensors.
+
+The Ids inputs are LoDTensors whose rows contain the ids to be looked up
+in W. If an id is not yet present in the sparse table, this operator
+returns a random value and inserts that value into the table for the next
+lookup.
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(distributed_lookup_table, ops::DistributedLookupTableOp,
+                  ops::DistributedLookupTableOpMaker);
+
+REGISTER_OP_CPU_KERNEL(distributed_lookup_table,
+                       ops::DistributedLookupTableKernel);
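Note: a quick reference for how the three attributes above fit together for one logical embedding split across pservers; a minimal illustrative layout (block names, sizes, and ports are hypothetical):

    table_names     = {"emb.block0", "emb.block1"}
    height_sections = {50000, 50000}      // vocab rows held by each block
    endpoints       = {"127.0.0.1:6174", "127.0.0.1:6175"}

A row id is fetched from the block whose cumulative height range covers it, which is why InferShape requires the same non-zero size for all three attrs. With W of shape [100000, 16] and an Ids input of shape [4, 1], the matching Outputs tensor has shape [4, 16]; ids equal to padding_idx are zero-filled by the prefetch path instead of being fetched.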
diff --git a/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc b/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc
index 7275ab20..ae4b687f 100644
--- a/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc
+++ b/paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc
@@ -40,13 +40,15 @@ class FetchBarrierOp : public framework::OperatorBase {
         distributed::RPCClient::GetInstance(
             Attr("trainer_id"));
 
-    PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
-
+    std::vector rets;
     for (auto& ep : eps) {
       VLOG(3) << "fetch barrier, ep: " << ep;
-      rpc_client->AsyncSendFetchBarrier(ep);
+      rets.push_back(rpc_client->AsyncSendFetchBarrier(ep));
+    }
+
+    for (size_t i = 0; i < rets.size(); i++) {
+      PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, "internal error in RPCClient");
     }
-    PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
   }
 };
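Note: the change above (issue every AsyncSendFetchBarrier first, then wait on each returned handle) replaces the old global rpc_client->Wait(), so a failure on a single endpoint surfaces on its own handle. A minimal sketch of the shared idiom (helper name hypothetical):

    static void WaitAll(const std::vector<distributed::VarHandlePtr>& rets) {
      // Wait on each RPC handle individually so one bad endpoint is
      // reported instead of being hidden by a client-wide Wait().
      for (size_t i = 0; i < rets.size(); ++i) {
        PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, "internal error in RPCClient");
      }
    }

The same collect-then-wait shape appears below in send_barrier_op, send_op, and recv_op.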
diff --git a/paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.cc
new file mode 100644
index 00000000..07c864ee
--- /dev/null
+++ b/paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.cc
@@ -0,0 +1,279 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include   // for removing the port file
+#include
+#include
+#include
+#include   // NOLINT
+#include
+
+#include "gflags/gflags.h"
+
+#include "paddle/fluid/operators/distributed/distributed.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h"
+#include "paddle/fluid/operators/distributed/request_handler_impl.h"
+#include "paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.h"
+
+#include "paddle/fluid/platform/profiler.h"
+
+DEFINE_int32(flrpc_send_thread_num, 12, "number of threads for rpc send");
+DEFINE_int32(flrpc_get_thread_num, 12, "number of threads for rpc get");
+
+namespace paddle {
+namespace operators {
+
+void FlRunServer(std::shared_ptr service) {
+  service->StartServer();
+}
+
+static void flsplit(const std::string &str, char sep,
+                    std::vector *pieces) {
+  pieces->clear();
+  if (str.empty()) {
+    return;
+  }
+  size_t pos = 0;
+  size_t next = str.find(sep, pos);
+  while (next != std::string::npos) {
+    pieces->push_back(str.substr(pos, next - pos));
+    pos = next + 1;
+    next = str.find(sep, pos);
+  }
+  if (!str.substr(pos).empty()) {
+    pieces->push_back(str.substr(pos));
+  }
+}
+
+static void FlParallelExecuteBlocks(
+    const std::vector &parallel_blkids, framework::Executor *executor,
+    const std::vector>
+        &prepared,
+    framework::ProgramDesc *program, framework::Scope *scope) {
+  std::vector> fs;
+  for (size_t idx : parallel_blkids) {
+    fs.push_back(framework::Async([&executor, &prepared, &scope, idx]() {
+      int run_block = idx;  // thread local
+      try {
+        VLOG(3) << "running server block: " << run_block
+                << " pointer: " << prepared[run_block].get();
+        executor->RunPreparedContext(prepared[run_block].get(), scope);
+      } catch (const std::exception &e) {
+        LOG(FATAL) << "run sub program:" << idx << " error " << e.what();
+      }
+    }));
+  }
+  for (size_t i = 0; i < fs.size(); ++i) fs[i].wait();
+}
+
+FlListenAndServOp::FlListenAndServOp(const std::string &type,
+                                     const framework::VariableNameMap &inputs,
+                                     const framework::VariableNameMap &outputs,
+                                     const framework::AttributeMap &attrs)
+    : OperatorBase(type, inputs, outputs, attrs) {}
+
+FlListenAndServOp::~FlListenAndServOp() {}
+
+void FlListenAndServOp::SavePort() const {
+  // NOTE: defaults to writing the file /tmp/paddle.selected_port
+  rpc_service_->SavePort();
+}
+
+static int64_t GetTimestamp() {
+  struct timeval tp;
+  gettimeofday(&tp, NULL);
+  return tp.tv_sec * 1000 + tp.tv_usec / 1000;
+}
+
+void FlListenAndServOp::RunSyncLoop(framework::Executor *executor,
+                                    framework::ProgramDesc *program,
+                                    framework::Scope *recv_scope,
+                                    platform::DeviceContext *dev_ctx) const {
+  VLOG(2) << "RunSyncLoop";
+  size_t num_blocks = program->Size();
+  auto optimize_blocks =
+      Attr>(kOptimizeBlocks);
+  PADDLE_ENFORCE_GE(num_blocks, 2,
+                    "server program should have at least 2 blocks");
+
+  // Prepare all the server blocks
+  std::vector optimize_blocks_list;
+  for (size_t i = 1; i < program->Size(); ++i) {
+    optimize_blocks_list.push_back(i);
+  }
+  auto optimize_prepared = executor->Prepare(*program, optimize_blocks_list);
+  // Insert a placeholder for block0, which holds the current op itself.
+  // NOTE: the first block in `optimize_prepared` should never be run.
+  optimize_prepared.insert(
+      optimize_prepared.begin(),
+      std::shared_ptr(nullptr));
+
+  while (true) {
+    // Get from multiple trainers; we don't care about the order in which
+    // the gradients arrive, just add suffix 0~n and merge the gradients.
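+    // Round sketch (illustrative): with Fanin = N trainers, one pass of
+    // this loop serves a full round:
+    //   (1) answer kRequestGet until all N trainers have pulled the
+    //       current parameters (WaitBarrier on the get barrier),
+    //   (2) answer kRequestSend until all N trainers have pushed their
+    //       locally updated parameters (WaitBarrier on the send barrier),
+    //   (3) run the optimize blocks below to aggregate the N updates.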
+    VLOG(3) << "wait all clients to get pserver parameters back";
+    rpc_service_->SetCond(distributed::kRequestGet);
+    VLOG(3) << "wait all clients to send fetch_barrier";
+    rpc_service_->WaitBarrier(distributed::kRequestGet);
+
+    if (rpc_service_->IsExit()) {
+      rpc_service_->SetCond(distributed::kRequestGet);
+      break;
+    }
+
+    VLOG(3) << "wait all clients to send after_optimizer parameters";
+    rpc_service_->SetCond(distributed::kRequestSend);
+    VLOG(3) << "wait all clients to send send_barrier";
+    rpc_service_->WaitBarrier(distributed::kRequestSend);
+    VLOG(3) << "ResetBarrierCounter";
+    rpc_service_->ResetBarrierCounter();
+    // NOTE: if is_gpu_place, CUDA kernels are launched by multiple threads
+    // and this will still work.
+    // The optimize blocks which share the same parent ID run in parallel.
+    // TODO(Yancey1989): use ParallelExecutor for this in the future
+    int32_t last_parent_blkid = optimize_blocks[0]->Parent();
+    std::vector parallel_blkids;
+    parallel_blkids.push_back(optimize_blocks[0]->ID());
+    double ts = GetTimestamp();
+    for (size_t i = 1; i < optimize_blocks.size(); ++i) {
+      // skip the first optimize block because it is already in
+      // parallel_blkids.
+      int blkid = optimize_blocks[i]->ID();
+      if (program->Block(blkid).Parent() != last_parent_blkid) {
+        FlParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared,
+                                program, recv_scope);
+        parallel_blkids.clear();
+        last_parent_blkid = program->Block(blkid).Parent();
+      }
+      parallel_blkids.push_back(blkid);
+    }
+    FlParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared,
+                            program, recv_scope);
+    VLOG(3) << "run all blocks spent " << GetTimestamp() - ts << "(ms)";
+  }  // while(true)
+}
+
+static void FillRequestCtx(distributed::RequestHandler *h,
+                           framework::Scope *scope,
+                           platform::DeviceContext *dev_ctx,
+                           framework::Executor *executor,
+                           framework::ProgramDesc *program,
+                           distributed::RPCServer *rpc_server) {
+  h->SetScope(scope);
+  h->SetDevCtx(dev_ctx);
+  h->SetExecutor(executor);
+  h->SetProgram(program);
+  h->SetRPCServer(rpc_server);
+}
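+
+// Wiring sketch (illustrative): each handler registered in RunImpl below is
+// bound to the same scope/executor/program through FillRequestCtx, e.g.
+//
+//   distributed::RequestSendHandler send_h(/*sync_mode=*/true, false);
+//   FillRequestCtx(&send_h, &recv_scope, &dev_ctx, &executor, program,
+//                  rpc_service_.get());
+//
+// so a parameter received via kRequestSend is immediately visible to a
+// later kRequestGet on the same recv_scope.
+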
+void FlListenAndServOp::RunImpl(const framework::Scope &scope,
+                                const platform::Place &dev_place) const {
+  // Mark this op as a parameter server; whether to enable profiling is
+  // decided by listening to the trainer.
+  platform::SetProfileListener();
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  auto &dev_ctx = *pool.Get(dev_place);
+  framework::Scope &recv_scope = scope.NewScope();
+
+  bool sync_mode = Attr("sync_mode");
+  auto fan_in = Attr("Fanin");
+  auto inputs = Inputs("X");
+
+  PADDLE_ENFORCE_EQ(!rpc_service_, true, "rpc_service_ must be null");
+  std::string endpoint = Attr("endpoint");
+
+  VLOG(4) << "sync_mode:" << sync_mode << ", fan_in:" << fan_in
+          << ", end_point:" << endpoint;
+
+  rpc_service_.reset(new RPCSERVER_T(endpoint, fan_in));
+
+  request_send_handler_.reset(
+      new distributed::RequestSendHandler(sync_mode, false));
+  request_get_handler_.reset(
+      new distributed::RequestGetHandler(sync_mode, false));
+
+  rpc_service_->RegisterRPC(distributed::kRequestSend,
+                            request_send_handler_.get(),
+                            FLAGS_flrpc_send_thread_num);
+  rpc_service_->RegisterRPC(distributed::kRequestGet,
+                            request_get_handler_.get(),
+                            FLAGS_flrpc_get_thread_num);
+  auto optimize_blocks =
+      Attr>(kOptimizeBlocks);
+  PADDLE_ENFORCE_GE(
+      optimize_blocks.size(), 1,
+      "there should be at least 1 optimize block on the pserver side.");
+  auto *program = optimize_blocks[0]->Program();
+  framework::Executor executor(dev_place);
+
+  auto f = std::bind(FillRequestCtx, std::placeholders::_1, &recv_scope,
+                     &dev_ctx, &executor, program, rpc_service_.get());
+
+  f(request_send_handler_.get());
+  f(request_get_handler_.get());
+
+  // start listening only after all members are initialized.
+  server_thread_.reset(new std::thread(FlRunServer, rpc_service_));
+  VLOG(3) << "wait server thread to become ready...";
+  rpc_service_->WaitServerReady();
+
+  // register SIGINT (from ctrl+C) and SIGTERM (from kill) signal handlers
+  signal(SIGINT, FlSignalHandler::StopAndExit);
+  signal(SIGTERM, FlSignalHandler::StopAndExit);
+
+  // Cache the type of the received vars as `sparse_vars_` and `dense_vars_`
+  // so that we can reset them at the end of each iteration.
+  // NOTE: only used in sync update
+
+  // Write the server's selected port to a file for python to use.
+  SavePort();
+  RunSyncLoop(&executor, program, &recv_scope, &dev_ctx);
+}
+
+class FlListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "(Tensor) Variables that the server recvs.").AsDuplicable();
+    AddComment(R"DOC(
+ListenAndServ operator
+
+This operator will start an RPC server which can receive variables from
+send_op and send back variables to recv_op.
+)DOC");
+    AddAttr("endpoint",
+            "(string, default 127.0.0.1:6164) "
+            "IP address to listen on.")
+        .SetDefault("127.0.0.1:6164")
+        .AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
+    AddAttr("sync_mode", "whether to work in sync mode").SetDefault(true);
+    AddAttr("Fanin", "How many clients send to this server.")
+        .SetDefault(1);
+    AddAttr>(
+        kOptimizeBlocks, "Optimize blocks to run on the server side.")
+        .SetDefault({});
+  }
+};
+
+void FlSignalHandler::StopAndExit(int signal_num) {
+  // Do not use VLOG here: the device used for printing may already be
+  // released. exit() will release internally allocated resources.
+ auto file_path = string::Sprintf("/tmp/paddle.%d.port", ::getpid()); + remove(file_path.c_str()); + exit(0); +} + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_WITHOUT_GRADIENT(fl_listen_and_serv, ops::FlListenAndServOp, + ops::FlListenAndServOpMaker); diff --git a/paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.h b/paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.h new file mode 100644 index 00000000..1199a63d --- /dev/null +++ b/paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.h @@ -0,0 +1,91 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/operators/distributed/request_handler.h" +#include "paddle/fluid/operators/distributed/rpc_server.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +constexpr char kOptimizeBlocks[] = "optimize_blocks"; + +void FlRunServer(std::shared_ptr service); + +template +class DoubleFindMap : public std::unordered_map { + public: + typename std::unordered_map::iterator find_value(TValue v) { + return std::find_if(this->begin(), this->end(), + [&v](const std::pair p) { + return p.second == v; + }); + } +}; + +class FlListenAndServOp : public framework::OperatorBase { + public: + FlListenAndServOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs); + virtual ~FlListenAndServOp(); + + void RunSyncLoop(framework::Executor* executor, + framework::ProgramDesc* program, + framework::Scope* recv_scope, + platform::DeviceContext* dev_ctx) const; + + void SavePort() const; + + int GetSelectedPort() { return rpc_service_->GetSelectedPort(); } + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override; + + protected: + mutable std::shared_ptr rpc_service_; + mutable std::shared_ptr request_send_handler_; + mutable std::shared_ptr request_get_handler_; + + mutable std::shared_ptr server_thread_; + mutable std::vector sparse_vars_; + mutable std::vector dense_vars_; +}; + +class FlSignalHandler { + public: + static void StopAndExit(int signal_num); + + private: + DISABLE_COPY_AND_ASSIGN(FlSignalHandler); +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc index a672fb2a..14b53086 100644 --- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc +++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc @@ -511,6 +511,8 @@ class ListenAndServOpMaker : public 
framework::OpProtoAndCheckerMaker { void SignalHandler::StopAndExit(int signal_num) { // Do not use VLOG here for the device for printing maybe already released. // exit will release interal allocated resoureces. + auto file_path = string::Sprintf("/tmp/paddle.%d.port", ::getpid()); + remove(file_path.c_str()); exit(0); } diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc index b871859d..30a161fe 100644 --- a/paddle/fluid/operators/distributed_ops/recv_op.cc +++ b/paddle/fluid/operators/distributed_ops/recv_op.cc @@ -44,7 +44,7 @@ class RecvOp : public framework::OperatorBase { std::vector epmap = Attr>("epmap"); std::vector varnames = Attr>("varnames"); - int sync_mode = Attr("sync_mode"); + auto outs = Outputs("Out"); bool with_barrier = Attr("with_barrier"); @@ -64,8 +64,8 @@ class RecvOp : public framework::OperatorBase { trainer_id); recv_functor(rpc_ctx, scope); } else { + std::vector rets; if (with_barrier) { - std::vector rets; for (size_t i = 0; i < outs.size(); i++) { std::string varname = varnames.size() == 0 ? outs[i] : varnames[i]; VLOG(4) << "recv " << outs[i] << " from " << epmap[i] << " with " @@ -73,13 +73,7 @@ class RecvOp : public framework::OperatorBase { rets.push_back( rpc_client->AsyncGetVar(epmap[i], ctx, scope, varname, outs[i])); } - if (sync_mode) { - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); - } - } } else { - std::vector rets; for (size_t i = 0; i < outs.size(); i++) { std::string varname = varnames.size() == 0 ? outs[i] : varnames[i]; VLOG(4) << "recv " << outs[i] << " from " << epmap[i] << " with " @@ -87,9 +81,11 @@ class RecvOp : public framework::OperatorBase { rets.push_back(rpc_client->AsyncGetVarNoBarrier(epmap[i], ctx, scope, varname, outs[i])); } - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); - } + } + for (size_t i = 0; i < rets.size(); i++) { + VLOG(7) << "before sync_recv " << outs[i] << "from " << epmap[i]; + PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, "internal error in RPCClient"); + VLOG(7) << "after sync_recv " << outs[i] << "from " << epmap[i]; } } } @@ -112,10 +108,6 @@ This operator can get variables from server side. 
"variables for mapping") .SetDefault({}); AddAttr("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0); - AddAttr("sync_mode", - "(int, default 0)" - "sync recv or async recv.") - .SetDefault(0); AddAttr("with_barrier", "(bool, default True) if with_barrier=False, will use " "AsyncGetVarNoBarrier get variable from pserver immediately") diff --git a/paddle/fluid/operators/distributed_ops/send_barrier_op.cc b/paddle/fluid/operators/distributed_ops/send_barrier_op.cc index ae1b10c3..558d0090 100644 --- a/paddle/fluid/operators/distributed_ops/send_barrier_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_barrier_op.cc @@ -44,13 +44,16 @@ class SendBarrierOp : public framework::OperatorBase { VLOG(3) << "SendBarrierOp sync"; - // need to wait before sending send_barrier message - PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); + std::vector rets; + for (auto& ep : eps) { VLOG(3) << "send barrier, ep: " << ep; - rpc_client->AsyncSendBatchBarrier(ep); + rets.push_back(rpc_client->AsyncSendBatchBarrier(ep)); + } + + for (size_t i = 0; i < rets.size(); i++) { + PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, "internal error in RPCClient"); } - PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient"); } }; diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index 5731bcc1..acb25b17 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ -41,7 +41,6 @@ class SendOp : public framework::OperatorBase { auto ins = Inputs("X"); auto epmap = Attr>("epmap"); - int sync_send = Attr("sync_mode"); auto trainer_id = Attr("trainer_id"); auto send_varnames = Attr>("send_varnames"); @@ -75,12 +74,10 @@ class SendOp : public framework::OperatorBase { VLOG(3) << "don't send no-initialied variable: " << ins[i]; } } - if (sync_send) { - for (size_t i = 0; i < rets.size(); i++) { - VLOG(7) << "before sync_send " << ins[i] << "from " << epmap[i]; - PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); - VLOG(7) << "after sync_send " << ins[i] << "from " << epmap[i]; - } + for (size_t i = 0; i < rets.size(); i++) { + VLOG(7) << "before sync_send " << ins[i] << "from " << epmap[i]; + PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, "internal error in RPCClient"); + VLOG(7) << "after sync_send " << ins[i] << "from " << epmap[i]; } } } @@ -98,10 +95,6 @@ Send operator This operator will send variables to listen_and_serve op at the parameter server. 
)DOC"); - AddAttr("sync_mode", - "(int, default 0)" - "sync send or async send.") - .SetDefault(0); AddAttr("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0); AddAttr>("epmap", "(string vector, default 127.0.0.1:6164)" diff --git a/paddle/fluid/operators/distributed_ops/split_ids_op.cc b/paddle/fluid/operators/distributed_ops/split_ids_op.cc index 191ca1ef..d46b57e7 100644 --- a/paddle/fluid/operators/distributed_ops/split_ids_op.cc +++ b/paddle/fluid/operators/distributed_ops/split_ids_op.cc @@ -81,27 +81,12 @@ class SplitIdsOpInferVarType : public framework::VarTypeInference { } }; -class SplitIdsOpGradMaker : public framework::SingleGradOpDescMaker { - public: - using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; - - protected: - std::unique_ptr Apply() const override { - auto grad = new framework::OpDesc(); - grad->SetType("concat"); - grad->SetInput("X", OutputGrad("Out")); - grad->SetOutput("Out", InputGrad("Ids")); - grad->SetAttr("axis", 0); - return std::unique_ptr(grad); - } -}; - } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(split_ids, ops::SplitIdsOp, ops::SplitIdsOpMaker, - ops::SplitIdsOpGradMaker, ops::SplitIdsOpInferVarType); + ops::SplitIdsOpInferVarType); REGISTER_OP_CPU_KERNEL( split_ids, ops::SplitIdsOpKernel, diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu index e26eba68..3e0cb76d 100644 --- a/paddle/fluid/operators/dropout_op.cu +++ b/paddle/fluid/operators/dropout_op.cu @@ -11,14 +11,16 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include +#include #include #include #include #include #include #include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/platform/dynload/curand.h" #include "paddle/fluid/platform/float16.h" - namespace paddle { namespace operators { @@ -27,10 +29,7 @@ __global__ void RandomGenerator(const size_t n, const int seed, const float dropout_prob, const T* src, MaskType* mask_data, T* dst, bool is_upscale_in_train) { - thrust::minstd_rand rng; - rng.seed(seed); - thrust::uniform_real_distribution dist(0, 1); - + curandStatePhilox4_32_10_t state; int idx = blockDim.x * blockIdx.x + threadIdx.x; int step_size = 0; @@ -39,12 +38,12 @@ __global__ void RandomGenerator(const size_t n, const int seed, for (; idx < n; idx += blockDim.x * gridDim.x) { T s = src[idx]; if (step_size == 0) { - rng.discard(idx); + curand_init(seed, idx, idx, &state); step_size = blockDim.x * gridDim.x; } else { - rng.discard(step_size); + curand_init(seed, idx, step_size, &state); } - if (dist(rng) < dropout_prob) { + if (curand_uniform(&state) < dropout_prob) { mask = 0; dest = 0; } else { @@ -87,9 +86,10 @@ class GPUDropoutKernel : public framework::OpKernel { auto* x_data = x->data(); auto* y_data = y->mutable_data(context.GetPlace()); if (dropout_prob == 1.0f) { - PADDLE_ENFORCE(cudaMemsetAsync(y_data, 0, x_numel * sizeof(T), stream)); - PADDLE_ENFORCE(cudaMemsetAsync(mask_data, 0, - x_numel * sizeof(*mask_data), stream)); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaMemsetAsync(y_data, 0, x_numel * sizeof(T), stream)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemsetAsync( + mask_data, 0, x_numel * sizeof(*mask_data), stream)); return; } diff --git a/paddle/fluid/operators/dropout_op.h b/paddle/fluid/operators/dropout_op.h index 09c4899c..20742f9a 100644 --- a/paddle/fluid/operators/dropout_op.h +++ b/paddle/fluid/operators/dropout_op.h @@ -77,13 +77,20 @@ class CPUDropoutKernel : public framework::OpKernel { } } } else { - auto X = EigenMatrix::Reshape(*x, 1); - auto Y = EigenMatrix::Reshape(*y, 1); - auto& place = - *context.template device_context().eigen_device(); if (upscale_in_train) { - Y.device(place) = X; + const auto* X_data = x->data(); + auto* Y_data = y->mutable_data(context.GetPlace()); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int i = 0; i < x->numel(); i++) { + Y_data[i] = X_data[i]; + } } else { + auto X = EigenMatrix::Reshape(*x, 1); + auto Y = EigenMatrix::Reshape(*y, 1); + auto& place = + *context.template device_context().eigen_device(); Y.device(place) = X * static_cast(1.0f - dropout_prob); } } diff --git a/paddle/fluid/operators/elementwise/CMakeLists.txt b/paddle/fluid/operators/elementwise/CMakeLists.txt index 37be1116..94886066 100644 --- a/paddle/fluid/operators/elementwise/CMakeLists.txt +++ b/paddle/fluid/operators/elementwise/CMakeLists.txt @@ -2,3 +2,5 @@ include(operators) register_operators() cc_test(test_elementwise_add_op_inplace SRCS test_elementwise_add_op_inplace.cc DEPS op_registry elementwise_add_op scope device_context enforce executor) +cc_test(test_elementwise_div_grad_grad SRCS test_elementwise_div_grad_grad.cc DEPS op_registry elementwise_div_op scope device_context enforce executor) +cc_test(test_elementwise_add_grad_grad SRCS test_elementwise_add_grad_grad.cc DEPS op_registry elementwise_add_op scope device_context enforce executor) diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc index bf12d8a1..fd93aa44 100644 --- 
a/paddle/fluid/operators/elementwise/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc @@ -54,7 +54,9 @@ REGISTER_OPERATOR(elementwise_add_grad, ops::ElementwiseOpExplicitGrad, ops::ElementwiseGradNoBufVarsInference, ops::ElementwiseAddDoubleGradDescMaker); REGISTER_OPERATOR(elementwise_add_grad_grad, - ops::ElementwiseOpDoubleGradWithoutDXDY); + ops::ElementwiseOpDoubleGradWithoutDXDY, + ops::ElementwiseDoubleGradOpInplace, + ops::ElementwiseDoubleGradNoBufVarsInference); REGISTER_OP_CPU_KERNEL( elementwise_add, diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu index 8320272b..15b4bff0 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu @@ -36,4 +36,6 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseAddDoubleGradKernel, ops::ElementwiseAddDoubleGradKernel, ops::ElementwiseAddDoubleGradKernel, - ops::ElementwiseAddDoubleGradKernel); + ops::ElementwiseAddDoubleGradKernel, + ops::ElementwiseAddDoubleGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cc b/paddle/fluid/operators/elementwise/elementwise_div_op.cc index 6689823d..f025a845 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cc @@ -80,7 +80,8 @@ REGISTER_OPERATOR(elementwise_div, ops::ElementwiseOp, REGISTER_OPERATOR(elementwise_div_grad, ops::ElementwiseOpGrad, ops::ElementwiseDivDoubleGradDescMaker); -REGISTER_OPERATOR(elementwise_div_grad_grad, ops::ElementwiseDivOpDoubleGrad); +REGISTER_OPERATOR(elementwise_div_grad_grad, ops::ElementwiseDivOpDoubleGrad, + ops::ElementwiseDivDoubleGradOpInplace); REGISTER_OP_CPU_KERNEL( elementwise_div, diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu index b38f8484..4cd17b94 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cu @@ -37,6 +37,8 @@ REGISTER_OP_CUDA_KERNEL( elementwise_div_grad_grad, ops::ElementwiseDivDoubleGradKernel, + ops::ElementwiseDivDoubleGradKernel, ops::ElementwiseDivDoubleGradKernel, ops::ElementwiseDivDoubleGradKernel { GetDoubleGradSafeTensor(ctx, Out, ddX, &ddX_safe); GetDoubleGradSafeTensor(ctx, Y, ddY, &ddY_safe); + // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y + // dY = Out * dX * ddY / Y - dX * ddX / Y + // dOut = - dX * ddY + // To save memory, (1) dout can be used as 'tmp' tensor, (2) ddout can + // inplace ddx + Tensor tmp; if (dOut) { - // dOut = - dX * ddY - default_elementwise_mul(ctx, dX, &ddY_safe, dOut); - auto& place = - *ctx.template device_context().eigen_device(); - auto dout = framework::EigenVector::Flatten(*dOut); - dout.device(place) = static_cast(-1) * dout; + tmp = *dOut; + } else { + auto& dev_ctx = ctx.template device_context(); + tmp = ctx.AllocateTmpTensor(Out->dims(), dev_ctx); } - if (dY) { // dX_div_Y = dX / Y; - auto& dev_ctx = ctx.template device_context(); - Tensor dX_div_Y = - ctx.AllocateTmpTensor(Out->dims(), dev_ctx); + Tensor dX_div_Y = tmp; ElementwiseComputeEx, DeviceContext, T>( ctx, dX, Y, axis, DivFunctor(), &dX_div_Y); @@ -179,14 +180,25 @@ class ElementwiseDivDoubleGradKernel : public framework::OpKernel { if (ddOut) { // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y - default_elementwise_mul(ctx, Out, &ddY_safe, ddOut); + default_elementwise_mul(ctx, 
Out, &ddY_safe, &tmp); ElementwiseComputeEx, DeviceContext, T>( - ctx, &ddX_safe, ddOut, 0, SubFunctor(), ddOut); + ctx, &ddX_safe, &tmp, 0, SubFunctor(), &tmp); ElementwiseComputeEx, DeviceContext, T>( - ctx, ddOut, Y, axis, DivFunctor(), ddOut); + ctx, &tmp, Y, axis, DivFunctor(), ddOut); + } + + if (dOut) { + // dOut = - dX * ddY + default_elementwise_mul(ctx, dX, &ddY_safe, dOut); + auto& place = + *ctx.template device_context().eigen_device(); + auto dout = framework::EigenVector::Flatten(*dOut); + dout.device(place) = static_cast(-1) * dout; } } }; +DECLARE_INPLACE_OP_INFERER(ElementwiseDivDoubleGradOpInplace, {"DDX", "DDOut"}); + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.cc b/paddle/fluid/operators/elementwise/elementwise_mod_op.cc index fadebc00..451c7816 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mod_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.cc @@ -33,4 +33,6 @@ REGISTER_OP_WITHOUT_GRADIENT(elementwise_mod, ops::ElementwiseOp, REGISTER_OP_CPU_KERNEL( elementwise_mod, ops::ElementwiseModKernel, - ops::ElementwiseModKernel); + ops::ElementwiseModKernel, + ops::ElementwiseModFPKernel, + ops::ElementwiseModFPKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.cu b/paddle/fluid/operators/elementwise/elementwise_mod_op.cu index da3304a8..92991ab3 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mod_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.cu @@ -19,4 +19,6 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( elementwise_mod, ops::ElementwiseModKernel, - ops::ElementwiseModKernel); + ops::ElementwiseModKernel, + ops::ElementwiseModFPKernel, + ops::ElementwiseModFPKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.h b/paddle/fluid/operators/elementwise/elementwise_mod_op.h index 5b139fd4..e568a5dc 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mod_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.h @@ -27,6 +27,11 @@ struct ModFunctor { inline HOSTDEVICE T operator()(T a, T b) const { return a % b; } }; +template +struct ModFunctorFP { + inline HOSTDEVICE T operator()(T a, T b) const { return std::fmod(a, b); } +}; + template void elementwise_mod(const framework::ExecutionContext &ctx, const framework::Tensor *x, const framework::Tensor *y, @@ -36,6 +41,15 @@ void elementwise_mod(const framework::ExecutionContext &ctx, ModFunctor(), z); } +template +void elementwise_mod_fp(const framework::ExecutionContext &ctx, + const framework::Tensor *x, const framework::Tensor *y, + framework::Tensor *z) { + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + ModFunctorFP(), z); +} + template class ElementwiseModKernel : public framework::OpKernel { public: @@ -51,5 +65,20 @@ class ElementwiseModKernel : public framework::OpKernel { } }; +template +class ElementwiseModFPKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *x = ctx.Input("X"); + auto *y = ctx.Input("Y"); + auto *z = ctx.Output("Out"); + + z->mutable_data(ctx.GetPlace()); + + // dtype of x and y is float or double + elementwise_mod_fp(ctx, x, y, z); + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc index 0f6af96f..69900e06 100644 --- 
a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc
@@ -77,7 +77,8 @@ REGISTER_OPERATOR(elementwise_mul, ops::ElementwiseOp,
                   ops::ElementwiseMulOpGradDescMaker);
 REGISTER_OPERATOR(elementwise_mul_grad, ops::ElementwiseOpGrad,
                   ops::ElementwiseMulDoubleGradDescMaker);
-REGISTER_OPERATOR(elementwise_mul_grad_grad, ops::ElementwiseOpDoubleGrad);
+REGISTER_OPERATOR(elementwise_mul_grad_grad, ops::ElementwiseOpDoubleGrad,
+                  ops::ElementwiseMulDoubleGradOpInplace);
 
 REGISTER_OP_CPU_KERNEL(
     elementwise_mul,
diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
index d18c7e66..d3c0dcb4 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
@@ -94,4 +94,6 @@ REGISTER_OP_CUDA_KERNEL(
     ops::ElementwiseMulDoubleGradKernel,
     ops::ElementwiseMulDoubleGradKernel,
     ops::ElementwiseMulDoubleGradKernel,
-    ops::ElementwiseMulDoubleGradKernel);
+    ops::ElementwiseMulDoubleGradKernel,
+    ops::ElementwiseMulDoubleGradKernel);
diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h
index 105707b8..581caad6 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h
@@ -146,37 +146,47 @@ class ElementwiseMulDoubleGradKernel : public framework::OpKernel {
     if (ddout) ddout->mutable_data(ctx.GetPlace());
 
-    // dx = dout * ddy
-    // dy = dout * ddx
     Tensor ddx_safe, ddy_safe;
     GetDoubleGradSafeTensor(ctx, x, ddx, &ddx_safe);
     GetDoubleGradSafeTensor(ctx, y, ddy, &ddy_safe);
 
-    int axis = ctx.Attr("axis");
-    ElemwiseGradCompute, MulGradDY>(
-        ctx, ddx_safe, ddy_safe, *dout, *dout, axis, dx, dy, MulGradDX(),
-        MulGradDY());
+    // dx = dout * ddy
+    // dy = dout * ddx
     // ddout = ddx * y + x * ddy
+    // Reorder the computation to save memory: ddout can then reuse ddx's
+    // buffer in place, and dx can serve as the 'tmp' tensor:
+    // (1) dx = x * ddy
+    // (2) dy = dout * ddx
+    // (3) ddout = ddx * y
+    // (4) ddout = ddout + dx
+    // (5) dx = dout * ddy
     if (ddout) {
-      if (ddx && ddy) {
-        Tensor ddout_tmp;
-        ddout_tmp.mutable_data(ddout->dims(), ctx.GetPlace());
-
-        default_elementwise_mul(ctx, ddx, y, ddout);
-        default_elementwise_mul(ctx, x, ddy, &ddout_tmp);
-
-        auto& place =
-            *ctx.template device_context().eigen_device();
-        auto ddout_t = framework::EigenVector::Flatten(*ddout);
-        auto ddout_tmp_t = framework::EigenVector::Flatten(ddout_tmp);
-        ddout_t.device(place) = ddout_t + ddout_tmp_t;
-      } else {
-        if (ddx) default_elementwise_mul(ctx, ddx, y, ddout);
-        if (ddy) default_elementwise_mul(ctx, x, ddy, ddout);
-      }
+      // reuse dx to save memory instead of allocating a tmp tensor
+      Tensor* ddout_tmp = dx;
+
+      default_elementwise_mul(ctx, x, &ddy_safe, ddout_tmp);
+      int axis = ctx.Attr("axis");
+      // NOTE: in the following ElemwiseGradCompute the first output tensor
+      // is nullptr, so the branch that would compute it is never taken and
+      // MulGradDX is never invoked; the extra branch check has little
+      // effect on running speed.
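+      // Scalar sanity check of the reordering (illustrative values):
+      // with x = 2, y = 3, dout = 5, ddx = 7, ddy = 11:
+      //   dy    = dout * ddx        = 35
+      //   ddout = ddx * y + x * ddy = 21 + 22 = 43
+      //   dx    = dout * ddy        = 55
+      // Computing x * ddy into dx first, and deferring dx = dout * ddy to
+      // step (5), is what makes the in-place reuse of ddx's buffer for
+      // ddout safe.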
+ ElemwiseGradCompute, MulGradDY>( + ctx, ddx_safe, ddy_safe, *dout, *dout, axis, nullptr, dy, + MulGradDX(), MulGradDY()); + default_elementwise_mul(ctx, &ddx_safe, y, ddout); + + auto& place = + *ctx.template device_context().eigen_device(); + auto ddout_t = framework::EigenVector::Flatten(*ddout); + auto ddout_tmp_t = framework::EigenVector::Flatten(*ddout_tmp); + ddout_t.device(place) = ddout_t + ddout_tmp_t; + default_elementwise_mul(ctx, dout, &ddy_safe, dx); } } }; +DECLARE_INPLACE_OP_INFERER(ElementwiseMulDoubleGradOpInplace, {"DDX", "DDOut"}); + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index c251cc72..da678c5e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -264,7 +264,18 @@ class ElementwiseOpDoubleGradWithoutDXDY framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - auto input_data_type = ctx.Input("DOut")->type(); + framework::proto::VarType::Type input_data_type; + if (ctx.HasInput("DDX") == false) { + PADDLE_ENFORCE_EQ(ctx.HasInput("DDY"), true, + "Input(DDY) should not be null"); + input_data_type = ctx.Input("DDY")->type(); + } else if (ctx.HasInput("DDY") == false) { + PADDLE_ENFORCE_EQ(ctx.HasInput("DDX"), true, + "Input(DDX) should not be null"); + input_data_type = ctx.Input("DDX")->type(); + } else { + input_data_type = ctx.Input("DDX")->type(); + } #ifdef PADDLE_WITH_MKLDNN if (platform::CanMKLDNNBeUsed(ctx)) { @@ -317,23 +328,15 @@ class ElemwiseGradKernel : public framework::OpKernel { } }; -class ElementwiseOpInplace : public framework::InplaceOpInference { - public: - std::unordered_map operator()( - const framework::OpDesc &op_desc, bool use_cuda) const override { - return {{"X", "Out"}}; - } -}; - -class ElementwiseGradOpInplace : public framework::InplaceOpInference { - public: - std::unordered_map operator()( - const framework::OpDesc &op_desc, bool use_cuda) const override { - return {{framework::GradVarName("Out"), framework::GradVarName("X")}}; - } -}; +DECLARE_INPLACE_OP_INFERER(ElementwiseOpInplace, {"X", "Out"}); +DECLARE_INPLACE_OP_INFERER(ElementwiseGradOpInplace, + {framework::GradVarName("Out"), + framework::GradVarName("X")}); +DECLARE_INPLACE_OP_INFERER(ElementwiseDoubleGradOpInplace, {"DDX", "DDOut"}); DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ElementwiseGradNoBufVarsInference, "Y"); +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ElementwiseDoubleGradNoBufVarsInference, + "Y", "DOut"); } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 2b108efe..59a9c308 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -47,25 +47,65 @@ namespace operators { * 2. shape(X) = (2, 3, 4, 5), shape(Y) = (4,5) * pre=2*3, n=4*5, post=1 * x.shape(6, 20, 1) * y.shape(1, 20, 1).broadcast(6, 20, 1) + * + * New parameter: *mid_flag* is added to solve m*n*k & m*1*k + * broadcast cases. + * 3. shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1, 4, 5) + * mid_flag should not be NULL. 
+ * x.shape(2, 3, 20) * y.shape(2, 1, 20).broadcast(2, 3, 20)
  */
 inline void get_mid_dims(const framework::DDim &x_dims,
                          const framework::DDim &y_dims, const int axis,
-                         int *pre, int *n, int *post) {
+                         int *pre, int *n, int *post, int *mid_flag = NULL) {
   *pre = 1;
   *n = 1;
   *post = 1;
-  for (int i = 0; i < axis; ++i) {
-    (*pre) *= x_dims[i];
-  }
+  if (mid_flag != NULL) {
+    *mid_flag = 0;
+    int mid = 0;
+    for (int i = 0; i < axis; ++i) {
+      (*pre) *= x_dims[i];
+    }
+    for (int i = 0; i < y_dims.size(); ++i) {
+      if (x_dims[i + axis] != y_dims[i]) {
+        // only a single y_dims[i] = 1 is supported for now.
+        PADDLE_ENFORCE_EQ(*mid_flag, 0,
+                          "Broadcast only supports a single 1 in y_dims.");
+        PADDLE_ENFORCE_EQ(y_dims[i], 1, "Broadcast dimension mismatch.");
+        // m*n*k broadcast with m*1*k
+        for (int j = 0; j < i; ++j) {
+          (*pre) *= y_dims[j];
+        }
+        *n = std::max(x_dims[i + axis], y_dims[i]);
+        *mid_flag = 1;
+        mid = i;
+        break;
+      }
+      (*n) *= y_dims[i];
+    }
+    if (*mid_flag) {
+      for (int i = mid + 1; i < x_dims.size(); ++i) {
+        (*post) *= x_dims[i];
+      }
+    } else {
+      for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) {
+        (*post) *= x_dims[i];
+      }
+    }
+  } else {  // for fused_elementwise_activation_op: keep the old version.
+    for (int i = 0; i < axis; ++i) {
+      (*pre) *= x_dims[i];
+    }
 
-  for (int i = 0; i < y_dims.size(); ++i) {
-    PADDLE_ENFORCE_EQ(x_dims[i + axis], y_dims[i],
-                      "Broadcast dimension mismatch.");
-    (*n) *= y_dims[i];
-  }
+    for (int i = 0; i < y_dims.size(); ++i) {
+      PADDLE_ENFORCE_EQ(x_dims[i + axis], y_dims[i],
+                        "Broadcast dimension mismatch.");
+      (*n) *= y_dims[i];
+    }
 
-  for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) {
-    (*post) *= x_dims[i];
+    for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) {
+      (*post) *= x_dims[i];
+    }
   }
 }
@@ -171,7 +211,6 @@ class MidWiseTransformIterator
       }
     }
   }
-
   return *this;
 }
@@ -268,6 +307,15 @@ class TransformFunctor {
           MidWiseTransformIterator(y_, n, post), z_, func_);
   }
 
+  inline void RunMidRowWise(int n, int pre, int post) const {
+    platform::Transform trans;
+    for (int i = 0; i < pre; i++) {
+      trans(ctx_, x_ + i * n * post, x_ + (i + 1) * n * post,
+            RowwiseTransformIterator(y_ + i * post, post),
+            z_ + i * n * post, func_);
+    }
+  }
+
  private:
   const T *x_;
   const T *y_;
@@ -406,11 +454,20 @@ static void ElemwiseGradBroadcast1CUDA(cudaStream_t stream, const T *x,
                                        const T *y, const T *out, const T *dout,
                                        int h, int w, DX_OP dx_op, DY_OP dy_op,
                                        T *dx, T *dy) {
-  // suppose perfoemance improves with h increased.
-  dim3 block_size = dim3(BLOCK_X, BLOCK_Y);
-  int grid_size = (w + BLOCK_X - 1) / BLOCK_X;
-  FastElemwiseGradBroadcast1CUDAKernel<<>>(
-      x, y, out, dout, h, w, dx_op, dy_op, dx, dy);
+  // For small cases use a 1D block
+  constexpr int half_warp = 16;
+  if (w < half_warp || h < half_warp) {
+    int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, h);
+    int grid_size = w;
+    ElemwiseGradBroadcast1CUDAKernel<<>>(
+        x, y, out, dout, h, w, dx_op, dy_op, dx, dy);
+  } else {
+    // suppose performance improves with h increased.
+    dim3 block_size = dim3(BLOCK_X, BLOCK_Y);
+    int grid_size = (w + BLOCK_X - 1) / BLOCK_X;
+    FastElemwiseGradBroadcast1CUDAKernel<<>>(
+        x, y, out, dout, h, w, dx_op, dy_op, dx, dy);
+  }
 }
 #endif
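Note on the dispatch above (sizes illustrative): with half_warp = 16, a gradient of shape [h, w] = [8, 4096] takes the new 1D path, launching w = 4096 blocks of min(ELEMWISE_MAX_BLOCK_DIM, h) = 8 threads each (one block per column), while [1024, 1024] keeps the FastElemwiseGradBroadcast1CUDAKernel path, whose BLOCK_X x BLOCK_Y 2D tiling amortizes better once both dimensions are large.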
@@ -492,6 +549,88 @@ static void ElemwiseGradBroadcast2CUDA(cudaStream_t stream, const T *x,
 
 #endif
 
+template
+static void ElemwiseGradBroadcastMid2CPU(const T *x, const T *y, const T *out,
+                                         const T *dout, int pre, int n,
+                                         int post, DX_OP dx_op, DY_OP dy_op,
+                                         T *dx, T *dy) {
+  for (int i = 0; i < pre; ++i) {
+    for (int j = 0; j < n; ++j) {
+      for (int k = 0; k < post; ++k) {
+        int x_offset = i * n * post + j * post + k;
+        int y_offset = i * post + k;
+        if (dx != nullptr) {
+          dx[x_offset] =
+              dx_op(x[x_offset], y[y_offset], out[x_offset], dout[x_offset]);
+        }
+        if (dy != nullptr) {
+          T tmp =
+              dy_op(x[x_offset], y[y_offset], out[x_offset], dout[x_offset]);
+          if (j == 0) {
+            dy[y_offset] = tmp;
+          } else {
+            dy[y_offset] += tmp;
+          }
+        }
+      }
+    }
+  }
+}
+
+#ifdef __NVCC__
+template
+static __global__ void ElemwiseGradBroadcastMid2CUDAKernel(
+    const T *x, const T *y, const T *out, const T *dout, int pre, int n,
+    int post, DX_OP dx_op, DY_OP dy_op, T *dx, T *dy) {
+  int j = threadIdx.x;
+  int tid = blockIdx.x;
+
+  T val(0);
+  int ttid = tid;
+
+  while (true) {
+    int i = ttid / post;
+    int k = ttid % post;
+    if (i >= pre) break;
+
+    int x_offset = i * n * post + j * post + k;
+    int y_offset = i * post + k;
+    if (dx != nullptr) {
+      dx[x_offset] =
+          dx_op(x[x_offset], y[y_offset], out[x_offset], dout[x_offset]);
+    }
+
+    if (dy != nullptr) {
+      val += dy_op(x[x_offset], y[y_offset], out[x_offset], dout[x_offset]);
+    }
+
+    ttid += ELEMWISE_MAX_BLOCK_DIM;
+  }
+
+  if (dy) {
+    int h = n;
+    h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
+    val = paddle::platform::reduceSum(val, j, h);
+    if (threadIdx.x == 0) {
+      dy[tid] = val;
+    }
+  }
+}
+
+template
+static void ElemwiseGradBroadcastMid2CUDA(cudaStream_t stream, const T *x,
+                                          const T *y, const T *out,
+                                          const T *dout, int pre, int n,
+                                          int post, DX_OP dx_op, DY_OP dy_op,
+                                          T *dx, T *dy) {
+  int block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, n);
+  int grid_size = pre * post;
+  ElemwiseGradBroadcastMid2CUDAKernel<<>>(
+      x, y, out, dout, pre, n, post, dx_op, dy_op, dx, dy);
+}
+
+#endif
+
 template
 void ElemwiseGradComputeNoBroadcast(
     const framework::ExecutionContext &ctx, const framework::DDim &x_dim,
@@ -524,23 +663,39 @@ void ElemwiseGradComputeWithBroadcast(
   auto y_dim = trim_trailing_singular_dims(y_dim_untrimed);
   axis = (y_dim.size() == 0) ? x_dim.size() : axis;
 
-  int pre, n, post;
-  get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post);
-  if (post == 1) {
-    int h = pre;
-    int w = n;
+  int pre, n, post, mid_flag = 0;
+  get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, &mid_flag);
+  if (mid_flag) {
+    PADDLE_ENFORCE_EQ(mid_flag, 1, "mid_flag should be no more than 1.");
+    if (platform::is_gpu_place(ctx.GetPlace())) {
+#ifdef __NVCC__
+      ElemwiseGradBroadcastMid2CUDA(
+          ctx.template device_context().stream(), x.data(),
+          y.data(), out.data(), dout.data(), pre, n, post, dx_op,
+          dy_op, dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()),
+          dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace()));
+#endif
+    } else {
+      ElemwiseGradBroadcastMid2CPU(
+          x.data(), y.data(), out.data(), dout.data(), pre, n, post,
+          dx_op, dy_op,
+          dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()),
+          dy == nullptr ? 
nullptr : dy->mutable_data(ctx.GetPlace())); + } + } else if (post == 1) { if (platform::is_gpu_place(ctx.GetPlace())) { #ifdef __NVCC__ ElemwiseGradBroadcast1CUDA( ctx.template device_context().stream(), x.data(), - y.data(), out.data(), dout.data(), h, w, dx_op, dy_op, + y.data(), out.data(), dout.data(), pre, n, dx_op, dy_op, dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); #endif } else { ElemwiseGradBroadcast1CPU( - x.data(), y.data(), out.data(), dout.data(), h, w, dx_op, - dy_op, dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), + x.data(), y.data(), out.data(), dout.data(), pre, n, + dx_op, dy_op, + dx == nullptr ? nullptr : dx->mutable_data(ctx.GetPlace()), dy == nullptr ? nullptr : dy->mutable_data(ctx.GetPlace())); } } else { @@ -680,9 +835,12 @@ void ElementwiseComputeEx(const framework::ExecutionContext &ctx, "Axis should be in range [0, x_dims)"); auto y_dims = trim_trailing_singular_dims(y_dims_untrimed); axis = (y_dims.size() == 0) ? x_dims.size() : axis; - - int pre, n, post; - get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post); + int pre, n, post, mid_flag = 0; + get_mid_dims(x_dims, y_dims, axis, &pre, &n, &post, &mid_flag); + if (mid_flag) { + functor.RunMidRowWise(n, pre, post); + return; + } if (post == 1) { functor.RunRowWise(n, pre); return; diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op.cc b/paddle/fluid/operators/elementwise/elementwise_pow_op.cc index 6335e67a..59ec9a2d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_pow_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_pow_op.cc @@ -1,11 +1,8 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -13,11 +10,30 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_pow_op.h" +#include #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" namespace paddle { namespace operators { + +class ElementwisePowOpGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("elementwise_pow_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Y", Input("Y")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetAttrMap(Attrs()); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Y"), InputGrad("Y")); + return op; + } +}; class ElementwisePowOpMaker : public ElementwiseOpMaker { protected: std::string GetName() const override { return "Pow"; } @@ -27,9 +43,20 @@ class ElementwisePowOpMaker : public ElementwiseOpMaker { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(elementwise_pow, ops::ElementwiseOp, - ops::ElementwisePowOpMaker); +REGISTER_OPERATOR(elementwise_pow, ops::ElementwiseOp, + ops::ElementwisePowOpMaker, ops::ElementwiseOpInferVarType, + ops::ElementwisePowOpGradDescMaker); +REGISTER_OPERATOR(elementwise_pow_grad, ops::ElementwiseOpGrad); + REGISTER_OP_CPU_KERNEL( elementwise_pow, ops::ElementwisePowKernel, - ops::ElementwisePowKernel); + ops::ElementwisePowKernel, + ops::ElementwisePowKernel, + ops::ElementwisePowKernel); +REGISTER_OP_CPU_KERNEL( + elementwise_pow_grad, + ops::ElementwisePowGradKernel, + ops::ElementwisePowGradKernel, + ops::ElementwisePowGradKernel, + ops::ElementwisePowGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op.cu b/paddle/fluid/operators/elementwise/elementwise_pow_op.cu index 9263dbfe..320d1e7b 100644 --- a/paddle/fluid/operators/elementwise/elementwise_pow_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_pow_op.cu @@ -15,4 +15,13 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( elementwise_pow, ops::ElementwisePowKernel, - ops::ElementwisePowKernel); + ops::ElementwisePowKernel, + ops::ElementwisePowKernel, + ops::ElementwisePowKernel); +REGISTER_OP_CUDA_KERNEL( + elementwise_pow_grad, + ops::ElementwisePowGradKernel, + ops::ElementwisePowGradKernel, + ops::ElementwisePowGradKernel, + ops::ElementwisePowGradKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op.h b/paddle/fluid/operators/elementwise/elementwise_pow_op.h index dc584b4c..1363485c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_pow_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_pow_op.h @@ -1,11 +1,8 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -15,6 +12,7 @@ limitations under the License. 
*/ #pragma once #include +#include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" namespace paddle { @@ -29,9 +27,11 @@ template class ElementwisePowKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - using Tensor = framework::Tensor; - + using Tensor = framework::LoDTensor; auto* x = ctx.Input("X"); + PADDLE_ENFORCE(x != nullptr, + "Cannot get input Variable X, variable name = %s", + ctx.op().Input("X")); auto* y = ctx.Input("Y"); auto* z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); @@ -41,5 +41,36 @@ class ElementwisePowKernel : public framework::OpKernel { } }; +template +struct PowGradDX { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return dout * y * std::pow(x, y - 1); + } +}; + +template +struct PowGradDY { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return dout * std::log(x) * std::pow(x, y); + } +}; + +template +class ElementwisePowGradKernel : public ElemwiseGradKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + ElemwiseGradKernel::Compute(ctx); + using Tensor = framework::Tensor; + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* out = dout; + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); + ElemwiseGradCompute, PowGradDY>( + ctx, *x, *y, *out, *dout, axis, dx, dy, PowGradDX(), PowGradDY()); + } +}; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc index b1ec10ea..b3003092 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc @@ -54,7 +54,9 @@ REGISTER_OPERATOR(elementwise_sub_grad, ops::ElementwiseOpExplicitGrad, ops::ElementwiseGradNoBufVarsInference, ops::ElementwiseSubDoubleGradDescMaker); REGISTER_OPERATOR(elementwise_sub_grad_grad, - ops::ElementwiseOpDoubleGradWithoutDXDY); + ops::ElementwiseOpDoubleGradWithoutDXDY, + ops::ElementwiseDoubleGradOpInplace, + ops::ElementwiseDoubleGradNoBufVarsInference); REGISTER_OP_CPU_KERNEL( elementwise_sub, diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc index 49cfe0a0..97b1f383 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc @@ -55,22 +55,22 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel { // broadcast operations need to performed. 
if (x_dims != y_dims_untrimed) { Tensor _x; - mkldnn::memory::format format; - std::vector src_x_tz = framework::vectorize2int(x_dims); + MKLDNNMemoryFormat format; + std::vector src_x_tz = framework::vectorize(x_dims); if ((src_x_tz.size() == 3 && - x->format() != (format = memory::format::ncw)) || + x->format() != (format = MKLDNNMemoryFormat::ncw)) || (src_x_tz.size() == 4 && - x->format() != (format = memory::format::nchw)) || + x->format() != (format = MKLDNNMemoryFormat::nchw)) || (src_x_tz.size() == 5 && - x->format() != (format = memory::format::ncdhw))) { + x->format() != (format = MKLDNNMemoryFormat::ncdhw))) { _x.Resize(x_dims); mkldnn::memory::data_type in_type = platform::MKLDNNGetDataType(); auto out_format = platform::MKLDNNFormatForSize( - x_dims.size(), mkldnn::memory::format::nchw); + x_dims.size(), MKLDNNMemoryFormat::nchw); - const std::string key = platform::ReorderMKLDNNHandler::GetHash( + const std::string key = platform::CreateKey( src_x_tz, x->format(), out_format, std::to_string(in_type)); platform::ReorderMKLDNNHandler handler(src_x_tz, x->type(), in_type, @@ -119,21 +119,24 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel { z->set_layout(DataLayout::kMKLDNN); z->set_format(format); } else { - PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN && - x->format() != memory::format::format_undef, - "Wrong layout/format set for X tensor"); - PADDLE_ENFORCE(y->layout() == DataLayout::kMKLDNN && - y->format() != memory::format::format_undef, - "Wrong layout/format set for Y tensor"); + PADDLE_ENFORCE_EQ(x->layout(), DataLayout::kMKLDNN, + "Wrong layout set for X tensor"); + PADDLE_ENFORCE_NE(x->format(), MKLDNNMemoryFormat::format_undef, + "Wrong format set for X tensor"); - std::vector src_x_tz = framework::vectorize2int(x_dims); - std::vector src_y_tz = framework::vectorize2int(y_dims_untrimed); - std::vector dst_tz = framework::vectorize2int(z_dims); + PADDLE_ENFORCE_EQ(y->layout(), DataLayout::kMKLDNN, + "Wrong layout set for Y tensor"); + PADDLE_ENFORCE_NE(y->format(), MKLDNNMemoryFormat::format_undef, + "Wrong format set for Y tensor"); + + std::vector src_x_tz = framework::vectorize(x_dims); + std::vector src_y_tz = framework::vectorize(y_dims_untrimed); + std::vector dst_tz = framework::vectorize(z_dims); std::vector srcs_pd; std::vector scales = {1.0f, 1.0f}; - const std::string key = platform::MKLDNNHandler::GetHash( + const std::string key = platform::CreateKey( src_x_tz, ctx.op().Output("Out") + std::to_string(x->format()) + std::to_string(y->format())); @@ -148,7 +151,7 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel { paddle::platform::to_void_cast(y_data)); auto dst_md = memory::desc({dst_tz}, platform::MKLDNNGetDataType(), - memory::format::any); + MKLDNNMemoryFormat::any); auto sum_pd = handler.AcquireSumPrimitiveDescriptor( {src_x_memory, src_y_memory}, scales, dst_md); @@ -164,8 +167,9 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel { stream(stream::kind::eager).submit(pipeline).wait(); z->set_layout(DataLayout::kMKLDNN); - z->set_format( - (memory::format)dst_memory->get_primitive_desc().desc().data.format); + z->set_format((MKLDNNMemoryFormat)dst_memory->get_primitive_desc() + .desc() + .data.format); } } }; diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc index f2f4d3fe..cc723844 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc +++ 
b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc @@ -37,7 +37,7 @@ static void UpdateDataFormat(const framework::ExecutionContext& ctx, if (ctx.op().HasAttr(attribute)) { auto format_as_string = ctx.Attr(attribute); auto format = StringToMKLDNNFormat(&format_as_string); - if (format != memory::format::any) { + if (format != MKLDNNMemoryFormat::any) { tensor->set_format(format); } } @@ -48,10 +48,11 @@ static void ReorderInput(framework::Tensor* tensor, const platform::Place& place, const mkldnn::engine& engine, bool isFourDim) { using platform::to_void_cast; - auto dims = paddle::framework::vectorize2int(tensor->dims()); + auto dims = paddle::framework::vectorize(tensor->dims()); framework::Tensor out_tensor; out_tensor.Resize(tensor->dims()); - out_tensor.set_format(isFourDim ? memory::format::nchw : memory::format::nc); + out_tensor.set_format(isFourDim ? MKLDNNMemoryFormat::nchw + : MKLDNNMemoryFormat::nc); out_tensor.set_layout(tensor->layout()); mkldnn::memory input_memory = { {{dims, platform::MKLDNNGetDataType(), tensor->format()}, engine}, @@ -79,15 +80,15 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel { auto x_dims = x->dims(); auto y_dims_untrimmed = y->dims(); - auto x_int_dims = paddle::framework::vectorize2int(x_dims); + auto x_int_dims = paddle::framework::vectorize(x_dims); UpdateDataFormat(ctx, const_cast(x), "x_data_format"); UpdateDataFormat(ctx, const_cast(y), "y_data_format"); const bool is_avx512_enabled = platform::MayIUse(platform::avx512f); const bool are_dims_divisable = !(x_int_dims[1] % 16); - const bool is_x_format_correct = x->format() == memory::format::nChw16c; - const bool is_y_format_correct = y->format() == memory::format::nc; + const bool is_x_format_correct = x->format() == MKLDNNMemoryFormat::nChw16c; + const bool is_y_format_correct = y->format() == MKLDNNMemoryFormat::nc; if (is_x_format_correct && is_y_format_correct && are_dims_divisable && is_avx512_enabled) { int pre, n, post; @@ -133,12 +134,12 @@ class ElementwiseMulMKLDNNKernel : public framework::OpKernel { } else { // Fallback to naive version: const bool are_inputs_in_same_format = x->format() == y->format(); - const bool is_x_nchw = x->format() == memory::format::nchw; - const bool is_x_nc = x->format() == memory::format::nc; - const bool is_x_x = x->format() == memory::format::x; - const bool is_y_nchw = y->format() == memory::format::nchw; - const bool is_y_nc = y->format() == memory::format::nc; - const bool is_y_x = y->format() == memory::format::x; + const bool is_x_nchw = x->format() == MKLDNNMemoryFormat::nchw; + const bool is_x_nc = x->format() == MKLDNNMemoryFormat::nc; + const bool is_x_x = x->format() == MKLDNNMemoryFormat::x; + const bool is_y_nchw = y->format() == MKLDNNMemoryFormat::nchw; + const bool is_y_nc = y->format() == MKLDNNMemoryFormat::nc; + const bool is_y_x = y->format() == MKLDNNMemoryFormat::x; if (!are_inputs_in_same_format) { using platform::MKLDNNDeviceContext; auto& dev_ctx = ctx.template device_context(); diff --git a/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc b/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc new file mode 100644 index 00000000..532084f4 --- /dev/null +++ b/paddle/fluid/operators/elementwise/test_elementwise_add_grad_grad.cc @@ -0,0 +1,83 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" + +USE_OP(elementwise_add); + +namespace paddle { +namespace operators { + +template +class TestElementwiseAddGradGradWithoutDout + : public TestElementwiseOpGradGrad { + public: + TestElementwiseAddGradGradWithoutDout(const platform::Place &place, + const framework::DDim &dims) + : TestElementwiseOpGradGrad("elementwise_add_grad_grad", place, dims, + {"Y", "DOut", "DDY"}, {"DDOut"}) {} + + using TestElementwiseOpGradGrad::feed_datas_; + using TestElementwiseOpGradGrad::expected_outs_; + using TestElementwiseOpGradGrad::dims_; + void ComputeExpectedOuts() override { + size_t numel = static_cast(framework::product(dims_)); + std::vector dy(numel); + std::vector ddout(numel); + for (size_t i = 0; i < numel; ++i) { + // ddOut = ddX + ddY = ddY if ddX empty + ddout[i] = feed_datas_["DDY"][i]; + } + expected_outs_["DDOut"] = ddout; + } + + std::unique_ptr CreateTestOp() override { + auto op = framework::OpRegistry::CreateOp( + this->op_type_, {{"Y", {"Y"}}, {"DOut", {"DOut"}}, {"DDY", {"DDY"}}}, + {{"DDOut", {"DDOut"}}}, {{"use_mkldnn", false}, {"axis", 0}}); + return op; + } +}; + +TEST(test_elementwise_add_grad_grad_without_ddx, cpu_place) { + framework::DDim dims({32, 64}); + platform::CPUPlace p; + TestElementwiseAddGradGradWithoutDout test(p, dims); + ASSERT_TRUE(test.Check()); +} +#ifdef PADDLE_WITH_CUDA +TEST(test_elementwise_add_grad_grad_without_ddx, gpu_place) { + framework::DDim dims({32, 64}); + platform::CUDAPlace p(0); + TestElementwiseAddGradGradWithoutDout test(p, dims); + ASSERT_TRUE(test.Check()); +} +#endif + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc b/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc new file mode 100644 index 00000000..e1f893dd --- /dev/null +++ b/paddle/fluid/operators/elementwise/test_elementwise_div_grad_grad.cc @@ -0,0 +1,97 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
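For the add double-grad test above, the expectation is immediate from linearity: with Out = X + Y both first-order gradients equal dOut, so the double grad is ddOut = ddX + ddY, and an absent ddX simply contributes zero, which is why ComputeExpectedOuts copies DDY straight into DDOut. A minimal standalone sketch of that reference rule (the helper name ExpectedAddDDOut is illustrative, not part of the patch):

#include <cstddef>
#include <vector>

// Reference for the add double-grad identity checked above:
// ddOut = ddX + ddY, with a missing ddX treated as an all-zero tensor.
std::vector<float> ExpectedAddDDOut(const std::vector<float> *ddx,
                                    const std::vector<float> &ddy) {
  std::vector<float> ddout(ddy.size());
  for (std::size_t i = 0; i < ddy.size(); ++i) {
    ddout[i] = (ddx != nullptr ? (*ddx)[i] : 0.0f) + ddy[i];
  }
  return ddout;
}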
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/place.h"
+
+USE_OP(elementwise_div);
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class TestElementwiseDivGradGradWithoutDout
+    : public TestElementwiseOpGradGrad<T> {
+ public:
+  TestElementwiseDivGradGradWithoutDout(const platform::Place &place,
+                                        const framework::DDim &dims)
+      : TestElementwiseOpGradGrad<T>("elementwise_div_grad_grad", place, dims,
+                                     {"Y", "Out", "DDX", "DDY", "DX"},
+                                     {"Y@GRAD", "DDOut"}) {}
+
+  using TestElementwiseOpGradGrad<T>::feed_datas_;
+  using TestElementwiseOpGradGrad<T>::expected_outs_;
+  using TestElementwiseOpGradGrad<T>::dims_;
+  void ComputeExpectedOuts() override {
+    size_t numel = static_cast<size_t>(framework::product(dims_));
+    std::vector<T> dy(numel);
+    std::vector<T> ddout(numel);
+    for (size_t i = 0; i < numel; ++i) {
+      // dY(Y@GRAD) = Out * dX * ddY / Y - dX * ddX / Y
+      dy[i] = (feed_datas_["DX"][i] / feed_datas_["Y"][i]) *
+              (feed_datas_["Out"][i] * feed_datas_["DDY"][i] -
+               feed_datas_["DDX"][i]);
+      // ddOut = ddX / Y - Out * ddY / Y = (ddX - Out * ddY) / Y
+      ddout[i] = (feed_datas_["DDX"][i] -
+                  feed_datas_["Out"][i] * feed_datas_["DDY"][i]) /
+                 (feed_datas_["Y"][i]);
+    }
+    expected_outs_["Y@GRAD"] = dy;
+    expected_outs_["DDOut"] = ddout;
+  }
+
+  std::unique_ptr<framework::OperatorBase> CreateTestOp() override {
+    auto op = framework::OpRegistry::CreateOp(
+        this->op_type_, {{"Y", {"Y"}},
+                         {"Out", {"Out"}},
+                         {"DDX", {"DDX"}},
+                         {"DDY", {"DDY"}},
+                         {"DX", {"DX"}}},
+        {{"Y@GRAD", {"Y@GRAD"}}, {"DDOut", {"DDOut"}}},
+        {{"use_mkldnn", false}, {"axis", 0}});
+    return op;
+  }
+};
+
+TEST(test_elementwise_div_grad_grad_without_dout, cpu_place) {
+  framework::DDim dims({32, 64});
+  platform::CPUPlace p;
+  TestElementwiseDivGradGradWithoutDout<float> test(p, dims);
+  ASSERT_TRUE(test.Check());
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(test_elementwise_div_grad_grad_without_dout, gpu_place) {
+  framework::DDim dims({32, 64});
+  platform::CUDAPlace p(0);
+  TestElementwiseDivGradGradWithoutDout<float> test(p, dims);
+  ASSERT_TRUE(test.Check());
+}
+#endif
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h b/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h
new file mode 100644
index 00000000..c7ce5142
--- /dev/null
+++ b/paddle/fluid/operators/elementwise/test_elementwise_op_grad_grad.h
@@ -0,0 +1,151 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
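The expected values in the div double-grad test above can be read off by linearizing Out = X / Y: a perturbation (ddX, ddY) of the inputs perturbs the output by ddX / Y - (X / Y^2) * ddY = (ddX - Out * ddY) / Y, which is exactly the DDOut formula, while the Y@GRAD expression is the gradient the second-order graph routes back to Y, as written in the code comments. A self-contained element-wise reference (the helper name ExpectedDivGradGrad is illustrative, not part of the patch):

#include <cstddef>
#include <vector>

// Element-wise reference for elementwise_div_grad_grad, mirroring the
// comments in ComputeExpectedOuts above.
void ExpectedDivGradGrad(const std::vector<float> &y,
                         const std::vector<float> &out,
                         const std::vector<float> &dx,
                         const std::vector<float> &ddx,
                         const std::vector<float> &ddy,
                         std::vector<float> *dy_new,
                         std::vector<float> *ddout) {
  dy_new->resize(y.size());
  ddout->resize(y.size());
  for (std::size_t i = 0; i < y.size(); ++i) {
    // dY = Out * dX * ddY / Y - dX * ddX / Y
    (*dy_new)[i] = (dx[i] / y[i]) * (out[i] * ddy[i] - ddx[i]);
    // ddOut = (ddX - Out * ddY) / Y
    (*ddout)[i] = (ddx[i] - out[i] * ddy[i]) / y[i];
  }
}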
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { + +// currently, this test class only support same dims +template +class TestElementwiseOpGradGrad { + public: + TestElementwiseOpGradGrad(const std::string &op_type, + const platform::Place &place, + const framework::DDim &dims, + const std::vector &inputs, + const std::vector &outputs) + : op_type_(op_type), + place_(place), + dims_(dims), + inputs_(inputs), + outputs_(outputs) {} + + void InitVarInScope(std::string var_name) { + in_out_tensors_[var_name] = + scope_.Var(var_name)->template GetMutable(); + in_out_tensors_[var_name]->Resize(dims_); + in_out_tensors_[var_name]->template mutable_data(place_); + } + + void InitFeedData(std::string var_name, size_t size) { + // generate random data + std::uniform_real_distribution dist(static_cast(10.0), + static_cast(20.0)); + std::mt19937 engine; + std::vector data(size); + for (size_t i = 0; i < size; ++i) { + data[i] = dist(engine); + } + feed_datas_[var_name] = data; + } + + void Setup() { + size_t numel = static_cast(framework::product(dims_)); + // init vars in scope and feed inputs + for (auto in_name : inputs_) { + InitVarInScope(in_name); + InitFeedData(in_name, numel); + } + for (auto out_name : outputs_) { + InitVarInScope(out_name); + } + + // feeding: copy data to tensor, out tensor don't need init + auto bytes = sizeof(T) * numel; + for (auto &in_name : inputs_) { + auto dst = in_out_tensors_[in_name]->template data(); + auto src = feed_datas_[in_name].data(); + auto src_place = platform::CPUPlace(); + if (platform::is_cpu_place(place_)) { + auto dst_place = boost::get(place_); + memory::Copy(dst_place, dst, src_place, src, bytes); + } else if (platform::is_gpu_place(place_)) { +#ifdef PADDLE_WITH_CUDA + auto dst_place = boost::get(place_); + memory::Copy(dst_place, dst, src_place, src, bytes, nullptr); +#else + PADDLE_THROW("Not compiled with cuda"); +#endif + } + } + + // calculate expected outputs + ComputeExpectedOuts(); + } + + bool Check() { + Setup(); + auto op = CreateTestOp(); + op->Run(scope_, place_); + platform::DeviceContextPool::Instance().Get(place_)->Wait(); + framework::LoDTensor cpu_out; + PADDLE_ENFORCE_EQ(scope_.kids().empty(), true, "scope has child scopes"); + + // get outputs from scope and compare them with expected_outs + bool all_equal = true; + for (auto &out_name : outputs_) { + auto &out_tensor = + scope_.FindVar(out_name)->template Get(); + if (platform::is_gpu_place(place_)) { + framework::TensorCopySync(out_tensor, platform::CPUPlace(), &cpu_out); + } else { + cpu_out = out_tensor; + } + auto *out_ptr = cpu_out.data(); + size_t numel = static_cast(framework::product(dims_)); + auto is_equal = + std::equal(out_ptr, out_ptr + numel, expected_outs_[out_name].data()); + if (!is_equal) { + all_equal = false; + break; + } + } + return all_equal; + } + + virtual std::unique_ptr CreateTestOp() = 0; + virtual void ComputeExpectedOuts() = 0; + virtual ~TestElementwiseOpGradGrad() {} + + protected: + std::string op_type_; + platform::Place place_; + framework::DDim dims_; + std::vector inputs_; + std::vector outputs_; + 
std::map in_out_tensors_; + std::map> feed_datas_; + std::map> expected_outs_; + framework::Scope scope_; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc index e15f848c..b9537317 100644 --- a/paddle/fluid/operators/expand_op.cc +++ b/paddle/fluid/operators/expand_op.cc @@ -28,14 +28,15 @@ class ExpandOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, "Input(X) should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, + "Output(Out) should not be null."); auto x_dims = ctx->GetInputDim("X"); - std::vector expand_times(x_dims.size(), -1); + auto expand_times = ctx->Attrs().Get>("expand_times"); - if (!ctx->HasInputs("expand_times_tensor")) { - expand_times = ctx->Attrs().Get>("expand_times"); + if (expand_times.size() == 0) { + expand_times = std::vector(x_dims.size(), -1); } PADDLE_ENFORCE_EQ(static_cast(x_dims.size()), expand_times.size(), @@ -49,6 +50,9 @@ class ExpandOp : public framework::OperatorWithKernel { if (x_dims[i] == -1 || expand_times[i] == -1) { out_shape[i] = -1; } else { + PADDLE_ENFORCE_GT( + expand_times[i], 0, + "The element of Attr(expand_times) must greater than 0."); out_shape[i] = x_dims[i] * expand_times[i]; } } @@ -69,7 +73,7 @@ class ExpandOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, const Tensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { - if (var_name == "expand_times_tensor") { + if (var_name == "expand_times_tensor" || var_name == "ExpandTimes") { return expected_kernel_type; } return framework::OpKernelType(expected_kernel_type.data_type_, @@ -83,7 +87,15 @@ class ExpandOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "(Tensor, default Tensor). A tensor with rank in [1, 6]." "X is the input to be expanded."); - AddInput("expand_times_tensor", "(Tensor Tensor), epxand times for X") + AddInput("ExpandTimes", + "(Tensor), optional). If provided, expand according to " + "this given expand times. It has a higher priority than " + "expand_times_tensor and expand_times.") + .AsDispensable(); + AddInput("expand_times_tensor", + "(Tensor Tensor), epxand times for X." 
+ "It has a higher priority than expand_times, but a lower priority " + "than ExpandTimes") .AsDuplicable() .AsDispensable(); AddOutput("Out", @@ -127,9 +139,9 @@ class ExpandGradOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, "Input(X) should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, + "Input(Out@GRAD) should not be null."); auto x_dims = ctx->GetInputDim("X"); std::vector expand_times = @@ -147,12 +159,15 @@ class ExpandGradOp : public framework::OperatorWithKernel { } for (size_t i = start_pos; i < expand_times.size(); ++i) { - PADDLE_ENFORCE_EQ(x_dims[i] * expand_times[i], out_dims[i], - "Each dimension size of Input(Out@GRAD) should be " - "equal to multiplication of crroresponding dimension " - "size of Input(X) and Attr(expand_times) value."); + if (expand_times[i] == -1) { + continue; + } else { + PADDLE_ENFORCE_EQ(x_dims[i] * expand_times[i], out_dims[i], + "Each dimension size of Input(Out@GRAD) should be " + "equal to multiplication of crroresponding dimension " + "size of Input(X) and Attr(expand_times) value."); + } } - auto x_grad_name = framework::GradVarName("X"); if (ctx->HasOutput(x_grad_name)) { @@ -163,8 +178,9 @@ class ExpandGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.device_context()); + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.device_context()); } framework::OpKernelType GetKernelTypeForVar( @@ -190,18 +206,22 @@ class ExpandGradOpDescMaker : public framework::SingleGradOpDescMaker { op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); op->SetOutput(framework::GradVarName("X"), InputGrad("X")); op->SetInput("expand_times_tensor", Input("expand_times_tensor")); + op->SetInput("ExpandTimes", Input("ExpandTimes")); op->SetAttrMap(Attrs()); return op; } }; +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ExpandGradNoNeedBufVarsInferer, "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(expand, ops::ExpandOp, ops::ExpandOpMaker, ops::ExpandGradOpDescMaker); -REGISTER_OPERATOR(expand_grad, ops::ExpandGradOp); +REGISTER_OPERATOR(expand_grad, ops::ExpandGradOp, + ops::ExpandGradNoNeedBufVarsInferer); REGISTER_OP_CPU_KERNEL( expand, ops::ExpandKernel, ops::ExpandKernel, diff --git a/paddle/fluid/operators/expand_op.h b/paddle/fluid/operators/expand_op.h index 8153987d..eb3b46f9 100644 --- a/paddle/fluid/operators/expand_op.h +++ b/paddle/fluid/operators/expand_op.h @@ -50,6 +50,19 @@ namespace paddle { namespace operators { inline std::vector get_expand_times( const framework::ExecutionContext& ctx) { + if (ctx.HasInput("ExpandTimes")) { + auto* expand_tensor = ctx.Input("ExpandTimes"); + auto* expand_data = expand_tensor->data(); + framework::Tensor cpu_expand_tensor; + if (platform::is_gpu_place(expand_tensor->place())) { + TensorCopySync(*expand_tensor, platform::CPUPlace(), &cpu_expand_tensor); + expand_data = cpu_expand_tensor.data(); + } + auto vec_epxand_times = + std::vector(expand_data, expand_data + expand_tensor->numel()); + return 
vec_epxand_times; + } + auto list_expand_times_tensor = ctx.MultiInput("expand_times_tensor"); if (list_expand_times_tensor.size() > 0) { @@ -100,6 +113,9 @@ class ExpandKernel : public framework::OpKernel { auto in_dims = in0->dims(); auto expand_times = get_expand_times(context); + PADDLE_ENFORCE_EQ(static_cast(in_dims.size()), expand_times.size(), + "The number of Attr(expand_times)'s value must be equal " + "to the rank of Input(X)."); auto* out0 = context.Output("Out"); Eigen::DSizes bcast_dims; for (size_t i = 0; i < expand_times.size(); ++i) { @@ -186,7 +202,6 @@ class ExpandGradKernel : public framework::OpKernel { "reduce dimensions."); auto* in0 = context.Input(framework::GradVarName("Out")); auto* out0 = context.Output(framework::GradVarName("X")); - auto x = EigenVector::Flatten(*(context.Input("X"))); out0->mutable_data(context.GetPlace()); auto x_grad = EigenVector::Flatten(*out0); Eigen::DSizes reshape_dims; @@ -200,7 +215,9 @@ class ExpandGradKernel : public framework::OpKernel { auto out_grad = EigenVector::Flatten(*in0); x_grad.device( *context.template device_context().eigen_device()) = - out_grad.reshape(reshape_dims).sum(reduce_dims).reshape(x.dimensions()); + out_grad.reshape(reshape_dims) + .sum(reduce_dims) + .reshape(x_grad.dimensions()); } }; diff --git a/paddle/fluid/operators/eye_op.cc b/paddle/fluid/operators/eye_op.cc new file mode 100644 index 00000000..40848b96 --- /dev/null +++ b/paddle/fluid/operators/eye_op.cc @@ -0,0 +1,91 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/eye_op.h" + +namespace paddle { +namespace operators { + +class EyeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of EyeOP should not be null."); + auto num_rows = ctx->Attrs().Get("num_rows"); + PADDLE_ENFORCE(num_rows >= 0, + "The value of Input(num_rows) should be non-negative int."); + auto num_columns = ctx->Attrs().Get("num_columns"); + if (num_columns == -1) num_columns = num_rows; + PADDLE_ENFORCE( + num_columns >= 0, + "The value of Input(num_columns) should be non-negative int."); + ctx->SetOutputDim("Out", {num_rows, num_columns}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::proto::VarType::Type(ctx.Attr("dtype")), + ctx.GetPlace()); + } +}; + +class EyeOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext* ctx) const override { + auto data_type = static_cast( + boost::get(ctx->GetAttr("dtype"))); + auto& out_var_name = ctx->Output("Out").front(); + ctx->SetDataType(out_var_name, data_type); + } +}; + +class EyeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddAttr("dtype", + "(int, default 5 (FP32)) " + "Output data type") + .SetDefault(framework::proto::VarType::FP32); + AddAttr("num_rows", + "(int64_t) the number of rows in output tensor"); + AddAttr("num_columns", + "(int64_t) the number of columns in output tensor." + "Default -1 means that num_columns=num_rows") + .SetDefault(-1); + AddOutput("Out", + "(Tensor) Construct an identity tensor with " + "specified shape [num_rows, num_columns]"); + AddComment(R"DOC( +Return an identity tensor whose shape is [num_rows, num_columns]. +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CPU = paddle::platform::CPUDeviceContext; +using float16 = paddle::platform::float16; + +REGISTER_OPERATOR(eye, ops::EyeOp, ops::EyeOpMaker, ops::EyeOpVarTypeInference, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL(eye, ops::EyeKernel, + ops::EyeKernel, + ops::EyeKernel, ops::EyeKernel, + ops::EyeKernel); diff --git a/paddle/fluid/operators/eye_op.cu b/paddle/fluid/operators/eye_op.cu new file mode 100644 index 00000000..8d55235a --- /dev/null +++ b/paddle/fluid/operators/eye_op.cu @@ -0,0 +1,24 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/eye_op.h" + +namespace ops = paddle::operators; +namespace plf = paddle::platform; +REGISTER_OP_CUDA_KERNEL( + eye, ops::EyeKernel, + ops::EyeKernel, + ops::EyeKernel, + ops::EyeKernel, + ops::EyeKernel); diff --git a/paddle/fluid/operators/eye_op.h b/paddle/fluid/operators/eye_op.h new file mode 100644 index 00000000..0eefe7d2 --- /dev/null +++ b/paddle/fluid/operators/eye_op.h @@ -0,0 +1,61 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +template +struct EyeFunctor { + EyeFunctor(int64_t num_columns, T* output) + : num_columns_(num_columns), output_(output) {} + + HOSTDEVICE void operator()(size_t idx) const { + output_[idx * num_columns_ + idx] = static_cast(1); + } + + int64_t num_columns_; + T* output_; +}; + +template +class EyeKernel : public framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + auto num_rows = ctx.Attr("num_rows"); + auto num_columns = ctx.Attr("num_columns"); + if (num_columns == -1) num_columns = num_rows; + + auto* out_tensor = ctx.Output("Out"); + T* out_data = out_tensor->mutable_data(ctx.GetPlace()); + + math::SetConstant set_zero; + auto& dev_ctx = ctx.template device_context(); + set_zero(dev_ctx, out_tensor, static_cast(0)); + + int64_t num_eyes = std::min(num_rows, num_columns); + platform::ForRange for_range(dev_ctx, num_eyes); + EyeFunctor functor(num_columns, out_data); + for_range(functor); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h index 422d99dd..28594756 100644 --- a/paddle/fluid/operators/fake_quantize_op.h +++ b/paddle/fluid/operators/fake_quantize_op.h @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/math/blas.h" namespace paddle { @@ -184,9 +185,7 @@ class FakeMovingAverageAbsMaxKernelBase : public framework::OpKernel { // training auto* in_accum = context.Input("InAccum"); auto* in_state = context.Input("InState"); - auto& allocator = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); - auto cur_scale = allocator.Allocate(1 * sizeof(T)); + auto cur_scale = memory::Alloc(dev_ctx, sizeof(T)); T* cur_scale_data = static_cast(cur_scale->ptr()); FindAbsMaxFunctor()(dev_ctx, in->data(), in->numel(), @@ -251,9 +250,7 @@ class MovingAverageAbsMaxScaleKernel : public framework::OpKernel { // training auto* in_accum = context.Input("InAccum"); auto* in_state = context.Input("InState"); - auto& allocator = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); - auto cur_scale = allocator.Allocate(1 * sizeof(T)); + auto cur_scale = memory::Alloc(dev_ctx, sizeof(T)); T* cur_scale_data = static_cast(cur_scale->ptr()); FindAbsMaxFunctor()(dev_ctx, in->data(), in->numel(), diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index 242f5390..da30fef5 100644 --- a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -14,65 +14,76 @@ limitations under the License. */ #include "paddle/fluid/operators/fc_op.h" #include -#include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/fc_compute.h" namespace paddle { namespace operators { -void FCOp::InferShape(framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE(ctx->HasInput("Input"), - "X(Input) of Fully Connected should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Out(Output) of Fully Connected should not be null."); - PADDLE_ENFORCE(ctx->HasInput("W"), - "W(Input) of Fully Connected should not be null."); - - auto in_dims = ctx->GetInputDim("Input"); - auto w_dims = ctx->GetInputDim("W"); +class FCOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true, + "X(Input) of Fully Connected should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, + "Out(Output) of Fully Connected should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("W"), true, + "W(Input) of Fully Connected should not be null."); + + auto in_dims = ctx->GetInputDim("Input"); + auto w_dims = ctx->GetInputDim("W"); + + if (ctx->HasInput("Bias")) { + auto bias_dims = ctx->GetInputDim("Bias"); + if (bias_dims.size() == 2) { + PADDLE_ENFORCE_EQ(bias_dims[0], 1, + "The shape of Bias must be [1, dim]."); + PADDLE_ENFORCE_EQ(bias_dims[1], w_dims[1], + "The shape of Bias must be [1, dim]."); + } else if (bias_dims.size() == 1) { + PADDLE_ENFORCE_EQ(bias_dims[0], w_dims[1], + "The shape of Bias must be [1, dim]."); + } + } - if (ctx->HasInput("Bias")) { - auto bias_dims = ctx->GetInputDim("Bias"); - if (bias_dims.size() == 2) { - PADDLE_ENFORCE_EQ(bias_dims[0], 1, "The shape of Bias must be [1, dim]."); - PADDLE_ENFORCE_EQ(bias_dims[1], w_dims[1], - "The shape of Bias must be [1, dim]."); - } else if (bias_dims.size() == 1) { - PADDLE_ENFORCE_EQ(bias_dims[0], w_dims[1], - "The shape of Bias must be [1, dim]."); + auto& activation_type = ctx->Attrs().Get("activation_type"); + if 
(!activation_type.empty()) { + PADDLE_ENFORCE_EQ(activation_type, "relu", + "Activation %s is not supportetd in fc now.", + activation_type.c_str()); } - } + if (ctx->Attrs().Get("use_mkldnn")) { + PADDLE_ENFORCE_EQ(in_dims.size() == 2 || in_dims.size() == 4, true, + "Fully Connected input should be 2-D or 4-D tensor."); + } + PADDLE_ENFORCE_EQ(w_dims.size(), 2, + "Fully Connected input should be 2-D tensor."); + int in_num_col_dims = ctx->Attrs().Get("in_num_col_dims"); + PADDLE_ENFORCE_GT( + in_dims.size(), in_num_col_dims, + "The input tensor Input's rank of FCOp should be larger than " + "in_num_col_dims."); + + std::vector output_dims; + FCOutputSize(in_dims, w_dims, output_dims, in_num_col_dims); - if (ctx->Attrs().Get("use_mkldnn")) { - PADDLE_ENFORCE(in_dims.size() == 2 || in_dims.size() == 4, - "Fully Connected input should be 2-D or 4-D tensor."); + ctx->SetOutputDim("Out", framework::make_ddim(output_dims)); + ctx->ShareLoD("Input", "Out"); } - PADDLE_ENFORCE_EQ(w_dims.size(), 2, - "Fully Connected input should be 2-D tensor."); - int in_num_col_dims = ctx->Attrs().Get("in_num_col_dims"); - PADDLE_ENFORCE_GT( - in_dims.size(), in_num_col_dims, - "The input tensor Input's rank of FCOp should be larger than " - "in_num_col_dims."); - - std::vector output_dims; - FCOutputSize(in_dims, w_dims, output_dims, in_num_col_dims); - - ctx->SetOutputDim("Out", framework::make_ddim(output_dims)); - ctx->ShareLoD("Input", "Out"); -} -framework::OpKernelType FCOp::GetExpectedKernelType( - const framework::ExecutionContext& ctx) const { - framework::LibraryType library = framework::LibraryType::kPlain; - framework::DataLayout layout = framework::DataLayout::kAnyLayout; - if (ctx.Attr("use_mkldnn")) { - library = framework::LibraryType::kMKLDNN; - layout = framework::DataLayout::kMKLDNN; + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + if (ctx.Attr("use_mkldnn")) { + library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; + } + return framework::OpKernelType(ctx.Input("Input")->type(), + ctx.GetPlace(), layout, library); } - return framework::OpKernelType(ctx.Input("Input")->type(), - ctx.GetPlace(), layout, library); -} +}; void FCOpGrad::InferShape(framework::InferShapeContext* ctx) const { auto in_dims = ctx->GetInputDim("Input"); @@ -86,8 +97,8 @@ void FCOpGrad::InferShape(framework::InferShapeContext* ctx) const { } if (ctx->HasInput("Bias")) { - PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")), - "Should have bias grad"); + PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("Bias")), true, + "Should have bias grad"); auto bias_dims = ctx->GetInputDim("Bias"); ctx->SetOutputDim(framework::GradVarName("Bias"), bias_dims); } @@ -105,61 +116,36 @@ framework::OpKernelType FCOpGrad::GetExpectedKernelType( ctx.GetPlace(), layout, library); } -void FCOpMaker::Make() { - AddInput("Input", "(Tensor), The input tensor of fully connected operator."); - AddInput("W", "(Tensor), The weight fc op with shape (I, O)."); - AddInput("Bias", "(Tensor, optional) Bias vector with shape (1 x O") - .AsDispensable(); - AddAttr("in_num_col_dims", - "(int, default 1), The fc op can take tensors with more than " - "two dimensions as its inputs.") - .SetDefault(1) - .EqualGreaterThan(1); - AddOutput("Out", "(Tensor) The output tensor of fully connected operator. 
"); - AddAttr("use_mkldnn", - "(bool, default false) Only used in mkldnn kernel") - .SetDefault(false); - AddAttr(framework::kAllKernelsMustComputeRuntimeShape, - "Skip calling InferShape() function in the runtime.") - .SetDefault(true); - AddComment(R"DOC( - Fully Connected Operator. - - The fully connected operation calculates the output based on the input, weights and bias. - The size of each dimension of the parameters checked in the infer-shape. -)DOC"); -} - -template -class FCOpKernel : public framework::OpKernel { +class FCOpMaker : public framework::OpProtoAndCheckerMaker { public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), - "It must use CPUPlace."); - auto input = ctx.Input("Input"); - auto w = ctx.Input("W"); - auto bias = ctx.Input("Bias"); - auto output = ctx.Output("Out"); - int in_num_col_dims = ctx.Attr("in_num_col_dims"); - auto w_dims = w->dims(); - - std::vector output_dims; - FCOutputSize(input->dims(), w_dims, output_dims, in_num_col_dims); - output->Resize(framework::make_ddim(output_dims)); - output->set_lod(input->lod()); - - auto out_dims = output->dims(); - int M = framework::product(out_dims) / w_dims[1]; - - const T* input_data = input->data(); - const T* w_data = w->data(); - T* output_data = output->mutable_data(ctx.GetPlace()); - auto blas = math::GetBlas(ctx); - math::FCCompute( - blas, M, w_dims[1], w_dims[0], input_data, w_data, output_data, - bias ? bias->data() : NULL); - - // TODO(TJ): fuse act + void Make() override { + AddInput("Input", + "(Tensor), The input tensor of fully connected operator."); + AddInput("W", "(Tensor), The weight fc op with shape (I, O)."); + AddInput("Bias", "(Tensor, optional) Bias vector with shape (1 x O") + .AsDispensable(); + AddOutput("Out", + "(Tensor) The output tensor of fully connected operator. "); + AddAttr("in_num_col_dims", + "(int, default 1), The fc op can take tensors with more than " + "two dimensions as its inputs.") + .SetDefault(1) + .EqualGreaterThan(1); + AddAttr("activation_type", + "Activation type used in fully connected operator.") + .SetDefault(""); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr(framework::kAllKernelsMustComputeRuntimeShape, + "Skip calling InferShape() function in the runtime.") + .SetDefault(true); + AddComment(R"DOC( +Fully Connected Operator. + +The fully connected operation calculates the output based on the input, weights and bias. +The size of each dimension of the parameters checked in the infer-shape. +)DOC"); } }; @@ -170,4 +156,6 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(fc, ops::FCOp, ops::FCOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(fc_grad, ops::FCOpGrad); -REGISTER_OP_CPU_KERNEL(fc, ops::FCOpKernel, ops::FCOpKernel); +REGISTER_OP_CPU_KERNEL( + fc, ops::FCOpKernel, + ops::FCOpKernel); diff --git a/paddle/fluid/platform/dynload/cupti_lib_path.h b/paddle/fluid/operators/fc_op.cu.cc similarity index 58% rename from paddle/fluid/platform/dynload/cupti_lib_path.h rename to paddle/fluid/operators/fc_op.cu.cc index d267d08b..2fd33aeb 100644 --- a/paddle/fluid/platform/dynload/cupti_lib_path.h +++ b/paddle/fluid/operators/fc_op.cu.cc @@ -1,10 +1,10 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -12,6 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once +#include "paddle/fluid/operators/fc_op.h" -#define CUPTI_LIB_PATH "" +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + fc, ops::FCOpKernel, + ops::FCOpKernel); diff --git a/paddle/fluid/operators/fc_op.h b/paddle/fluid/operators/fc_op.h index b82a63cd..bf08e6ba 100644 --- a/paddle/fluid/operators/fc_op.h +++ b/paddle/fluid/operators/fc_op.h @@ -14,24 +14,16 @@ limitations under the License. */ #pragma once +#include +#include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/fc.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; -class FCOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override; - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override; -}; - class FCOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -43,11 +35,6 @@ class FCOpGrad : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override; }; -class FCOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override; -}; - inline void FCOutputSize(const framework::DDim& in_dims, const framework::DDim& w_dims, std::vector& out_dims, // NOLINT @@ -64,5 +51,38 @@ inline void FCOutputSize(const framework::DDim& in_dims, out_dims.push_back(w_dims[1]); } +template +class FCOpKernel : public framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* w = ctx.Input("W"); + auto* bias = ctx.Input("Bias"); + auto* output = ctx.Output("Out"); + int in_num_col_dims = ctx.Attr("in_num_col_dims"); + bool with_relu = + (ctx.Attr("activation_type") == "relu") ? true : false; + + auto w_dims = w->dims(); + + std::vector output_dims; + FCOutputSize(input->dims(), w_dims, output_dims, in_num_col_dims); + output->Resize(framework::make_ddim(output_dims)); + output->set_lod(input->lod()); + + auto out_dims = output->dims(); + int M = framework::product(out_dims) / w_dims[1]; + + const T* input_data = input->data(); + const T* w_data = w->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + + auto& dev_ctx = ctx.template device_context(); + math::FCFunctor fc; + fc(dev_ctx, M, w_dims[1], w_dims[0], input_data, w_data, output_data, + bias ? bias->data() : NULL, with_relu); + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fill_op.cc b/paddle/fluid/operators/fill_op.cc index a885b301..4f7cfcf1 100644 --- a/paddle/fluid/operators/fill_op.cc +++ b/paddle/fluid/operators/fill_op.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,74 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/operators/fill_op.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detail/safe_ref.h" -#include "paddle/fluid/platform/device_context.h" namespace paddle { namespace operators { -struct FillOpVisitor { - FillOpVisitor(framework::LoDTensor *tensor, const std::vector &value) - : tensor_(tensor), value_(value) {} - - template - void apply() const { - platform::CPUPlace cpu; - auto *data = tensor_->mutable_data(cpu); - std::transform(value_.data(), value_.data() + tensor_->numel(), data, - [](float dat) { return static_cast(dat); }); - } - - framework::LoDTensor *tensor_; - const std::vector &value_; -}; - -class FillOp : public framework::OperatorBase { - public: - FillOp(const std::string &type, const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - auto &out = - detail::Ref(detail::Ref(scope.FindVar(Output("Out")), - "Cannot find variable %s", Output("Out")) - .GetMutable()); - out.Resize(framework::make_ddim(Attr>("shape"))); - auto dtype = - static_cast(Attr("dtype")); - platform::CPUPlace cpu; - auto force_cpu = Attr("force_cpu"); - out.mutable_data(force_cpu ? cpu : place, dtype); - - framework::LoDTensor tensor; - - if (force_cpu || platform::is_cpu_place(place)) { - tensor.ShareDataWith(out); - } else { - // Always make tensor in CPU memory. - tensor.Resize(out.dims()); - tensor.mutable_data(cpu, dtype); - } - - framework::VisitDataType( - dtype, FillOpVisitor(&tensor, Attr>("value"))); - - if (!force_cpu && platform::is_gpu_place(place)) { - // Copy tensor to out - platform::DeviceContextPool &pool = - platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - framework::TensorCopy(tensor, place, dev_ctx, &out); - } - } -}; - class FillOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -101,16 +39,42 @@ Fill an tensor with `value` and `shape`. 
The type of the tensor is specify by } }; -class FillOpInferShape : public framework::InferShapeBase { +class FillOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* context) const override { + PADDLE_ENFORCE_EQ(context->HasOutput("Out"), true, + "Output(Out) of FillOp should not be null."); + auto& shape = context->Attrs().Get>("shape"); + context->SetOutputDim("Out", framework::make_ddim(shape)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::proto::VarType::Type(ctx.Attr("dtype")), + ctx.GetPlace()); + } +}; + +class FillOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(framework::InferShapeContext *context) const override { - context->SetOutputDim( - "Out", - framework::make_ddim(context->Attrs().Get>("shape"))); + void operator()(framework::InferVarTypeContext* ctx) const override { + auto data_type = static_cast( + boost::get(ctx->GetAttr("dtype"))); + auto& out_var_name = ctx->Output("Out").front(); + ctx->SetDataType(out_var_name, data_type); } }; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(fill, ops::FillOp, ops::FillOpInferShape, ops::FillOpMaker); +REGISTER_OPERATOR(fill, ops::FillOp, ops::FillOpMaker, + ops::FillOpVarTypeInference, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(fill, ops::FillKernel, ops::FillKernel, + ops::FillKernel, ops::FillKernel, + ops::FillKernel); diff --git a/paddle/fluid/platform/dynload/warpctc_lib_path.h.in b/paddle/fluid/operators/fill_op.cu.cc similarity index 66% rename from paddle/fluid/platform/dynload/warpctc_lib_path.h.in rename to paddle/fluid/operators/fill_op.cu.cc index dc5064f4..fdef8ab2 100644 --- a/paddle/fluid/platform/dynload/warpctc_lib_path.h.in +++ b/paddle/fluid/operators/fill_op.cu.cc @@ -12,6 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#pragma once +#include "paddle/fluid/operators/fill_op.h" -#define WARPCTC_LIB_PATH "@WARPCTC_INSTALL_DIR@/lib/" +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(fill, ops::FillKernel, ops::FillKernel, + ops::FillKernel, ops::FillKernel, + ops::FillKernel); diff --git a/paddle/fluid/operators/fill_op.h b/paddle/fluid/operators/fill_op.h new file mode 100644 index 00000000..fa2d5b85 --- /dev/null +++ b/paddle/fluid/operators/fill_op.h @@ -0,0 +1,80 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include + +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/safe_ref.h" + +namespace paddle { +namespace operators { + +struct FillOpVisitor { + FillOpVisitor(framework::LoDTensor *tensor, const std::vector &value) + : tensor_(tensor), value_(value) {} + + template + void apply() const { + platform::CPUPlace cpu; + auto *data = tensor_->mutable_data(cpu); + std::transform(value_.data(), value_.data() + tensor_->numel(), data, + [](float dat) { return static_cast(dat); }); + } + + framework::LoDTensor *tensor_; + const std::vector &value_; +}; + +template +class FillKernel : public framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext &ctx) const override { + auto &out = + detail::Ref(ctx.Output("Out"), + "Cannot get output lod tensor Out, variable name = %s", + ctx.op().Output("Out")); + out.Resize(framework::make_ddim(ctx.Attr>("shape"))); + auto dtype = + static_cast(ctx.Attr("dtype")); + platform::CPUPlace cpu; + auto force_cpu = ctx.Attr("force_cpu"); + out.mutable_data(force_cpu ? cpu : ctx.GetPlace(), dtype); + + framework::LoDTensor tensor; + + if (force_cpu || platform::is_cpu_place(ctx.GetPlace())) { + tensor.ShareDataWith(out); + } else { + // Always make tensor in CPU memory. + tensor.Resize(out.dims()); + tensor.mutable_data(cpu, dtype); + } + + framework::VisitDataType( + dtype, FillOpVisitor(&tensor, ctx.Attr>("value"))); + + if (!force_cpu && platform::is_gpu_place(ctx.GetPlace())) { + // Copy tensor to out + framework::TensorCopy( + tensor, ctx.GetPlace(), + ctx.template device_context(), &out); + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/filter_by_instag_op.cc b/paddle/fluid/operators/filter_by_instag_op.cc new file mode 100644 index 00000000..ebf44e5b --- /dev/null +++ b/paddle/fluid/operators/filter_by_instag_op.cc @@ -0,0 +1,146 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
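+
+// The new operator below follows the same OperatorWithKernel + OpKernel
+// split that fill was just migrated to above. A minimal sketch of the
+// pattern, with illustrative names only (MyOp/MyKernel are not part of
+// this file):
+//
+//   class MyOp : public framework::OperatorWithKernel {
+//    public:
+//     using framework::OperatorWithKernel::OperatorWithKernel;
+//     void InferShape(framework::InferShapeContext* ctx) const override {
+//       // Validate inputs/outputs and set output dims at compile time.
+//     }
+//   };
+//
+//   template <typename T>
+//   class MyKernel : public framework::OpKernel<T> {
+//    public:
+//     void Compute(const framework::ExecutionContext& ctx) const override {
+//       // Per-device computation; registered via REGISTER_OP_CPU_KERNEL
+//       // (and REGISTER_OP_CUDA_KERNEL where a .cu file exists).
+//     }
+//   };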
+ +#include "paddle/fluid/operators/filter_by_instag_op.h" + +#include +#include "paddle/fluid/framework/no_need_buffer_vars_inference.h" +#include "paddle/fluid/framework/var_type_inference.h" + +namespace paddle { +namespace operators { +class FilterByInstagOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasInput("Ins"), true, + "Input(Ins) should be not null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("Ins_tag"), true, + "Input(Ins_tag) should be not null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("Filter_tag"), true, + "Input(Filter_tag) should be not null."); + + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, + "Output(Out) should be not null."); + PADDLE_ENFORCE_EQ(ctx->HasOutput("LossWeight"), true, + "Output(LossWeight) shoudl not be null."); + PADDLE_ENFORCE_EQ(ctx->HasOutput("IndexMap"), true, + "Output(IndexMap) should be not null."); + + auto x1_dims = ctx->GetInputDim("Ins"); // batch_size * vec + + ctx->SetOutputDim("Out", framework::make_ddim({-1, x1_dims[1]})); + ctx->SetOutputDim("LossWeight", framework::make_ddim({-1, 1})); + ctx->SetOutputDim("IndexMap", framework::make_ddim({-1, 2})); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Ins")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class FilterByInstagOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Ins", "(LoDTensor) embeded tensor"); + AddInput("Ins_tag", "(LoDTensor) ins tag list"); + AddInput("Filter_tag", "(1D Tensor) filter tag list"); + AddAttr("is_lod", "is Ins with LoD info or not, default True"); + AddOutput("Out", "(LoDTensor) embeded tensor filtered by instag"); + AddOutput("LossWeight", "(Tensor) loss weight."); + AddOutput("IndexMap", "(LoDTensor) mapping from Out rows to X1 rows"); + AddComment(R"DOC( +Filter By Instag Op + +This operator is used to filter embeded ins. + +There are 3 inputs. First is embeded ins, Second is tags for ins, +Third is tags to filter. + +There are 3 outputs. First is filtered embeded ins, Second is Loss Weight, +Third is the IndexMap from Out line number to X1 line number. 
+)DOC"); + } +}; + +class FilterByInstagOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasInput("IndexMap"), true, + "Input(IndexMap) should be not null"); + PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, + "Grad Input(Out) should be not null"); + PADDLE_ENFORCE_EQ(ctx->HasInput("Ins"), true, + "Input(Ins) should be not null"); + PADDLE_ENFORCE_EQ(ctx->HasInput("LossWeight"), true, + "Input(LossWeight) should be not null"); + PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("Ins")), true, + "Grad Output(Ins) should be not null"); + + auto grad_out_dims = ctx->GetInputDim(framework::GradVarName("Out")); + auto x1_dims = ctx->GetInputDim("Ins"); + ctx->SetOutputDim(framework::GradVarName("Ins"), + framework::make_ddim({x1_dims[0], grad_out_dims[1]})); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar( + ctx.InputVar(framework::GradVarName("Out"))); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class FilterByInstagGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("filter_by_instag_grad"); + op->SetInput("IndexMap", Output("IndexMap")); + op->SetInput("Ins", Input("Ins")); + op->SetAttrMap(Attrs()); + op->SetInput("LossWeight", Output("LossWeight")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("Ins"), InputGrad("Ins")); + return op; + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(filter_by_instag, ops::FilterByInstagOp, + ops::FilterByInstagOpMaker, + ops::FilterByInstagGradOpDescMaker); + +REGISTER_OPERATOR(filter_by_instag_grad, ops::FilterByInstagOpGrad); + +REGISTER_OP_CPU_KERNEL(filter_by_instag, ops::FilterByInstagKernel, + ops::FilterByInstagKernel, + ops::FilterByInstagKernel, + ops::FilterByInstagKernel); + +REGISTER_OP_CPU_KERNEL(filter_by_instag_grad, + ops::FilterByInstagGradKernel, + ops::FilterByInstagGradKernel, + ops::FilterByInstagGradKernel, + ops::FilterByInstagGradKernel); diff --git a/paddle/fluid/operators/filter_by_instag_op.h b/paddle/fluid/operators/filter_by_instag_op.h new file mode 100644 index 00000000..f082d0df --- /dev/null +++ b/paddle/fluid/operators/filter_by_instag_op.h @@ -0,0 +1,201 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+
+#include <algorithm>
+#include <cstring>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/mixed_vector.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/memory/memcpy.h"
+
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+using SelectedRows = framework::SelectedRows;
+using LoDTensor = framework::LoDTensor;
+#if defined(PADDLE_WITH_CUDA)
+template <typename T>
+using Vector = framework::Vector<T>;
+#else
+template <typename T>
+using Vector = framework::CPUVector<T>;
+#endif
+
+template <typename T>
+class FilterByInstagKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    // X1 is global FC output
+    // Dim [batch size, embedding size]
+    auto* x1 = context.Input<LoDTensor>("Ins");
+    bool is_x1_lod = context.Attr<bool>("is_lod");
+    // X2 is ins tag list
+    // LoD [[0, Sum(ins1), Sum(ins1, ins2), ... ]]
+    auto* x2 = context.Input<LoDTensor>("Ins_tag");
+    // X3 is local fc tag list
+    // LoD [[0, Sum(fc1), Sum(fc1, fc2) ...]]
+    auto* x3 = context.Input<Tensor>("Filter_tag");
+
+    std::unordered_set<int64_t> filter_tag;
+    auto* x3_data = x3->data<int64_t>();
+    size_t len = x3->dims()[0];
+    for (size_t i = 0; i < len; i++) {
+      filter_tag.insert(x3_data[i]);
+    }
+
+    // expected auto = const int64_t
+    auto* x2_data = x2->data<int64_t>();
+    // e.g. get [0, 1, 2, 3, ...]
+    auto x2_lods = x2->lod()[0];
+    Vector<size_t> x1_lods(1, 0);
+    if (!is_x1_lod) {
+      for (size_t i = 0; i < x1->dims()[0]; i++) {
+        x1_lods.push_back(i + 1);
+      }
+    } else {
+      x1_lods = context.Input<LoDTensor>("Ins")->lod()[0];
+    }
+    std::unordered_map<int64_t, int64_t> mmap_aux;
+    Vector<size_t> out_lods(1, 0);
+    for (size_t i = 0; i < x2_lods.size() - 1; i++) {
+      for (size_t j = x2_lods[i]; j < x2_lods[i + 1]; j++) {
+        if (filter_tag.find(x2_data[j]) != filter_tag.end()) {
+          size_t batch_len = x1_lods[i + 1] - x1_lods[i];
+          mmap_aux[out_lods.back()] = x1_lods[i];
+          out_lods.push_back(out_lods.back() + batch_len);
+          break;
+        }
+      }
+    }
+    // set the output values:
+    // for ins that have been filtered out, the whole line is set to 0;
+ // otherwise, copy whole line + // Dim [local fc count, batch size, embedding size] + LoDTensor* out = context.Output("Out"); + LoDTensor* map = context.Output("IndexMap"); + LoDTensor* loss_weight = context.Output("LossWeight"); + // expected auto = const T + auto* x1_data = x1->data(); + // expected auto = T + size_t x1_embed_size = x1->dims()[1]; + if (out_lods.size() - 1 > 0) { + out->Resize(framework::make_ddim( + {(int64_t)out_lods.back(), (int64_t)x1_embed_size})); + map->Resize(framework::make_ddim({(int64_t)out_lods.size() - 1, 3})); + loss_weight->Resize( + framework::make_ddim({(int64_t)out_lods.size() - 1, 1})); + } else { + out->Resize(framework::make_ddim({1, (int64_t)x1_embed_size})); + map->Resize(framework::make_ddim({1, 3})); + loss_weight->Resize(framework::make_ddim({1, 1})); + } + auto* out_data = out->mutable_data(context.GetPlace()); + auto* map_data = map->mutable_data(context.GetPlace()); + auto* loss_weight_data = + loss_weight->mutable_data(context.GetPlace()); + if (out_lods.size() - 1 > 0) { + Vector map_lods; + for (size_t i = 0; i < out_lods.size() - 1; i++) { + map_data[i * 3] = (int64_t)out_lods[i]; + map_data[i * 3 + 1] = mmap_aux[map_data[i * 3]]; + map_data[i * 3 + 2] = out_lods[i + 1] - out_lods[i]; + map_lods.push_back(i); + } + map_lods.push_back(out_lods.size() - 1); + std::vector> map_lod_info; + map_lod_info.push_back(map_lods); + + map->set_lod(map_lod_info); + loss_weight->set_lod(map_lod_info); + std::vector> out_lod_info; + out_lod_info.push_back(out_lods); + out->set_lod(out_lod_info); + memset(out_data, 0, out->numel() * sizeof(T)); + for (size_t i = 0; i < loss_weight->numel(); i++) { + loss_weight_data[i] = 1; + } + + for (size_t i = 0; i < out_lods.size() - 1; i++) { + size_t pos = out_lods[i]; + for (size_t k = map_data[i * 3 + 1]; + k < map_data[i * 3 + 1] + map_data[i * 3 + 2]; k++) { + memcpy(out_data + pos * x1_embed_size, x1_data + k * x1_embed_size, + x1_embed_size * sizeof(T)); + ++pos; + } + } + } else { + Vector map_lods; + map_data[0] = 0; + map_data[1] = 1; + map_data[2] = 1; + map_lods.push_back(0); + map_lods.push_back(1); + out_lods.push_back(1); + std::vector> map_lod_info; + map_lod_info.push_back(map_lods); + map->set_lod(map_lod_info); + loss_weight->set_lod(map_lod_info); + std::vector> out_lod_info; + out_lod_info.push_back(out_lods); + out->set_lod(out_lod_info); + memset(out_data, 0, out->numel() * sizeof(T)); + loss_weight_data[0] = 0; + } + } +}; + +template +class FilterByInstagGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* output_grad = context.Input(framework::GradVarName("Out")); + auto* x1_grad = context.Output(framework::GradVarName("Ins")); + auto* loss_weight = context.Input("LossWeight"); + auto* mmap = context.Input("IndexMap"); + auto* x1 = context.Input("Ins"); + x1_grad->set_lod(context.Input("Ins")->lod()); + x1_grad->Resize(x1->dims()); + auto mmap_data = mmap->data(); + // expected auto = T + auto* output_grad_data = output_grad->data(); + + auto* loss_weight_data = loss_weight->data(); + // expected auto = T + auto* x1_grad_data = x1_grad->mutable_data(context.GetPlace()); + memset(x1_grad_data, 0, x1->dims()[0] * x1->dims()[1] * sizeof(T)); + if (loss_weight->numel() != 1 || loss_weight_data[0] != 0) { + auto output_dims = output_grad->dims(); + for (size_t i = 0; i < mmap->dims()[0]; i++) { + int src_ln = mmap_data[i * 3], dst_ln = mmap_data[i * 3 + 1]; + int line_cnt = mmap_data[i * 3 + 2]; + for (size_t l 
= 0; l < line_cnt; l++) { + for (size_t j = 0; j < output_dims[1]; j++) { + x1_grad_data[(dst_ln + l) * output_dims[1] + j] = + output_grad_data[(src_ln + l) * output_dims[1] + j]; + } + } + } + } + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc index f4085daa..9f2a1222 100644 --- a/paddle/fluid/operators/flatten_op.cc +++ b/paddle/fluid/operators/flatten_op.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,6 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/flatten_op.h" +#include +#include +#include #include #include "paddle/fluid/framework/op_registry.h" @@ -20,18 +24,21 @@ namespace operators { using Tensor = framework::Tensor; -class FlattenOpInferShape : public framework::InferShapeBase { +class FlattenOp : public framework::OperatorWithKernel { public: - void operator()(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input (X) of Flatten op should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output (Output) of Flatten op should not be null."); + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, + "Input (X) of Flatten op should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, + "Output (Output) of Flatten op should not be null."); const auto &axis = ctx->Attrs().Get("axis"); const auto &in_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE(axis >= 0, "The axis should be greater than or equal to 0."); - PADDLE_ENFORCE( - axis <= in_dims.size(), + PADDLE_ENFORCE_GE(axis, 0, + "The axis should be greater than or equal to 0."); + PADDLE_ENFORCE_LE( + axis, in_dims.size(), "The axis should be less than or equal to input tensor's rank."); const auto &out_dims = GetOutputShape(axis, in_dims); @@ -58,28 +65,12 @@ class FlattenOpInferShape : public framework::InferShapeBase { out_shape[1] = inner; return out_shape; } -}; -class FlattenOp : public framework::OperatorBase { - public: - using OperatorBase::OperatorBase; - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - auto &axis = Attr("axis"); - auto in_dims = - scope.FindVar(Input("X"))->Get().dims(); - const auto &out_dims = FlattenOpInferShape::GetOutputShape(axis, in_dims); - - framework::AttributeMap attrs; - attrs["shape"] = out_dims; - attrs["inplace"] = false; - // Invoke Reshape Op - auto reshape_op = framework::OpRegistry::CreateOp( - "reshape", {{"X", {Input("X")}}, {"Shape", {}}}, - {{"Out", {Output("Out")}}}, attrs); - reshape_op->Run(scope, place); + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -126,34 +117,21 @@ Case 2: } }; -class FlattenGradInferShape : public framework::InferShapeBase { +class FlattenGradOp : public framework::OperatorWithKernel { public: - void operator()(framework::InferShapeContext *context) const 
override { + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *context) const override { context->SetOutputDim(framework::GradVarName("X"), context->GetInputDim("X")); context->ShareLoD("X", framework::GradVarName("X")); } -}; -class FlattenGradOp : public framework::OperatorBase { - public: - using OperatorBase::OperatorBase; - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - auto dx_name = Output(framework::GradVarName("X")); - auto dout_name = Input(framework::GradVarName("Out")); - auto in_dims = - scope.FindVar(Input("X"))->Get().dims(); - framework::AttributeMap attrs; - attrs["shape"] = framework::vectorize2int(in_dims); - attrs["inplace"] = false; - - auto reshape_op = framework::OpRegistry::CreateOp( - "reshape", {{"X", {dout_name}}, {"Shape", {}}}, {{"Out", {dx_name}}}, - attrs); - reshape_op->Run(scope, place); + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -162,13 +140,33 @@ class FlattenGradOp : public framework::OperatorBase { // flatten_grad, in this way, the framework can reuse the memory of X // immediately the flatten2_op is finished. // Considering compatibility issues, we could not fix flatten2_op -class Flatten2OpInferShape : public FlattenOpInferShape { +class Flatten2Op : public framework::OperatorWithKernel { public: - void operator()(framework::InferShapeContext *ctx) const override { - FlattenOpInferShape::operator()(ctx); - PADDLE_ENFORCE(ctx->HasOutput("XShape"), - "Output (XShape) of Flatten op should not be null."); + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, + "Input (X) of Flatten op should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, + "Output (Output) of Flatten op should not be null."); + const auto &axis = ctx->Attrs().Get("axis"); const auto &in_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_GE(axis, 0, + "The axis should be greater than or equal to 0."); + PADDLE_ENFORCE_LE( + axis, in_dims.size(), + "The axis should be less than or equal to input tensor's rank."); + + const auto &out_dims = FlattenOp::GetOutputShape(axis, in_dims); + ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); + if (in_dims[0] == out_dims[0]) { + // Only pass LoD when the first dimension of output and Input(X) + // are the same. 
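+      // (LoD records sequence offsets along dim 0, so it can only be
+      // forwarded when flattening keeps dim 0 intact; e.g. axis = 1 maps
+      // [N, C, H, W] to [N, C*H*W] and preserves N, hence the check above.)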
+ ctx->ShareLoD("X", "Out"); + } + + PADDLE_ENFORCE_EQ(ctx->HasOutput("XShape"), true, + "Output (XShape) of Flatten op should not be null."); std::vector xshape_dims(in_dims.size() + 1); xshape_dims[0] = 0; for (int i = 0; i < in_dims.size(); ++i) { @@ -179,29 +177,6 @@ class Flatten2OpInferShape : public FlattenOpInferShape { } }; -class Flatten2Op : public framework::OperatorBase { - public: - using OperatorBase::OperatorBase; - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - auto &axis = Attr("axis"); - auto in_dims = - scope.FindVar(Input("X"))->Get().dims(); - const auto &out_dims = FlattenOpInferShape::GetOutputShape(axis, in_dims); - - framework::AttributeMap attrs; - attrs["shape"] = out_dims; - attrs["inplace"] = false; - // Invoke Reshape Op - auto reshape_op = framework::OpRegistry::CreateOp( - "reshape2", {{"X", {Input("X")}}, {"Shape", {}}}, - {{"Out", {Output("Out")}}, {"XShape", {Output("XShape")}}}, attrs); - reshape_op->Run(scope, place); - } -}; - class Flatten2OpMaker : public FlattenOpMaker { public: void Make() override { @@ -228,76 +203,73 @@ class Flatten2GradOpMaker : public framework::SingleGradOpDescMaker { } }; -class Flatten2GradInferShape : public framework::InferShapeBase { +class Flatten2GradOp : public framework::OperatorWithKernel { public: - void operator()(framework::InferShapeContext *context) const override { - PADDLE_ENFORCE(context->HasInput("XShape"), - "Input(XShape) shouldn't be null."); - PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) shouldn't be null."); + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE_EQ(context->HasInput("XShape"), true, + "Input(XShape) shouldn't be null."); + PADDLE_ENFORCE_EQ(context->HasInput(framework::GradVarName("Out")), true, + "Input(Out@GRAD) shouldn't be null."); auto xshape_dims = context->GetInputDim("XShape"); auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); context->SetOutputDim(framework::GradVarName("X"), x_dims); context->ShareLoD("XShape", framework::GradVarName("X")); } -}; - -class Flatten2GradOp : public framework::OperatorBase { - public: - using OperatorBase::OperatorBase; - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - auto dx_name = Output(framework::GradVarName("X")); - auto dout_name = Input(framework::GradVarName("Out")); - auto xshape_name = Input("XShape"); - auto xshape_dims = - scope.FindVar(xshape_name)->Get().dims(); - auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); - - framework::AttributeMap attrs; - attrs["shape"] = framework::vectorize2int(x_dims); - attrs["inplace"] = false; - auto reshape_op = framework::OpRegistry::CreateOp( - "reshape2", {{"X", {dout_name}}, {"Shape", {}}}, - {{"Out", {dx_name}}, {"XShape", {xshape_name}}}, attrs); - reshape_op->Run(scope, place); + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.device_context()); } }; -class FlattenOpInplaceInToOut : public framework::InplaceOpInference { - public: - std::unordered_map operator()( - const framework::OpDesc &op_desc, bool use_cuda) const override { - return {{"X", "Out"}}; - } -}; - -class FlattenGradInplaceinToOut : public 
framework::InplaceOpInference { - public: - std::unordered_map operator()( - const framework::OpDesc &op_desc, bool use_cuda) const override { - return {{framework::GradVarName("Out"), framework::GradVarName("X")}}; - } -}; +DECLARE_INPLACE_OP_INFERER(FlattenOpInplaceInToOut, {"X", "Out"}); +DECLARE_INPLACE_OP_INFERER(FlattenGradInplaceinToOut, + {framework::GradVarName("Out"), + framework::GradVarName("X")}); } // namespace operators } // namespace paddle -USE_OP(reshape); - namespace ops = paddle::operators; REGISTER_OPERATOR(flatten, ops::FlattenOp, ops::FlattenOpMaker, - ops::FlattenOpInferShape, paddle::framework::DefaultGradOpDescMaker, ops::FlattenOpInplaceInToOut); -REGISTER_OPERATOR(flatten_grad, ops::FlattenGradOp, ops::FlattenGradInferShape, +REGISTER_OPERATOR(flatten_grad, ops::FlattenGradOp, ops::FlattenGradInplaceinToOut); REGISTER_OPERATOR(flatten2, ops::Flatten2Op, ops::Flatten2OpMaker, - ops::Flatten2OpInferShape, ops::Flatten2GradOpMaker, - ops::FlattenOpInplaceInToOut); + ops::Flatten2GradOpMaker, ops::FlattenOpInplaceInToOut); REGISTER_OPERATOR(flatten2_grad, ops::Flatten2GradOp, - ops::Flatten2GradInferShape, ops::FlattenGradInplaceinToOut); + ops::FlattenGradInplaceinToOut); + +REGISTER_OP_CPU_KERNEL( + flatten, ops::FlattenKernel, + ops::FlattenKernel, + ops::FlattenKernel, + ops::FlattenKernel, + ops::FlattenKernel); +REGISTER_OP_CPU_KERNEL( + flatten_grad, + ops::FlattenGradKernel, + ops::FlattenGradKernel, + ops::FlattenGradKernel, + ops::FlattenGradKernel, + ops::FlattenGradKernel); +REGISTER_OP_CPU_KERNEL( + flatten2, ops::Flatten2Kernel, + ops::Flatten2Kernel, + ops::Flatten2Kernel, + ops::Flatten2Kernel, + ops::Flatten2Kernel); +REGISTER_OP_CPU_KERNEL( + flatten2_grad, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel); diff --git a/paddle/fluid/operators/flatten_op.cu.cc b/paddle/fluid/operators/flatten_op.cu.cc new file mode 100644 index 00000000..ac4ad8e2 --- /dev/null +++ b/paddle/fluid/operators/flatten_op.cu.cc @@ -0,0 +1,44 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/flatten_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + flatten, ops::FlattenKernel, + ops::FlattenKernel, + ops::FlattenKernel, + ops::FlattenKernel, + ops::FlattenKernel); +REGISTER_OP_CUDA_KERNEL( + flatten_grad, + ops::FlattenGradKernel, + ops::FlattenGradKernel, + ops::FlattenGradKernel, + ops::FlattenGradKernel, + ops::FlattenGradKernel); +REGISTER_OP_CUDA_KERNEL( + flatten2, ops::Flatten2Kernel, + ops::Flatten2Kernel, + ops::Flatten2Kernel, + ops::Flatten2Kernel, + ops::Flatten2Kernel); +REGISTER_OP_CUDA_KERNEL( + flatten2_grad, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel); diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h new file mode 100644 index 00000000..165832c0 --- /dev/null +++ b/paddle/fluid/operators/flatten_op.h @@ -0,0 +1,116 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/pooling.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +template +class FlattenKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *in = context.Input("X"); + auto *out = context.Output("Out"); + + auto &axes = context.Attr("axis"); + auto x_dims = in->dims(); + auto out_dims = framework::make_ddim(GetOutputShape(axes, x_dims)); + + out->mutable_data(context.GetPlace(), in->type()); + framework::TensorCopy( + *in, context.GetPlace(), + context.template device_context(), out); + out->Resize(out_dims); + } + + static std::vector GetOutputShape(const int axis, + const framework::DDim &in_dims) { + int64_t outer = 1, inner = 1; + for (int i = 0; i < in_dims.size(); ++i) { + if (i < axis) { + outer *= in_dims[i]; + } else { + inner *= in_dims[i]; + } + } + std::vector out_shape(2); + out_shape[0] = outer; + out_shape[1] = inner; + return out_shape; + } +}; + +template +class FlattenGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_out = + ctx.Input(framework::GradVarName("Out")); + auto in_dims = ctx.Input("X")->dims(); + + d_x->mutable_data(ctx.GetPlace(), d_out->type()); + framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x); + d_x->Resize(in_dims); + } +}; + +template +class Flatten2Kernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto &axes = context.Attr("axis"); + + auto *in = context.Input("X"); + auto x_dims = in->dims(); + + auto *out = context.Output("Out"); + + auto out_dims = framework::make_ddim( + 
FlattenKernel::GetOutputShape(axes, x_dims)); + + out->mutable_data(context.GetPlace(), in->type()); + framework::TensorCopy( + *in, context.GetPlace(), + context.template device_context(), out); + out->Resize(out_dims); + } +}; + +template +class Flatten2GradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_out = + ctx.Input(framework::GradVarName("Out")); + + auto xshape_dims = ctx.Input("XShape")->dims(); + auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + + d_x->mutable_data(ctx.GetPlace(), d_out->type()); + framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x); + d_x->Resize(x_dims); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 42ab8e99..a31531c5 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -1,5 +1,5 @@ include(operators) -register_operators(EXCLUDES fusion_transpose_flatten_concat_op fusion_conv_inception_op) +register_operators(EXCLUDES fusion_transpose_flatten_concat_op fusion_conv_inception_op fused_fc_elementwise_layernorm_op) if (WITH_GPU) op_library(fusion_transpose_flatten_concat_op) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fusion_transpose_flatten_concat);\n") @@ -7,4 +7,6 @@ if (WITH_GPU) op_library(fusion_conv_inception_op) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_inception_fusion);\n") endif() + op_library(fused_fc_elementwise_layernorm_op) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fused_fc_elementwise_layernorm);\n") endif() diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc index 35a30854..4c13d394 100644 --- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/cpu_vec.h" -#include "paddle/fluid/operators/math/fc_compute.h" #include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/fluid/platform/cpu_info.h" @@ -589,8 +588,7 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OPERATOR(fused_embedding_fc_lstm, ops::FusedEmbeddingFCLSTMOp, - ops::FusedEmbeddingFCLSTMOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::FusedEmbeddingFCLSTMOpMaker); REGISTER_OP_CPU_KERNEL(fused_embedding_fc_lstm, ops::FusedEmbeddingFCLSTMKernel, diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc index 3ee962d3..91100990 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc @@ -78,6 +78,12 @@ class FusedEmbeddingSeqPoolOpMaker : public framework::OpProtoAndCheckerMaker { "are supported, sum computes the weighted sum of the " "embedding results for each row.") .SetDefault("sum"); + AddAttr("padding_idx", + "(int64, default -1) " + "If the value is -1, it makes no effect to lookup. 
" + "Otherwise the given value indicates padding the output " + "with zeros whenever lookup encounters it in Ids.") + .SetDefault(kNoPadding); // NOTE(minqiyang): grad_inplace is an temporal attribute, // please do NOT set this attribute in python layer. AddAttr("grad_inplace", diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h index 4651c2b2..3fffdf7e 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include #include @@ -22,6 +23,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/operators/jit/kernels.h" +#include "paddle/fluid/operators/math/blas.h" namespace paddle { namespace operators { @@ -31,6 +33,49 @@ using LoDTensor = framework::LoDTensor; using SelectedRows = framework::SelectedRows; using DDim = framework::DDim; +constexpr int64_t kNoPadding = -1; + +#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ + !defined(__OSX__) +template +void prepare_csr_data(const std::vector &offset, + const int64_t *ids_data, const size_t idx_width, + T *csr_vals, int *csr_colmuns, int *csr_row_idx, + int64_t padding_idx = kNoPadding) { + int val_idx = 0; + int row_idx = 0; + csr_row_idx[0] = 0; + + std::map ids_map; + + // for each sequence in batch + for (size_t i = 0; i < offset.size() - 1; ++i) { + for (size_t idx = 0; idx < idx_width; ++idx) { + ids_map.clear(); + + // construct a map for creating csr + for (size_t j = offset[i]; j < offset[i + 1]; ++j) { + auto ids_value = ids_data[idx + j * idx_width]; + if (ids_value != padding_idx) { + unsigned int word_idx = static_cast(ids_value); + ++ids_map[word_idx]; + } + } + + VLOG(4) << "====sequence %d====" << i; + for (std::map::const_iterator it = ids_map.begin(); + it != ids_map.end(); ++it) { + VLOG(4) << it->first << " => " << it->second; + csr_vals[val_idx] = it->second; + csr_colmuns[val_idx] = it->first; + ++val_idx; + } + csr_row_idx[row_idx + 1] = csr_row_idx[row_idx] + ids_map.size(); + ++row_idx; + } + } +} +#else template struct EmbeddingVSumFunctor { void operator()(const framework::ExecutionContext &context, @@ -60,6 +105,7 @@ struct EmbeddingVSumFunctor { } } }; +#endif inline int FusedEmbeddingSeqPoolLastDim(const framework::DDim &table_dims, const framework::DDim &ids_dims) { @@ -83,16 +129,53 @@ class FusedEmbeddingSeqPoolKernel : public framework::OpKernel { FusedEmbeddingSeqPoolLastDim(table_var->dims(), ids_t->dims()); const auto &ids_lod = ids_t->lod(); // in run time, the LoD of ids must be 1 - PADDLE_ENFORCE(ids_lod.size(), 1UL, - "The LoD level of Input(Ids) must be 1"); + PADDLE_ENFORCE_EQ(ids_lod.size(), 1UL, + "The LoD level of Input(Ids) must be 1"); int64_t batch_size = ids_lod[0].size() - 1; // in run time, the shape from Ids -> output // should be [seq_length, 1] -> [batch_size, last_dim] output_t->Resize({batch_size, last_dim}); if (combiner_type == "sum") { +#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ + !defined(__OSX__) + int64_t padding_idx = context.Attr("padding_idx"); + auto output = output_t->mutable_data(context.GetPlace()); + int64_t table_height = table_var->dims()[0]; + int64_t table_width = table_var->dims()[1]; + auto weights = table_var->data(); + + const std::vector offset = ids_lod[0]; + 
auto len = ids_t->numel(); + int idx_width = len / offset.back(); + + Tensor csr_vals_t, csr_colmuns_t, csr_row_idx_t; + csr_vals_t.Resize({len}); + csr_colmuns_t.Resize({len}); + csr_row_idx_t.Resize({(batch_size + 1) * idx_width}); + auto csr_vals = csr_vals_t.mutable_data(context.GetPlace()); + auto csr_colmuns = csr_colmuns_t.mutable_data(context.GetPlace()); + auto csr_row_idx = csr_row_idx_t.mutable_data(context.GetPlace()); + prepare_csr_data(offset, ids_t->data(), idx_width, csr_vals, + csr_colmuns, csr_row_idx, padding_idx); + + const char transa = 'N'; + const T alpha = 1.0; + const T beta = 0.0; + const char matdescra[] = {'G', 'L', 'N', 'C'}; + + const int m = batch_size * idx_width; + const int n = table_width; + const int k = table_height; + auto blas = math::GetBlas(context); + blas.CSRMM(&transa, &m, &n, &k, &alpha, matdescra, (const T *)csr_vals, + (const int *)csr_colmuns, (const int *)csr_row_idx, + (const int *)csr_row_idx + 1, weights, &n, &beta, output, &n); + +#else EmbeddingVSumFunctor functor; functor(context, table_var, ids_t, output_t); +#endif } } }; @@ -148,7 +231,51 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { vbroadcast(src, dst, h, out_width); } } else { +#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \ + !defined(__OSX__) + auto *ids = context.Input("Ids"); + auto *d_output = context.Input(framework::GradVarName("Out")); + auto *d_table = context.Output(framework::GradVarName("W")); + int64_t padding_idx = context.Attr("padding_idx"); + + d_table->Resize(table_dim); + auto *d_table_data = d_table->mutable_data(context.GetPlace()); + memset(d_table_data, 0, d_table->numel() * sizeof(T)); + + const auto &ids_lod = ids->lod(); + PADDLE_ENFORCE_EQ(ids_lod.size(), 1UL, + "The LoD level of Input(Ids) must be 1"); + const std::vector offset = ids_lod[0]; + auto len = ids->numel(); + int idx_width = len / offset.back(); + + Tensor csr_vals_t, csr_colmuns_t, csr_row_idx_t; + csr_vals_t.Resize({len}); + csr_colmuns_t.Resize({len}); + int64_t batch_size = ids_lod[0].size() - 1; + csr_row_idx_t.Resize({(batch_size + 1) * idx_width}); + auto csr_vals = csr_vals_t.mutable_data(context.GetPlace()); + auto csr_colmuns = csr_colmuns_t.mutable_data(context.GetPlace()); + auto csr_row_idx = csr_row_idx_t.mutable_data(context.GetPlace()); + prepare_csr_data(offset, ids->data(), idx_width, csr_vals, + csr_colmuns, csr_row_idx, padding_idx); + + auto *d_output_data = d_output->data(); + auto blas = math::GetBlas(context); + int width = static_cast(table_dim[1]); + int num_seq = batch_size * idx_width; + LOG(INFO) << "num seq = " << num_seq << " width = " << width; + for (int i = 0; i < num_seq; ++i) { + for (int j = csr_row_idx[i]; j < csr_row_idx[i + 1]; ++j) { + unsigned int word_idx = csr_colmuns[j]; + T val = csr_vals[j]; + blas.AXPY(width, val, d_output_data + i * width, + d_table_data + word_idx * width); + } + } +#else LOG(ERROR) << "Dense is not supported in fused_embedding_seq_pool_op now"; +#endif } } }; diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc new file mode 100644 index 00000000..7c5d0c71 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc @@ -0,0 +1,185 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+class FusedFCElementwiseLayerNormOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE_EQ(
+        ctx->HasInput("X"), true,
+        "Input(X) of fused_fc_elementwise_layernorm should not be null.");
+    PADDLE_ENFORCE_EQ(
+        ctx->HasInput("W"), true,
+        "Input(W) of fused_fc_elementwise_layernorm should not be null.");
+    PADDLE_ENFORCE_EQ(
+        ctx->HasInput("Y"), true,
+        "Input(Y) of fused_fc_elementwise_layernorm should not be null.");
+    PADDLE_ENFORCE_EQ(
+        ctx->HasOutput("Out"), true,
+        "Output(Out) of fused_fc_elementwise_layernorm should not be null.");
+
+    auto w_dims = ctx->GetInputDim("W");
+    PADDLE_ENFORCE_EQ(w_dims.size(), 2,
+                      "The fully connected weight W should be a 2-D tensor.");
+
+    if (ctx->HasInput("Bias0")) {
+      auto bias0_dims = ctx->GetInputDim("Bias0");
+      if (bias0_dims.size() == 2) {
+        PADDLE_ENFORCE_EQ(bias0_dims[0], 1,
+                          "The shape of Bias0 must be [1, dim].");
+        PADDLE_ENFORCE_EQ(bias0_dims[1], w_dims[1],
+                          "The shape of Bias0 must be [1, dim].");
+      } else if (bias0_dims.size() == 1) {
+        PADDLE_ENFORCE_EQ(bias0_dims[0], w_dims[1],
+                          "The shape of Bias0 must be [1, dim].");
+      }
+    }
+
+    auto x_dims = ctx->GetInputDim("X");
+    int x_num_col_dims = ctx->Attrs().Get<int>("x_num_col_dims");
+    PADDLE_ENFORCE_GT(
+        x_dims.size(), x_num_col_dims,
+        "The rank of Input(X) should be larger than x_num_col_dims.");
+
+    auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims);
+    PADDLE_ENFORCE_EQ(
+        x_mat_dims[1], w_dims[0],
+        "The fully connected input and weight sizes do not match: %s vs %s.");
+
+    std::vector<int64_t> fc_out_dims;
+    for (int i = 0; i < x_num_col_dims; ++i) {
+      fc_out_dims.push_back(x_dims[i]);
+    }
+    fc_out_dims.push_back(w_dims[1]);
+
+    auto y_dims = ctx->GetInputDim("Y");
+    PADDLE_ENFORCE_EQ(framework::make_ddim(fc_out_dims), y_dims);
+
+    auto begin_norm_axis = ctx->Attrs().Get<int>("begin_norm_axis");
+    PADDLE_ENFORCE_LT(
+        begin_norm_axis, y_dims.size(),
+        "'begin_norm_axis' must be less than the rank of Input(Y).");
+
+    auto y_mat_dim = framework::flatten_to_2d(y_dims, begin_norm_axis);
+    int64_t dim_0 = y_mat_dim[0];
+    int64_t dim_1 = y_mat_dim[1];
+    if (ctx->HasInput("Scale")) {
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1);
+
+      if (ctx->IsRuntime()) {
+        PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], dim_1,
+                          "The first dimension of Scale must equal the "
+                          "normalized size of Input(Y).");
+      }
+    }
+    if (ctx->HasInput("Bias1")) {
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias1").size(), 1);
+      if (ctx->IsRuntime()) {
+        PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias1")[0], dim_1,
+                          "The first dimension of Bias1 must equal the "
+                          "normalized size of Input(Y).");
+      }
+    }
+
+    ctx->SetOutputDim("Out", y_dims);
+    if (ctx->HasOutput("Mean")) {
+      ctx->SetOutputDim("Mean", {dim_0});
+    }
+    if (ctx->HasOutput("Variance")) {
+      ctx->SetOutputDim("Variance", {dim_0});
+    }
+    ctx->ShareLoD("X", "Out");
+  }
+};
+
+class FusedFCElementwiseLayerNormOpMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor), The input tensor of fully connected operation");
+    AddInput("W",
+             "(Tensor), The weight tensor of fully connected operation. It is "
+             "a 2-D Tensor with shape (I, O)");
+    AddInput("Bias0",
+             "(Tensor, optional), The bias tensor of fully connected "
+             "operation. It is a 1-D Tensor with shape (O), or a 2-D Tensor "
+             "with shape (1, O).")
+        .AsDispensable();
+    AddInput("Y",
+             "(Tensor), The second input tensor of elementwise_add operation. "
+             "Note that the shape should be the same as the fully connected "
+             "result tensor.");
+    AddInput(
+        "Scale",
+        "(Tensor, optional), It is a 1-D input Tensor of layer_norm operation.")
+        .AsDispensable();
+    AddInput(
+        "Bias1",
+        "(Tensor, optional), It is a 1-D input Tensor of layer_norm operation.")
+        .AsDispensable();
+    AddOutput("Out",
+              "(Tensor), Output after normalization. The shape is the same as "
+              "layer_norm's input.");
+    AddOutput("Mean", "(Tensor, optional), Mean of the current minibatch")
+        .AsDispensable();
+    AddOutput("Variance",
+              "(Tensor, optional), Variance of the current minibatch")
+        .AsDispensable();
+    AddAttr<int>("x_num_col_dims",
+                 "(int, default 1), This op can take tensors with more than "
+                 "two dimensions as its inputs.")
+        .SetDefault(1)
+        .EqualGreaterThan(1);
+    AddAttr<std::string>("activation_type",
+                         "Activation type used in fully connected operator.")
+        .SetDefault("");
+    AddAttr<float>("epsilon",
+                   "Constant for numerical stability [default 1e-5].")
+        .SetDefault(1e-5)
+        .AddCustomChecker([](const float &epsilon) {
+          PADDLE_ENFORCE_GE(epsilon, 0.0f,
+                            "'epsilon' should be between 0.0 and 0.001.");
+          PADDLE_ENFORCE_LE(epsilon, 0.001f,
+                            "'epsilon' should be between 0.0 and 0.001.");
+        });
+    AddAttr<int>("begin_norm_axis",
+                 "the axis of `begin_norm_axis ... Rank(Y) - 1` will be "
+                 "normalized. `begin_norm_axis` splits the tensor(`X`) to a "
+                 "matrix [N,H]. 
[default 1].") + .SetDefault(1) + .AddCustomChecker([](const int &begin_norm_axis) { + PADDLE_ENFORCE_GT(begin_norm_axis, 0, + "'begin_norm_axis' should be greater than zero."); + }); + AddComment(R"DOC( +fc_out <= fc(X, W, Bias0) +add_out <= elementwise_add(fc_out, Y) +(out, mean, variance) <= layer_norm(add_out, Scale, Bias1) +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fused_fc_elementwise_layernorm, + ops::FusedFCElementwiseLayerNormOp, + ops::FusedFCElementwiseLayerNormOpMaker, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu new file mode 100644 index 00000000..74d34525 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu @@ -0,0 +1,201 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/platform/cuda_device_function.h" + +namespace paddle { +namespace operators { + +template +static __device__ __forceinline__ T Relu(T x) { + return (x > 0) ? x : 0; +} + +static __device__ __forceinline__ float RealSqrt(float x) { return sqrtf(x); } +static __device__ __forceinline__ double RealSqrt(double x) { return sqrt(x); } + +template +struct PairForLayerNorm { + __device__ __forceinline__ PairForLayerNorm() {} + __device__ __forceinline__ PairForLayerNorm(const T& first, const T& second) + : first_(first), second_(second) {} + + T first_; + T second_; +}; + +template +struct PairForLayerNormAddFunctor { + __device__ __forceinline__ PairForLayerNorm operator()( + const PairForLayerNorm& p1, const PairForLayerNorm& p2) { + return PairForLayerNorm(p1.first_ + p2.first_, p1.second_ + p2.second_); + } +}; + +template +__global__ void InplaceAddReluAddLayerNormKernel(const T* y, const T* bias_0, + const T* bias_1, + const T* scale, T* out, + T* mean, T* variance, int M, + int N, float epsilon) { + using BlockReduce = cub::BlockReduce, BlockDim>; + __shared__ typename BlockReduce::TempStorage temp_storage; + __shared__ T shared_mem[BlockDim + 2]; + + for (int i = blockIdx.x; i < M; i += gridDim.x) { + int index = i * N + threadIdx.x; + + // The fisrt BlockDim elements will be saved to shared memory. + int save_index = threadIdx.x; + T* save_ptr = shared_mem; + + double sum_i = 0; + double square_sum_i = 0; + for (int j = threadIdx.x; j < N; j += blockDim.x) { + T tmp_0 = out[index]; + // Add bias + T tmp_1 = bias_0 ? tmp_0 + bias_0[j] : tmp_0; + // Relu + T tmp_2 = DoRelu ? 
Relu(tmp_1) : tmp_1; + // elementwise_add + T tmp_3 = tmp_2 + y[index]; + + // Save + save_ptr[save_index] = tmp_3; + save_ptr = out; + + index += blockDim.x; + save_index = index; + + // For layer_norm, reduce to calculate mean and std + sum_i += tmp_3; + square_sum_i += (tmp_3 * tmp_3); + } + + auto pair = BlockReduce(temp_storage) + .Reduce(PairForLayerNorm(sum_i, square_sum_i), + PairForLayerNormAddFunctor()); + + if (threadIdx.x == 0) { + T mean_i = static_cast(pair.first_ / N); + T variance_i = static_cast(pair.second_ / N - mean_i * mean_i); + shared_mem[BlockDim] = mean_i; + shared_mem[BlockDim + 1] = variance_i; + if (mean) { + mean[blockIdx.x] = mean_i; + } + if (variance) { + variance[blockIdx.x] = variance_i; + } + } + __syncthreads(); + T mean_i = shared_mem[BlockDim]; + T std_i = static_cast(RealSqrt(shared_mem[BlockDim + 1] + epsilon)); + + index = i * N + threadIdx.x; + // First BlockDim elements loading from shared memory. + save_index = threadIdx.x; + save_ptr = shared_mem; + + // For layer_norm, calculate out + for (int j = threadIdx.x; j < N; j += blockDim.x) { + T tmp_0 = (save_ptr[save_index] - mean_i) / std_i; + T tmp_1 = scale ? scale[j] * tmp_0 : tmp_0; + out[index] = bias_1 ? tmp_1 + bias_1[j] : tmp_1; + + save_ptr = out; + index += blockDim.x; + save_index = index; + } + } +} + +template +class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* w = ctx.Input("W"); + auto* out = ctx.Output("Out"); + + auto w_dims = w->dims(); + int N = w_dims[1]; + int K = w_dims[0]; + int M = framework::product(x->dims()) / K; + + const T* x_data = x->data(); + const T* w_data = w->data(); + T* out_data = out->mutable_data(ctx.GetPlace()); + + auto& dev_ctx = ctx.template device_context(); + auto blas = math::GetBlas(dev_ctx); + blas.GEMM(false, false, M, N, K, static_cast(1.0), x_data, K, w_data, N, + static_cast(0.0), out_data, N); + + auto* y = ctx.Input("Y"); + auto* bias_0 = ctx.Input("Bias0"); + auto* bias_1 = ctx.Input("Bias1"); + auto* scale = ctx.Input("Scale"); + + const T* y_data = y->data(); + const T* bias_0_data = bias_0 ? bias_0->data() : nullptr; + const T* bias_1_data = bias_1 ? bias_1->data() : nullptr; + const T* scale_data = scale ? scale->data() : nullptr; + + auto* mean = ctx.Output("Mean"); + auto* variance = ctx.Output("Variance"); + + T* mean_data = mean ? mean->mutable_data(ctx.GetPlace()) : nullptr; + T* variance_data = + variance ? variance->mutable_data(ctx.GetPlace()) : nullptr; + + bool with_relu = + (ctx.Attr("activation_type") == "relu") ? 
true : false; + float epsilon = ctx.Attr("epsilon"); + + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + if (with_relu) { + switch (platform::RoundToPowerOfTwo(N)) { + CUDA_LAUNCH_KERNEL_HELPER( + InplaceAddReluAddLayerNormKernel< + T, true, + kPowerOfTwoDim><<>>( + y_data, bias_0_data, bias_1_data, scale_data, out_data, + mean_data, variance_data, M, N, epsilon)); + } + } else { + switch (platform::RoundToPowerOfTwo(N)) { + CUDA_LAUNCH_KERNEL_HELPER( + InplaceAddReluAddLayerNormKernel< + T, false, + kPowerOfTwoDim><<>>( + y_data, bias_0_data, bias_1_data, scale_data, out_data, + mean_data, variance_data, M, N, epsilon)); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(fused_fc_elementwise_layernorm, + ops::FusedFCElementwiseLayerNormOpKernel, + ops::FusedFCElementwiseLayerNormOpKernel); diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu index 76ea6f1b..63e97ab5 100644 --- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu @@ -61,7 +61,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { T* temp_data = temp_outs[0]->mutable_data(input->dims(), ctx.GetPlace()); DataLayout layout = DataLayout::kNCHW; - std::vector in_dim = framework::vectorize2int(input->dims()); + std::vector in_dim = framework::vectorize(input->dims()); // ------------------- cudnn descriptors --------------------- PoolingMode pooling_mode; @@ -83,9 +83,9 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { pool_desc.descriptor(pooling_mode, k3x3, k1x1, k1x1); cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, framework::vectorize2int(input->dims())); + layout, framework::vectorize(input->dims())); cudnnTensorDescriptor_t pool_out_desc = out_pool_desc.descriptor( - layout, framework::vectorize2int(input->dims())); + layout, framework::vectorize(input->dims())); cudnnDataType_t cudnn_dtype = CudnnDataType::type; cudnnTensorDescriptor_t* out_desc = new cudnnTensorDescriptor_t[4]; @@ -126,7 +126,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { : CUDNN_DATA_FLOAT; for (int i = 0; i < 4; ++i) { - filter_dims.push_back(framework::vectorize2int(filters[i]->dims())); + filter_dims.push_back(framework::vectorize(filters[i]->dims())); CUDNN_ENFORCE(platform::dynload::cudnnSetFilterNdDescriptor( filter_desc[i], cudnn_dtype, format, 4, filter_dims[i].data())); bias_dims.push_back({1, filter_dims[i][0], 1, 1}); diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index 56c41ef2..5c895099 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -17,7 +17,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/fc_compute.h" +#include "paddle/fluid/operators/math/fc.h" #include "paddle/fluid/operators/math/sequence2batch.h" namespace paddle { @@ -219,9 +219,11 @@ class FusionGRUKernel : public framework::OpKernel { const T* wh_state_data = wh_data + D * D2; T* hidden_out_data = hidden_out->mutable_data(place); auto blas = math::GetBlas(ctx); - math::FCCompute(blas, total_T, D3, M, x_data, wx_data, - xx_data, - bias ? 
bias->data() : nullptr); + + auto& dev_ctx = ctx.template device_context(); + math::FCFunctor fc; + fc(dev_ctx, total_T, D3, M, x_data, wx_data, xx_data, + bias ? bias->data() : nullptr); int xx_offset = D3; int gate_offset = D; @@ -290,17 +292,17 @@ class FusionGRUKernel : public framework::OpKernel { auto& dev_ctx = ctx.template device_context(); auto blas = math::GetBlas(dev_ctx); math::LoDTensor2BatchFunctor to_batch; + + math::FCFunctor fc; if (M > D3) { - math::FCCompute(blas, total_T, D3, M, x_data, wx_data, - xx_data, - bias ? bias->data() : nullptr); + fc(dev_ctx, total_T, D3, M, x_data, wx_data, xx_data, + bias ? bias->data() : nullptr); to_batch(dev_ctx, *xx, batched_input, true, is_reverse); } else { to_batch(dev_ctx, *x, xx, true, is_reverse); batched_input->set_lod(xx->lod()); - math::FCCompute(blas, total_T, D3, M, xx_data, wx_data, - batched_input_data, - bias ? bias->data() : nullptr); + fc(dev_ctx, total_T, D3, M, xx_data, wx_data, batched_input_data, + bias ? bias->data() : nullptr); } auto batched_lod = batched_input->lod(); @@ -396,7 +398,7 @@ class FusionGRUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(fusion_gru, ops::FusionGRUOp, ops::FusionGRUOpMaker, - paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(fusion_gru, ops::FusionGRUOp, ops::FusionGRUOpMaker); + REGISTER_OP_CPU_KERNEL(fusion_gru, ops::FusionGRUKernel, ops::FusionGRUKernel); diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index 1a31fc78..32f0e37a 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/fc_compute.h" +#include "paddle/fluid/operators/math/fc.h" #include "paddle/fluid/operators/math/sequence2batch.h" namespace paddle { @@ -281,8 +281,10 @@ class FuisonLSTMKernel : public framework::OpKernel { T* h_out_data = hidden_out->mutable_data(place); T* c_out_data = cell_out->mutable_data(place); auto blas = math::GetBlas(ctx); - math::FCCompute(blas, total_T, D4, M, x_data, wx_data, - xx_data, bias->data()); + + auto& dev_ctx = ctx.template device_context(); + math::FCFunctor fc; + fc(dev_ctx, total_T, D4, M, x_data, wx_data, xx_data, bias->data()); int xx_offset = D4; int gate_offset = D; @@ -359,16 +361,15 @@ class FuisonLSTMKernel : public framework::OpKernel { math::LoDTensor2BatchFunctor to_batch; auto& dev_ctx = ctx.template device_context(); auto blas = math::GetBlas(dev_ctx); + math::FCFunctor fc; if (M > D4) { - math::FCCompute(blas, x_dims[0], D4, M, x_data, wx_data, - xx_data, bias->data()); + fc(dev_ctx, x_dims[0], D4, M, x_data, wx_data, xx_data, bias->data()); to_batch(dev_ctx, *xx, batched_input, true, is_reverse); } else { to_batch(dev_ctx, *x, xx, true, is_reverse); batched_input->set_lod(xx->lod()); - math::FCCompute(blas, x_dims[0], D4, M, xx_data, - wx_data, batched_input_data, - bias->data()); + fc(dev_ctx, x_dims[0], D4, M, xx_data, wx_data, batched_input_data, + bias->data()); } auto batched_lod = batched_input->lod(); @@ -474,8 +475,7 @@ class FuisonLSTMKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(fusion_lstm, ops::FusionLSTMOp, ops::FusionLSTMOpMaker, - paddle::framework::DefaultGradOpDescMaker); 
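+// NOTE: DefaultGradOpDescMaker is dropped from the fusion op registrations
+// in this patch, presumably because these fused kernels are forward-only
+// (no grad kernel is registered), so a synthesized grad op could never run.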
+REGISTER_OPERATOR(fusion_lstm, ops::FusionLSTMOp, ops::FusionLSTMOpMaker); REGISTER_OP_CPU_KERNEL(fusion_lstm, ops::FuisonLSTMKernel, ops::FuisonLSTMKernel); diff --git a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc index 6be35de6..4c11482f 100644 --- a/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc +++ b/paddle/fluid/operators/fused/fusion_repeated_fc_relu_op.cc @@ -144,8 +144,7 @@ class FusionRepeatedFCReluKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OPERATOR(fusion_repeated_fc_relu, ops::FusionRepeatedFCReluOp, - ops::FusionRepeatedFCReluOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::FusionRepeatedFCReluOpMaker); REGISTER_OP_CPU_KERNEL(fusion_repeated_fc_relu, ops::FusionRepeatedFCReluKernel, diff --git a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc index b05329cf..519670cc 100644 --- a/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include // for min, max #include #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/fc_compute.h" +#include "paddle/fluid/operators/math/fc.h" namespace paddle { namespace operators { @@ -209,9 +209,9 @@ class FusionSeqConvEltAddReluKernel : public framework::OpKernel { } } auto& dev_ctx = ctx.template device_context(); - auto blas = math::GetBlas(dev_ctx); - math::FCCompute(blas, x_dims[0], w_dims[1], w_dims[0], - col_data, w_data, y_data, b_data, true); + math::FCFunctor fc; + fc(dev_ctx, x_dims[0], w_dims[1], w_dims[0], col_data, w_data, y_data, + b_data, true); } }; @@ -220,8 +220,7 @@ class FusionSeqConvEltAddReluKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OPERATOR(fusion_seqconv_eltadd_relu, ops::FusionSeqConvEltAddReluOp, - ops::FusionSeqConvEltAddReluOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::FusionSeqConvEltAddReluOpMaker); REGISTER_OP_CPU_KERNEL(fusion_seqconv_eltadd_relu, ops::FusionSeqConvEltAddReluKernel, diff --git a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc index d091da5a..95a08d3b 100644 --- a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/cpu_vec.h" -#include "paddle/fluid/operators/math/fc_compute.h" +#include "paddle/fluid/operators/math/fc.h" #include "paddle/fluid/platform/cpu_info.h" namespace paddle { @@ -165,8 +165,11 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel { T* fc_out_data = fc_out->mutable_data(ctx.GetPlace()); auto blas = math::GetBlas(ctx); - math::FCCompute(blas, total_T, D, M0, ref_in_data, w_data, - out_data, b ? b->data() : NULL); + + auto& dev_ctx = ctx.template device_context(); + math::FCFunctor fc; + fc(dev_ctx, total_T, D, M0, ref_in_data, w_data, out_data, + b ? 
b->data() : NULL); w_data = w_data + M0 * D; // first write on blas.MatMul(N, D, M1, in1_data, w_data, fc_out_data); @@ -197,8 +200,7 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OPERATOR(fusion_seqexpand_concat_fc, ops::FusionSeqExpandConcatFCOp, - ops::FusionSeqExpandConcatFCOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::FusionSeqExpandConcatFCOpMaker); REGISTER_OP_CPU_KERNEL(fusion_seqexpand_concat_fc, ops::FusionSeqExpandConcatFCOpKernel, diff --git a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc index 25916768..b14ee88a 100644 --- a/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqpool_concat_op.cc @@ -126,8 +126,7 @@ class FusionSeqPoolConcatKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OPERATOR(fusion_seqpool_concat, ops::FusionSeqPoolConcatOp, - ops::FusionSeqPoolConcatOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::FusionSeqPoolConcatOpMaker); REGISTER_OP_CPU_KERNEL(fusion_seqpool_concat, ops::FusionSeqPoolConcatKernel, diff --git a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc b/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc new file mode 100644 index 00000000..14e327bb --- /dev/null +++ b/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.cc @@ -0,0 +1,148 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.h" +#include +#include +#include "paddle/fluid/operators/jit/kernels.h" + +namespace paddle { +namespace operators { + +void FusionSeqPoolCVMConcatOp::InferShape( + framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE_GE( + ctx->Inputs("X").size(), 1UL, + "Inputs(X) of FusionSeqPoolCVMConcatOp should not be empty."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of FusionSeqPoolCVMConcatOp should not be null."); + int axis = ctx->Attrs().Get("axis"); + PADDLE_ENFORCE_EQ( + axis, 1, "FusionSeqPoolCVMConcatOp only supports concat axis=1 yet."); + bool use_cvm = ctx->Attrs().Get("use_cvm"); + PADDLE_ENFORCE_EQ( + use_cvm, true, + "FusionSeqPoolCVMConcatOp only supports use_cvm is true yet."); + + auto ins_dims = ctx->GetInputsDim("X"); + const size_t n = ins_dims.size(); + PADDLE_ENFORCE_GT(n, 0UL, "Input tensors count should > 0."); + if (n == 1) { + LOG(WARNING) << "Only have one input, may waste memory"; + } + + // The output height should be confirmed in Compute, + // since input lod is not accessible here. 
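+  // Illustration (hypothetical shapes): with n = 3 inputs, each of
+  // compile-time shape [-1, 8], and axis = 1, the SetOutputDim call
+  // below yields [-1, 8 * 3] = [-1, 24]; the -1 batch dimension is
+  // resolved from the input LoD only once the kernel runs.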
+ PADDLE_ENFORCE_EQ(ins_dims[0].size(), 2, + "The dims size of first input should be 2."); + ctx->SetOutputDim("Out", {-1, ins_dims[0][axis] * static_cast(n)}); +} + +framework::OpKernelType FusionSeqPoolCVMConcatOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + return framework::OpKernelType( + framework::GetDataTypeOfVar(ctx.MultiInputVar("X")[0]), ctx.GetPlace()); +} + +void FusionSeqPoolCVMConcatOpMaker::Make() { + AddInput("X", "(LoDTensor) Input tensors of this operator.").AsDuplicable(); + AddInput("CVM", + "(Tensor), a 2-D Tensor with shape [N x 2], where N is the batch " + "size, 2 is show and click."); + AddOutput("Out", "(LoDTensor) Output tensor of concat operator."); + AddAttr("pooltype", + "(string, default 'SUM') some of the pooling " + "pooltype of SequencePoolOp.") + .SetDefault("SUM") + .InEnum({"AVERAGE", "SUM", "SQRT"}); + AddAttr("use_cvm", "bool, use cvm or not").SetDefault(true); + AddAttr("axis", + "The axis along which the input tensors will be concatenated. " + "Only supports concat axis=1 yet.") + .SetDefault(1); + AddComment(R"DOC( +Fusion Sequence Pool of pooltype(sum, average and sqrt) and Concat Operator. +)DOC"); +} + +template +class FusionSeqPoolCVMConcatKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto ins = ctx.MultiInput("X"); + auto* out = ctx.Output("Out"); + std::string pooltype = ctx.Attr("pooltype"); + auto x0_lod = ins[0]->lod(); + auto x0_dims = ins[0]->dims(); + auto y_dims = out->dims(); + size_t bs = x0_lod[0].size() - 1; + out->Resize({static_cast(bs), y_dims[1]}); + framework::LoD y_lod(1); + y_lod[0].resize(bs + 1); + for (size_t i = 0; i <= bs; ++i) { + y_lod[0][i] = i; + } + out->set_lod(y_lod); + auto place = ctx.GetPlace(); + T* y_data = out->mutable_data(place); + + int w = ins[0]->numel() / x0_dims[0]; + PADDLE_ENFORCE_EQ(y_dims[1] % w, 0, + "The output of dims[1] should be dividable of w"); + jit::seq_pool_attr_t attr(w, jit::SeqPoolType::kSum); + if (pooltype == "AVERAGE") { + attr.type = jit::SeqPoolType::kAvg; + } else if (pooltype == "SQRT") { + attr.type = jit::SeqPoolType::kSqrt; + } + auto seqpool = + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + attr); + size_t n = ins.size(); + size_t dst_step_size = n * w; + for (size_t i = 0; i < n; ++i) { + auto x_dims = ins[i]->dims(); + auto x_lod = ins[i]->lod()[0]; + const T* src = ins[i]->data(); + T* dst = y_data + i * w; + PADDLE_ENFORCE_EQ(static_cast(ins[i]->numel() / x_dims[0]), w, + "Width of all inputs should be equal."); + PADDLE_ENFORCE_EQ(x_lod.size(), bs + 1, + "Batchsize of all inputs should be equal."); + for (size_t j = 0; j < bs; ++j) { + attr.h = static_cast(x_lod[j + 1] - x_lod[j]); + seqpool(src, dst, &attr); + + // Currently only use_cvm is true. 
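+        // CVM transform on the two leading columns of each pooled
+        // slice. Per the "CVM" input description above, column 0 is the
+        // accumulated show count and column 1 the accumulated click
+        // count, so the two statements below compute
+        //   dst[0] = log(show + 1)
+        //   dst[1] = log(click + 1) - log(show + 1),
+        // leaving the second column as the log of the smoothed
+        // click-through rate.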
+ dst[0] = log(dst[0] + 1); + dst[1] = log(dst[1] + 1) - dst[0]; + + dst += dst_step_size; + src += attr.h * attr.w; + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fusion_seqpool_cvm_concat, ops::FusionSeqPoolCVMConcatOp, + ops::FusionSeqPoolCVMConcatOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL(fusion_seqpool_cvm_concat, + ops::FusionSeqPoolCVMConcatKernel, + ops::FusionSeqPoolCVMConcatKernel); diff --git a/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.h b/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.h new file mode 100644 index 00000000..75e8556c --- /dev/null +++ b/paddle/fluid/operators/fused/fusion_seqpool_cvm_concat_op.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +class FusionSeqPoolCVMConcatOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class FusionSeqPoolCVMConcatOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc index 53679ebd..2d100560 100644 --- a/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc +++ b/paddle/fluid/operators/fused/fusion_squared_mat_sub_op.cc @@ -136,8 +136,7 @@ class FusionSquaredMatSubKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OPERATOR(fusion_squared_mat_sub, ops::FusionSquaredMatSubOp, - ops::FusionSquaredMatSubOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::FusionSquaredMatSubOpMaker); REGISTER_OP_CPU_KERNEL(fusion_squared_mat_sub, ops::FusionSquaredMatSubKernel, diff --git a/paddle/fluid/operators/gather.cu.h b/paddle/fluid/operators/gather.cu.h index fff817fb..b3264ec0 100644 --- a/paddle/fluid/operators/gather.cu.h +++ b/paddle/fluid/operators/gather.cu.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,7 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include +#include "paddle/fluid/framework/dim.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/place.h" namespace paddle { @@ -39,6 +44,27 @@ __global__ void GatherCUDAKernel(const T* params, const IndexT* indices, } } +template +__global__ void GatherNdCUDAKernel(const T* input, const int* input_dims, + const IndexT* indices, T* output, + size_t remain_size, size_t slice_size, + size_t end_size) { + CUDA_1D_KERNEL_LOOP(i, remain_size * slice_size) { + int indices_i = i / slice_size; + int slice_i = i - indices_i * slice_size; // offset inside the slice + IndexT gather_i = 0; + int64_t temp = slice_size; + for (int64_t j = end_size - 1; j >= 0; --j) { + auto index_value = indices[indices_i * end_size + j]; + assert(index_value >= 0 && index_value < input_dims[j]); + gather_i += (index_value * temp); + temp *= input_dims[j]; + } + IndexT input_i = gather_i + slice_i; + *(output + i) = *(input + input_i); + } +} + /** * A thin wrapper on gpu tensor * Return a new tensor from source tensor, gathered according to index @@ -49,10 +75,16 @@ __global__ void GatherCUDAKernel(const T* params, const IndexT* indices, template void GPUGather(const platform::DeviceContext& ctx, const Tensor& src, const Tensor& index, Tensor* output) { - // PADDLE_ENFORCE(platform::is_gpu_place(place)); // check index of shape 1-D - PADDLE_ENFORCE(index.dims().size() == 1 || - (index.dims().size() == 2 && index.dims()[1] == 1)); + if (index.dims().size() == 1) { + PADDLE_ENFORCE_GT(index.dims()[0], 0, + "The index of gather_op should not be empty when the " + "index's rank is 1."); + } else if (index.dims().size() == 2) { + PADDLE_ENFORCE_EQ(index.dims()[1], 1, + " If the index's rank of gather_op is 2, the second " + "dimension should be 1."); + } int index_size = index.dims()[0]; @@ -78,5 +110,55 @@ void GPUGather(const platform::DeviceContext& ctx, const Tensor& src, p_src, p_index, p_output, index_size, slice_size); } +template +void GPUGatherNd(const framework::ExecutionContext& context, + const Tensor& input, const Tensor& index, Tensor* output) { + const auto& ctx = context.template device_context(); + const auto gplace = boost::get(ctx.GetPlace()); + auto cplace = platform::CPUPlace(); + + auto index_dims = index.dims(); + auto index_dims_size = index_dims.size(); + auto input_dims = input.dims(); + auto input_dims_size = input_dims.size(); + + const T* p_input = input.data(); + const IndexT* p_index = index.data(); + T* p_output = output->data(); + + // final dim + int64_t end_size = index_dims[index_dims_size - 1]; + // remain dim + auto remain_ddim = framework::slice_ddim(index_dims, 0, index_dims_size - 1); + int64_t remain_numel = framework::product(remain_ddim); + // slice size + int64_t slice_size = 1; + for (int64_t i = end_size; i < input_dims_size; ++i) { + slice_size *= input_dims[i]; + } + // source dim + std::vector v_input_dims(input_dims_size); + for (int i = 0; i < input_dims_size; ++i) { + v_input_dims[i] = static_cast(input_dims[i]); + } + + auto& dev_ctx = context.cuda_device_context(); + int bytes = input_dims_size * sizeof(int); + auto p_input_dims = memory::Alloc(dev_ctx, bytes); + int* g_input_dims = reinterpret_cast(p_input_dims->ptr()); + memory::Copy(gplace, g_input_dims, cplace, v_input_dims.data(), bytes, + ctx.stream()); + + int block = 512; + int n = slice_size * remain_numel; + int grid = (n + block 
- 1) / block; + + GatherNdCUDAKernel<<< + grid, block, 0, + reinterpret_cast(ctx).stream()>>>( + p_input, g_input_dims, p_index, p_output, remain_numel, slice_size, + end_size); +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/gather.h b/paddle/fluid/operators/gather.h index 1e02c036..26fb93c2 100644 --- a/paddle/fluid/operators/gather.h +++ b/paddle/fluid/operators/gather.h @@ -36,10 +36,16 @@ using framework::Tensor; template void CPUGather(const platform::DeviceContext& ctx, const Tensor& src, const Tensor& index, Tensor* output) { - PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace())); + PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true); // check index of shape 1-D - PADDLE_ENFORCE(index.dims().size() == 1 || - (index.dims().size() == 2 && index.dims()[1] == 1)); + if (index.dims().size() == 2) { + PADDLE_ENFORCE_EQ(index.dims()[1], 1, + "index.dims()[1] should be 1 when index.dims().size() == " + "2 in gather_op."); + } else { + PADDLE_ENFORCE_EQ(index.dims().size(), 1, + "index.dims().size() should be 1 or 2 in gather_op."); + } int64_t index_size = index.dims()[0]; auto src_dims = src.dims(); @@ -60,5 +66,51 @@ void CPUGather(const platform::DeviceContext& ctx, const Tensor& src, } } +template +void CPUGatherNd(const platform::DeviceContext& ctx, const Tensor& input, + const Tensor& index, Tensor* output) { + PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, + "It should be running on the CPU"); + + auto index_dims = index.dims(); + auto index_dims_size = index_dims.size(); + auto input_dims = input.dims(); + auto input_dims_size = input_dims.size(); + + const T* p_input = input.data(); + const IndexT* p_index = index.data(); + T* p_output = output->data(); + + // final dim + int64_t end_size = index_dims[index_dims_size - 1]; + // remain dim + auto remain_ddim = framework::slice_ddim(index_dims, 0, index_dims_size - 1); + int64_t remain_numel = framework::product(remain_ddim); + // slice size + int64_t slice_size = 1; + for (int64_t i = end_size; i < input_dims_size; ++i) { + slice_size *= input_dims[i]; + } + const size_t slice_bytes = slice_size * sizeof(T); + + for (int64_t i = 0; i < remain_numel; ++i) { + int64_t index_ = 0; + int64_t temp = 1; + for (int64_t j = end_size - 1; j >= 0; --j) { + IndexT index_value = p_index[i * end_size + j]; + PADDLE_ENFORCE_LT(index_value, input_dims[j], + "Input(index[-1)] has wrong value, it is %d", + index_value); + PADDLE_ENFORCE_GE(index_value, 0UL, + "The value of Input(index) must be no less than 0"); + + index_ += (index_value * temp); + temp *= input_dims[j]; + } + memcpy(p_output + i * slice_size, p_input + index_ * slice_size, + slice_bytes); + } +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/gather_nd_op.cc b/paddle/fluid/operators/gather_nd_op.cc new file mode 100644 index 00000000..aed0f824 --- /dev/null +++ b/paddle/fluid/operators/gather_nd_op.cc @@ -0,0 +1,183 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/gather_nd_op.h" +#include +#include +#include +#include "paddle/fluid/framework/ddim.h" + +namespace paddle { +namespace operators { + +class GatherNdOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, + "Input(X) of GatherNdOp should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true, + "Input(Index) of GatherNdOp should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, + "Output(Out) of GatherNdOp should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto x_dims_size = x_dims.size(); + auto index_dims = ctx->GetInputDim("Index"); + auto index_dims_size = index_dims.size(); + + PADDLE_ENFORCE_LE( + index_dims[index_dims_size - 1], x_dims_size, + "Input(Index).shape[-1] should be no greater than Input(X).rank"); + PADDLE_ENFORCE_GE(index_dims_size, 2UL, + "The rank of Input(Index) should be greater than 1"); + + std::vector result_dims; + // The result dims is + // Index.shape[:-1] + X.shape[Index.shape[-1]:] + for (int i = 0; i < index_dims_size - 1; ++i) { + result_dims.emplace_back(index_dims[i]); + } + for (int i = index_dims[index_dims_size - 1]; i < x_dims_size; ++i) { + result_dims.emplace_back(x_dims[i]); + } + + ctx->SetOutputDim("Out", framework::make_ddim(result_dims)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); + } +}; + +class GatherNdGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*-->*/ framework::GradVarName("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.device_context()); + } +}; + +class GatherNdOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The source input of gather_nd op"); + AddInput("Index", "The index input of gather_nd op"); + AddOutput("Out", "The output of gather_nd op"); + AddComment(R"DOC( + Gather_Nd Operator. + + This function is actually a high-dimensional extension of gather + and supports for simultaneous indexing by multiple axes. Out is + obtained by gathering slices from X into a tensor with shape + Index.shape[:-1] + X.shape[Index.shape[-1]:]. 
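+
+    To make the shape rule concrete: the last axis of Index holds
+    coordinates into the first Index.shape[-1] axes of X, and every
+    gathered slice keeps the remaining axes of X. With X.shape = (2, 3, 4)
+    and an Index of shape (1, 2), the output shape is
+    (1,) + (4,) = (1, 4), as in Case 2 below.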
+ + Example: + + Given: + X = [[[ 0, 1, 2, 3], + [ 4, 5, 6, 7], + [ 8, 9, 10, 11]], + [[12, 13, 14, 15], + [16, 17, 18, 19], + [20, 21, 22, 23]]] + + X.shape = (2, 3, 4) + + *Case 1: + + Index = [[1]] + + we get: + Out = + [[12, 13, 14, 15], + [16, 17, 18, 19], + [20, 21, 22, 23]] + + *Case 2: + + Index = [[0,2]] + + we get: + + Out = [8, 9, 10, 11] + + *Case 3: + + Index = [[1, 2, 3]] + + we get: + + Out = [23] + +)DOC"); + } +}; + +class GatherNdGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("gather_nd_grad"); + op->SetInput("Index", Input("Index")); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(GatherNdGradNoNeedBufferVarInference, + "X"); + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(gather_nd, ops::GatherNdOp, ops::GatherNdOpMaker, + ops::GatherNdGradOpDescMaker); + +REGISTER_OPERATOR(gather_nd_grad, ops::GatherNdGradOp, + ops::GatherNdGradNoNeedBufferVarInference); + +REGISTER_OP_CPU_KERNEL(gather_nd, ops::GatherNdOpKernel, + ops::GatherNdOpKernel, + ops::GatherNdOpKernel, + ops::GatherNdOpKernel, + ops::GatherNdOpKernel); + +REGISTER_OP_CPU_KERNEL(gather_nd_grad, ops::GatherNdGradOpKernel, + ops::GatherNdGradOpKernel, + ops::GatherNdGradOpKernel, + ops::GatherNdGradOpKernel, + ops::GatherNdGradOpKernel); diff --git a/paddle/fluid/operators/gather_nd_op.cu b/paddle/fluid/operators/gather_nd_op.cu new file mode 100644 index 00000000..1ad33503 --- /dev/null +++ b/paddle/fluid/operators/gather_nd_op.cu @@ -0,0 +1,105 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/operators/gather.cu.h" +#include "paddle/fluid/operators/gather_nd_op.h" +#include "paddle/fluid/operators/scatter.cu.h" + +namespace paddle { +namespace operators { + +template +class GatherNdOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, + "This kernel only runs on GPU device."); + auto *x = ctx.Input("X"); + auto *index = ctx.Input("Index"); + auto *output = ctx.Output("Out"); + + output->mutable_data(ctx.GetPlace()); + if (x->numel() == 0) return; + const auto &index_type = index->type(); + bool index_type_match = index_type == framework::proto::VarType::INT32 || + index_type == framework::proto::VarType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, true, + "Index holds the wrong type, it holds %s, but desires to be %s or %s", + paddle::framework::DataTypeToString(index_type), + paddle::framework::DataTypeToString(framework::proto::VarType::INT32), + paddle::framework::DataTypeToString(framework::proto::VarType::INT64)); + if (index_type == framework::proto::VarType::INT32) { + GPUGatherNd(ctx, *x, *index, output); + } else if (index_type == framework::proto::VarType::INT64) { + GPUGatherNd(ctx, *x, *index, output); + } + } +}; + +template +class GatherNdGradOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, + "This kernel only runs on GPU device."); + auto *index = ctx.Input("Index"); + auto *dX = ctx.Output(framework::GradVarName("X")); + auto *dO = ctx.Input(framework::GradVarName("Out")); + + dX->mutable_data(ctx.GetPlace()); + auto dxt = framework::EigenVector::Flatten(*dX); + auto &place = *ctx.template device_context() + .eigen_device(); + dxt.device(place) = dxt.constant(static_cast(0)); + if (dO->numel() == 0) return; + + const auto &index_type = index->type(); + bool index_type_match = index_type == framework::proto::VarType::INT32 || + index_type == framework::proto::VarType::INT64; + + PADDLE_ENFORCE_EQ( + index_type_match, true, + "Index holds the wrong type, it holds %s, but desires to be %s or %s", + paddle::framework::DataTypeToString(index_type), + paddle::framework::DataTypeToString(framework::proto::VarType::INT32), + paddle::framework::DataTypeToString(framework::proto::VarType::INT64)); + + if (index_type == framework::proto::VarType::INT32) { + GPUScatterNdAdd(ctx, *dO, *index, dX); + } else if (index_type == framework::proto::VarType::INT64) { + GPUScatterNdAdd(ctx, *dO, *index, dX); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +using CUDA = paddle::platform::CUDADeviceContext; +REGISTER_OP_CUDA_KERNEL(gather_nd, ops::GatherNdOpCUDAKernel, + ops::GatherNdOpCUDAKernel, + ops::GatherNdOpCUDAKernel, + ops::GatherNdOpCUDAKernel, + ops::GatherNdOpCUDAKernel); + +REGISTER_OP_CUDA_KERNEL(gather_nd_grad, + ops::GatherNdGradOpCUDAKernel, + ops::GatherNdGradOpCUDAKernel, + ops::GatherNdGradOpCUDAKernel, + ops::GatherNdGradOpCUDAKernel, + ops::GatherNdGradOpCUDAKernel); diff --git a/paddle/fluid/operators/gather_nd_op.h b/paddle/fluid/operators/gather_nd_op.h new file mode 100644 index 00000000..059ca54c --- /dev/null +++ b/paddle/fluid/operators/gather_nd_op.h @@ -0,0 +1,91 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/gather.h" +#include "paddle/fluid/operators/scatter.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class GatherNdOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, + "This kernel only runs on CPU."); + + auto *x = ctx.Input("X"); + auto *index = ctx.Input("Index"); + auto *output = ctx.Output("Out"); + + output->mutable_data(ctx.GetPlace()); + if (x->numel() == 0) return; + + const auto &index_type = index->type(); + bool index_type_match = index_type == framework::proto::VarType::INT32 || + index_type == framework::proto::VarType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, true, + "Index holds the wrong type, it holds %s, but desires to be %s or %s", + paddle::framework::DataTypeToString(index_type), + paddle::framework::DataTypeToString(framework::proto::VarType::INT32), + paddle::framework::DataTypeToString(framework::proto::VarType::INT64)); + if (index_type == framework::proto::VarType::INT32) { + CPUGatherNd(ctx.device_context(), *x, *index, output); + } else if (index_type == framework::proto::VarType::INT64) { + CPUGatherNd(ctx.device_context(), *x, *index, output); + } + } +}; + +template +class GatherNdGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, + "This kernel only runs on CPU."); + auto *index = ctx.Input("Index"); + auto *dX = ctx.Output(framework::GradVarName("X")); + auto *dO = ctx.Input(framework::GradVarName("Out")); + dX->mutable_data(ctx.GetPlace()); + auto dxt = framework::EigenVector::Flatten(*dX); + auto &place = *ctx.template device_context() + .eigen_device(); + dxt.device(place) = dxt.constant(static_cast(0)); + if (dO->numel() == 0) return; + + const auto &index_type = index->type(); + bool index_type_match = index_type == framework::proto::VarType::INT32 || + index_type == framework::proto::VarType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, true, + "Index holds the wrong type, it holds %s, but desires to be %s or %s", + paddle::framework::DataTypeToString(index_type), + paddle::framework::DataTypeToString(framework::proto::VarType::INT32), + paddle::framework::DataTypeToString(framework::proto::VarType::INT64)); + if (index_type == framework::proto::VarType::INT32) { + ScatterNdAdd(ctx, *dO, *index, dX); + } else if (index_type == framework::proto::VarType::INT64) { + ScatterNdAdd(ctx, *dO, *index, dX); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc index 7cde7ca4..c765d344 100644 --- 
a/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc +++ b/paddle/fluid/operators/grid_sampler_cudnn_op.cu.cc @@ -55,9 +55,9 @@ class CUDNNGridSampleOpKernel : public framework::OpKernel { ScopedTensorDescriptor input_desc; ScopedTensorDescriptor output_desc; cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - DataLayout::kNCHW, framework::vectorize2int(input->dims())); + DataLayout::kNCHW, framework::vectorize(input->dims())); cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - DataLayout::kNCHW, framework::vectorize2int(output->dims())); + DataLayout::kNCHW, framework::vectorize(output->dims())); CUDNN_ENFORCE(platform::dynload::cudnnSpatialTfSamplerForward( handle, cudnn_st_desc, CudnnDataType::kOne(), cudnn_input_desc, @@ -103,13 +103,13 @@ class CUDNNGridSampleGradOpKernel : public framework::OpKernel { ScopedTensorDescriptor input_grad_desc; ScopedTensorDescriptor output_grad_desc; cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - DataLayout::kNCHW, framework::vectorize2int(input->dims())); + DataLayout::kNCHW, framework::vectorize(input->dims())); cudnnTensorDescriptor_t cudnn_input_grad_desc = input_grad_desc.descriptor( - DataLayout::kNCHW, framework::vectorize2int(input_grad->dims())); + DataLayout::kNCHW, framework::vectorize(input_grad->dims())); cudnnTensorDescriptor_t cudnn_output_grad_desc = output_grad_desc.descriptor( - DataLayout::kNCHW, framework::vectorize2int(output_grad->dims())); + DataLayout::kNCHW, framework::vectorize(output_grad->dims())); CUDNN_ENFORCE(platform::dynload::cudnnSpatialTfSamplerBackward( handle, cudnn_st_dest, CudnnDataType::kOne(), cudnn_input_desc, diff --git a/paddle/fluid/operators/group_norm_op.cc b/paddle/fluid/operators/group_norm_op.cc index 2b1e8038..e184ff14 100644 --- a/paddle/fluid/operators/group_norm_op.cc +++ b/paddle/fluid/operators/group_norm_op.cc @@ -38,9 +38,11 @@ class GroupNormOp : public framework::OperatorWithKernel { "Output(Mean) of GroupNormOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Variance"), "Output(Variance) of GroupNormOp should not be null."); - auto x_dim = ctx->GetInputDim("X"); - auto channel_num = x_dim[1]; + const DataLayout data_layout = framework::StringToDataLayout( + ctx->Attrs().Get("data_layout")); + const int64_t channel_num = + (data_layout == DataLayout::kNCHW ? x_dim[1] : x_dim[x_dim.size() - 1]); auto batch_size = x_dim[0]; auto groups = ctx->Attrs().Get("groups"); PADDLE_ENFORCE_LE( @@ -91,7 +93,9 @@ class GroupNormOpMaker : public framework::OpProtoAndCheckerMaker { .AddCustomChecker([](const int &groups) { PADDLE_ENFORCE_GT(groups, 0, "'groups' should be greater than zero."); }); - + AddAttr("data_layout", + "An optional string from: \"NHWC\", \"NCHW\". 
") + .SetDefault("NCHW"); AddComment(R"DOC( Group Normalization @@ -170,21 +174,10 @@ class GroupNormGradMaker : public framework::SingleGradOpDescMaker { } }; -class GroupNormInplaceInToOut : public framework::InplaceOpInference { - public: - std::unordered_map operator()( - const framework::OpDesc &op_desc, bool use_cuda) const override { - return {{"X", "Y"}}; - } -}; - -class GroupNormGradInplaceInToOut : public framework::InplaceOpInference { - public: - std::unordered_map operator()( - const framework::OpDesc &op_desc, bool use_cuda) const override { - return {{framework::GradVarName("Y"), framework::GradVarName("X")}}; - } -}; +DECLARE_INPLACE_OP_INFERER(GroupNormInplaceInToOut, {"X", "Y"}); +DECLARE_INPLACE_OP_INFERER(GroupNormGradInplaceInToOut, + {framework::GradVarName("Y"), + framework::GradVarName("X")}); class GroupNormOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput { diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index 3bf85862..b7f79be4 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -19,6 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { +using DataLayout = framework::DataLayout; enum GroupNormKernelFlags { kHasScale = 1, kHasBias = 2 }; #define CHECK_CASE(i, flags, kernel_name, ...) \ @@ -45,18 +46,27 @@ __device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) { } template -__global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, +__global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W, int imsize, int groups, - int group_size, T* mean, T* var) { + int group_size, T* mean, T* var, + const DataLayout data_layout) { int gid = blockIdx.y; int cid = blockIdx.x; int bid = blockIdx.z; + int H = imsize / W; int number = min(group_size, static_cast(C - gid * group_size)); int ccid = gid * group_size + cid; if (ccid >= C) return; T x_mean = 0, x_var = 0; for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { - T val = x[(bid * C + ccid) * imsize + imid]; + T val; + if (data_layout == DataLayout::kNCHW) { + val = x[(bid * C + ccid) * imsize + imid]; + } else { + int hid = imid / W; + int wid = imid % W; + val = x[(bid * H + hid) * W * C + wid * C + ccid]; + } x_mean += val; x_var += val * val; } @@ -69,11 +79,13 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, template __global__ void GroupNormForward(const T* x, const T* mean, const T* var, const T* scale, const T* bias, int N, int C, - int imsize, int groups, int group_size, - T epsilon, T* y, T* real_var) { + int W, int imsize, int groups, int group_size, + T epsilon, T* y, T* real_var, + const DataLayout data_layout) { int gid = blockIdx.y; int cid = blockIdx.x; int bid = blockIdx.z; + int H = imsize / W; int ccid = gid * group_size + cid; if (ccid >= C) return; T x_mean = mean[bid * groups + gid]; @@ -82,11 +94,23 @@ __global__ void GroupNormForward(const T* x, const T* mean, const T* var, T var_inv = 1.0 / sqrt(x_var + epsilon); if (cid == 0 && threadIdx.x == 0) real_var[bid * groups + gid] = x_var; for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { - T val = x[(bid * C + ccid) * imsize + imid]; + T val; + int hid, wid; + if (data_layout == DataLayout::kNCHW) { + val = x[(bid * C + ccid) * imsize + imid]; + } else { + hid = imid / W; + wid = imid % W; + val = x[(bid * H + hid) * W * C + wid * C + ccid]; + } val = (val - x_mean) * var_inv; if (flags & kHasScale) val *= scale[gid * 
group_size + cid]; if (flags & kHasBias) val += bias[gid * group_size + cid]; - y[(bid * C + ccid) * imsize + imid] = val; + if (data_layout == DataLayout::kNCHW) { + y[(bid * C + ccid) * imsize + imid] = val; + } else { + y[(bid * H + hid) * W * C + wid * C + ccid] = val; + } } } @@ -95,6 +119,9 @@ class GroupNormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); const float epsilon = ctx.Attr("epsilon"); auto* scale = ctx.Input("Scale"); auto* bias = ctx.Input("Bias"); @@ -106,7 +133,13 @@ class GroupNormKernel const auto groups = ctx.Attr("groups"); const auto x_dims = x->dims(); - const int group_size = (x_dims[1] - 1) / groups + 1; + const int C = + (data_layout == DataLayout::kNCHW ? x_dims[1] + : x_dims[x_dims.size() - 1]); + const int group_size = (C - 1) / groups + 1; + const int W = + (data_layout == DataLayout::kNCHW ? x_dims[x_dims.size() - 1] + : x_dims[x_dims.size() - 2]); y->mutable_data(ctx.GetPlace()); mean->mutable_data(ctx.GetPlace()); @@ -130,31 +163,32 @@ class GroupNormKernel const T* bias_data = nullptr; if (bias) bias_data = bias->data(); - int imsize = x_dims[2] * x_dims[3]; + int imsize = (data_layout == DataLayout::kNCHW ? x_dims[2] * x_dims[3] + : x_dims[1] * x_dims[2]); + int block_size = std::min(1024, imsize); dim3 grid(group_size, groups, x_dims[0]); dim3 threads(block_size, 1, 1); GroupNormForwardGetMeanAndVar<<>>( - x_data, x_dims[0], x_dims[1], imsize, groups, group_size, mean_data, - temp_var_data); + x_data, x_dims[0], C, W, imsize, groups, group_size, mean_data, + temp_var_data, data_layout); int flags = (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias; UNROLL_ALL_CASES(flags, GroupNormForward, x_data, mean_data, temp_var_data, - scale_data, bias_data, x_dims[0], x_dims[1], imsize, - groups, group_size, epsilon, y_data, var_data); + scale_data, bias_data, x_dims[0], C, W, imsize, groups, + group_size, epsilon, y_data, var_data, data_layout); } }; template -__global__ void GroupNormBackwardGetMeanAndVar(const T* x, const T* scale, - const T* bias, const T* d_y, - int N, int C, int imsize, - int groups, int group_size, - T epsilon, T* d_mean, T* d_var, - T* d_scale, T* d_bias) { +__global__ void GroupNormBackwardGetMeanAndVar( + const T* x, const T* scale, const T* bias, const T* d_y, int N, int C, + int W, int imsize, int groups, int group_size, T epsilon, T* d_mean, + T* d_var, T* d_scale, T* d_bias, const DataLayout data_layout) { int gid = blockIdx.y; int cid = blockIdx.x; int bid = blockIdx.z; + int H = imsize / W; int number = min(group_size, static_cast(C - gid * group_size)); int ccid = gid * group_size + cid; if (ccid >= C) return; @@ -165,8 +199,16 @@ __global__ void GroupNormBackwardGetMeanAndVar(const T* x, const T* scale, T d_mean_data = 0, d_var_data = 0, d_scale_data = 0, d_bias_data = 0; for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { - T val = x[(bid * C + ccid) * imsize + imid] - x_bias; - T dval = d_y[(bid * C + ccid) * imsize + imid]; + T val, dval; + if (data_layout == DataLayout::kNCHW) { + val = x[(bid * C + ccid) * imsize + imid] - x_bias; + dval = d_y[(bid * C + ccid) * imsize + imid]; + } else { + int hid = imid / W; + int wid = imid % W; + val = x[(bid * H + hid) * W * C + wid * C + ccid] - x_bias; + dval = d_y[(bid * H + hid) * W * C + wid * C + ccid]; + } d_var_data += val * 
dval; d_mean_data += dval * x_scale; @@ -184,12 +226,14 @@ __global__ void GroupNormBackwardGetMeanAndVar(const T* x, const T* scale, template __global__ void GroupNormBackward(const T* x, const T* d_y, const T* scale, const T* bias, const T* var, const T* d_mean, - const T* d_var, int N, int C, int imsize, - int groups, int group_size, T epsilon, - T* d_x) { + const T* d_var, int N, int C, int W, + int imsize, int groups, int group_size, + T epsilon, T* d_x, + const DataLayout data_layout) { int gid = blockIdx.y; int cid = blockIdx.x; int bid = blockIdx.z; + int H = imsize / W; int number = min(group_size, static_cast(C - gid * group_size)); int ccid = gid * group_size + cid; if (ccid >= C) return; @@ -206,12 +250,23 @@ __global__ void GroupNormBackward(const T* x, const T* d_y, const T* scale, if (x_scale != 0) x_scale_inv = 1.0 / x_scale; for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { - T tmp = x[(bid * C + ccid) * imsize + imid]; - T v_y = (tmp - x_bias) * x_scale_inv; - T dly = d_y[(bid * C + ccid) * imsize + imid]; - d_x[(bid * C + ccid) * imsize + imid] = - x_var_inv * - (dly * x_scale - number_inv * d_x_var * v_y - number_inv * d_x_mean); + if (data_layout == DataLayout::kNCHW) { + T tmp = x[(bid * C + ccid) * imsize + imid]; + T v_y = (tmp - x_bias) * x_scale_inv; + T dly = d_y[(bid * C + ccid) * imsize + imid]; + d_x[(bid * C + ccid) * imsize + imid] = + x_var_inv * + (dly * x_scale - number_inv * d_x_var * v_y - number_inv * d_x_mean); + } else { + int hid = imid / W; + int wid = imid % W; + T tmp = x[(bid * H + hid) * W * C + wid * C + ccid]; + T v_y = (tmp - x_bias) * x_scale_inv; + T dly = d_y[(bid * H + hid) * W * C + wid * C + ccid]; + d_x[(bid * H + hid) * W * C + wid * C + ccid] = + x_var_inv * + (dly * x_scale - number_inv * d_x_var * v_y - number_inv * d_x_mean); + } } } @@ -220,6 +275,9 @@ class GroupNormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); const float epsilon = ctx.Attr("epsilon"); auto* x = ctx.Input("Y"); auto* var = ctx.Input("Variance"); @@ -234,7 +292,13 @@ class GroupNormGradKernel auto* d_bias = ctx.Output(framework::GradVarName("Bias")); const auto& x_dims = x->dims(); - const int group_size = (x_dims[1] - 1) / groups + 1; + const int C = + (data_layout == DataLayout::kNCHW ? x_dims[1] + : x_dims[x_dims.size() - 1]); + const int group_size = (C - 1) / groups + 1; + const int W = + (data_layout == DataLayout::kNCHW ? x_dims[x_dims.size() - 1] + : x_dims[x_dims.size() - 2]); d_x->mutable_data(ctx.GetPlace()); math::SetConstant set_zero; @@ -273,21 +337,23 @@ class GroupNormGradKernel const T* bias_data = nullptr; if (bias) bias_data = bias->data(); - int imsize = x_dims[2] * x_dims[3]; + int imsize = (data_layout == DataLayout::kNCHW ? 
x_dims[2] * x_dims[3] + : x_dims[1] * x_dims[2]); + int block_size = std::min(1024, imsize); dim3 grid(group_size, groups, x_dims[0]); dim3 threads(block_size, 1, 1); int flags = (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias; UNROLL_ALL_CASES(flags, GroupNormBackwardGetMeanAndVar, x_data, scale_data, - bias_data, y_data, x_dims[0], x_dims[1], imsize, groups, + bias_data, y_data, x_dims[0], C, W, imsize, groups, group_size, epsilon, temp_mean_data, temp_var_data, - d_scale_data, d_bias_data); + d_scale_data, d_bias_data, data_layout); if (d_x_data != nullptr) { UNROLL_ALL_CASES(flags, GroupNormBackward, x_data, y_data, scale_data, bias_data, var_data, temp_mean_data, temp_var_data, - x_dims[0], x_dims[1], imsize, groups, group_size, - epsilon, d_x_data); + x_dims[0], C, W, imsize, groups, group_size, epsilon, + d_x_data, data_layout); } } }; diff --git a/paddle/fluid/operators/group_norm_op.h b/paddle/fluid/operators/group_norm_op.h index 498e65f6..d4a1b3f0 100644 --- a/paddle/fluid/operators/group_norm_op.h +++ b/paddle/fluid/operators/group_norm_op.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once #include +#include +#include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" @@ -31,6 +33,9 @@ template class GroupNormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); const float epsilon = ctx.Attr("epsilon"); auto* scale = ctx.Input("Scale"); auto* bias = ctx.Input("Bias"); @@ -42,7 +47,10 @@ class GroupNormKernel : public framework::OpKernel { const auto groups = ctx.Attr("groups"); const auto x_dims = x->dims(); - const int group_size = (x_dims[1] - 1) / groups + 1; + const int C = + (data_layout == DataLayout::kNCHW ? x_dims[1] + : x_dims[x_dims.size() - 1]); + const int group_size = (C - 1) / groups + 1; y->mutable_data(ctx.GetPlace()); mean->mutable_data(ctx.GetPlace()); @@ -58,36 +66,75 @@ class GroupNormKernel : public framework::OpKernel { const T* bias_data = nullptr; if (bias) bias_data = bias->data(); - int imsize = x_dims[2] * x_dims[3]; + int imsize = (data_layout == DataLayout::kNCHW ? 
x_dims[2] * x_dims[3] + : x_dims[1] * x_dims[2]); + auto* iter_x_data = x_data; auto* iter_y_data = y_data; - for (int bid = 0; bid < x_dims[0]; bid++) + for (int bid = 0; bid < x_dims[0]; bid++) { for (int gid = 0; gid < groups; gid++) { T x_mean = 0, x_var = 0; - int number = std::min(group_size, - static_cast(x_dims[1] - gid * group_size)); - auto* tmp = iter_x_data; - for (int cid = 0; cid < number; cid++) { - for (int imid = 0; imid < imsize; imid++, iter_x_data++) { - x_mean += iter_x_data[0]; - x_var += iter_x_data[0] * iter_x_data[0]; + int number = + std::min(group_size, static_cast(C - gid * group_size)); + auto* tmp_x = iter_x_data; + auto* x_src_data = iter_x_data; + auto* tmp_y = iter_y_data; + auto* y_src_data = iter_y_data; + + if (data_layout == DataLayout::kNCHW) { + for (int cid = 0; cid < number; cid++) { + for (int imid = 0; imid < imsize; imid++, iter_x_data++) { + x_mean += iter_x_data[0]; + x_var += iter_x_data[0] * iter_x_data[0]; + } + } + } else { + for (int cid = 0; cid < number; cid++) { + iter_x_data = tmp_x + cid; + for (int imid = 0; imid < imsize; imid++, iter_x_data += C) { + x_mean += iter_x_data[0]; + x_var += iter_x_data[0] * iter_x_data[0]; + } } + iter_x_data = tmp_x + group_size; } + x_mean /= number * imsize; x_var /= number * imsize; x_var = x_var - x_mean * x_mean; T var_inv = 1.0 / sqrt(x_var + epsilon); mean_data[bid * groups + gid] = x_mean; var_data[bid * groups + gid] = x_var; - for (int cid = 0; cid < number; cid++) { - for (int imid = 0; imid < imsize; imid++, tmp++, iter_y_data++) { - T val = (tmp[0] - x_mean) * var_inv; - if (scale_data) val *= scale_data[gid * group_size + cid]; - if (bias_data) val += bias_data[gid * group_size + cid]; - iter_y_data[0] = val; + + if (data_layout == DataLayout::kNCHW) { + for (int cid = 0; cid < number; cid++) { + for (int imid = 0; imid < imsize; imid++, tmp_x++, iter_y_data++) { + T val = (tmp_x[0] - x_mean) * var_inv; + if (scale_data) val *= scale_data[gid * group_size + cid]; + if (bias_data) val += bias_data[gid * group_size + cid]; + iter_y_data[0] = val; + } } + } else { + for (int cid = 0; cid < number; cid++) { + tmp_x = x_src_data + cid; + iter_y_data = y_src_data + cid; + for (int imid = 0; imid < imsize; + imid++, tmp_x += C, iter_y_data += C) { + T val = (tmp_x[0] - x_mean) * var_inv; + if (scale_data) val *= scale_data[gid * group_size + cid]; + if (bias_data) val += bias_data[gid * group_size + cid]; + iter_y_data[0] = val; + } + } + iter_y_data = tmp_y + group_size; } } + if (data_layout == DataLayout::kNHWC) { + iter_x_data = x_data + (bid + 1) * C * imsize; + iter_y_data = y_data + (bid + 1) * C * imsize; + } + } } }; @@ -95,6 +142,9 @@ template class GroupNormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); const float epsilon = ctx.Attr("epsilon"); auto* x = ctx.Input("Y"); auto* var = ctx.Input("Variance"); @@ -109,7 +159,10 @@ class GroupNormGradKernel : public framework::OpKernel { auto* d_bias = ctx.Output(framework::GradVarName("Bias")); const auto& x_dims = x->dims(); - const int group_size = (x_dims[1] - 1) / groups + 1; + const int C = + (data_layout == DataLayout::kNCHW ? 
x_dims[1] + : x_dims[x_dims.size() - 1]); + const int group_size = (C - 1) / groups + 1; d_x->mutable_data(ctx.GetPlace()); math::SetConstant set_zero; @@ -137,54 +190,112 @@ class GroupNormGradKernel : public framework::OpKernel { const T* bias_data = nullptr; if (bias) bias_data = bias->data(); - int imsize = x_dims[2] * x_dims[3]; + int imsize = (data_layout == DataLayout::kNCHW ? x_dims[2] * x_dims[3] + : x_dims[1] * x_dims[2]); auto* iter_x_data = x_data; auto* iter_d_x_data = d_x_data; auto* iter_y_data = y_data; - for (int bid = 0; bid < x_dims[0]; bid++) + for (int bid = 0; bid < x_dims[0]; bid++) { for (int gid = 0; gid < groups; gid++) { T x_var = var_data[bid * groups + gid]; T var_inv = 1.0 / sqrt(x_var + epsilon); - int number = std::min(group_size, - static_cast(x_dims[1] - gid * group_size)); + int number = + std::min(group_size, static_cast(C - gid * group_size)); T number_inv = 1.0 / (number * imsize); - auto* iter_x_data2 = iter_x_data; - auto* iter_y_data2 = iter_y_data; + auto* tmp_x = iter_x_data; + auto* tmp_y = iter_y_data; + auto* tmp_d_x = iter_d_x_data; + auto* x_src_data = iter_x_data; + auto* y_src_data = iter_y_data; + auto* iter_x_data_backup = iter_x_data; + auto* iter_y_data_backup = iter_y_data; + auto* iter_d_x_data_backup = iter_d_x_data; T dp_scale = 0, dp_bias = 0; - for (int cid = 0; cid < number; cid++) { - for (int imid = 0; imid < imsize; - imid++, iter_x_data++, iter_y_data++) { - T val = iter_x_data[0]; - if (bias_data) val -= bias_data[gid * group_size + cid]; - T dval = iter_y_data[0]; - dp_scale += val * dval; - dp_bias += dval * scale_data[gid * group_size + cid]; - - if (scale_data && scale_data[gid * group_size + cid] != 0) - val /= scale_data[gid * group_size + cid]; - if (d_bias_data) d_bias_data[gid * group_size + cid] += dval; - if (d_scale_data) - d_scale_data[gid * group_size + cid] += val * dval; + + if (data_layout == DataLayout::kNCHW) { + for (int cid = 0; cid < number; cid++) { + for (int imid = 0; imid < imsize; + imid++, iter_x_data++, iter_y_data++) { + T val = iter_x_data[0]; + if (bias_data) val -= bias_data[gid * group_size + cid]; + T dval = iter_y_data[0]; + dp_scale += val * dval; + dp_bias += dval * scale_data[gid * group_size + cid]; + + if (scale_data && scale_data[gid * group_size + cid] != 0) + val /= scale_data[gid * group_size + cid]; + if (d_bias_data) d_bias_data[gid * group_size + cid] += dval; + if (d_scale_data) + d_scale_data[gid * group_size + cid] += val * dval; + } } - } - for (int cid = 0; cid < number; cid++) { - for (int imid = 0; imid < imsize; - imid++, iter_d_x_data++, iter_x_data2++, iter_y_data2++) { - T v_y = iter_x_data2[0]; - T dly = iter_y_data2[0]; - T dss = dp_scale; - T dbs = dp_bias; - T v_scale = scale_data[gid * group_size + cid]; - T v_bias = bias_data[gid * group_size + cid]; - v_y -= v_bias; - if (v_scale != 0) v_y /= v_scale; - iter_d_x_data[0] = - (dly * v_scale - number_inv * dss * v_y - number_inv * dbs) * - var_inv; + for (int cid = 0; cid < number; cid++) { + for (int imid = 0; imid < imsize; + imid++, iter_d_x_data++, tmp_x++, tmp_y++) { + T v_y = tmp_x[0]; + T dly = tmp_y[0]; + T dss = dp_scale; + T dbs = dp_bias; + T v_scale = scale_data[gid * group_size + cid]; + T v_bias = bias_data[gid * group_size + cid]; + v_y -= v_bias; + if (v_scale != 0) v_y /= v_scale; + iter_d_x_data[0] = + (dly * v_scale - number_inv * dss * v_y - number_inv * dbs) * + var_inv; + } + } + } else { + for (int cid = 0; cid < number; cid++) { + iter_x_data = x_src_data + cid; + iter_y_data = y_src_data 
+ cid; + for (int imid = 0; imid < imsize; + imid++, iter_x_data += C, iter_y_data += C) { + T val = iter_x_data[0]; + if (bias_data) val -= bias_data[gid * group_size + cid]; + T dval = iter_y_data[0]; + dp_scale += val * dval; + dp_bias += dval * scale_data[gid * group_size + cid]; + + if (scale_data && scale_data[gid * group_size + cid] != 0) + val /= scale_data[gid * group_size + cid]; + if (d_bias_data) d_bias_data[gid * group_size + cid] += dval; + if (d_scale_data) + d_scale_data[gid * group_size + cid] += val * dval; + } } + + for (int cid = 0; cid < number; cid++) { + tmp_x = x_src_data + cid; + tmp_y = y_src_data + cid; + iter_d_x_data = tmp_d_x + cid; + for (int imid = 0; imid < imsize; + imid++, iter_d_x_data += C, tmp_x += C, tmp_y += C) { + T v_y = tmp_x[0]; + T dly = tmp_y[0]; + T dss = dp_scale; + T dbs = dp_bias; + T v_scale = scale_data[gid * group_size + cid]; + T v_bias = bias_data[gid * group_size + cid]; + v_y -= v_bias; + if (v_scale != 0) v_y /= v_scale; + iter_d_x_data[0] = + (dly * v_scale - number_inv * dss * v_y - number_inv * dbs) * + var_inv; + } + } + iter_x_data = iter_x_data_backup + group_size; + iter_y_data = iter_y_data_backup + group_size; + iter_d_x_data = iter_d_x_data_backup + group_size; } } + if (data_layout == DataLayout::kNHWC) { + iter_x_data = x_data + (bid + 1) * C * imsize; + iter_d_x_data = d_x_data + (bid + 1) * C * imsize; + iter_y_data = y_data + (bid + 1) * C * imsize; + } + } } }; diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index a0af5145..d20a7e96 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -97,10 +97,10 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_DISTRIBUTE // w_Out is set to used by prefetch, never change it in other cases - auto* w_out = ctx.Output("W_Out"); - operators::distributed::prefetch_with_reconstruct( - "Ids@Prefetch", "W@Prefetch", table_names, epmap, height_sections, - ctx, local_scope, w_out); + auto weight = ctx.Outputs("W_Out").front(); + operators::distributed::prefetch("Ids@Prefetch", "W@Prefetch", weight, + true, table_names, epmap, + height_sections, ctx, local_scope); #else PADDLE_THROW( "paddle is not compiled with distribute support, can not do " diff --git a/paddle/fluid/operators/huber_loss_op.cc b/paddle/fluid/operators/huber_loss_op.cc index 157f13ff..51e2ec5f 100644 --- a/paddle/fluid/operators/huber_loss_op.cc +++ b/paddle/fluid/operators/huber_loss_op.cc @@ -25,27 +25,35 @@ class HuberLossOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must be initialized."); - PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) must be initialized."); + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, + "Input(X) must be initialized."); + PADDLE_ENFORCE_EQ(ctx->HasInput("Y"), true, + "Input(Y) must be initialized."); auto x_dims = ctx->GetInputDim("X"); auto y_dims = ctx->GetInputDim("Y"); - - PADDLE_ENFORCE_EQ(x_dims.size(), 2, - "The rank of Input(X) must be 2 and the shape is " - "[batch_size, 1]."); - if (ctx->IsRuntime() || - (framework::product(x_dims) > 0 && framework::product(y_dims) > 0)) { - PADDLE_ENFORCE_EQ(x_dims, y_dims, "Shape of X and Y should be same"); + int rank = x_dims.size(); + + if (rank == y_dims.size()) { + PADDLE_ENFORCE_EQ(y_dims[rank - 1], 1U, + "The last 
dimension of Input(Y) should be equal to 1."); + } else { + PADDLE_ENFORCE_EQ(rank, y_dims.size() + 1, + "The rank of Input(X) should be equal to " + "the rank of Input(Y) plus 1."); } - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(x_dims[1], 1, - "Each row of Input(X) contains a real value, " - "so the 2nd dimension of Input(X) must be 1."); + bool contain_unknown_dim = framework::contain_unknown_dim(x_dims) || + framework::contain_unknown_dim(y_dims); + if (ctx->IsRuntime() || !contain_unknown_dim) { + PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), + framework::slice_ddim(y_dims, 0, rank - 1), + "The Input(X) and Input(Label) should have the same " + "shape except the last dimension."); } - ctx->SetOutputDim("Residual", x_dims); - ctx->SetOutputDim("Out", {x_dims[0], 1}); + auto out_dims = y_dims; + ctx->SetOutputDim("Residual", out_dims); + ctx->SetOutputDim("Out", out_dims); ctx->ShareLoD("X", "Out"); } }; @@ -98,8 +106,8 @@ class HuberLossGradOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, + "Input(Out@GRAD) should not be null."); auto residual_dims = ctx->GetInputDim("Residual"); diff --git a/paddle/fluid/operators/huber_loss_op.h b/paddle/fluid/operators/huber_loss_op.h index fa21bd01..7000b5d3 100644 --- a/paddle/fluid/operators/huber_loss_op.h +++ b/paddle/fluid/operators/huber_loss_op.h @@ -41,7 +41,7 @@ struct HuberLossForward { T delta; }; -template +template class HuberLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -49,7 +49,7 @@ class HuberLossKernel : public framework::OpKernel { auto* in1 = context.Input("Y"); auto* out0 = context.Output("Residual"); auto* out1 = context.Output("Out"); - auto delta = static_cast(context.Attr("delta")); + auto delta = static_cast(context.Attr("delta")); auto& place = *context.template device_context().eigen_device(); @@ -86,7 +86,7 @@ struct HuberLossBackward { T delta; }; -template +template class HuberLossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -94,7 +94,7 @@ class HuberLossGradKernel : public framework::OpKernel { auto* in1 = context.Input(framework::GradVarName("Out")); auto* out0 = context.Output(framework::GradVarName("X")); auto* out1 = context.Output(framework::GradVarName("Y")); - auto delta = static_cast(context.op().Attr("delta")); + auto delta = static_cast(context.op().Attr("delta")); auto& place = *context.template device_context().eigen_device(); diff --git a/paddle/fluid/operators/instance_norm_op.cc b/paddle/fluid/operators/instance_norm_op.cc new file mode 100644 index 00000000..6375c92d --- /dev/null +++ b/paddle/fluid/operators/instance_norm_op.cc @@ -0,0 +1,646 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/instance_norm_op.h"
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include "paddle/fluid/framework/data_layout.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+void InstanceNormOp::InferShape(framework::InferShapeContext *ctx) const {
+  PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
+                    "Input(X) of Instance Norm Op should not be null.");
+  PADDLE_ENFORCE_EQ(ctx->HasInput("Scale"), true,
+                    "Input(Scale) of Instance Norm Op should not be null.");
+  PADDLE_ENFORCE_EQ(ctx->HasInput("Bias"), true,
+                    "Input(Bias) of Instance Norm Op should not be null.");
+  PADDLE_ENFORCE_EQ(ctx->HasOutput("Y"), true,
+                    "Output(Y) of Instance Norm Op should not be null.");
+
+  PADDLE_ENFORCE_EQ(
+      ctx->HasOutput("SavedMean"), true,
+      "Output(SavedMean) of Instance Norm Op should not be null.");
+  PADDLE_ENFORCE_EQ(
+      ctx->HasOutput("SavedVariance"), true,
+      "Output(SavedVariance) of Instance Norm Op should not be null.");
+
+  const auto x_dims = ctx->GetInputDim("X");
+  PADDLE_ENFORCE_GE(x_dims.size(), 2,
+                    "the dimension of input X must be greater than or equal "
+                    "to 2");
+  PADDLE_ENFORCE_LE(x_dims.size(), 5,
+                    "the dimension of input X must be smaller than or equal "
+                    "to 5");
+  auto N = x_dims[0];
+  auto C = x_dims[1];
+  auto NxC = N * C;
+
+  auto scale_dim = ctx->GetInputDim("Scale");
+  auto bias_dim = ctx->GetInputDim("Bias");
+
+  PADDLE_ENFORCE_EQ(scale_dim.size(), 1UL);
+  PADDLE_ENFORCE_EQ(bias_dim.size(), 1UL);
+
+  bool check = !((!ctx->IsRuntime()) && (framework::product(scale_dim) <= 0 ||
+                                         framework::product(bias_dim) <= 0));
+
+  if (check) {
+    PADDLE_ENFORCE_EQ(scale_dim[0], C);
+    PADDLE_ENFORCE_EQ(bias_dim[0], C);
+  }
+
+  ctx->SetOutputDim("Y", x_dims);
+  ctx->SetOutputDim("SavedMean", {NxC});
+  ctx->SetOutputDim("SavedVariance", {NxC});
+  ctx->ShareLoD("X", "Y");
+}
+
+framework::OpKernelType InstanceNormOp::GetExpectedKernelType(
+    const framework::ExecutionContext &ctx) const {
+  auto input_data_type = ctx.Input<Tensor>("X")->type();
+  // By default, the type of the scale, bias, mean, and var tensors should
+  // be float (for float or float16 input tensors) or double (for double
+  // input tensors).
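+  // For example (restating the checks below, not an extra constraint):
+  // an FP16 or FP32 input X pairs with FP32 Scale/Bias, while an FP64
+  // input X pairs with FP64 Scale/Bias.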
+  auto in_param_type = framework::proto::VarType::FP32;
+  if (input_data_type == framework::proto::VarType::FP64) {
+    in_param_type = framework::proto::VarType::FP64;
+  }
+  PADDLE_ENFORCE_EQ(in_param_type, ctx.Input<Tensor>("Scale")->type(),
+                    "Scale input should be of float type");
+  PADDLE_ENFORCE_EQ(in_param_type, ctx.Input<Tensor>("Bias")->type(),
+                    "Bias input should be of float type");
+
+  return framework::OpKernelType(input_data_type, ctx.GetPlace());
+}
+
+void InstanceNormOpMaker::Make() {
+  AddAttr<float>("epsilon", "")
+      .SetDefault(1e-5)
+      .AddCustomChecker([](const float &epsilon) {
+        PADDLE_ENFORCE_EQ(epsilon >= 0.0f && epsilon <= 0.001f, true,
+                          "'epsilon' should be between 0.0 and 0.001.");
+      });
+  AddInput("X", "The input tensor");
+  AddInput("Scale",
+           "Scale is a 1-dimensional tensor of size C "
+           "that is applied to the output");
+  AddInput("Bias",
+           "Bias is a 1-dimensional tensor of size C "
+           "that is applied to the output");
+  AddOutput("Y", "result after normalization");
+  AddOutput("SavedMean",
+            "Mean of the current mini batch, "
+            "will be applied to the output in training")
+      .AsIntermediate();
+  AddOutput("SavedVariance",
+            "Variance of the current mini batch, "
+            "will be applied to the output in training")
+      .AsIntermediate();
+  AddComment(R"DOC(
+Instance Normalization.
+
+Instance Norm has been implemented as discussed in the paper:
+https://arxiv.org/pdf/1607.08022.pdf
+Can be used as a normalizer function for conv2d and fully_connected operations.
+The required data format for this layer is as follows:
+NCHW `[batch, in_channels, in_height, in_width]`
+
+)DOC");
+}
+
+template <typename T>
+class InstanceNormKernel<platform::CPUDeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
+
+    const auto *x = ctx.Input<Tensor>("X");
+    const auto &x_dims = x->dims();
+
+    const int N = x_dims[0];
+    const int C = x_dims[1];
+    const int NxC = N * C;
+
+    const int sample_size = x->numel() / N / C;
+
+    auto *y = ctx.Output<Tensor>("Y");
+    auto *saved_mean = ctx.Output<Tensor>("SavedMean");
+    auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
+
+    auto &dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
+    auto *place = dev_ctx.eigen_device();
+
+    Eigen::DSizes<int, 2> bcast(1, sample_size);
+    Eigen::DSizes<int, 2> C_shape(C, 1);
+    Eigen::DSizes<int, 2> NxC_shape(NxC, 1);
+    Eigen::DSizes<int, 2> shape(NxC, sample_size);
+
+    math::SetConstant<platform::CPUDeviceContext, T> set_constant;
+
+    saved_mean->mutable_data<T>(ctx.GetPlace());
+    saved_variance->mutable_data<T>(ctx.GetPlace());
+    set_constant(dev_ctx, saved_mean, static_cast<T>(0));
+    set_constant(dev_ctx, saved_variance, static_cast<T>(0));
+
+    auto saved_mean_a = framework::EigenVector<T>::Flatten(*saved_mean);
+    auto saved_mean_e = saved_mean_a.reshape(NxC_shape);
+    auto saved_variance_a =
+        framework::EigenVector<T>::Flatten(*saved_variance);
+    auto saved_variance_e = saved_variance_a.reshape(NxC_shape);
+
+    auto x_e = framework::EigenVector<T>::Flatten(*x);
+    auto x_arr = x_e.reshape(shape);
+
+    Eigen::DSizes<int, 1> rdims(1);
+
+    saved_mean_e.device(*place) = x_arr.mean(rdims);
+    auto saved_variance_arr =
+        (x_arr - saved_mean_e.broadcast(bcast)).square().mean(rdims) + epsilon;
+
+    saved_variance_e.device(*place) = saved_variance_arr.sqrt().inverse();
+
+    const auto *scale = ctx.Input<Tensor>("Scale");
+    const auto *bias = ctx.Input<Tensor>("Bias");
+    auto scale_e = framework::EigenVector<T>::Flatten(*scale);
+    auto scale_arr = scale_e.reshape(C_shape);
+    auto bias_e = framework::EigenVector<T>::Flatten(*bias);
+    auto bias_arr = bias_e.reshape(C_shape);
+
+    y->mutable_data<T>(ctx.GetPlace());
+    auto y_e =
framework::EigenVector::Flatten(*y); + auto y_arr = y_e.reshape(shape); + + // (x - mean) * inv_std * scale + bias + Eigen::DSizes bcast_param(N, sample_size); + y_arr.device(*place) = (x_arr - saved_mean_e.broadcast(bcast)) * + saved_variance_e.broadcast(bcast) * + scale_arr.broadcast(bcast_param) + + bias_arr.broadcast(bcast_param); + } +}; + +void InstanceNormGradOp::InferShape(framework::InferShapeContext *ctx) const { + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, "Input(X) should not be null"); + PADDLE_ENFORCE_EQ(ctx->HasInput("Scale"), true, + "Input(scale) should not be null"); + + PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Y")), true, + "Input(Y@GRAD) should not be null"); + PADDLE_ENFORCE_EQ(ctx->HasInput("SavedMean"), true, + "Input(SavedMean) should not be null"); + PADDLE_ENFORCE_EQ(ctx->HasInput("SavedVariance"), true, + "Input(SavedVariance) should not be null"); + + // check output + PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true, + "Output(x@GRAD) should not be null"); + if (ctx->HasOutput(framework::GradVarName("Scale"))) { + PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("Bias")), true, + "Output(Scale@GRAD) and Output(Bias@GRAD) should not be " + "null at the same time"); + } + const auto x_dims = ctx->GetInputDim("X"); + const int C = x_dims[1]; + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + if (ctx->HasOutput(framework::GradVarName("Scale"))) { + ctx->SetOutputDim(framework::GradVarName("Scale"), {C}); + ctx->SetOutputDim(framework::GradVarName("Bias"), {C}); + } +} + +framework::OpKernelType InstanceNormGradOp::GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + const auto *var = ctx.InputVar(framework::GradVarName("Y")); + if (var == nullptr) { + PADDLE_THROW("cannot find Y@GRAD"); + } + const Tensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); + } else if (var->IsType()) { + t = &var->Get(); + } + if (t == nullptr) { + PADDLE_THROW("cannot find Y@GRAD"); + } + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); +} + +template +class InstanceNormGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const auto *x = ctx.Input("X"); + const auto *d_y = ctx.Input(framework::GradVarName("Y")); + const auto *scale = ctx.Input("Scale"); + const auto *saved_mean = ctx.Input("SavedMean"); + const auto *saved_inv_variance = ctx.Input("SavedVariance"); + + const auto &x_dims = x->dims(); + + const int N = x_dims[0]; + const int C = x_dims[1]; + const int NxC = N * C; + const int sample_size = x->numel() / N / C; + + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + d_x->mutable_data(ctx.GetPlace()); + + auto &dev_ctx = ctx.template device_context(); + auto *place = dev_ctx.eigen_device(); + + Eigen::DSizes rdims(0); + Eigen::DSizes mean_rdims(1); + Eigen::DSizes rshape(NxC, sample_size); + Eigen::DSizes bcast(1, sample_size); + Eigen::DSizes C_shape(C, 1); + Eigen::DSizes NxC_shape(NxC, 1); + Eigen::DSizes param_shape(N, C); + Eigen::DSizes shape(NxC, sample_size); + + auto scale_e = framework::EigenVector::Flatten(*scale); + auto mean_e = framework::EigenVector::Flatten(*saved_mean); + auto inv_var_e = framework::EigenVector::Flatten(*saved_inv_variance); + auto dy_e = framework::EigenVector::Flatten(*d_y); + auto x_e = framework::EigenVector::Flatten(*x); + + auto 
scale_arr = scale_e.reshape(C_shape); + auto mean_arr = mean_e.reshape(NxC_shape); + auto inv_var_arr = inv_var_e.reshape(NxC_shape); + auto dy_arr = dy_e.reshape(shape); + auto x_arr = x_e.reshape(shape); + + auto tmp = + (x_arr - mean_arr.broadcast(bcast)) * inv_var_arr.broadcast(bcast); + + math::SetConstant set_constant; + // math: d_bias = np.sum(d_y, axis=(n,h,w)) + // math: d_scale = np.sum((X-mean) / inv_std * dy, axis=(n, h,w)) + if (d_scale && d_bias) { + d_scale->mutable_data(ctx.GetPlace()); + d_bias->mutable_data(ctx.GetPlace()); + set_constant(dev_ctx, d_scale, static_cast(0)); + set_constant(dev_ctx, d_bias, static_cast(0)); + + auto d_scale_e = framework::EigenVector::Flatten(*d_scale); + auto d_bias_e = framework::EigenVector::Flatten(*d_bias); + auto d_scale_data = d_scale_e.reshape(C_shape); + auto d_bias_data = d_bias_e.reshape(C_shape); + d_bias_data.device(*place) = + dy_arr.sum(mean_rdims).reshape(param_shape).sum(rdims); + d_scale_data.device(*place) = + (tmp * dy_arr).sum(mean_rdims).reshape(param_shape).sum(rdims); + } + + auto dy_mean = dy_arr.mean(mean_rdims).reshape(NxC_shape).broadcast(bcast); + + Eigen::DSizes bcast_param(N, sample_size); + set_constant(dev_ctx, d_x, static_cast(0)); + // math: d_x = scale * inv_var * d_y - scale * inv_var * np.sum(d_y, + // axis=(h,w)) + // - scale * (X - mean) * inv_var.pow(3) * np.sum(d_y * (X - + // mean), + // axis=(h,w)) + auto dx_e = framework::EigenVector::Flatten(*d_x); + auto dx_arr = dx_e.reshape(shape); + dx_arr.device(*place) = scale_arr.broadcast(bcast_param) * + inv_var_arr.broadcast(bcast) * + (dy_arr - dy_mean - + tmp * + (dy_arr * tmp) + .mean(mean_rdims) + .reshape(NxC_shape) + .broadcast(bcast)); + } +}; + +std::unique_ptr InstanceNormGradMaker::Apply() const { + auto *op = new framework::OpDesc(); + op->SetType("instance_norm_grad"); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); + + op->SetInput("Scale", Input("Scale")); + op->SetInput("Bias", Input("Bias")); + op->SetInput("SavedMean", Output("SavedMean")); + op->SetInput("SavedVariance", Output("SavedVariance")); + + op->SetAttrMap(Attrs()); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale")); + op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias")); + + return std::unique_ptr(op); +} + +void InstanceNormDoubleGradOp::InferShape( + framework::InferShapeContext *ctx) const { + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, "Input(X) should not be null"); + PADDLE_ENFORCE_EQ(ctx->HasInput("Scale"), true, + "Input(Scale) should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("SavedMean"), true, + "Input(SavedMean) should not be null"); + PADDLE_ENFORCE_EQ(ctx->HasInput("SavedVariance"), true, + "Input(SavedVariance) should not be null"); + PADDLE_ENFORCE_EQ(ctx->HasInput("DDX"), true, + "Input(DDX) should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("DY"), true, + "Input(Y@GRAD) should not be null"); + + // check output + PADDLE_ENFORCE_EQ(ctx->HasOutput("DX"), true, + "Output(DX) should not be null"); + + const auto x_dims = ctx->GetInputDim("X"); + const int C = x_dims[1]; + if (ctx->HasOutput("DX")) { + ctx->SetOutputDim("DX", x_dims); + } + if (ctx->HasOutput("DScale")) { + ctx->SetOutputDim("DScale", {C}); + } + if (ctx->HasOutput("DDY")) { + ctx->ShareDim("X", "DDY"); + } +} + +framework::OpKernelType InstanceNormDoubleGradOp::GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + const auto 
*var = ctx.InputVar("DY"); + if (var == nullptr) { + PADDLE_THROW("cannot find Y@GRAD"); + } + const Tensor *t = nullptr; + if (var->IsType()) { + t = &var->Get(); + } else if (var->IsType()) { + t = &var->Get(); + } + if (t == nullptr) { + PADDLE_THROW("cannot find Y@GRAD"); + } + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); +} + +std::unique_ptr InstanceNormDoubleGradMaker::Apply() const { + auto *op = new framework::OpDesc(); + op->SetType("instance_norm_grad_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Scale", Input("Scale")); + op->SetInput("SavedMean", Input("SavedMean")); + op->SetInput("SavedVariance", Input("SavedVariance")); + op->SetInput("DDX", OutputGrad(framework::GradVarName("X"))); + op->SetInput("DDScale", OutputGrad(framework::GradVarName("Scale"))); + op->SetInput("DDBias", OutputGrad(framework::GradVarName("Bias"))); + op->SetInput("DY", Input(framework::GradVarName("Y"))); + + op->SetAttrMap(Attrs()); + op->SetOutput("DX", InputGrad("X")); + op->SetOutput("DScale", InputGrad("Scale")); + op->SetOutput("DDY", InputGrad(framework::GradVarName("Y"))); + return std::unique_ptr(op); +} + +template +class InstanceNormDoubleGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const auto *X = ctx.Input("X"); + const auto *Scale = ctx.Input("Scale"); + const auto *dY = ctx.Input("DY"); + const auto *Saved_mean = ctx.Input("SavedMean"); + const auto *Saved_variance = ctx.Input("SavedVariance"); + const auto *ddX = ctx.Input("DDX"); + const auto *ddScale = ctx.Input("DDScale"); + const auto *ddBias = ctx.Input("DDBias"); + + auto *dX = ctx.Output("DX"); + auto *dScale = ctx.Output("DScale"); + auto *ddY = ctx.Output("DDY"); + + const auto &x_dims = X->dims(); + int N, C, H, W, D; + ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); + const int sample_size = X->numel() / N / C; + const int NxC = N * C; + + const T *mean_data = Saved_mean->data(); + const T *inv_var_data = Saved_variance->data(); + Tensor mean_tensor; + Tensor inv_var_tensor; + ConstEigenArrayMap x_arr(X->data(), sample_size, NxC); + ConstEigenVectorArrayMap mean_arr(mean_data, NxC); + ConstEigenVectorArrayMap inv_var_arr(inv_var_data, NxC); + + Tensor mean_tile; + mean_tile.Resize({sample_size, NxC}); + mean_tile.mutable_data(ctx.GetPlace()); + EigenArrayMap mean_tile_data(mean_tile.mutable_data(ctx.GetPlace()), + sample_size, NxC); + + Tensor inv_var_tile; + inv_var_tile.Resize({sample_size, NxC}); + inv_var_tile.mutable_data(ctx.GetPlace()); + EigenArrayMap inv_var_tile_data( + inv_var_tile.mutable_data(ctx.GetPlace()), sample_size, NxC); + + mean_tile_data = mean_arr.transpose().replicate(sample_size, 1); + inv_var_tile_data = inv_var_arr.transpose().replicate(sample_size, 1); + + ConstEigenVectorArrayMap scale_arr(Scale->data(), C); + + Tensor scale_tile; + scale_tile.Resize({sample_size, NxC}); + scale_tile.mutable_data(ctx.GetPlace()); + EigenArrayMap scale_tile_data(scale_tile.mutable_data(ctx.GetPlace()), + sample_size, NxC); + scale_tile_data = scale_arr.transpose().replicate(sample_size, N); + + ConstEigenArrayMap dy_arr(dY->data(), sample_size, NxC); + ConstEigenArrayMap ddx_arr(ddX->data(), sample_size, NxC); + + // math: dx = scale * ((x - mean) * inv_var / HxW * (np.mean(ddx, + // axis=(h,w)) * + // np.sum(dy, axis=(h,w)) - + // np.sum(dy * ddx, axis=(h,w)) + 3 * np.mean(dy * (x - mean), + // axis=(h,w)) * inv_var.pow(2) * + // np.sum(ddx * (x - mean), axis=(h,w))) + 
inv_var.pow(3) / HxW * + // np.sum(ddx * (x - mean)) * + // (np.mean(dy, axis=(h,w)) - dy) + inv_var.pow(3) / HxW * + // np.sum(dy, + // axis=(h,w)) * (x - mean) * + // (np.mean(ddx, axis=(h,w)) - ddx) + ddr * (dy * inv_var - inv_var + // * + // np.mean(dy, axis=(h,w)) - + // inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean), + // axis=(h,w)))) + + auto &dev_ctx = ctx.template device_context(); + math::SetConstant set_constant; + + Tensor x_sub_mean_mul_invstd; + x_sub_mean_mul_invstd.Resize({sample_size, NxC}); + x_sub_mean_mul_invstd.mutable_data(ctx.GetPlace()); + EigenArrayMap x_sub_mean_mul_invstd_arr( + x_sub_mean_mul_invstd.mutable_data(ctx.GetPlace()), sample_size, + NxC); + x_sub_mean_mul_invstd_arr = (x_arr - mean_tile_data) * inv_var_tile_data; + + if (dX) { + dX->mutable_data(ctx.GetPlace()); + set_constant(dev_ctx, dX, static_cast(0)); + EigenArrayMap dx_arr(dX->mutable_data(ctx.GetPlace()), sample_size, + NxC); + + if (ddX) { + dx_arr += + x_sub_mean_mul_invstd_arr * inv_var_tile_data * inv_var_tile_data / + sample_size * + (ddx_arr.colwise().sum() * dy_arr.colwise().sum() / sample_size - + (dy_arr * ddx_arr).colwise().sum() + + 3. * (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() * + (ddx_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / + sample_size); + + dx_arr += (ddx_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / + sample_size * inv_var_tile_data * inv_var_tile_data * + (dy_arr.colwise().sum() / sample_size - dy_arr); + + dx_arr += (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / + sample_size * inv_var_tile_data * inv_var_tile_data * + (ddx_arr.colwise().sum() / sample_size - ddx_arr); + + dx_arr = scale_tile_data * dx_arr.eval(); + } + if (ddScale) { + ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); + + Tensor ddscale_tile; + ddscale_tile.Resize({sample_size, NxC}); + ddscale_tile.mutable_data(ctx.GetPlace()); + EigenArrayMap ddscale_tile_data( + ddscale_tile.mutable_data(ctx.GetPlace()), sample_size, NxC); + ddscale_tile_data = ddscale_arr.transpose().replicate(sample_size, N); + + dx_arr += (dy_arr * inv_var_tile_data - + dy_arr.colwise().sum() / sample_size * inv_var_tile_data - + x_sub_mean_mul_invstd_arr * inv_var_tile_data * + (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / + sample_size) * + ddscale_tile_data; + } + } + if (dScale) { + // math: dscale = inv_var * (dy - np.mean(dy, axis=(h,w) - (x-mean) * + // inv_var.pow(2) * np.mean(dy * (x-mean), axis=(h,w)))) * ddx + dScale->mutable_data(ctx.GetPlace()); + set_constant(dev_ctx, dScale, static_cast(0)); + EigenVectorArrayMap dscale_arr(dScale->mutable_data(ctx.GetPlace()), + C); + if (ddX) { + Tensor first_grad; + first_grad.Resize({sample_size, NxC}); + first_grad.mutable_data(ctx.GetPlace()); + set_constant(dev_ctx, &first_grad, static_cast(0)); + EigenArrayMap first_grad_arr( + first_grad.mutable_data(ctx.GetPlace()), sample_size, NxC); + + first_grad_arr += + inv_var_tile_data * + (dy_arr - dy_arr.colwise().sum() / sample_size - + x_sub_mean_mul_invstd_arr * + (dy_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / + sample_size); + first_grad_arr = first_grad_arr.eval() * ddx_arr; + for (int nc = 0; nc < NxC; ++nc) { + int c = nc % C; + dscale_arr(c) += first_grad_arr.colwise().sum()(nc); + } + } + } + if (ddY) { + // math: ddy = (x - mean) * inv_var * ddscale + ddbias + + // scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) * + // np.mean(ddx * (x - mean), axis=(h,w))) + ddY->mutable_data(ctx.GetPlace()); + set_constant(dev_ctx, ddY, static_cast(0)); + 
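+      // Branch summary (restating the "math:" note above): ddy accumulates
+      // a DDX term, scale * inv_var * (centered, decorrelated ddx), plus,
+      // when DDScale/DDBias are provided, (x - mean) * inv_var * ddscale
+      // and ddbias terms.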
EigenArrayMap ddy_arr(ddY->mutable_data(ctx.GetPlace()), + sample_size, NxC); + if (ddX) { + ddy_arr += scale_tile_data * inv_var_tile_data * + (ddx_arr - ddx_arr.colwise().sum() / sample_size - + x_sub_mean_mul_invstd_arr * + (ddx_arr * x_sub_mean_mul_invstd_arr).colwise().sum() / + sample_size); + } + if (ddScale && ddBias) { + ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); + Tensor ddscale_tile; + ddscale_tile.Resize({sample_size, NxC}); + ddscale_tile.mutable_data(ctx.GetPlace()); + EigenArrayMap ddscale_tile_data( + ddscale_tile.mutable_data(ctx.GetPlace()), sample_size, NxC); + ddscale_tile_data = ddscale_arr.transpose().replicate(sample_size, N); + + ConstEigenVectorArrayMap ddbias_arr(ddBias->data(), C); + Tensor ddbias_tile; + ddbias_tile.Resize({sample_size, NxC}); + ddbias_tile.mutable_data(ctx.GetPlace()); + EigenArrayMap ddbias_tile_data( + ddbias_tile.mutable_data(ctx.GetPlace()), sample_size, NxC); + ddbias_tile_data = ddbias_arr.transpose().replicate(sample_size, N); + + ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data; + ddy_arr += ddbias_tile_data; + } + } + } +}; + +DECLARE_INPLACE_OP_INFERER(InstanceNormDoubleGradOpInplaceInference, + {"DY", "DDY"}); + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(instance_norm, ops::InstanceNormOp, ops::InstanceNormOpMaker, + ops::InstanceNormOpInferVarType, ops::InstanceNormGradMaker); +REGISTER_OPERATOR(instance_norm_grad, ops::InstanceNormGradOp, + ops::InstanceNormDoubleGradMaker); +REGISTER_OPERATOR(instance_norm_grad_grad, ops::InstanceNormDoubleGradOp, + ops::InstanceNormDoubleGradOpInplaceInference); + +REGISTER_OP_CPU_KERNEL( + instance_norm, + ops::InstanceNormKernel, + ops::InstanceNormKernel); +REGISTER_OP_CPU_KERNEL( + instance_norm_grad, + ops::InstanceNormGradKernel, + ops::InstanceNormGradKernel); +REGISTER_OP_CPU_KERNEL( + instance_norm_grad_grad, + ops::InstanceNormDoubleGradKernel, + ops::InstanceNormDoubleGradKernel); diff --git a/paddle/fluid/operators/instance_norm_op.cu b/paddle/fluid/operators/instance_norm_op.cu new file mode 100644 index 00000000..3f0799fb --- /dev/null +++ b/paddle/fluid/operators/instance_norm_op.cu @@ -0,0 +1,655 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include "cub/cub.cuh" +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/operators/batch_norm_op.h" +#include "paddle/fluid/operators/instance_norm_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DataLayout = framework::DataLayout; +template +using CudnnDataType = platform::CudnnDataType; +template +using BatchNormParamType = typename CudnnDataType::BatchNormParamType; + +template +static __global__ void repeat_param(const T *input, T *output, + const int repeat_num, const int C) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < repeat_num * C; + i += blockDim.x * gridDim.x) { + int index = i % C; + output[i] = input[index]; + } +} + +template +static __global__ void add_param(const T *input, T *output, + const int repeat_num, const int C) { + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage ou_storage; + for (int i = blockIdx.x; i < C; i += gridDim.x) { + T ou = static_cast(0); + for (int j = threadIdx.x; j < repeat_num; j += blockDim.x) { + const int index = j * C + i; + ou += static_cast(input[index]); + } + ou = BlockReduce(ou_storage).Reduce(ou, cub::Sum()); + if (threadIdx.x == 0) { + output[i] = ou; + } + __syncthreads(); + + if (AVG) { + output[i] /= repeat_num; + } + } +} + +template +class InstanceNormKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, + "It must be CUDAPlace."); + double epsilon = static_cast(ctx.Attr("epsilon")); + + auto *x = ctx.Input("X"); + auto &x_dims = x->dims(); + PADDLE_ENFORCE_GE( + x_dims.size(), 2, + "the dimension of input X must greater than or equal to 2"); + PADDLE_ENFORCE_LE( + x_dims.size(), 5, + "the dimension of input X must smaller than or equal to 5"); + int N, C, H, W, D; + ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); + int NxC = N * C; + Tensor x_tmp; + x_tmp.ShareDataWith(*x).Resize({1, NxC, H, W, D}); + + auto *y = ctx.Output("Y"); + y->mutable_data(ctx.GetPlace()); + + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t in_param_desc_; + + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_)); + + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); + + VLOG(3) << "Setting descriptors."; + std::vector dims; + std::vector strides; + dims = {1, NxC, H, W, D}; + strides = {NxC * H * W * D, H * W * D, W * D, D, 1}; + + auto &dev_ctx = ctx.template device_context(); + + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, CudnnDataType::type, + x_dims.size() > 3 ? 
x_dims.size() : 4, dims.data(), strides.data())); + CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); + + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + + Tensor scale_tmp = + ctx.AllocateTmpTensor({NxC}, dev_ctx); + scale_tmp.mutable_data(ctx.GetPlace()); + Tensor bias_tmp = + ctx.AllocateTmpTensor({NxC}, dev_ctx); + bias_tmp.mutable_data(ctx.GetPlace()); + + const int n = x->numel(); + const int block = 512; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + const int grid = std::min((NxC + block - 1) / block, max_blocks); + + repeat_param<<>>( + scale->data(), scale_tmp.data(), N, C); + repeat_param<<>>( + bias->data(), bias_tmp.data(), N, C); + + auto handle = dev_ctx.cudnn_handle(); + + math::SetConstant> + functor; + + auto *saved_mean = ctx.Output("SavedMean"); + auto *saved_variance = ctx.Output("SavedVariance"); + saved_mean->mutable_data>(ctx.GetPlace()); + saved_variance->mutable_data>(ctx.GetPlace()); + functor(dev_ctx, saved_mean, static_cast>(0)); + functor(dev_ctx, saved_variance, static_cast>(0)); + + CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining( + handle, CUDNN_BATCHNORM_SPATIAL, CudnnDataType::kOne(), + CudnnDataType::kZero(), data_desc_, x_tmp.template data(), + data_desc_, y->template mutable_data(ctx.GetPlace()), in_param_desc_, + scale_tmp.template data>(), + bias_tmp.template data>(), 0, nullptr, nullptr, + epsilon, saved_mean->template mutable_data>( + ctx.GetPlace()), + saved_variance->template mutable_data>( + ctx.GetPlace()))); + + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); + } +}; + +template +static __global__ void GradComputeDX(const T *dy, + const BatchNormParamType *scale, + const BatchNormParamType *mean, + const T *x, + const BatchNormParamType *variance, + const int C, const int sample_size, + T *dx) { + int beg_idx = blockIdx.x * sample_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * sample_size; + int ncid = blockIdx.x; + int c = ncid % C; + + BatchNormParamType mean_val = mean[ncid]; + BatchNormParamType inv_var_val = variance[ncid]; + + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage dy_storage; + __shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage; + __shared__ BatchNormParamType dy_sum_val; + __shared__ BatchNormParamType dy_x_sub_mean_sum_val; + + BatchNormParamType dy_sum = static_cast>(0); + BatchNormParamType dy_x_sub_mean_sum = + static_cast>(0); + + for (int i = beg_idx; i < end_idx; i += BlockDim) { + BatchNormParamType dy_i = static_cast>(dy[i]); + dy_sum += dy_i; + dy_x_sub_mean_sum += + dy_i * (static_cast>(x[i]) - mean_val); + } + dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); + dy_x_sub_mean_sum = + BlockReduce(dy_x_sub_mean_storage).Reduce(dy_x_sub_mean_sum, cub::Sum()); + + if (threadIdx.x == 0) { + dy_sum_val = dy_sum; + dy_x_sub_mean_sum_val = dy_x_sub_mean_sum; + } + __syncthreads(); + + for (int i = beg_idx; i < end_idx; i += BlockDim) { + dx[i] = + (static_cast>(dy[i]) - + dy_sum_val / static_cast>(sample_size) - + (static_cast>(x[i]) - mean_val) * + dy_x_sub_mean_sum_val * inv_var_val * inv_var_val / sample_size) * + scale[c] * inv_var_val; + } +} + +template +class InstanceNormGradKernel + : public framework::OpKernel { + public: + 
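+  // Note: like the forward kernel above, the backward pass views the input
+  // as [1, N * C, H, W, D], so per-(n, c) instance statistics can reuse
+  // cuDNN's batch-norm routines with N * C "channels". When Scale/Bias
+  // gradients are not needed, the hand-written GradComputeDX kernel above
+  // is launched instead of cudnnBatchNormalizationBackward.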
void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, + "It must use CUDAPlace."); + double epsilon = static_cast(ctx.Attr("epsilon")); + const auto *scale = ctx.Input("Scale"); + const auto *x = ctx.Input("X"); + const auto *d_y = ctx.Input(framework::GradVarName("Y")); + + const auto &x_dims = x->dims(); + + int N, C, H, W, D; + ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); + int NxC = N * C; + + Tensor x_tmp, d_y_tmp; + x_tmp.ShareDataWith(*x).Resize({1, NxC, H, W, D}); + d_y_tmp.ShareDataWith(*d_y).Resize({1, NxC, H, W, D}); + + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + + d_x->mutable_data(ctx.GetPlace()); + if (d_scale && d_bias) { + d_scale->mutable_data(ctx.GetPlace()); + d_bias->mutable_data(ctx.GetPlace()); + } + PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL); + PADDLE_ENFORCE_EQ(scale->dims()[0], C); + + auto &dev_ctx = ctx.template device_context(); + + const int n = x->numel(); + const int block = 512; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + const int grid = std::min(NxC, max_blocks); + const int grid1 = (C + block - 1) / block; + + Tensor scale_tmp = + ctx.AllocateTmpTensor({NxC}, dev_ctx); + scale_tmp.mutable_data(ctx.GetPlace()); + Tensor d_scale_tmp = + ctx.AllocateTmpTensor({NxC}, dev_ctx); + Tensor d_bias_tmp = + ctx.AllocateTmpTensor({NxC}, dev_ctx); + repeat_param<<>>( + scale->data(), scale_tmp.data(), N, C); + + std::vector dims; + std::vector strides; + dims = {1, NxC, H, W, D}; + strides = {NxC * H * W * D, H * W * D, W * D, D, 1}; + + if ((H * W * D) == 1) { + framework::TensorCopy(*d_y, ctx.GetPlace(), d_x); + math::SetConstant> + functor; + functor(dev_ctx, d_scale, static_cast>(0)); + functor(dev_ctx, d_bias, static_cast>(0)); + return; + } + + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t in_param_desc_; + + CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + CUDNN_ENFORCE( + platform::dynload::cudnnCreateTensorDescriptor(&in_param_desc_)); + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); + + CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, CudnnDataType::type, + x_dims.size() > 3 ? 
x_dims.size() : 4, dims.data(), strides.data())); + CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor( + in_param_desc_, data_desc_, CUDNN_BATCHNORM_SPATIAL)); + + const auto *saved_mean = ctx.Input("SavedMean"); + const auto *saved_var = ctx.Input("SavedVariance"); + const auto *saved_mean_data = + saved_mean->template data>(); + const auto *saved_var_data = + saved_var->template data>(); + if (d_scale && d_bias) { + CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward( + dev_ctx.cudnn_handle(), CUDNN_BATCHNORM_SPATIAL, + CudnnDataType::kOne(), CudnnDataType::kZero(), + CudnnDataType::kOne(), CudnnDataType::kZero(), data_desc_, + x_tmp.template data(), data_desc_, d_y_tmp.template data(), + data_desc_, d_x->template mutable_data(ctx.GetPlace()), + in_param_desc_, scale_tmp.template data>(), + d_scale_tmp.template mutable_data>( + ctx.GetPlace()), + d_bias_tmp.template mutable_data>( + ctx.GetPlace()), + epsilon, saved_mean_data, saved_var_data)); + } else { + if (d_x) { + GradComputeDX<<>>( + d_y->data(), scale->data>(), + saved_mean_data, x->data(), saved_var_data, C, H * W * D, + d_x->data()); + } + } + + if (d_scale && d_bias) { + add_param<<>>( + d_scale_tmp.data(), d_scale->data(), N, C); + add_param<<>>( + d_bias_tmp.data(), d_bias->data(), N, C); + } + + CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + CUDNN_ENFORCE( + platform::dynload::cudnnDestroyTensorDescriptor(in_param_desc_)); + } +}; + +static __device__ __forceinline__ float real_sqrt(float x) { + return 1. / sqrtf(x); +} +static __device__ __forceinline__ double real_sqrt(double x) { + return 1. / sqrt(x); +} + +template +__global__ void DoubleGradComputeDX(const T *x, const T *mean, + const T *variance, const T *ddx, + const T *dy, const T *scale, + const T *ddscale, int C, int sample_size, + const double epsilon, T *dx) { + int beg_idx = blockIdx.x * sample_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * sample_size; + int ncid = blockIdx.x; + int c = ncid % C; + + T mean_val = mean[ncid]; + T var_val = variance[ncid]; + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage dy_storage; + __shared__ typename BlockReduce::TempStorage ddx_storage; + __shared__ typename BlockReduce::TempStorage dy_mul_ddx_storage; + __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage; + __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage; + __shared__ T dy_sum_val; + __shared__ T ddx_sum_val; + __shared__ T dy_mul_ddx_sum_val; + __shared__ T dy_mul_x_sub_mean_sum_val; + __shared__ T ddx_mul_x_sub_mean_sum_val; + + T dy_sum = 0; + T ddx_sum = 0; + T dy_mul_ddx_sum = 0; + T dy_mul_x_sub_mean_sum = 0; + T ddx_mul_x_sub_mean_sum = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + T ddx_i = ddx[i]; + T dy_i = dy[i]; + T tmp = x[i] - mean_val; + + dy_sum += dy_i; + ddx_sum += ddx_i; + dy_mul_ddx_sum += (ddx_i * dy_i); + + dy_mul_x_sub_mean_sum += (dy_i * tmp); + ddx_mul_x_sub_mean_sum += (ddx_i * tmp); + } + + dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); + ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum()); + dy_mul_ddx_sum = + BlockReduce(dy_mul_ddx_storage).Reduce(dy_mul_ddx_sum, cub::Sum()); + dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage) + .Reduce(dy_mul_x_sub_mean_sum, cub::Sum()); + ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage) + .Reduce(ddx_mul_x_sub_mean_sum, cub::Sum()); + + if (threadIdx.x == 0) { + dy_sum_val = dy_sum; + ddx_sum_val 
= ddx_sum; + dy_mul_ddx_sum_val = dy_mul_ddx_sum; + dy_mul_x_sub_mean_sum_val = dy_mul_x_sub_mean_sum; + ddx_mul_x_sub_mean_sum_val = ddx_mul_x_sub_mean_sum; + } + __syncthreads(); + + if (ddx != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + dx[i] += + ((x[i] - mean_val) * var_val * var_val * var_val / sample_size * + (ddx_sum_val * dy_sum_val / sample_size - dy_mul_ddx_sum_val + + 3. * dy_mul_x_sub_mean_sum_val * var_val * + ddx_mul_x_sub_mean_sum_val * var_val / sample_size) + + ddx_mul_x_sub_mean_sum_val * var_val / sample_size * var_val * + var_val * (dy_sum_val / sample_size - dy[i]) + + dy_mul_x_sub_mean_sum_val * var_val / sample_size * var_val * + var_val * (ddx_sum_val / sample_size - ddx[i])) * + scale[c]; + } + } + __syncthreads(); + if (ddscale != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + dx[i] += (dy[i] * var_val - dy_sum_val / sample_size * var_val - + (x[i] - mean_val) * var_val * dy_mul_x_sub_mean_sum_val * + var_val / sample_size) * + ddscale[c]; + } + } +} + +template +__global__ void DoubleGradComputeDDY(const T *x, const T *mean, + const T *variance, const T *ddscale, + const T *ddbias, const T *ddx, + const T *scale, int C, int sample_size, + const double epsilon, T *ddy) { + int beg_idx = blockIdx.x * sample_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * sample_size; + int ncid = blockIdx.x; + int c = ncid % C; + + T mean_val = mean[ncid]; + T var_val = variance[ncid]; + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage ddx_storage; + __shared__ typename BlockReduce::TempStorage ddx_mul_x_sub_mean_storage; + __shared__ T ddx_sum_val; + __shared__ T ddx_mul_x_sub_mean_sum_val; + + T ddx_sum = 0; + T ddx_mul_x_sub_mean_sum = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + T ddx_i = ddx[i]; + ddx_sum += ddx_i; + ddx_mul_x_sub_mean_sum += (ddx_i * (x[i] - mean_val)); + } + ddx_sum = BlockReduce(ddx_storage).Reduce(ddx_sum, cub::Sum()); + ddx_mul_x_sub_mean_sum = BlockReduce(ddx_mul_x_sub_mean_storage) + .Reduce(ddx_mul_x_sub_mean_sum, cub::Sum()); + + if (threadIdx.x == 0) { + ddx_sum_val = ddx_sum; + ddx_mul_x_sub_mean_sum_val = ddx_mul_x_sub_mean_sum; + } + __syncthreads(); + + if (ddx != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + ddy[i] += scale[c] * var_val * + (ddx[i] - ddx_sum_val / sample_size - + (x[i] - mean_val) * var_val * ddx_mul_x_sub_mean_sum_val * + var_val / sample_size); + } + } + __syncthreads(); + if (ddscale != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + ddy[i] += (x[i] - mean_val) * var_val * ddscale[c]; + } + } + __syncthreads(); + if (ddbias != nullptr) { + for (int i = beg_idx; i < end_idx; i += BlockDim) { + ddy[i] += ddbias[c]; + } + } +} + +template +__global__ void DoubleGradComputeDScale(const T *x, const T *mean, + const T *variance, const T *ddx, + const T *dy, int C, int sample_size, + const double epsilon, T *dscale) { + int beg_idx = blockIdx.x * sample_size + threadIdx.x; + int end_idx = (blockIdx.x + 1) * sample_size; + int ncid = blockIdx.x; + int c = ncid % C; + + T mean_val = mean[ncid]; + T var_val = variance[ncid]; + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage dy_storage; + __shared__ typename BlockReduce::TempStorage dy_mul_x_sub_mean_storage; + __shared__ typename BlockReduce::TempStorage dscale_tmp_storage; + __shared__ T dy_sum_val; + __shared__ T dy_mul_x_sub_mean_sum_val; + + T dy_sum = 0; + T dy_mul_x_sub_mean_sum = 0; + for (int i = 
beg_idx; i < end_idx; i += BlockDim) { + T dy_i = dy[i]; + dy_sum += dy_i; + dy_mul_x_sub_mean_sum += (dy_i * (x[i] - mean_val)); + } + dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); + dy_mul_x_sub_mean_sum = BlockReduce(dy_mul_x_sub_mean_storage) + .Reduce(dy_mul_x_sub_mean_sum, cub::Sum()); + + if (threadIdx.x == 0) { + dy_sum_val = dy_sum; + dy_mul_x_sub_mean_sum_val = dy_mul_x_sub_mean_sum; + } + __syncthreads(); + + if (ddx != nullptr) { + T dscale_tmp = 0; + for (int i = beg_idx; i < end_idx; i += BlockDim) { + dscale_tmp += + ddx[i] * var_val * (dy[i] - dy_sum_val / sample_size - + dy_mul_x_sub_mean_sum_val * (x[i] - mean_val) * + var_val * var_val / sample_size); + } + dscale_tmp = BlockReduce(dscale_tmp_storage).Reduce(dscale_tmp, cub::Sum()); + + if (threadIdx.x == 0) { + dscale[ncid] += dscale_tmp; + } + __syncthreads(); + } +} + +template +class InstanceNormDoubleGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const auto *X = ctx.Input("X"); + const auto *Scale = ctx.Input("Scale"); + const auto *dY = ctx.Input("DY"); + const auto *Saved_mean = ctx.Input("SavedMean"); + const auto *Saved_variance = ctx.Input("SavedVariance"); + const auto *running_mean = ctx.Input("Mean"); + const auto *running_var = ctx.Input("Variance"); + const auto *ddX = ctx.Input("DDX"); + const auto *ddScale = ctx.Input("DDScale"); + const auto *ddBias = ctx.Input("DDBias"); + const double epsilon = static_cast(ctx.Attr("epsilon")); + + auto *dX = ctx.Output("DX"); + auto *dScale = ctx.Output("DScale"); + auto *ddY = ctx.Output("DDY"); + + const T *x_data = X->data(); + const T *scale_data = Scale->data(); + const T *dy_data = dY->data(); + const T *ddx_data = (ddX == nullptr ? nullptr : ddX->data()); + + const T *ddscale_data = (ddScale == nullptr ? nullptr : ddScale->data()); + const T *ddbias_data = (ddScale == nullptr ? 
nullptr : ddBias->data()); + + const T *mean_data = Saved_mean->data(); + const T *variance_data = Saved_variance->data(); + + auto &x_dims = X->dims(); + int N, C, H, W, D; + ExtractNCWHD(x_dims, DataLayout::kNCHW, &N, &C, &H, &W, &D); + int NxC = N * C; + const int n = X->numel(); + int sample_size = n / N / C; + + auto &dev_ctx = ctx.template device_context(); + const int block = 512; + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + const int grid = NxC; + const int grid1 = (C + block - 1) / block; + + math::SetConstant set_zero; + + if (dX) { + T *dx_data = dX->mutable_data(ctx.GetPlace()); + set_zero(dev_ctx, dX, static_cast(0)); + DoubleGradComputeDX<<>>( + x_data, mean_data, variance_data, ddx_data, dy_data, scale_data, + ddscale_data, C, sample_size, epsilon, dx_data); + } + if (dScale) { + Tensor dscale_tmp = + ctx.AllocateTmpTensor({NxC}, dev_ctx); + set_zero(dev_ctx, &dscale_tmp, static_cast(0)); + T *dscale_tmp_data = dscale_tmp.mutable_data(ctx.GetPlace()); + + T *dscale_data = dScale->mutable_data(ctx.GetPlace()); + set_zero(dev_ctx, dScale, static_cast(0)); + DoubleGradComputeDScale<<>>( + x_data, mean_data, variance_data, ddx_data, dy_data, C, sample_size, + epsilon, dscale_tmp_data); + add_param<<>>( + dscale_tmp.data(), dScale->data(), N, C); + } + if (ddY) { + T *ddy_data = ddY->mutable_data(ctx.GetPlace()); + set_zero(dev_ctx, ddY, static_cast(0)); + DoubleGradComputeDDY<<>>( + x_data, mean_data, variance_data, ddscale_data, ddbias_data, ddx_data, + scale_data, C, sample_size, epsilon, ddy_data); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_CUDA_KERNEL( + instance_norm, ops::InstanceNormKernel, + ops::InstanceNormKernel); +REGISTER_OP_CUDA_KERNEL( + instance_norm_grad, + ops::InstanceNormGradKernel, + ops::InstanceNormGradKernel); +REGISTER_OP_CUDA_KERNEL( + instance_norm_grad_grad, + ops::InstanceNormDoubleGradKernel, + ops::InstanceNormDoubleGradKernel); diff --git a/paddle/fluid/operators/instance_norm_op.h b/paddle/fluid/operators/instance_norm_op.h new file mode 100644 index 00000000..509c1ff0 --- /dev/null +++ b/paddle/fluid/operators/instance_norm_op.h @@ -0,0 +1,121 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/norm_utils.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using DataLayout = framework::DataLayout; + +template +using EigenArrayMap = + Eigen::Map>; +template +using ConstEigenArrayMap = + Eigen::Map>; +template +using EigenVectorArrayMap = Eigen::Map>; +template +using ConstEigenVectorArrayMap = + Eigen::Map>; + +class InstanceNormOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext *ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override; +}; + +class InstanceNormGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext *ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override; +}; + +class InstanceNormDoubleGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext *ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override; +}; + +class InstanceNormOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; + +class InstanceNormGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override; +}; + +class InstanceNormDoubleGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override; +}; + +class InstanceNormOpInferVarType + : public framework::PassInDtypeAndVarTypeToOutput { + protected: + std::unordered_map GetInputOutputWithSameType() + const override { + return std::unordered_map{{"X", "Y"}}; + } +}; + +template +class InstanceNormKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override; +}; + +template +class InstanceNormGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override; +}; + +template +class InstanceNormDoubleGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc index 900b0c63..612f770b 100644 --- a/paddle/fluid/operators/interpolate_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -19,53 +19,186 @@ namespace paddle { namespace operators { using framework::Tensor; +using DataLayout = framework::DataLayout; + +static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) { + auto dim_x = ctx->GetInputDim("X"); + auto interp_method = ctx->Attrs().Get("interp_method"); + + PADDLE_ENFORCE( + "bilinear" == interp_method || "nearest" == interp_method, + "Interpolation method can only be \"bilinear\" or \"nearest\" when " + "Input(X) 
dimension is 4"); + const DataLayout data_layout = framework::StringToDataLayout( + ctx->Attrs().Get("data_layout")); + + if (ctx->HasInputs("SizeTensor")) { + // top prority size + auto inputs_name = ctx->Inputs("SizeTensor"); + PADDLE_ENFORCE_EQ( + inputs_name.size(), 2, + "Input(SizeTensor)'size of Op(interpolate) must be 2. " + "Attr(out_shape)'s length must be 2 for 4-D input tensor."); + int out_h = ctx->Attrs().Get("out_h"); + int out_w = ctx->Attrs().Get("out_w"); + framework::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {dim_x[0], dim_x[1], out_h, out_w}; + } else { + dim_out = {dim_x[0], out_h, out_w, dim_x[3]}; + } + ctx->SetOutputDim("Out", dim_out); -class InterpolateOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; + return; + } - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of InterpolateOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of InterpolationOp should not be null."); + int out_h, out_w; + if (ctx->HasInput("Scale")) { + auto scale_tensor = ctx->GetInputDim("Scale"); + PADDLE_ENFORCE_EQ(scale_tensor.size(), 1, + "Scale's dimension size must be 1."); + out_h = -1; + out_w = -1; + } else { + float scale = ctx->Attrs().Get("scale"); + if (scale > 0) { + // round down + out_h = (data_layout == DataLayout::kNCHW + ? static_cast(dim_x[2] * scale) + : static_cast(dim_x[1] * scale)); + out_w = (data_layout == DataLayout::kNCHW + ? static_cast(dim_x[3] * scale) + : static_cast(dim_x[2] * scale)); + // protect when input shape is -1 + out_h = out_h > 0 ? out_h : -1; + out_w = out_w > 0 ? out_w : -1; + } else { + out_h = ctx->Attrs().Get("out_h"); + out_w = ctx->Attrs().Get("out_w"); + } + } - auto interp_method = ctx->Attrs().Get("interp_method"); - PADDLE_ENFORCE( - "bilinear" == interp_method || "nearest" == interp_method, - "Interpolation method can only be \"bilinear\" or \"nearest\"."); + if (ctx->HasInput("OutSize") && ctx->IsRuntime()) { + auto out_size_dim = ctx->GetInputDim("OutSize"); + PADDLE_ENFORCE_EQ(out_size_dim.size(), 1, + "OutSize's dimension size must be 1"); + PADDLE_ENFORCE_EQ(out_size_dim[0], 2, "OutSize's dim[0] must be 2"); + ctx->ShareLoD("X", "Out"); + return; + } - auto dim_x = ctx->GetInputDim("X"); // NCHW format - PADDLE_ENFORCE_EQ(dim_x.size(), 4, "X's dimension must be 4"); + framework::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {dim_x[0], dim_x[1], out_h, out_w}; + } else { + dim_out = {dim_x[0], out_h, out_w, dim_x[3]}; + } + ctx->SetOutputDim("Out", dim_out); +} + +static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) { + auto dim_x = ctx->GetInputDim("X"); + auto interp_method = ctx->Attrs().Get("interp_method"); + + PADDLE_ENFORCE("trilinear" == interp_method, + "Interpolation method can only be \"trilinear\" when Input(X) " + "dimension is 5"); + const DataLayout data_layout = framework::StringToDataLayout( + ctx->Attrs().Get("data_layout")); + + if (ctx->HasInputs("SizeTensor")) { + // top prority size + auto inputs_name = ctx->Inputs("SizeTensor"); + PADDLE_ENFORCE_EQ( + inputs_name.size(), 3, + "Input(SizeTensor)'s size of Op(interpolate) must be 3. 
" + "Attr(out_shape)'s length must be 3 for 5-D input tensor."); + int out_d = ctx->Attrs().Get("out_d"); + int out_h = ctx->Attrs().Get("out_h"); + int out_w = ctx->Attrs().Get("out_w"); + framework::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {dim_x[0], dim_x[1], out_d, out_h, out_w}; + } else { + dim_out = {dim_x[0], out_d, out_h, out_w, dim_x[4]}; + } + ctx->SetOutputDim("Out", dim_out); + + return; + } - int out_h, out_w; + int out_d, out_h, out_w; + if (ctx->HasInput("Scale")) { + auto scale_tensor = ctx->GetInputDim("Scale"); + PADDLE_ENFORCE_EQ(scale_tensor.size(), 1, + "Scale's dimension size must be 1"); + out_d = -1; + out_h = -1; + out_w = -1; + } else { float scale = ctx->Attrs().Get("scale"); if (scale > 0) { // round down - out_h = static_cast(dim_x[2] * scale); - out_w = static_cast(dim_x[3] * scale); + out_d = (data_layout == DataLayout::kNCHW + ? static_cast(dim_x[2] * scale) + : static_cast(dim_x[1] * scale)); + out_h = (data_layout == DataLayout::kNCHW + ? static_cast(dim_x[3] * scale) + : static_cast(dim_x[2] * scale)); + out_w = (data_layout == DataLayout::kNCHW + ? static_cast(dim_x[4] * scale) + : static_cast(dim_x[3] * scale)); // protect when input shape is -1 + out_d = out_d > 0 ? out_d : -1; out_h = out_h > 0 ? out_h : -1; out_w = out_w > 0 ? out_w : -1; } else { + out_d = ctx->Attrs().Get("out_d"); out_h = ctx->Attrs().Get("out_h"); out_w = ctx->Attrs().Get("out_w"); - PADDLE_ENFORCE_GT(out_h, 0, "out_h should be greater than 0."); - PADDLE_ENFORCE_GT(out_w, 0, "out_w should be greater than 0."); } + } - if (ctx->HasInput("OutSize") && ctx->IsRuntime()) { - auto out_size_dim = ctx->GetInputDim("OutSize"); - PADDLE_ENFORCE_EQ(out_size_dim.size(), 1, - "OutSize's dimension size must be 1"); - PADDLE_ENFORCE_EQ(out_size_dim[0], 2, "OutSize's dim[0] must be 2"); - ctx->ShareLoD("X", "Out"); - return; - } + if (ctx->HasInput("OutSize") && ctx->IsRuntime()) { + auto out_size_dim = ctx->GetInputDim("OutSize"); + PADDLE_ENFORCE_EQ(out_size_dim.size(), 1, + "OutSize's dimension size must be 1"); + PADDLE_ENFORCE_EQ(out_size_dim[0], 3, "OutSize's dim[0] must be 3"); + ctx->ShareLoD("X", "Out"); + return; + } + + framework::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {dim_x[0], dim_x[1], out_d, out_h, out_w}; + } else { + dim_out = {dim_x[0], out_d, out_h, out_w, dim_x[4]}; + } + ctx->SetOutputDim("Out", dim_out); +} + +class InterpolateOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of InterpolateOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of InterpolationOp should not be null."); - std::vector dim_out({dim_x[0], dim_x[1], out_h, out_w}); - ctx->SetOutputDim("Out", framework::make_ddim(dim_out)); + auto dim_x = ctx->GetInputDim("X"); // NCHW format + PADDLE_ENFORCE(dim_x.size() == 4 || dim_x.size() == 5, + "Input(X) dimension must be 4 or 5"); + + if (dim_x.size() == 4) { + // shape check for 2D interpolate for input tensor shape NCHW + Interpolate2DInferShapeCheck(ctx); + } else { // dim_x.size() == 5 + // shape check for 3D interpolate for input tensor shape NCDHW + Interpolate3DInferShapeCheck(ctx); + } } protected: @@ -74,6 +207,16 @@ class InterpolateOp : public framework::OperatorWithKernel { return framework::OpKernelType(ctx.Input("X")->type(), ctx.GetPlace()); } + + 
framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + if (var_name == "SizeTensor" || var_name == "Scale") { + return expected_kernel_type; + } + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } }; class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker { @@ -81,22 +224,46 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("X", "The input tensor of interpolate operator, " - "This is a 4-D tensor with shape of [N, C, H, w]."); + "This is a 4-D tensor with shape of [N, C, H, W] or a " + "5-D tensor with shape of [N, C, D, H, W]."); AddInput("OutSize", "This is a 1-D tensor with two numbers to specify output size. " - "The first number is height and the second number is width.") + "It should be [output_height, output_width] when input is a 4-D " + "tensor and should be [output_depth, output_height, output_width] " + "when input is a 5-D tensor. It has a higher priority than " + "the attr(out_d), attr(out_h), attr(out_w) and attr(scale).") + .AsDispensable(); + AddInput("SizeTensor", + "(vector<Tensor<int32>>, optional). If provided, interpolate will " + "use this. The shape of the tensor in vector MUST BE [1]. " + "It has the highest priority, compared with Input(OutSize) and " + "attr(out_d), attr(out_h), attr(out_w) and attr(scale).") + .AsDuplicable() + .AsDispensable(); + AddInput("Scale", + "This is a 1-D tensor with one number to specify output scale. " + "It has a higher priority than attr(scale).") .AsDispensable(); AddOutput("Out", "The output tensor of interpolate operator, " - "This is a 4-D tensor with shape of [N, C, H, W]."); + "This is a tensor of the same rank as Input(X)."); + + AddAttr<std::string>( + "data_layout", + "(string, default NCHW), an optional string from: \"NHWC\", \"NCHW\". " + "Specify whether the data format of the input and output data is " + "channel_first or channel_last.") + .SetDefault("NCHW"); + AddAttr<int>("out_d", "output depth of interpolate op.").SetDefault(0); + AddAttr<int>("out_h", "output height of interpolate op.").SetDefault(0); + AddAttr<int>("out_w", "output width of interpolate op.").SetDefault(0); AddAttr<float>("scale", "scale factor of interpolate op.").SetDefault(0.); AddAttr<std::string>("interp_method", "(string, default \"bilinear\"), interpolation " "method, can be \"bilinear\" for " - "bilinear interpolation and \"nearest\" for nearest " + "bilinear interpolation, \"trilinear\" for trilinear " + "interpolation and \"nearest\" for nearest " "neighbor interpolation.") .SetDefault("bilinear"); AddAttr<bool>( @@ -127,6 +294,11 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker { to perform linear interpolation first in one direction, and then again in the other direction. + Trilinear interpolation is an extension of linear interpolation for + interpolating functions of three variables (e.g. D-direction, + H-direction and W-direction in this op) on a rectilinear 3D grid. + Linear interpolation is performed along each of the three directions. + Align_corners and align_mode are optional parameters; the calculation method of interpolation can be selected by them.
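          [Editor's note] A worked illustration of the size and coordinate
          formulas in the DOC below (all numbers hypothetical): with
          align_corners = False, align_mode = 0 and scale_factor = 2, a 4 x 4
          input yields an 8 x 8 output (out_h = in_h * scale in the kernels),
          and output index 3 maps to the source coordinate
          (3 + 0.5) * (4 / 8) - 0.5 = 1.25, so it blends input indices 1 and 2
          with weights 0.75 and 0.25.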
@@ -183,6 +355,27 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker { H_out = H_{in} * scale_{factor} W_out = W_{in} * scale_{factor} + Trilinear interpolation: + + if: + align_corners = False , align_mode = 0 + + input : (N,C,D_in,H_in,W_in) + output: (N,C,D_out,H_out,W_out) where: + + D_out = (D_{in}+0.5) * scale_{factor} - 0.5 + H_out = (H_{in}+0.5) * scale_{factor} - 0.5 + W_out = (W_{in}+0.5) * scale_{factor} - 0.5 + + + else: + + input : (N,C,D_in,H_in,W_in) + output: (N,C,D_out,H_out,W_out) where: + + D_out = D_{in} * scale_{factor} + H_out = H_{in} * scale_{factor} + W_out = W_{in} * scale_{factor} For details of nearest neighbor interpolation, please refer to Wikipedia: @@ -190,6 +383,9 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker { For details of bilinear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Bilinear_interpolation + + For details of trilinear interpolation, please refer to Wikipedia: + https://en.wikipedia.org/wiki/Trilinear_interpolation )DOC"); } }; @@ -215,6 +411,16 @@ class InterpolateOpGrad : public framework::OperatorWithKernel { ctx.Input(framework::GradVarName("Out"))->type(), ctx.GetPlace()); } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + if (var_name == "SizeTensor" || var_name == "Scale") { + return expected_kernel_type; + } + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } }; class InterpolateGradDescMaker : public framework::SingleGradOpDescMaker { @@ -226,9 +432,15 @@ class InterpolateGradDescMaker : public framework::SingleGradOpDescMaker { std::unique_ptr op(new framework::OpDesc()); op->SetType(ForwardOp().Type() + "_grad"); op->SetInput("X", Input("X")); + if (ForwardOp().Inputs().count("SizeTensor") > 0) { + op->SetInput("SizeTensor", Input("SizeTensor")); + } if (ForwardOp().Inputs().count("OutSize") > 0) { op->SetInput("OutSize", Input("OutSize")); } + if (ForwardOp().Inputs().count("Scale") > 0) { + op->SetInput("Scale", Input("Scale")); + } op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); op->SetOutput(framework::GradVarName("X"), InputGrad("X")); op->SetAttrMap(Attrs()); @@ -251,6 +463,10 @@ REGISTER_OPERATOR(nearest_interp, ops::InterpolateOp, ops::InterpolateOpMaker, ops::InterpolateGradDescMaker); REGISTER_OPERATOR(nearest_interp_grad, ops::InterpolateOpGrad, ops::InterpolateGradNoNeedBufferVarsInference); +REGISTER_OPERATOR(trilinear_interp, ops::InterpolateOp, ops::InterpolateOpMaker, + ops::InterpolateGradDescMaker); +REGISTER_OPERATOR(trilinear_interp_grad, ops::InterpolateOpGrad, + ops::InterpolateGradNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::InterpolateKernel, ops::InterpolateKernel, ops::InterpolateKernel); @@ -261,3 +477,8 @@ REGISTER_OP_CPU_KERNEL(nearest_interp, ops::InterpolateKernel, ops::InterpolateKernel); REGISTER_OP_CPU_KERNEL(nearest_interp_grad, ops::InterpolateGradKernel, ops::InterpolateGradKernel); +REGISTER_OP_CPU_KERNEL(trilinear_interp, ops::InterpolateKernel, + ops::InterpolateKernel, + ops::InterpolateKernel); +REGISTER_OP_CPU_KERNEL(trilinear_interp_grad, ops::InterpolateGradKernel, + ops::InterpolateGradKernel); diff --git a/paddle/fluid/operators/interpolate_op.cu b/paddle/fluid/operators/interpolate_op.cu index 1cdda4cf..6121389c 100644 --- a/paddle/fluid/operators/interpolate_op.cu +++ 
b/paddle/fluid/operators/interpolate_op.cu @@ -17,6 +17,7 @@ namespace paddle { namespace operators { using framework::Tensor; +using DataLayout = framework::DataLayout; template __global__ void KeNearestNeighborInterpFw( @@ -24,7 +25,7 @@ __global__ void KeNearestNeighborInterpFw( const size_t input_h, const size_t input_w, T* out, const size_t out_img_h, const size_t out_img_w, const size_t output_h, const size_t output_w, const size_t num_channels, const float ratio_h, const float ratio_w, - const bool align_corners) { + const bool align_corners, const DataLayout data_layout) { int nthreads = output_h * output_w; int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; @@ -33,20 +34,32 @@ __global__ void KeNearestNeighborInterpFw( int out_id_w = tid % output_w; int in_img_size = input_w / num_channels; int out_img_size = output_w / num_channels; - int channel_id = out_id_w / out_img_size; - int out_img_idy = (out_id_w % out_img_size) / out_img_w; + int channel_id, out_img_idy, out_img_idx; + if (data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idy = (out_id_w % out_img_size) / out_img_w; + out_img_idx = tid % out_img_w; + } else { + out_img_idy = out_id_w / (out_img_w * num_channels); + out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } + int in_img_idy = (align_corners) ? static_cast(ratio_h * out_img_idy + 0.5) : static_cast(ratio_h * out_img_idy); - - int out_img_idx = tid % out_img_w; int in_img_idx = (align_corners) ? static_cast(ratio_w * out_img_idx + 0.5) : static_cast(ratio_w * out_img_idx); - out[tid] = in[out_id_h * input_w + channel_id * in_img_size + - in_img_idy * in_img_w + in_img_idx]; + if (data_layout == DataLayout::kNCHW) { + out[tid] = in[out_id_h * input_w + channel_id * in_img_size + + in_img_idy * in_img_w + in_img_idx]; + } else { + out[tid] = in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id]; + } } } @@ -56,7 +69,7 @@ __global__ void KeNearestNeighborInterpBw( const size_t input_w, const T* out, const size_t out_img_h, const size_t out_img_w, const size_t output_h, const size_t output_w, const size_t num_channels, const float ratio_h, const float ratio_w, - const bool align_corners) { + const bool align_corners, const DataLayout data_layout) { int nthreads = output_h * output_w; int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; @@ -65,20 +78,33 @@ __global__ void KeNearestNeighborInterpBw( int out_id_w = tid % output_w; int in_img_size = input_w / num_channels; int out_img_size = output_w / num_channels; - int channel_id = out_id_w / out_img_size; - int out_img_idy = (out_id_w % out_img_size) / out_img_w; + int channel_id, out_img_idy, out_img_idx; + if (data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idy = (out_id_w % out_img_size) / out_img_w; + out_img_idx = tid % out_img_w; + } else { + out_img_idy = out_id_w / (out_img_w * num_channels); + out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } + int in_img_idy = (align_corners) ? static_cast(ratio_h * out_img_idy + 0.5) : static_cast(ratio_h * out_img_idy); - - int out_img_idx = tid % out_img_w; int in_img_idx = (align_corners) ? 
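// [Editor's sketch] The layout-dependent index math used by the kernels in
// this file, restated as a standalone host helper for clarity (the name
// FlatIndex is illustrative, not part of the Paddle source). For NCHW the
// flat offset of element (n, c, y, x) is ((n * C + c) * H + y) * W + x; for
// NHWC it is ((n * H + y) * W + x) * C + c.
inline int FlatIndex(bool nchw, int n, int c, int y, int x, int C, int H,
                     int W) {
  return nchw ? ((n * C + c) * H + y) * W + x
              : ((n * H + y) * W + x) * C + c;
}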
static_cast(ratio_w * out_img_idx + 0.5) : static_cast(ratio_w * out_img_idx); - T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size + - in_img_idy * in_img_w + in_img_idx]; + T* in_pos; + if (data_layout == DataLayout::kNCHW) { + in_pos = &in[out_id_h * input_w + channel_id * in_img_size + + in_img_idy * in_img_w + in_img_idx]; + } else { + in_pos = &in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id]; + } const T out_pos = out[out_id_h * output_w + out_id_w]; platform::CudaAtomicAdd(in_pos, out_pos); } @@ -90,7 +116,8 @@ __global__ void KeBilinearInterpFw( const size_t input_h, const size_t input_w, T* out, const size_t out_img_h, const size_t out_img_w, const size_t output_h, const size_t output_w, const size_t num_channels, const float ratio_h, const float ratio_w, - const bool align_corners, const int align_mode) { + const bool align_corners, const int align_mode, + const DataLayout data_layout) { int nthreads = output_h * output_w; int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; @@ -100,9 +127,18 @@ __global__ void KeBilinearInterpFw( int out_id_w = tid % output_w; int in_img_size = input_w / num_channels; int out_img_size = output_w / num_channels; - int channel_id = out_id_w / out_img_size; - int out_img_idy = (out_id_w % out_img_size) / out_img_w; + int channel_id, out_img_idy, out_img_idx; + if (data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idy = (out_id_w % out_img_size) / out_img_w; + out_img_idx = tid % out_img_w; + } else { + out_img_idy = out_id_w / (out_img_w * num_channels); + out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } + int in_img_idy = align_flag ? static_cast(ratio_h * (out_img_idy + 0.5) - 0.5) : static_cast(ratio_h * out_img_idy); @@ -114,7 +150,6 @@ __global__ void KeBilinearInterpFw( align_flag ? src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy; T h2lambda = 1.f - h1lambda; - int out_img_idx = tid % out_img_w; int in_img_idx = align_flag ? static_cast(ratio_w * (out_img_idx + 0.5) - 0.5) : static_cast(ratio_w * out_img_idx); @@ -126,14 +161,28 @@ __global__ void KeBilinearInterpFw( align_flag ? 
src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx; T w2lambda = 1.f - w1lambda; - const T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size + - in_img_idy * in_img_w + in_img_idx]; - - // bilinear interpolation - out[out_id_h * output_w + out_id_w] = - h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[w_id]) + - h1lambda * (w2lambda * in_pos[h_id * in_img_w] + - w1lambda * in_pos[h_id * in_img_w + w_id]); + if (data_layout == DataLayout::kNCHW) { + const T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size + + in_img_idy * in_img_w + in_img_idx]; + + // bilinear interpolation + out[out_id_h * output_w + out_id_w] = + h2lambda * (w2lambda * in_pos[0] + w1lambda * in_pos[w_id]) + + h1lambda * (w2lambda * in_pos[h_id * in_img_w] + + w1lambda * in_pos[h_id * in_img_w + w_id]); + } else { + const T* in_pos = + &in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id]; + + // bilinear interpolation + out[out_id_h * output_w + out_id_w] = + h2lambda * + (w2lambda * in_pos[0] + w1lambda * in_pos[w_id * num_channels]) + + h1lambda * (w2lambda * in_pos[h_id * in_img_w * num_channels] + + w1lambda * in_pos[h_id * in_img_w * num_channels + + w_id * num_channels]); + } } } @@ -143,7 +192,8 @@ __global__ void KeBilinearInterpBw( const size_t input_w, const T* out, const size_t out_img_h, const size_t out_img_w, const size_t output_h, const size_t output_w, const size_t num_channels, const T ratio_h, const T ratio_w, - const bool align_corners, const int align_mode) { + const bool align_corners, const int align_mode, + const DataLayout data_layout) { int nthreads = output_h * output_w; int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; @@ -153,9 +203,18 @@ __global__ void KeBilinearInterpBw( int out_id_w = tid % output_w; int in_img_size = input_w / num_channels; int out_img_size = output_w / num_channels; - int channel_id = out_id_w / out_img_size; - int out_img_idy = (out_id_w % out_img_size) / out_img_w; + int channel_id, out_img_idy, out_img_idx; + if (data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idy = (out_id_w % out_img_size) / out_img_w; + out_img_idx = tid % out_img_w; + } else { + out_img_idy = out_id_w / (out_img_w * num_channels); + out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } + int in_img_idy = align_flag ? ratio_h * (out_img_idy + 0.5) - 0.5 : ratio_h * out_img_idy; in_img_idy = (in_img_idy > 0) ? in_img_idy : 0; @@ -164,10 +223,8 @@ __global__ void KeBilinearInterpBw( src_h = (src_h > 0) ? src_h : 0; T h1lambda = align_flag ? src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy; - T h2lambda = 1.f - h1lambda; - int out_img_idx = tid % out_img_w; int in_img_idx = align_flag ? ratio_w * (out_img_idx + 0.5) - 0.5 : ratio_w * out_img_idx; in_img_idx = (in_img_idx > 0) ? in_img_idx : 0; @@ -178,171 +235,707 @@ __global__ void KeBilinearInterpBw( align_flag ? 
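// [Editor's sketch] A CPU reference of the scatter that KeBilinearInterpBw
// performs with CudaAtomicAdd: one output gradient is split over the four
// neighboring input pixels with weights (h2*w2, h2*w1, h1*w2, h1*w1), which
// sum to 1. Hypothetical standalone helper, NCHW only, with din pointing at
// one (n, c) plane of width W.
template <typename T>
void ScatterBilinearGrad(T* din, int W, int y, int x, int h_id, int w_id,
                         T h1lambda, T w1lambda, T gout) {
  T h2lambda = static_cast<T>(1) - h1lambda;
  T w2lambda = static_cast<T>(1) - w1lambda;
  din[y * W + x] += h2lambda * w2lambda * gout;
  din[y * W + x + w_id] += h2lambda * w1lambda * gout;
  din[(y + h_id) * W + x] += h1lambda * w2lambda * gout;
  din[(y + h_id) * W + x + w_id] += h1lambda * w1lambda * gout;
}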
src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx; T w2lambda = 1.f - w1lambda; - T* in_pos = &in[out_id_h * input_w + channel_id * in_img_size + - in_img_idy * in_img_w + in_img_idx]; + T* in_pos; + if (data_layout == DataLayout::kNCHW) { + in_pos = &in[out_id_h * input_w + channel_id * in_img_size + + in_img_idy * in_img_w + in_img_idx]; + } else { + in_pos = &in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id]; + } + const T* out_pos = &out[out_id_h * output_w + out_id_w]; - platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos[w_id], h2lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos[h_id * in_img_w], - h1lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos[h_id * in_img_w + w_id], - h1lambda * w1lambda * out_pos[0]); + + if (data_layout == DataLayout::kNCHW) { + platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos[w_id], h2lambda * w1lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos[h_id * in_img_w], + h1lambda * w2lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos[h_id * in_img_w + w_id], + h1lambda * w1lambda * out_pos[0]); + } else { + platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos[w_id * num_channels], + h2lambda * w1lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos[h_id * in_img_w * num_channels], + h1lambda * w2lambda * out_pos[0]); + platform::CudaAtomicAdd( + &in_pos[h_id * in_img_w * num_channels + w_id * num_channels], + h1lambda * w1lambda * out_pos[0]); + } } } template -class InterpolateOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "This kernel only runs on GPU device."); - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - auto* input_data = input->data(); +__global__ void KeTrilinearInterpFw( + const T* in, const size_t in_img_d, const size_t in_img_h, + const size_t in_img_w, const size_t input_h, const size_t input_w, T* out, + const size_t out_img_d, const size_t out_img_h, const size_t out_img_w, + const size_t output_h, const size_t output_w, const size_t num_channels, + const float ratio_d, const float ratio_h, const float ratio_w, + const bool align_corners, const int align_mode, + const DataLayout data_layout) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + bool align_flag = (align_mode == 0 && !align_corners); + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; - int n = input->dims()[0]; - int c = input->dims()[1]; - int in_h = input->dims()[2]; - int in_w = input->dims()[3]; + int channel_id, out_img_idt, out_img_idy, out_img_idx; + if (data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idt = (out_id_w % out_img_size) / out_img_h / out_img_w; + out_img_idy = ((out_id_w % out_img_size) / out_img_w) % out_img_h; + out_img_idx = tid % out_img_w; + } else { + out_img_idt = out_id_w / (out_img_h * out_img_w * num_channels); + out_img_idy = out_id_w % (out_img_h * out_img_w * num_channels) / + (out_img_w * num_channels); + out_img_idx = out_id_w % (out_img_w * num_channels) / 
num_channels; + channel_id = tid % num_channels; + } - auto interp_method = ctx.Attr("interp_method"); - int out_h = ctx.Attr("out_h"); - int out_w = ctx.Attr("out_w"); + int in_img_idt = align_flag + ? static_cast(ratio_d * (out_img_idt + 0.5) - 0.5) + : static_cast(ratio_d * out_img_idt); + in_img_idt = (in_img_idt > 0) ? in_img_idt : 0; + int d_id = (in_img_idt < in_img_d - 1) ? 1 : 0; + T src_d = ratio_d * (out_img_idt + 0.5) - 0.5; + src_d = (src_d > 0) ? src_d : 0; + T d1lambda = + align_flag ? src_d - in_img_idt : ratio_d * out_img_idt - in_img_idt; + T d2lambda = 1.f - d1lambda; - float scale = ctx.Attr("scale"); - if (scale > 0) { - out_h = in_h * scale; - out_w = in_w * scale; - } + int in_img_idy = align_flag + ? static_cast(ratio_h * (out_img_idy + 0.5) - 0.5) + : static_cast(ratio_h * out_img_idy); + in_img_idy = (in_img_idy > 0) ? in_img_idy : 0; + int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0; + T src_h = ratio_h * (out_img_idy + 0.5) - 0.5; + src_h = (src_h > 0) ? src_h : 0; + T h1lambda = + align_flag ? src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy; + T h2lambda = 1.f - h1lambda; - auto out_size = ctx.Input("OutSize"); - if (out_size != nullptr) { - Tensor sizes; - framework::TensorCopy(*out_size, platform::CPUPlace(), &sizes); - auto size_data = sizes.data(); - out_h = size_data[0]; - out_w = size_data[1]; + int in_img_idx = align_flag + ? static_cast(ratio_w * (out_img_idx + 0.5) - 0.5) + : static_cast(ratio_w * out_img_idx); + in_img_idx = (in_img_idx > 0) ? in_img_idx : 0; + int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; + T src_w = ratio_w * (out_img_idx + 0.5) - 0.5; + src_w = (src_w > 0) ? src_w : 0; + T w1lambda = + align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx; + T w2lambda = 1.f - w1lambda; + + if (data_layout == DataLayout::kNCHW) { + int in_pos1_idx = out_id_h * input_w + channel_id * in_img_size + + (in_img_idt * in_img_h + in_img_idy) * in_img_w + + in_img_idx; + const T* in_pos1 = &in[in_pos1_idx]; + int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w; + const T* in_pos2 = &in[in_pos2_idx]; + + // trilinear interpolation + out[out_id_h * output_w + out_id_w] = + d2lambda * + (h2lambda * (w2lambda * in_pos1[0] + w1lambda * in_pos1[w_id]) + + h1lambda * (w2lambda * in_pos1[h_id * in_img_w] + + w1lambda * in_pos1[h_id * in_img_w + w_id])) + + d1lambda * + (h2lambda * (w2lambda * in_pos2[0] + w1lambda * in_pos2[w_id]) + + h1lambda * (w2lambda * in_pos2[h_id * in_img_w] + + w1lambda * in_pos2[h_id * in_img_w + w_id])); + + } else { + int in_pos1_idx = out_id_h * input_w + + in_img_idt * in_img_h * in_img_w * num_channels + + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id; + const T* in_pos1 = &in[in_pos1_idx]; + int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w * num_channels; + const T* in_pos2 = &in[in_pos2_idx]; + + // trilinear interpolation + out[out_id_h * output_w + out_id_w] = + d2lambda * + (h2lambda * (w2lambda * in_pos1[0] + + w1lambda * in_pos1[w_id * num_channels]) + + h1lambda * (w2lambda * in_pos1[h_id * in_img_w * num_channels] + + w1lambda * in_pos1[h_id * in_img_w * num_channels + + w_id * num_channels])) + + d1lambda * + (h2lambda * (w2lambda * in_pos2[0] + + w1lambda * in_pos2[w_id * num_channels]) + + h1lambda * (w2lambda * in_pos2[h_id * in_img_w * num_channels] + + w1lambda * in_pos2[h_id * in_img_w * num_channels + + w_id * num_channels])); } + } +} - bool align_corners = ctx.Attr("align_corners"); - int align_mode = ctx.Attr("align_mode"); +template 
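// [Editor's sketch] The per-axis source index/weight computation that
// KeTrilinearInterpFw above repeats for D, H and W, factored into one
// standalone helper (illustrative only; the kernel keeps it inlined).
// align_flag corresponds to (align_mode == 0 && !align_corners).
template <typename T>
void SrcIndexAndLambda(T ratio, int dst, int in_size, bool align_flag,
                       int* idx, int* next_offset, T* lambda1) {
  *idx = align_flag ? static_cast<int>(ratio * (dst + 0.5) - 0.5)
                    : static_cast<int>(ratio * dst);
  if (*idx < 0) *idx = 0;
  *next_offset = (*idx < in_size - 1) ? 1 : 0;  // 0 at the far boundary
  T src = ratio * (dst + 0.5) - 0.5;
  if (src < 0) src = 0;
  *lambda1 = align_flag ? src - *idx : ratio * dst - *idx;
}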
+__global__ void KeTrilinearInterpBw( + T* in, const size_t in_img_d, const size_t in_img_h, const size_t in_img_w, + const size_t input_h, const size_t input_w, const T* out, + const size_t out_img_d, const size_t out_img_h, const size_t out_img_w, + const size_t output_h, const size_t output_w, const size_t num_channels, + const T ratio_d, const T ratio_h, const T ratio_w, const bool align_corners, + const int align_mode, const DataLayout data_layout) { + int nthreads = output_h * output_w; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + bool align_flag = (align_mode == 0 && !align_corners); + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / output_w; + int out_id_w = tid % output_w; + int in_img_size = input_w / num_channels; + int out_img_size = output_w / num_channels; - auto* output_data = - output->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); + int channel_id, out_img_idt, out_img_idy, out_img_idx; + if (data_layout == DataLayout::kNCHW) { + channel_id = out_id_w / out_img_size; + out_img_idt = (out_id_w % out_img_size) / out_img_h / out_img_w; + out_img_idy = ((out_id_w % out_img_size) / out_img_w) % out_img_h; + out_img_idx = tid % out_img_w; + } else { + out_img_idt = out_id_w / (out_img_h * out_img_w * num_channels); + out_img_idy = out_id_w % (out_img_h * out_img_w * num_channels) / + (out_img_w * num_channels); + out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; + channel_id = tid % num_channels; + } - int in_hw = in_h * in_w; - int out_hw = out_h * out_w; - int in_chw = c * in_hw; - int out_chw = c * out_hw; + int in_img_idt = align_flag + ? static_cast(ratio_d * (out_img_idt + 0.5) - 0.5) + : static_cast(ratio_d * out_img_idt); + in_img_idt = (in_img_idt > 0) ? in_img_idt : 0; + int d_id = (in_img_idt < in_img_d - 1) ? 1 : 0; + T src_d = ratio_d * (out_img_idt + 0.5) - 0.5; + src_d = (src_d > 0) ? src_d : 0; + T d1lambda = + align_flag ? src_d - in_img_idt : ratio_d * out_img_idt - in_img_idt; + T d2lambda = 1.f - d1lambda; - float ratio_h = 0.f; - float ratio_w = 0.f; - if (out_h > 1) { - ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(in_h) / out_h; - } - if (out_w > 1) { - ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; - } + int in_img_idy = align_flag + ? static_cast(ratio_h * (out_img_idy + 0.5) - 0.5) + : static_cast(ratio_h * out_img_idy); + in_img_idy = (in_img_idy > 0) ? in_img_idy : 0; + int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0; + T src_h = ratio_h * (out_img_idy + 0.5) - 0.5; + src_h = (src_h > 0) ? src_h : 0; + T h1lambda = + align_flag ? src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy; + T h2lambda = 1.f - h1lambda; - if (in_h == out_h && in_w == out_w) { - framework::TensorCopy(*input, ctx.GetPlace(), output); - return; - } + int in_img_idx = align_flag + ? static_cast(ratio_w * (out_img_idx + 0.5) - 0.5) + : static_cast(ratio_w * out_img_idx); + in_img_idx = (in_img_idx > 0) ? in_img_idx : 0; + int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; + T src_w = ratio_w * (out_img_idx + 0.5) - 0.5; + src_w = (src_w > 0) ? src_w : 0; + T w1lambda = + align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx; + T w2lambda = 1.f - w1lambda; - int pixelNum = n * out_chw; - int grid_dim = (pixelNum + 512 - 1) / 512; - grid_dim = grid_dim > 8 ? 
8 : grid_dim; - - if ("nearest" == interp_method) { - KeNearestNeighborInterpFw< - T><<>>( - input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, - out_chw, c, ratio_h, ratio_w, align_corners); - } else if ("bilinear" == interp_method) { - KeBilinearInterpFw< - T><<>>( - input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, - out_chw, c, ratio_h, ratio_w, align_corners, align_mode); + if (data_layout == DataLayout::kNCHW) { + int in_pos1_idx = out_id_h * input_w + channel_id * in_img_size + + (in_img_idt * in_img_h + in_img_idy) * in_img_w + + in_img_idx; + T* in_pos1 = &in[in_pos1_idx]; + int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w; + T* in_pos2 = &in[in_pos2_idx]; + + const T* out_pos = &out[out_id_h * output_w + out_id_w]; + + // trilinear interpolation grad + platform::CudaAtomicAdd(&in_pos1[0], + d2lambda * h2lambda * w2lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos1[w_id], + d2lambda * h2lambda * w1lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w], + d2lambda * h1lambda * w2lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w + w_id], + d2lambda * h1lambda * w1lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos2[0], + d1lambda * h2lambda * w2lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos2[w_id], + d1lambda * h2lambda * w1lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w], + d1lambda * h1lambda * w2lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w + w_id], + d1lambda * h1lambda * w1lambda * out_pos[0]); + } else { + int in_pos1_idx = out_id_h * input_w + + in_img_idt * in_img_h * in_img_w * num_channels + + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id; + T* in_pos1 = &in[in_pos1_idx]; + int in_pos2_idx = in_pos1_idx + d_id * in_img_h * in_img_w * num_channels; + T* in_pos2 = &in[in_pos2_idx]; + + const T* out_pos = &out[out_id_h * output_w + out_id_w]; + + // trilinear interpolation grad + platform::CudaAtomicAdd(&in_pos1[0], + d2lambda * h2lambda * w2lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos1[w_id * num_channels], + d2lambda * h2lambda * w1lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w * num_channels], + d2lambda * h1lambda * w2lambda * out_pos[0]); + platform::CudaAtomicAdd( + &in_pos1[h_id * in_img_w * num_channels + w_id * num_channels], + d2lambda * h1lambda * w1lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos2[0], + d1lambda * h2lambda * w2lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos2[w_id * num_channels], + d1lambda * h2lambda * w1lambda * out_pos[0]); + platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w * num_channels], + d1lambda * h1lambda * w2lambda * out_pos[0]); + platform::CudaAtomicAdd( + &in_pos2[h_id * in_img_w * num_channels + w_id * num_channels], + d1lambda * h1lambda * w1lambda * out_pos[0]); } } -}; +} template -class InterpolateGradOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input_grad = ctx.Output(framework::GradVarName("X")); - auto* output_grad = ctx.Input(framework::GradVarName("Out")); - auto* output_grad_data = output_grad->data(); - auto* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); - - auto& device_ctx = - ctx.template device_context(); - math::SetConstant zero; - zero(device_ctx, input_grad, static_cast(0.0)); - - int n = input_grad->dims()[0]; - int c = input_grad->dims()[1]; - int in_h = input_grad->dims()[2]; - 
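// [Editor's sketch] The input/output ratio computation shared by every
// forward and backward path in this file: with align_corners the endpoints
// of the two grids coincide, otherwise the ratio is a plain size quotient.
// Standalone restatement; the name InterpRatio is illustrative.
inline float InterpRatio(int in_size, int out_size, bool align_corners) {
  if (out_size <= 1) return 0.f;
  return align_corners ? static_cast<float>(in_size - 1) / (out_size - 1)
                       : static_cast<float>(in_size) / out_size;
}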
int in_w = input_grad->dims()[3]; - - auto interp_method = ctx.Attr("interp_method"); - int out_h = ctx.Attr("out_h"); - int out_w = ctx.Attr("out_w"); - float scale = ctx.Attr("scale"); +static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, + const Tensor& input, Tensor* output) { + auto* input_data = input.data(); + + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + auto interp_method = ctx.Attr("interp_method"); + bool align_corners = ctx.Attr("align_corners"); + int align_mode = ctx.Attr("align_mode"); + + int out_h = ctx.Attr("out_h"); + int out_w = ctx.Attr("out_w"); + + auto list_new_shape_tensor = ctx.MultiInput("SizeTensor"); + if (list_new_shape_tensor.size() > 0) { + // have size tensor + auto new_size = get_new_shape(list_new_shape_tensor); + out_h = new_size[0]; + out_w = new_size[1]; + } else { + float scale; + auto scale_tensor = ctx.Input("Scale"); + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + scale = scale_data[0]; + } else { + scale = ctx.Attr("scale"); + } if (scale > 0) { - out_h = in_h * scale; - out_w = in_w * scale; + out_h = static_cast(in_h * scale); + out_w = static_cast(in_w * scale); } auto out_size = ctx.Input("OutSize"); if (out_size != nullptr) { Tensor sizes; - framework::TensorCopy(*out_size, platform::CPUPlace(), &sizes); + framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); auto size_data = sizes.data(); out_h = size_data[0]; out_w = size_data[1]; } + } + PADDLE_ENFORCE_GT( + out_h, 0, + "out_h in Attr(out_shape) of Op(interpolate) should be greater than 0."); + PADDLE_ENFORCE_GT( + out_w, 0, + "out_w in Attr(out_shape) of Op(interpolate) should be greater than 0."); + + framework::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {n, c, out_h, out_w}; + } else { + dim_out = {n, out_h, out_w, c}; + } + auto output_data = output->mutable_data(dim_out, ctx.GetPlace()); - bool align_corners = ctx.Attr("align_corners"); - int align_mode = ctx.Attr("align_mode"); + if (in_h == out_h && in_w == out_w) { + framework::TensorCopy(input, ctx.GetPlace(), output); + return; + } + + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_h > 1) { + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(in_h) / out_h; + } + if (out_w > 1) { + ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) + : static_cast(in_w) / out_w; + } - int in_hw = in_h * in_w; - int out_hw = out_h * out_w; - int in_chw = c * in_hw; - int out_chw = c * out_hw; + int in_hw = in_h * in_w; + int out_hw = out_h * out_w; + int in_chw = c * in_hw; + int out_chw = c * out_hw; + + int pixelNum = n * out_chw; + int grid_dim = (pixelNum + 512 - 1) / 512; + grid_dim = grid_dim > 8 ? 8 : grid_dim; + + if ("nearest" == interp_method) { + KeNearestNeighborInterpFw< + T><<>>( + input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, + out_chw, c, ratio_h, ratio_w, align_corners, data_layout); + } else if ("bilinear" == interp_method) { + KeBilinearInterpFw< + T><<>>( + input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, + out_chw, c, ratio_h, ratio_w, align_corners, align_mode, data_layout); + } +} - float ratio_h = 0.f; - float ratio_w = 0.f; - if (out_h > 1) { - ratio_h = (align_corners) ? 
static_cast(in_h - 1) / (out_h - 1) - : static_cast(in_h) / out_h; +template +static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, + const Tensor& input, Tensor* output) { + auto* input_data = input.data(); + + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + auto interp_method = ctx.Attr("interp_method"); + bool align_corners = ctx.Attr("align_corners"); + int align_mode = ctx.Attr("align_mode"); + + int out_d = ctx.Attr("out_d"); + int out_h = ctx.Attr("out_h"); + int out_w = ctx.Attr("out_w"); + + auto list_new_shape_tensor = ctx.MultiInput("SizeTensor"); + if (list_new_shape_tensor.size() > 0) { + // have size tensor + auto new_size = get_new_shape(list_new_shape_tensor); + out_d = new_size[0]; + out_h = new_size[1]; + out_w = new_size[2]; + } else { + float scale; + auto scale_tensor = ctx.Input("Scale"); + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + scale = scale_data[0]; + } else { + scale = ctx.Attr("scale"); } - if (out_w > 1) { - ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; + if (scale > 0) { + out_d = static_cast(in_d * scale); + out_h = static_cast(in_h * scale); + out_w = static_cast(in_w * scale); } + auto out_size = ctx.Input("OutSize"); + if (out_size != nullptr) { + Tensor sizes; + framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); + auto size_data = sizes.data(); + out_d = size_data[0]; + out_h = size_data[1]; + out_w = size_data[2]; + } + } + PADDLE_ENFORCE_GT( + out_d, 0, + "out_d in Attr(out_shape) of Op(interpolate) should be greater than 0."); + PADDLE_ENFORCE_GT( + out_h, 0, + "out_h in Attr(out_shape) of Op(interpolate) should be greater than 0."); + PADDLE_ENFORCE_GT( + out_w, 0, + "out_w in Attr(out_shape) of Op(interpolate) should be greater than 0."); + + framework::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {n, c, out_d, out_h, out_w}; + } else { + dim_out = {n, out_d, out_h, out_w, c}; + } + auto output_data = output->mutable_data(dim_out, ctx.GetPlace()); + + if (in_d == out_d && in_h == out_h && in_w == out_w) { + framework::TensorCopy(input, ctx.GetPlace(), output); + return; + } + + float ratio_d = 0.f; + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_d > 1) { + ratio_d = (align_corners) ? static_cast(in_d - 1) / (out_d - 1) + : static_cast(in_d) / out_d; + } + if (out_h > 1) { + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(in_h) / out_h; + } + if (out_w > 1) { + ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) + : static_cast(in_w) / out_w; + } - if (in_h == out_h && in_w == out_w) { - framework::TensorCopy(*output_grad, ctx.GetPlace(), input_grad); - return; + int in_dhw = in_d * in_h * in_w; + int out_dhw = out_d * out_h * out_w; + int in_cdhw = c * in_dhw; + int out_cdhw = c * out_dhw; + + int pixelNum = n * out_cdhw; + int grid_dim = (pixelNum + 512 - 1) / 512; + grid_dim = grid_dim > 8 ? 
8 : grid_dim; + + if ("trilinear" == interp_method) { + KeTrilinearInterpFw< + T><<>>( + input_data, in_d, in_h, in_w, n, in_cdhw, output_data, out_d, out_h, + out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners, + align_mode, data_layout); + } +} + +template +static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, + Tensor* input_grad, const Tensor output_grad) { + auto* input = ctx.Input("X"); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + auto interp_method = ctx.Attr("interp_method"); + bool align_corners = ctx.Attr("align_corners"); + int align_mode = ctx.Attr("align_mode"); + + int out_h = ctx.Attr("out_h"); + int out_w = ctx.Attr("out_w"); + float scale; + auto scale_tensor = ctx.Input("Scale"); + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + scale = scale_data[0]; + } else { + scale = ctx.Attr("scale"); + } + if (scale > 0) { + out_h = static_cast(in_h * scale); + out_w = static_cast(in_w * scale); + } + + auto out_size = ctx.Input("OutSize"); + if (out_size != nullptr) { + Tensor sizes; + framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); + auto size_data = sizes.data(); + out_h = size_data[0]; + out_w = size_data[1]; + } + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + if (list_new_size_tensor.size() > 0) { + // have size tensor + auto new_size = get_new_shape(list_new_size_tensor); + out_h = new_size[0]; + out_w = new_size[1]; + } + + auto* output_grad_data = output_grad.data(); + framework::DDim dim_grad; + if (data_layout == DataLayout::kNCHW) { + dim_grad = {n, c, in_h, in_w}; + } else { + dim_grad = {n, in_h, in_w, c}; + } + input_grad->mutable_data(dim_grad, ctx.GetPlace()); + auto* input_grad_data = input_grad->mutable_data(dim_grad, ctx.GetPlace()); + auto& device_ctx = ctx.template device_context(); + math::SetConstant zero; + zero(device_ctx, input_grad, static_cast(0.0)); + + if (in_h == out_h && in_w == out_w) { + framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad); + return; + } + + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_h > 1) { + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(in_h) / out_h; + } + if (out_w > 1) { + ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) + : static_cast(in_w) / out_w; + } + + int in_hw = in_h * in_w; + int out_hw = out_h * out_w; + int in_chw = c * in_hw; + int out_chw = c * out_hw; + + int pixelNum = n * out_chw; + int grid_dim = (pixelNum + 512 - 1) / 512; + grid_dim = grid_dim > 8 ? 
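// [Editor's note] The launch configuration used by all kernels in this file
// caps the grid at 8 blocks of 512 threads and relies on the grid-stride
// loop inside each kernel to cover all n * out_chw (or n * out_cdhw)
// elements. A standalone sketch of the clamp:
inline int InterpGridDim(int pixel_num) {
  int grid_dim = (pixel_num + 512 - 1) / 512;  // ceil(pixel_num / 512)
  return grid_dim > 8 ? 8 : grid_dim;  // clamp; the stride loop does the rest
}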
8 : grid_dim; + + if ("nearest" == interp_method) { + KeNearestNeighborInterpBw< + T><<>>( + input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w, + n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout); + } else if ("bilinear" == interp_method) { + KeBilinearInterpBw< + T><<>>( + input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w, + n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode, + data_layout); + } +} + +template +static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx, + Tensor* input_grad, + const Tensor& output_grad) { + auto* input = ctx.Input("X"); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + auto interp_method = ctx.Attr("interp_method"); + bool align_corners = ctx.Attr("align_corners"); + int align_mode = ctx.Attr("align_mode"); + + int out_d = ctx.Attr("out_d"); + int out_h = ctx.Attr("out_h"); + int out_w = ctx.Attr("out_w"); + float scale; + auto scale_tensor = ctx.Input("Scale"); + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + scale = scale_data[0]; + } else { + scale = ctx.Attr("scale"); + } + if (scale > 0) { + out_d = static_cast(in_d * scale); + out_h = static_cast(in_h * scale); + out_w = static_cast(in_w * scale); + } + + auto out_size = ctx.Input("OutSize"); + if (out_size != nullptr) { + Tensor sizes; + framework::TensorCopySync(*out_size, platform::CPUPlace(), &sizes); + auto size_data = sizes.data(); + out_d = size_data[0]; + out_h = size_data[1]; + out_w = size_data[2]; + } + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + if (list_new_size_tensor.size() > 0) { + // have size tensor + auto new_size = get_new_shape(list_new_size_tensor); + out_d = new_size[0]; + out_h = new_size[1]; + out_w = new_size[2]; + } + + auto* output_grad_data = output_grad.data(); + framework::DDim dim_grad; + if (data_layout == DataLayout::kNCHW) { + dim_grad = {n, c, in_d, in_h, in_w}; + } else { + dim_grad = {n, in_d, in_h, in_w, c}; + } + auto* input_grad_data = input_grad->mutable_data(dim_grad, ctx.GetPlace()); + auto& device_ctx = ctx.template device_context(); + math::SetConstant zero; + zero(device_ctx, input_grad, static_cast(0.0)); + + if (in_d == out_d && in_h == out_h && in_w == out_w) { + framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad); + return; + } + + float ratio_d = 0.f; + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_d > 1) { + ratio_d = (align_corners) ? static_cast(in_d - 1) / (out_d - 1) + : static_cast(in_d) / out_d; + } + if (out_h > 1) { + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(in_h) / out_h; + } + if (out_w > 1) { + ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) + : static_cast(in_w) / out_w; + } + + int in_dhw = in_d * in_h * in_w; + int out_dhw = out_d * out_h * out_w; + int in_cdhw = c * in_dhw; + int out_cdhw = c * out_dhw; + + int pixelNum = n * out_cdhw; + int grid_dim = (pixelNum + 512 - 1) / 512; + grid_dim = grid_dim > 8 ? 
8 : grid_dim; + + if ("trilinear" == interp_method) { + KeTrilinearInterpBw< + T><<>>( + input_grad_data, in_d, in_h, in_w, n, in_cdhw, output_grad_data, out_d, + out_h, out_w, n, out_cdhw, c, ratio_d, ratio_h, ratio_w, align_corners, + align_mode, data_layout); + } +} + +template +class InterpolateOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "This kernel only runs on GPU device."); + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + + auto input_dims = input->dims(); + if (input_dims.size() == 4) { // 2D interpolation + Interpolate2DCUDAFwd(ctx, *input, output); + } else if (input_dims.size() == 5) { // 3D interpolation + Interpolate3DCUDAFwd(ctx, *input, output); } + } +}; + +template +class InterpolateGradOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "This kernel only runs on GPU device."); + auto* input_grad = ctx.Output(framework::GradVarName("X")); + auto* output_grad = ctx.Input(framework::GradVarName("Out")); - int pixelNum = n * out_chw; - int grid_dim = (pixelNum + 512 - 1) / 512; - grid_dim = grid_dim > 8 ? 8 : grid_dim; - - if ("nearest" == interp_method) { - KeNearestNeighborInterpBw< - T><<>>( - input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, - out_w, n, out_chw, c, ratio_h, ratio_w, align_corners); - } else if ("bilinear" == interp_method) { - KeBilinearInterpBw< - T><<>>( - input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, - out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode); + auto output_grad_dims = output_grad->dims(); + if (output_grad_dims.size() == 4) { // 2D interpolation + Interpolate2DCUDABwd(ctx, input_grad, *output_grad); + } else if (output_grad_dims.size() == 5) { // 3D interpolation + Interpolate3DCUDABwd(ctx, input_grad, *output_grad); } } }; @@ -363,3 +956,9 @@ REGISTER_OP_CUDA_KERNEL(nearest_interp, ops::InterpolateOpCUDAKernel, REGISTER_OP_CUDA_KERNEL(nearest_interp_grad, ops::InterpolateGradOpCUDAKernel, ops::InterpolateGradOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(trilinear_interp, ops::InterpolateOpCUDAKernel, + ops::InterpolateOpCUDAKernel, + ops::InterpolateOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(trilinear_interp_grad, + ops::InterpolateGradOpCUDAKernel, + ops::InterpolateGradOpCUDAKernel); diff --git a/paddle/fluid/operators/interpolate_op.h b/paddle/fluid/operators/interpolate_op.h index bd33abb9..b107d1e6 100644 --- a/paddle/fluid/operators/interpolate_op.h +++ b/paddle/fluid/operators/interpolate_op.h @@ -22,13 +22,66 @@ template using EigenTensor = framework::EigenTensor; using Tensor = framework::Tensor; +using DataLayout = framework::DataLayout; + +inline std::vector get_new_shape( + const std::vector& list_new_shape_tensor) { + // get tensor from + std::vector vec_new_shape; + for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { + auto tensor = list_new_shape_tensor[i]; + PADDLE_ENFORCE_EQ(tensor->dims(), framework::make_ddim({1}), + "shape of dim tensor should be [1]"); + if (platform::is_gpu_place(tensor->place())) { + framework::Tensor temp; + TensorCopySync(*tensor, platform::CPUPlace(), &temp); + + vec_new_shape.push_back(static_cast(*temp.data())); + } else { + vec_new_shape.push_back(static_cast(*tensor->data())); + } + } + + return vec_new_shape; +} + +template +inline std::vector 
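// [Editor's sketch] A standalone model of the ExtractNCDWH helper defined
// just below: normalize a 4-D or 5-D shape in either layout to a single
// (N, C, D, H, W) view, with D = 1 for 4-D input. A plain std::vector
// stands in for framework::DDim here.
#include <vector>
inline void ExtractNCDWHModel(const std::vector<int>& dims, bool nchw, int* N,
                              int* C, int* D, int* H, int* W) {
  *N = dims[0];
  if (dims.size() == 4) {  // NCHW or NHWC
    *C = nchw ? dims[1] : dims[3];
    *D = 1;
    *H = nchw ? dims[2] : dims[1];
    *W = nchw ? dims[3] : dims[2];
  } else {  // 5-D: NCDHW or NDHWC
    *C = nchw ? dims[1] : dims[4];
    *D = nchw ? dims[2] : dims[1];
    *H = nchw ? dims[3] : dims[2];
    *W = nchw ? dims[4] : dims[3];
  }
}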
get_new_data_from_tensor(const Tensor* new_data_tensor) { + std::vector vec_new_data; + auto* new_data = new_data_tensor->data(); + framework::Tensor cpu_starts_tensor; + if (platform::is_gpu_place(new_data_tensor->place())) { + TensorCopySync(*new_data_tensor, platform::CPUPlace(), &cpu_starts_tensor); + new_data = cpu_starts_tensor.data(); + } + vec_new_data = std::vector(new_data, new_data + new_data_tensor->numel()); + return vec_new_data; +} + +inline void ExtractNCDWH(const framework::DDim& dims, + const DataLayout& data_layout, int* N, int* C, int* D, + int* H, int* W) { + *N = dims[0]; + if (dims.size() == 4) { + *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[3]; + *D = 1; + *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; + *W = data_layout == DataLayout::kNCHW ? dims[3] : dims[2]; + } else { + *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[4]; + *D = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; + *H = data_layout == DataLayout::kNCHW ? dims[3] : dims[2]; + *W = data_layout == DataLayout::kNCHW ? dims[4] : dims[3]; + } +} template static void NearestNeighborInterpolate(const Tensor& input, Tensor* output, const float ratio_h, const float ratio_w, const int n, const int c, const int out_h, const int out_w, - const bool align_corners) { + const bool align_corners, + const DataLayout& data_layout) { auto input_t = EigenTensor::From(input); auto output_t = EigenTensor::From(*output); for (int k = 0; k < out_h; k++) { // loop for images @@ -41,7 +94,11 @@ static void NearestNeighborInterpolate(const Tensor& input, Tensor* output, for (int i = 0; i < n; i++) { // loop for batches for (int j = 0; j < c; j++) { // loop for channels - output_t(i, j, k, l) = input_t(i, j, in_k, in_l); + if (data_layout == DataLayout::kNCHW) { + output_t(i, j, k, l) = input_t(i, j, in_k, in_l); + } else { + output_t(i, k, l, j) = input_t(i, in_k, in_l, j); + } } } } @@ -54,7 +111,8 @@ static void BilinearInterpolation(const Tensor& input, Tensor* output, const int in_h, const int in_w, const int n, const int c, const int out_h, const int out_w, const bool align_corners, - const bool align_mode) { + const bool align_mode, + const DataLayout data_layout) { auto input_t = EigenTensor::From(input); auto output_t = EigenTensor::From(*output); bool align_flag = (align_mode == 0 && !align_corners); @@ -120,11 +178,164 @@ static void BilinearInterpolation(const Tensor& input, Tensor* output, for (int k = 0; k < out_h; k++) { // loop for images for (int l = 0; l < out_w; l++) { // bilinear interpolation - T out_t = input_t(i, j, vy_n[k], vx_w[l]) * vd_s[k] * vd_e[l] + + T out_t; + if (data_layout == DataLayout::kNCHW) { + out_t = input_t(i, j, vy_n[k], vx_w[l]) * vd_s[k] * vd_e[l] + input_t(i, j, vy_s[k], vx_w[l]) * vd_n[k] * vd_e[l] + input_t(i, j, vy_n[k], vx_e[l]) * vd_s[k] * vd_w[l] + input_t(i, j, vy_s[k], vx_e[l]) * vd_n[k] * vd_w[l]; - output_t(i, j, k, l) = out_t; + output_t(i, j, k, l) = out_t; + + } else { + out_t = input_t(i, vy_n[k], vx_w[l], j) * vd_s[k] * vd_e[l] + + input_t(i, vy_s[k], vx_w[l], j) * vd_n[k] * vd_e[l] + + input_t(i, vy_n[k], vx_e[l], j) * vd_s[k] * vd_w[l] + + input_t(i, vy_s[k], vx_e[l], j) * vd_n[k] * vd_w[l]; + output_t(i, k, l, j) = out_t; + } + } + } + } + } +} + +template +static void TrilinearInterpolation( + const Tensor& input, Tensor* output, const float ratio_d, + const float ratio_h, const float ratio_w, const int in_d, const int in_h, + const int in_w, const int n, const int c, const int out_d, const int out_h, + const int out_w, const 
bool align_corners, const bool align_mode, + const DataLayout& data_layout) { + auto input_t = EigenTensor::From(input); + auto output_t = EigenTensor::From(*output); + bool align_flag = (align_mode == 0 && !align_corners); + + std::vector vt_f, vt_b; + std::vector vd_f, vd_b; + vt_f.reserve(out_d); + vt_b.reserve(out_d); + vd_f.reserve(out_d); + vd_b.reserve(out_d); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int j = 0; j < out_d; j++) { + int t_f = align_flag ? static_cast(ratio_d * (j + 0.5) - 0.5) + : static_cast(ratio_d * j); + t_f = (t_f > 0) ? t_f : 0; + int t_b = (t_f + 1) < (in_d - 1) ? (t_f + 1) : (in_d - 1); + float idx_src_t = ratio_d * (j + 0.5) - 0.5; + idx_src_t = (idx_src_t > 0) ? idx_src_t : 0; + float d_f = align_flag ? idx_src_t - t_f : ratio_d * j - t_f; + float d_b = 1.f - d_f; + { + vt_f[j] = t_f; + vt_b[j] = t_b; + vd_f[j] = d_f; + vd_b[j] = d_b; + } + } + + std::vector vy_n, vy_s; + std::vector vd_n, vd_s; + vy_n.reserve(out_h); + vy_s.reserve(out_h); + vd_n.reserve(out_h); + vd_s.reserve(out_h); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int k = 0; k < out_h; k++) { + int y_n = align_flag ? static_cast(ratio_h * (k + 0.5) - 0.5) + : static_cast(ratio_h * k); + y_n = (y_n > 0) ? y_n : 0; + int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1); + float idx_src_y = ratio_h * (k + 0.5) - 0.5; + idx_src_y = (idx_src_y > 0) ? idx_src_y : 0; + float d_n = align_flag ? idx_src_y - y_n : ratio_h * k - y_n; + float d_s = 1.f - d_n; + { + vy_n[k] = y_n; + vy_s[k] = y_s; + vd_n[k] = d_n; + vd_s[k] = d_s; + } + } + + std::vector vx_w, vx_e; + std::vector vd_w, vd_e; + vx_w.reserve(out_w); + vx_e.reserve(out_w); + vd_w.reserve(out_w); + vd_e.reserve(out_w); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int l = 0; l < out_w; l++) { + int x_w = (align_mode == 0 && !align_corners) + ? static_cast(ratio_w * (l + 0.5) - 0.5) + : static_cast(ratio_w * l); + x_w = (x_w > 0) ? x_w : 0; + int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1); + float idx_src_x = ratio_w * (l + 0.5) - 0.5; + idx_src_x = (idx_src_x > 0) ? idx_src_x : 0; + float d_w = align_flag ? 
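// [Editor's sketch] TrilinearInterpolation above precomputes one index/weight
// table per axis (vt_*, vy_*, vx_*) so the 5-level pixel loop does no
// repeated coordinate math and can be collapsed with OpenMP. A minimal
// standalone version of that per-axis precomputation (using resize instead
// of reserve so the indexed writes are well defined):
#include <vector>
void PrecomputeAxis(int out_size, float ratio, bool align_flag,
                    std::vector<int>* lo, std::vector<float>* lambda1) {
  lo->resize(out_size);
  lambda1->resize(out_size);
  for (int i = 0; i < out_size; ++i) {
    int idx = align_flag ? static_cast<int>(ratio * (i + 0.5f) - 0.5f)
                         : static_cast<int>(ratio * i);
    (*lo)[i] = idx > 0 ? idx : 0;
    float src = ratio * (i + 0.5f) - 0.5f;
    src = src > 0 ? src : 0;
    (*lambda1)[i] = align_flag ? src - (*lo)[i] : ratio * i - (*lo)[i];
  }
}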
idx_src_x - x_w : ratio_w * l - x_w; + float d_e = 1.f - d_w; + { + vx_w[l] = x_w; + vx_e[l] = x_e; + vd_w[l] = d_w; + vd_e[l] = d_e; + } + } + +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for collapse(5) +#endif + for (int b = 0; b < n; b++) { // loop for batches + for (int i = 0; i < c; i++) { // loop for channels + for (int j = 0; j < out_d; j++) { // loop for D, H, W + for (int k = 0; k < out_h; k++) { + for (int l = 0; l < out_w; l++) { + // trilinear interpolation + if (data_layout == DataLayout::kNCHW) { + T out_t = input_t(b, i, vt_f[j], vy_n[k], vx_w[l]) * vd_b[j] * + vd_s[k] * vd_e[l] + + input_t(b, i, vt_f[j], vy_n[k], vx_e[l]) * vd_b[j] * + vd_s[k] * vd_w[l] + + input_t(b, i, vt_f[j], vy_s[k], vx_w[l]) * vd_b[j] * + vd_n[k] * vd_e[l] + + input_t(b, i, vt_f[j], vy_s[k], vx_e[l]) * vd_b[j] * + vd_n[k] * vd_w[l] + + input_t(b, i, vt_b[j], vy_n[k], vx_w[l]) * vd_f[j] * + vd_s[k] * vd_e[l] + + input_t(b, i, vt_b[j], vy_n[k], vx_e[l]) * vd_f[j] * + vd_s[k] * vd_w[l] + + input_t(b, i, vt_b[j], vy_s[k], vx_w[l]) * vd_f[j] * + vd_n[k] * vd_e[l] + + input_t(b, i, vt_b[j], vy_s[k], vx_e[l]) * vd_f[j] * + vd_n[k] * vd_w[l]; + output_t(b, i, j, k, l) = out_t; + } else { + T out_t = input_t(b, vt_f[j], vy_n[k], vx_w[l], i) * vd_b[j] * + vd_s[k] * vd_e[l] + + input_t(b, vt_f[j], vy_n[k], vx_e[l], i) * vd_b[j] * + vd_s[k] * vd_w[l] + + input_t(b, vt_f[j], vy_s[k], vx_w[l], i) * vd_b[j] * + vd_n[k] * vd_e[l] + + input_t(b, vt_f[j], vy_s[k], vx_e[l], i) * vd_b[j] * + vd_n[k] * vd_w[l] + + input_t(b, vt_b[j], vy_n[k], vx_w[l], i) * vd_f[j] * + vd_s[k] * vd_e[l] + + input_t(b, vt_b[j], vy_n[k], vx_e[l], i) * vd_f[j] * + vd_s[k] * vd_w[l] + + input_t(b, vt_b[j], vy_s[k], vx_w[l], i) * vd_f[j] * + vd_n[k] * vd_e[l] + + input_t(b, vt_b[j], vy_s[k], vx_e[l], i) * vd_f[j] * + vd_n[k] * vd_w[l]; + output_t(b, j, k, l, i) = out_t; + } + } } } } @@ -135,7 +346,7 @@ template static void NearestNeighborInterpolateGrad( const Tensor& output_grad, Tensor* input_grad, const float ratio_h, const float ratio_w, const int n, const int c, const int out_h, - const int out_w, const bool align_corners) { + const int out_w, const bool align_corners, const DataLayout data_layout) { auto input_grad_t = EigenTensor::From(*input_grad); auto output_grad_t = EigenTensor::From(output_grad); @@ -149,7 +360,11 @@ static void NearestNeighborInterpolateGrad( for (int i = 0; i < n; i++) { // loop for batches for (int j = 0; j < c; j++) { // loop for channels - input_grad_t(i, j, in_k, in_l) += output_grad_t(i, j, k, l); + if (data_layout == DataLayout::kNCHW) { + input_grad_t(i, j, in_k, in_l) += output_grad_t(i, j, k, l); + } else { + input_grad_t(i, in_k, in_l, j) += output_grad_t(i, k, l, j); + } } } } @@ -157,13 +372,11 @@ static void NearestNeighborInterpolateGrad( } template -static void BilinearInterpolationGrad(const Tensor& output_grad, - Tensor* input_grad, const float ratio_h, - const float ratio_w, const int in_h, - const int in_w, const int n, const int c, - const int out_h, const int out_w, - const bool align_corners, - const int align_mode) { +static void BilinearInterpolationGrad( + const Tensor& output_grad, Tensor* input_grad, const float ratio_h, + const float ratio_w, const int in_h, const int in_w, const int n, + const int c, const int out_h, const int out_w, const bool align_corners, + const int align_mode, const DataLayout data_layout) { auto input_grad_t = EigenTensor::From(*input_grad); auto output_grad_t = EigenTensor::From(output_grad); bool align_flag = (align_mode == 0 && !align_corners); @@ 
-190,144 +403,477 @@ static void BilinearInterpolationGrad(const Tensor& output_grad, for (int i = 0; i < n; i++) { // loop for batches for (int j = 0; j < c; j++) { // loop for channels // bilinear interpolation grad - const T grad = output_grad_t(i, j, k, l); - input_grad_t(i, j, y_n, x_w) += static_cast(grad * d_s * d_e); - input_grad_t(i, j, y_s, x_w) += static_cast(grad * d_n * d_e); - input_grad_t(i, j, y_n, x_e) += static_cast(grad * d_s * d_w); - input_grad_t(i, j, y_s, x_e) += static_cast(grad * d_n * d_w); + if (data_layout == DataLayout::kNCHW) { + const T grad = output_grad_t(i, j, k, l); + input_grad_t(i, j, y_n, x_w) += static_cast(grad * d_s * d_e); + input_grad_t(i, j, y_s, x_w) += static_cast(grad * d_n * d_e); + input_grad_t(i, j, y_n, x_e) += static_cast(grad * d_s * d_w); + input_grad_t(i, j, y_s, x_e) += static_cast(grad * d_n * d_w); + } else { + const T grad = output_grad_t(i, k, l, j); + input_grad_t(i, y_n, x_w, j) += static_cast(grad * d_s * d_e); + input_grad_t(i, y_s, x_w, j) += static_cast(grad * d_n * d_e); + input_grad_t(i, y_n, x_e, j) += static_cast(grad * d_s * d_w); + input_grad_t(i, y_s, x_e, j) += static_cast(grad * d_n * d_w); + } } } } } } -template -class InterpolateKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - - const int n = input->dims()[0]; - const int c = input->dims()[1]; - const int in_h = input->dims()[2]; - const int in_w = input->dims()[3]; - std::string interp_method = ctx.Attr("interp_method"); - int out_h = ctx.Attr("out_h"); - int out_w = ctx.Attr("out_w"); +template +static void TrilinearInterpolationGrad( + const Tensor& output_grad, Tensor* input_grad, const float ratio_d, + const float ratio_h, const float ratio_w, const int in_d, const int in_h, + const int in_w, const int n, const int c, const int out_d, const int out_h, + const int out_w, const bool align_corners, const int align_mode, + const DataLayout data_layout) { + auto input_grad_t = EigenTensor::From(*input_grad); + auto output_grad_t = EigenTensor::From(output_grad); + bool align_flag = (align_mode == 0 && !align_corners); + for (int j = 0; j < out_d; j++) { // loop for D + int t_f = align_flag ? static_cast(ratio_d * (j + 0.5) - 0.5) + : static_cast(ratio_d * j); + t_f = (t_f > 0) ? t_f : 0; + int t_b = (t_f + 1) < (in_d - 1) ? (t_f + 1) : (in_d - 1); + float idx_src_t = ratio_d * (j + 0.5) - 0.5; + idx_src_t = (idx_src_t > 0) ? idx_src_t : 0; + float d_f = align_flag ? idx_src_t - t_f : ratio_d * j - t_f; + float d_b = 1.f - d_f; + + for (int k = 0; k < out_h; k++) { // loop for H + int y_n = align_flag ? static_cast(ratio_h * (k + 0.5) - 0.5) + : static_cast(ratio_h * k); + y_n = (y_n > 0) ? y_n : 0; + int y_s = (y_n + 1) < (in_h - 1) ? (y_n + 1) : (in_h - 1); + float idx_src_y = ratio_h * (k + 0.5) - 0.5; + idx_src_y = (idx_src_y > 0) ? idx_src_y : 0; + float d_n = align_flag ? idx_src_y - y_n : ratio_h * k - y_n; + float d_s = 1.f - d_n; + + for (int l = 0; l < out_w; l++) { // loop for W + int x_w = align_flag ? static_cast(ratio_w * (l + 0.5) - 0.5) + : static_cast(ratio_w * l); + x_w = (x_w > 0) ? x_w : 0; + int x_e = (x_w + 1) < (in_w - 1) ? (x_w + 1) : (in_w - 1); + float idx_src_x = ratio_w * (l + 0.5) - 0.5; + idx_src_x = (idx_src_x > 0) ? idx_src_x : 0; + float d_w = align_flag ? 
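// [Editor's note] Each output gradient in TrilinearInterpolationGrad is
// scattered to the 8 corners of the enclosing input cell with weights
// d_{b,f} * d_{s,n} * d_{e,w}; since each pair sums to 1, the eight weights
// sum to 1 and the scatter conserves the total gradient. A standalone check
// of that identity (names illustrative):
#include <cassert>
inline void CheckTrilinearWeights(float d_f, float d_n, float d_w) {
  float d_b = 1.f - d_f, d_s = 1.f - d_n, d_e = 1.f - d_w;
  float sum = d_b * d_s * d_e + d_b * d_s * d_w + d_b * d_n * d_e +
              d_b * d_n * d_w + d_f * d_s * d_e + d_f * d_s * d_w +
              d_f * d_n * d_e + d_f * d_n * d_w;
  assert(sum > 0.999f && sum < 1.001f);  // (d_b+d_f)(d_s+d_n)(d_e+d_w) == 1
}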
idx_src_x - x_w : ratio_w * l - x_w; + float d_e = 1.f - d_w; + + for (int b = 0; b < n; b++) { // loop for batches + for (int i = 0; i < c; i++) { // loop for channels + // trilinear interpolation grad + if (data_layout == DataLayout::kNCHW) { + const T grad = output_grad_t(b, i, j, k, l); + input_grad_t(b, i, t_f, y_n, x_w) += + static_cast(grad * d_b * d_s * d_e); + input_grad_t(b, i, t_f, y_n, x_e) += + static_cast(grad * d_b * d_s * d_w); + input_grad_t(b, i, t_f, y_s, x_w) += + static_cast(grad * d_b * d_n * d_e); + input_grad_t(b, i, t_f, y_s, x_e) += + static_cast(grad * d_b * d_n * d_w); + input_grad_t(b, i, t_b, y_n, x_w) += + static_cast(grad * d_f * d_s * d_e); + input_grad_t(b, i, t_b, y_n, x_e) += + static_cast(grad * d_f * d_s * d_w); + input_grad_t(b, i, t_b, y_s, x_w) += + static_cast(grad * d_f * d_n * d_e); + input_grad_t(b, i, t_b, y_s, x_e) += + static_cast(grad * d_f * d_n * d_w); + } else { + const T grad = output_grad_t(b, j, k, l, i); + input_grad_t(b, t_f, y_n, x_w, i) += + static_cast(grad * d_b * d_s * d_e); + input_grad_t(b, t_f, y_n, x_e, i) += + static_cast(grad * d_b * d_s * d_w); + input_grad_t(b, t_f, y_s, x_w, i) += + static_cast(grad * d_b * d_n * d_e); + input_grad_t(b, t_f, y_s, x_e, i) += + static_cast(grad * d_b * d_n * d_w); + input_grad_t(b, t_b, y_n, x_w, i) += + static_cast(grad * d_f * d_s * d_e); + input_grad_t(b, t_b, y_n, x_e, i) += + static_cast(grad * d_f * d_s * d_w); + input_grad_t(b, t_b, y_s, x_w, i) += + static_cast(grad * d_f * d_n * d_e); + input_grad_t(b, t_b, y_s, x_e, i) += + static_cast(grad * d_f * d_n * d_w); + } + } + } + } + } + } +} - float scale = ctx.Attr("scale"); +template +static void Interpolate2DCPUFwd(const framework::ExecutionContext& ctx, + const Tensor& input, Tensor* output) { + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + auto interp_method = ctx.Attr("interp_method"); + bool align_corners = ctx.Attr("align_corners"); + int align_mode = ctx.Attr("align_mode"); + + int out_h = ctx.Attr("out_h"); + int out_w = ctx.Attr("out_w"); + + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + if (list_new_size_tensor.size() > 0) { + // have size tensor + auto new_size = get_new_shape(list_new_size_tensor); + out_h = new_size[0]; + out_w = new_size[1]; + } else { + float scale; + auto scale_tensor = ctx.Input("Scale"); + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + scale = scale_data[0]; + } else { + scale = ctx.Attr("scale"); + } if (scale > 0) { out_h = static_cast(in_h * scale); out_w = static_cast(in_w * scale); } - auto out_size = ctx.Input("OutSize"); if (out_size != nullptr) { - auto out_size_data = out_size->data(); + auto out_size_data = get_new_data_from_tensor(out_size); out_h = out_size_data[0]; out_w = out_size_data[1]; } - bool align_corners = ctx.Attr("align_corners"); - int align_mode = ctx.Attr("align_mode"); - - output->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); - auto& device_ctx = - ctx.template device_context(); - math::SetConstant zero; - zero(device_ctx, output, static_cast(0.0)); - - if (in_h == out_h && in_w == out_w) { - framework::TensorCopy(*input, ctx.GetPlace(), output); - return; - } + } + PADDLE_ENFORCE_GT( + out_h, 0, + "out_h in Attr(out_shape) of Op(interpolate) should be greater than 0."); + PADDLE_ENFORCE_GT( + out_w, 0, + 
"out_w in Attr(out_shape) of Op(interpolate) should be greater than 0."); + framework::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {n, c, out_h, out_w}; + } else { + dim_out = {n, out_h, out_w, c}; + } + output->mutable_data(dim_out, ctx.GetPlace()); - float ratio_h = 0.f; - float ratio_w = 0.f; + if (in_h == out_h && in_w == out_w) { + framework::TensorCopy(input, ctx.GetPlace(), output); + return; + } - if (out_h > 1) { - ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(in_h) / out_h; - } - if (out_w > 1) { - ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; - } + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_h > 1) { + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(in_h) / out_h; + } + if (out_w > 1) { + ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) + : static_cast(in_w) / out_w; + } - if ("bilinear" == interp_method) { - BilinearInterpolation(*input, output, ratio_h, ratio_w, in_h, in_w, n, - c, out_h, out_w, align_corners, align_mode); - } else if ("nearest" == interp_method) { - NearestNeighborInterpolate(*input, output, ratio_h, ratio_w, n, c, - out_h, out_w, align_corners); - } + if ("bilinear" == interp_method) { + BilinearInterpolation(input, output, ratio_h, ratio_w, in_h, in_w, n, c, + out_h, out_w, align_corners, align_mode, + data_layout); + } else if ("nearest" == interp_method) { + NearestNeighborInterpolate(input, output, ratio_h, ratio_w, n, c, out_h, + out_w, align_corners, data_layout); } -}; +} template -class InterpolateGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* input_grad = ctx.Output(framework::GradVarName("X")); - auto* output_grad = ctx.Input(framework::GradVarName("Out")); - - const int n = input->dims()[0]; - const int c = input->dims()[1]; - const int in_h = input->dims()[2]; - const int in_w = input->dims()[3]; - - std::string interp_method = ctx.Attr("interp_method"); - int out_h = ctx.Attr("out_h"); - int out_w = ctx.Attr("out_w"); - - float scale = ctx.Attr("scale"); +static void Interpolate3DCPUFwd(const framework::ExecutionContext& ctx, + const Tensor& input, Tensor* output) { + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + ExtractNCDWH(input.dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + auto interp_method = ctx.Attr("interp_method"); + bool align_corners = ctx.Attr("align_corners"); + int align_mode = ctx.Attr("align_mode"); + + int out_d = ctx.Attr("out_d"); + int out_h = ctx.Attr("out_h"); + int out_w = ctx.Attr("out_w"); + + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + if (list_new_size_tensor.size() > 0) { + // have size tensor + auto new_size = get_new_shape(list_new_size_tensor); + out_d = new_size[0]; + out_h = new_size[1]; + out_w = new_size[2]; + } else { + float scale; + auto scale_tensor = ctx.Input("Scale"); + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + scale = scale_data[0]; + } else { + scale = ctx.Attr("scale"); + } if (scale > 0) { + out_d = static_cast(in_d * scale); out_h = static_cast(in_h * scale); out_w = static_cast(in_w * scale); } - auto out_size = ctx.Input("OutSize"); if (out_size != nullptr) { - auto out_size_data = out_size->data(); - out_h = 
out_size_data[0]; - out_w = out_size_data[1]; + auto out_size_data = get_new_data_from_tensor(out_size); + out_d = out_size_data[0]; + out_h = out_size_data[1]; + out_w = out_size_data[2]; } + } + PADDLE_ENFORCE_GT( + out_d, 0, + "out_d in Attr(out_shape) of Op(interpolate) should be greater than 0."); + PADDLE_ENFORCE_GT( + out_h, 0, + "out_h in Attr(out_shape) of Op(interpolate) should be greater than 0."); + PADDLE_ENFORCE_GT( + out_w, 0, + "out_w in Attr(out_shape) of Op(interpolate) should be greater than 0."); + + framework::DDim dim_out; + if (data_layout == DataLayout::kNCHW) { + dim_out = {n, c, out_d, out_h, out_w}; + } else { + dim_out = {n, out_d, out_h, out_w, c}; + } - bool align_corners = ctx.Attr("align_corners"); - int align_mode = ctx.Attr("align_mode"); + output->mutable_data(dim_out, ctx.GetPlace()); - input_grad->mutable_data({n, c, in_h, in_w}, ctx.GetPlace()); - auto& device_ctx = - ctx.template device_context(); - math::SetConstant zero; - zero(device_ctx, input_grad, static_cast(0.0)); + if (in_d == out_d && in_h == out_h && in_w == out_w) { + framework::TensorCopy(input, ctx.GetPlace(), output); + return; + } - if (in_h == out_h && in_w == out_w) { - framework::TensorCopy(*output_grad, ctx.GetPlace(), input_grad); - return; - } + float ratio_d = 0.f; + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_d > 1) { + ratio_d = (align_corners) ? static_cast(in_d - 1) / (out_d - 1) + : static_cast(in_d) / out_d; + } + if (out_h > 1) { + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(in_h) / out_h; + } + if (out_w > 1) { + ratio_w = (align_corners) ? static_cast(in_w - 1) / (out_w - 1) + : static_cast(in_w) / out_w; + } - float ratio_h = 0.f; - float ratio_w = 0.f; + if ("trilinear" == interp_method) { + TrilinearInterpolation(input, output, ratio_d, ratio_h, ratio_w, in_d, + in_h, in_w, n, c, out_d, out_h, out_w, + align_corners, align_mode, data_layout); + } +} - if (out_h > 1) { - ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) - : static_cast(in_h) / out_h; - } - if (out_w > 1) { - ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) - : static_cast(in_w) / out_w; +template +static void Interpolate2DCPUBwd(const framework::ExecutionContext& ctx, + Tensor* input_grad, const Tensor& output_grad) { + auto* input = ctx.Input("X"); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + auto interp_method = ctx.Attr("interp_method"); + bool align_corners = ctx.Attr("align_corners"); + int align_mode = ctx.Attr("align_mode"); + + int out_h = ctx.Attr("out_h"); + int out_w = ctx.Attr("out_w"); + float scale; + auto scale_tensor = ctx.Input("Scale"); + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + scale = scale_data[0]; + } else { + scale = ctx.Attr("scale"); + } + if (scale > 0) { + out_h = static_cast(in_h * scale); + out_w = static_cast(in_w * scale); + } + auto out_size = ctx.Input("OutSize"); + if (out_size != nullptr) { + auto out_size_data = get_new_data_from_tensor(out_size); + out_h = out_size_data[0]; + out_w = out_size_data[1]; + } + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + if (list_new_size_tensor.size() > 0) { + // have size tensor + auto new_size = get_new_shape(list_new_size_tensor); + out_h = new_size[0]; + out_w = new_size[1]; + } + + framework::DDim dim_grad; + if (data_layout == DataLayout::kNCHW) { + dim_grad = {n, c, in_h, in_w}; + } else { + dim_grad = {n, in_h, in_w, c}; + } + input_grad->mutable_data(dim_grad, ctx.GetPlace()); + + auto& device_ctx = ctx.template device_context(); + math::SetConstant zero; + zero(device_ctx, input_grad, static_cast(0.0)); + + if (in_h == out_h && in_w == out_w) { + framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad); + return; + } + + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_h > 1) { + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(in_h) / out_h; + } + if (out_w > 1) { + ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) + : static_cast(in_w) / out_w; + } + + if ("bilinear" == interp_method) { + BilinearInterpolationGrad(output_grad, input_grad, ratio_h, ratio_w, + in_h, in_w, n, c, out_h, out_w, align_corners, + align_mode, data_layout); + } else if ("nearest" == interp_method) { + NearestNeighborInterpolateGrad(output_grad, input_grad, ratio_h, ratio_w, + n, c, out_h, out_w, align_corners, + data_layout); + } +} + +template +static void Interpolate3DCPUBwd(const framework::ExecutionContext& ctx, + Tensor* input_grad, const Tensor output_grad) { + auto* input = ctx.Input("X"); + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); + int n, c, in_d, in_h, in_w; + ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w); + + auto interp_method = ctx.Attr("interp_method"); + bool align_corners = ctx.Attr("align_corners"); + int align_mode = ctx.Attr("align_mode"); + + int out_d = ctx.Attr("out_d"); + int out_h = ctx.Attr("out_h"); + int out_w = ctx.Attr("out_w"); + float scale; + auto scale_tensor = ctx.Input("Scale"); + if (scale_tensor != nullptr) { + auto scale_data = get_new_data_from_tensor(scale_tensor); + scale = scale_data[0]; + } else { + scale = ctx.Attr("scale"); + } + if (scale > 0) { + out_d = static_cast(in_d * scale); + out_h = static_cast(in_h * scale); + out_w = static_cast(in_w * scale); + } + auto out_size = ctx.Input("OutSize"); + if (out_size != nullptr) { + auto out_size_data = get_new_data_from_tensor(out_size); + out_d = out_size_data[0]; + out_h = out_size_data[1]; + out_w = out_size_data[2]; + } + auto list_new_size_tensor = ctx.MultiInput("SizeTensor"); + if (list_new_size_tensor.size() > 0) { + // have size tensor + auto new_size = get_new_shape(list_new_size_tensor); + out_d = new_size[0]; + out_h = new_size[1]; + out_w = new_size[2]; + } + + framework::DDim dim_grad; + if (data_layout == DataLayout::kNCHW) { + dim_grad = {n, c, in_d, in_h, in_w}; + } else { + dim_grad = {n, in_d, in_h, in_w, c}; + } + input_grad->mutable_data(dim_grad, ctx.GetPlace()); + auto& device_ctx = ctx.template device_context(); + math::SetConstant zero; + zero(device_ctx, input_grad, static_cast(0.0)); + + if (in_d == out_d && in_h == out_h && in_w == out_w) { + framework::TensorCopy(output_grad, ctx.GetPlace(), input_grad); + return; + } + + float ratio_d = 0.f; + float ratio_h = 0.f; + float ratio_w = 0.f; + if (out_d > 1) { + ratio_d = (align_corners) ? static_cast(in_d - 1) / (out_d - 1) + : static_cast(in_d) / out_d; + } + if (out_h > 1) { + ratio_h = (align_corners) ? static_cast(in_h - 1) / (out_h - 1) + : static_cast(in_h) / out_h; + } + if (out_w > 1) { + ratio_w = (align_corners) ? 
static_cast(in_w - 1) / (out_w - 1) + : static_cast(in_w) / out_w; + } + + if ("trilinear" == interp_method) { + TrilinearInterpolationGrad( + output_grad, input_grad, ratio_d, ratio_h, ratio_w, in_d, in_h, in_w, n, + c, out_d, out_h, out_w, align_corners, align_mode, data_layout); + } +} + +template +class InterpolateKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + + auto input_dims = input->dims(); + if (input_dims.size() == 4) { // 2D interpolation + Interpolate2DCPUFwd(ctx, *input, output); + } else if (input_dims.size() == 5) { // 3D interpolation + Interpolate3DCPUFwd(ctx, *input, output); } + } +}; + +template +class InterpolateGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input_grad = ctx.Output(framework::GradVarName("X")); + auto* output_grad = ctx.Input(framework::GradVarName("Out")); - if ("bilinear" == interp_method) { - BilinearInterpolationGrad(*output_grad, input_grad, ratio_h, ratio_w, - in_h, in_w, n, c, out_h, out_w, - align_corners, align_mode); - } else if ("nearest" == interp_method) { - NearestNeighborInterpolateGrad(*output_grad, input_grad, ratio_h, - ratio_w, n, c, out_h, out_w, - align_corners); + auto output_grad_dims = output_grad->dims(); + if (output_grad_dims.size() == 4) { // 2D interpolation grad + Interpolate2DCPUBwd(ctx, input_grad, *output_grad); + } else if (output_grad_dims.size() == 5) { // 3D interpolation grad + Interpolate3DCPUBwd(ctx, input_grad, *output_grad); } } }; diff --git a/paddle/fluid/operators/jit/gen/seqpool.cc b/paddle/fluid/operators/jit/gen/seqpool.cc index d9e5904a..ec8e4e98 100644 --- a/paddle/fluid/operators/jit/gen/seqpool.cc +++ b/paddle/fluid/operators/jit/gen/seqpool.cc @@ -66,7 +66,7 @@ class SeqPoolCreator : public JitCodeCreator { ((attr.w / YMM_FLOAT_BLOCK + 4 /* for rest */) * 4 /* load, mul and save */ + 256) * - 8; + 16; } std::unique_ptr CreateJitCode( const seq_pool_attr_t& attr) const override { diff --git a/paddle/fluid/operators/jit/kernels.h b/paddle/fluid/operators/jit/kernels.h deleted file mode 100644 index 279a5f98..00000000 --- a/paddle/fluid/operators/jit/kernels.h +++ /dev/null @@ -1,160 +0,0 @@ -#ifdef PYBIND_AVX_MKLML -// Generated by the paddle/fluid/operators/jit/CMakeLists.txt. DO NOT EDIT! 
- -#pragma once -#include "paddle/fluid/operators/jit/helper.h" -#include "paddle/fluid/operators/jit/registry.h" - -USE_JITKERNEL_REFER(kVMul); -USE_JITKERNEL_REFER(kVAdd); -USE_JITKERNEL_REFER(kVAddRelu); -USE_JITKERNEL_REFER(kVSub); -USE_JITKERNEL_REFER(kVScal); -USE_JITKERNEL_REFER(kStrideScal); -USE_JITKERNEL_REFER(kVAddBias); -USE_JITKERNEL_REFER(kVCopy); -USE_JITKERNEL_REFER(kVRelu); -USE_JITKERNEL_REFER(kVIdentity); -USE_JITKERNEL_REFER(kVExp); -USE_JITKERNEL_REFER(kVSigmoid); -USE_JITKERNEL_REFER(kVTanh); -USE_JITKERNEL_REFER(kLSTMCtHt); -USE_JITKERNEL_REFER(kLSTMC1H1); -USE_JITKERNEL_REFER(kGRUH1); -USE_JITKERNEL_REFER(kGRUHtPart1); -USE_JITKERNEL_REFER(kGRUHtPart2); -USE_JITKERNEL_REFER(kCRFDecoding); -USE_JITKERNEL_REFER(kLayerNorm); -USE_JITKERNEL_REFER(kNCHW16CMulNC); -USE_JITKERNEL_REFER(kSeqPool); -USE_JITKERNEL_REFER(kMatMul); -USE_JITKERNEL_REFER(kVSquare); -USE_JITKERNEL_REFER(kHSum); -USE_JITKERNEL_REFER(kHMax); -USE_JITKERNEL_REFER(kStrideASum); -USE_JITKERNEL_REFER(kSoftmax); -USE_JITKERNEL_REFER(kEmbSeqPool); -USE_JITKERNEL_REFER(kSgd); -USE_JITKERNEL_REFER(kVBroadcast); -USE_JITKERNEL_MORE(kMatMul, mkl); -USE_JITKERNEL_MORE(kVMul, mkl); -USE_JITKERNEL_MORE(kVAdd, mkl); -USE_JITKERNEL_MORE(kVScal, mkl); -USE_JITKERNEL_MORE(kStrideScal, mkl); -USE_JITKERNEL_MORE(kVExp, mkl); -USE_JITKERNEL_MORE(kVSquare, mkl); -USE_JITKERNEL_MORE(kVCopy, mkl); -USE_JITKERNEL_MORE(kVSigmoid, mkl); -USE_JITKERNEL_MORE(kVTanh, mkl); -USE_JITKERNEL_MORE(kSeqPool, mkl); -USE_JITKERNEL_MORE(kSoftmax, mkl); -USE_JITKERNEL_MORE(kEmbSeqPool, mkl); -USE_JITKERNEL_MORE(kSgd, mkl); -USE_JITKERNEL_MORE(kVBroadcast, mkl); -USE_JITKERNEL_MORE(kCRFDecoding, intrinsic); -USE_JITKERNEL_MORE(kLayerNorm, intrinsic); -USE_JITKERNEL_MORE(kVSigmoid, mix); -USE_JITKERNEL_MORE(kVTanh, mix); -USE_JITKERNEL_MORE(kLSTMCtHt, mix); -USE_JITKERNEL_MORE(kLSTMC1H1, mix); -USE_JITKERNEL_MORE(kGRUH1, mix); -USE_JITKERNEL_MORE(kGRUHtPart1, mix); -USE_JITKERNEL_MORE(kGRUHtPart2, mix); -USE_JITKERNEL_MORE(kSoftmax, mix); -USE_JITKERNEL_GEN(kMatMul); -USE_JITKERNEL_GEN(kVMul); -USE_JITKERNEL_GEN(kVAdd); -USE_JITKERNEL_GEN(kVSub); -USE_JITKERNEL_GEN(kVAddRelu); -USE_JITKERNEL_GEN(kVScal); -USE_JITKERNEL_GEN(kVAddBias); -USE_JITKERNEL_GEN(kVRelu); -USE_JITKERNEL_GEN(kVSquare); -USE_JITKERNEL_GEN(kVIdentity); -USE_JITKERNEL_GEN(kVExp); -USE_JITKERNEL_GEN(kVSigmoid); -USE_JITKERNEL_GEN(kVTanh); -USE_JITKERNEL_GEN(kLSTMCtHt); -USE_JITKERNEL_GEN(kLSTMC1H1); -USE_JITKERNEL_GEN(kGRUH1); -USE_JITKERNEL_GEN(kGRUHtPart1); -USE_JITKERNEL_GEN(kGRUHtPart2); -USE_JITKERNEL_GEN(kNCHW16CMulNC); -USE_JITKERNEL_GEN(kSeqPool); -USE_JITKERNEL_GEN(kHMax); -USE_JITKERNEL_GEN(kHSum); -USE_JITKERNEL_GEN(kEmbSeqPool); -USE_JITKERNEL_GEN(kSgd); -USE_JITKERNEL_GEN(kVBroadcast); -#elif defined PYBIND_NOAVX_OPENBLAS -// Generated by the paddle/fluid/operators/jit/CMakeLists.txt. DO NOT EDIT! 
- -#pragma once -#include "paddle/fluid/operators/jit/helper.h" -#include "paddle/fluid/operators/jit/registry.h" - -USE_JITKERNEL_REFER(kVMul); -USE_JITKERNEL_REFER(kVAdd); -USE_JITKERNEL_REFER(kVAddRelu); -USE_JITKERNEL_REFER(kVSub); -USE_JITKERNEL_REFER(kVScal); -USE_JITKERNEL_REFER(kStrideScal); -USE_JITKERNEL_REFER(kVAddBias); -USE_JITKERNEL_REFER(kVCopy); -USE_JITKERNEL_REFER(kVRelu); -USE_JITKERNEL_REFER(kVIdentity); -USE_JITKERNEL_REFER(kVExp); -USE_JITKERNEL_REFER(kVSigmoid); -USE_JITKERNEL_REFER(kVTanh); -USE_JITKERNEL_REFER(kLSTMCtHt); -USE_JITKERNEL_REFER(kLSTMC1H1); -USE_JITKERNEL_REFER(kGRUH1); -USE_JITKERNEL_REFER(kGRUHtPart1); -USE_JITKERNEL_REFER(kGRUHtPart2); -USE_JITKERNEL_REFER(kCRFDecoding); -USE_JITKERNEL_REFER(kLayerNorm); -USE_JITKERNEL_REFER(kNCHW16CMulNC); -USE_JITKERNEL_REFER(kSeqPool); -USE_JITKERNEL_REFER(kMatMul); -USE_JITKERNEL_REFER(kVSquare); -USE_JITKERNEL_REFER(kHSum); -USE_JITKERNEL_REFER(kHMax); -USE_JITKERNEL_REFER(kStrideASum); -USE_JITKERNEL_REFER(kSoftmax); -USE_JITKERNEL_REFER(kEmbSeqPool); -USE_JITKERNEL_REFER(kSgd); -USE_JITKERNEL_REFER(kVBroadcast); -USE_JITKERNEL_MORE(kVSigmoid, mix); -USE_JITKERNEL_MORE(kVTanh, mix); -USE_JITKERNEL_MORE(kLSTMCtHt, mix); -USE_JITKERNEL_MORE(kLSTMC1H1, mix); -USE_JITKERNEL_MORE(kGRUH1, mix); -USE_JITKERNEL_MORE(kGRUHtPart1, mix); -USE_JITKERNEL_MORE(kGRUHtPart2, mix); -USE_JITKERNEL_MORE(kSoftmax, mix); -USE_JITKERNEL_GEN(kMatMul); -USE_JITKERNEL_GEN(kVMul); -USE_JITKERNEL_GEN(kVAdd); -USE_JITKERNEL_GEN(kVSub); -USE_JITKERNEL_GEN(kVAddRelu); -USE_JITKERNEL_GEN(kVScal); -USE_JITKERNEL_GEN(kVAddBias); -USE_JITKERNEL_GEN(kVRelu); -USE_JITKERNEL_GEN(kVSquare); -USE_JITKERNEL_GEN(kVIdentity); -USE_JITKERNEL_GEN(kVExp); -USE_JITKERNEL_GEN(kVSigmoid); -USE_JITKERNEL_GEN(kVTanh); -USE_JITKERNEL_GEN(kLSTMCtHt); -USE_JITKERNEL_GEN(kLSTMC1H1); -USE_JITKERNEL_GEN(kGRUH1); -USE_JITKERNEL_GEN(kGRUHtPart1); -USE_JITKERNEL_GEN(kGRUHtPart2); -USE_JITKERNEL_GEN(kNCHW16CMulNC); -USE_JITKERNEL_GEN(kSeqPool); -USE_JITKERNEL_GEN(kHMax); -USE_JITKERNEL_GEN(kHSum); -USE_JITKERNEL_GEN(kEmbSeqPool); -USE_JITKERNEL_GEN(kSgd); -USE_JITKERNEL_GEN(kVBroadcast); -#endif diff --git a/paddle/fluid/operators/label_smooth_op.cu b/paddle/fluid/operators/label_smooth_op.cu index ab259b48..89f1d28e 100644 --- a/paddle/fluid/operators/label_smooth_op.cu +++ b/paddle/fluid/operators/label_smooth_op.cu @@ -12,15 +12,101 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/label_smooth_op.h" +namespace paddle { +namespace operators { +template +__global__ void LabelSmoothRunOriginKernel(const int N, const float epsilon, + const int label_dim, const T* src, + T* dst) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + for (; idx < N; idx += blockDim.x * gridDim.x) { + dst[idx] = static_cast(1 - epsilon) * src[idx] + + static_cast(epsilon / label_dim); + } +} + +template +__global__ void LabelSmoothRunDistKernel(const int N, const float epsilon, + const int dist_numel, const T* src, + const T* dist_data, T* dst) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + for (; idx < N; idx += blockDim.x * gridDim.x) { + int dist_idx = idx - (idx / dist_numel) * dist_numel; + dst[idx] = static_cast(1 - epsilon) * src[idx] + + static_cast(epsilon) * dist_data[dist_idx]; + } +} + +template +__global__ void LabelSmoothGradRunKernel(const int N, const float epsilon, + const T* src, T* dst) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + for (; idx < N; idx += blockDim.x * gridDim.x) { + dst[idx] = static_cast(1 - epsilon) * src[idx]; + } +} + +template +class LabelSmoothGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* out_t = ctx.Output("Out"); + auto* in_t = ctx.Input("X"); + auto* dist_t = ctx.Input("PriorDist"); + auto label_dim = in_t->dims()[1]; + auto epsilon = ctx.Attr("epsilon"); + auto& dev = *ctx.template device_context().eigen_device(); + auto size_prob = in_t->numel(); + const T* in_data = in_t->data(); + T* out_data = out_t->mutable_data(ctx.GetPlace()); + int threads = 512; + int grid = (size_prob + threads - 1) / threads; + auto stream = ctx.cuda_device_context().stream(); + if (dist_t) { + auto dist_numel = dist_t->numel(); + const T* dist_data = dist_t->data(); + LabelSmoothRunDistKernel<<>>( + size_prob, epsilon, dist_numel, in_data, dist_data, out_data); + + } else { + LabelSmoothRunOriginKernel<<>>( + size_prob, epsilon, label_dim, in_data, out_data); + } + } +}; + +template +class LabelSmoothGradGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* d_out_t = ctx.Input(framework::GradVarName("Out")); + auto* d_in_t = ctx.Output(framework::GradVarName("X")); + d_in_t->mutable_data(ctx.GetPlace()); + + auto epsilon = ctx.Attr("epsilon"); + auto& dev = *ctx.template device_context().eigen_device(); + const T* in_data = d_out_t->data(); + auto size_prob = d_out_t->numel(); + T* out_data = d_in_t->mutable_data(ctx.GetPlace()); + int threads = 512; + int grid = (size_prob + threads - 1) / threads; + auto stream = ctx.cuda_device_context().stream(); + LabelSmoothGradRunKernel<<>>( + size_prob, epsilon, in_data, out_data); + } +}; +} // namespace operators +} // namespace paddle namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( label_smooth, - ops::LabelSmoothKernel, - ops::LabelSmoothKernel); + ops::LabelSmoothGPUKernel, + ops::LabelSmoothGPUKernel); REGISTER_OP_CUDA_KERNEL( label_smooth_grad, - ops::LabelSmoothGradKernel, - ops::LabelSmoothGradKernel); + ops::LabelSmoothGradGPUKernel, + ops::LabelSmoothGradGPUKernel); diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc index a94704a7..ed09c64f 100644 --- a/paddle/fluid/operators/linear_chain_crf_op.cc +++ b/paddle/fluid/operators/linear_chain_crf_op.cc @@ -23,21 +23,28 @@ class LinearChainCRFOpMaker : public 
framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("Emission", - "(LoDTensor, default LoDTensor) " - "A 2-D LoDTensor with shape [N x D], where N is the size of the " + "(LoDTensor/Tensor). When the input is a LoDTensor, a 2-D LoDTensor" + " with shape [N x D], where N is the size of the " "mini-batch and D is the total tag number. The unscaled emission " - "weight matrix for the linear chain CRF. "); + "weight matrix for the linear chain CRF. When the input is a Tensor, " + "a Tensor with shape [N x S x D], where N is the batch size, " + "S is the max sequence length, and D is the total tag number."); AddInput("Transition", "(Tensor, default Tensor) A 2-D Tensor with shape " "[(D + 2) x D]. The learnable parameter for the linear_chain_crf " "operator. See more details in the operator's comments."); AddInput("Label", - "(LoDTensor, default LoDTensor) A LoDTensor with shape " + "(LoDTensor/Tensor). When the input is a LoDTensor, a LoDTensor with shape " "[N x 1], where N is the total element number in a mini-batch. " - "The ground truth."); + "When the input is a Tensor, a Tensor with shape [N x S], where N is " + "the batch size and S is the max sequence length. The ground truth."); + AddInput("length", + "(Tensor, default Tensor) A Tensor with shape " + "[M x 1], where M is the number of sequences in a mini-batch.") + .AsDispensable(); AddOutput( "Alpha", - "(Tensor, default Tensor) A 2-D Tensor with shape [N x D]. " + "(Tensor, default Tensor), the same shape as Emission. " "The forward vectors for the entire batch. Denote it as $\alpha$. " "$\alpha$ is a memo table used to calculate the normalization " "factor in CRF. $\alpha[k, v]$ stores the unnormalized " @@ -49,7 +56,7 @@ class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker { .AsIntermediate(); AddOutput( "EmissionExps", - "(Tensor, default Tensor) A 2-D Tensor with shape [N x D]. " + "(Tensor, default Tensor), the same shape as Emission. " "The exponentials of Input(Emission). 
This is an intermediate " "computational result in forward computation, and will be reused in " "backward computation.") @@ -145,11 +152,6 @@ class LinearChainCRFOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasOutput("LogLikelihood"), "Output(LogLikelihood) should be not null."); - auto emission_dims = ctx->GetInputDim("Emission"); - PADDLE_ENFORCE_EQ(emission_dims.size(), 2, - "The Input(Emission) should be a 2-D tensor."); - PADDLE_ENFORCE(emission_dims[0], "An empty mini-batch is not allowed."); - auto transition_dims = ctx->GetInputDim("Transition"); PADDLE_ENFORCE_EQ(transition_dims.size(), 2, "The Input(Transition) should be a 2-D tensor."); @@ -164,20 +166,40 @@ class LinearChainCRFOp : public framework::OperatorWithKernel { "An invalid dimension for the Input(Transition), which should " "be a 2-D tensor with shape [(D + 2) x D]."); } - PADDLE_INFERSHAPE_ENFORCE_EQ( - ctx, emission_dims[1], transition_dims[1], - "The 2nd dimension of the Input(Emission) and the Input(Transition) " - "should be equal to the tag number."); - - auto label_dims = ctx->GetInputDim("Label"); - PADDLE_ENFORCE(label_dims.size() == 2UL && label_dims[1] == 1UL, - "The Input(Label) should be a 2-D tensor with the 2nd " - "dimensions fixed to 1."); - PADDLE_INFERSHAPE_ENFORCE_EQ( - ctx, emission_dims[0], label_dims[0], - "The height of Input(Emission) and the height of Input(Label) " - "should be the same."); - + auto emission_dims = ctx->GetInputDim("Emission"); + PADDLE_ENFORCE_NE(emission_dims[0], 0, + "An empty mini-batch is not allowed."); + if (ctx->HasInput("length")) { + PADDLE_ENFORCE_EQ(emission_dims.size(), 3, + "The Input(Emission) should be a 3-D tensor."); + auto label_dims = ctx->GetInputDim("Label"); + PADDLE_ENFORCE_EQ(label_dims.size(), 3, + "The Input(Label) should be a 3-D tensor."); + PADDLE_INFERSHAPE_ENFORCE_EQ( + ctx, emission_dims[0], label_dims[0], + "The batch size of Input(Emission) and Input(Label) " + "should be the same."); + PADDLE_INFERSHAPE_ENFORCE_EQ( + ctx, emission_dims[1], label_dims[1], + "The max length of Input(Emission) and Input(Label) " + "should be the same."); + } else { + PADDLE_ENFORCE_EQ(emission_dims.size(), 2, + "The Input(Emission) should be a 2-D tensor."); + PADDLE_INFERSHAPE_ENFORCE_EQ( + ctx, emission_dims[1], transition_dims[1], + "The 2nd dimension of the Input(Emission) and the Input(Transition) " + "should be equal to the tag number."); + + auto label_dims = ctx->GetInputDim("Label"); + PADDLE_ENFORCE_EQ(label_dims.size(), 2, + "The Input(Label) should be a 2-D tensor with the 2nd " + "dimension fixed to 1."); + PADDLE_INFERSHAPE_ENFORCE_EQ( + ctx, emission_dims[0], label_dims[0], + "The height of Input(Emission) and the height of Input(Label) " + "should be the same."); + } ctx->SetOutputDim("Alpha", emission_dims); ctx->SetOutputDim("EmissionExps", emission_dims); ctx->SetOutputDim("TransitionExps", transition_dims); @@ -210,12 +232,6 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("LogLikelihood")), "Input(LogLikelihood@GRAD) should be not null."); - auto emission_exps_dims = ctx->GetInputDim("EmissionExps"); - PADDLE_ENFORCE_EQ(emission_exps_dims.size(), 2, - "The Input(EmissionExps) should be a 2-D tensor."); - PADDLE_ENFORCE(emission_exps_dims[0], - "An empty mini-batch is not allowed."); - auto transition_exps_dims = ctx->GetInputDim("TransitionExps"); PADDLE_ENFORCE_EQ(transition_exps_dims.size(), 2, "The Input(TransitionExps) should be a 2-D
tensor."); @@ -230,15 +246,34 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel { "An invalid dimension for the Input(TransitionExps), which should " "be a 2-D tensor with shape [(D + 2) x D]."); } - PADDLE_INFERSHAPE_ENFORCE_EQ( - ctx, emission_exps_dims[1], transition_exps_dims[1], - "The 2nd dimension of the Input(EmissionExps) and the " - "Input(TransitionExps) should be equal to the tag number."); + auto emission_exps_dims = ctx->GetInputDim("EmissionExps"); auto label_dims = ctx->GetInputDim("Label"); - PADDLE_ENFORCE(label_dims.size() == 2UL && label_dims[1] == 1UL, - "The Input(Label) should be a 2-D tensor with the 2nd " - "dimensions fixed to 1."); + if (ctx->HasInput("length")) { + PADDLE_ENFORCE_EQ(emission_exps_dims.size(), 3, + "The Input(EmissionExps) should be a 3-D tensor."); + PADDLE_INFERSHAPE_ENFORCE_EQ( + ctx, emission_exps_dims[2], transition_exps_dims[1], + "The 3nd dimension of the Input(EmissionExps) and the " + "Input(TransitionExps) should be equal to the tag number."); + PADDLE_ENFORCE_EQ(label_dims.size(), 3, + "The Input(Label) should be a 3-D tensor with the 3nd " + "dimensions fixed to 1."); + } else { + PADDLE_ENFORCE_EQ(emission_exps_dims.size(), 2, + "The Input(EmissionExps) should be a 2-D tensor."); + PADDLE_INFERSHAPE_ENFORCE_EQ( + ctx, emission_exps_dims[1], transition_exps_dims[1], + "The 2nd dimension of the Input(EmissionExps) and the " + "Input(TransitionExps) should be equal to the tag number."); + PADDLE_ENFORCE_EQ(label_dims.size(), 2, + "The Input(Label) should be a 2-D tensor"); + PADDLE_ENFORCE_EQ(label_dims[1], 1, + "The Input(Label) 2nd dimensions fixed to 1."); + } + PADDLE_ENFORCE_NE(emission_exps_dims[0], 0, + "An empty mini-batch is not allowed."); + PADDLE_INFERSHAPE_ENFORCE_EQ( ctx, emission_exps_dims[0], label_dims[0], "The height of Input(EmissionExps) and the height of Input(Label) " @@ -246,8 +281,12 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel { if (ctx->HasOutput(framework::GradVarName("Emission"))) { ctx->SetOutputDim(framework::GradVarName("Emission"), emission_exps_dims); - ctx->ShareLoD("Emission", framework::GradVarName("Emission")); + if (ctx->HasInput("length") == false) { + ctx->ShareLoD("Emission", framework::GradVarName("Emission")); + } } + // ctx->SetOutputDim(framework::GradVarName("Emission"), + // emission_exps_dims); if (ctx->HasOutput(framework::GradVarName("Transition"))) { ctx->SetOutputDim(framework::GradVarName("Transition"), transition_exps_dims); @@ -275,15 +314,15 @@ class LinearChainCRFGradDescMaker : public framework::SingleGradOpDescMaker { std::unique_ptr op(new framework::OpDesc()); op->SetType("linear_chain_crf_grad"); op->SetAttrMap(Attrs()); - op->SetInput("Emission", Input("Emission")); op->SetInput("Transition", Input("Transition")); op->SetInput("Label", Input("Label")); - op->SetInput("Alpha", Output("Alpha")); op->SetInput("EmissionExps", Output("EmissionExps")); op->SetInput("TransitionExps", Output("TransitionExps")); - + if (ForwardOp().Inputs().count("length") > 0) { + op->SetInput("length", Input("length")); + } op->SetInput(framework::GradVarName("LogLikelihood"), OutputGrad("LogLikelihood")); diff --git a/paddle/fluid/operators/linear_chain_crf_op.h b/paddle/fluid/operators/linear_chain_crf_op.h old mode 100644 new mode 100755 index d5162bcd..8cd3cdad --- a/paddle/fluid/operators/linear_chain_crf_op.h +++ b/paddle/fluid/operators/linear_chain_crf_op.h @@ -54,20 +54,9 @@ template class LinearChainCRFOpKernel : public framework::OpKernel { 
public: void Compute(const framework::ExecutionContext& ctx) const override { - // TODO(caoying) The checks related to LoD information should be - // moved into InferShape once after the InferShape is refactored. - PADDLE_ENFORCE_EQ(ctx.Input("Emission")->NumLevels(), 1UL, - "The Input(Emission) should be a sequence."); - PADDLE_ENFORCE_EQ(ctx.Input("Label")->NumLevels(), 1UL, - "The Input(Label) should be a sequence."); - auto in_lod = ctx.Input("Label")->lod(); - PADDLE_ENFORCE(in_lod.size(), "Input(Label) must be a sequence."); - const size_t level = 0; - const size_t seq_num = in_lod[level].size() - 1; - - const LoDTensor* emission_weights = ctx.Input("Emission"); - const Tensor* transition_weights = ctx.Input("Transition"); - const LoDTensor* label = ctx.Input("Label"); + const Tensor* emission_weights = ctx.Input("Emission"); + const Tensor* transition_weights = + ctx.Input("Transition"); Tensor* emission_exps = ctx.Output("EmissionExps"); Tensor* transition_exps = ctx.Output("TransitionExps"); @@ -76,56 +65,103 @@ class LinearChainCRFOpKernel : public framework::OpKernel { // Because the computation codes only runs on CPU, here the memory for all // the outputs is FIXED to be allocated on the CPU memory. - emission_exps->mutable_data(platform::CPUPlace()); + auto* emission_exps_data = + emission_exps->mutable_data(platform::CPUPlace()); + auto* alpha_data = alpha->mutable_data(platform::CPUPlace()); transition_exps->mutable_data(platform::CPUPlace()); - alpha->mutable_data(platform::CPUPlace()); - // Resize the output tensor to its correct dimension. + memset(emission_exps_data, 0, emission_exps->numel() * sizeof(T)); + memset(alpha_data, 0, alpha->numel() * sizeof(T)); + auto emission_dims = emission_weights->dims(); + + const Tensor* label = ctx.Input("Label"); + auto& dev_ctx = ctx.template device_context(); + Tensor emission_weights_tmp = ctx.AllocateTmpTensor( + emission_weights->dims(), dev_ctx); + emission_weights_tmp.ShareDataWith(*emission_weights); + Tensor label_tmp = + ctx.AllocateTmpTensor(label->dims(), dev_ctx); + label_tmp.ShareDataWith(*label); + Tensor emission_exps_tmp = + ctx.AllocateTmpTensor(emission_exps->dims(), dev_ctx); + emission_exps_tmp.ShareDataWith(*emission_exps); + Tensor alpha_tmp = + ctx.AllocateTmpTensor(alpha->dims(), dev_ctx); + alpha_tmp.ShareDataWith(*alpha); + size_t seq_num = 0; + size_t batch_size; + size_t tag_num; + const int64_t* length_data = nullptr; + framework::Vector in_lod; + if (ctx.HasInput("length")) { + const Tensor* label_length = ctx.Input("length"); + length_data = label_length->data(); + seq_num = label_length->numel(); + batch_size = emission_dims[0] * emission_dims[1]; + tag_num = emission_dims[2]; + emission_weights_tmp.Resize( + {emission_dims[0] * emission_dims[1], emission_dims[2]}); + auto label_dims = label->dims(); + label_tmp.Resize({label_dims[0] * label_dims[1], label_dims[2]}); + alpha_tmp.Resize({emission_dims[0] * emission_dims[1], emission_dims[2]}); + emission_exps_tmp.Resize( + {emission_dims[0] * emission_dims[1], emission_dims[2]}); + PADDLE_ENFORCE_EQ(seq_num, emission_dims[0], + "the size of Input(length) must be equal to " + "emission_dims[0]."); + PADDLE_ENFORCE_EQ(seq_num, label_dims[0], + "the size of Input(length) must be equal to " + "label_dims[0]."); + } else { + seq_num = ctx.Input("Label")->lod()[0].size() - 1; + batch_size = emission_dims[0]; + tag_num = emission_dims[1]; + in_lod = ctx.Input("Label")->lod()[0]; + PADDLE_ENFORCE_NE(in_lod.size(), 0, "Input(Label) must be a sequence."); + } + 
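+    // A note on the padded path (an illustrative sketch, not executed here):
+    // with Input(length), Emission is a [batch, max_len, tag_num] Tensor that
+    // the reshapes above view as a flat [batch * max_len, tag_num] matrix, so
+    // sequence i occupies rows [i * max_len, i * max_len + length[i]).
+    // For example, batch = 2, max_len = 3, length = {2, 3} gives
+    //   seq 0 -> rows [0, 2), row 2 is padding and is skipped;
+    //   seq 1 -> rows [3, 6).
+    // The LoD path below derives the same row ranges from in_lod instead.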
ll->Resize({static_cast(seq_num), 1}); ll->mutable_data(platform::CPUPlace()); - // Now, all the inputs and outputs should be on the CPU memory. - auto emission_dims = emission_weights->dims(); - const size_t batch_size = emission_dims[0]; - const size_t tag_num = emission_dims[1]; - Tensor emission_row_max; emission_row_max.mutable_data( framework::make_ddim({static_cast(batch_size), 1}), platform::CPUPlace()); - auto& place = *ctx.template device_context() .eigen_device(); - auto x = EigenMatrix::From(*emission_weights); + auto x = EigenMatrix::From(emission_weights_tmp); auto x_row_max = EigenMatrix::From(emission_row_max); x_row_max.device(place) = x.maximum(Eigen::DSizes(1)) .reshape(Eigen::DSizes(static_cast(batch_size), 1)); - - auto x_exps = EigenMatrix::From(*emission_exps); + auto x_exps = EigenMatrix::From(emission_exps_tmp); x_exps.device(place) = (x - x_row_max.broadcast(Eigen::DSizes(1, tag_num))).exp(); - auto w = EigenMatrix::From(*transition_weights); auto w_exps = EigenMatrix::From(*transition_exps); w_exps.device(place) = w.exp(); - T* log_likelihood = ll->data(); for (size_t i = 0; i < seq_num; ++i) { - int start_pos = static_cast(in_lod[level][i]); - int end_pos = static_cast(in_lod[level][i + 1]); + int start_pos = 0; + int end_pos = 0; + if (ctx.HasInput("length")) { + if (length_data[i] == 0) continue; + start_pos = i * emission_dims[1]; + end_pos = start_pos + static_cast(length_data[i]); + } else { + start_pos = static_cast(in_lod[i]); + end_pos = static_cast(in_lod[i + 1]); + } if (end_pos == start_pos) { // If an empty input sequence is given, pad 0 for its cost. log_likelihood[i] = 0.; continue; } - - const Tensor one_seq = emission_weights->Slice(start_pos, end_pos); + const Tensor one_seq = emission_weights_tmp.Slice(start_pos, end_pos); Tensor one_seq_row_max = emission_row_max.Slice(start_pos, end_pos); - Tensor one_seq_exps = emission_exps->Slice(start_pos, end_pos); - const Tensor one_seq_label = label->Slice(start_pos, end_pos); - Tensor one_seq_alpha = alpha->Slice(start_pos, end_pos); - + Tensor one_seq_exps = emission_exps_tmp.Slice(start_pos, end_pos); + const Tensor one_seq_label = label_tmp.Slice(start_pos, end_pos); + Tensor one_seq_alpha = alpha_tmp.Slice(start_pos, end_pos); log_likelihood[i] = ForwardOneSequence( one_seq, one_seq_row_max, one_seq_exps, *transition_weights, *transition_exps, one_seq_label, &one_seq_alpha); @@ -197,52 +233,91 @@ template class LinearChainCRFGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - const size_t level = 0; // currently, only support sequence. 
- auto lod = ctx.Input("Label")->lod(); - PADDLE_ENFORCE(lod.size(), "Input(Label) must be a sequence."); - - const Tensor* label = ctx.Input("Label"); + const Tensor* label = ctx.Input("Label"); const Tensor* emission_exps = ctx.Input("EmissionExps"); const Tensor* transition_exps = ctx.Input("TransitionExps"); const Tensor* alpha = ctx.Input("Alpha"); const T* ll_grad = ctx.Input(framework::GradVarName("LogLikelihood"))->data(); - + auto& dev_ctx = ctx.template device_context(); Tensor* emission_grad = ctx.Output(framework::GradVarName("Emission")); + auto* emission_grad_data = + emission_grad->mutable_data(platform::CPUPlace()); + memset(emission_grad_data, 0, emission_grad->numel() * sizeof(T)); + Tensor alpha_tmp = + ctx.AllocateTmpTensor(alpha->dims(), dev_ctx); + alpha_tmp.ShareDataWith(*alpha); + Tensor label_tmp = + ctx.AllocateTmpTensor(label->dims(), dev_ctx); + label_tmp.ShareDataWith(*label); + Tensor emission_exps_tmp = + ctx.AllocateTmpTensor(emission_exps->dims(), dev_ctx); + emission_exps_tmp.ShareDataWith(*emission_exps); + Tensor emission_grad_tmp = + ctx.AllocateTmpTensor(emission_grad->dims(), dev_ctx); + emission_grad_tmp.ShareDataWith(*emission_grad); + // Get seq_num from Input(length) when the input is padded, otherwise + // from the LoD of Input(Label). + size_t seq_num = 0; + framework::Vector lod; + const int64_t* length_data = nullptr; + if (ctx.HasInput("length")) { + const Tensor* label_length = ctx.Input("length"); + length_data = label_length->data(); + seq_num = label_length->numel(); + auto emission_dims = emission_grad->dims(); + auto label_dims = label->dims(); + emission_grad_tmp.Resize( + {emission_dims[0] * emission_dims[1], emission_dims[2]}); + label_tmp.Resize({label_dims[0] * label_dims[1], label_dims[2]}); + alpha_tmp.Resize({emission_dims[0] * emission_dims[1], emission_dims[2]}); + emission_exps_tmp.Resize( + {emission_dims[0] * emission_dims[1], emission_dims[2]}); + } else { + seq_num = ctx.Input("Label")->lod()[0].size() - 1; + lod = ctx.Input("Label")->lod()[0]; + PADDLE_ENFORCE_NE(lod.size(), 0, "Input(Label) must be a sequence."); + } + Tensor* transition_grad = ctx.Output(framework::GradVarName("Transition")); // TODO(caoying) Fix this constraint. When the Input(Emission) is from the // data reader operator, it can have no gradients. - PADDLE_ENFORCE(emission_grad, "Output(Emission@Grad) should not be null."); - emission_grad->mutable_data(platform::CPUPlace()); if (transition_grad) { transition_grad->mutable_data(platform::CPUPlace()); math::set_constant(ctx.device_context(), transition_grad, 0.); } // Now, all the inputs and outputs should be on the CPU memory. - auto emission_dims = emission_exps->dims(); // Beta is the memo table used in dynamic programming to calculate the // backward vectors. For a backward vector i (the i-th row of beta), it // captures the unnormalized probabilities of partial sequences starting // at position i.
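+    // A sketch of the dynamic programming this kernel relies on, where
+    // e[k][v] are the emission exps at position k, w[u][v] the transition
+    // exps, and the start/stop rows of the (D + 2) x D transition matrix
+    // are folded in at the sequence ends following the op's convention:
+    //   alpha[k][v] = e[k][v] * sum_u alpha[k - 1][u] * w[u][v]
+    //   beta[k][u]  = sum_v w[u][v] * e[k + 1][v] * beta[k + 1][v]
+    // The per-position marginal is proportional to alpha[k][v] * beta[k][v];
+    // BackwardOneSequence forms the emission gradient from this marginal and
+    // the one-hot ground-truth label.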
Tensor beta; - beta.mutable_data(emission_dims, platform::CPUPlace()); - - for (size_t i = 0; i < lod[level].size() - 1; ++i) { - int start_pos = static_cast(lod[level][i]); - int end_pos = static_cast(lod[level][i + 1]); - if (end_pos == start_pos) continue; - + auto* beta_data = beta.mutable_data(emission_dims, platform::CPUPlace()); + memset(beta_data, 0, beta.numel() * sizeof(T)); + if (ctx.HasInput("length")) { + beta.Resize({emission_dims[0] * emission_dims[1], emission_dims[2]}); + } + for (size_t i = 0; i < seq_num; ++i) { + int start_pos = 0; + int end_pos = 0; + if (ctx.HasInput("length")) { + if (length_data[i] == 0) continue; + start_pos = i * emission_dims[1]; + end_pos = start_pos + static_cast(length_data[i]); + } else { + start_pos = static_cast(lod[i]); + end_pos = static_cast(lod[i + 1]); + } const Tensor one_seq_emission_exps = - emission_exps->Slice(start_pos, end_pos); - const Tensor one_seq_label = label->Slice(start_pos, end_pos); - const Tensor one_seq_alpha = alpha->Slice(start_pos, end_pos); + emission_exps_tmp.Slice(start_pos, end_pos); + const Tensor one_seq_label = label_tmp.Slice(start_pos, end_pos); + const Tensor one_seq_alpha = alpha_tmp.Slice(start_pos, end_pos); Tensor one_seq_beta = beta.Slice(start_pos, end_pos); - Tensor one_seq_emission_grad = emission_grad->Slice(start_pos, end_pos); - + Tensor one_seq_emission_grad = + emission_grad_tmp.Slice(start_pos, end_pos); BackwardOneSequence( ctx.template device_context(), ll_grad[i], one_seq_emission_exps, *transition_exps, one_seq_alpha, one_seq_label, @@ -261,7 +336,6 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { const T* x_exps = emission_exps.data(); const int64_t* label_value = label.data(); T* beta_value = beta->data(); - auto x_dims = emission_exps.dims(); const size_t seq_length = x_dims[0]; const size_t tag_num = x_dims[1]; diff --git a/paddle/fluid/operators/load_combine_op.h b/paddle/fluid/operators/load_combine_op.h index 45a155af..9f6565dd 100644 --- a/paddle/fluid/operators/load_combine_op.h +++ b/paddle/fluid/operators/load_combine_op.h @@ -41,10 +41,15 @@ class LoadCombineOpKernel : public framework::OpKernel { if (!model_from_memory) { std::ifstream fin(filename, std::ios::binary); PADDLE_ENFORCE(static_cast(fin), - "Cannot open file %s for load_combine op", filename); + "OP(LoadCombine) failed to open file %s, please check " + "whether the model file is complete or damaged.", + filename); LoadParamsFromBuffer(ctx, place, &fin, load_as_fp16, out_var_names); } else { - PADDLE_ENFORCE(!filename.empty(), "Cannot load file from memory"); + PADDLE_ENFORCE(!filename.empty(), + "OP(LoadCombine) failed to load the model from memory: " + "the given buffer is empty, please check whether the " + "model content is complete or damaged."); std::stringstream fin(filename, std::ios::in | std::ios::binary); LoadParamsFromBuffer(ctx, place, &fin, load_as_fp16, out_var_names); } diff --git a/paddle/fluid/operators/lod_reset_op.cc b/paddle/fluid/operators/lod_reset_op.cc index 458037c5..409f8397 100644 --- a/paddle/fluid/operators/lod_reset_op.cc +++ b/paddle/fluid/operators/lod_reset_op.cc @@ -36,7 +36,10 @@ class LoDResetOp : public framework::OperatorWithKernel { } else if (ctx->IsRuntime()) { ctx->ShareLoD("Y", "Out"); } - + auto append = ctx->Attrs().Get("append"); + if (append) { + ctx->ShareLoD("X", /*->*/ "Out"); + } ctx->SetOutputDim("Out", ctx->GetInputDim("X")); } @@ -53,10 +56,14 @@ class LoDResetOpVarTypeInference : public framework::VarTypeInference { void operator()(framework::InferVarTypeContext *ctx) const
override { auto x_var_name = ctx->Input("X").front(); auto out_var_name = ctx->Output("Out").front(); + bool append = boost::get(ctx->GetAttr("append")); if (ctx->HasInput("Y")) { auto y_var_name = ctx->Input("Y").front(); auto y_lod_level = std::max(ctx->GetLoDLevel(y_var_name), 1); ctx->SetLoDLevel(out_var_name, y_lod_level); + } else if (append) { + auto x_lod_level = std::max(ctx->GetLoDLevel(x_var_name), 1); + ctx->SetLoDLevel(out_var_name, x_lod_level); } else { ctx->SetLoDLevel(out_var_name, 1); } @@ -84,6 +91,7 @@ class LoDResetOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr>("target_lod", "The target level 0 LoD from Attr().") .SetDefault(std::vector{}); + AddAttr("append", "Append data to lod vector.").SetDefault(false); AddComment(R"DOC(LoDReset operator Set LoD of `X` to a new one specified by `Y` or attribute `target_lod`. When `Y` diff --git a/paddle/fluid/operators/lod_reset_op.h b/paddle/fluid/operators/lod_reset_op.h index 1c2f0b0a..d827f2a2 100644 --- a/paddle/fluid/operators/lod_reset_op.h +++ b/paddle/fluid/operators/lod_reset_op.h @@ -29,6 +29,7 @@ class LoDResetKernel : public framework::OpKernel { auto* out = ctx.Output("Out"); auto* in = ctx.Input("X"); auto* lod_t = ctx.Input("Y"); + bool append = ctx.Attr("append"); out->ShareDataWith(*in); @@ -71,9 +72,14 @@ class LoDResetKernel : public framework::OpKernel { std::vector ulevel0(level0.size(), 0); std::transform(level0.begin(), level0.end(), ulevel0.begin(), [](int a) { return static_cast(a); }); - framework::LoD target_lod; - target_lod.push_back(ulevel0); - out->set_lod(target_lod); + if (append) { + auto* out_lod = out->mutable_lod(); + out_lod->push_back(ulevel0); + } else { + framework::LoD target_lod; + target_lod.push_back(ulevel0); + out->set_lod(target_lod); + } } }; diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 8716662f..f9e12e01 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -15,7 +15,6 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/lookup_table_op.h" -#include "paddle/fluid/platform/assert.h" #include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/float16.h" @@ -32,8 +31,16 @@ __global__ void LookupTable(T *output, const T *table, const int64_t *ids, while (idy < K) { int64_t id = ids[idy]; - PADDLE_ASSERT_MSG(id >= 0, "received id:", id); - PADDLE_ASSERT_MSG(id < N, "received id:", id); + PADDLE_ENFORCE( + id >= 0, + "Variable value (input) of OP(fluid.layers.embedding) " + "expected >= 0 and < %ld, but got %ld. Please check input value.", + N, id); + PADDLE_ENFORCE( + id < N, + "Variable value (input) of OP(fluid.layers.embedding) " + "expected >= 0 and < %ld, but got %ld. Please check input value.", + N, id); T *out = output + idy * D; const T *tab = table + id * D; for (int i = idx; i < D; i += BlockDimX) { @@ -59,8 +66,16 @@ __global__ void LookupTableGrad(T *table, const T *output, const int64_t *ids, while (idy < K) { int64_t id = ids[idy]; - PADDLE_ASSERT_MSG(id >= 0, "received id:", id); - PADDLE_ASSERT_MSG(id < N, "received id:", id); + PADDLE_ENFORCE( + id >= 0, + "Variable value (input) of OP(fluid.layers.embedding) " + "expected >= 0 and < %ld, but got %ld. Please check input value.", + N, id); + PADDLE_ENFORCE( + id < N, + "Variable value (input) of OP(fluid.layers.embedding) " + "expected >= 0 and < %ld, but got %ld. 
Please check input value.", + N, id); const T *out = output + idy * D; T *tab = table + id * D; for (int i = idx; i < D; i += BlockDimX) { @@ -82,46 +97,27 @@ class LookupTableCUDAKernel : public framework::OpKernel { auto id_name = context.Inputs("Ids").front(); auto out_name = context.Outputs("Out").front(); - // for remote prefetch - auto epmap = context.Attr>("epmap"); - auto height_sections = - context.Attr>("height_sections"); - auto table_names = context.Attr>("table_names"); - - if (!epmap.empty()) { -// if epmap is not empty, then the parameter will be fetched from remote -// parameter -// server -#ifdef PADDLE_WITH_DISTRIBUTE - operators::distributed::prefetch(id_name, out_name, table_names, epmap, - height_sections, context, - context.scope()); -#else - PADDLE_THROW( - "paddle is not compiled with distribute support, can not do " - "parameter prefetch!"); -#endif - } else { - size_t N = table_t->dims()[0]; - size_t D = table_t->dims()[1]; - size_t K = ids_t->numel(); - - auto *ids = ids_t->data(); - auto *table = table_t->data(); - auto *output = output_t->mutable_data(context.GetPlace()); - - dim3 threads(128, 8); - dim3 grids(8, 1); - - if (padding_idx == -1) - LookupTable<<< - grids, threads, 0, context.cuda_device_context().stream()>>>( - output, table, ids, N, K, D, padding_idx); - else - LookupTable<<< - grids, threads, 0, context.cuda_device_context().stream()>>>( - output, table, ids, N, K, D, padding_idx); - } + size_t N = table_t->dims()[0]; + size_t D = table_t->dims()[1]; + size_t K = ids_t->numel(); + + auto *ids = ids_t->data(); + auto *table = table_t->data(); + auto *output = output_t->mutable_data(context.GetPlace()); + + dim3 threads(128, 8); + dim3 grids(8, 1); + + if (padding_idx == -1) + LookupTable< + T, 128, 8, 8, + false><<>>( + output, table, ids, N, K, D, padding_idx); + else + LookupTable< + T, 128, 8, 8, + true><<>>( + output, table, ids, N, K, D, padding_idx); } }; diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index 62e298e0..4863ed17 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -46,6 +46,7 @@ class LookupTableKernel : public framework::OpKernel { auto *table_var = context.InputVar("W"); auto id_name = context.Inputs("Ids").front(); + auto embedding_name = context.Inputs("W").front(); auto out_name = context.Outputs("Out").front(); // for remote prefetch @@ -57,12 +58,12 @@ class LookupTableKernel : public framework::OpKernel { if (remote_prefetch && !epmap.empty()) { // if epmap is not empty, then the parameter will be fetched from remote -// parameter -// server +// parameter server + #ifdef PADDLE_WITH_DISTRIBUTE - operators::distributed::prefetch(id_name, out_name, table_names, epmap, - height_sections, context, - context.scope()); + operators::distributed::prefetch(id_name, out_name, embedding_name, false, + table_names, epmap, height_sections, + context, context.scope()); #else PADDLE_THROW( "paddle is not compiled with distribute support, can not do " @@ -85,8 +86,18 @@ class LookupTableKernel : public framework::OpKernel { if (padding_idx != kNoPadding && ids[i] == padding_idx) { memset(output + i * row_width, 0, row_width * sizeof(T)); } else { - PADDLE_ENFORCE_LT(ids[i], row_number); - PADDLE_ENFORCE_GE(ids[i], 0, "ids %d", i); + PADDLE_ENFORCE_LT( + ids[i], row_number, + "Variable value (input) of OP(fluid.layers.embedding) " + "expected >= 0 and < %ld, but got %ld. 
Please check input " + "value.", + row_number, ids[i]); + PADDLE_ENFORCE_GE( + ids[i], 0, + "Variable value (input) of OP(fluid.layers.embedding) " + "expected >= 0 and < %ld, but got %ld. Please check input " + "value.", + row_number, ids[i]); memcpy(output + i * row_width, table + ids[i] * row_width, row_width * sizeof(T)); } @@ -181,8 +192,8 @@ class LookupTableGradKernel : public framework::OpKernel { auto *ids_data = ids->data(); - int N = table_dim[0]; - int D = table_dim[1]; + int64_t N = table_dim[0]; + int64_t D = table_dim[1]; auto *d_output_data = d_output->data(); auto *d_table_data = d_table->mutable_data(context.GetPlace()); @@ -194,8 +205,16 @@ class LookupTableGradKernel : public framework::OpKernel { // the gradient of padding_idx should be 0, already done by memset, so // do nothing. } else { - PADDLE_ENFORCE_LT(ids_data[i], N); - PADDLE_ENFORCE_GE(ids_data[i], 0); + PADDLE_ENFORCE_LT( + ids_data[i], N, + "Variable value (input) of OP(fluid.layers.embedding) " + "expected >= 0 and < %ld, but got %ld. Please check input value.", + N, ids_data[i]); + PADDLE_ENFORCE_GE( + ids_data[i], 0, + "Variable value (input) of OP(fluid.layers.embedding) " + "expected >= 0 and < %ld, but got %ld. Please check input value.", + N, ids_data[i]); for (int j = 0; j < D; ++j) { d_table_data[ids_data[i] * D + j] += d_output_data[i * D + j]; } diff --git a/paddle/fluid/operators/lookup_table_v2_op.cc b/paddle/fluid/operators/lookup_table_v2_op.cc new file mode 100644 index 00000000..f1b98235 --- /dev/null +++ b/paddle/fluid/operators/lookup_table_v2_op.cc @@ -0,0 +1,192 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/lookup_table_v2_op.h" + +#include <memory> + +#include "paddle/fluid/framework/no_need_buffer_vars_inference.h" +#include "paddle/fluid/framework/var_type_inference.h" + +namespace paddle { +namespace operators { + +class LookupTableV2Op : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasInput("W"), true, + "Input(W) of LookupTableV2Op should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("Ids"), true, + "Input(Ids) of LookupTableV2Op should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, + "Output(Out) of LookupTableV2Op should not be null."); + + auto table_dims = ctx->GetInputDim("W"); + auto ids_dims = ctx->GetInputDim("Ids"); + int ids_rank = ids_dims.size(); + VLOG(5) << "ids rank is " << ids_rank << std::endl; + PADDLE_ENFORCE_EQ(table_dims.size(), 2); + + auto output_dims = framework::vectorize(ids_dims); + output_dims.push_back(table_dims[1]); + ctx->SetOutputDim("Out", framework::make_ddim(output_dims)); + + if (ctx->GetOutputsVarType("Out")[0] == + framework::proto::VarType::LOD_TENSOR) { + ctx->ShareLoD("Ids", /*->*/ "Out"); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class LookupTableV2OpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("W", + "(Tensor) The input represents embedding tensors, " + "which is a learnable parameter."); + AddInput("Ids", + "An input with type int32 or int64, which " + "contains the ids to be looked up in W."); + AddOutput("Out", "The lookup results, which have the same type as W."); + AddAttr("is_sparse", + "(boolean, default false) " + "Sparse update.") + .SetDefault(false); + AddAttr("is_distributed", + "(boolean, default false) distributed lookup table.") + .SetDefault(false); + AddAttr("padding_idx", + "(int64, default -1) " + "If the value is -1, it has no effect on the lookup. " + "Otherwise the given value indicates padding the output " + "with zeros whenever lookup encounters it in Ids.") + .SetDefault(kNoPadding); + + // for parameter prefetch + AddAttr("remote_prefetch", "").SetDefault(false); + AddAttr("trainer_id", "trainer id, from 0 to worker_num - 1.").SetDefault(0); + AddAttr>("height_sections", + "Height for each output SelectedRows.") + .SetDefault(std::vector({})); + AddAttr>( + "epmap", + "(string vector, default 127.0.0.1:6164) " + "Server endpoints in the order of input variables for mapping") + .SetDefault({}); + AddAttr>( + "table_names", + "(string vector, the split table names that will be fetched from " + "parameter server) " + "in the order of input variables for mapping") + .SetDefault({}); + + AddComment(R"DOC( +Lookup Table V2 Operator. + +This operator is used to perform lookups on the parameter W, and the +looked-up rows are concatenated into a dense tensor. + +The input Ids may or may not carry LoD (Level of Details) information. +The output only shares the LoD information with the input Ids.
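+
+A minimal sketch of the dense lookup path (the sparse and distributed paths
+are handled separately); padding_idx refers to the attribute defined above:
+
+    Out.shape = Ids.shape + [W.shape[1]]
+    for each flattened position i in Ids:
+        Out[i, :] = W[Ids[i], :]
+    if padding_idx != -1, rows where Ids[i] == padding_idx are set to zeros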
+ +)DOC"); + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(LookupTableV2GradOpNoBuffer, "W"); + +class LookupTableV2GradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + + op->SetType("lookup_table_v2_grad"); + + op->SetInput("W", Input("W")); + op->SetInput("Ids", Input("Ids")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + + op->SetOutput(framework::GradVarName("W"), InputGrad("W")); + + op->SetAttrMap(Attrs()); + return op; + } +}; + +class LookupTableV2OpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + auto table_dims = ctx->GetInputDim("W"); + ctx->SetOutputDim(framework::GradVarName("W"), table_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar( + ctx.InputVar(framework::GradVarName("Out"))); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class LookupTableV2OpGradVarTypeInference : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext* ctx) const override { + auto out_var_name = ctx->Output(framework::GradVarName("W")).front(); + auto attr = ctx->GetAttr("is_sparse"); + bool is_sparse = boost::get(attr); + if (is_sparse) { + VLOG(3) << "lookup_table_v2_grad op " << framework::GradVarName("W") + << " is set to SelectedRows"; + ctx->SetType(out_var_name, framework::proto::VarType::SELECTED_ROWS); + } else { + VLOG(3) << "lookup_table_v2_grad op " << framework::GradVarName("W") + << " is set to LoDTensor"; + ctx->SetType(out_var_name, framework::proto::VarType::LOD_TENSOR); + } + ctx->SetDataType(out_var_name, ctx->GetDataType(ctx->Input("W")[0])); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(lookup_table_v2, ops::LookupTableV2Op, + ops::LookupTableV2OpMaker, ops::LookupTableV2GradOpDescMaker); + +REGISTER_OPERATOR(lookup_table_v2_grad, ops::LookupTableV2OpGrad, + ops::LookupTableV2GradOpNoBuffer, + ops::LookupTableV2OpGradVarTypeInference); + +REGISTER_OP_CPU_KERNEL(lookup_table_v2, ops::LookupTableV2Kernel, + ops::LookupTableV2Kernel); +REGISTER_OP_CPU_KERNEL(lookup_table_v2_grad, + ops::LookupTableV2GradKernel, + ops::LookupTableV2GradKernel); diff --git a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu new file mode 100644 index 00000000..e7f580c5 --- /dev/null +++ b/paddle/fluid/operators/lookup_table_v2_op.cu @@ -0,0 +1,201 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/lookup_table_v2_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +template +__global__ void LookupTableV2(T *output, const T *table, const int64_t *ids, + const int64_t N, const int64_t K, const int64_t D, + const int64_t padding_idx) { + int idx = threadIdx.x; + int idy = blockIdx.x + threadIdx.y * GridDimX; + + while (idy < K) { + int64_t id = ids[idy]; + PADDLE_ENFORCE( + id >= 0, + "Variable value (input) of OP(fluid.layers.embedding) " + "expected >= 0 and < %ld, but got %ld. Please check input value.", + N, id); + PADDLE_ENFORCE( + id < N, + "Variable value (input) of OP(fluid.layers.embedding) " + "expected >= 0 and < %ld, but got %ld. Please check input value.", + N, id); + T *out = output + idy * D; + const T *tab = table + id * D; + for (int i = idx; i < D; i += BlockDimX) { + if (PaddingFlag) { + if (id == padding_idx) + out[i] = static_cast(0); + else + out[i] = tab[i]; + } else { + out[i] = tab[i]; + } + } + idy += BlockDimY * GridDimX; + } +} + +template +__global__ void LookupTableV2Grad(T *table, const T *output, const int64_t *ids, + const int64_t N, const int64_t K, + const int64_t D) { + int idx = threadIdx.x; + int idy = blockIdx.x + threadIdx.y * GridDimX; + + while (idy < K) { + int64_t id = ids[idy]; + PADDLE_ENFORCE( + id >= 0, + "Variable value (input) of OP(fluid.layers.embedding) " + "expected >= 0 and < %ld, but got %ld. Please check input value.", + N, id); + PADDLE_ENFORCE( + id < N, + "Variable value (input) of OP(fluid.layers.embedding) " + "expected >= 0 and < %ld, but got %ld. Please check input value.", + N, id); + const T *out = output + idy * D; + T *tab = table + id * D; + for (int i = idx; i < D; i += BlockDimX) { + paddle::platform::CudaAtomicAdd(&tab[i], out[i]); + } + idy += BlockDimY * GridDimX; + } +} + +template +class LookupTableV2CUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *table_t = context.Input("W"); + auto *ids_t = context.Input("Ids"); + auto *output_t = context.Output("Out"); + int64_t padding_idx = context.Attr("padding_idx"); + + auto id_name = context.Inputs("Ids").front(); + auto out_name = context.Outputs("Out").front(); + + size_t N = table_t->dims()[0]; + size_t D = table_t->dims()[1]; + size_t K = ids_t->numel(); + + auto *ids = ids_t->data(); + auto *table = table_t->data(); + auto *output = output_t->mutable_data(context.GetPlace()); + + dim3 threads(128, 8); + dim3 grids(8, 1); + + if (padding_idx == -1) + LookupTableV2< + T, 128, 8, 8, + false><<>>( + output, table, ids, N, K, D, padding_idx); + else + LookupTableV2< + T, 128, 8, 8, + true><<>>( + output, table, ids, N, K, D, padding_idx); + } +}; + +template +class LookupTableV2GradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto &dev_ctx = + context.template device_context(); + bool is_sparse = context.Attr("is_sparse"); + + // Since paddings are not trainable and fixed in forward, the gradient of + // paddings makes no sense and we don't deal with it in backward. 
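+    // A sketch of what the two branches below compute: the dense branch
+    // accumulates rows of d(Out) into rows of d(W), i.e.
+    //   for each k in [0, K): d_table[ids[k], :] += d_output[k, :]
+    // (so with ids = {1, 1, 3}, row 1 of d(W) receives the sum of the
+    // first two rows of d(Out)), while the sparse branch builds a
+    // SelectedRows gradient whose rows are exactly the looked-up ids.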
+ if (is_sparse) { + auto *ids = context.Input("Ids"); + auto *table = context.Input("W"); + auto *d_output = context.Input(framework::GradVarName("Out")); + auto *d_table = context.Output(framework::GradVarName("W")); + + auto *ids_data = ids->data(); + int64_t ids_num = ids->numel(); + + auto stream = dev_ctx.stream(); + // copy GPU memory to CPU pinned memory + framework::Vector new_rows; + new_rows.resize(ids_num); + auto gpu_place = boost::get(context.GetPlace()); + + // TODO(yuyang18): Strange code here. + memory::Copy(gpu_place, new_rows.CUDAMutableData(context.GetPlace()), + gpu_place, ids_data, ids_num * sizeof(int64_t), stream); + d_table->set_rows(new_rows); + + auto *d_table_value = d_table->mutable_value(); + d_table_value->Resize({ids_num, table->dims()[1]}); + d_table_value->mutable_data(context.GetPlace()); + + auto *d_table_data = d_table_value->data(); + auto *d_output_data = d_output->data(); + auto d_output_dims = d_output->dims(); + PADDLE_ENFORCE_EQ( + d_table_value->dims(), + framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1)); + memory::Copy(gpu_place, d_table_data, gpu_place, d_output_data, + d_output->numel() * sizeof(T), stream); + + } else { + auto ids_t = context.Input("Ids"); + auto d_output_t = context.Input(framework::GradVarName("Out")); + auto d_table_t = context.Output(framework::GradVarName("W")); + + int N = d_table_t->dims()[0]; + int D = d_table_t->dims()[1]; + int K = ids_t->numel(); + const int64_t *ids = ids_t->data(); + const T *d_output = d_output_t->data(); + T *d_table = d_table_t->mutable_data(context.GetPlace()); + + auto t = framework::EigenVector::Flatten(*d_table_t); + t.device(*dev_ctx.eigen_device()) = t.constant(static_cast(0)); + + dim3 threads(128, 8); + dim3 grids(8, 1); + LookupTableV2Grad<<>>( + d_table, d_output, ids, N, K, D); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_CUDA_KERNEL(lookup_table_v2, ops::LookupTableV2CUDAKernel, + ops::LookupTableV2CUDAKernel, + ops::LookupTableV2CUDAKernel); +REGISTER_OP_CUDA_KERNEL(lookup_table_v2_grad, + ops::LookupTableV2GradCUDAKernel, + ops::LookupTableV2GradCUDAKernel, + ops::LookupTableV2GradCUDAKernel); diff --git a/paddle/fluid/operators/lookup_table_v2_op.h b/paddle/fluid/operators/lookup_table_v2_op.h new file mode 100644 index 00000000..16f4d7c4 --- /dev/null +++ b/paddle/fluid/operators/lookup_table_v2_op.h @@ -0,0 +1,218 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/math/blas.h" + +#ifdef PADDLE_WITH_DISTRIBUTE +#include "paddle/fluid/operators/distributed/parameter_prefetch.h" +#endif + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; +using DDim = framework::DDim; + +constexpr int64_t kNoPadding = -1; + +template +class LookupTableV2Kernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *ids_t = context.Input("Ids"); // int tensor + auto *output_t = context.Output("Out"); // float tensor + auto *table_var = context.InputVar("W"); + + auto id_name = context.Inputs("Ids").front(); + auto embedding_name = context.Inputs("W").front(); + auto out_name = context.Outputs("Out").front(); + + // for remote prefetch + auto epmap = context.Attr>("epmap"); + auto remote_prefetch = context.Attr("remote_prefetch"); + auto height_sections = + context.Attr>("height_sections"); + auto table_names = context.Attr>("table_names"); + + if (remote_prefetch && !epmap.empty()) { +// if epmap is not empty, then the parameter will be fetched from remote +// parameter server + +#ifdef PADDLE_WITH_DISTRIBUTE + operators::distributed::prefetch(id_name, out_name, embedding_name, false, + table_names, epmap, height_sections, + context, context.scope()); +#else + PADDLE_THROW( + "paddle is not compiled with distribute support, can not do " + "parameter prefetch!"); +#endif + } else { + int64_t padding_idx = context.Attr("padding_idx"); + int64_t *ids = const_cast(ids_t->data()); + int64_t ids_numel = ids_t->numel(); + + if (table_var->IsType()) { + auto *table_t = context.Input("W"); + int64_t row_number = table_t->dims()[0]; + int64_t row_width = table_t->dims()[1]; + + auto *table = table_t->data(); + auto *output = output_t->mutable_data(context.GetPlace()); + + for (int64_t i = 0; i < ids_numel; ++i) { + if (padding_idx != kNoPadding && ids[i] == padding_idx) { + memset(output + i * row_width, 0, row_width * sizeof(T)); + } else { + PADDLE_ENFORCE_LT( + ids[i], row_number, + "Variable value (input) of OP(fluid.layers.embedding) " + "expected >= 0 and < %ld, but got %ld. Please check input " + "value.", + row_number, ids[i]); + PADDLE_ENFORCE_GE( + ids[i], 0, + "Variable value (input) of OP(fluid.layers.embedding) " + "expected >= 0 and < %ld, but got %ld. 
Please check input "
+                "value.",
+                row_number, ids[i]);
+            memcpy(output + i * row_width, table + ids[i] * row_width,
+                   row_width * sizeof(T));
+          }
+        }
+      } else if (table_var->IsType<SelectedRows>()) {
+        const auto &table_t = table_var->Get<SelectedRows>();
+        int64_t row_width = table_t.value().dims()[1];
+        const auto *table = table_t.value().data<T>();
+        auto *output = output_t->mutable_data<T>(context.GetPlace());
+
+        auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+        for (int64_t i = 0; i < ids_numel; ++i) {
+          if (padding_idx != kNoPadding && ids[i] == padding_idx) {
+            memset(output + i * row_width, 0, row_width * sizeof(T));
+          } else {
+            PADDLE_ENFORCE_GE(ids[i], 0, "The input id should be >= 0.");
+            auto id_index = table_t.Index(ids[i]);
+            PADDLE_ENFORCE_GE(id_index, 0, "the input key should exist.");
+            blas.VCOPY(row_width, table + id_index * row_width,
+                       output + i * row_width);
+          }
+        }
+      }
+    }
+  }
+};
+
+template <typename T>
+class LookupTableV2GradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *table_var = context.InputVar("W");
+    DDim table_dim;
+    if (table_var->IsType<LoDTensor>()) {
+      table_dim = context.Input<LoDTensor>("W")->dims();
+    } else if (table_var->IsType<SelectedRows>()) {
+      auto *table_t = context.Input<SelectedRows>("W");
+      table_dim = table_t->value().dims();
+    } else {
+      PADDLE_THROW(
+          "The parameter W of a LookupTableV2 "
+          "must be either LoDTensor or SelectedRows");
+    }
+
+    int64_t padding_idx = context.Attr<int64_t>("padding_idx");
+    bool is_sparse = context.Attr<bool>("is_sparse");
+    // Since paddings are not trainable and fixed in forward, the gradient of
+    // paddings makes no sense and we don't deal with it in backward.
+    if (is_sparse) {
+      auto *ids = context.Input<LoDTensor>("Ids");
+      auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
+      auto *d_table =
+          context.Output<SelectedRows>(framework::GradVarName("W"));
+
+      auto *ids_data = ids->data<int64_t>();
+      int64_t ids_num = ids->numel();
+
+      std::vector<int64_t> new_rows;
+      new_rows.resize(ids_num);
+      std::memcpy(&new_rows[0], ids_data, ids_num * sizeof(int64_t));
+      d_table->set_rows(new_rows);
+
+      auto *d_table_value = d_table->mutable_value();
+      d_table_value->Resize({ids_num, table_dim[1]});
+
+      d_table_value->mutable_data<T>(context.GetPlace());
+
+      d_table->set_height(table_dim[0]);
+
+      auto *d_output_data = d_output->data<T>();
+      auto *d_table_data = d_table_value->data<T>();
+
+      auto d_output_dims = d_output->dims();
+      PADDLE_ENFORCE_EQ(
+          d_table_value->dims(),
+          framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1));
+      memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel());
+
+    } else {
+      auto *ids = context.Input<LoDTensor>("Ids");
+      auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
+      auto *d_table = context.Output<LoDTensor>(framework::GradVarName("W"));
+
+      auto *ids_data = ids->data<int64_t>();
+
+      int64_t N = table_dim[0];
+      int64_t D = table_dim[1];
+
+      auto *d_output_data = d_output->data<T>();
+      auto *d_table_data = d_table->mutable_data<T>(context.GetPlace());
+
+      memset(d_table_data, 0, d_table->numel() * sizeof(T));
+
+      for (int64_t i = 0; i < ids->numel(); ++i) {
+        if (padding_idx != kNoPadding && ids_data[i] == padding_idx) {
+          // the gradient of padding_idx should be 0, already done by memset,
+          // so do nothing.
+        } else {
+          PADDLE_ENFORCE_LT(
+              ids_data[i], N,
+              "Variable value (input) of OP(fluid.layers.embedding) "
+              "expected >= 0 and < %ld, but got %ld. Please check input value.",
+              N, ids_data[i]);
+          PADDLE_ENFORCE_GE(
+              ids_data[i], 0,
+              "Variable value (input) of OP(fluid.layers.embedding) "
+              "expected >= 0 and < %ld, but got %ld. Please check input value.",
+              N, ids_data[i]);
+          for (int j = 0; j < D; ++j) {
+            d_table_data[ids_data[i] * D + j] += d_output_data[i * D + j];
+          }
+        }
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/lstm_unit_op.cu b/paddle/fluid/operators/lstm_unit_op.cu
index acf09423..87451cb1 100644
--- a/paddle/fluid/operators/lstm_unit_op.cu
+++ b/paddle/fluid/operators/lstm_unit_op.cu
@@ -19,7 +19,6 @@ https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op_gpu.c
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/cross_entropy_op.h"
 #include "paddle/fluid/operators/lstm_unit_op.h"
-#include "paddle/fluid/platform/assert.h"
 #include "paddle/fluid/platform/hostdevice.h"
 
 namespace paddle {
diff --git a/paddle/fluid/operators/match_matrix_tensor_op.cc b/paddle/fluid/operators/match_matrix_tensor_op.cc
new file mode 100644
index 00000000..e9a645d2
--- /dev/null
+++ b/paddle/fluid/operators/match_matrix_tensor_op.cc
@@ -0,0 +1,334 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include
+#include
+#include
+#include
+
+#include "paddle/fluid/operators/match_matrix_tensor_op.h"
+#include "paddle/fluid/operators/search_compute.h"
+
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using LoD = framework::LoD;
+
+void MatchMatrixTensorOP::InferShape(framework::InferShapeContext* ctx) const {
+  PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
+                    "X(Input) of MatchMatrix should not be null.");
+  PADDLE_ENFORCE_EQ(ctx->HasInput("Y"), true,
+                    "Y(Input) of MatchMatrix should not be null.");
+  PADDLE_ENFORCE_EQ(ctx->HasInput("W"), true,
+                    "W(Input) of MatchMatrix should not be null.");
+  PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
+                    "Out(Output) of MatchMatrix should not be null.");
+  PADDLE_ENFORCE_EQ(ctx->HasOutput("Tmp"), true,
+                    "Tmp(Output) of MatchMatrix should not be null.");
+
+  auto x_dims = ctx->GetInputDim("X");
+  PADDLE_ENFORCE_EQ(x_dims.size(), 2, "The rank of Input(X) should be 2.");
+
+  auto y_dims = ctx->GetInputDim("Y");
+  PADDLE_ENFORCE_EQ(y_dims.size(), 2, "The rank of Input(Y) should be 2.");
+
+  auto w_dims = ctx->GetInputDim("W");
+  PADDLE_ENFORCE_EQ(w_dims.size(), 3UL, "W should be a 3-D tensor.");
+
+  int dim_t = ctx->Attrs().Get<int>("dim_t");
+  PADDLE_ENFORCE_EQ(w_dims[0], x_dims[1],
+                    "W's shape must satisfy: W[0] = X[1]");
+  PADDLE_ENFORCE_EQ(w_dims[1], dim_t, "W's shape must satisfy: W[1] = dim_t");
+  PADDLE_ENFORCE_EQ(w_dims[2], y_dims[1],
+                    "W's shape must satisfy: W[2] = Y[1]");
+
+  int out_dim_0 = -1;
+  int tmp_dim_0 = -1;
+  if (ctx->IsRuntime()) {
+    framework::Variable* x_var =
+        boost::get<framework::Variable*>(ctx->GetInputVarPtrs("X")[0]);
+    const auto& x_lod = x_var->Get<LoDTensor>().lod();
+    PADDLE_ENFORCE_EQ(x_lod.empty(), false, "The Input(X) must hold lod info.");
+    const auto& x_lod_0 = x_lod[0];
+    PADDLE_ENFORCE_GE(x_lod_0.size(), 2,
+                      "The Input(X)'s lod info is corrupted.");
+    PADDLE_ENFORCE_EQ(
+        x_dims[0], static_cast<int64_t>(x_lod_0.back()),
+        "The Input(X)'s lod info mismatches the actual tensor shape.");
+
+    framework::Variable* y_var =
+        boost::get<framework::Variable*>(ctx->GetInputVarPtrs("Y")[0]);
+    const auto& y_lod = y_var->Get<LoDTensor>().lod();
+    PADDLE_ENFORCE_EQ(y_lod.empty(), false, "The Input(Y) must hold lod info.");
+    const auto& y_lod_0 = y_lod[0];
+    PADDLE_ENFORCE_GE(y_lod_0.size(), 2,
+                      "The Input(Y)'s lod info is corrupted.");
+    PADDLE_ENFORCE_EQ(
+        y_dims[0], static_cast<int64_t>(y_lod_0.back()),
+        "The Input(Y)'s lod info mismatches the actual tensor shape.");
+
+    PADDLE_ENFORCE_EQ(x_lod_0.size(), y_lod_0.size(),
+                      "The number of sequences in Input(X) and Input(Y) "
+                      "must be equal.");
+
+    out_dim_0 = 0;
+    for (size_t i = 1; i < x_lod_0.size(); i++) {
+      int x_len = x_lod_0[i] - x_lod_0[i - 1];
+      int y_len = y_lod_0[i] - y_lod_0[i - 1];
+      out_dim_0 += (x_len * y_len);
+    }
+    out_dim_0 *= dim_t;
+
+    tmp_dim_0 = x_dims[0] * dim_t * x_dims[1];
+  } else {
+    // compile time
+    framework::VarDesc* x_desc =
+        boost::get<framework::VarDesc*>(ctx->GetInputVarPtrs("X")[0]);
+    PADDLE_ENFORCE_GE(x_desc->GetLoDLevel(), 1);
+    framework::VarDesc* y_desc =
+        boost::get<framework::VarDesc*>(ctx->GetInputVarPtrs("Y")[0]);
+    PADDLE_ENFORCE_GE(y_desc->GetLoDLevel(), 1);
+  }
+
+  std::vector<int64_t> out_dims_vec{out_dim_0};
+  out_dims_vec.push_back(1);
+  std::vector<int64_t> tmp_dims_vec{tmp_dim_0};
+  tmp_dims_vec.push_back(1);
+  ctx->SetOutputDim("Out", framework::make_ddim(out_dims_vec));
+  ctx->SetOutputDim("Tmp", framework::make_ddim(tmp_dims_vec));
+}
+
+void MatchMatrixTensorOpGrad::InferShape(
+    framework::InferShapeContext* ctx) const {
+  PADDLE_ENFORCE_EQ(
+      ctx->HasInput("X"), true,
+      "Input(X) of MatchMatrixTensorOpGrad should not be null.");
+  PADDLE_ENFORCE_EQ(
+      ctx->HasInput("Y"), true,
+      "Input(Y) of MatchMatrixTensorOpGrad should not be null.");
+  PADDLE_ENFORCE_EQ(
+      ctx->HasInput("W"), true,
+      "Input(W) of MatchMatrixTensorOpGrad should not be null.");
+  PADDLE_ENFORCE_EQ(
+      ctx->HasInput(framework::GradVarName("Out")), true,
+      "Input(Out@GRAD) of MatchMatrixTensorOpGrad should not be null.");
+
+  if (ctx->HasOutput(framework::GradVarName("X"))) {
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
+  }
+  if (ctx->HasOutput(framework::GradVarName("Y"))) {
+    ctx->SetOutputDim(framework::GradVarName("Y"), ctx->GetInputDim("Y"));
+    ctx->ShareLoD("Y", /*->*/ framework::GradVarName("Y"));
+  }
+  if (ctx->HasOutput(framework::GradVarName("W"))) {
+    ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W"));
+  }
+}
+
+void MatchMatrixTensorOpMaker::Make() {
+  AddInput("X",
+           "X (LoDTensor, default LoDTensor<float>) Input variable which "
+           "should contain lod information.");
+  AddInput("Y",
+           "Y (LoDTensor, default LoDTensor<float>) Input variable which "
+           "should contain lod information.");
+  AddInput("W", "W (Tensor), the weight of X and Y.");
+  AddAttr<int>("dim_t", "the dim of W").SetDefault(1);
+  AddOutput("Out",
+            "(LoDTensor, default LoDTensor<float>) Output variable which "
+            "is X * W * Y");
+  AddOutput("Tmp",
+            "(LoDTensor, default LoDTensor<float>) tmp variable which is "
+            "used for X * W");
+  AddComment(R"DOC(
+  Match Matrix Tensor Operator
+
+  This operator computes X * W * Y; only 2-D X and Y are supported.
+  The output is a level-1 LoDTensor:
+    level_0: dim_t
+
+  NOTE: only the 'float32' data type is supported now.
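+
+  For example (the shapes below are illustrative only): given X with
+  shape [5, 3] and lod {0, 2, 5}, Y with shape [4, 3] and lod {0, 3, 4},
+  and dim_t = 2, the two sequence pairs contribute 2 * 3 and 3 * 1
+  positions, so Out has shape [(2 * 3 + 3 * 1) * 2, 1] = [18, 1] and
+  Tmp has shape [5 * 2 * 3, 1] = [30, 1].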
+ + )DOC"); +} + +template +class CPUMatchMatrixTensorOPKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* w = ctx.Input("W"); + auto* out = ctx.Output("Out"); + auto* tmp = ctx.Output("Tmp"); + + int dim_t = ctx.Attr("dim_t"); + int dim_in = x->dims()[1]; + + const auto& offset_l = x->lod()[0]; + const auto& offset_r = y->lod()[0]; + + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { + int len_l = offset_l[b + 1] - offset_l[b]; + int len_r = offset_r[b + 1] - offset_r[b]; + top_size += dim_t * len_l * len_r; + top_offset.push_back(top_size); + } + auto* out_data = out->mutable_data(ctx.GetPlace()); + memset(out_data, 0.0, out->dims()[0] * out->dims()[1] * sizeof(T)); + + auto* bottom_l_data = x->data(); + auto* bottom_r_data = y->data(); + auto* t_data = w->data(); + auto* bottom_l_trans_data = tmp->mutable_data(ctx.GetPlace()); + memset(bottom_l_trans_data, 0.0, + tmp->dims()[0] * tmp->dims()[1] * sizeof(T)); + + auto blas = math::GetBlas(ctx); + + call_gemm(blas, CblasNoTrans, CblasNoTrans, x->dims()[0], dim_t * dim_in, + dim_in, 1.0f, bottom_l_data, t_data, 0.0f, bottom_l_trans_data); + + for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { + for (int t = 0; t < dim_t; t++) { + int len_l = offset_l[b + 1] - offset_l[b]; + int len_r = offset_r[b + 1] - offset_r[b]; + auto* top_data = out_data + top_offset[b] + t * len_l * len_r; + const auto* l_t_data = + bottom_l_trans_data + offset_l[b] * dim_t * dim_in + t * dim_in; + const auto* r_data = bottom_r_data + offset_r[b] * dim_in; + auto blas_2 = math::GetBlas(ctx); + call_gemm_with_lda(blas_2, CblasNoTrans, CblasTrans, len_l, len_r, + dim_in, 1.0f, l_t_data, r_data, 0.0f, top_data, + dim_t * dim_in); + } + } + + framework::LoD out_lod; + out_lod.push_back(top_offset); + + out->set_lod(out_lod); + } +}; + +template +class CPUMatchMatrixTensorOPGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* w = ctx.Input("W"); + auto* tmp = ctx.Input("Tmp"); + + int dim_t = ctx.Attr("dim_t"); + int dim_in = x->dims()[1]; + + const auto& offset_l = x->lod()[0]; + const auto& offset_r = y->lod()[0]; + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { + int len_l = offset_l[b + 1] - offset_l[b]; + int len_r = offset_r[b + 1] - offset_r[b]; + top_size += dim_t * len_l * len_r; + top_offset.push_back(top_size); + } + + auto* bottom_l_data = x->data(); + auto* bottom_r_data = y->data(); + auto* bottom_l_trans_data = tmp->data(); + + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* d_x = ctx.Output(framework::GradVarName("X")); + auto* d_y = ctx.Output(framework::GradVarName("Y")); + + Tensor tmp_grad; + tmp_grad.Resize(tmp->dims()); + auto* d_tmp_data = tmp_grad.mutable_data(ctx.GetPlace()); + auto* top_diff = d_out->data(); + auto* bottom_l_diff = d_x->mutable_data(ctx.GetPlace()); + auto* bottom_r_diff = d_y->mutable_data(ctx.GetPlace()); + auto* bottom_l_trans_diff = const_cast(d_tmp_data); + memset(bottom_l_diff, 0.0, x->dims()[0] * x->dims()[1] * sizeof(T)); + memset(bottom_r_diff, 0.0, y->dims()[0] * y->dims()[1] * sizeof(T)); + memset(bottom_l_trans_diff, 0.0, + tmp->dims()[0] * tmp->dims()[1] * 
sizeof(T)); + + for (size_t b = 0; b < x->lod()[0].size() - 1; b++) { + for (int t = 0; t < dim_t; t++) { + int len_l = offset_l[b + 1] - offset_l[b]; + int len_r = offset_r[b + 1] - offset_r[b]; + + for (int i = 0; i < len_l; i++) { + for (int j = 0; j < len_r; j++) { + auto diff = + top_diff[top_offset[b] + t * len_l * len_r + i * len_r + j]; + auto* l_trans_data = bottom_l_trans_data + + (offset_l[b] + i) * dim_in * dim_t + + t * dim_in; + auto* l_trans_diff = bottom_l_trans_diff + + (offset_l[b] + i) * dim_in * dim_t + + t * dim_in; + auto* r_data = bottom_r_data + (offset_r[b] + j) * dim_in; + auto* r_diff = bottom_r_diff + (offset_r[b] + j) * dim_in; + if (diff != 0.0) { + sse_axpy(r_data, l_trans_diff, dim_in, diff); + sse_axpy(l_trans_data, r_diff, dim_in, diff); + } + } + } + } + } + + auto blas = math::GetBlas(ctx); + + auto* t_data = w->data(); + auto* d_w = ctx.Output(framework::GradVarName("W")); + auto* t_diff = d_w->mutable_data(ctx.GetPlace()); + memset(t_diff, 0.0, w->dims()[0] * w->dims()[1] * w->dims()[2] * sizeof(T)); + // bottom_diff + call_gemm(blas, CblasNoTrans, CblasTrans, x->dims()[0], dim_in, + dim_t * dim_in, 1.0f, bottom_l_trans_diff, t_data, 1.0f, + bottom_l_diff); + + // t_diff + call_gemm(blas, CblasTrans, CblasNoTrans, dim_in, dim_t * dim_in, + x->dims()[0], 1.0f, bottom_l_data, bottom_l_trans_diff, 1.0f, + t_diff); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(match_matrix_tensor, ops::MatchMatrixTensorOP, + ops::MatchMatrixTensorOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(match_matrix_tensor_grad, ops::MatchMatrixTensorOpGrad); + +REGISTER_OP_CPU_KERNEL(match_matrix_tensor, + ops::CPUMatchMatrixTensorOPKernel< + paddle::platform::CPUDeviceContext, float>); +// ops::CPUMatchMatrixTensorOPKernel + +REGISTER_OP_CPU_KERNEL(match_matrix_tensor_grad, + ops::CPUMatchMatrixTensorOPGradKernel< + paddle::platform::CPUDeviceContext, float>); +// ops::CPUMatchMatrixTensorOPGradKernel diff --git a/paddle/fluid/operators/match_matrix_tensor_op.h b/paddle/fluid/operators/match_matrix_tensor_op.h new file mode 100644 index 00000000..b067d1c0 --- /dev/null +++ b/paddle/fluid/operators/match_matrix_tensor_op.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; +class MatchMatrixTensorOP : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; +}; + +class MatchMatrixTensorOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; +}; + +class MatchMatrixTensorOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math.h b/paddle/fluid/operators/math.h index 8cc24200..3b28928a 100644 --- a/paddle/fluid/operators/math.h +++ b/paddle/fluid/operators/math.h @@ -38,5 +38,9 @@ inline HOSTDEVICE float real_log(float x) { return ::logf(x); } inline HOSTDEVICE double real_log(double x) { return ::log(x); } +inline HOSTDEVICE float real_min(float x, float y) { return ::fminf(x, y); } + +inline HOSTDEVICE double real_min(double x, double y) { return ::fmin(x, y); } + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index d4837696..ca0c92b4 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -7,7 +7,7 @@ function(math_library TARGET) set(cc_srcs) set(cu_srcs) set(hip_srcs) - set(math_common_deps device_context framework_proto) + set(math_common_deps device_context framework_proto enforce) set(multiValueArgs DEPS) cmake_parse_arguments(math_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -56,6 +56,7 @@ math_library(sequence_pooling DEPS math_function jit_kernel_helper) math_library(sequence_scale) math_library(softmax DEPS math_function jit_kernel_helper) math_library(beam_search DEPS math_function) +math_library(fc DEPS blas) math_library(matrix_bit_code) diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index ce8109f6..a15dab93 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -112,6 +112,22 @@ class Blas { template void GEMM_FREE(T* data) const; + + template + void CSRMM(const char* transa, const int* m, const int* n, const int* k, + const T* alpha, const char* matdescra, const T* val, + const int* indx, const int* pntrb, const int* pntre, const T* b, + const int* ldb, const T* beta, T* c, const int* ldc) const; + +#if !defined(PADDLE_WITH_CUDA) + template + void MatMulWithHead(const framework::Tensor& mat_a, + const MatDescriptor& dim_a, + const framework::Tensor& mat_b, + const MatDescriptor& dim_b, T alpha, int head_number, + framework::Tensor* mat_out, T beta, + bool mat_y_split_vertical) const; +#endif #endif template @@ -176,6 +192,15 @@ class Blas { int K, T alpha, const T* A, const T* B, T beta, T* C, int batchCount, int64_t strideA, int64_t strideB) const; +#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) + template + void BatchedGEMMWithHead(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, + int W1, int H1, int W2, int H2, T alpha, const T* A, + const T* B, T beta, T* C, int batchCount, + int64_t strideA, int64_t strideB, + int64_t head_number, bool split_b_vertical) const; +#endif + template void MatMul(const framework::Tensor& mat_a, const MatDescriptor& dim_a, const 
framework::Tensor& mat_b, const MatDescriptor& dim_b, @@ -221,6 +246,18 @@ class BlasT : private Blas { void GEMM_FREE(ARGS... args) const { Base()->template GEMM_FREE(args...); } + + template + void CSRMM(ARGS... args) const { + Base()->template CSRMM(args...); + } + +#if !defined(PADDLE_WITH_CUDA) + template + void MatMulWithHead(ARGS... args) const { + Base()->template MatMulWithHead(args...); + } +#endif #endif template diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index 58f7be12..4188e26f 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -31,23 +31,24 @@ template <> struct CUBlas { template static void GEMM(ARGS... args) { - PADDLE_ENFORCE(platform::dynload::cublasSgemm(args...)); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasSgemm(args...)); } template static void AXPY(ARGS... args) { - PADDLE_ENFORCE(platform::dynload::cublasSaxpy(args...)); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasSaxpy(args...)); } template static void GEMV(ARGS... args) { - PADDLE_ENFORCE(platform::dynload::cublasSgemv(args...)); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasSgemv(args...)); } template static void GEMM_STRIDED_BATCH(ARGS... args) { #if CUDA_VERSION >= 8000 - PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched(args...)); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cublasSgemmStridedBatched(args...)); #else PADDLE_THROW("SgemmStridedBatched is not supported on cuda <= 7.5"); #endif @@ -69,7 +70,7 @@ struct CUBlas { VLOG(5) << "use_tensor_op_math: " << (dev_ctx->tensor_core_available() ? "True" : "False"); dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE(platform::dynload::cublasSgemmEx( + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasSgemmEx( handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc)); }); @@ -83,23 +84,24 @@ template <> struct CUBlas { template static void GEMM(ARGS... args) { - PADDLE_ENFORCE(platform::dynload::cublasDgemm(args...)); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDgemm(args...)); } template static void AXPY(ARGS... args) { - PADDLE_ENFORCE(platform::dynload::cublasDaxpy(args...)); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDaxpy(args...)); } template static void GEMV(ARGS... args) { - PADDLE_ENFORCE(platform::dynload::cublasDgemv(args...)); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasDgemv(args...)); } template static void GEMM_STRIDED_BATCH(ARGS... 
args) { #if CUDA_VERSION >= 8000 - PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched(args...)); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cublasDgemmStridedBatched(args...)); #else PADDLE_THROW("DgemmStridedBatched is not supported on cuda <= 7.5"); #endif @@ -120,7 +122,7 @@ struct CUBlas { const float16 *alpha, const float16 *A, int lda, const float16 *B, int ldb, const float16 *beta, float16 *C, int ldc) { - PADDLE_ENFORCE( + PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cublasHgemm(handle, transa, transb, m, n, k, reinterpret_cast(alpha), reinterpret_cast(A), lda, @@ -140,7 +142,7 @@ struct CUBlas { long long int strideC, // NOLINT int batchCount) { #if CUDA_VERSION >= 8000 - PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched( + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasHgemmStridedBatched( handle, transa, transb, m, n, k, reinterpret_cast(alpha), reinterpret_cast(A), lda, strideA, @@ -174,7 +176,7 @@ struct CUBlas { #endif // CUDA_VERSION >= 9000 dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE(platform::dynload::cublasGemmEx( + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasGemmEx( handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo)); }); @@ -356,7 +358,7 @@ void Blas::BatchedGEMM( << (use_tensor_op_math ? "True" : "False"); context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { - PADDLE_ENFORCE(platform::dynload::cublasGemmStridedBatchedEx( + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasGemmStridedBatchedEx( handle, cuTransB, cuTransA, N, M, K, &alpha, B, CUDA_R_32F, ldb, strideB, A, CUDA_R_32F, lda, strideA, &beta, C, CUDA_R_32F, ldc, strideC, batchCount, CUDA_R_32F, algo)); diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index f067e283..e2620bcf 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -128,6 +128,12 @@ struct CBlas { static void VMERF(ARGS... args) { platform::dynload::vmsErf(args...); } +#if !defined(_WIN32) + template + static void CSRMM(ARGS... args) { + platform::dynload::mkl_scsrmm(args...); + } +#endif }; template <> @@ -233,6 +239,12 @@ struct CBlas { static void VMERF(ARGS... args) { platform::dynload::vmdErf(args...); } +#if !defined(_WIN32) + template + static void CSRMM(ARGS... args) { + platform::dynload::mkl_dcsrmm(args...); + } +#endif }; #else @@ -567,6 +579,72 @@ void Blas::BatchedGEMM( #endif } +#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) +template <> +template +void Blas::BatchedGEMMWithHead( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int W1, int H1, int W2, + int H2, T alpha, const T *A, const T *B, T beta, T *C, int batchCount, + int64_t strideA, int64_t strideB, int64_t head_number, + bool split_b_vertical) const { + int lda = (transA == CblasNoTrans) ? W1 : H1; + int ldb = (transB == CblasNoTrans) ? W2 : H2; + auto a_array = std::vector(batchCount); + auto b_array = std::vector(batchCount); + auto c_array = std::vector(batchCount); + + if (split_b_vertical) { + int ldc = W2; + int sub_width = W2 / head_number; + + for (int i = 0; i < head_number; i++) { + int sub_matA_offset = (transA == CblasNoTrans) + ? i * (W1 / head_number) + : i * (W1 / head_number) * H1; + int sub_matB_offset = (transB == CblasNoTrans) + ? 
i * (W2 / head_number)
+                                : i * (W2 / head_number) * H2;
+      int sub_matC_offset = i * W2 / head_number;
+      for (int k = 0; k < batchCount; ++k) {
+        a_array[k] = &A[k * strideA] + sub_matA_offset;
+        b_array[k] = &B[k * strideB] + sub_matB_offset;
+        c_array[k] = &C[k * H1 * W2] + sub_matC_offset;
+      }
+
+      CBlas<T>::GEMM_BATCH(CblasRowMajor, &transA, &transB, &H1, &sub_width,
+                           &H2, &alpha, a_array.data(), &lda, b_array.data(),
+                           &ldb, &beta, c_array.data(), &ldc,
+                           1 /* group_count */, &batchCount);
+    }
+
+  } else {
+    PADDLE_ENFORCE_EQ(W1, H2);
+    int ldc = W2 * head_number;
+    int sub_width = W1 / head_number;
+
+    for (int i = 0; i < head_number; i++) {
+      int sub_matA_offset = (transA == CblasNoTrans)
+                                ? i * (W1 / head_number)
+                                : i * (W1 / head_number) * H1;
+      int sub_matB_offset = (transB == CblasNoTrans)
+                                ? i * (W1 / head_number) * W2
+                                : i * (W1 / head_number);
+      int sub_matC_offset = i * W2;
+      for (int k = 0; k < batchCount; ++k) {
+        a_array[k] = &A[k * strideA] + sub_matA_offset;
+        b_array[k] = &B[k * strideB] + sub_matB_offset;
+        c_array[k] = &C[k * H1 * head_number * W2] + sub_matC_offset;
+      }
+
+      CBlas<T>::GEMM_BATCH(CblasRowMajor, &transA, &transB, &H1, &W2,
+                           &sub_width, &alpha, a_array.data(), &lda,
+                           b_array.data(), &ldb, &beta, c_array.data(), &ldc,
+                           1 /* group_count */, &batchCount);
+    }
+  }
+}
+#endif
+
 template <typename DeviceContext>
 template <typename T>
 void Blas<DeviceContext>::MatMul(const int M, const int N, const int K,
@@ -619,7 +697,11 @@ void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a,
                              mat_b.data<T>(), beta, mat_out->data<T>());
   } else {
     PADDLE_ENFORCE(dim_a.batch_size_ == dim_b.batch_size_ ||
-                       dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0);
+                       dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0,
+                   "dim_a.batch_size should be equal to dim_b.batch_size, or "
+                   "one of dim_a.batch_size and dim_b.batch_size should be 0. "
+                   "But got dim_a.batch_size = %d, dim_b.batch_size = %d.",
+                   dim_a.batch_size_, dim_b.batch_size_);
     this->template BatchedGEMM<T>(
         transA, transB, dim_a.height_, dim_b.width_, dim_a.width_, alpha,
         mat_a.data<T>(), mat_b.data<T>(), beta, mat_out->data<T>(),
@@ -627,6 +709,102 @@ void Blas<DeviceContext>::MatMul(const framework::Tensor &mat_a,
         dim_a.stride_, dim_b.stride_);
   }
 }
+
+#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA)
+/*
+ * Multiply two matrices with multiple heads.
+ *
+ * A new parameter, head_number, is added compared to the normal MatMul.
+ * head_number describes the number of heads a matrix is vertically
+ * split into.
+ *
+ * When this API is called, the multiplication of the two big matrices is
+ * split into several (head_number) multiplications of small matrices. e.g.
+ * if Mat A is [3, 24] and Mat B is [24, 4], multiplying A and B with a
+ * head_number of 4 splits Mat A into 4 matrices of [3, 6] and
+ * (horizontally) splits Mat B into 4 matrices of [6, 4]. The final result
+ * is 4 matrices of [3, 4], i.e. [3, 16].
+ * Another example: A is [3, 8], B is [2, 16], head_number is 4. In this
+ * case, A is split into 4 matrices of [3, 2] and B is (vertically) split
+ * into 4 matrices of [2, 4]. The final result is 4 matrices of [3, 4], i.e.
[3, 16] + */ +template +template +void Blas::MatMulWithHead(const framework::Tensor &mat_a, + const MatDescriptor &dim_a, + const framework::Tensor &mat_b, + const MatDescriptor &dim_b, T alpha, + int head_number, + framework::Tensor *mat_out, T beta, + bool mat_b_split_vertical) const { + PADDLE_ENFORCE_EQ(dim_a.width_ % head_number, 0); + PADDLE_ENFORCE_GE(head_number, 1); + PADDLE_ENFORCE_LE(head_number, dim_a.width_); + CBLAS_TRANSPOSE transA = !dim_a.trans_ ? CblasNoTrans : CblasTrans; + CBLAS_TRANSPOSE transB = !dim_b.trans_ ? CblasNoTrans : CblasTrans; + + if (mat_b_split_vertical) { + PADDLE_ENFORCE_EQ(dim_b.height_, dim_a.width_ / head_number); + PADDLE_ENFORCE_EQ(dim_b.width_ % head_number, 0); + } + + if (dim_a.batch_size_ == 0 && dim_b.batch_size_ == 0) { + int lda = !dim_a.trans_ ? dim_a.width_ : dim_a.height_; + int ldb = !dim_b.trans_ ? dim_b.width_ : dim_b.height_; + int sub_matA_offset; + int sub_matB_offset; + int sub_matC_offset; + int sub_mat_M = dim_a.height_; + int sub_mat_N; + int sub_mat_K; + int ldc; + + for (int i = 0; i < head_number; i++) { + sub_matA_offset = dim_a.trans_ + ? i * (dim_a.width_ / head_number) * dim_a.height_ + : i * (dim_a.width_ / head_number); + if (mat_b_split_vertical) { + sub_matB_offset = dim_b.trans_ + ? i * (dim_b.width_ / head_number) * dim_b.height_ + : i * (dim_b.width_ / head_number); + sub_matC_offset = i * dim_b.width_ / head_number; + + sub_mat_N = dim_b.width_ / head_number; + sub_mat_K = dim_b.height_; + + ldc = dim_b.width_; + } else { + sub_matB_offset = + dim_b.trans_ ? i * (dim_b.height_ / head_number) + : i * (dim_b.height_ / head_number) * dim_b.width_; + sub_matC_offset = i * dim_b.width_; + + sub_mat_N = dim_b.width_; + sub_mat_K = dim_a.width_ / head_number; + + ldc = head_number * dim_b.width_; + } + + this->template GEMM(transA, transB, sub_mat_M, sub_mat_N, sub_mat_K, + alpha, mat_a.data() + sub_matA_offset, lda, + mat_b.data() + sub_matB_offset, ldb, beta, + mat_out->data() + sub_matC_offset, ldc); + } + } else { + PADDLE_ENFORCE_EQ((dim_a.batch_size_ == dim_b.batch_size_ || + dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0), + true); + + this->template BatchedGEMMWithHead( + transA, transB, dim_a.width_, dim_a.height_, dim_b.width_, + dim_b.height_, alpha, mat_a.data(), mat_b.data(), beta, + mat_out->data(), + dim_a.batch_size_ == 0 ? dim_b.batch_size_ : dim_a.batch_size_, + dim_a.stride_, dim_b.stride_, head_number, mat_b_split_vertical); + } +} +#endif + template template void Blas::VINV(int n, const T *a, T *y) const { @@ -652,6 +830,19 @@ void Blas::VMERF(int n, const T *a, T *y, #endif } +#ifdef PADDLE_WITH_MKLML +template <> +template +void Blas::CSRMM( + const char *transa, const int *m, const int *n, const int *k, + const T *alpha, const char *matdescra, const T *val, const int *indx, + const int *pntrb, const int *pntre, const T *b, const int *ldb, + const T *beta, T *c, const int *ldc) const { + CBlas::CSRMM(transa, m, n, k, alpha, matdescra, val, indx, pntrb, pntre, b, + ldb, beta, c, ldc); +} +#endif + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu index 153e6117..5a7cd602 100644 --- a/paddle/fluid/operators/math/concat_and_split.cu +++ b/paddle/fluid/operators/math/concat_and_split.cu @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/framework/mixed_vector.h" +#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/float16.h" @@ -264,8 +265,7 @@ class ConcatFunctor { const T** dev_ins_data = nullptr; if (!has_same_shape || in_num < 2 || in_num > 4) { tmp_dev_ins_data = - platform::DeviceTemporaryAllocator::Instance().Get(context).Allocate( - inputs_data.size() * sizeof(T*)); + memory::Alloc(context, inputs_data.size() * sizeof(T*)); memory::Copy(boost::get(context.GetPlace()), tmp_dev_ins_data->ptr(), platform::CPUPlace(), static_cast(inputs_data.data()), @@ -292,8 +292,7 @@ class ConcatFunctor { } } else { auto tmp_dev_ins_col_data = - platform::DeviceTemporaryAllocator::Instance().Get(context).Allocate( - inputs_col.size() * sizeof(int)); + memory::Alloc(context, inputs_col.size() * sizeof(int)); memory::Copy(boost::get(context.GetPlace()), tmp_dev_ins_col_data->ptr(), platform::CPUPlace(), static_cast(inputs_col.data()), @@ -356,8 +355,7 @@ class SplitFunctor { T** dev_out_gpu_data = nullptr; if (!has_same_shape || o_num < 2 || o_num > 4) { tmp_dev_outs_data = - platform::DeviceTemporaryAllocator::Instance().Get(context).Allocate( - outputs_data.size() * sizeof(T*)); + memory::Alloc(context, outputs_data.size() * sizeof(T*)); memory::Copy(boost::get(context.GetPlace()), tmp_dev_outs_data->ptr(), platform::CPUPlace(), reinterpret_cast(outputs_data.data()), @@ -384,8 +382,9 @@ class SplitFunctor { } } else { auto tmp_dev_ins_col_data = - platform::DeviceTemporaryAllocator::Instance().Get(context).Allocate( - outputs_cols.size() * sizeof(int)); + memory::Alloc(context, + + outputs_cols.size() * sizeof(int)); memory::Copy(boost::get(context.GetPlace()), tmp_dev_ins_col_data->ptr(), platform::CPUPlace(), reinterpret_cast(outputs_cols.data()), diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h index 4406a558..8940a414 100644 --- a/paddle/fluid/operators/math/cpu_vec.h +++ b/paddle/fluid/operators/math/cpu_vec.h @@ -160,7 +160,7 @@ inline void vec_sum(const size_t n, const float* x, end = n & ~(block - 1); __m256 tmp = _mm256_setzero_ps(); for (i = 0; i < end; i += block) { - tmp = _mm256_add_ps(tmp, _mm256_load_ps(x + i)); + tmp = _mm256_add_ps(tmp, _mm256_loadu_ps(x + i)); } __m256 hsum = _mm256_hadd_ps(tmp, tmp); diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu index 5bc05257..2d871c6e 100644 --- a/paddle/fluid/operators/math/cross_entropy.cu +++ b/paddle/fluid/operators/math/cross_entropy.cu @@ -27,7 +27,10 @@ __global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label, const int ignore_index) { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) { - PADDLE_ASSERT(label[i] >= 0 && label[i] < D || label[i] == ignore_index); + PADDLE_ENFORCE(label[i] >= 0 && label[i] < D || label[i] == ignore_index, + "label[%d] expected >= 0 and < %ld, or == %ld, but got " + "%ld. Please check input value.", + i, D, ignore_index, label[i]); Y[i] = ignore_index == label[i] ? 
static_cast(0) : -math::TolerableValue()(real_log(X[i * D + label[i]])); diff --git a/paddle/fluid/operators/math/cross_entropy.h b/paddle/fluid/operators/math/cross_entropy.h index 48082a72..db198189 100644 --- a/paddle/fluid/operators/math/cross_entropy.h +++ b/paddle/fluid/operators/math/cross_entropy.h @@ -25,7 +25,8 @@ namespace math { template struct TolerableValue { HOSTDEVICE T operator()(const T& x) const { - PADDLE_ASSERT(std::is_floating_point::value); + PADDLE_ENFORCE(std::is_floating_point::value, + "TolerableValue should be float in cross_entropy."); const T kApproInf = 1e20; if (x == INFINITY) return kApproInf; diff --git a/paddle/fluid/operators/math/depthwise_conv.cu b/paddle/fluid/operators/math/depthwise_conv.cu index 240cec14..a372f6fa 100644 --- a/paddle/fluid/operators/math/depthwise_conv.cu +++ b/paddle/fluid/operators/math/depthwise_conv.cu @@ -487,8 +487,12 @@ class DepthwiseConvFunctor +class FCFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, const int M, + const int N, const int K, const T* X, const T* W, T* Y, + const T* B = nullptr, bool relu = false) { + auto blas = math::GetBlas(context); + blas.MatMul(M, N, K, X, W, Y); + if (B == NULL) { + return; + } + if (relu) { + auto compute = + jit::KernelFuncs, platform::CPUPlace>::Cache() + .At(N); + for (int i = 0; i < M; i++) { + T* dst = Y + i * N; + compute(B, dst, dst, N); + } + } else { + auto compute = + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + N); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int i = 0; i < M; i++) { + T* dst = Y + i * N; + compute(B, dst, dst, N); + } + } + } +}; + +template class FCFunctor; +template class FCFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/fc.cu b/paddle/fluid/operators/math/fc.cu new file mode 100644 index 00000000..1b22b810 --- /dev/null +++ b/paddle/fluid/operators/math/fc.cu @@ -0,0 +1,73 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/fc.h" + +namespace paddle { +namespace operators { +namespace math { + +template +__global__ void InplaceAddReluKernel(const T* bias, T* data, int M, int N) { + for (int i = blockIdx.x; i < M; i += gridDim.x) { + int index = i * N + threadIdx.x; + for (int j = threadIdx.x; j < N; j += blockDim.x) { + T tmp = data[index] + bias[j]; + if (DoRelu) { + data[index] = (tmp > 0) ? 
tmp : 0; + } else { + data[index] = tmp; + } + index += blockDim.x; + } + } +} + +template +class FCFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, const int M, + const int N, const int K, const T* X, const T* W, T* Y, + const T* B = nullptr, bool relu = false) { + auto blas = math::GetBlas(context); + blas.GEMM(false, false, M, N, K, static_cast(1.0), X, K, W, N, + static_cast(0.0), Y, N); + if (B == NULL) { + return; + } + + const int kThreadsPerBlock = 1024; + int max_threads = context.GetMaxPhysicalThreadCount(); + int num_threads = std::min(kThreadsPerBlock, (((N + 31) >> 5) << 5)); + int num_blocks = std::max(max_threads / num_threads, 1); + if (relu) { + InplaceAddReluKernel< + T, true><<>>(B, Y, M, + N); + } else { + InplaceAddReluKernel< + T, false><<>>(B, Y, M, + N); + } + } +}; + +template class FCFunctor; +template class FCFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/fc.h b/paddle/fluid/operators/math/fc.h new file mode 100644 index 00000000..9bef496f --- /dev/null +++ b/paddle/fluid/operators/math/fc.h @@ -0,0 +1,34 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { +namespace math { + +template +class FCFunctor { + public: + void operator()(const DeviceContext& context, const int M, const int N, + const int K, const T* X, const T* W, T* Y, + const T* B = nullptr, bool relu = false); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/fc_compute.h b/paddle/fluid/operators/math/fc_compute.h deleted file mode 100644 index 66ce5759..00000000 --- a/paddle/fluid/operators/math/fc_compute.h +++ /dev/null @@ -1,55 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/fluid/operators/jit/kernels.h" -#include "paddle/fluid/operators/math/blas.h" - -namespace paddle { -namespace operators { -namespace math { - -template -inline void FCCompute(const BlasT& blas, const int M, - const int N, const int K, const T* X, const T* W, T* Y, - const T* B = NULL, bool relu = false) { - blas.MatMul(M, N, K, X, W, Y); - if (B == NULL) { - return; - } - if (relu) { - auto compute = - jit::KernelFuncs, platform::CPUPlace>::Cache().At( - N); - for (int i = 0; i < M; i++) { - T* dst = Y + i * N; - compute(B, dst, dst, N); - } - } else { - auto compute = - jit::KernelFuncs, platform::CPUPlace>::Cache().At(N); -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (int i = 0; i < M; i++) { - T* dst = Y + i * N; - compute(B, dst, dst, N); - } - } -} - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/im2col.cu b/paddle/fluid/operators/math/im2col.cu index 4897767f..809014ea 100644 --- a/paddle/fluid/operators/math/im2col.cu +++ b/paddle/fluid/operators/math/im2col.cu @@ -66,8 +66,8 @@ class Im2ColFunctor& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* col) { - PADDLE_ENFORCE(im.dims().size() == 3); - PADDLE_ENFORCE(col->dims().size() == 5); + PADDLE_ENFORCE_EQ(im.dims().size(), 3); + PADDLE_ENFORCE_EQ(col->dims().size(), 5); int im_channels = im.dims()[0]; int im_height = im.dims()[1]; @@ -152,8 +152,8 @@ class Col2ImFunctor& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* im) { - PADDLE_ENFORCE(im->dims().size() == 3); - PADDLE_ENFORCE(col.dims().size() == 5); + PADDLE_ENFORCE_EQ(im->dims().size(), 3); + PADDLE_ENFORCE_EQ(col.dims().size(), 5); int im_channels = im->dims()[0]; int im_height = im->dims()[1]; @@ -249,8 +249,8 @@ class Im2ColFunctor& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* col) { - PADDLE_ENFORCE(im.dims().size() == 3); - PADDLE_ENFORCE(col->dims().size() == 5); + PADDLE_ENFORCE_EQ(im.dims().size(), 3); + PADDLE_ENFORCE_EQ(col->dims().size(), 5); int im_channels = im.dims()[0]; int im_height = im.dims()[1]; int im_width = im.dims()[2]; @@ -331,8 +331,8 @@ class Col2ImFunctor& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* im) { - PADDLE_ENFORCE(im->dims().size() == 3); - PADDLE_ENFORCE(col.dims().size() == 5); + PADDLE_ENFORCE_EQ(im->dims().size(), 3); + PADDLE_ENFORCE_EQ(col.dims().size(), 5); int im_channels = im->dims()[0]; int im_height = im->dims()[1]; int im_width = im->dims()[2]; diff --git a/paddle/fluid/operators/math/sample_prob.cu b/paddle/fluid/operators/math/sample_prob.cu index 8f939159..6aabfb06 100644 --- a/paddle/fluid/operators/math/sample_prob.cu +++ b/paddle/fluid/operators/math/sample_prob.cu @@ -142,9 +142,9 @@ void GPUSampleWithProb::operator()( int num_tries = UniqSampler(sampler, num_samples, s_data); VLOG(1) << "num_tries: " << num_tries; - PADDLE_ENFORCE(cudaMemcpy(samples_data + num_true, s_data, - sizeof(int64_t) * num_samples, - cudaMemcpyHostToDevice)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaMemcpy(samples_data + num_true, s_data, + sizeof(int64_t) * num_samples, + cudaMemcpyHostToDevice)); int threads = 512; const size_t size = batch_size * num_sampled_classes; diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index 647d4f14..f73c9bb9 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ 
b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -376,11 +376,115 @@ struct MergeAdd { } }; +template +struct MergeAverage { + framework::SelectedRows operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& input) { + framework::SelectedRows out; + (*this)(context, input, &out); + return out; + } + + void operator()(const platform::CPUDeviceContext& context, + const framework::SelectedRows& input, + framework::SelectedRows* output) { + std::vector inputs; + inputs.push_back(&input); + (*this)(context, inputs, output); + } + + void operator()(const platform::CPUDeviceContext& context, + const std::vector& inputs, + framework::SelectedRows* output) { + if (inputs.size() == 0) { + VLOG(3) << "no input! return"; + return; + } + const framework::SelectedRows* has_value_input = nullptr; + for (auto* in : inputs) { + if (in->rows().size() > 0) { + has_value_input = in; + break; + } + } + if (has_value_input == nullptr) { + VLOG(3) << "no input has value! just return" << std::endl; + return; + } + auto input_width = has_value_input->value().dims()[1]; + auto input_height = has_value_input->height(); + framework::SelectedRows& out = *output; + std::set merged_row_set; + size_t row_num = 0; + for (auto* input : inputs) { + if (input->rows().size() == 0) { + continue; + } + PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1], + "all input should have same " + "dimension except for the first one"); + PADDLE_ENFORCE_EQ(input_height, input->height(), + "all input should have same height"); + row_num += input->rows().size(); + merged_row_set.insert(input->rows().begin(), input->rows().end()); + } + + out.set_height(input_height); + out.mutable_value()->mutable_data( + framework::make_ddim( + {static_cast(merged_row_set.size()), input_width}), + context.GetPlace()); + auto* out_data = out.mutable_value()->data(); + + std::vector merge_rows(merged_row_set.begin(), + merged_row_set.end()); + std::sort(merge_rows.begin(), merge_rows.end()); + + out.set_rows(merge_rows); + + math::SetConstant constant_functor; + constant_functor(context, out.mutable_value(), 0.0); + + std::unordered_map rows_to_id; + for (size_t i = 0; i < merge_rows.size(); ++i) { + rows_to_id[merge_rows[i]] = i; + } + + auto blas = math::GetBlas(context); + for (auto* input : inputs) { + if (input->rows().size() == 0) { + continue; + } + auto* input_data = input->value().data(); + auto& input_rows = input->rows(); + + for (size_t i = 0; i < input_rows.size(); i++) { + size_t out_i = rows_to_id[input_rows[i]]; + elementwise_add_to( + context, &blas, static_cast(input_width), + &input_data[i * input_width], &out_data[out_i * input_width]); + } + } + size_t input_width_cast = static_cast(input_width); + T count = static_cast(inputs.size()); + for (size_t i = 0; i < merge_rows.size(); i++) { + for (size_t j = 0; j < input_width_cast; j++) { + out_data[i * input_width + j] = out_data[i * input_width + j] / count; + } + } + } +}; + template struct MergeAdd; template struct MergeAdd; template struct MergeAdd; template struct MergeAdd; +template struct MergeAverage; +template struct MergeAverage; +template struct MergeAverage; +template struct MergeAverage; + template struct UpdateToTensor { void operator()(const platform::CPUDeviceContext& context, diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index 0d63f641..b3e2c8a6 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ 
b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -55,11 +55,11 @@ struct SelectedRowsAdd { auto* in1_data = in1_value.data(); auto in1_place = input1.place(); - PADDLE_ENFORCE(platform::is_gpu_place(in1_place)); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true); auto in2_place = input2.place(); - PADDLE_ENFORCE(platform::is_gpu_place(in2_place)); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true); auto out_place = context.GetPlace(); - PADDLE_ENFORCE(platform::is_gpu_place(out_place)); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(out_place), true); memory::Copy(boost::get(out_place), out_data, boost::get(in1_place), in1_data, @@ -162,9 +162,9 @@ struct SelectedRowsAddTo { } auto in1_place = input1.place(); - PADDLE_ENFORCE(platform::is_gpu_place(in1_place)); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true); auto in2_place = input2->place(); - PADDLE_ENFORCE(platform::is_gpu_place(in2_place)); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true); auto* in1_data = in1_value.data(); auto* in2_data = in2_value->data(); diff --git a/paddle/fluid/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h index db0ee9bc..a1eb69db 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.h +++ b/paddle/fluid/operators/math/selected_rows_functor.h @@ -93,6 +93,18 @@ struct MergeAdd { const bool sorted_result = false); }; +template +struct MergeAverage { + framework::SelectedRows operator()(const DeviceContext& context, + const framework::SelectedRows& input); + void operator()(const DeviceContext& context, + const framework::SelectedRows& input, + framework::SelectedRows* output); + void operator()(const DeviceContext& context, + const std::vector& inputs, + framework::SelectedRows* output); +}; + enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY }; // out = selected_rows_in / tensor diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cc index 5581b9e0..b7a499aa 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc @@ -223,6 +223,46 @@ TEST(selected_rows_functor, cpu_add_to) { EXPECT_EQ(tensor1_data[9 * row_numel + 6], 5.0); } +TEST(selected_rows_functor, cpu_merge_average_float) { + paddle::platform::CPUPlace cpu_place; + paddle::platform::CPUDeviceContext ctx(cpu_place); + paddle::operators::math::SetConstant + functor; + int64_t height = 10; + int64_t row_numel = 10; + + std::vector rows{0, 4, 4, 7}; + std::unique_ptr selected_rows{ + new paddle::framework::SelectedRows(rows, height)}; + auto* in_value = selected_rows->mutable_value(); + in_value->mutable_data( + paddle::framework::make_ddim( + {static_cast(rows.size()), row_numel}), + cpu_place); + functor(ctx, in_value, 1.0); + + paddle::operators::math::scatter::MergeAverage< + paddle::platform::CPUDeviceContext, float> + merge_average_functor; + paddle::framework::SelectedRows output = + merge_average_functor(ctx, *selected_rows); + + auto out_height = output.height(); + EXPECT_EQ(out_height, height); + + auto& out_rows = output.rows(); + EXPECT_EQ(out_rows[0], 0); + EXPECT_EQ(out_rows[1], 4); + EXPECT_EQ(out_rows[2], 7); + + auto* out_data = output.value().data(); + + EXPECT_EQ(out_data[0 * row_numel], 1.0); + EXPECT_EQ(out_data[1 * row_numel], 2.0); + EXPECT_EQ(out_data[2 * row_numel], 1.0); +} + TEST(selected_rows_functor, cpu_merge_add_float) { paddle::platform::CPUPlace cpu_place; 
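Note: a standalone reference for the MergeAverage semantics introduced above, assuming each input is a list of (row id, dense row) pairs. Duplicate row ids are summed, within one input and across inputs, and each merged row is then divided by the number of inputs, not by the occurrence count. That is why cpu_merge_average_float expects 2.0 for row 4: (1.0 + 1.0) / 1 input.

#include <cstddef>
#include <cstdint>
#include <map>
#include <utility>
#include <vector>

using Entry = std::pair<int64_t, std::vector<double>>;  // (row id, dense row)

std::map<int64_t, std::vector<double>> MergeAverageRef(
    const std::vector<std::vector<Entry>>& inputs, std::size_t width) {
  std::map<int64_t, std::vector<double>> out;
  for (const auto& in : inputs) {
    for (const auto& e : in) {
      auto& row = out[e.first];
      row.resize(width, 0.0);
      for (std::size_t j = 0; j < width; ++j) row[j] += e.second[j];
    }
  }
  const double count = static_cast<double>(inputs.size());  // # of inputs
  for (auto& kv : out) {
    for (auto& v : kv.second) v /= count;
  }
  return out;
}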
paddle::platform::CPUDeviceContext ctx(cpu_place); diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc index 011d45c3..cc3fbd58 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cc +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -37,18 +37,23 @@ class MaxSeqPoolFunctor { public: void operator()(const platform::CPUDeviceContext& context, const framework::LoDTensor& input, T pad_value, - framework::Tensor* output, framework::Tensor* index) { + framework::LoDTensor* output, framework::Tensor* index) { auto in_dims = input.dims(); auto out_dims = output->dims(); auto idx_dims = index->dims(); - PADDLE_ENFORCE_GT(in_dims.size(), 1); - PADDLE_ENFORCE_GT(out_dims.size(), 1); + PADDLE_ENFORCE_GT(in_dims.size(), 1, + "The rank of input shall be greater than 1."); + PADDLE_ENFORCE_GT(out_dims.size(), 1, + "The rank of output shall be greater than 1."); for (int64_t i = 1; i < in_dims.size(); ++i) { - PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]); + PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i], + "The dimension of input and output shall be same."); } - PADDLE_ENFORCE_EQ(idx_dims, out_dims); + PADDLE_ENFORCE_EQ(idx_dims, out_dims, + "The dimension of index and output shall be same."); - auto starts = input.lod()[0]; + auto lod_level = input.lod().size(); + auto starts = input.lod()[lod_level - 1]; const T* in_data = input.data(); T* out_data = output->data(); int* max_index = index->data(); @@ -85,16 +90,20 @@ class MaxSeqPoolFunctor { public: void operator()(const platform::CPUDeviceContext& context, const framework::LoDTensor& input, T pad_value, - framework::Tensor* output, framework::Tensor* index) { + framework::LoDTensor* output, framework::Tensor* index) { auto in_dims = input.dims(); auto out_dims = output->dims(); - PADDLE_ENFORCE_GT(in_dims.size(), 1); - PADDLE_ENFORCE_GT(out_dims.size(), 1); + PADDLE_ENFORCE_GT(in_dims.size(), 1, + "The rank of input shall be greater than 1."); + PADDLE_ENFORCE_GT(out_dims.size(), 1, + "The rank of output shall be greater than 1."); for (int64_t i = 1; i < in_dims.size(); ++i) { - PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]); + PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i], + "The dimension of input and output shall be same."); } - auto starts = input.lod()[0]; + auto lod_level = input.lod().size(); + auto starts = input.lod()[lod_level - 1]; const T* in_data = input.data(); T* out_data = output->data(); @@ -123,18 +132,23 @@ template class MaxSeqPoolGradFunctor { public: void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& out_grad, + const framework::LoDTensor& out_grad, const framework::Tensor& index, framework::LoDTensor* in_grad) { auto og_dims = out_grad.dims(); auto ig_dims = in_grad->dims(); auto idx_dims = index.dims(); - PADDLE_ENFORCE_GT(og_dims.size(), 1); - PADDLE_ENFORCE_GT(ig_dims.size(), 1); + PADDLE_ENFORCE_GT(og_dims.size(), 1, + "The rank of output@Grad shall be greater than 1."); + PADDLE_ENFORCE_GT(ig_dims.size(), 1, + "The rank of input@Grad shall be greater than 1."); for (int64_t i = 1; i < og_dims.size(); ++i) { - PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]); + PADDLE_ENFORCE_EQ( + og_dims[i], ig_dims[i], + "The dimension of input@Grad and output@Grad shall be same."); } - PADDLE_ENFORCE_EQ(idx_dims, og_dims); + PADDLE_ENFORCE_EQ(idx_dims, og_dims, + "The dimension of index and output@Grad shall be same."); const T* og_data = out_grad.data(); const int* max_index = index.data(); @@ -159,14 +173,15 @@ class LastSeqPoolFunctor { public: void 
operator()(const platform::CPUDeviceContext& context, const framework::LoDTensor& input, T pad_value, - framework::Tensor* output) { + framework::LoDTensor* output) { // Create pointers to input and output data auto* in_data = input.data(); auto* out_data = output->data(); // Calculate the size of each item in sequence int64_t item_size = input.numel() / input.dims()[0]; - auto lod = input.lod()[0]; + auto lod_level = input.lod().size(); + auto lod = input.lod()[lod_level - 1]; int seq_num = static_cast(lod.size()) - 1; for (int i = 0; i < seq_num; ++i) { // Calculate the length of each sequence @@ -191,14 +206,15 @@ class FirstSeqPoolFunctor { public: void operator()(const platform::CPUDeviceContext& context, const framework::LoDTensor& input, T pad_value, - framework::Tensor* output) { + framework::LoDTensor* output) { // Create pointers to input and output data auto* in_data = input.data(); auto* out_data = output->data(); // Calculate the size of each item in sequence int64_t item_size = input.numel() / input.dims()[0]; - auto lod = input.lod()[0]; + auto lod_level = input.lod().size(); + auto lod = input.lod()[lod_level - 1]; int seq_num = static_cast(lod.size()) - 1; for (int i = 0; i < seq_num; ++i) { // Calculate the length of each sequence @@ -222,12 +238,15 @@ template class SumSeqPoolGradFunctor { public: void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& out_grad, + const framework::LoDTensor& out_grad, framework::LoDTensor* in_grad) { - auto lod = in_grad->lod()[0]; + auto lod_level = in_grad->lod().size(); + auto lod = in_grad->lod()[lod_level - 1]; int64_t out_w = out_grad.numel() / out_grad.dims()[0]; int64_t in_w = in_grad->numel() / in_grad->dims()[0]; - PADDLE_ENFORCE(in_w == out_w); + PADDLE_ENFORCE_EQ( + in_w, out_w, + "The feature size of input@Grad and output@Grad shall be same."); const T* out_g_data = out_grad.data(); T* in_g_data = in_grad->mutable_data(context.GetPlace()); auto blas = math::GetBlas(context); @@ -250,8 +269,9 @@ class SequencePoolFunctor { /* max pool has index output */ void operator()(const platform::CPUDeviceContext& context, const std::string pooltype, T pad_value, - const framework::LoDTensor& input, framework::Tensor* output, - bool is_test, framework::Tensor* index = nullptr) { + const framework::LoDTensor& input, + framework::LoDTensor* output, bool is_test, + framework::Tensor* index = nullptr) { if (pooltype == "MAX") { if (is_test) { math::MaxSeqPoolFunctor max_pool; @@ -272,11 +292,13 @@ class SequencePoolFunctor { first_pool(context, input, pad_value, output); return; } - - auto lod = input.lod()[0]; + auto lod_level = input.lod().size(); + auto lod = input.lod()[lod_level - 1]; if (pooltype == "SUM") { auto place = context.GetPlace(); - PADDLE_ENFORCE(platform::is_cpu_place(place)); + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(place), true, + "Sequence_pool should run on CPU Device when pooltype is SUM"); const T* src = input.data(); T* dst = output->mutable_data(place); jit::seq_pool_attr_t attr( @@ -330,7 +352,8 @@ template class SequencePoolGradFunctor { public: void operator()(const platform::CPUDeviceContext& context, - const std::string pooltype, const framework::Tensor& out_grad, + const std::string pooltype, + const framework::LoDTensor& out_grad, framework::LoDTensor* in_grad, /* max pool has index */ const framework::Tensor* index = nullptr) { @@ -352,7 +375,8 @@ class SequencePoolGradFunctor { return; } - auto lod = in_grad->lod()[0]; + auto lod_level = in_grad->lod().size(); + auto lod 
= in_grad->lod()[lod_level - 1]; auto& place = *context.eigen_device(); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { if (lod[i] == lod[i + 1]) continue; diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu index 4de99ba6..91545131 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cu +++ b/paddle/fluid/operators/math/sequence_pooling.cu @@ -159,9 +159,11 @@ class SequencePoolFunctor { public: void operator()(const platform::CUDADeviceContext& context, const std::string pooltype, T pad_value, - const framework::LoDTensor& input, framework::Tensor* output, - bool is_test, framework::Tensor* index = nullptr) { - auto& lod = input.lod()[0]; + const framework::LoDTensor& input, + framework::LoDTensor* output, bool is_test, + framework::Tensor* index = nullptr) { + auto lod_level = input.lod().size(); + auto& lod = input.lod()[lod_level - 1]; const size_t item_dim = output->numel() / output->dims()[0]; dim3 threads(1024, 1); dim3 grid(lod.size(), 1); @@ -319,11 +321,13 @@ template class SequencePoolGradFunctor { public: void operator()(const platform::CUDADeviceContext& context, - const std::string pooltype, const framework::Tensor& out_grad, + const std::string pooltype, + const framework::LoDTensor& out_grad, framework::LoDTensor* in_grad, /* max pool has index */ const framework::Tensor* index = nullptr) { - auto& lod = in_grad->lod()[0]; + auto lod_level = in_grad->lod().size(); + auto& lod = in_grad->lod()[lod_level - 1]; const size_t item_dim = in_grad->numel() / in_grad->dims()[0]; dim3 threads(1024, 1); dim3 grid(lod.size(), 1); diff --git a/paddle/fluid/operators/math/sequence_pooling.h b/paddle/fluid/operators/math/sequence_pooling.h index 1dc02eae..847d0bca 100644 --- a/paddle/fluid/operators/math/sequence_pooling.h +++ b/paddle/fluid/operators/math/sequence_pooling.h @@ -28,7 +28,7 @@ class SequencePoolFunctor { /* max pool has index output */ void operator()(const DeviceContext& context, const std::string pooltype, T pad_value, const framework::LoDTensor& input, - framework::Tensor* output, bool is_test = false, + framework::LoDTensor* output, bool is_test = false, framework::Tensor* index = nullptr); }; @@ -36,7 +36,7 @@ template class SequencePoolGradFunctor { public: void operator()(const DeviceContext& context, const std::string pooltype, - const framework::Tensor& out_grad, + const framework::LoDTensor& out_grad, framework::LoDTensor* in_grad, /* max pool has index */ const framework::Tensor* index = nullptr); diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu index 71d13739..1c0970c0 100644 --- a/paddle/fluid/operators/math/softmax.cu +++ b/paddle/fluid/operators/math/softmax.cu @@ -35,7 +35,7 @@ void SoftmaxCUDNNFunctor::operator()( // ------------------- cudnn descriptors --------------------- ScopedTensorDescriptor xDesc; ScopedTensorDescriptor yDesc; - std::vector cudnn_tensor_dims = framework::vectorize2int(X->dims()); + std::vector cudnn_tensor_dims = framework::vectorize(X->dims()); DataLayout layout = DataLayout::kNCHW; if (cudnn_tensor_dims.size() == 5) { layout = DataLayout::kNCDHW; @@ -64,7 +64,7 @@ void SoftmaxGradCUDNNFunctor::operator()( ScopedTensorDescriptor yDesc; ScopedTensorDescriptor dyDesc; ScopedTensorDescriptor dxDesc; - std::vector cudnn_tensor_dims = framework::vectorize2int(Y->dims()); + std::vector cudnn_tensor_dims = framework::vectorize(Y->dims()); DataLayout layout = DataLayout::kNCHW; if (cudnn_tensor_dims.size() == 5) { layout = 
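Note: all of the input.lod()[0] to input.lod()[lod_level - 1] changes in sequence_pooling above share one intent: pooling now segments the input by the last (finest) LoD level, so nested sequences pool over the inner sequences instead of the outer ones. A plain-vector sketch of the lookup:

#include <cstddef>
#include <vector>

using LoD = std::vector<std::vector<std::size_t>>;  // stand-in for framework LoD

std::vector<std::size_t> PoolingStarts(const LoD& lod) {
  // Before this patch: lod[0]; after it: the last level.
  return lod[lod.size() - 1];
}
// LoD{{0, 2, 5}}         -> {0, 2, 5}   (single level: unchanged)
// LoD{{0, 2}, {0, 3, 5}} -> {0, 3, 5}   (two levels: inner boundaries win)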
DataLayout::kNCDHW; diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index 4fb03cdc..fae5160c 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -41,6 +41,7 @@ void SoftmaxEigen(const DeviceContext& context, const int axis_dim, const framework::Tensor* X, framework::Tensor* Y) { constexpr int kBatchDim = 0; constexpr int kClassDim = 1; + constexpr int kAxisDim = 1; auto logits = EigenMatrix::From(*X); auto softmax = EigenMatrix::From(*Y); @@ -49,23 +50,44 @@ void SoftmaxEigen(const DeviceContext& context, const int axis_dim, const int num_classes = logits.dimension(kClassDim); const int num_remain = num_classes / axis_dim; - Eigen::DSizes along_class(kClassDim); + Eigen::DSizes along_axis(kAxisDim); + Eigen::DSizes batch_classes(batch_size, num_classes); Eigen::DSizes batch_by_one(batch_size, 1); Eigen::DSizes one_by_class(1, num_classes); - Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); + Eigen::DSizes batch_one_remain(batch_size, 1, num_remain); + Eigen::DSizes one_axis_one(1, axis_dim, 1); Eigen::DSizes one_axis(1, axis_dim); + Eigen::DSizes batch_axis_remain(batch_size, axis_dim, num_remain); - auto shifted_logits = (logits - - logits.maximum(along_class) - .eval() - .reshape(batch_by_one) - .broadcast(one_by_class)) - .unaryExpr(ValueClip()); + // For numerical stability, logits should be shifted by maximum number along + // axis, calculate shifted_logits into softmax tensor for memory reuse. + if (num_remain == 1) { + // axis == -1, axis and class in same dimension, calculate along + // class dimension directly for higher performance + softmax.device(*context.eigen_device()) = (logits - + logits.maximum(along_axis) + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)) + .unaryExpr(ValueClip()); + } else { + // axis != -1, class dimension split into (axis, remain), max and sum + // should be calculated along axis dimension + softmax.device(*context.eigen_device()) = + (logits.reshape(batch_axis_remain) - + logits.reshape(batch_axis_remain) + .maximum(along_axis) + .eval() + .reshape(batch_one_remain) + .broadcast(one_axis_one) + .reshape(batch_classes)) + .unaryExpr(ValueClip()); + } - softmax.device(*context.eigen_device()) = shifted_logits.exp(); + softmax.device(*context.eigen_device()) = softmax.exp(); softmax.device(*context.eigen_device()) = (softmax * softmax.reshape(batch_axis_remain) - .sum(along_class) + .sum(along_axis) .inverse() .eval() .broadcast(one_axis)); diff --git a/paddle/fluid/operators/math/unpooling.cu b/paddle/fluid/operators/math/unpooling.cu index c467ae84..d78e3385 100644 --- a/paddle/fluid/operators/math/unpooling.cu +++ b/paddle/fluid/operators/math/unpooling.cu @@ -37,7 +37,10 @@ __global__ void KernelUnpool2dMax(const int nthreads, const T* input_data, int cidx = boffset / in_c_stride; int out_offset = bidx * out_n_stride + cidx * out_c_stride; int out_index = indices_data[i]; - PADDLE_ASSERT(out_index < out_c_stride); + PADDLE_ENFORCE(out_index < out_c_stride, + "out_index < out_c_stride. Expected %ld < %ld, but got " + "%ld >= %ld. 
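Note: the SoftmaxEigen rewrite above keeps the usual max-shift for numerical stability but, when the softmax axis is an inner dimension (num_remain > 1), views the data as (batch, axis, remain) and reduces over the middle dimension. A scalar sketch of that path, with ValueClip omitted:

#include <algorithm>
#include <cmath>
#include <vector>

void StableSoftmaxAxis(std::vector<float>* x, int batch, int axis_dim,
                       int remain) {
  auto at = [&](int b, int a, int r) -> float& {
    return (*x)[(b * axis_dim + a) * remain + r];
  };
  for (int b = 0; b < batch; ++b) {
    for (int r = 0; r < remain; ++r) {
      float mx = at(b, 0, r);  // max over the axis dimension only
      for (int a = 1; a < axis_dim; ++a) mx = std::max(mx, at(b, a, r));
      float sum = 0.0f;
      for (int a = 0; a < axis_dim; ++a) {
        at(b, a, r) = std::exp(at(b, a, r) - mx);  // the stability shift
        sum += at(b, a, r);
      }
      for (int a = 0; a < axis_dim; ++a) at(b, a, r) /= sum;
    }
  }
}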
Please check input value.", + out_index, out_c_stride, out_index, out_c_stride); output_data[out_offset + out_index] = input_data[i]; } } @@ -59,7 +62,10 @@ __global__ void KernelUnpool2dMaxGrad( int cidx = boffset / in_c_stride; int out_offset = bidx * out_n_stride + cidx * out_c_stride; int out_index = indices_data[i]; - PADDLE_ASSERT(out_index < out_c_stride); + PADDLE_ENFORCE(out_index < out_c_stride, + "out_index < out_c_stride. Expected %ld < %ld, but got " + "%ld >= %ld. Please check input value.", + out_index, out_c_stride, out_index, out_c_stride); input_grad[i] = output_grad[out_offset + out_index]; } } diff --git a/paddle/fluid/operators/math/vol2col.cu b/paddle/fluid/operators/math/vol2col.cu index 28e1a752..25d8a247 100644 --- a/paddle/fluid/operators/math/vol2col.cu +++ b/paddle/fluid/operators/math/vol2col.cu @@ -78,8 +78,8 @@ class Vol2ColFunctor { const std::vector& strides, const std::vector& paddings, framework::Tensor* col) const { - PADDLE_ENFORCE(vol.dims().size() == 4); - PADDLE_ENFORCE(col->dims().size() == 7); + PADDLE_ENFORCE_EQ(vol.dims().size(), 4); + PADDLE_ENFORCE_EQ(col->dims().size(), 7); int input_channels = vol.dims()[0]; int input_depth = vol.dims()[1]; @@ -204,8 +204,8 @@ class Col2VolFunctor { const std::vector& strides, const std::vector& paddings, framework::Tensor* vol) const { - PADDLE_ENFORCE(vol->dims().size() == 4); - PADDLE_ENFORCE(col.dims().size() == 7); + PADDLE_ENFORCE_EQ(vol->dims().size(), 4); + PADDLE_ENFORCE_EQ(col.dims().size(), 7); int input_channels = vol->dims()[0]; int input_depth = vol->dims()[1]; diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index f1828274..eb43f43d 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -60,7 +60,20 @@ class MatMulKernel : public framework::OpKernel { auto mat_dim_b = math::CreateMatrixDescriptor( ColumnMatrixFromVector(y.dims()), 0, context.Attr("transpose_Y")); auto scale = static_cast(context.Attr("alpha")); + +#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) + int head_number = context.Attr("head_number"); + bool split_vertical_y = (mat_dim_a.width_ != mat_dim_b.height_); + + if (head_number > 1) { + blas.MatMulWithHead(x, mat_dim_a, y, mat_dim_b, scale, head_number, out, + T(0), split_vertical_y); + } else { + blas.MatMul(x, mat_dim_a, y, mat_dim_b, scale, out, T(0)); + } +#else blas.MatMul(x, mat_dim_a, y, mat_dim_b, scale, out, T(0)); +#endif } }; @@ -289,22 +302,34 @@ class MatMulOp : public framework::OperatorWithKernel { math::CreateMatrixDescriptor(ColumnMatrixFromVector(dim_y), 0, context->Attrs().Get("transpose_Y")); - PADDLE_ENFORCE_EQ(mat_dim_x.width_, mat_dim_y.height_); if (context->IsRuntime()) { PADDLE_ENFORCE(mat_dim_x.batch_size_ == mat_dim_y.batch_size_ || mat_dim_x.batch_size_ == 0 || mat_dim_y.batch_size_ == 0); } std::vector dim_out; + int64_t dim_out_y = mat_dim_y.width_; +#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) + int head_number = context->Attrs().Get("head_number"); + bool split_vertical_y = (mat_dim_x.width_ != mat_dim_y.height_); + PADDLE_ENFORCE_LE(head_number, mat_dim_x.width_); + + if (!split_vertical_y && head_number > 0) { + dim_out_y = head_number * mat_dim_y.width_; + } +#else + PADDLE_ENFORCE_EQ(mat_dim_x.width_, mat_dim_y.height_); +#endif + if (mat_dim_x.batch_size_ != 0) { dim_out = framework::vectorize(dim_x); dim_out[dim_out.size() - 2] = mat_dim_x.height_; - dim_out[dim_out.size() - 1] = mat_dim_y.width_; + dim_out[dim_out.size() - 1] = 
dim_out_y; } else if (mat_dim_y.batch_size_ != 0) { dim_out = framework::vectorize(dim_y); dim_out[dim_out.size() - 2] = mat_dim_x.height_; - dim_out[dim_out.size() - 1] = mat_dim_y.width_; + dim_out[dim_out.size() - 1] = dim_out_y; } else { - dim_out = {mat_dim_x.height_, mat_dim_y.width_}; + dim_out = {mat_dim_x.height_, dim_out_y}; } if (dim_x.size() == 1 && dim_out[dim_out.size() - 2] == 1) { @@ -339,6 +364,10 @@ class MatMulOpMaker : public framework::OpProtoAndCheckerMaker { )DOC") .SetDefault(false); AddAttr("alpha", "The scale of Out").SetDefault(1.0f); +#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) + AddAttr("head_number", "The number of heads of the matrix") + .SetDefault(1); +#endif AddComment(R"DOC( MatMul Operator. @@ -360,6 +389,9 @@ Examples without transpose: - X: [B, M, K], Y: [B, K, N] => Out: [B, M, N] - X: [B, ..., M, K], Y: [B, ..., K, N] => Out: [B, ..., M, N] +Example of matrix multiplication with head_number of H +- X: [B, M, K], Y: [B, K, N] => Out: [B, M, H * N] + The behavior is designed to be similar to the `numpy.matmul` function. The differences are: - When the rank of the input data is less than or equal to 3, it @@ -367,6 +399,9 @@ The differences are: - When the rank of the input is greater than 3, the rank of X and Y must be equal, and the first `rank - 2` dimensions must be equal. - We add `transpose_X` and `transpose_Y` flags. +- We add `head_number` attribute, which is used to multiple two matrixes head + by head, and eventually concatenates the output of several (head_number) + small matrixes multiplication. Both the input `X` and `Y` can carry the LoD (Level of Details) information, or not. But the output only shares the LoD information with input `X`. diff --git a/paddle/fluid/operators/mean_iou_op.cu b/paddle/fluid/operators/mean_iou_op.cu index 08088eb8..ada1892f 100644 --- a/paddle/fluid/operators/mean_iou_op.cu +++ b/paddle/fluid/operators/mean_iou_op.cu @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
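Note: a sketch of the InferShape rule added above for the MKLML-only multi-head path. When Y is not split vertically, each of the H heads produces an M x N block and the blocks are concatenated along the last dimension, matching the [B, M, K] x [B, K, N] -> [B, M, H * N] example in the op comment:

#include <cstdint>

int64_t MatMulLastDim(int64_t y_width, int head_number,
                      bool split_vertical_y) {
  // head_number defaults to 1, which leaves the ordinary M x N output shape.
  if (!split_vertical_y && head_number > 0) {
    return head_number * y_width;
  }
  return y_width;
}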
*/ +#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/mean_iou_op.h" #include "paddle/fluid/platform/cuda_primitives.h" @@ -116,9 +117,7 @@ class MeanIoUCUDAOpKernel : public framework::OpKernel { auto out_correct_t = EigenTensor::From(*out_correct); // Temporary memory - auto& allocator = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); - auto tmp_ious_data = allocator.Allocate(num_classes * sizeof(float)); + auto tmp_ious_data = memory::Alloc(dev_ctx, num_classes * sizeof(float)); float* ious_data = static_cast(tmp_ious_data->ptr()); // Init out_wrong, out_correct and out_mean_iou diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc index 5edc233f..6a9d8222 100644 --- a/paddle/fluid/operators/merge_lod_tensor_op.cc +++ b/paddle/fluid/operators/merge_lod_tensor_op.cc @@ -28,9 +28,9 @@ class MergeLoDTensorOp : public framework::OperatorBase { const framework::AttributeMap &attrs) : OperatorBase(type, inputs, outputs, attrs) {} - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &dev_place) const override { + protected: + void RunBase(const framework::Scope &scope, + const platform::Place &dev_place) const { // get device context from pool platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(dev_place); @@ -125,6 +125,33 @@ class MergeLoDTensorOp : public framework::OperatorBase { out_lod->insert(out_lod->begin(), x.lod()[i]); } } + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + RunBase(scope, dev_place); + } +}; + +class MergeLoDTensorInferOp : public MergeLoDTensorOp { + public: + MergeLoDTensorInferOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : MergeLoDTensorOp(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + RunBase(scope, dev_place); + framework::Variable *in_true_var = scope.FindVar(Input("InTrue")); + framework::Variable *in_false_var = scope.FindVar(Input("InFalse")); + in_true_var->Clear(); + in_false_var->Clear(); + in_true_var->GetMutable(); + in_false_var->GetMutable(); + } }; class MergeLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker { @@ -196,3 +223,7 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(merge_lod_tensor, ops::MergeLoDTensorOp, ops::MergeLoDTensorOpProtoMaker, ops::MergeLoDTensorInferShape, ops::MergeLoDTensorGradMaker); +REGISTER_OPERATOR(merge_lod_tensor_infer, ops::MergeLoDTensorInferOp, + ops::MergeLoDTensorOpProtoMaker, + ops::MergeLoDTensorInferShape, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index 1767ebaf..414576f1 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -27,29 +27,16 @@ using platform::GetMKLDNNFormat; using platform::MKLDNNDeviceContext; using platform::to_void_cast; -namespace { -std::string gethash(const mkldnn::memory::dims &operand_dims, - const mkldnn::algorithm algorithm) { - auto dim2str = [](const mkldnn::memory::dims &operand_dims) { - std::string dstr = ""; - for (size_t i = 0; i < operand_dims.size(); ++i) { - dstr += 
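Note: the mean_iou change above swaps the removed DeviceTemporaryAllocator singleton for memory::Alloc(dev_ctx, size), whose returned handle appears to own the scratch buffer for its scope. A generic RAII stand-in for that pattern; std::malloc is only a placeholder for the device allocator:

#include <cstddef>
#include <cstdlib>
#include <memory>

std::unique_ptr<void, decltype(&std::free)> AllocScratch(std::size_t bytes) {
  // Freed automatically when the handle goes out of scope, the way
  // tmp_ious_data is in the kernel above.
  return {std::malloc(bytes), &std::free};
}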
std::to_string(operand_dims[i]) + "-"; - } - return dstr; - }; - return dim2str(operand_dims) + std::to_string(algorithm); -} -} // namespace - template class MKLDNNActivationKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { const auto *x = ctx.Input("X"); - PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN && - x->format() != memory::format::format_undef, - "Wrong layout/format set for Input x tensor"); + PADDLE_ENFORCE_EQ(x->layout(), DataLayout::kMKLDNN, + "Wrong layout set for X tensor"); + PADDLE_ENFORCE_NE(x->format(), MKLDNNMemoryFormat::format_undef, + "Wrong format set for X tensor"); Functor functor; functor(ctx); @@ -62,12 +49,13 @@ class MKLDNNActivationGradKernel public: void Compute(const framework::ExecutionContext &ctx) const override { const auto *diff_y = ctx.Input(framework::GradVarName("Out")); - PADDLE_ENFORCE(diff_y->layout() == DataLayout::kMKLDNN && - diff_y->format() != memory::format::format_undef, - "Wrong layout/format set for Input OutGrad tensor"); + PADDLE_ENFORCE_EQ(diff_y->layout(), DataLayout::kMKLDNN, + "Wrong layout set for Input OutGrad tensor"); + PADDLE_ENFORCE_NE(diff_y->format(), MKLDNNMemoryFormat::format_undef, + "Wrong format set for Input OutGrad tensor"); - PADDLE_ENFORCE( - !ctx.Attr("is_test"), + PADDLE_ENFORCE_EQ( + ctx.Attr("is_test"), false, "is_test attribute should be set to False in training phase."); Functor functor; @@ -77,70 +65,35 @@ class MKLDNNActivationGradKernel template void eltwise_forward(const framework::ExecutionContext &ctx, - mkldnn::algorithm algorithm, const T alpha = 0, - const T beta = 0) { + mkldnn::algorithm algorithm) { PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); auto &dev_ctx = ctx.template device_context(); - const auto &mkldnn_engine = dev_ctx.GetEngine(); const auto *x = ctx.Input("X"); auto *y = ctx.Output("Out"); - const T *x_data = x->data(); - T *y_data = y->mutable_data(ctx.GetPlace()); + const T alpha = ctx.op().HasAttr("alpha") ? ctx.Attr("alpha") : 0; + const T beta = ctx.op().HasAttr("beta") ? ctx.Attr("beta") : 0; PADDLE_ENFORCE( x->dims().size() == 2 || x->dims().size() == 3 || x->dims().size() == 4, "Input dim must be with 2, 3 or 4"); - std::vector src_tz = framework::vectorize2int(x->dims()); + auto src_tz = framework::vectorize(x->dims()); - auto src_format = - src_tz.size() == 2 ? mkldnn::memory::format::nc : x->format(); + auto src_format = src_tz.size() == 2 ? 
MKLDNNMemoryFormat::nc : x->format(); bool is_test = ctx.Attr("is_test"); - // TODO(jczaja): When adding leaky-relu , swish , elu make sure to extend key - // with alpha, beta - std::string key = platform::MKLDNNHandler::GetHash( - src_tz, std::to_string(algorithm) + ctx.op().Output("Out")); - - // TODO(jczaja): Make it Thread safe - // save input data and layout to be referred in backward path - const std::string key_src_data = key + "@eltwise_fwd_src_data"; - const std::string key_src_layout = key + "@eltwise_fwd_src_layout"; - // Just in case some int8 models are run interchangebly - // with float models then format maybe diffrent - key += std::to_string(src_format); - const std::string key_src_mem = key + "@eltwise_fwd_src_mem"; - auto p_src_data = std::make_shared(x_data); - auto p_src_layout = std::make_shared(src_format); - if (!is_test) { - dev_ctx.SetBlob(key_src_data, p_src_data); - dev_ctx.SetBlob(key_src_layout, p_src_layout); - } - - platform::ActivationMKLDNNHandler handler(dev_ctx, mkldnn_engine, key); + platform::ActivationMKLDNNHandler handler( + src_tz, algorithm, alpha, beta, src_format, is_test, dev_ctx, + ctx.GetPlace(), ctx.op().Input("X")); - auto md = platform::MKLDNNMemDesc(src_tz, platform::MKLDNNGetDataType(), - src_format); - - auto activation_pd = handler.AcquireActivationPrimitiveDescriptor( - is_test ? mkldnn::prop_kind::forward_inference - : mkldnn::prop_kind::forward_training, - algorithm, md, alpha, beta); - - auto src_memory_p = handler.AcquireSrcMemory(md, to_void_cast(x_data)); - // jczaja: Workaround, src_memory_p is needed in BWD so it has - // to be accessible under key not dependant on TID - if (!is_test) { - dev_ctx.SetBlob(key_src_mem, src_memory_p); - } - - auto dst_memory_p = - handler.AcquireDstMemoryFromPrimitive(to_void_cast(y_data)); - auto activation_p = handler.AcquireActivation(dst_memory_p, src_memory_p); + auto src_memory_p = handler.AcquireSrcMemory(x); + auto dst_memory_p = handler.AcquireDstMemory(y); + auto activation_p = + handler.AcquireForwardPrimitive(*src_memory_p, *dst_memory_p); // push primitive to stream and wait until it's executed std::vector pipeline; @@ -153,59 +106,34 @@ void eltwise_forward(const framework::ExecutionContext &ctx, template void eltwise_grad(const framework::ExecutionContext &ctx, - mkldnn::algorithm algorithm, const T alpha = 0, - const T beta = 0) { + mkldnn::algorithm algorithm) { auto &dev_ctx = ctx.template device_context(); - const auto &mkldnn_engine = dev_ctx.GetEngine(); + const auto *x = ctx.Input("X"); const auto *diff_y = ctx.Input(framework::GradVarName("Out")); auto *diff_x = ctx.Output(framework::GradVarName("X")); - const T *diff_y_data = diff_y->data(); - T *diff_x_data = diff_x->mutable_data(ctx.GetPlace()); + const T alpha = ctx.op().HasAttr("alpha") ? ctx.Attr("alpha") : 0; + const T beta = ctx.op().HasAttr("beta") ? ctx.Attr("beta") : 0; - std::vector diff_dst_tz = framework::vectorize2int(diff_y->dims()); + auto diff_dst_tz = framework::vectorize(diff_y->dims()); - auto diff_y_format = - diff_dst_tz.size() == 2 ? 
mkldnn::memory::format::nc : diff_y->format(); - - auto diff_dst_md = platform::MKLDNNMemDesc( - diff_dst_tz, platform::MKLDNNGetDataType(), diff_y_format); - - std::string key = platform::MKLDNNHandler::GetHash( - diff_dst_tz, std::to_string(algorithm) + ctx.op().Input("Out")); - - const std::string key_src_data = key + "@eltwise_fwd_src_data"; - const std::string key_src_layout = key + "@eltwise_fwd_src_layout"; - - // Get Data from FWD op - const auto p_src_layout = - std::static_pointer_cast(dev_ctx.GetBlob(key_src_layout)); - const auto p_src_data = - std::static_pointer_cast(dev_ctx.GetBlob(key_src_data)); - key += std::to_string(*p_src_layout); - const std::string key_src_mem = key + "@eltwise_fwd_src_mem"; - auto src_memory = - std::static_pointer_cast(dev_ctx.GetBlob(key_src_mem)); - PADDLE_ENFORCE(src_memory != nullptr, - "Fail to find src_memory in device context"); - src_memory->set_data_handle(*p_src_data); - - platform::ActivationMKLDNNHandler handler(dev_ctx, mkldnn_engine, key); - - auto diff_dst_memory_p = - handler.AcquireDiffDstMemory(diff_dst_md, to_void_cast(diff_y_data)); + // diff_dst and src dims should be the same + auto src_format = + diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : x->format(); - auto activation_backward_pd = - handler.AcquireActivationBackwardPrimitiveDescriptor( - algorithm, diff_dst_md, src_memory->get_primitive_desc().desc(), - alpha, beta); + auto diff_y_format = + diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : diff_y->format(); - auto diff_src_memory_p = - handler.AcquireDiffSrcMemoryFromPrimitive(diff_x_data); + platform::ActivationMKLDNNHandler handler( + diff_dst_tz, algorithm, alpha, beta, src_format, diff_y_format, dev_ctx, + ctx.GetPlace(), ctx.op().Input("X")); - auto activation_backward_p = handler.AcquireActivationBackward( - diff_src_memory_p, diff_dst_memory_p, src_memory); + auto src_memory_p = handler.AcquireBackwardSrcMemory(x); + auto diff_dst_memory_p = handler.AcquireDiffDstMemory(diff_y); + auto diff_src_memory_p = handler.AcquireDiffSrcMemory(diff_x); + auto activation_backward_p = handler.AcquireBackwardPrimitive( + *src_memory_p, *diff_dst_memory_p, *diff_src_memory_p); // push primitive to stream and wait until it's executed std::vector pipeline; @@ -273,10 +201,11 @@ namespace ops = paddle::operators; act_type##_grad, MKLDNN, ::paddle::platform::CPUPlace, \ ops::MKLDNNActivationGradKernel>); -#define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro) \ - __macro(relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \ - __macro(tanh, TanhMKLDNNFunctor, TanhMKLDNNGradFunctor); \ - __macro(sqrt, SqrtMKLDNNFunctor, SqrtMKLDNNGradFunctor); \ +#define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro) \ + __macro(relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \ + __macro(leaky_relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \ + __macro(tanh, TanhMKLDNNFunctor, TanhMKLDNNGradFunctor); \ + __macro(sqrt, SqrtMKLDNNFunctor, SqrtMKLDNNGradFunctor); \ __macro(abs, AbsMKLDNNFunctor, AbsMKLDNNGradFunctor); FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL); diff --git a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc index 40f7231c..f3209151 100644 --- a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc @@ -58,6 +58,15 @@ class BatchNormMKLDNNHandler : public platform::MKLDNNHandler { batch_norm_pd_->variance_primitive_desc(), ptr, "@variance_mem_p"); } + template + std::shared_ptr 
AcquireDstMemoryFromPrimitive( + framework::Tensor *output, platform::Place place) { + T *ptr = output->mutable_data( + place, batch_norm_pd_->dst_primitive_desc().get_size()); + return this->AcquireMemoryFromPrimitive( + batch_norm_pd_->dst_primitive_desc(), ptr, "@dst_mem_p"); + } + std::shared_ptr AcquireBatchNormPrimitiveDescriptor(const batch_norm_fwd::desc &bn_fwd_desc, const mkldnn::engine &engine) { @@ -111,21 +120,6 @@ class BatchNormMKLDNNHandler : public platform::MKLDNNHandler { return batch_norm_p; } - static std::string GetHash(const memory::dims &input_dims, float epsilon, - unsigned flag, bool is_test, memory::format format, - const std::string &suffix = "") { - auto dims2str = [](const memory::dims &operand_dims) { - std::string dstr = ""; - for (size_t i = 0; i < operand_dims.size(); ++i) { - dstr += std::to_string(operand_dims[i]) + "-"; - } - return dstr; - }; - return dims2str(input_dims) + std::to_string(epsilon) + - std::to_string(flag) + std::to_string(is_test) + - std::to_string(format) + suffix; - } - private: std::shared_ptr batch_norm_pd_; }; @@ -182,14 +176,14 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { const auto *scale = ctx.Input("Scale"); const auto *shift = ctx.Input("Bias"); - PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN && - x->format() != memory::format::format_undef, - "Wrong layout/format set for Input x tensor"); + PADDLE_ENFORCE_EQ(x->layout(), DataLayout::kMKLDNN, + "Wrong layout set for X tensor"); + PADDLE_ENFORCE_NE(x->format(), MKLDNNMemoryFormat::format_undef, + "Wrong format set for X tensor"); const T *x_data = x->data(); const T *mean_data = mean->data(); const T *variance_data = variance->data(); - T *y_data = y->mutable_data(ctx.GetPlace()); T *mean_out_data = mean_out->mutable_data(ctx.GetPlace()); T *variance_out_data = variance_out->mutable_data(ctx.GetPlace()); T *batch_mean_data = nullptr; @@ -204,8 +198,8 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { ? 
mkldnn::prop_kind::forward_scoring : mkldnn::prop_kind::forward_training; - auto src_tz = paddle::framework::vectorize2int(x->dims()); - auto scale_tz = paddle::framework::vectorize2int(scale->dims()); + auto src_tz = paddle::framework::vectorize(x->dims()); + auto scale_tz = paddle::framework::vectorize(scale->dims()); PADDLE_ENFORCE(scale_tz.size() == 1, "Dims of scale tensor is NOT 1"); const unsigned int ic = scale_tz[0]; @@ -222,13 +216,13 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu; // create mkldnn memory from input x tensor - mkldnn::memory::format input_format = + MKLDNNMemoryFormat input_format = platform::MKLDNNFormatForSize(src_tz.size(), x->format()); // keys for backward pass - const std::string key = BatchNormMKLDNNHandler::GetHash( - src_tz, epsilon, flags, global_stats, input_format, - ctx.op().Output("SavedMean")); + const std::string key = + platform::CreateKey(src_tz, epsilon, flags, global_stats, input_format, + ctx.op().Output("SavedMean")); BatchNormMKLDNNHandler handler(dev_ctx, mkldnn_engine, key); auto user_src_md = platform::MKLDNNMemDesc( @@ -250,8 +244,8 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { handler.AcquireScaleshiftMemoryFromPrimitive(scaleshift_data.data()); // create mkldnn memory for output y tensor - auto dst_memory = handler.AcquireDstMemory( - batch_norm_fwd_pd->dst_primitive_desc().desc(), y_data); + auto dst_memory = + handler.AcquireDstMemoryFromPrimitive(y, ctx.GetPlace()); std::shared_ptr batch_norm_p; if (global_stats) { @@ -323,9 +317,10 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto *diff_scale = ctx.Output(framework::GradVarName("Scale")); auto *diff_shift = ctx.Output(framework::GradVarName("Bias")); - PADDLE_ENFORCE(diff_y->layout() == DataLayout::kMKLDNN && - diff_y->format() != memory::format::format_undef, - "Wrong layout/format set for Input diff_y tensor"); + PADDLE_ENFORCE_EQ(diff_y->layout(), DataLayout::kMKLDNN, + "Wrong layout set for Input diff_y tensor"); + PADDLE_ENFORCE_NE(diff_y->format(), MKLDNNMemoryFormat::format_undef, + "Wrong format set for Input diff_y tensor"); const T *x_data = x->data(); const T *diff_y_data = diff_y->data(); @@ -334,38 +329,38 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { const T *scale_data = scale->data(); const T *shift_data = shift->data(); T *diff_x_data = diff_x->mutable_data(ctx.GetPlace()); + T *diff_scale_data = diff_scale->mutable_data(ctx.GetPlace()); T *diff_shift_data = diff_shift->mutable_data(ctx.GetPlace()); - auto src_tz = paddle::framework::vectorize2int(x->dims()); + auto src_tz = paddle::framework::vectorize(x->dims()); auto diff_src_tz = src_tz; auto dst_tz = src_tz; auto diff_dst_tz = dst_tz; - auto scale_tz = paddle::framework::vectorize2int(scale->dims()); + auto scale_tz = paddle::framework::vectorize(scale->dims()); PADDLE_ENFORCE(scale_tz.size() == 1, "Dims of scale tensor is NOT 1"); const unsigned int ic = scale_tz[0]; using bn_bwd_types = bn_type_traits; - mkldnn::memory::format dst_format = + MKLDNNMemoryFormat dst_format = platform::MKLDNNFormatForSize(src_tz.size(), diff_y->format()); - mkldnn::memory::format input_format = + MKLDNNMemoryFormat input_format = platform::MKLDNNFormatForSize(src_tz.size(), x->format()); unsigned flags = mkldnn::use_scale_shift; // keys from forward pass - const std::string key = BatchNormMKLDNNHandler::GetHash( - src_tz, epsilon, flags, false, input_format, - 
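Note: the per-op GetHash helpers deleted above all did the same thing: concatenate dims plus attributes into one cache-key string. platform::CreateKey centralizes that; an illustrative variadic sketch, not Paddle's implementation:

#include <cstdint>
#include <sstream>
#include <string>
#include <vector>

template <typename... Args>
std::string CreateKeySketch(const std::vector<int64_t>& dims, Args&&... rest) {
  std::ostringstream oss;
  for (int64_t d : dims) oss << d << "-";
  ((oss << rest << "-"), ...);  // fold every attribute into the key (C++17)
  return oss.str();
}
// e.g. CreateKeySketch({8, 16, 32, 32}, epsilon, flags, is_test, "SavedMean")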
ctx.op().Input("SavedMean")); + const std::string key = + platform::CreateKey(src_tz, epsilon, flags, false, input_format, + ctx.op().Input("SavedMean")); const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd"; // keys for primitives reuse const std::string key_with_hash = - key + BatchNormMKLDNNHandler::GetHash(src_tz, epsilon, flags, false, - input_format); + key + platform::CreateKey(src_tz, epsilon, flags, false, input_format); const std::string key_batch_norm_bwd_p = key_with_hash + "@batch_norm_bwd_p"; const std::string key_batch_norm_src_mem_p = @@ -472,9 +467,10 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { // set layout/format of output tensors diff_x->set_layout(DataLayout::kMKLDNN); - diff_x->set_format((memory::format)diff_src_memory->get_primitive_desc() - .desc() - .data.format); + diff_x->set_format( + (MKLDNNMemoryFormat)diff_src_memory->get_primitive_desc() + .desc() + .data.format); } else { // primitives already exist UpdateMemoryData(dev_ctx, key_batch_norm_src_mem_p, to_void_cast(x_data)); @@ -500,9 +496,10 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { // set layout/format of output tensors diff_x->set_layout(DataLayout::kMKLDNN); - diff_x->set_format((memory::format)diff_src_memory->get_primitive_desc() - .desc() - .data.format); + diff_x->set_format( + (MKLDNNMemoryFormat)diff_src_memory->get_primitive_desc() + .desc() + .data.format); } // execute optional reorder and batch_norm backward primitive diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc index be19293e..8823e086 100644 --- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc @@ -30,27 +30,26 @@ using platform::to_void_cast; static void EnforceLayouts(const std::vector inputs) { for (auto* input : inputs) { - const bool is_layout_correct = input->layout() == DataLayout::kMKLDNN; - const bool is_format_defined = - input->format() != memory::format::format_undef; - PADDLE_ENFORCE(is_layout_correct && is_format_defined, - "Wrong layout/format set for Input tensor"); + PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN, + "Wrong layout set for Input tensor"); + PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::format_undef, + "Wrong format set for Input tensor"); } } static memory::primitive_desc CreateMemPrimDesc(const Tensor& input, const mkldnn::engine& engine, const memory::data_type& dt) { - const auto dims = paddle::framework::vectorize2int(input.dims()); + const auto dims = paddle::framework::vectorize(input.dims()); const auto format = input.format(); auto description = memory::desc(dims, dt, format); auto mem_prim_desc = memory::primitive_desc(description, engine); return mem_prim_desc; } -static mkldnn::memory::format GetDstMemFormat( +static MKLDNNMemoryFormat GetDstMemFormat( const concat::primitive_desc& concat_pd) { - return (memory::format)concat_pd.dst_primitive_desc().desc().data.format; + return (MKLDNNMemoryFormat)concat_pd.dst_primitive_desc().desc().data.format; } static platform::CPUPlace GetCpuPlace( @@ -67,31 +66,6 @@ static const mkldnn::engine& GetMKLDNNEngine( return dev_ctx.GetEngine(); } -std::string CreateKey(const paddle::framework::ExecutionContext& ctx, - const std::vector multi_input, - const int64_t& concat_axis, const memory::data_type& dt) { - std::string key; - key.reserve(platform::MKLDNNHandler::MaxKeyLength); - for (size_t i = 0; i < multi_input.size(); i++) { - 
platform::MKLDNNHandler::AppendKeyDims( - &key, paddle::framework::vectorize2int(multi_input[i]->dims())); - } - platform::MKLDNNHandler::AppendKey(&key, std::to_string(concat_axis)); - platform::MKLDNNHandler::AppendKey(&key, ctx.op().Output("Out")); - platform::MKLDNNHandler::AppendKey(&key, std::to_string(dt)); - platform::MKLDNNHandler::AppendKey(&key, - std::to_string(multi_input[0]->format())); - if (platform::get_cur_mkldnn_session_id() == - platform::kMKLDNNSessionID_Default) { - auto tid = std::this_thread::get_id(); - std::stringstream ss; - ss << tid; - platform::MKLDNNHandler::AppendKey(&key, "-t:"); - platform::MKLDNNHandler::AppendKey(&key, ss.str()); - } - return key; -} - template class ConcatPrimitiveFactory { public: @@ -127,8 +101,8 @@ class ConcatPrimitiveFactory { private: memory::desc CreateDstMemDescriptor(Tensor* output, const memory::data_type& dt) { - auto dst_dims = paddle::framework::vectorize2int(output->dims()); - return memory::desc(dst_dims, dt, memory::format::any); + auto dst_dims = paddle::framework::vectorize(output->dims()); + return memory::desc(dst_dims, dt, MKLDNNMemoryFormat::any); } mkldnn::memory CreateDstMemory(const concat::primitive_desc& concat_pd, @@ -161,8 +135,8 @@ class ConcatPrimitiveFactory { std::vector srcs_pd; std::vector srcs; std::vector inputs; - boost::optional dst_mem; // TODO(mgallus): change to std::optional -}; // upon introduction of C++17 to paddle + boost::optional dst_mem; +}; template class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { @@ -180,7 +154,10 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { paddle::framework::ToMKLDNNDataType(multi_input[0]->type()); ConcatPrimitiveFactory prim_creator; - std::string key = CreateKey(ctx, multi_input, concat_axis, dt); + std::string key = platform::CreateKey( + paddle::framework::vectorize(multi_input[0]->dims()), concat_axis, + ctx.op().Output("Out"), dt, multi_input[0]->format(), + platform::ThreadIDasStr()); const std::string key_prim = key + "@concat_p"; const std::string key_concat_pd = key + "@concat_pd"; const std::string key_srcs = key + "@concat_srcs"; diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 876a0b8b..86c7d7a5 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -29,47 +29,97 @@ using mkldnn::stream; using platform::to_void_cast; using platform::GetMKLDNNFormat; -inline void GetWeightsTz(std::vector& weights_tz, int groups, // NOLINT - bool is_conv3d) { +constexpr int same_scale_mask = 0; +constexpr int o_slice_mask = 1 << 0; // 1 +constexpr int g_slice_mask = 1 << 1; // 2 +constexpr int g_o_slice_mask = g_slice_mask | o_slice_mask; // 3 + +static int ComputeMask(bool is_multi_channel, int multi_channel_mask) { + return is_multi_channel ? multi_channel_mask : same_scale_mask; +} + +static int ComputeWeightsMask(int is_multi_channel, int g) { + int multi_channel_mask = g > 1 ? 
g_o_slice_mask : o_slice_mask; + return ComputeMask(is_multi_channel, multi_channel_mask); +} + +static int ComputeBiasMask(int is_multi_channel) { + return ComputeMask(is_multi_channel, o_slice_mask); +} + +inline void GetWeightsTz(std::vector& weights_tz, int groups) { // NOLINT if (groups > 1) { - if (is_conv3d) { - int output = weights_tz[0]; - int input = weights_tz[1]; - int dimension = weights_tz[2]; - int height = weights_tz[3]; - int width = weights_tz[4]; - weights_tz.resize(6); - weights_tz[0] = groups; - weights_tz[1] = output / groups; - weights_tz[2] = input; - weights_tz[3] = dimension; - weights_tz[4] = height; - weights_tz[5] = width; - } else { - int output = weights_tz[0]; - int input = weights_tz[1]; - int height = weights_tz[2]; - int width = weights_tz[3]; - weights_tz.resize(5); - weights_tz[0] = groups; - weights_tz[1] = output / groups; - weights_tz[2] = input; - weights_tz[3] = height; - weights_tz[4] = width; - } + // if (is_conv3d) [o, i, dimension, h, w]->[g, o/g, i, dimension, h, w] + // else [o, i, h, w] -> [g, o/g, i, h, w] + weights_tz.push_back(0); + std::rotate(weights_tz.begin(), weights_tz.end() - 1, weights_tz.end()); + weights_tz[0] = groups; + weights_tz[1] = weights_tz[1] / groups; } } -inline mkldnn::memory::format GetWeightsFormat(mkldnn::memory::format format, - int groups, bool is_conv3d) { +inline MKLDNNMemoryFormat GetWeightsFormat(MKLDNNMemoryFormat format, + int groups, bool is_conv3d) { if (is_conv3d) { - return (groups == 1) ? format : mkldnn::memory::format::goidhw; + return (groups == 1) ? format : MKLDNNMemoryFormat::goidhw; } else { - return (groups == 1) ? format : mkldnn::memory::format::goihw; + return (groups == 1) ? format : MKLDNNMemoryFormat::goihw; + } +} + +static std::vector ComputeOutputShiftScale( + const float scale_out_data, const float scale_in_data, + const std::vector& scale_weights_data) { + int count = scale_weights_data.size(); + std::vector output_shift_scale(count); +#pragma omp parallel for + for (int i = 0; i < count; i++) { + if (scale_weights_data[i] == 0.0) { + output_shift_scale[i] = scale_out_data; + } else { + output_shift_scale[i] = + static_cast(static_cast(scale_out_data) / + (static_cast(scale_in_data) * + static_cast(scale_weights_data[i]))); + } + } + return output_shift_scale; +} + +static std::vector ComputeBiasScale( + const float scale_in_data, const std::vector& scale_weights_data) { + int count = scale_weights_data.size(); + std::vector scale_bias_data(count); +#pragma omp parallel for if (count > 1) + for (int i = 0; i < count; i++) { + scale_bias_data[i] = scale_in_data * scale_weights_data[i]; } + return scale_bias_data; } -template +static mkldnn::memory::data_type GetDstType(bool is_int8, + bool force_fp32_output, + std::string fuse_activation, + bool fuse_residual_conn, + const Tensor* residual_param) { + auto dst_dt = mkldnn::memory::data_type::f32; // uint8_t, int8_t, float + if (is_int8 && !force_fp32_output) { + if (fuse_residual_conn && residual_param) { + // when residual exists, dst_dt will follow the residual_param type, + // but output will to be set to u8 if relu exists + auto residual_dt = framework::ToMKLDNNDataType(residual_param->type()); + dst_dt = residual_dt; + } else { + // when residual does not exist, if (b)relu exist s8 else s8 + dst_dt = (fuse_activation == "relu" || fuse_activation == "relu6") + ? 
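Note: a worked example of the compacted GetWeightsTz above: append a slot, rotate it to the front, then split the output channel by groups. The same code now serves conv2d and conv3d, which is why the is_conv3d parameter was dropped:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

void GetWeightsTzSketch(std::vector<int64_t>& w, int groups) {
  if (groups > 1) {
    w.push_back(0);
    std::rotate(w.begin(), w.end() - 1, w.end());  // [0, o, i, (d,) h, w]
    w[0] = groups;
    w[1] = w[1] / groups;                          // [g, o/g, i, (d,) h, w]
  }
}

int main() {
  std::vector<int64_t> w{64, 3, 3, 3};  // conv2d OIHW
  GetWeightsTzSketch(w, 2);
  assert((w == std::vector<int64_t>{2, 32, 3, 3, 3}));
}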
mkldnn::memory::data_type::u8 + : mkldnn::memory::data_type::s8; + } + } + return dst_dt; +} + +template class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { @@ -80,7 +130,19 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { if (!is_INT8) { ComputeFP32(ctx); } else { - ComputeINT8(ctx); + std::string fuse_activation = ctx.Attr("fuse_activation"); + bool fuse_residual_conn = ctx.Attr("fuse_residual_connection"); + bool force_fp32_output = ctx.Attr("force_fp32_output"); + auto residual_param = ctx.Input("ResidualData"); + auto dst_dt = GetDstType(true, force_fp32_output, fuse_activation, + fuse_residual_conn, residual_param); + if (dst_dt == mkldnn::memory::data_type::f32) { + ComputeINT8(ctx); + } else if (dst_dt == mkldnn::memory::data_type::u8) { + ComputeINT8(ctx); + } else if (dst_dt == mkldnn::memory::data_type::s8) { + ComputeINT8(ctx); + } } } @@ -96,38 +158,50 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto* bias = ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; auto* output = ctx.Output("Output"); - PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN && - input->format() != memory::format::format_undef, - "Wrong layout/format set for Input tensor"); - PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN && - filter->format() != memory::format::format_undef, - "Wrong layout/format set for Filter tensor"); - PADDLE_ENFORCE(input->dims().size() == 4 || input->dims().size() == 5, - "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW"); - PADDLE_ENFORCE(filter->dims().size() == 4 || filter->dims().size() == 5, - "Filter must be with 4 or 5 dimensions, i.e. OIHW or OIDHW"); + PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN, + "Wrong layout set for Input tensor"); + PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::format_undef, + "Wrong format set for Input tensor"); + + PADDLE_ENFORCE_EQ(filter->layout(), DataLayout::kMKLDNN, + "Wrong layout set for Filter tensor"); + PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::format_undef, + "Wrong format set for Filter tensor"); + + PADDLE_ENFORCE_GE( + input->dims().size(), 4, + "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW"); + PADDLE_ENFORCE_LE( + input->dims().size(), 5, + "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW"); + + PADDLE_ENFORCE_GE( + filter->dims().size(), 4, + "Filter must be with 4 or 5 dimensions, i.e. OIHW or OIDHW"); + PADDLE_ENFORCE_LE( + filter->dims().size(), 5, + "Filter must be with 4 or 5 dimensions, i.e. OIHW or OIDHW"); + if (bias) { - PADDLE_ENFORCE(bias->layout() == DataLayout::kMKLDNN && - bias->format() != memory::format::format_undef, - "Wrong layout/format set for Bias tensor"); - PADDLE_ENFORCE(bias->dims().size() == 1, - "Bias must only have 1 dimension, i.e. X"); + PADDLE_ENFORCE_EQ(bias->layout(), DataLayout::kMKLDNN, + "Wrong layout set for Bias tensor"); + PADDLE_ENFORCE_NE(bias->format(), MKLDNNMemoryFormat::format_undef, + "Wrong format set for Bias tensor"); + + PADDLE_ENFORCE_EQ(bias->dims().size(), 1, + "Bias must only have 1 dimension, i.e. 
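Note: the int8 dispatch above boils down to GetDstType's decision table; despite the "(b)relu exist s8 else s8" comment in the patch, the code picks u8 for relu/relu6. A plain sketch, where fuse_residual means a residual input is both fused and present:

#include <string>

enum class DstType { f32, s8, u8 };

DstType PickDstType(bool is_int8, bool force_fp32, const std::string& act,
                    bool fuse_residual, DstType residual_dt) {
  if (!is_int8 || force_fp32) return DstType::f32;
  if (fuse_residual) return residual_dt;  // follow the residual input's type
  return (act == "relu" || act == "relu6") ? DstType::u8 : DstType::s8;
}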
X"); } std::vector strides = ctx.Attr>("strides"); std::vector paddings = ctx.Attr>("paddings"); std::vector dilations = ctx.Attr>("dilations"); - bool fuse_relu = ctx.Attr("fuse_relu"); + std::string fuse_activation = ctx.Attr("fuse_activation"); + float fuse_alpha = ctx.Attr("fuse_alpha"); + float fuse_beta = ctx.Attr("fuse_beta"); bool fuse_residual_conn = ctx.Attr("fuse_residual_connection"); - bool fuse_brelu = false; - float fuse_brelu_threshold = 6.0; int groups = ctx.Attr("groups"); bool is_conv3d = strides.size() == 3U; - if (!is_conv3d) { - fuse_brelu = ctx.Attr("fuse_brelu"); - fuse_brelu_threshold = ctx.Attr("fuse_brelu_threshold"); - } - // TODO(tpatejko): add support for dilation + PADDLE_ENFORCE( is_conv3d ? dilations.size() == 3 && dilations[0] == 1 && dilations[1] == 1 && @@ -138,22 +212,21 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { const T* input_data = input->data(); const T* filter_data = filter->data(); - std::vector src_tz = paddle::framework::vectorize2int(input->dims()); - std::vector weights_tz = - paddle::framework::vectorize2int(filter->dims()); + auto src_tz = paddle::framework::vectorize(input->dims()); + auto weights_tz = paddle::framework::vectorize(filter->dims()); int g = std::max(groups, 1); - GetWeightsTz(weights_tz, g, is_conv3d); - std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + GetWeightsTz(weights_tz, g); + auto dst_tz = paddle::framework::vectorize(output->dims()); // Get unique name for storing MKLDNN primitives - const std::string key = platform::ConvMKLDNNHandler::GetHash( - src_tz, weights_tz, fuse_relu, fuse_brelu, strides, paddings, dilations, + const std::string key = platform::CreateKey( + src_tz, weights_tz, fuse_activation, strides, paddings, dilations, groups, ctx.op().Input("Input") + ctx.op().Input("Filter")); std::vector pipeline; auto src_format = input->format(); - mkldnn::memory::format weights_format = + MKLDNNMemoryFormat weights_format = GetWeightsFormat(filter->format(), g, is_conv3d); auto user_src_md = platform::MKLDNNMemDesc( @@ -169,9 +242,9 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto chosen_memory_format = platform::data_format_to_memory_format(data_format); - weights_format = mkldnn::memory::format::any; + weights_format = MKLDNNMemoryFormat::any; // Check the format for user's special output - if (chosen_memory_format != mkldnn::memory::format::any) { + if (chosen_memory_format != MKLDNNMemoryFormat::any) { if (is_conv3d) { chosen_memory_format = platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format); @@ -182,8 +255,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); auto weights_md = platform::MKLDNNMemDesc( weights_tz, platform::MKLDNNGetDataType(), weights_format); - std::vector bias_tz; // TODO(mgallus): avoid empty vector creation. - // Currently used whenever bias is != nullptr. + std::vector bias_tz; auto dst_md = platform::MKLDNNMemDesc( dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); @@ -194,18 +266,18 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto fwd_prop_kind = is_test ? 
@@ -96,38 +158,50 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto* bias = ctx.HasInput("Bias") ? ctx.Input<Tensor>("Bias") : nullptr;
     auto* output = ctx.Output<Tensor>("Output");
 
-    PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN &&
-                       input->format() != memory::format::format_undef,
-                   "Wrong layout/format set for Input tensor");
-    PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN &&
-                       filter->format() != memory::format::format_undef,
-                   "Wrong layout/format set for Filter tensor");
-    PADDLE_ENFORCE(input->dims().size() == 4 || input->dims().size() == 5,
-                   "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW");
-    PADDLE_ENFORCE(filter->dims().size() == 4 || filter->dims().size() == 5,
-                   "Filter must be with 4 or 5 dimensions, i.e. OIHW or OIDHW");
+    PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN,
+                      "Wrong layout set for Input tensor");
+    PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::format_undef,
+                      "Wrong format set for Input tensor");
+
+    PADDLE_ENFORCE_EQ(filter->layout(), DataLayout::kMKLDNN,
+                      "Wrong layout set for Filter tensor");
+    PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::format_undef,
+                      "Wrong format set for Filter tensor");
+
+    PADDLE_ENFORCE_GE(
+        input->dims().size(), 4,
+        "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW");
+    PADDLE_ENFORCE_LE(
+        input->dims().size(), 5,
+        "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW");
+
+    PADDLE_ENFORCE_GE(
+        filter->dims().size(), 4,
+        "Filter must be with 4 or 5 dimensions, i.e. OIHW or OIDHW");
+    PADDLE_ENFORCE_LE(
+        filter->dims().size(), 5,
+        "Filter must be with 4 or 5 dimensions, i.e. OIHW or OIDHW");
+
     if (bias) {
-      PADDLE_ENFORCE(bias->layout() == DataLayout::kMKLDNN &&
-                         bias->format() != memory::format::format_undef,
-                     "Wrong layout/format set for Bias tensor");
-      PADDLE_ENFORCE(bias->dims().size() == 1,
-                     "Bias must only have 1 dimension, i.e. X");
+      PADDLE_ENFORCE_EQ(bias->layout(), DataLayout::kMKLDNN,
+                        "Wrong layout set for Bias tensor");
+      PADDLE_ENFORCE_NE(bias->format(), MKLDNNMemoryFormat::format_undef,
+                        "Wrong format set for Bias tensor");
+
+      PADDLE_ENFORCE_EQ(bias->dims().size(), 1,
+                        "Bias must only have 1 dimension, i.e. X");
     }
 
     std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
     std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
 
-    bool fuse_relu = ctx.Attr<bool>("fuse_relu");
+    std::string fuse_activation = ctx.Attr<std::string>("fuse_activation");
+    float fuse_alpha = ctx.Attr<float>("fuse_alpha");
+    float fuse_beta = ctx.Attr<float>("fuse_beta");
     bool fuse_residual_conn = ctx.Attr<bool>("fuse_residual_connection");
-    bool fuse_brelu = false;
-    float fuse_brelu_threshold = 6.0;
     int groups = ctx.Attr<int>("groups");
 
     bool is_conv3d = strides.size() == 3U;
-    if (!is_conv3d) {
-      fuse_brelu = ctx.Attr<bool>("fuse_brelu");
-      fuse_brelu_threshold = ctx.Attr<float>("fuse_brelu_threshold");
-    }
-    // TODO(tpatejko): add support for dilation
+
     PADDLE_ENFORCE(
         is_conv3d
             ? dilations.size() == 3 && dilations[0] == 1 && dilations[1] == 1 &&
@@ -138,22 +212,21 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     const T* input_data = input->data<T>();
     const T* filter_data = filter->data<T>();
 
-    std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
-    std::vector<int> weights_tz =
-        paddle::framework::vectorize2int(filter->dims());
+    auto src_tz = paddle::framework::vectorize<int>(input->dims());
+    auto weights_tz = paddle::framework::vectorize<int>(filter->dims());
     int g = std::max(groups, 1);
-    GetWeightsTz(weights_tz, g, is_conv3d);
-    std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
+    GetWeightsTz(weights_tz, g);
+    auto dst_tz = paddle::framework::vectorize<int>(output->dims());
 
     // Get unique name for storing MKLDNN primitives
-    const std::string key = platform::ConvMKLDNNHandler::GetHash(
-        src_tz, weights_tz, fuse_relu, fuse_brelu, strides, paddings, dilations,
+    const std::string key = platform::CreateKey(
+        src_tz, weights_tz, fuse_activation, strides, paddings, dilations,
         groups, ctx.op().Input("Input") + ctx.op().Input("Filter"));
 
     std::vector<primitive> pipeline;
 
     auto src_format = input->format();
-    mkldnn::memory::format weights_format =
+    MKLDNNMemoryFormat weights_format =
         GetWeightsFormat(filter->format(), g, is_conv3d);
 
     auto user_src_md = platform::MKLDNNMemDesc(
@@ -169,9 +242,9 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto chosen_memory_format =
         platform::data_format_to_memory_format(data_format);
 
-    weights_format = mkldnn::memory::format::any;
+    weights_format = MKLDNNMemoryFormat::any;
     // Check the format for user's special output
-    if (chosen_memory_format != mkldnn::memory::format::any) {
+    if (chosen_memory_format != MKLDNNMemoryFormat::any) {
       if (is_conv3d) {
         chosen_memory_format =
             platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format);
@@ -182,8 +255,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         src_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
     auto weights_md = platform::MKLDNNMemDesc(
         weights_tz, platform::MKLDNNGetDataType<T>(), weights_format);
-    std::vector<int> bias_tz;  // TODO(mgallus): avoid empty vector creation.
-                               // Currently used whenever bias is != nullptr.
+    std::vector<int> bias_tz;
     auto dst_md = platform::MKLDNNMemDesc(
         dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
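The descriptors above use MKLDNNMemoryFormat::any unless the user pinned a layout, and the handler call that follows receives the generic (fuse_activation, fuse_alpha, fuse_beta) triple in place of the old relu/brelu flags. Inside the handler that triple presumably becomes MKL-DNN post-ops, much like the CreatePostOps helper this patch deletes further down; a sketch of that mapping, not the handler's actual implementation:

#include <string>
#include "mkldnn.hpp"

// Sketch only: lowering a generic activation triple to post-ops. The real
// lowering lives in platform::ConvMKLDNNHandler and may differ in detail.
mkldnn::primitive_attr MakeConvAttrs(const std::string& fuse_activation,
                                     float fuse_alpha, float fuse_beta,
                                     bool fuse_residual_conn, float sum_scale) {
  mkldnn::primitive_attr attr;
  mkldnn::post_ops ops;
  if (fuse_residual_conn) ops.append_sum(sum_scale);  // dst += sum_scale * dst0
  if (fuse_activation == "relu") {
    ops.append_eltwise(1.0f, mkldnn::algorithm::eltwise_relu, fuse_alpha,
                       fuse_beta);
  } else if (fuse_activation == "relu6") {
    // fuse_alpha carries the clipping threshold (the old brelu threshold).
    ops.append_eltwise(1.0f, mkldnn::algorithm::eltwise_bounded_relu,
                       fuse_alpha, fuse_beta);
  }
  attr.set_post_ops(ops);
  return attr;
}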
NCHW or NCDHW"); + + PADDLE_ENFORCE_GE( + filter->dims().size(), 4, + "Filter must be with 4 or 5 dimensions, i.e. OIHW or OIDHW"); + PADDLE_ENFORCE_LE( + filter->dims().size(), 5, + "Filter must be with 4 or 5 dimensions, i.e. OIHW or OIDHW"); + if (bias) { - PADDLE_ENFORCE(bias->layout() == DataLayout::kMKLDNN && - bias->format() != memory::format::format_undef, - "Wrong layout/format set for Bias tensor"); - PADDLE_ENFORCE(bias->dims().size() == 1, - "Bias must only have 1 dimension, i.e. X"); + PADDLE_ENFORCE_EQ(bias->layout(), DataLayout::kMKLDNN, + "Wrong layout set for Bias tensor"); + PADDLE_ENFORCE_NE(bias->format(), MKLDNNMemoryFormat::format_undef, + "Wrong format set for Bias tensor"); + + PADDLE_ENFORCE_EQ(bias->dims().size(), 1, + "Bias must only have 1 dimension, i.e. X"); } std::vector strides = ctx.Attr>("strides"); std::vector paddings = ctx.Attr>("paddings"); std::vector dilations = ctx.Attr>("dilations"); int groups = ctx.Attr("groups"); - bool fuse_relu = ctx.Attr("fuse_relu"); + std::string fuse_activation = ctx.Attr("fuse_activation"); + float fuse_alpha = ctx.Attr("fuse_alpha"); + float fuse_beta = ctx.Attr("fuse_beta"); bool fuse_residual_conn = ctx.Attr("fuse_residual_connection"); - bool fuse_brelu = ctx.Attr("fuse_brelu"); - float fuse_brelu_threshold = ctx.Attr("fuse_brelu_threshold"); bool force_fp32_output = ctx.Attr("force_fp32_output"); - bool unsigned_output = fuse_relu || fuse_brelu; - if (fuse_residual_conn) { - PADDLE_ENFORCE(force_fp32_output != true, - "residual fusion does not support force output with fp32"); - } + bool unsigned_output = + (fuse_activation == "relu" || fuse_activation == "relu6"); + auto scale_in_data = ctx.Attr("Scale_in"); + auto scale_in_eltwise_data = ctx.Attr("Scale_in_eltwise"); + auto scale_weights_data = ctx.Attr>("Scale_weights"); + auto scale_out_data = + force_fp32_output ? 1.0f : ctx.Attr("Scale_out"); + + PADDLE_ENFORCE(!fuse_residual_conn || !force_fp32_output, + "residual fusion does not support force output with fp32"); + bool is_conv3d = strides.size() == 3U; - // TODO(tpatejko): add support for dilation PADDLE_ENFORCE( is_conv3d ? dilations.size() == 3 && dilations[0] == 1 && dilations[1] == 1 && @@ -341,370 +435,166 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { : dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, "dilation in convolution is not implemented yet"); - PADDLE_ENFORCE(is_conv3d != true, "int8 does not support conv3d currently"); + PADDLE_ENFORCE_NE(is_conv3d, true, + "int8 does not support conv3d currently"); const T* input_data = input->data(); - std::vector src_tz = paddle::framework::vectorize2int(input->dims()); - std::vector weights_tz = - paddle::framework::vectorize2int(filter->dims()); + auto src_tz = paddle::framework::vectorize(input->dims()); + auto weights_tz = paddle::framework::vectorize(filter->dims()); int g = std::max(groups, 1); - - GetWeightsTz(weights_tz, g, is_conv3d); - std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + GetWeightsTz(weights_tz, g); + auto dst_tz = paddle::framework::vectorize(output->dims()); mkldnn::memory::data_type src_dt = paddle::framework::ToMKLDNNDataType(input->type()); - auto dst_dt = unsigned_output - ? 
paddle::framework::ToMKLDNNDataType( - framework::DataTypeTrait::DataType()) - : paddle::framework::ToMKLDNNDataType( - framework::DataTypeTrait::DataType()); - - if (force_fp32_output) { - dst_dt = paddle::framework::ToMKLDNNDataType( - framework::DataTypeTrait::DataType()); - } - - if (fuse_residual_conn) { - auto residual = ctx.Input("ResidualData"); - auto residual_dt = paddle::framework::ToMKLDNNDataType(residual->type()); - if (dst_dt != residual_dt) dst_dt = residual_dt; - } - - // Get unique name for storing MKLDNN primitives - std::string key; - key.reserve(MaxKeyLength); - platform::ConvMKLDNNHandler::AppendKey( - &key, src_tz, weights_tz, strides, paddings, dilations, groups, src_dt, - input->format(), fuse_relu, fuse_residual_conn, fuse_brelu, + std::string key = platform::CreateKey( + src_tz, weights_tz, strides, paddings, dilations, groups, src_dt, + input->format(), fuse_activation, fuse_residual_conn, ctx.op().Input("Input") + ctx.op().Input("Filter")); - const std::string key_conv_pd = key + "@conv_pd"; - - bool need_s8_to_u8 = false; std::shared_ptr conv_p; std::shared_ptr src_memory_p; std::shared_ptr user_src_memory_p; - std::shared_ptr dst_memory_p; std::vector pipeline; std::shared_ptr conv_pd; - std::shared_ptr handler; - - auto prim_key = key + "@conv_p"; - auto dst_key = key + "@dst_mem_p"; - auto src_key = key + "@src_mem_p"; - auto user_src_key = key + "@user_src_mem_p"; - auto src_reorder_key = key + "@src_mem_preorder_p"; - auto residual_reorder_key = key + "@residual_data_mem_preorder_p"; - - conv_p = std::static_pointer_cast( - dev_ctx.GetBlob(prim_key)); - - if (conv_p == nullptr || !is_test) { - const K* filter_data = filter->data(); - auto scale_in_data = ctx.Attr("Scale_in"); - auto scale_in_eltwise_data = ctx.Attr("Scale_in_eltwise"); - auto scale_weights_data = ctx.Attr>("Scale_weights"); - auto scale_out_data = - force_fp32_output ? 1.0f : ctx.Attr("Scale_out"); - float sum_scale = - fuse_residual_conn ? scale_out_data / scale_in_eltwise_data : 1.0f; - - bool is_multi_channel = scale_weights_data.size() > 1; - - int count = is_multi_channel ? (g > 1 ? (weights_tz)[1] * (weights_tz)[0] - : (weights_tz)[0]) - : 1; - std::vector output_shift_scale(count); -#pragma omp parallel for if (count > 1) - for (int i = 0; i < count; i++) { - if (scale_weights_data[i] == 0.0) - output_shift_scale[i] = - scale_out_data; // weights data will contain 0 - // in some models, then weights - // scale couldn't be calculated - else - output_shift_scale[i] = - static_cast(static_cast(scale_out_data) / - (static_cast(scale_in_data) * - static_cast(scale_weights_data[i]))); - } + std::shared_ptr dst_memory_p, user_residual_memory_p; - auto user_src_md = - platform::MKLDNNMemDesc({src_tz}, src_dt, input->format()); - auto user_weights_md = platform::MKLDNNMemDesc( - {weights_tz}, platform::MKLDNNGetDataType(), - ((g) == 1) ? 
mkldnn::memory::format::oihw - : mkldnn::memory::format::goihw); - - /* create memory descriptor for convolution without specified format - * ('any') which lets a primitive (convolution in this case) choose - * the memory format preferred for best performance - */ - std::string data_format = ctx.Attr("data_format"); - auto chosen_memory_format = - platform::data_format_to_memory_format(data_format); - - std::vector bias_tz; - - auto src_md = - platform::MKLDNNMemDesc(src_tz, src_dt, chosen_memory_format); - auto weights_md = platform::MKLDNNMemDesc( - weights_tz, memory::data_type::s8, chosen_memory_format); - auto dst_md = - platform::MKLDNNMemDesc(dst_tz, dst_dt, chosen_memory_format); - - // create a conv primitive descriptor and save it for usage in backward - // TODO(lidanqing): We use relu post-op instead of brelu post-op cause - // mkldnn v0.18 does not support INT8 brelu post-op. Use code in /**/ when - // v0.20 is enabled - std::shared_ptr bias_md_p; - if (bias) { - bias_tz = paddle::framework::vectorize2int(bias->dims()); - bias_md_p = std::make_shared(platform::MKLDNNMemDesc( - bias_tz, memory::data_type::s32, memory::format::x)); - } - conv_pd = ConvFwdPrimitiveDesc( - src_md, weights_md, bias_md_p, dst_md, strides, paddings, - mkldnn_engine, fuse_relu || fuse_brelu /*fuse_relu*/, - fuse_residual_conn, false /*fuse_brelu*/, fuse_brelu_threshold, - output_shift_scale, sum_scale, is_test); - // Save conv_pd/src_memory/weights_memory for backward pass - dev_ctx.SetBlob(key_conv_pd, conv_pd); - handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, - mkldnn_engine, key)); - // create mkldnn memory from input tensors (data/weights) - user_src_memory_p = - handler->AcquireSrcMemory(user_src_md, to_void_cast(input_data)); - auto user_weights_memory_p = handler->AcquireWeightsMemory( - user_weights_md, to_void_cast(filter_data)); - - // create reorder primitive if the input format is not the preferred one - src_memory_p = - handler->AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline); - - std::shared_ptr weights_memory_p; - int mask_reorder = - is_multi_channel ? ((g != 1) ? 
(1 << 1) + (1 << 0) : 1 << 0) : 0; - weights_memory_p = handler->AcquireWeightsMemoryFromPrimitive( - user_weights_memory_p, pipeline, is_test, true, scale_weights_data, - mask_reorder); - - if (fuse_residual_conn) { - auto residual_param = ctx.Input("ResidualData"); - PADDLE_ENFORCE_EQ(output->dims(), residual_param->dims(), - "Output and elementwise parameter need to have the " - "same dimension sizes"); - auto residual_dt = - paddle::framework::ToMKLDNNDataType(residual_param->type()); - if (residual_param->format() != handler->GetDstFormat()) { - auto residual_data_tz = - paddle::framework::vectorize2int(residual_param->dims()); - - auto user_residual_md = platform::MKLDNNMemDesc( - residual_data_tz, residual_dt, residual_param->format()); - - if (residual_dt == mkldnn::memory::data_type::u8) { - dst_memory_p = platform::SetDstMemory( - ctx, output, residual_param, user_residual_md, handler, - &pipeline); - } else { - need_s8_to_u8 = unsigned_output; - dst_memory_p = platform::SetDstMemory( - ctx, output, residual_param, user_residual_md, handler, - &pipeline); - } - } else { - output->ShareDataWith(*residual_param); - if (residual_dt == mkldnn::memory::data_type::u8) { - dst_memory_p = - platform::SetDstMemory(ctx, output, handler); - } else { - need_s8_to_u8 = unsigned_output; - dst_memory_p = platform::SetDstMemory(ctx, output, handler); - } - } - } else if (!force_fp32_output) { - if (unsigned_output) { - dst_memory_p = platform::SetDstMemory(ctx, output, handler); - } else { - dst_memory_p = platform::SetDstMemory(ctx, output, handler); - } - } else { - dst_memory_p = platform::SetDstMemory(ctx, output, handler); - } + const float* filter_data = filter->data(); + bool is_multi_channel = scale_weights_data.size() > 1; - // create convolution op primitive - auto scale_bias_key = key + "@scale_bias"; - if (bias) { - const K* bias_data = bias->data(); - auto user_bias_md = platform::MKLDNNMemDesc( - {bias_tz}, platform::MKLDNNGetDataType(), memory::format::x); - auto user_bias_memory_p = handler->AcquireBiasMemory( - user_bias_md, to_void_cast(bias_data)); - std::shared_ptr bias_memory_p; - int mask_reorder = is_multi_channel ? 1 << 0 : 1; - int count = - is_multi_channel - ? (g > 1 ? (weights_tz)[1] * (weights_tz)[0] : (weights_tz)[0]) - : 1; - std::vector scale_bias_data(count); -#pragma omp parallel for if (count > 1) - for (int i = 0; i < count; i++) { - scale_bias_data[i] = scale_in_data * scale_weights_data[i]; - } - bias_memory_p = handler->AcquireBiasMemoryFromPrimitive( - user_bias_memory_p, pipeline, is_test, true, scale_bias_data, - mask_reorder); - conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p, - bias_memory_p, dst_memory_p); - } else { - conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p, - dst_memory_p); - } + auto output_shift_scale = ComputeOutputShiftScale( + scale_out_data, scale_in_data, scale_weights_data); - // push primitive to stream and wait until it's executed - pipeline.push_back(*conv_p); - } else { - auto src_memory_reorder_p = std::static_pointer_cast( - dev_ctx.GetBlob(src_reorder_key)); - src_memory_p = - std::static_pointer_cast(dev_ctx.GetBlob(src_key)); - if (src_memory_reorder_p) { - user_src_memory_p = std::static_pointer_cast( - dev_ctx.GetBlob(user_src_key)); - user_src_memory_p->set_data_handle(to_void_cast(input_data)); - } else if (src_memory_p) { - src_memory_p->set_data_handle(to_void_cast(input_data)); - } + float scale_residual = + fuse_residual_conn ? 
scale_out_data / scale_in_eltwise_data : 1.0f; + auto user_src_md = + platform::MKLDNNMemDesc({src_tz}, src_dt, input->format()); + auto user_weights_md = platform::MKLDNNMemDesc( + {weights_tz}, platform::MKLDNNGetDataType(), + ((g) == 1) ? mkldnn::memory::format::oihw + : mkldnn::memory::format::goihw); - dst_memory_p = - std::static_pointer_cast(dev_ctx.GetBlob(dst_key)); - conv_pd = - std::static_pointer_cast( - dev_ctx.GetBlob(key_conv_pd)); - if (conv_pd) { - handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, - mkldnn_engine, key)); - } + /* create memory descriptor for convolution without specified format + * ('any') which lets a primitive (convolution in this case) choose + * the memory format preferred for best performance + */ + std::string data_format = ctx.Attr("data_format"); + auto chosen_memory_format = + platform::data_format_to_memory_format(data_format); - if (fuse_residual_conn) { - auto residual_param = ctx.Input("ResidualData"); - auto residual_dt = - paddle::framework::ToMKLDNNDataType(residual_param->type()); - output->ShareDataWith(*residual_param); - if (residual_dt == mkldnn::memory::data_type::u8) { - platform::SetDstMemoryHandler(ctx, output, handler, - &dst_memory_p); - } else { - need_s8_to_u8 = unsigned_output; - platform::SetDstMemoryHandler(ctx, output, handler, - &dst_memory_p); - } - } else if (!force_fp32_output) { - if (unsigned_output) { - platform::SetDstMemoryHandler(ctx, output, handler, - &dst_memory_p); - } else { - platform::SetDstMemoryHandler(ctx, output, handler, - &dst_memory_p); - } - } else { - platform::SetDstMemoryHandler(ctx, output, handler, - &dst_memory_p); - } + auto src_md = platform::MKLDNNMemDesc(src_tz, src_dt, chosen_memory_format); + auto weights_md = platform::MKLDNNMemDesc(weights_tz, memory::data_type::s8, + chosen_memory_format); + auto dst_md = platform::MKLDNNMemDesc( + dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); - if (src_memory_reorder_p) { - pipeline.push_back(*src_memory_reorder_p); - } + platform::ConvMKLDNNHandler handler(dev_ctx, mkldnn_engine, key); + auto propagation = is_test ? 
mkldnn::prop_kind::forward_scoring + : mkldnn::prop_kind::forward_training; - auto residual_reorder_p = std::static_pointer_cast( - dev_ctx.GetBlob(residual_reorder_key)); - if (residual_reorder_p) { - pipeline.push_back(*residual_reorder_p); - } + std::vector bias_tz; - pipeline.push_back(*conv_p); + if (bias) { + bias_tz = paddle::framework::vectorize(bias->dims()); + auto bias_md = platform::MKLDNNMemDesc(bias_tz, memory::data_type::s32, + mkldnn::memory::format::x); + conv_pd = handler.AcquireConvolutionPrimitiveDescriptor( + src_md, weights_md, bias_md, dst_md, strides, paddings, mkldnn_engine, + fuse_activation, fuse_alpha, fuse_beta, fuse_residual_conn, + propagation, output_shift_scale, scale_residual); + } else { + conv_pd = handler.AcquireConvolutionPrimitiveDescriptor( + src_md, weights_md, boost::none, dst_md, strides, paddings, + mkldnn_engine, fuse_activation, fuse_alpha, fuse_beta, + fuse_residual_conn, propagation, output_shift_scale, scale_residual); } - // push primitive to stream and wait until it's executed - stream(stream::kind::eager).submit(pipeline).wait(); - if (need_s8_to_u8) { - output->mutable_data(ctx.GetPlace()); - } + // create mkldnn memory from input tensors (data/weights) + user_src_memory_p = + handler.AcquireSrcMemory(user_src_md, to_void_cast(input_data)); + auto user_weights_memory_p = handler.AcquireWeightsMemory( + user_weights_md, to_void_cast(filter_data)); - output->set_layout(DataLayout::kMKLDNN); - output->set_format(GetMKLDNNFormat(*dst_memory_p)); - } + // create reorder primitive if the input format is not the preferred one + src_memory_p = + handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline); - private: - mkldnn::primitive_attr CreatePostOps( - bool fuse_relu, bool fuse_residual_conn, - const std::vector& output_shift_scale, float sum_scale, - bool fuse_brelu, float fuse_brelu_threshold) const { - mkldnn::primitive_attr conv_attr; - mkldnn::post_ops post_operations; - int mask = output_shift_scale.size() > 1 ? 
1 << 1 : 0; - conv_attr.set_output_scales(mask, output_shift_scale); + std::shared_ptr weights_memory_p; + + int mask_reorder = ComputeWeightsMask(is_multi_channel, g); + + weights_memory_p = handler.AcquireWeightsMemoryFromPrimitive( + user_weights_memory_p, pipeline, is_test, true, scale_weights_data, + mask_reorder); if (fuse_residual_conn) { - post_operations.append_sum(sum_scale); - } - if (fuse_relu) { - constexpr float scale = 1.0f; - constexpr float negative_slope = 0.0f; - constexpr float placeholder = 1.0f; // beta - post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu, - negative_slope, placeholder); - } - if (fuse_brelu) { - constexpr float scale = 1.0f; - constexpr float placeholder = 0.0f; // beta - post_operations.append_eltwise(scale, - mkldnn::algorithm::eltwise_bounded_relu, - fuse_brelu_threshold, placeholder); + auto residual_param = ctx.Input("ResidualData"); + auto residual_param_data = residual_param->data(); + PADDLE_ENFORCE_EQ(output->dims(), residual_param->dims(), + "Output and elementwise parameter need to have the " + "same dimension sizes"); + auto residual_dt = + paddle::framework::ToMKLDNNDataType(residual_param->type()); + if (residual_param->format() != handler.GetDstFormat()) { + auto residual_data_tz = + paddle::framework::vectorize(residual_param->dims()); + auto user_residual_md = platform::MKLDNNMemDesc( + residual_data_tz, residual_dt, residual_param->format()); + + user_residual_memory_p = handler.AcquireResidualDataMemory( + user_residual_md, to_void_cast(residual_param_data)); + + T_out* output_data = output->mutable_data(ctx.GetPlace()); + dst_memory_p = handler.AcquireDstMemoryFromResidualDataMemory( + user_residual_memory_p, to_void_cast(output_data), pipeline); + + } else { + output->ShareDataWith(*residual_param); + auto output_data = output->mutable_data(ctx.GetPlace()); + dst_memory_p = handler.AcquireDstMemoryFromPrimitive( + to_void_cast(output_data)); + } + } else { + T_out* output_data = output->mutable_data( + ctx.GetPlace(), handler.GetDstMemorySize()); + dst_memory_p = handler.AcquireDstMemoryFromPrimitive( + to_void_cast(output_data)); } - conv_attr.set_post_ops(post_operations); - return conv_attr; - } - std::unique_ptr - ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, - const std::shared_ptr bias_md_p, - const memory::desc& dst, const std::vector& strides, - const std::vector& paddings, - const mkldnn::engine& engine, const bool fuse_relu, - const bool fuse_residual_conn, const bool fuse_brelu, - const float fuse_brelu_threshold, - const std::vector& output_shift_scale, - const float sum_scale, bool is_test) const { - memory::dims stride_dims = {strides[0], strides[1]}; - memory::dims padding_dims = {paddings[0], paddings[1]}; + // create convolution op primitive + if (bias) { + const float* bias_data = bias->data(); + auto user_bias_md = platform::MKLDNNMemDesc( + {bias_tz}, platform::MKLDNNGetDataType(), memory::format::x); + auto user_bias_memory_p = handler.AcquireBiasMemory( + user_bias_md, to_void_cast(bias_data)); + std::shared_ptr bias_memory_p; + + auto scale_bias_data = + ComputeBiasScale(scale_in_data, scale_weights_data); + int mask_bias_reorder = ComputeBiasMask(is_multi_channel); + bias_memory_p = handler.AcquireBiasMemoryFromPrimitive( + user_bias_memory_p, pipeline, is_test, true, scale_bias_data, + mask_bias_reorder); + conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p, + bias_memory_p, dst_memory_p); + } else { + conv_p = 
 
     // create convolution op primitive
-  std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
-  ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
-                       const std::shared_ptr<memory::desc> bias_md_p,
-                       const memory::desc& dst, const std::vector<int>& strides,
-                       const std::vector<int>& paddings,
-                       const mkldnn::engine& engine, const bool fuse_relu,
-                       const bool fuse_residual_conn, const bool fuse_brelu,
-                       const float fuse_brelu_threshold,
-                       const std::vector<float>& output_shift_scale,
-                       const float sum_scale, bool is_test) const {
-    memory::dims stride_dims = {strides[0], strides[1]};
-    memory::dims padding_dims = {paddings[0], paddings[1]};
+    if (bias) {
+      const float* bias_data = bias->data<float>();
+      auto user_bias_md = platform::MKLDNNMemDesc(
+          {bias_tz}, platform::MKLDNNGetDataType<float>(), memory::format::x);
+      auto user_bias_memory_p = handler.AcquireBiasMemory(
+          user_bias_md, to_void_cast<float>(bias_data));
+      std::shared_ptr<mkldnn::memory> bias_memory_p;
+
+      auto scale_bias_data =
+          ComputeBiasScale(scale_in_data, scale_weights_data);
+      int mask_bias_reorder = ComputeBiasMask(is_multi_channel);
+      bias_memory_p = handler.AcquireBiasMemoryFromPrimitive(
+          user_bias_memory_p, pipeline, is_test, true, scale_bias_data,
+          mask_bias_reorder);
+      conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p,
+                                          bias_memory_p, dst_memory_p);
+    } else {
+      conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p,
+                                          dst_memory_p);
+    }
+
     // push primitive to stream and wait until it's executed
     pipeline.push_back(*conv_p);
 
-    auto propagation = is_test ? mkldnn::prop_kind::forward_scoring
-                               : mkldnn::prop_kind::forward_training;
-    auto conv_desc =
-        (bias_md_p != nullptr)
-            ? mkldnn::convolution_forward::desc(
-                  propagation, mkldnn::convolution_direct, src, weights,
-                  (*bias_md_p), dst, stride_dims, padding_dims, padding_dims,
-                  mkldnn::padding_kind::zero)
-            : mkldnn::convolution_forward::desc(
-                  propagation, mkldnn::convolution_direct, src, weights, dst,
-                  stride_dims, padding_dims, padding_dims,
-                  mkldnn::padding_kind::zero);
-
-    mkldnn::primitive_attr conv_attr =
-        CreatePostOps(fuse_relu, fuse_residual_conn, output_shift_scale,
-                      sum_scale, fuse_brelu, fuse_brelu_threshold);
-
-    auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
-        conv_desc, conv_attr, engine);
-
-    return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
-        p_conv_pd);
+    // push primitive to stream and wait until it's executed
+    stream(stream::kind::eager).submit(pipeline).wait();
+    if (platform::MKLDNNGetDataType<T_out>() == memory::data_type::s8 &&
+        unsigned_output) {
+      output->mutable_data<uint8_t>(ctx.GetPlace());
+    }
+    output->set_layout(DataLayout::kMKLDNN);
+    output->set_format(GetMKLDNNFormat(*dst_memory_p));
   }
 };
 
@@ -726,18 +616,23 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
     Tensor* filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
 
-    PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN &&
-                       input->format() != memory::format::format_undef,
-                   "Wrong layout/format set for Input tensor");
-    PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN &&
-                       filter->format() != memory::format::format_undef,
-                   "Wrong layout/format set for Filter tensor");
-    PADDLE_ENFORCE(output_grad->layout() == DataLayout::kMKLDNN &&
-                       output_grad->format() != memory::format::format_undef,
-                   "Wrong layout/format set for output_grad tensor");
+    PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN,
+                      "Wrong layout set for Input tensor");
+    PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::format_undef,
+                      "Wrong format set for Input tensor");
 
-    PADDLE_ENFORCE(
-        !ctx.Attr<bool>("is_test"),
+    PADDLE_ENFORCE_EQ(filter->layout(), DataLayout::kMKLDNN,
+                      "Wrong layout set for Filter tensor");
+    PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::format_undef,
+                      "Wrong format set for Filter tensor");
+
+    PADDLE_ENFORCE_EQ(output_grad->layout(), DataLayout::kMKLDNN,
+                      "Wrong layout set for output_grad tensor");
+    PADDLE_ENFORCE_NE(output_grad->format(), MKLDNNMemoryFormat::format_undef,
+                      "Wrong format set for output_grad tensor");
+
+    PADDLE_ENFORCE_EQ(
+        ctx.Attr<bool>("is_test"), false,
         "is_test attribute should be set to False in training phase.");
 
     if (!input_grad && !filter_grad) return;
@@ -754,28 +649,21 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     T* input_grad_data = nullptr;
     T* filter_grad_data = nullptr;
 
-    std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
-    std::vector<int> weights_tz =
-        paddle::framework::vectorize2int(filter->dims());
+    auto src_tz = paddle::framework::vectorize<int>(input->dims());
+    auto weights_tz = paddle::framework::vectorize<int>(filter->dims());
     int g = std::max(groups, 1);
-    GetWeightsTz(weights_tz, g, is_conv3d);
-    std::vector<int> dst_tz =
-        paddle::framework::vectorize2int(output_grad->dims());
-    bool fuse_relu = ctx.Attr<bool>("fuse_relu");
-    bool fuse_brelu = false;
-    if 
(!is_conv3d) { - fuse_brelu = ctx.Attr("fuse_brelu"); - } + GetWeightsTz(weights_tz, g); + auto dst_tz = paddle::framework::vectorize(output_grad->dims()); auto src_format = input->format(); - mkldnn::memory::format weights_format = + MKLDNNMemoryFormat weights_format = GetWeightsFormat(filter->format(), g, is_conv3d); // Get an unique name from "argument" name of "input" and "Filter" variable // as well as attributes of primitive to be created // This name will be used as key when saving info into device context - const std::string key = platform::ConvMKLDNNHandler::GetHash( - src_tz, weights_tz, fuse_relu, fuse_brelu, strides, paddings, dilations, - groups, ctx.op().Input("Input") + ctx.op().Input("Filter")); + const std::string key = platform::CreateKey( + src_tz, weights_tz, "", strides, paddings, dilations, groups, + ctx.op().Input("Input") + ctx.op().Input("Filter")); const std::string key_conv_pd = key + "@conv_pd"; std::vector pipeline; @@ -796,9 +684,9 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto chosen_memory_format = platform::data_format_to_memory_format(data_format); - weights_format = mkldnn::memory::format::any; + weights_format = MKLDNNMemoryFormat::any; // Check the format for user's special output - if (chosen_memory_format != mkldnn::memory::format::any) { + if (chosen_memory_format != MKLDNNMemoryFormat::any) { if (is_conv3d) { chosen_memory_format = platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format); @@ -902,7 +790,6 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { stream(stream::kind::eager).submit(pipeline).wait(); } }; - } // namespace operators } // namespace paddle @@ -911,17 +798,17 @@ namespace ops = paddle::operators; REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, ::paddle::platform::CPUPlace, FP32, ops::kConvMKLDNNFP32, - ops::ConvMKLDNNOpKernel); + ops::ConvMKLDNNOpKernel); REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, ::paddle::platform::CPUPlace, U8, ops::kConvMKLDNNINT8, - ops::ConvMKLDNNOpKernel); + ops::ConvMKLDNNOpKernel); REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, ::paddle::platform::CPUPlace, S8, ops::kConvMKLDNNINT8, - ops::ConvMKLDNNOpKernel); + ops::ConvMKLDNNOpKernel); REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d_grad, MKLDNN, ::paddle::platform::CPUPlace, FP32, @@ -931,7 +818,7 @@ REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d_grad, MKLDNN, REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d, MKLDNN, ::paddle::platform::CPUPlace, FP32, ops::kConvMKLDNNFP32, - ops::ConvMKLDNNOpKernel); + ops::ConvMKLDNNOpKernel); REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d_grad, MKLDNN, ::paddle::platform::CPUPlace, FP32, diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc index 6d5982ab..84240d30 100644 --- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc @@ -45,23 +45,29 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel { auto* bias = ctx.HasInput("Bias") ? 
ctx.Input("Bias") : nullptr; auto* output = ctx.Output("Output"); - PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN && - input->format() != mkldnn::memory::format::format_undef, - "Wrong layout/format set for Input tensor"); - PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN && - filter->format() != mkldnn::memory::format::format_undef, - "Wrong layout/format set for Filter tensor"); - PADDLE_ENFORCE(input->dims().size() == 4, - "Input must be with 4 dimensions, i.e. NCHW"); - PADDLE_ENFORCE(filter->dims().size() == 4, - "Filter must be with 4 dimensions, i.e. OIHW"); + PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN, + "Wrong layout set for Input tensor"); + PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::format_undef, + "Wrong format set for Input tensor"); + + PADDLE_ENFORCE_EQ(filter->layout(), DataLayout::kMKLDNN, + "Wrong layout set for Filter tensor"); + PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::format_undef, + "Wrong format set for Filter tensor"); + + PADDLE_ENFORCE_EQ(input->dims().size(), 4, + "Input must be with 4 dimensions, i.e. NCHW"); + PADDLE_ENFORCE_EQ(filter->dims().size(), 4, + "Filter must be with 4 dimensions, i.e. OIHW"); if (bias) { - PADDLE_ENFORCE(bias->layout() == DataLayout::kMKLDNN && - bias->format() != mkldnn::memory::format::format_undef, - "Wrong layout/format set for Bias tensor"); - PADDLE_ENFORCE(bias->dims().size() == 1, - "Bias must only have 1 dimension, i.e. X"); + PADDLE_ENFORCE_EQ(bias->layout(), DataLayout::kMKLDNN, + "Wrong layout set for Bias tensor"); + PADDLE_ENFORCE_NE(bias->format(), MKLDNNMemoryFormat::format_undef, + "Wrong format set for Bias tensor"); + + PADDLE_ENFORCE_EQ(bias->dims().size(), 1, + "Bias must only have 1 dimension, i.e. X"); } std::vector strides = ctx.Attr>("strides"); @@ -69,7 +75,6 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector dilations = ctx.Attr>("dilations"); int groups = ctx.Attr("groups"); - // TODO(tpatejko): add support for dilation PADDLE_ENFORCE( dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, "dilation in convolution is not implemented yet"); @@ -77,10 +82,10 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel { const T* input_data = input->data(); const T* filter_data = filter->data(); - std::vector src_tz = paddle::framework::vectorize2int(input->dims()); - std::vector iohw_weights_tz = - paddle::framework::vectorize2int(filter->dims()); - std::vector weights_tz = iohw_weights_tz; + auto src_tz = paddle::framework::vectorize(input->dims()); + auto iohw_weights_tz = paddle::framework::vectorize(filter->dims()); + auto weights_tz = iohw_weights_tz; + // IOHW -> OIHW weights_tz[0] = iohw_weights_tz[1]; weights_tz[1] = iohw_weights_tz[0]; @@ -119,21 +124,20 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel { weights_tz[3] = h; weights_tz[4] = w; } - std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + auto dst_tz = paddle::framework::vectorize(output->dims()); // Get unique name for storing MKLDNN primitives - const std::string key = platform::ConvTransposeMKLDNNHandler::GetHash( - src_tz, weights_tz, strides, paddings, dilations, groups, - ctx.op().Output("Output")); + const std::string key = + platform::CreateKey(src_tz, weights_tz, strides, paddings, dilations, + groups, ctx.op().Output("Output")); std::vector pipeline; auto user_src_md = platform::MKLDNNMemDesc( {src_tz}, platform::MKLDNNGetDataType(), input->format()); - auto user_weights_md = 
- platform::MKLDNNMemDesc({weights_tz}, platform::MKLDNNGetDataType(), - (g == 1) ? mkldnn::memory::format::oihw - : mkldnn::memory::format::goihw); + auto user_weights_md = platform::MKLDNNMemDesc( + {weights_tz}, platform::MKLDNNGetDataType(), + (g == 1) ? MKLDNNMemoryFormat::oihw : MKLDNNMemoryFormat::goihw); /* create memory descriptor for convolution without specified format * ('any') which lets a primitive (convolution in this case) choose @@ -142,14 +146,15 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel { std::string data_format = ctx.Attr("data_format"); auto chosen_memory_format = platform::data_format_to_memory_format(data_format); - bool fuse_relu = ctx.Attr("fuse_relu"); + std::string fuse_activation = ctx.Attr("fuse_activation"); + float fuse_alpha = ctx.Attr("fuse_alpha"); + float fuse_beta = ctx.Attr("fuse_beta"); auto src_md = platform::MKLDNNMemDesc( src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); auto weights_md = platform::MKLDNNMemDesc( weights_tz, platform::MKLDNNGetDataType(), chosen_memory_format); - std::vector bias_tz; // TODO(mgallus): avoid empty vector creation. - // Currently used whenever bias is != nullptr. + std::vector bias_tz; auto dst_md = platform::MKLDNNMemDesc( dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); @@ -161,16 +166,17 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel { auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference : mkldnn::prop_kind::forward_training; if (bias) { - bias_tz = paddle::framework::vectorize2int(bias->dims()); + bias_tz = paddle::framework::vectorize(bias->dims()); auto bias_md = platform::MKLDNNMemDesc( - bias_tz, platform::MKLDNNGetDataType(), mkldnn::memory::format::x); + bias_tz, platform::MKLDNNGetDataType(), MKLDNNMemoryFormat::x); conv_transpose_pd = handler.AcquireConvolutionPrimitiveDescriptor( src_md, weights_md, bias_md, dst_md, strides, paddings, mkldnn_engine, - fuse_relu, false, false, 0.0, fwd_prop_kind); + fuse_activation, fuse_alpha, fuse_beta, false, fwd_prop_kind); } else { conv_transpose_pd = handler.AcquireConvolutionPrimitiveDescriptor( src_md, weights_md, boost::none, dst_md, strides, paddings, - mkldnn_engine, fuse_relu, false, false, 0.0, fwd_prop_kind); + mkldnn_engine, fuse_activation, fuse_alpha, fuse_beta, false, + fwd_prop_kind); } // create mkldnn memory from input tensors (data/weights) @@ -197,9 +203,8 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel { std::shared_ptr conv_p; if (bias) { const T* bias_data = bias->data(); - auto user_bias_md = - platform::MKLDNNMemDesc({bias_tz}, platform::MKLDNNGetDataType(), - mkldnn::memory::format::x); + auto user_bias_md = platform::MKLDNNMemDesc( + {bias_tz}, platform::MKLDNNGetDataType(), MKLDNNMemoryFormat::x); auto user_bias_memory_p = handler.AcquireBiasMemory( user_bias_md, platform::to_void_cast(bias_data)); diff --git a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc index accc9a9d..b74e7127 100644 --- a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc @@ -31,18 +31,6 @@ using framework::DataLayout; using mkldnn::stream; using platform::GetMKLDNNFormat; -std::string CreateKey(const paddle::framework::ExecutionContext& ctx, - const mkldnn::memory::data_type& src_dt, - const std::vector& src_tz, const float scale_data) { - std::string key; - key.reserve(platform::MKLDNNHandler::MaxKeyLength); - 
platform::MKLDNNHandler::AppendKey(&key, std::to_string(src_dt)); - platform::MKLDNNHandler::AppendKeyDims(&key, src_tz); - platform::MKLDNNHandler::AppendKey(&key, std::to_string(scale_data)); - platform::MKLDNNHandler::AppendKey(&key, ctx.op().Output("Output")); - return key; -} - template class DeQuantOpKernel : public framework::OpKernel { public: @@ -59,12 +47,13 @@ class DeQuantOpKernel : public framework::OpKernel { std::vector reorder_scale = {1.0f / scale_data}; std::vector pipeline; - std::vector src_tz = paddle::framework::vectorize2int(input->dims()); - std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + auto src_tz = paddle::framework::vectorize(input->dims()); + auto dst_tz = paddle::framework::vectorize(output->dims()); mkldnn::memory::data_type src_dt = paddle::framework::ToMKLDNNDataType(input->type()); - mkldnn::memory::format src_fmt = input->format(); - std::string key = CreateKey(ctx, src_dt, src_tz, reorder_scale[0]); + MKLDNNMemoryFormat src_fmt = input->format(); + std::string key = platform::CreateKey(src_dt, src_tz, reorder_scale[0], + ctx.op().Output("Output")); const std::string key_prim = key + "@reorder_p"; const std::string key_src_mem = key + "@src_mem"; const std::string key_dst_mem = key + "@dst_mem"; @@ -87,7 +76,7 @@ class DeQuantOpKernel : public framework::OpKernel { std::shared_ptr(new primitive::at(*src_memory)); auto dst_md = platform::MKLDNNMemDesc({dst_tz}, memory::data_type::f32, - memory::format::nchw); + MKLDNNMemoryFormat::nchw); auto dst_pd = mkldnn::memory::primitive_desc(dst_md, engine); dst_memory = std::make_shared( dst_pd, to_void_cast(output_data)); diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index b525eaac..01837cfe 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -59,7 +59,7 @@ class FCPrimitiveFactory { weights_ = CreateFourDimWeightsMemory(input, weights); } - auto dst_desc = CreateMemDescriptor(output, memory::format::any); + auto dst_desc = CreateMemDescriptor(output, MKLDNNMemoryFormat::any); fc_ = CreateFcPrimitive(*input_, *weights_, dst_desc, bias, output, ctx); return *fc_; @@ -70,14 +70,14 @@ class FCPrimitiveFactory { const Tensor* in) { input_->set_data_handle(const_cast(in->data())); output_->set_data_handle(out->mutable_data(ctx.GetPlace())); - if (out->format() == memory::format::format_undef) { + if (out->format() == MKLDNNMemoryFormat::format_undef) { auto output_format = output_->get_primitive_desc().desc().data.format; - out->set_format((memory::format)output_format); + out->set_format((MKLDNNMemoryFormat)output_format); } } - memory::format MatchWeightFormat(memory::format fmt) { - using format = memory::format; + MKLDNNMemoryFormat MatchWeightFormat(MKLDNNMemoryFormat fmt) { + using format = MKLDNNMemoryFormat; switch (fmt) { case format::nChw16c: return format::oIhw16i; @@ -102,14 +102,14 @@ class FCPrimitiveFactory { } static mkldnn::memory::desc CreateMemDescriptor(const std::vector& dims, - memory::format format) { + MKLDNNMemoryFormat format) { return platform::MKLDNNMemDesc(dims, platform::MKLDNNGetDataType(), format); } static mkldnn::memory::desc CreateMemDescriptor(const Tensor* tensor, - memory::format format) { - auto dims = framework::vectorize2int(tensor->dims()); + MKLDNNMemoryFormat format) { + auto dims = framework::vectorize(tensor->dims()); return CreateMemDescriptor(dims, format); } @@ -124,10 +124,10 @@ class FCPrimitiveFactory { } mkldnn::memory 
TransposeWeights(const Tensor* weights) { - auto dims = framework::vectorize2int(weights->dims()); + auto dims = framework::vectorize(weights->dims()); std::swap(dims[0], dims[1]); // Correct output dimensions - auto src_desc = CreateMemDescriptor(dims, memory::format::io); - auto dst_desc = CreateMemDescriptor(dims, memory::format::oi); + auto src_desc = CreateMemDescriptor(dims, MKLDNNMemoryFormat::io); + auto dst_desc = CreateMemDescriptor(dims, MKLDNNMemoryFormat::oi); return Reorder(src_desc, dst_desc, weights->data()); } @@ -182,12 +182,12 @@ class FCPrimitiveFactory { mkldnn::memory CreateFourDimWeightsMemory(const Tensor* input, const Tensor* weights) { - auto input_dims = framework::vectorize2int(input->dims()); - auto weight_dims = framework::vectorize2int(weights->dims()); + auto input_dims = framework::vectorize(input->dims()); + auto weight_dims = framework::vectorize(weights->dims()); auto dims = {weight_dims[1], input_dims[1], input_dims[2], input_dims[3]}; auto dst_format = MatchWeightFormat(input->format()); - auto src_desc = CreateMemDescriptor(dims, memory::format::oihw); + auto src_desc = CreateMemDescriptor(dims, MKLDNNMemoryFormat::oihw); auto dst_desc = CreateMemDescriptor(dims, dst_format); return Reorder(src_desc, dst_desc, weights_->get_data_handle()); @@ -199,7 +199,7 @@ class FCPrimitiveFactory { auto dst_prim_desc = fc_prim_desc.dst_primitive_desc(); auto buffer_size = dst_prim_desc.get_size(); T* output_data = output->mutable_data(ctx.GetPlace(), buffer_size); - output->set_format((memory::format)dst_prim_desc.desc().data.format); + output->set_format((MKLDNNMemoryFormat)dst_prim_desc.desc().data.format); return memory(dst_prim_desc, to_void_cast(output_data)); } @@ -221,25 +221,14 @@ class FCPrimitiveFactory { boost::optional fc_; }; -static std::string GetHash(const Tensor* input, const Tensor* weights, - const std::string& suffix) { - auto dim2str = [](const DDim& operand_dims) { - std::string str = ""; - for (size_t i = 0; i < operand_dims.size(); ++i) { - str += std::to_string(operand_dims[i]) + "-"; - } - return str; - }; - return std::to_string((unsigned)input->format()) + dim2str(weights->dims()) + - suffix; -} - template std::shared_ptr> GetPrimitiveFactory( const MKLDNNDeviceContext& dev_ctx, const ExecutionContext& ctx, const Tensor* input, const Tensor* weights, const mkldnn::engine& mkldnn_engine) { - const std::string key = GetHash(input, weights, ctx.op().Output("Out")); + const std::string key = platform::CreateKey( + input->format(), framework::vectorize(weights->dims()), + ctx.op().Output("Out")); auto prim_creator = std::static_pointer_cast>(dev_ctx.GetBlob(key)); diff --git a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc index 76b00b39..d992765c 100644 --- a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc @@ -40,8 +40,6 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel { data[i] = dist(engine); } - // The format of output is set as the mkldnn's format - // TODO(@mozga-intel) The format of matrix sets inside the another layers. 
tensor->set_layout(DataLayout::kMKLDNN); tensor->set_format(mkldnn::memory::format::oihw); } diff --git a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc index 97ffb385..fe1ead8f 100644 --- a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc @@ -32,16 +32,11 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { "MKLDNN LRN must use CPUPlace."); auto& dev_ctx = ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); auto x = ctx.Input("X"); auto out = ctx.Output("Out"); auto mid = ctx.Output("MidOut"); - auto input_data = x->data(); - auto output_data = out->mutable_data(ctx.GetPlace()); - mid->mutable_data(ctx.GetPlace()); - const int n = ctx.Attr("n"); // MKL-DNN implements LRN in a caffe way: // http://caffe.berkeleyvision.org/tutorial/layers/lrn.html @@ -52,31 +47,32 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { const float alpha = ctx.Attr("alpha") * static_cast(n); const float beta = ctx.Attr("beta"); const float k = ctx.Attr("k"); - - auto e_mid = framework::EigenTensor::From(*mid); - e_mid = e_mid.constant(k); - - auto dims = paddle::framework::vectorize2int(x->dims()); - - // Format and dims are assumed to be the same for dst and src - auto md = paddle::platform::MKLDNNMemDesc( - dims, platform::MKLDNNGetDataType(), x->format()); - - const std::string key = platform::LRNMKLDNNHandler::GetHash( - dims, n, alpha, beta, k, x->format(), ctx.op().Output("Out")); - - platform::LRNMKLDNNHandler handler(ctx.Attr("is_test"), dev_ctx, - mkldnn_engine, key); - auto src_memory = - handler.AcquireSrcMemory(md, platform::to_void_cast(input_data)); - - // TODO(jczaja): Hide getting PD inside of handler for all Acquire API - handler.AcquireLRNPrimitiveDescriptor(md, n, alpha, beta, k); - - auto dst_memory = - handler.AcquireDstMemory(md, platform::to_void_cast(output_data)); - - auto lrn_p = handler.AcquireLRN(dst_memory, src_memory); + bool is_test = ctx.Attr("is_test"); + + auto dims = paddle::framework::vectorize(x->dims()); + + platform::LRNMKLDNNHandler handler(dims, n, alpha, beta, k, x->format(), + is_test, dev_ctx, ctx.GetPlace(), + ctx.op().Output("Out")); + + auto src_memory = handler.AcquireSrcMemory(x); + auto dst_memory = handler.AcquireDstMemory(out); + + std::shared_ptr workspace_memory; + std::shared_ptr lrn_p; + if (is_test == false) { + workspace_memory = handler.AcquireWorkspaceMemory(mid); + lrn_p = handler.AcquireForwardPrimitive(*src_memory, *workspace_memory, + *dst_memory); + } else { + // mid has to be allocated and filled + // k to pass LRN unit tests + // TODO(jczaja): Disable checking mid in unit tests (Require API change) + mid->mutable_data(ctx.GetPlace()); + auto e_mid = framework::EigenTensor::From(*mid); + e_mid = e_mid.constant(k); + lrn_p = handler.AcquireForwardPrimitive(*src_memory, *dst_memory); + } std::vector pipeline = {*lrn_p}; mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); @@ -104,6 +100,7 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { "is_test attribute should be set to False in training phase."); auto x = ctx.Input("X"); + auto mid = ctx.Input("MidOut"); auto out_grad = ctx.Input(framework::GradVarName("Out")); auto x_grad = ctx.Output(framework::GradVarName("X")); @@ -114,42 +111,20 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { const float k = ctx.Attr("k"); auto& dev_ctx = ctx.template device_context(); - const auto& mkldnn_engine 
= dev_ctx.GetEngine();
-
-    auto x_grad_data = x_grad->mutable_data<T>(ctx.GetPlace());
-    auto out_grad_data = out_grad->data<T>();
-
-    auto dims = paddle::framework::vectorize2int(x->dims());
-
-    const std::string key = platform::LRNMKLDNNHandler::GetHash(
-        dims, n, alpha, beta, k, x->format(), ctx.op().Input("Out"));
-
-    platform::LRNMKLDNNHandler handler(false, dev_ctx, mkldnn_engine, key);
-
-    auto src_md = paddle::platform::MKLDNNMemDesc(
-        dims, platform::MKLDNNGetDataType<T>(), x->format());
-
-    // diff_dst and diff_src layouts are assumed to be the same
-    auto diff_md = paddle::platform::MKLDNNMemDesc(
-        dims, platform::MKLDNNGetDataType<T>(), out_grad->format());
-
-    auto workspace = handler.AcquireWorkspaceMemory();
-
-    auto diff_dst_memory = handler.AcquireDiffDstMemory(
-        diff_md, platform::to_void_cast<T>(out_grad_data));
-    auto diff_src_memory = handler.AcquireDiffSrcMemory(
-        diff_md, platform::to_void_cast<T>(x_grad_data));
+    auto dims = paddle::framework::vectorize<int>(x->dims());
 
-    auto src_memory = handler.AcquireSrcMemory(
-        src_md, platform::to_void_cast<T>(x->data<T>()));
+    platform::LRNMKLDNNHandler<T> handler(
+        dims, n, alpha, beta, k, x->format(), out_grad->format(), dev_ctx,
+        ctx.GetPlace(), ctx.op().Input("Out"));
 
-    // TODO(jczaja): Hide this call inside Handler
-    handler.AcquireLRNBackwardPrimitiveDescriptor(src_md, diff_md, n, alpha,
-                                                  beta, k);
+    auto src_memory = handler.AcquireSrcMemory(x);
+    auto workspace = handler.AcquireBackwardWorkspaceMemory(mid);
+    auto diff_dst_memory = handler.AcquireDiffDstMemory(out_grad);
+    auto diff_src_memory = handler.AcquireDiffSrcMemory(x_grad);
 
-    auto lrn_bwd = handler.AcquireLRNBackward(src_memory, diff_dst_memory,
-                                              workspace, diff_src_memory);
+    auto lrn_bwd = handler.AcquireBackwardPrimitive(
+        *src_memory, *diff_dst_memory, *workspace, *diff_src_memory);
 
     std::vector<mkldnn::primitive> pipeline = {*lrn_bwd};
     mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
diff --git a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc
index 4819bb30..5c635e58 100644
--- a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc
@@ -62,10 +62,10 @@ class MulPrimitiveFactory {
       return *mul_;
     }
 
-    auto src_desc = CreateMemDescriptor<XT>(&x_matrix, memory::format::nc);
+    auto src_desc = CreateMemDescriptor<XT>(&x_matrix, MKLDNNMemoryFormat::nc);
     x_input_ = CreateMemory(src_desc, &x_matrix);
     y_input_ = TransposeInputY(&y_matrix);
-    auto dst_desc = CreateMemDescriptor<OT>(output, memory::format::any);
+    auto dst_desc = CreateMemDescriptor<OT>(output, MKLDNNMemoryFormat::any);
 
     mul_ = CreateMulPrimitive(*x_input_, *y_input_, dst_desc, output, ctx);
     return *mul_;
@@ -77,14 +77,14 @@ class MulPrimitiveFactory {
                           const ExecutionContext &ctx) {
     Tensor x_tmp;
     Tensor data_matrix;
-    memory::format src_fmt = data->format();
-    memory::format dst_fmt;
+    MKLDNNMemoryFormat src_fmt = data->format();
+    MKLDNNMemoryFormat dst_fmt;
     auto src_mdesc = CreateMemDescriptor<T>(data, src_fmt);
 
     if ((data->dims().size() == 4 &&
-         src_fmt != (dst_fmt = memory::format::nchw)) ||
+         src_fmt != (dst_fmt = MKLDNNMemoryFormat::nchw)) ||
        (data->dims().size() == 5 &&
-         dst_fmt != (dst_fmt = memory::format::ncdhw))) {
+         src_fmt != (dst_fmt = MKLDNNMemoryFormat::ncdhw))) {
       auto dst_mdesc = CreateMemDescriptor<T>(data, dst_fmt);
       x_tmp.mutable_data<T>(ctx.GetPlace(), data->memory_size());
 
@@ -92,7 +92,7 @@ class MulPrimitiveFactory {
               to_void_cast<T>(x_tmp.data<T>()));
 
       x_tmp.Resize(data->dims());
-      x_tmp.set_format((memory::format)dst_mdesc.data.format);
+ x_tmp.set_format((MKLDNNMemoryFormat)dst_mdesc.data.format); data_matrix = framework::ReshapeToMatrix(x_tmp, num_col_dims); } else { data_matrix = framework::ReshapeToMatrix(*data, num_col_dims); @@ -106,23 +106,23 @@ class MulPrimitiveFactory { x_input_->set_data_handle(to_void_cast(in->data())); output_->set_data_handle(out->mutable_data(ctx.GetPlace())); - if (out->format() == memory::format::format_undef) { + if (out->format() == MKLDNNMemoryFormat::format_undef) { auto output_format = output_->get_primitive_desc().desc().data.format; - out->set_format((memory::format)output_format); + out->set_format((MKLDNNMemoryFormat)output_format); } } template memory::desc CreateMemDescriptor( - const Tensor *tensor, memory::format format, + const Tensor *tensor, MKLDNNMemoryFormat format, memory::data_type type = platform::MKLDNNGetDataType()) { - auto dims = framework::vectorize2int(tensor->dims()); + auto dims = framework::vectorize(tensor->dims()); return platform::MKLDNNMemDesc(dims, type, format); } template memory::desc CreateMemDescriptor( - const std::vector &dims, memory::format format, + const std::vector &dims, MKLDNNMemoryFormat format, memory::data_type type = platform::MKLDNNGetDataType()) { return platform::MKLDNNMemDesc(dims, type, format); } @@ -139,7 +139,7 @@ class MulPrimitiveFactory { auto buffer_size = dst_prim_desc.get_size(); OT *output_data = output->mutable_data(ctx.GetPlace(), buffer_size); - output->set_format((memory::format)dst_prim_desc.desc().data.format); + output->set_format((MKLDNNMemoryFormat)dst_prim_desc.desc().data.format); return memory(dst_prim_desc, to_void_cast(output_data)); } @@ -156,10 +156,10 @@ class MulPrimitiveFactory { } memory TransposeInputY(const Tensor *input_y) { - auto dims = framework::vectorize2int(input_y->dims()); + auto dims = framework::vectorize(input_y->dims()); std::swap(dims[0], dims[1]); // Correct output dimensions - auto src_desc = CreateMemDescriptor(dims, memory::format::io); - auto dst_desc = CreateMemDescriptor(dims, memory::format::oi); + auto src_desc = CreateMemDescriptor(dims, MKLDNNMemoryFormat::io); + auto dst_desc = CreateMemDescriptor(dims, MKLDNNMemoryFormat::oi); return Reorder(src_desc, dst_desc, to_void_cast(input_y->data())); } @@ -207,6 +207,14 @@ class QuantMulPrimitiveFactory : public MulPrimitiveFactory { int y_num_col_dims = ctx.Attr("y_num_col_dims"); auto scale_y = ctx.Attr>("scale_y"); + // TODO(intel-minghui) : Remove the restriction that only supports Input(Y) + // as weights + bool enforce = std::is_same::value; + PADDLE_ENFORCE( + enforce == true, + "Input(Y) supposed to be fp32 data type since only fp32 data type is " + "supported in the current design of MKLDNN INT8."); + auto x_matrix = this->template UpdateDataFormat(x_input, x_num_col_dims, ctx); auto y_matrix = @@ -222,15 +230,15 @@ class QuantMulPrimitiveFactory : public MulPrimitiveFactory { return *(this->mul_); } - auto src_desc = - this->template CreateMemDescriptor(&x_matrix, memory::format::nc); + auto src_desc = this->template CreateMemDescriptor( + &x_matrix, MKLDNNMemoryFormat::nc); this->x_input_ = this->template CreateMemory(src_desc, &x_matrix); const auto trans_y = this->TransposeInputY(&y_matrix); this->y_input_ = QuantInputY(trans_y, scale_y); auto dst_desc = - this->template CreateMemDescriptor(output, memory::format::any); + this->template CreateMemDescriptor(output, MKLDNNMemoryFormat::any); this->mul_ = CreateMulPrimitive(*(this->x_input_), *(this->y_input_), dst_desc, output, ctx); @@ -262,9 +270,9 @@ class 
QuantMulPrimitiveFactory : public MulPrimitiveFactory { auto y_dims = std::vector(dims, dims + ndims); auto user_y_desc = - this->template CreateMemDescriptor(y_dims, memory::format::oi); - auto y_desc = - this->template CreateMemDescriptor(y_dims, memory::format::oi); + this->template CreateMemDescriptor(y_dims, MKLDNNMemoryFormat::oi); + auto y_desc = this->template CreateMemDescriptor( + y_dims, MKLDNNMemoryFormat::oi); return ReorderWithScale(user_y_desc, y_desc, input_y.get_data_handle(), scale_y); @@ -324,33 +332,17 @@ class QuantMulPrimitiveFactory : public MulPrimitiveFactory { } }; -static std::string GetHash(const Tensor *input_x, const Tensor *input_y, - const std::string &suffix) { - auto dim2str = [](const DDim &operand_dims) { - std::string str = ""; - for (int i = 0; i < operand_dims.size(); ++i) { - str += std::to_string(operand_dims[i]) + "-"; - } - return str; - }; - - std::string hash = std::to_string((unsigned)input_x->format()) + - std::to_string((unsigned)input_x->type()) + - dim2str(input_x->dims()) + - std::to_string((unsigned)input_y->format()) + - std::to_string((unsigned)input_y->type()) + - dim2str(input_y->dims()) + suffix; - - return hash; -} - /* OT: output data type */ template std::shared_ptr> GetPrimitiveFactory( const MKLDNNDeviceContext &dev_ctx, const ExecutionContext &ctx, const Tensor *input_x, const Tensor *input_y, const mkldnn::engine &mkldnn_engine, bool enable_quant) { - const std::string key = GetHash(input_x, input_y, ctx.op().Output("Out")); + const std::string key = platform::CreateKey( + input_x->format(), input_x->type(), + framework::vectorize(input_x->dims()), input_y->format(), + input_y->type(), framework::vectorize(input_y->dims()), + ctx.op().Output("Out")); auto prim_creator = std::static_pointer_cast>( dev_ctx.GetBlob(key)); @@ -413,7 +405,8 @@ class MulMKLDNNKernel : public framework::OpKernel { out->Resize(out_dims); } out->set_layout(DataLayout::kMKLDNN); - out->set_format(out->format()); + out->set_format(platform::MKLDNNFormatForSize( + out_dims.size(), mkldnn::memory::format::nchw)); } }; diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc index 52554800..83e9cfd9 100644 --- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc @@ -37,14 +37,14 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { "It must use CPUPlace."); auto& dev_ctx = ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); const Tensor* input = ctx.Input("X"); Tensor* output = ctx.Output("Out"); - PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN && - input->format() != memory::format::format_undef, - "Wrong layout/format set for Input tensor"); + PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN, + "Wrong layout set for Input tensor"); + PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::format_undef, + "Wrong format set for Input tensor"); std::string pooling_type = ctx.Attr("pooling_type"); std::vector ksize = ctx.Attr>("ksize"); @@ -65,54 +65,38 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE(input->dims().size() == 4, "Input dim must be with 4, i.e. 
NCHW"); - const T* input_data = input->data(); - T* output_data = output->mutable_data(ctx.GetPlace()); - - std::vector src_tz = paddle::framework::vectorize2int(input->dims()); - std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); - - auto input_format = input->format(); - memory::format output_format{memory::format::format_undef}; - - mkldnn::memory::data_type dt = - paddle::framework::ToMKLDNNDataType(input->type()); - auto fmt = input->format(); - - const std::string key = platform::PoolingMKLDNNHandler::GetHash( - src_tz, pooling_type, ksize, strides, paddings, dt, fmt, - ctx.op().Output("Out")); - - platform::PoolingMKLDNNHandler handler(pooling_type, dt, - ctx.Attr("is_test"), dev_ctx, - mkldnn_engine, key); - - auto src_md = platform::MKLDNNMemDesc(src_tz, dt, input_format); - - auto src_memory = - handler.AcquireSrcMemory(src_md, to_void_cast(input_data)); - - /* create memory descriptor for pooling without specified format - * ('any') which lets a primitive (pooling in this case) choose - * the memory format preferred for best performance - */ - auto dst_md = - platform::MKLDNNMemDesc(dst_tz, dt, mkldnn::memory::format::any); - - auto pooling_pd = handler.AcquirePoolingPrimitiveDescriptor( - src_tz, dst_tz, src_md, dst_md, ksize, strides, paddings, - ctx.Attr("ceil_mode")); - - auto dst_memory = - handler.AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); - - auto pool_p = handler.AcquirePooling(dst_memory, src_memory); + auto src_tz = paddle::framework::vectorize(input->dims()); + auto dst_tz = paddle::framework::vectorize(output->dims()); + + auto is_test = ctx.Attr("is_test"); + + platform::PoolingMKLDNNHandler handler( + src_tz, dst_tz, ksize, strides, paddings, pooling_type, + ctx.Attr("ceil_mode"), input->format(), + paddle::framework::ToMKLDNNDataType(input->type()), is_test, dev_ctx, + ctx.GetPlace(), ctx.op().Output("Out")); + + auto src_memory = handler.AcquireSrcMemory(input); + auto dst_memory = handler.AcquireDstMemory(output); + + std::shared_ptr pool_p; + std::shared_ptr workspace_memory; + if ((is_test == false) && (pooling_type == "max")) { + // Training + workspace_memory = handler.AcquireWorkspaceMemory(); + pool_p = handler.AcquireForwardPrimitive(*src_memory, *dst_memory, + *workspace_memory); + } else { + // Inference + pool_p = handler.AcquireForwardPrimitive(*src_memory, *dst_memory); + } // push primitive to stream and wait until it's executed std::vector pipeline{*pool_p}; stream(stream::kind::eager).submit(pipeline).wait(); - output_format = - (memory::format)dst_memory->get_primitive_desc().desc().data.format; + auto output_format = + (MKLDNNMemoryFormat)dst_memory->get_primitive_desc().desc().data.format; output->set_layout(DataLayout::kMKLDNN); output->set_format(output_format); @@ -130,15 +114,18 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { const Tensor* out_grad = ctx.Input(framework::GradVarName("Out")); Tensor* in_x_grad = ctx.Output(framework::GradVarName("X")); - PADDLE_ENFORCE(in_x->layout() == DataLayout::kMKLDNN && - in_x->format() != memory::format::format_undef, - "Wrong layout/format set for Input X tensor"); - PADDLE_ENFORCE(out_grad->layout() == DataLayout::kMKLDNN && - out_grad->format() != memory::format::format_undef, - "Wrong layout/format set for Input output_grad tensor"); + PADDLE_ENFORCE_EQ(in_x->layout(), DataLayout::kMKLDNN, + "Wrong layout set for Input tensor"); + PADDLE_ENFORCE_NE(in_x->format(), MKLDNNMemoryFormat::format_undef, + "Wrong format set for Input tensor"); + + 
PADDLE_ENFORCE_EQ(out_grad->layout(), DataLayout::kMKLDNN, + "Wrong layout set for Input output_grad tensor"); + PADDLE_ENFORCE_NE(out_grad->format(), MKLDNNMemoryFormat::format_undef, + "Wrong format set for Input output_grad tensor"); - PADDLE_ENFORCE( - !ctx.Attr("is_test"), + PADDLE_ENFORCE_EQ( + ctx.Attr("is_test"), false, "is_test attribute should be set to False in training phase."); std::string pooling_type = ctx.Attr("pooling_type"); @@ -155,56 +142,47 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto& dev_ctx = ctx.template device_context(); - const mkldnn::engine& mkldnn_engine = dev_ctx.GetEngine(); std::vector pipeline; - const T* out_grad_data = out_grad->data(); - T* in_x_grad_data = in_x_grad->mutable_data(ctx.GetPlace()); - memory::format in_x_grad_format{memory::format::format_undef}; - - std::vector diff_src_tz = - paddle::framework::vectorize2int(in_x_grad->dims()); - std::vector diff_dst_tz = - paddle::framework::vectorize2int(out_grad->dims()); + auto diff_src_tz = paddle::framework::vectorize(in_x_grad->dims()); + auto diff_dst_tz = paddle::framework::vectorize(out_grad->dims()); // Get an unique name from "argument" name of "Out" variable // This name will be used as key when referring info from device context - const std::string key = platform::PoolingMKLDNNHandler::GetHash( + const std::string key = platform::CreateKey( diff_src_tz, pooling_type, ksize, strides, paddings, memory::data_type::f32, in_x->format(), ctx.op().Input("Out")); - platform::PoolingMKLDNNHandler handler( - pooling_type, paddle::framework::ToMKLDNNDataType(in_x_grad->type()), - false, dev_ctx, mkldnn_engine, key); - - auto workspace = handler.AcquireWorkspaceMemory(); - - auto diff_dst_md = platform::MKLDNNMemDesc( - {diff_dst_tz}, platform::MKLDNNGetDataType(), out_grad->format()); - - auto diff_dst_memory = handler.AcquireDiffDstMemory( - diff_dst_md, to_void_cast(out_grad_data)); - - auto diff_src_md = - platform::MKLDNNMemDesc(diff_src_tz, platform::MKLDNNGetDataType(), - mkldnn::memory::format::any); - - auto bwd_pd = handler.AcquirePoolingBackwardPrimitiveDescriptor( - diff_dst_md, diff_src_md, ksize, strides, paddings); - - auto diff_src_memory = handler.AcquireDiffSrcMemoryFromPrimitive( - reinterpret_cast(in_x_grad_data)); - - auto pool_bwd_p = handler.AcquirePoolingBackward(diff_dst_memory, workspace, - diff_src_memory); + platform::PoolingMKLDNNHandler handler( + diff_dst_tz, diff_src_tz, ksize, strides, paddings, pooling_type, + ctx.Attr("ceil_mode"), in_x->format(), out_grad->format(), + paddle::framework::ToMKLDNNDataType(out_grad->type()), dev_ctx, + ctx.GetPlace(), ctx.op().Input("Out")); + + auto diff_dst_memory = handler.AcquireDiffDstMemory(out_grad); + auto diff_src_memory = handler.AcquireDiffSrcMemory(in_x_grad); + + std::shared_ptr pool_bwd_p; + std::shared_ptr workspace_memory; + if (pooling_type == "max") { + // Max - pooling needs Workspace + workspace_memory = handler.AcquireWorkspaceMemory(); + pool_bwd_p = handler.AcquireBackwardPrimitive( + *diff_dst_memory, *workspace_memory, *diff_src_memory); + } else { + // Average Pooling + pool_bwd_p = + handler.AcquireBackwardPrimitive(*diff_dst_memory, *diff_src_memory); + } pipeline.push_back(*pool_bwd_p); mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); - in_x_grad_format = (memory::format)diff_src_memory->get_primitive_desc() - .desc() - .data.format; + auto in_x_grad_format = + (MKLDNNMemoryFormat)diff_src_memory->get_primitive_desc() + .desc() + .data.format; 
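+ // The backward primitive may have chosen a blocked layout for the gradient, + // so record the layout/format MKL-DNN actually produced; downstream ops use + // this metadata to interpret (or reorder) the buffer.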
in_x_grad->set_layout(DataLayout::kMKLDNN); in_x_grad->set_format(in_x_grad_format); } // Compute() diff --git a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc index 11c2b83d..788e3f27 100644 --- a/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/quantize_mkldnn_op.cc @@ -30,18 +30,6 @@ using framework::DataLayout; using mkldnn::stream; using platform::GetMKLDNNFormat; -std::string CreateKey(const paddle::framework::ExecutionContext& ctx, - const std::vector& src_tz, const float scale_data, - const bool is_negative) { - std::string key; - key.reserve(platform::MKLDNNHandler::MaxKeyLength); - platform::MKLDNNHandler::AppendKeyDims(&key, src_tz); - platform::MKLDNNHandler::AppendKey(&key, std::to_string(scale_data)); - platform::MKLDNNHandler::AppendKey(&key, std::to_string(is_negative)); - platform::MKLDNNHandler::AppendKey(&key, ctx.op().Output("Output")); - return key; -} - template class QuantOpKernel : public framework::OpKernel { public: @@ -54,13 +42,14 @@ class QuantOpKernel : public framework::OpKernel { const auto& engine = dev_ctx.GetEngine(); std::vector pipeline; - std::vector src_tz = paddle::framework::vectorize2int(input->dims()); - std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + auto src_tz = paddle::framework::vectorize(input->dims()); + auto dst_tz = paddle::framework::vectorize(output->dims()); const T* input_data = input->data(); bool is_negative = ctx.Attr("is_negative_input"); - std::string key = CreateKey(ctx, src_tz, scale_data, is_negative); + std::string key = platform::CreateKey(src_tz, scale_data, is_negative, + ctx.op().Output("Output")); const std::string key_prim = key + "@reorder_p"; const std::string key_src_mem = key + "@src_mem"; const std::string key_dst_mem = key + "@dst_mem"; @@ -124,7 +113,5 @@ class QuantOpKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -// TODO(Xiaoli) Support FP32->S8 quantization. 
- REGISTER_OP_KERNEL(quantize, MKLDNN, ::paddle::platform::CPUPlace, ops::QuantOpKernel); diff --git a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc index 44e82814..a5e1e504 100644 --- a/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/requantize_mkldnn_op.cc @@ -43,15 +43,13 @@ class ReQuantOpKernel : public framework::OpKernel { const auto& engine = dev_ctx.GetEngine(); std::vector pipeline; - std::vector src_tz = paddle::framework::vectorize2int(input->dims()); - std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + auto src_tz = paddle::framework::vectorize(input->dims()); + auto dst_tz = paddle::framework::vectorize(output->dims()); mkldnn::memory::data_type src_dt = paddle::framework::ToMKLDNNDataType(input->type()); - mkldnn::memory::data_type dst_dt = src_dt; // TODO(Xiaoli) support - // requantize from different - // data type (e.g., s8 to u8) - mkldnn::memory::format src_fmt = memory::format::nhwc; - mkldnn::memory::format dst_fmt = memory::format::nhwc; + mkldnn::memory::data_type dst_dt = src_dt; + MKLDNNMemoryFormat src_fmt = MKLDNNMemoryFormat::nhwc; + MKLDNNMemoryFormat dst_fmt = MKLDNNMemoryFormat::nhwc; const T* input_data = input->data(); T* output_data = output->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index a01dd512..690f9271 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include +#include #include "mkldnn.hpp" #include "paddle/fluid/operators/softmax_op.h" #include "paddle/fluid/platform/mkldnn_reuse.h" @@ -32,89 +33,42 @@ using mkldnn::softmax_forward; using mkldnn::stream; using platform::to_void_cast; -class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler { +template +class SoftmaxMKLDNNHandler + : public platform::MKLDNNHandlerT { public: - SoftmaxMKLDNNHandler(const platform::MKLDNNDeviceContext& dev_ctx, - mkldnn::engine engine, const std::string& base_key) - : platform::MKLDNNHandler(dev_ctx, engine, base_key) {} - - SoftmaxMKLDNNHandler( - std::shared_ptr softmax_pd, - std::shared_ptr softmax_bwd_pd, - const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, - const std::string& base_key) - : platform::MKLDNNHandler(dev_ctx, engine, base_key), - softmax_pd_(softmax_pd), - softmax_bwd_pd_(softmax_bwd_pd) { - // If we are in Grad operatgor then update a key with BWD suffix to - // distinguish from FWD memory primitives - key_ += "-BWD"; - } - - std::shared_ptr - AcquireSoftmaxPrimitiveDescriptor(const softmax_forward::desc& softmax_desc, - const mkldnn::engine& engine) { - // Softmax PD has to be passed to Grad op that - // may be executed by diffrent thread, hence - // for that one we use key that does not contain TID - const std::string key_softmax_pd = key_common_ + "@softmax_pd"; - - softmax_pd_ = std::static_pointer_cast( - dev_ctx_.GetBlob(key_softmax_pd)); - if (softmax_pd_ == nullptr) { - static std::mutex acquire_barrier; - std::lock_guard block_threads_until_finish_this_job( - acquire_barrier); - softmax_pd_ = std::static_pointer_cast( - dev_ctx_.GetBlob(key_softmax_pd)); - if (softmax_pd_ == nullptr) { - softmax_pd_.reset( - new softmax_forward::primitive_desc(softmax_desc, engine)); - dev_ctx_.SetBlob(key_softmax_pd, 
softmax_pd_); - } - } - - return softmax_pd_; - } - - std::shared_ptr AcquireSoftmax( - std::shared_ptr dst_memory_p, - std::shared_ptr src_memory_p) { - /*Generate key*/ - auto prim_key = key_ + "@softmax_p"; - - auto softmax_p = std::static_pointer_cast( - dev_ctx_.GetBlob(prim_key)); - if (softmax_p == nullptr) { - softmax_p = std::make_shared( - *softmax_pd_, *(static_cast(src_memory_p.get())), - *(static_cast(dst_memory_p.get()))); - dev_ctx_.SetBlob(prim_key, softmax_p); - } - - return softmax_p; + SoftmaxMKLDNNHandler(const std::vector& dims, + const MKLDNNMemoryFormat fmt, const int& axis, + const platform::MKLDNNDeviceContext& dev_ctx, + platform::Place cpu_place, const std::string& uniq_name) + : platform::MKLDNNHandlerT( + dev_ctx, dev_ctx.GetEngine(), cpu_place, + platform::CreateKey(dims, axis, uniq_name)) { + auto md = mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), fmt); + + this->AcquireForwardPrimitiveDescriptor(prop_kind::forward_scoring, md, + axis); } - std::shared_ptr AcquireSoftmaxBackward( - std::shared_ptr dst_memory_p, - std::shared_ptr diff_dst_memory_p, - std::shared_ptr diff_src_memory_p) { - auto prim_key = key_ + "@softmax_bwd_p"; - auto softmax_bwd_p = std::static_pointer_cast( - dev_ctx_.GetBlob(prim_key)); - if (softmax_bwd_p == nullptr) { - softmax_bwd_p = std::make_shared( - *softmax_bwd_pd_, *dst_memory_p, *diff_dst_memory_p, - *diff_src_memory_p); - dev_ctx_.SetBlob(prim_key, softmax_bwd_p); - } - - return softmax_bwd_p; + SoftmaxMKLDNNHandler(const std::vector& dims, + const MKLDNNMemoryFormat fmt, + const MKLDNNMemoryFormat diff_fmt, const int& axis, + const platform::MKLDNNDeviceContext& dev_ctx, + platform::Place cpu_place, const std::string& uniq_name) + : platform::MKLDNNHandlerT( + dev_ctx, dev_ctx.GetEngine(), cpu_place, + platform::CreateKey(dims, axis, uniq_name)) { + auto data_softmax_md = + mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), fmt); + auto diff_softmax_md = + mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), diff_fmt); + + this->AcquireBackwardPrimitiveDescriptor(diff_softmax_md, data_softmax_md, + axis); } - - private: - std::shared_ptr softmax_pd_; - std::shared_ptr softmax_bwd_pd_; }; template @@ -124,66 +78,41 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); auto& dev_ctx = ctx.template device_context(); - auto mkldnn_engine = dev_ctx.GetEngine(); const Tensor* input = ctx.Input("X"); Tensor* output = ctx.Output("Out"); PADDLE_ENFORCE_EQ( input->dims(), output->dims(), "The shape of softmax's input and output must be identical."); - // make sure 'output' holds memory, which will be shared by - // 'flattened_output' later. 
- output->mutable_data(ctx.GetPlace()); - - // flatten input and output to 2-D matrixs auto dims = input->dims(); // input and output share the same shape - auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); - framework::Tensor flattened_input; - framework::Tensor flattened_output; - flattened_input.ShareDataWith(*input).Resize(flattened_dims); - flattened_output.ShareDataWith(*output).Resize(flattened_dims); - - const T* input_data = flattened_input.data(); - T* output_data = flattened_output.mutable_data(ctx.GetPlace()); - - std::vector src_tz = paddle::framework::vectorize2int(flattened_dims); - std::vector dst_tz = src_tz; - // Same memory descriptor to be used for input and output - memory::dims softmax_tz = {src_tz[0], src_tz[1]}; - // Generate keys for storing/retriving primitives for this operator - const std::string key = - platform::MKLDNNHandler::GetHash(softmax_tz, ctx.op().Output("Out")); + const int axis = CanonicalAxis(ctx.Attr("axis"), dims.size()); - SoftmaxMKLDNNHandler handler(dev_ctx, mkldnn_engine, key); - // Currently only NC data format is supported - auto softmax_md = MKLDNNMemDesc( - {softmax_tz}, platform::MKLDNNGetDataType(), memory::format::nc); - // Normalization is made after innermost dimension eg. C out of NC - auto softmax_desc = softmax_forward::desc(prop_kind::forward_scoring, - softmax_md, 1 /*dim: C*/); + auto softmax_tz = paddle::framework::vectorize(dims); - auto softmax_pd = - handler.AcquireSoftmaxPrimitiveDescriptor(softmax_desc, mkldnn_engine); + SoftmaxMKLDNNHandler handler(softmax_tz, input->format(), axis, dev_ctx, + ctx.GetPlace(), ctx.op().Output("Out")); - auto softmax_src_memory_p = - handler.AcquireSrcMemory(softmax_md, to_void_cast(input_data)); - auto softmax_dst_memory_p = - handler.AcquireDstMemory(softmax_md, to_void_cast(output_data)); - auto softmax_p = - handler.AcquireSoftmax(softmax_dst_memory_p, softmax_src_memory_p); + auto softmax_src_memory_p = handler.AcquireSrcMemory(input); + auto softmax_dst_memory_p = handler.AcquireDstMemory(output); + auto softmax_p = handler.AcquireForwardPrimitive(*softmax_src_memory_p, + *softmax_dst_memory_p); - std::vector pipeline{ - *(static_cast(softmax_p.get()))}; + std::vector pipeline{*softmax_p}; stream(stream::kind::eager).submit(pipeline).wait(); const bool is_test = ctx.Attr("is_test"); if (!is_test) { - T threshold = exp(-64); - for (int i = 0; i < dst_tz[0] * dst_tz[1]; ++i) { - output_data[i] = - output_data[i] < threshold ? threshold : output_data[i]; - } + T* output_data = output->mutable_data(ctx.GetPlace()); + int size = std::accumulate(begin(softmax_tz), end(softmax_tz), 1, + std::multiplies()); + std::for_each(output_data, &output_data[size], [](T& val) { + val = std::max(val, static_cast(exp(-64))); + }); } + + output->set_layout(framework::DataLayout::kMKLDNN); + // Softmax output format is the same as input one + output->set_format(input->format()); } }; @@ -195,7 +124,6 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { "It must use CPUPlace."); auto& dev_ctx = ctx.template device_context(); - auto mkldnn_engine = dev_ctx.GetEngine(); const Tensor* output = ctx.Input("Out"); auto* dout = ctx.template Input(framework::GradVarName("Out")); auto* dx = @@ -205,68 +133,27 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { dout->dims(), dx->dims(), "The shape of softmax_grad's input and output must be identical."); - // make sure 'dx' holds memory, which will be shared by 'flattened_dx' - // later. 
- dx->template mutable_data(ctx.GetPlace()); - auto dims = dout->dims(); // input and output share the same shape - auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); - framework::Tensor flattened_output; - framework::Tensor flattened_dout; - framework::Tensor flattened_dx; - flattened_output.ShareDataWith(*output).Resize(flattened_dims); - flattened_dout.ShareDataWith(*dout).Resize(flattened_dims); - flattened_dx.ShareDataWith(*dx).Resize(flattened_dims); - - const T* dst_data = flattened_output.data(); - const T* diff_dst_ptr = flattened_dout.template data(); - T* diff_src_ptr = flattened_dx.template mutable_data(ctx.GetPlace()); + const int axis = CanonicalAxis(ctx.Attr("axis"), dims.size()); - std::vector dst_tz = paddle::framework::vectorize2int(flattened_dims); - std::vector src_tz(dst_tz); + std::vector softmax_tz = paddle::framework::vectorize(dims); - // Same memory descriptor to be used for input and output - memory::dims softmax_tz = {src_tz[0], src_tz[1]}; - // Currently only supports NC data format - // retrieve eltwise primitive desc from device context - const std::string key = - platform::MKLDNNHandler::GetHash(softmax_tz, ctx.op().Input("Out")); - const std::string key_softmax_pd = key + "@softmax_pd"; + SoftmaxMKLDNNHandler handler(softmax_tz, output->format(), + dout->format(), axis, dev_ctx, + ctx.GetPlace(), ctx.op().Input("Out")); - auto softmax_pd = - std::static_pointer_cast( - dev_ctx.GetBlob(key_softmax_pd)); - PADDLE_ENFORCE(softmax_pd != nullptr, - "Fail to find softmax_pd in device context"); + auto dst_memory_p = handler.AcquireDstMemory(output); + auto diff_dst_memory_p = handler.AcquireDiffDstMemory(dout); + auto diff_src_memory_p = handler.AcquireDiffSrcMemory(dx); - // TODO(jczaja): Add layouts support when there is a need to do so - // Two dimensional softmax does support NC format - auto data_softmax_md = MKLDNNMemDesc( - {softmax_tz}, platform::MKLDNNGetDataType(), memory::format::nc); - auto diff_softmax_md = MKLDNNMemDesc( - {softmax_tz}, platform::MKLDNNGetDataType(), memory::format::nc); - // Normalization is made after innermost dimension eg. 
C out of NC - auto softmax_bwd_desc = - softmax_backward::desc(diff_softmax_md, data_softmax_md, 1 /* dim: C*/); - auto softmax_bwd_pd = - std::make_shared( - softmax_bwd_desc, mkldnn_engine, *softmax_pd); - - SoftmaxMKLDNNHandler handler(softmax_pd, softmax_bwd_pd, dev_ctx, - mkldnn_engine, key); - auto dst_memory_p = - handler.AcquireDstMemory(data_softmax_md, to_void_cast(dst_data)); - auto diff_dst_memory_p = handler.AcquireDiffDstMemory( - diff_softmax_md, to_void_cast(diff_dst_ptr)); - auto diff_src_memory_p = handler.AcquireDiffSrcMemory( - diff_softmax_md, to_void_cast(diff_src_ptr)); - - // Get primitve from device context - auto softmax_bwd_p = handler.AcquireSoftmaxBackward( - dst_memory_p, diff_dst_memory_p, diff_src_memory_p); + auto softmax_bwd_p = handler.AcquireBackwardPrimitive( + *dst_memory_p, *diff_dst_memory_p, *diff_src_memory_p); std::vector pipeline{*softmax_bwd_p}; stream(stream::kind::eager).submit(pipeline).wait(); + + dx->set_layout(framework::DataLayout::kMKLDNN); + dx->set_format(dout->format()); } }; } // namespace operators diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc index 6f64157b..1a8e9d69 100644 --- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc @@ -63,29 +63,31 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { LoDTensor* output = ctx.Output("Out"); T* output_data = output->mutable_data(ctx.GetPlace()); - std::vector dst_tz = framework::vectorize2int(output->dims()); + auto dst_tz = framework::vectorize(output->dims()); auto src_tz = dst_tz; - memory::format output_format{memory::format::format_undef}; + MKLDNNMemoryFormat output_format{MKLDNNMemoryFormat::format_undef}; std::vector scales; std::vector srcs_mpd; std::vector srcs_mem; - PADDLE_ENFORCE(in_vars[0]->IsType(), - "Input[0] must be LoDTensors"); + PADDLE_ENFORCE_EQ(in_vars[0]->IsType(), true, + "Input[0] must be LoDTensors"); auto& input0 = in_vars[0]->Get(); - PADDLE_ENFORCE(input0.layout() == DataLayout::kMKLDNN && - input0.format() != memory::format::format_undef, - "Wrong layout/format for inputs[0]"); + PADDLE_ENFORCE_EQ(input0.layout(), DataLayout::kMKLDNN, + "Wrong layout set for inputs[0] tensor"); + PADDLE_ENFORCE_NE(input0.format(), MKLDNNMemoryFormat::format_undef, + "Wrong format set for inputs[0] tensor"); - memory::format input_format = input0.format(); + MKLDNNMemoryFormat input_format = input0.format(); for (int i = 0; i < N; i++) { - PADDLE_ENFORCE(in_vars[i]->IsType(), - "all inputs must be all LoDTensors"); + PADDLE_ENFORCE_EQ(in_vars[i]->IsType(), true, + "all inputs must be all LoDTensors"); auto& input = in_vars[i]->Get(); - PADDLE_ENFORCE(input.layout() == DataLayout::kMKLDNN && - input.format() != memory::format::format_undef, - "Wrong layout/format for inputs"); + PADDLE_ENFORCE_EQ(input.layout(), DataLayout::kMKLDNN, + "Wrong layout set for inputs"); + PADDLE_ENFORCE_NE(input.format(), MKLDNNMemoryFormat::format_undef, + "Wrong format set for inputs"); if (input.numel() == 0) { continue; @@ -103,7 +105,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { } auto dst_md = - memory::desc(dst_tz, memory::data_type::f32, memory::format::any); + memory::desc(dst_tz, memory::data_type::f32, MKLDNNMemoryFormat::any); auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_mpd); @@ -119,7 +121,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { } auto sum_prim = mkldnn::sum(sum_pd, inputs, *dst_mem); - output_format 
= (memory::format)platform::GetMKLDNNFormat(sum_pd); + output_format = (MKLDNNMemoryFormat)platform::GetMKLDNNFormat(sum_pd); primitive reorder_prim; std::shared_ptr target_mem; @@ -139,7 +141,6 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { output->set_layout(DataLayout::kMKLDNN); output->set_format(output_format); } else { // Fallback to naive version - // TODO(@mozga-intel) Add MKLDNN SelectedRows & LoDTensorArray support SumKernel reference_kernel; reference_kernel.Compute(ctx); } diff --git a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc index 480167f4..bcf919fa 100644 --- a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc @@ -43,11 +43,11 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel { return; } - std::vector nchw_tz = paddle::framework::vectorize2int(input->dims()); + auto nchw_tz = paddle::framework::vectorize(input->dims()); - const std::string key = platform::TransposeMKLDNNHandler::GetHash( - nchw_tz, axis, - ctx.op().Output("Out") + std::to_string(input->format())); + const std::string key = + platform::CreateKey(nchw_tz, axis, ctx.op().Output("Out") + + std::to_string(input->format())); platform::TransposeMKLDNNHandler handler(nchw_tz, axis, dev_ctx, mkldnn_engine, key); @@ -64,30 +64,7 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel { mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); output->set_layout(DataLayout::kNCHW); - output->set_format(mkldnn::memory::format::format_undef); - } -}; - -template -class TransposeINT8MKLDNNOpKernel : public paddle::framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { - std::vector axis = ctx.Attr>("axis"); - std::vector axis_int8 = {0, 2, 3, 1}; - if (axis.size() != 1) { - PADDLE_ENFORCE_EQ(axis.size(), axis_int8.size()); - for (size_t i = 0; i < axis.size(); i++) { - PADDLE_ENFORCE_EQ(axis[i], axis_int8[i], - "Current INT8 MKLDNN Transpose kernel only surpport " - "axis with [0, 2, 3, 1] due to MKL-DNN kernel " - "implementation."); - } - } - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - output->ShareDataWith(*input); - output->set_layout(DataLayout::kMKLDNN); - output->set_format(input->format()); + output->set_format(MKLDNNMemoryFormat::format_undef); } }; @@ -120,10 +97,9 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel { const T* out_grad_data = out_grad->data(); x_grad->mutable_data(ctx.GetPlace()); - std::vector nchw_tz = - paddle::framework::vectorize2int(out_grad->dims()); + auto nchw_tz = paddle::framework::vectorize(out_grad->dims()); - const std::string key = platform::TransposeMKLDNNHandler::GetHash( + const std::string key = platform::CreateKey( nchw_tz, axis, ctx.op().Output(framework::GradVarName("X"))); platform::TransposeMKLDNNHandler handler(nchw_tz, reversed_axis, dev_ctx, @@ -148,9 +124,7 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_KERNEL(transpose2, MKLDNN, ::paddle::platform::CPUPlace, - ops::TransposeMKLDNNOpKernel, - ops::TransposeINT8MKLDNNOpKernel, - ops::TransposeINT8MKLDNNOpKernel); + ops::TransposeMKLDNNOpKernel); REGISTER_OP_KERNEL(transpose, MKLDNN, ::paddle::platform::CPUPlace, ops::TransposeMKLDNNOpKernel); diff --git a/paddle/fluid/operators/modified_huber_loss_op.h b/paddle/fluid/operators/modified_huber_loss_op.h 
index d2b6d0c4..d6dd5539 100644 --- a/paddle/fluid/operators/modified_huber_loss_op.h +++ b/paddle/fluid/operators/modified_huber_loss_op.h @@ -29,7 +29,10 @@ using EigenVector = framework::EigenVector; template struct CheckLabelValue { HOSTDEVICE T operator()(const T& val) const { - PADDLE_ASSERT(val == static_cast(0) || val == static_cast(1)); + PADDLE_ENFORCE(val == static_cast(0) || val == static_cast(1), + "LabelValue of modified_huber_loss_op is expected to be 0 " + "or 1, but got %ld. Please check input value.", + val); } }; diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc index ebb88fe2..0823ea8f 100644 --- a/paddle/fluid/operators/mul_op.cc +++ b/paddle/fluid/operators/mul_op.cc @@ -144,13 +144,17 @@ class MulOpMaker : public framework::OpProtoAndCheckerMaker { )DOC") .SetDefault(1) .EqualGreaterThan(1); - AddAttr("scale_x", - "scale_x to used for int8 input data x." - "Only used with MKL-DNN INT8") + AddAttr( + "scale_x", + "scale_x to be used for int8 mul input data x. scale_x has the " + "same purpose as scale_in in OPs that support quantization. " + "Only to be used with MKL-DNN INT8") .SetDefault(1.0f); - AddAttr>("scale_y", - "scale_y to used for int8 input data y." - "Only used with MKL-DNN INT8") + AddAttr>( + "scale_y", + "scale_y to be used for int8 mul input data y. scale_y has the " + "same purpose as scale_weights in OPs that support quantization. " + "Only to be used with MKL-DNN INT8") .SetDefault({1.0f}); AddAttr("scale_out", "scale_out to be used for int8 output data." diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 12f3118e..1f2f778b 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -186,7 +186,7 @@ class NCEKernel : public framework::OpKernel { std::memcpy(x_tensor->data(), labels.data(), labels.size() * sizeof(int64_t)); - std::vector w_dims = paddle::framework::vectorize2int( + std::vector w_dims = paddle::framework::vectorize( context.Input("Weight")->dims()); w_dims[0] = static_cast(labels.size()); @@ -195,9 +195,10 @@ class NCEKernel : public framework::OpKernel { w_tensor->Resize(framework::make_ddim(w_dims)); #ifdef PADDLE_WITH_DISTRIBUTE + auto weight = context.Inputs("Weight").front(); operators::distributed::prefetch("Ids@Prefetch", "Weight@Prefetch", - table_names, epmap, height_sections, - context, local_scope); + weight, false, table_names, epmap, + height_sections, context, local_scope); #else PADDLE_THROW( "paddle is not compiled with distribute support, can not do " diff --git a/paddle/fluid/operators/ngraph/ngraph_bridge.cc b/paddle/fluid/operators/ngraph/ngraph_bridge.cc index db8a7ca9..9ea7db2a 100644 --- a/paddle/fluid/operators/ngraph/ngraph_bridge.cc +++ b/paddle/fluid/operators/ngraph/ngraph_bridge.cc @@ -44,8 +44,7 @@ bool NgraphBridge::isSupported( if (!isRegister(op_type)) { if (skip_op_list.count(op_type)) { if (op_type == "lookup_table" || op_type == "lookup_table_grad") { - if (op_attrs.Get("is_sparse") || - (op_attrs.Get("padding_idx") != kNoPadding)) { + if (op_attrs.Get("is_sparse")) { result = false; } } else if ((op_type == "reshape") || (op_type == "reshape2")) { diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.cc b/paddle/fluid/operators/ngraph/ngraph_engine.cc index 3a943686..3c53c87c 100644 --- a/paddle/fluid/operators/ngraph/ngraph_engine.cc +++ b/paddle/fluid/operators/ngraph/ngraph_engine.cc @@ -72,33 +72,31 @@ static std::map {ngraph::element::boolean, framework::proto::VarType::BOOL}}; std::vector
NgraphEngine::feed_vars = {}; -std::vector NgraphEngine::fetch_vars = {}; -framework::Variable* NgraphEngine::pre_var_ptr = nullptr; -const framework::BlockDesc* NgraphEngine::p_bdesc = nullptr; -bool NgraphEngine::is_training = false; -std::unordered_map NgraphEngine::engine_cache = {}; -std::unordered_map>> - NgraphEngine::t_in_cache_ = {}; +std::weak_ptr NgraphEngine::wp_backend_; -std::shared_ptr NgraphEngine::backend_ = - ngraph::runtime::Backend::create("CPU"); +std::mutex NgraphEngine::ng_mutex_; static std::vector> NgraphOpIntervals( std::vector>* ops) { NgraphEngine::feed_vars.clear(); - NgraphEngine::fetch_vars.clear(); std::vector> intervals; int size = ops->size(); - int left = 0; + int left = 0, feed_idx = -1; while (left < size && ops->at(left)->Type() != framework::kFeedOpType && ops->at(left)->Type() != "read" && ops->at(left)->Type() != framework::kFetchOpType) { ++left; } + if (left < size) { + auto op_type = ops->at(left)->Type(); + if (op_type == framework::kFeedOpType || op_type == "read") { + feed_idx = left; + } + } + while (left < size && (ops->at(left)->Type() == framework::kFeedOpType || ops->at(left)->Type() == "read")) { for (auto& var_name_item : ops->at(left)->Outputs()) { @@ -116,11 +114,6 @@ static std::vector> NgraphOpIntervals( int index = right; while (index < size && ops->at(index)->Type() == framework::kFetchOpType) { - for (auto& var_name_item : ops->at(index)->Inputs()) { - for (auto& var_name : var_name_item.second) { - NgraphEngine::fetch_vars.emplace_back(var_name); - } - } ++index; } @@ -141,7 +134,9 @@ static std::vector> NgraphOpIntervals( ++end; } std::vector interval = {start, end}; - intervals.emplace_back(interval); + if (feed_idx != -1 && start > feed_idx) { + intervals.emplace_back(interval); + } } } // end while return intervals; @@ -163,16 +158,22 @@ static void SubstituteNgraphOp( framework::OpRegistry::CreateOp(ng_op_desc)); } -std::string SerializedBlock(const std::vector& op_descs) { +std::string SerializedBlock(const framework::BlockDesc& bdesc) { framework::proto::BlockDesc block_proto; framework::BlockDesc block_desc(nullptr, &block_proto); block_desc.Proto()->set_parent_idx(-1); block_desc.Proto()->set_idx(0); - for (auto* op_desc : op_descs) { + for (auto& op_desc : bdesc.AllOps()) { auto* op = block_desc.AppendOp(); *op->Proto() = *op_desc->Proto(); } + + auto* vars = block_desc.Proto()->mutable_vars(); + for (auto& var_desc : bdesc.AllVars()) { + *vars->Add() = *var_desc->Proto(); + } + return block_desc.Proto()->SerializeAsString(); } @@ -209,12 +210,12 @@ std::string GenerateEngineKey(const std::vector& engine_inputs, void NgraphEngine::FuseNgraphOps( const framework::BlockDesc& block_desc, std::vector>* ops) { - NgraphEngine::p_bdesc = &block_desc; auto intervals = NgraphOpIntervals(ops); + std::string serialized_block = SerializedBlock(block_desc); std::string engine_key = - GenerateEngineKey(feed_vars, fetch_vars, ops->size()); + std::to_string(std::hash()(serialized_block)); for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) { - SubstituteNgraphOp(ops, engine_key, "", *it); + SubstituteNgraphOp(ops, engine_key, serialized_block, *it); } } @@ -228,6 +229,20 @@ NgraphEngine::NgraphEngine(const framework::Scope& scope, var_node_map_ = std::make_shared< std::unordered_map>>(); + std::lock_guard lock(ng_mutex_); + + if (!wp_backend_.lock()) { + try { + VLOG(3) << "ngraph creating CPU backend."; + backend_ = ngraph::runtime::Backend::create("CPU"); + } catch (...) 
{ + PADDLE_THROW("Unsupported nGraph backend"); + } + wp_backend_ = backend_; + } else { + backend_ = wp_backend_.lock(); + } + GetNgFunction(ctx); } @@ -235,25 +250,11 @@ void NgraphEngine::Prepare(const framework::ExecutionContext& ctx) { auto interval = ctx.Attr>("interval"); std::string serialized_graph = ctx.Attr("graph"); - auto input_vars = ctx.Inputs("Xs"); - if (!input_vars.empty()) { - feed_vars = input_vars; - var_in_ = input_vars; - } - auto output_vars = ctx.Outputs("Ys"); - if (!output_vars.empty()) { - var_out_ = output_vars; - } - framework::proto::BlockDesc block_proto; if (!serialized_graph.empty()) block_proto.ParseFromString(serialized_graph); framework::BlockDesc block_desc(nullptr, &block_proto); - if (!serialized_graph.empty()) { - NgraphEngine::p_bdesc = &block_desc; - } - bool has_fetch = false, is_full = false; - for (auto& var : p_bdesc->AllVars()) { + for (auto& var : block_desc.AllVars()) { if (!(var->GetType() == framework::proto::VarType::SELECTED_ROWS || var->GetType() == framework::proto::VarType::LOD_TENSOR || var->GetType() == framework::proto::VarType::LOD_TENSOR_ARRAY)) { @@ -281,43 +282,20 @@ void NgraphEngine::Prepare(const framework::ExecutionContext& ctx) { } std::vector ops_desc; - for (auto op_desc : p_bdesc->AllOps()) { + for (auto op_desc : block_desc.AllOps()) { ops_desc.emplace_back(op_desc); - if (op_desc->Type() == framework::kFetchOpType) { - has_fetch = true; - } - } - - for (auto op_desc : ops_desc) { if (op_desc->Type().find("_grad") != std::string::npos) { - is_training = true; this->is_test_ = false; - break; } } - if (interval[0] > 0 && - ops_desc.at(interval[0] - 1)->Type() == framework::kFeedOpType && - interval[1] < static_cast(ops_desc.size()) && - ops_desc.at(interval[1])->Type() == framework::kFetchOpType) { - is_full = true; - } - - if (is_full) { - this->op_state_ = this->is_test_ ? OpState::FULL_TEST : OpState::FULL_TRAIN; - } else { - this->op_state_ = - this->is_test_ ? 
OpState::PARTIAL_TEST : OpState::PARTIAL_TRAIN; - } - int idx = interval[0]; while (idx < interval[1]) { this->fused_ops_.emplace_back( framework::OpRegistry::CreateOp(*(ops_desc[idx]))); ++idx; } - while (idx < static_cast(ops_desc.size()) && - ops_desc.at(idx)->Type() != framework::kFetchOpType) { + while (idx < static_cast(ops_desc.size())) { auto op_desc = ops_desc.at(idx); for (auto& var_name_item : op_desc->Inputs()) { for (auto& var_name : var_name_item.second) { @@ -327,13 +305,21 @@ void NgraphEngine::Prepare(const framework::ExecutionContext& ctx) { ++idx; } - if (!has_fetch) { - op_state_ = OpState::UNKNOWN; + auto input_vars = ctx.Inputs("Xs"); + if (!input_vars.empty()) { + feed_vars = input_vars; + var_in_ = input_vars; + } + + auto output_vars = ctx.Outputs("Ys"); + if (!output_vars.empty()) { + var_out_ = output_vars; } if (var_in_.empty() && var_out_.empty()) { BuildNgIO(ops_desc, interval); } + for (size_t i = 0; i < var_in_.size(); ++i) { auto var_name = var_in_[i]; if (persistables_.find(var_name) == persistables_.end()) { @@ -346,6 +332,7 @@ void NgraphEngine::BuildNgIO(const std::vector& ops_desc, const std::vector& interval) { std::unordered_set inputs; std::unordered_set outputs; + for (int i = interval[0]; i < interval[1]; ++i) { auto op = ops_desc[i]; for (auto& var_name_item : op->Inputs()) { @@ -380,37 +367,15 @@ void NgraphEngine::BuildNgIO(const std::vector& ops_desc, "op %s has more than 1 output - Not handling yet", op->Type()); for (auto& var_name : var_name_item.second) { - switch (this->op_state_) { - case OpState::PARTIAL_TEST: - if (post_op_inputs_.find(var_name) != post_op_inputs_.end() || - find(fetch_vars.begin(), fetch_vars.end(), var_name) != - fetch_vars.end()) { - this->var_out_.emplace_back(var_name); - } - break; - case OpState::FULL_TEST: - if (find(fetch_vars.begin(), fetch_vars.end(), var_name) != - fetch_vars.end()) { - this->var_out_.emplace_back(var_name); - } - break; - case OpState::PARTIAL_TRAIN: - if (find(fetch_vars.begin(), fetch_vars.end(), var_name) != - fetch_vars.end() || - post_op_inputs_.find(var_name) != post_op_inputs_.end() || - persistables_.find(var_name) != persistables_.end()) { - this->var_out_.emplace_back(var_name); - } - break; - case OpState::FULL_TRAIN: - if (find(fetch_vars.begin(), fetch_vars.end(), var_name) != - fetch_vars.end() || - persistables_.find(var_name) != persistables_.end()) { - this->var_out_.emplace_back(var_name); - } - break; - default: + if (this->is_test_) { + if (post_op_inputs_.find(var_name) != post_op_inputs_.end()) { + this->var_out_.emplace_back(var_name); + } + } else { + if (post_op_inputs_.find(var_name) != post_op_inputs_.end() || + persistables_.find(var_name) != persistables_.end()) { this->var_out_.emplace_back(var_name); + } } } } @@ -474,10 +439,14 @@ std::shared_ptr NgraphEngine::BuildNgFunction( ngraph::ParameterVector func_inputs; for (auto& vo : var_out_) { + PADDLE_ENFORCE_GT(var_node_map_->count(vo), 0, + "Cannot find vo %s in var_node_map_", vo); func_outputs.emplace_back(var_node_map_->at(vo)); } for (auto& vi : var_in_) { + PADDLE_ENFORCE_GT(var_node_map_->count(vi), 0, + "Cannot find vi %s in var_node_map_", vi); std::shared_ptr prm = std::dynamic_pointer_cast( var_in_node_map_->at(vi)); @@ -488,10 +457,14 @@ std::shared_ptr NgraphEngine::BuildNgFunction( } void NgraphEngine::ClearNgCache() { + auto& engine_cache = main_engine_cache::fetch(); + auto& t_in_cache_ = main_t_in_cache::fetch(); + auto it = engine_cache.begin(); while (it != engine_cache.end()) { auto 
ng_engine = it->second; - backend_->remove_compiled_function(ng_engine.ngraph_handle); + ng_engine.ngraph_backend->remove_compiled_function(ng_engine.ngraph_handle); + ng_engine.ngraph_backend.reset(); ++it; } engine_cache.clear(); @@ -529,16 +502,11 @@ void NgraphEngine::GetNgFunction(const framework::ExecutionContext& ctx) { std::to_string(interval[1]) + engine_key; func_cache_key_ = std::to_string(std::hash()(func_cache_key_)); + auto& engine_cache = main_engine_cache::fetch(); + if (engine_cache.find(func_cache_key_) != engine_cache.end()) { if (engine_cache[func_cache_key_].persistables.size() == 0) { ClearNgCache(); - } else { - auto var_name = engine_cache[func_cache_key_].persistables.begin(); - framework::Variable* var = scope_.FindVar(*var_name); - if (var != pre_var_ptr) { - ClearNgCache(); - } - pre_var_ptr = var; } } @@ -550,6 +518,7 @@ void NgraphEngine::GetNgFunction(const framework::ExecutionContext& ctx) { for (auto& r : func->get_results()) { r->set_needs_default_layout(true); } + engine_cache[func_cache_key_].ngraph_backend = backend_; engine_cache[func_cache_key_].ngraph_handle = backend_->compile(func); engine_cache[func_cache_key_].persistables = this->persistables_; engine_cache[func_cache_key_].var_in_updates = this->var_in_updates_; @@ -561,28 +530,32 @@ void NgraphEngine::GetNgFunction(const framework::ExecutionContext& ctx) { void NgraphEngine::Run(const framework::Scope& scope, const platform::Place& place) const { + VLOG(3) << "NgraphEngine Run ..."; std::shared_ptr ng_handle; + std::shared_ptr ng_backend; const std::set* p_persistables; const std::vector* p_var_in_updates; const std::vector* p_var_in; const std::vector* p_var_out; - bool is_test; - PADDLE_ENFORCE(engine_cache.find(func_cache_key_) != engine_cache.end(), - "Cannot find cached data to run ngraph function"); + auto& engine_cache = main_engine_cache::fetch(); + auto& t_in_cache_ = main_t_in_cache::fetch(); + + PADDLE_ENFORCE_GT(engine_cache.count(func_cache_key_), 0, + "Cannot find cached data to run ngraph function"); ng_handle = engine_cache[func_cache_key_].ngraph_handle; + ng_backend = engine_cache[func_cache_key_].ngraph_backend; p_persistables = &(engine_cache[func_cache_key_].persistables); p_var_in_updates = &(engine_cache[func_cache_key_].var_in_updates); p_var_in = &(engine_cache[func_cache_key_].var_in); p_var_out = &(engine_cache[func_cache_key_].var_out); - is_test = engine_cache[func_cache_key_].is_test; std::vector>* p_t_in; std::vector> t_in = {}; auto m_parameters = ng_handle->get_parameters(); auto m_results = ng_handle->get_results(); - if (is_test && t_in_cache_.find(func_cache_key_) != t_in_cache_.end()) { + if (is_inference_ && t_in_cache_.find(func_cache_key_) != t_in_cache_.end()) { p_t_in = &(t_in_cache_[func_cache_key_]); for (size_t i = 0; i < p_var_in_updates->size(); ++i) { int index = p_var_in_updates->at(i); @@ -594,14 +567,14 @@ void NgraphEngine::Run(const framework::Scope& scope, if (var && var->IsType()) { auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var); void* pd_arr = tensor_pd->mutable_data(place, ng2pd_type_map[ng_type]); - ti = backend_->create_tensor(ng_type, sp, pd_arr); + ti = ng_backend->create_tensor(ng_type, sp, pd_arr); (*p_t_in)[index] = ti; } else { PADDLE_THROW("Cannot find var or tensor with var name %s", vi); } } } else { - if (is_test) { + if (is_inference_) { p_t_in = &(t_in_cache_[func_cache_key_]); } else { p_t_in = &t_in; @@ -616,15 +589,13 @@ void NgraphEngine::Run(const framework::Scope& scope, if (var && var->IsType()) 
{ auto* tensor_pd = GetMutableLoDTensorOrSelectedRowsValueFromVar(var); void* pd_arr = tensor_pd->mutable_data(place, ng2pd_type_map[ng_type]); - PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()), - "Ensure ngraph tensor layout align with paddle tensor"); - ti = backend_->create_tensor(ng_type, sp, pd_arr); + ti = ng_backend->create_tensor(ng_type, sp, pd_arr); } else { PADDLE_THROW("Cannot find var or tensor with var name %s", vi); } bool is_persistable = (p_persistables->find(vi) != p_persistables->end()) ? true : false; - if (!is_training && is_test && is_persistable) { + if (is_inference_ && is_persistable) { ti->set_stale(false); } (*p_t_in).emplace_back(ti); @@ -647,7 +618,7 @@ void NgraphEngine::Run(const framework::Scope& scope, auto ng_type = m_results[i]->get_element_type(); void* pd_arr = tensor_pd->mutable_data(place, ng2pd_type_map[ng_type]); std::shared_ptr to = - backend_->create_tensor(ng_type, sp, pd_arr); + ng_backend->create_tensor(ng_type, sp, pd_arr); t_out.emplace_back(to); } else { PADDLE_THROW("Cannot find var or tensor with var name %s", vo); diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.h b/paddle/fluid/operators/ngraph/ngraph_engine.h index 4cb14653..0fb2d167 100644 --- a/paddle/fluid/operators/ngraph/ngraph_engine.h +++ b/paddle/fluid/operators/ngraph/ngraph_engine.h @@ -14,11 +14,14 @@ limitations under the License. */ #pragma once +#include #include +#include //NOLINT #include #include #include #include +#include #include #include "paddle/fluid/framework/operator.h" @@ -30,17 +33,10 @@ limitations under the License. */ namespace paddle { namespace operators { -enum class OpState { /* nGraph support state on ops */ - FULL_TRAIN, /* Support full ops for train */ - PARTIAL_TRAIN, /* Support partial ops for train */ - FULL_TEST, /* Support full list of ops for test */ - PARTIAL_TEST, /* Support partial list of ops for test */ - UNKNOWN /* Output all for debug purpose */ -}; - // cache engine repetitives struct EngineCache { - std::shared_ptr ngraph_handle; + std::shared_ptr ngraph_handle = nullptr; + std::shared_ptr ngraph_backend = nullptr; std::set persistables; std::vector var_in; std::vector var_out; @@ -48,6 +44,82 @@ struct EngineCache { bool is_test = true; }; +template +class NgraphThreadCache { + public: + typedef decltype(Engine::getMutex()) mutex_type; + typedef std::lock_guard guard_type; + typedef T& ref_type; + enum class type_of_thread { unknown, forward, backward }; + + template + struct MetaInfo { + std::thread::id owner_tid; // owner of the cache; reserved for future use + type_of_thread worker_type; // reserved for future use + S real_content; + MetaInfo() + : owner_tid{std::this_thread::get_id()}, + worker_type{type_of_thread::unknown} {} + }; + + typedef std::unique_ptr> content_type; + typedef std::list storage_type; + + protected: + static storage_type l; + static mutex_type getMutex() { return Engine::getMutex(); } + static void remove_from_list(const T* raw_ptr) { + guard_type guard(getMutex()); + l.remove_if([raw_ptr](const content_type& sh) { + return &(sh->real_content) == raw_ptr; + }); + } + + template + struct TLSDescriptor { + TRaw* raw_ptr; + TLSDescriptor() : raw_ptr{nullptr} {} + ~TLSDescriptor() { + // runs when the owning thread dies + NgraphThreadCache::remove_from_list(raw_ptr); + + /* TODO: Parallel executor swap */ + // FastMultiThreadCache::keep_alive_for_backward_thread(raw_ptr); + } + }; + + public: + NgraphThreadCache() = delete; + NgraphThreadCache(const NgraphThreadCache& copy) = delete; + + static T& fetch() { + thread_local TLSDescriptor tls;
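+ // First call from this thread: allocate a cache entry, register it in the + // shared list under the engine-wide mutex, and remember its address in the + // thread-local descriptor so later fetches avoid the lock.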
if (!tls.raw_ptr) { + using elem_type = typename content_type::element_type; + content_type _p(new elem_type()); + if (!_p) PADDLE_THROW("Cannot alloc memory for thread-cache "); + guard_type guard(getMutex()); + l.push_back(std::move(_p)); + tls.raw_ptr = &l.back()->real_content; + } + return *(tls.raw_ptr); + } + auto getSize() -> decltype(l.size()) { + guard_type guard(getMutex()); + return l.size(); + } + + template + void for_each_cache(F f) { + guard_type guard(getMutex()); + std::for_each(l.begin(), l.end(), f); + } +}; + +template +typename NgraphThreadCache::storage_type + NgraphThreadCache::l; + // perform graph build through bridge and execute computation class NgraphEngine { public: @@ -57,20 +129,25 @@ class NgraphEngine { void Run(const framework::Scope& scope, const platform::Place& place) const; - static bool is_training; - static const framework::BlockDesc* p_bdesc; - static std::vector feed_vars, fetch_vars; + static std::vector feed_vars; static void FuseNgraphOps( const framework::BlockDesc& prog, std::vector>* ops); + static std::recursive_mutex& getMutex() { + static std::recursive_mutex mx; + return mx; + } + private: - static std::unordered_map engine_cache; - static std::unordered_map< - std::string, std::vector>> - t_in_cache_; - static framework::Variable* pre_var_ptr; + template + using ThCache = + NgraphThreadCache, NgraphEngine>; + + using main_engine_cache = ThCache; + using main_t_in_cache = + ThCache>>; const framework::Scope& scope_; const platform::Place& place_; @@ -78,12 +155,18 @@ class NgraphEngine { std::unordered_map var_type_map_; std::set persistables_; std::unordered_set post_op_inputs_; - OpState op_state_ = OpState::UNKNOWN; + // it is a test for a single run; it can be a validation during training bool is_test_{true}; + // inference only, e.g. CAPI inference + bool is_inference_{false}; std::string func_cache_key_; - + // use a weak pointer to keep backend_ alive + // to avoid it being destroyed too early + static std::weak_ptr wp_backend_; + // use a mutex to keep it thread safe + static std::mutex ng_mutex_; // ngraph backend, e.g.
CPU - static std::shared_ptr backend_; + std::shared_ptr backend_; // var_name of inputs std::vector var_in_; // var_name of outputs from fetch in order diff --git a/paddle/fluid/operators/ngraph/ops/concat_op.h b/paddle/fluid/operators/ngraph/ops/concat_op.h index 27d79685..f34e1611 100644 --- a/paddle/fluid/operators/ngraph/ops/concat_op.h +++ b/paddle/fluid/operators/ngraph/ops/concat_op.h @@ -39,7 +39,10 @@ void BuildConcatNode( } } auto op_attrs = framework::AttrReader(op->Attrs()); - const size_t axis = op_attrs.Get("axis"); + int axis = op_attrs.Get("axis"); + if (axis < 0) { + axis = axis + args[0]->get_shape().size(); + } auto out = std::make_shared(args, axis); platform::SetOutputNode(op, "Out", out, ngb_node_map); } diff --git a/paddle/fluid/operators/ngraph/ops/conv2d_op.h b/paddle/fluid/operators/ngraph/ops/conv2d_op.h index b8ad7491..ab88d870 100644 --- a/paddle/fluid/operators/ngraph/ops/conv2d_op.h +++ b/paddle/fluid/operators/ngraph/ops/conv2d_op.h @@ -80,7 +80,7 @@ std::shared_ptr GroupedGradConvolutionFilter( auto data_slice = std::make_shared( data_batch, lower_bound, upper_bound); - size_t filter_step = data_shape.at(0); + size_t filter_step = filter_shape.at(0) / groups; const std::vector filter_lower_bound{i * filter_step, 0, 0, 0}; const std::vector filter_upper_bound{ @@ -127,7 +127,7 @@ std::shared_ptr GroupedGradConvolutionData( auto data_slice = std::make_shared( data_batch, lower_bound, upper_bound); - size_t filter_step = data_shape.at(0); + size_t filter_step = filter_shape.at(0) / groups; const std::vector filter_lower_bound{i * filter_step, 0, 0, 0}; const std::vector filter_upper_bound{ diff --git a/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h b/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h index bc91be45..e06446ac 100644 --- a/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h +++ b/paddle/fluid/operators/ngraph/ops/cross_entropy_op.h @@ -29,7 +29,7 @@ namespace ngraphs { std::shared_ptr remove_trailing_one( const std::shared_ptr& input) { auto shape = input->get_shape(); - if (shape.back() == 1) { + if (shape.back() == 1 && shape.size() > 1) { shape.pop_back(); return platform::NgReshaper(input, shape); } else { @@ -73,6 +73,7 @@ std::shared_ptr create_xe( shape.back() = 1; return platform::NgReshaper(-node_sum, shape); } + std::shared_ptr create_mask( const std::shared_ptr& label, int ignore_index) { auto ignore_node = paddle::platform::CreateConstant( diff --git a/paddle/fluid/operators/ngraph/ops/dropout_op.h b/paddle/fluid/operators/ngraph/ops/dropout_op.h index cf19a585..3fb55980 100644 --- a/paddle/fluid/operators/ngraph/ops/dropout_op.h +++ b/paddle/fluid/operators/ngraph/ops/dropout_op.h @@ -41,6 +41,7 @@ static void BuildDropoutNode( op_attrs.Get("dropout_implementation"); auto is_test = op_attrs.Get("is_test"); auto seed = op_attrs.Get("seed"); + auto fix_seed = op_attrs.Get("fix_seed"); float value = 1.0f - dropout_prob; bool upscale_in_train = (dropout_implementation == "upscale_in_train"); @@ -58,7 +59,8 @@ static void BuildDropoutNode( ngraph::Shape{}, {1}); auto gen_mask = std::make_shared( - one, input->get_shape(), input->get_element_type(), seed, value); + one, input->get_shape(), input->get_element_type(), seed, value, + fix_seed); if (upscale_in_train) { auto mask_val = paddle::platform::CreateConstant( diff --git a/paddle/fluid/operators/ngraph/ops/lookup_table_op.h b/paddle/fluid/operators/ngraph/ops/lookup_table_op.h index 5126854d..45bb3159 100644 --- a/paddle/fluid/operators/ngraph/ops/lookup_table_op.h +++ 
b/paddle/fluid/operators/ngraph/ops/lookup_table_op.h @@ -47,16 +47,27 @@ void BuildLookupTableNode( if (is_sparse) { PADDLE_THROW("Sparsity is not yet supported in nGraph lookup_table op."); } - + auto ng_w_mask = ng_w; if (padding_idx != kNoPadding) { - PADDLE_THROW("Padding is not yet supported in nGraph lookup_table op."); + auto w_shape = ng_w->get_shape(); + + std::vector maskV(w_shape[0], 1); + maskV[padding_idx] = 0; + auto maskV_node = std::make_shared( + ng_w->get_element_type(), ngraph::Shape{w_shape[0]}, maskV); + ngraph::AxisSet axis_set; + for (unsigned int i = 1; i < w_shape.size(); ++i) axis_set.insert(i); + auto maskV_bd = + std::make_shared(maskV_node, w_shape, axis_set); + ng_w_mask = std::make_shared(ng_w, maskV_bd); } auto shape = ng_ids->get_shape(); if (shape.back() == 1) { shape.pop_back(); ng_ids = platform::NgReshaper(ng_ids, shape); } - auto ng_lookup = std::make_shared(ng_w, ng_ids); + + auto ng_lookup = std::make_shared(ng_w_mask, ng_ids); platform::SetOutputNode(op, "Out", ng_lookup, ngb_node_map); } @@ -67,8 +78,6 @@ void BuildLookupTableGradNode( ngb_node_map) { auto op_attrs = paddle::framework::AttrReader(op->Attrs()); const bool is_sparse = op_attrs.Get("is_sparse"); - const int64_t padding_idx = op_attrs.Get("padding_idx"); - auto ng_ids = paddle::platform::GetInputNode(op, "Ids", ngb_node_map); PADDLE_ENFORCE_NOT_NULL(ng_ids); @@ -81,9 +90,6 @@ void BuildLookupTableGradNode( PADDLE_THROW("Sparsity is not yet supported in nGraph lookup_table op."); } - if (padding_idx != kNoPadding) { - PADDLE_THROW("Padding is not yet supported in nGraph lookup_table op."); - } auto shape = ng_ids->get_shape(); if (shape.back() == 1) { shape.pop_back(); diff --git a/paddle/fluid/operators/ngraph/ops/reshape_op.h b/paddle/fluid/operators/ngraph/ops/reshape_op.h index 53a2aebe..89ad04f0 100644 --- a/paddle/fluid/operators/ngraph/ops/reshape_op.h +++ b/paddle/fluid/operators/ngraph/ops/reshape_op.h @@ -57,8 +57,7 @@ static void BuildReshapeNode( std::shared_ptr input = platform::GetInputNode(op, "X", ngb_node_map); auto input_shape = input->get_shape(); - // TODO(mozga-intel) The vector of shape is not supported yet, that's - // asDispensable() operator" + std::shared_ptr shape = platform::GetInputNode(op, "Shape", ngb_node_map); diff --git a/paddle/fluid/operators/ngraph/ops/slice_op.h b/paddle/fluid/operators/ngraph/ops/slice_op.h index 1ae4d198..f5ab4135 100644 --- a/paddle/fluid/operators/ngraph/ops/slice_op.h +++ b/paddle/fluid/operators/ngraph/ops/slice_op.h @@ -57,8 +57,18 @@ void BuildSliceNode( ng_end[axes[i]] = end; } auto out = std::make_shared(input, ng_start, ng_end); - platform::SetOutputNode(op, "Out", out, ngb_node_map); + auto out_shape = out->get_shape(); + + std::vector out_axis_vec(out_shape.size()); + std::iota(out_axis_vec.begin(), out_axis_vec.end(), 0); + + paddle::platform::TrimTrailingSingularDims(&out_shape); + auto out_dim = std::make_shared( + out, ngraph::AxisVector(out_axis_vec), ngraph::Shape(out_shape)); + + platform::SetOutputNode(op, "Out", out_dim, ngb_node_map); } + void BuildSliceGradNode( const std::shared_ptr& op, std::shared_ptr< diff --git a/paddle/fluid/operators/norm_utils.h b/paddle/fluid/operators/norm_utils.h new file mode 100644 index 00000000..fee06fe5 --- /dev/null +++ b/paddle/fluid/operators/norm_utils.h @@ -0,0 +1,46 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using DataLayout = framework::DataLayout; + +inline void ExtractNCWHD(const framework::DDim &dims, + const DataLayout &data_layout, int *N, int *C, int *H, + int *W, int *D) { + *N = dims[0]; + if (dims.size() == 2) { + *C = dims[1]; + *H = 1; + *W = 1; + *D = 1; + } else { + *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; + *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; + *W = dims.size() > 3 + ? (data_layout == DataLayout::kNCHW ? dims[3] : dims[2]) + : 1; + *D = dims.size() > 4 + ? (data_layout == DataLayout::kNCHW ? dims[4] : dims[3]) + : 1; + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/one_hot_v2_op.cc b/paddle/fluid/operators/one_hot_v2_op.cc new file mode 100644 index 00000000..7a75afca --- /dev/null +++ b/paddle/fluid/operators/one_hot_v2_op.cc @@ -0,0 +1,122 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
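+//
+// one_hot_v2 expands every integer index in X into a one-hot vector of
+// length `depth`, appending `depth` as a new trailing dimension of Out.
+// `depth` is taken from the attribute, or at runtime from the optional
+// `depth_tensor` input, in which case InferShape records it as -1.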
+ +#include "paddle/fluid/operators/one_hot_v2_op.h" +#include +#include +#include "paddle/fluid/framework/framework.pb.h" + +namespace paddle { +namespace operators { + +class OneHotV2Op : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, + "Input(X) of OneHotOp should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, + "Output(Out) of OneHotOp should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_GE(x_dims.size(), 1, + "Rank of Input(X) should be at least 1."); + + int depth = ctx->Attrs().Get("depth"); + if (ctx->HasInput("depth_tensor")) { + depth = -1; + } + + auto out_dims_vec = framework::vectorize(x_dims); + out_dims_vec.push_back(depth); + auto out_dims = framework::make_ddim(out_dims_vec); + ctx->SetOutputDim("Out", out_dims); + ctx->ShareLoD("X", /* --> */ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + if (var_name == "depth_tensor") { + return expected_kernel_type; + } + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } +}; + +class OneHotV2OpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(LoDTensor, LoDTensor) Input variable with rank at least 2. " + "The last dimension of X should be 1. Each value of X is an index " + "to indicate the position."); + AddInput("depth_tensor", "(Tensor, Tensor), Length of one-hot vector") + .AsDispensable(); + AddOutput("Out", + "(Tensor, Tensor) Output tensor with same rank as X. " + "The tensor consists of one-hot representations of values in X."); + + AddAttr("depth", + "A positive integer to specify the length of one-hot vector.") + .SetDefault(-1); + AddAttr("dtype", + "An integer to specify the data type of one-hot " + "vector. The default value is FP32.") + .SetDefault(paddle::framework::proto::VarType::FP32); + AddAttr("allow_out_of_range", + "If it is set true and the input data is out of range, " + "the output tensor will be filled zeros. The default value " + "is false.") + .SetDefault(false); + AddComment(R"DOC( +One Hot Operator. This operator creates the one-hot representations for input +index values. 
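+Each index value v in X selects a vector of length `depth` whose v-th
+entry is 1 and whose remaining entries are 0.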
The following example will help to explain the function of this +operator: + +X is a LoDTensor: + X.lod = [[0, 1, 4]] + X.shape = [4] + X.data = [1, 1, 3, 0] + +set depth = 4 + +Out is a LoDTensor: + Out.lod = [[0, 1, 4]] + Out.shape = [4, 4] + Out.data = [[0., 1., 0., 0.], + [0., 1., 0., 0.], + [0., 0., 0., 1.], + [1., 0., 0., 0.]] +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(one_hot_v2, ops::OneHotV2Op, ops::OneHotV2OpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL( + one_hot_v2, ops::OneHotV2Kernel, + ops::OneHotV2Kernel); diff --git a/paddle/fluid/operators/one_hot_v2_op.cu b/paddle/fluid/operators/one_hot_v2_op.cu new file mode 100644 index 00000000..2366f142 --- /dev/null +++ b/paddle/fluid/operators/one_hot_v2_op.cu @@ -0,0 +1,99 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/one_hot_v2_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace operators { +using platform::PADDLE_CUDA_NUM_THREADS; + +template +__global__ void FillOutputKernel(const InT* p_in_data, OutT* p_out_data, + const int64_t numel, const int depth) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < numel && p_in_data[idx] >= 0 && p_in_data[idx] < depth) { + *(p_out_data + (idx * depth) + p_in_data[idx]) = 1.0; + } +} + +template +struct OneHotV2OpCUDAFunctor { + const framework::LoDTensor* in_; + framework::LoDTensor* out_; + const DeviceContext& ctx_; + int depth_; + + OneHotV2OpCUDAFunctor(const framework::LoDTensor* in, + framework::LoDTensor* out, int depth, + const DeviceContext& ctx) + : in_(in), out_(out), depth_(depth), ctx_(ctx) {} + + template + void apply() const { + auto* p_in_data = in_->data(); + auto numel = in_->numel(); + auto* p_out_data = out_->mutable_data(ctx_.GetPlace()); + auto stream = ctx_.stream(); + math::set_constant(ctx_, out_, 0.0); + + FillOutputKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / + PADDLE_CUDA_NUM_THREADS, + PADDLE_CUDA_NUM_THREADS, 0, stream>>>( + p_in_data, p_out_data, numel, depth_); + } +}; + +using LoDTensor = framework::LoDTensor; +template +class OneHotV2CUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out = context.Output("Out"); + + int depth = -1; + if (context.HasInput("depth_tensor")) { + auto* depth_tensor = context.Input("depth_tensor"); + if (platform::is_gpu_place(depth_tensor->place())) { + framework::Tensor temp; + TensorCopySync(*depth_tensor, platform::CPUPlace(), &temp); + depth = *temp.data(); + } else { + depth = *depth_tensor->data(); + } + + auto out_dims = out->dims(); + out_dims[out_dims.size() - 1] = depth; + out->Resize(out_dims); + } else { + depth = context.Attr("depth"); + } + framework::VisitDataType( + 
static_cast( + context.Attr("dtype")), + OneHotV2OpCUDAFunctor( + in, out, depth, context.template device_context())); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + one_hot_v2, + ops::OneHotV2CUDAKernel, + ops::OneHotV2CUDAKernel); diff --git a/paddle/fluid/operators/one_hot_v2_op.h b/paddle/fluid/operators/one_hot_v2_op.h new file mode 100644 index 00000000..7cfe2d61 --- /dev/null +++ b/paddle/fluid/operators/one_hot_v2_op.h @@ -0,0 +1,94 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +struct OneHotV2OpFunctor { + const framework::LoDTensor* in_; + framework::LoDTensor* out_; + int depth_; + const DeviceContext& ctx_; + bool allow_out_of_range_; + + OneHotV2OpFunctor(const framework::LoDTensor* in, framework::LoDTensor* out, + int depth, const DeviceContext& ctx, + bool allow_out_of_range = false) + : in_(in), + out_(out), + depth_(depth), + ctx_(ctx), + allow_out_of_range_(allow_out_of_range) {} + + template + void apply() const { + auto* p_in_data = in_->data(); + auto numel = in_->numel(); + auto* p_out_data = out_->mutable_data(ctx_.GetPlace()); + math::set_constant(ctx_, out_, 0.0); + + if (allow_out_of_range_) { + for (int i = 0; i < numel; ++i) { + if (p_in_data[i] >= 0 && p_in_data[i] < depth_) { + *(p_out_data + i * depth_ + p_in_data[i]) = 1.0; + } + } + } else { + for (int i = 0; i < numel; ++i) { + PADDLE_ENFORCE_GE(p_in_data[i], 0, + "Illegal index value, should be at least 0."); + PADDLE_ENFORCE_LT( + p_in_data[i], depth_, + "Illegal index value, should be less than depth (%d).", depth_); + *(p_out_data + i * depth_ + p_in_data[i]) = 1.0; + } + } + } +}; + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; +template +class OneHotV2Kernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out = context.Output("Out"); + int depth = context.Attr("depth"); + bool allow_out_of_range = context.Attr("allow_out_of_range"); + if (context.HasInput("depth_tensor")) { + auto* depth_tensor = context.Input("depth_tensor"); + auto* depth_data = depth_tensor->data(); + depth = depth_data[0]; + auto out_dims = out->dims(); + out_dims[out_dims.size() - 1] = depth; + out->Resize(out_dims); + } + + framework::VisitDataType( + static_cast( + context.Attr("dtype")), + OneHotV2OpFunctor( + in, out, depth, context.template device_context(), + allow_out_of_range)); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/adadelta_op.cc b/paddle/fluid/operators/optimizers/adadelta_op.cc index dd365629..01c0f1bb 100644 --- a/paddle/fluid/operators/optimizers/adadelta_op.cc +++ 
b/paddle/fluid/operators/optimizers/adadelta_op.cc @@ -56,6 +56,11 @@ class AdadeltaOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( param_dim, ctx->GetInputDim("Grad"), "param and grad input of AdadeltaOp should have same dimension"); + PADDLE_ENFORCE_NE(framework::product(ctx->GetInputDim("AvgSquaredGrad")), 0, + "Maybe the Input variable AvgSquaredGrad has not " + "been initialized. You may need to confirm if you put " + "exe.run(startup_program) after optimizer.minimize " + "function."); PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredGrad"), "Param and AvgSquaredGrad input of AdadeltaOp " "should have same dimension"); diff --git a/paddle/fluid/operators/optimizers/adagrad_op.cc b/paddle/fluid/operators/optimizers/adagrad_op.cc index bd1bb98e..0310fe2e 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/adagrad_op.cc @@ -44,6 +44,11 @@ class AdagradOp : public framework::OperatorWithKernel { "Output(MomentOut) of AdagradOp should not be null."); auto lr_dims = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_NE(framework::product(lr_dims), 0, + "Maybe the Input variable LearningRate has not " + "been initialized. You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function."); PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, "LearningRate should have one element"); auto param_dims = ctx->GetInputDim("Param"); diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc index dd347aa0..fc851e56 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cc +++ b/paddle/fluid/operators/optimizers/adam_op.cc @@ -43,6 +43,11 @@ void AdamOp::InferShape(framework::InferShapeContext* ctx) const { "Output(Moment2Out) of AdamOp should not be null."); auto lr_dims = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_NE(framework::product(lr_dims), 0, + "Maybe the Input variable LearningRate has not " + "been initialized. You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function."); PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, "Learning rate should have 1 dimension"); auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow"); diff --git a/paddle/fluid/operators/optimizers/adamax_op.cc b/paddle/fluid/operators/optimizers/adamax_op.cc index aef1fc97..a0152906 100644 --- a/paddle/fluid/operators/optimizers/adamax_op.cc +++ b/paddle/fluid/operators/optimizers/adamax_op.cc @@ -54,6 +54,11 @@ class AdamaxOp : public framework::OperatorWithKernel { "Output(InfNormOut) of AdamaxOp should not be null."); auto lr_dims = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_NE(framework::product(lr_dims), 0, + "Maybe the Input variable LearningRate has not " + "been initialized. 
You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function."); PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, "Learning rate should have 1 dimension"); auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow"); diff --git a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc index 07899278..b44a84cc 100644 --- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cc @@ -49,6 +49,11 @@ class DecayedAdagradOp : public framework::OperatorWithKernel { "Output(MomentOut) of DecayedAdagradOp should not be null."); auto lr_dims = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_NE(framework::product(lr_dims), 0, + "Maybe the Input variable LearningRate has not " + "been initialized. You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function."); PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, "LearningRate should have one element"); auto param_dims = ctx->GetInputDim("Param"); diff --git a/paddle/fluid/operators/optimizers/dpsgd_op.cc b/paddle/fluid/operators/optimizers/dpsgd_op.cc new file mode 100644 index 00000000..f263e675 --- /dev/null +++ b/paddle/fluid/operators/optimizers/dpsgd_op.cc @@ -0,0 +1,107 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include "paddle/fluid/operators/optimizers/dpsgd_op.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+class DpsgdOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true,
+                      "Input(Param) of DpsgdOp should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true,
+                      "Input(Grad) of DpsgdOp should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->HasInput("LearningRate"), true,
+                      "Input(LearningRate) of DpsgdOp should not be null.");
+    PADDLE_ENFORCE_EQ(
+        ctx->GetInputsVarType("Param").front(),
+        framework::proto::VarType::LOD_TENSOR,
+        "The input var %s's type should be LoDTensor, but the received is %s",
+        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
+    PADDLE_ENFORCE_EQ(
+        ctx->GetInputsVarType("Grad").front(),
+        framework::proto::VarType::LOD_TENSOR,
+        "The input var %s's type should be LoDTensor, but the received is %s",
+        ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
+
+    PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true,
+                      "Output(ParamOut) of DpsgdOp should not be null.");
+
+    auto lr_dims = ctx->GetInputDim("LearningRate");
+    PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
+                      "Learning rate should have 1 dimension");
+    auto param_dims = ctx->GetInputDim("Param");
+    PADDLE_ENFORCE_EQ(
+        param_dims, ctx->GetInputDim("Grad"),
+        "Param and Grad input of DpsgdOp should have same dimension");
+
+    ctx->SetOutputDim("ParamOut", param_dims);
+  }
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(ctx.Input<Tensor>("Param")->type(),
+                                   ctx.GetPlace());
+  }
+};
+
+class DpsgdOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Param", "(Tensor) Input parameter");
+    AddInput("Grad", "(Tensor) Input gradient");
+    AddInput("LearningRate", "(Tensor) Learning rate");
+
+    AddOutput("ParamOut", "(Tensor) Output parameter");
+
+    AddAttr<float>("clip",
+                   "(float, default 10.0) "
+                   "L2 norm threshold used to clip each gradient "
+                   "before the update.")
+        .SetDefault(10.0f);
+    AddAttr<float>("batch_size",
+                   "(float, default 16.0) "
+                   "Batch size over which the injected Gaussian "
+                   "noise is averaged.")
+        .SetDefault(16.0f);
+    AddAttr<float>("sigma",
+                   "(float, default 1.0) "
+                   "Standard deviation of the injected Gaussian noise.")
+        .SetDefault(1.0f);
+    AddComment(R"DOC(
+Dpsgd Optimizer.
+
+We implement the Dpsgd optimizer according to the CCS16 paper -
+Deep Learning with Differential Privacy.
+
+Dpsgd updates:
+CCS16 - Deep Learning with Differential Privacy.
+[https://arxiv.org/abs/1607.00133]
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(dpsgd, ops::DpsgdOp, ops::DpsgdOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    dpsgd, ops::DpsgdOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::DpsgdOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/optimizers/dpsgd_op.h b/paddle/fluid/operators/optimizers/dpsgd_op.h
new file mode 100644
index 00000000..4eba7fed
--- /dev/null
+++ b/paddle/fluid/operators/optimizers/dpsgd_op.h
@@ -0,0 +1,114 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class DpsgdOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const auto *param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE_EQ(param_var->IsType(), true, + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Param").front(), + framework::ToTypeName(param_var->Type())); + + const auto *grad_var = ctx.InputVar("Grad"); + PADDLE_ENFORCE_EQ(grad_var->IsType(), true, + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.Inputs("Grad").front(), + framework::ToTypeName(grad_var->Type())); + + const auto *learning_rate = ctx.Input("LearningRate"); + + const auto *param = ctx.Input("Param"); + const auto *grad = ctx.Input("Grad"); + + auto *param_out = ctx.Output("ParamOut"); + + auto sz = param_out->numel(); + PADDLE_ENFORCE_EQ(param->numel(), sz); + PADDLE_ENFORCE_EQ(grad->numel(), sz); + + const T *lr = learning_rate->data(); + const T *param_data = param->data(); + const T *grad_data = grad->data(); + + T *out_data = param_out->mutable_data(ctx.GetPlace()); + + T clip = static_cast(ctx.Attr("clip")); + T batch_size = static_cast(ctx.Attr("batch_size")); + T sigma = static_cast(ctx.Attr("sigma")); + + // compute clipping + float l2_norm = 0.0; + for (int64_t i = 0; i < grad->numel(); ++i) { + l2_norm = l2_norm + grad_data[i] * grad_data[i]; + } + l2_norm = std::sqrt(l2_norm); + + float scale = 1.0; + if (l2_norm > clip) { + scale = l2_norm / clip; + } + + // generate gaussian noise. + // [https://en.wikipedia.org/wiki/Box-Muller_transform] + float V1, V2, S; + float X; + float mu = 0.0; + float U1, U2; + unsigned seed = (unsigned int)(time(NULL)); + std::minstd_rand engine; + engine.seed(seed); + std::uniform_real_distribution dist(0.0, 1.0); + do { + // srand((unsigned int)(time(NULL))); + // U1 = (rand() * 1.0) / RAND_MAX; + // U2 = (rand() * 1.0) / RAND_MAX; + // U1 = rand_rr(&seed) * (1.0 / RAND_MAX); + // U2 = rand_rr(&seed) * (1.0 / RAND_MAX); + U1 = dist(engine); + U2 = dist(engine); + V1 = 2 * U1 - 1; + V2 = 2 * U2 - 1; + S = V1 * V1 + V2 * V2; + } while (S >= 1 || S == 0); + + X = V1 * sqrt(-2 * log(S) / S); + + float gaussian_noise = mu + X * sigma; + + // update parameters + for (int64_t i = 0; i < grad->numel(); ++i) { + out_data[i] = + param_data[i] - + lr[0] * (grad_data[i] / scale + gaussian_noise / batch_size); + } + // CCS16 - Deep Learning with Differential Privacy. 
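+    // Note: one Gaussian sample is drawn per update and shared across
+    // all parameter elements; gradients are first clipped to L2 norm
+    // `clip` via `scale`, and the noise is divided by `batch_size`, as in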
+ // [https://arxiv.org/abs/1607.00133] + } // Compute +}; // class +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/ftrl_op.cc b/paddle/fluid/operators/optimizers/ftrl_op.cc index c1a4f579..98b71175 100644 --- a/paddle/fluid/operators/optimizers/ftrl_op.cc +++ b/paddle/fluid/operators/optimizers/ftrl_op.cc @@ -57,6 +57,11 @@ class FTRLOp : public framework::OperatorWithKernel { "Two input of FTRL Op's dimension must be same."); auto lr_dim = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_NE(framework::product(lr_dim), 0, + "Maybe the Input variable LearningRate has not " + "been initialized. You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function."); PADDLE_ENFORCE_EQ(framework::product(lr_dim), 1, "Learning Rate should be a scalar."); diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.h b/paddle/fluid/operators/optimizers/lars_momentum_op.h index e85be99f..e0064c20 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.h +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.h @@ -30,7 +30,7 @@ class LarsMomentumOpKernel : public framework::OpKernel { auto learning_rate = ctx.Input("LearningRate"); auto* grad_var = ctx.InputVar("Grad"); // only support dense for now. - PADDLE_ENFORCE(grad_var->IsType()); + PADDLE_ENFORCE_EQ(grad_var->IsType(), true); auto grad = ctx.Input("Grad"); param_out->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h index 29a2ae67..f56f5b6b 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.h +++ b/paddle/fluid/operators/optimizers/momentum_op.h @@ -54,6 +54,15 @@ class MomentumOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasOutput("VelocityOut"), "Output(VelocityOut) of Momentum should not be null."); + auto lr_dims = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_NE(framework::product(lr_dims), 0, + "Maybe the Input variable LearningRate has not " + "been initialized. You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function."); + PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, + "Learning_rate should be a scalar"); + auto param_dim = ctx->GetInputDim("Param"); if (ctx->GetInputsVarType("Grad")[0] == framework::proto::VarType::LOD_TENSOR) { @@ -64,8 +73,6 @@ class MomentumOp : public framework::OperatorWithKernel { param_dim, ctx->GetInputDim("Velocity"), "Param and Velocity of MomentumOp should have the same dimension."); } - PADDLE_ENFORCE_EQ(framework::product(ctx->GetInputDim("LearningRate")), 1, - "Learning_rate should be a scalar"); ctx->SetOutputDim("ParamOut", param_dim); ctx->SetOutputDim("VelocityOut", param_dim); diff --git a/paddle/fluid/operators/optimizers/sgd_op.cc b/paddle/fluid/operators/optimizers/sgd_op.cc index 62163e45..9ccf3d93 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cc +++ b/paddle/fluid/operators/optimizers/sgd_op.cc @@ -32,6 +32,11 @@ class SGDOp : public framework::OperatorWithKernel { "Output(ParamOut) of SGDOp should not be null."); auto lr_dims = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_NE(framework::product(lr_dims), 0, + "Maybe the Input variable LearningRate has not " + "been initialized. 
You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function."); PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1, "Learning rate should have 1 element"); auto param_dim = ctx->GetInputDim("Param"); diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc index 4a332ce1..b26f1270 100644 --- a/paddle/fluid/operators/pool_cudnn_op.cu.cc +++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc @@ -65,9 +65,9 @@ class PoolCUDNNOpKernel : public framework::OpKernel { } cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, framework::vectorize2int(input->dims())); + layout, framework::vectorize(input->dims())); cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, framework::vectorize2int(output->dims())); + layout, framework::vectorize(output->dims())); PoolingMode pooling_mode; if (pooling_type == "max") { @@ -132,9 +132,9 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel { } cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( - layout, framework::vectorize2int(input->dims())); + layout, framework::vectorize(input->dims())); cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( - layout, framework::vectorize2int(output->dims())); + layout, framework::vectorize(output->dims())); PoolingMode pooling_mode; if (pooling_type == "max") { diff --git a/paddle/fluid/operators/prelu_op.cu b/paddle/fluid/operators/prelu_op.cu index 998768db..4a26c98a 100644 --- a/paddle/fluid/operators/prelu_op.cu +++ b/paddle/fluid/operators/prelu_op.cu @@ -41,7 +41,7 @@ class CUDAPReluKernel : public framework::OpKernel { int numel = x->numel(); auto dim = x->dims(); - std::vector input_shape = framework::vectorize2int(dim); + std::vector input_shape = framework::vectorize(dim); if (mode == "channel") { math::PreluChannelWiseDirectCUDAFunctor prelu_channel_wise; @@ -157,7 +157,7 @@ class CUDAPReluGradKernel : public framework::OpKernel { int numel = x->numel(); auto dim = x->dims(); - std::vector input_shape = framework::vectorize2int(dim); + std::vector input_shape = framework::vectorize(dim); auto stream = context.cuda_device_context().stream(); T* dalpha_tmp_ptr; diff --git a/paddle/fluid/operators/prroi_pool_op.cc b/paddle/fluid/operators/prroi_pool_op.cc new file mode 100644 index 00000000..6d5129f8 --- /dev/null +++ b/paddle/fluid/operators/prroi_pool_op.cc @@ -0,0 +1,188 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/prroi_pool_op.h" +#include + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +class PRROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor), " + "the input of PRROIPoolOp. " + "The format of input tensor is NCHW. 
Where N is the batch size, " + "C is the number of input channels, " + "H is the height of the input feature map, and " + "W is the width."); + AddInput("ROIs", + "(LoDTensor), " + "ROIs (Regions of Interest) to pool over. " + "should be a 2-D LoDTensor of shape (num_rois, 4) " + "given as [(x1, y1, x2, y2), ...]. " + "where (x1, y1) is the top left coordinates, and " + "(x2, y2) is the bottom right coordinates. " + "The roi batch index can be calculated from LoD."); + AddOutput("Out", + "(Tensor), " + "the output of PRROIPoolOp is a 4-D Tensor with shape " + "(num_rois, output_channels, pooled_h, pooled_w)."); + AddAttr( + "output_channels", + "(int), " + "the number of channels of the output feature map. " + "For a task of C classes of objects, output_channels should be " + "(C + 1) for classification only."); + AddAttr("spatial_scale", + "(float, default 1.0), " + "Multiplicative spatial scale factor " + "to translate ROI coords from their input scale " + "to the scale used when pooling.") + .SetDefault(1.0); + AddAttr("pooled_height", + "(int, default 1), " + "the pooled output height.") + .SetDefault(1); + AddAttr("pooled_width", + "(int, default 1), " + "the pooled output width.") + .SetDefault(1); + AddComment(R"Doc( +**PRROIPool Operator** + +Precise region of interest pooling (also known as PRROIPooling) is to perform + bilinear interpolation average pooling method for RoI Pooling. + +Please refer to https://arxiv.org/abs/1807.11590 for more details. + + )Doc"); + } +}; + +class PRROIPoolOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, + "Input(X) of op(PRROIPool) should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("ROIs"), true, + "Input(ROIs) of op(PRROIPool) should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, + "Output(Out) of op(PRROIPool) should not be null."); + auto input_dims = ctx->GetInputDim("X"); + auto rois_dims = ctx->GetInputDim("ROIs"); + + PADDLE_ENFORCE_EQ(input_dims.size(), 4, + "The format of input tensor is NCHW"); + PADDLE_ENFORCE_EQ(rois_dims.size(), 2, + "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) " + "given as [(x1, y1, x2, y2), ...]"); + PADDLE_ENFORCE_EQ(rois_dims[1], 4, + "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) " + "given as [(x1, y1, x2, y2), ...]"); + + int pooled_height = ctx->Attrs().Get("pooled_height"); + int pooled_width = ctx->Attrs().Get("pooled_width"); + int output_channels = ctx->Attrs().Get("output_channels"); + float spatial_scale = ctx->Attrs().Get("spatial_scale"); + + PADDLE_ENFORCE_EQ( + input_dims[1], output_channels * pooled_height * pooled_width, + "the channel of X(%d) should be equal to the product of " + "output_channels(%d), pooled_height(%d) and pooled_width(%d)", + input_dims[1], output_channels, pooled_height, pooled_width); + + PADDLE_ENFORCE_GT(pooled_height, 0, + "The pooled output height must be greater than 0"); + PADDLE_ENFORCE_GT(pooled_width, 0, + "The pooled output width must be greater than 0"); + PADDLE_ENFORCE_GT(output_channels, 1, + "The pooled output channels must greater than 1"); + PADDLE_ENFORCE_GT(spatial_scale, 0.0f, + "The spatial scale must greater than 0."); + + auto out_dims = input_dims; + out_dims[0] = rois_dims[0]; + out_dims[1] = + output_channels; // input_dims[1] / (pooled_height * pooled_width); + out_dims[2] = pooled_height; + out_dims[3] = pooled_width; + 
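// The output is laid out as (num_rois, output_channels, pooled_height,
+    // pooled_width); the channel check above ties input_dims[1] to
+    // output_channels * pooled_height * pooled_width.
+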
ctx->SetOutputDim("Out", out_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); + } +}; + +class PRROIPoolGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, + "The gradient of Out should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true, + "The gradient of X should not be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); + } +}; + +class PRROIPoolGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("prroi_pool_grad"); + op->SetInput("X", Input("X")); + op->SetInput("ROIs", Input("ROIs")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(prroi_pool, ops::PRROIPoolOp, ops::PRROIPoolOpMaker, + ops::PRROIPoolGradDescMaker); +REGISTER_OPERATOR(prroi_pool_grad, ops::PRROIPoolGradOp); +REGISTER_OP_CPU_KERNEL( + prroi_pool, + ops::CPUPRROIPoolOpKernel, + ops::CPUPRROIPoolOpKernel); +REGISTER_OP_CPU_KERNEL( + prroi_pool_grad, + ops::CPUPRROIPoolGradOpKernel, + ops::CPUPRROIPoolGradOpKernel); diff --git a/paddle/fluid/operators/prroi_pool_op.cu b/paddle/fluid/operators/prroi_pool_op.cu new file mode 100644 index 00000000..915e3daa --- /dev/null +++ b/paddle/fluid/operators/prroi_pool_op.cu @@ -0,0 +1,309 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/prroi_pool_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaximumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaximumNumBlocks); +} + +template +DEVICE void PrRoIPoolingDistributeDiffCUDA(T* diff, const T top_diff, + const int h, const int w, + const int height, const int width, + const T coeff) { + bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width); + if (!overflow) { + paddle::platform::CudaAtomicAdd(diff + h * width + w, top_diff * coeff); + } +} + +template +__global__ void GPUPRROIPoolForward( + const int nthreads, const T* input_data, const T* input_rois, + const float spatial_scale, const int input_channels, const int height, + const int width, const int output_channels, const int pooled_height, + const int pooled_width, const int* rois_batch_id_data, T* output_data) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (size_t i = index; i < nthreads; i += offset) { + // The output is in order (n, c, ph, pw) + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % output_channels; + int n = i / pooled_width / pooled_height / output_channels; + + // set roi_batch_id + int roi_batch_id = rois_batch_id_data[n]; + + // [start, end) interval for spatial sampling + const T* offset_input_rois = input_rois + n * 4; + T roi_start_w = static_cast(offset_input_rois[0]) * spatial_scale; + T roi_start_h = static_cast(offset_input_rois[1]) * spatial_scale; + T roi_end_w = static_cast(offset_input_rois[2]) * spatial_scale; + T roi_end_h = static_cast(offset_input_rois[3]) * spatial_scale; + + T roi_width = max(roi_end_w - roi_start_w, static_cast(0.0)); + T roi_height = max(roi_end_h - roi_start_h, static_cast(0.0)); + + // Compute w and h at input feature map + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + T win_start_w = roi_start_w + bin_size_w * pw; + T win_start_h = roi_start_h + bin_size_h * ph; + T win_end_w = win_start_w + bin_size_w; + T win_end_h = win_start_h + bin_size_h; + + T win_size = max(static_cast(0.0), bin_size_w * bin_size_h); + int input_channel = (c * pooled_height + ph) * pooled_width + pw; + const T* offset_input_data = + input_data + + (roi_batch_id * input_channels + input_channel) * height * width; + + if (win_size > static_cast(0.0)) { + int s_w = floor(win_start_w); + int e_w = ceil(win_end_w); + int s_h = floor(win_start_h); + int e_h = ceil(win_end_h); + T sum_out = 0; + + for (int w_iter = s_w; w_iter < e_w; ++w_iter) { + for (int h_iter = s_h; h_iter < e_h; ++h_iter) { + sum_out += PrRoIPoolingMatCalculation( + offset_input_data, h_iter, w_iter, h_iter + 1, w_iter + 1, + max(win_start_h, static_cast(h_iter)), + max(win_start_w, static_cast(w_iter)), + min(win_end_h, static_cast(h_iter) + static_cast(1.0)), + min(win_end_w, static_cast(w_iter) + static_cast(1.0)), + height, width); + } + } + output_data[i] = sum_out / win_size; + } else { + output_data[i] = 0.; + } + } +} + +template +__global__ void GPUPRROIPoolBackward( + const int nthreads, const T* input_rois, const T* output_grad_data, + const float spatial_scale, const int 
input_channels, const int height, + const int width, const int output_channels, const int pooled_height, + const int pooled_width, const int* rois_batch_id_data, T* input_grad_data) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + // The output is in order (n, c, ph, pw) + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % output_channels; + int n = i / pooled_width / pooled_height / output_channels; + + // set roi_batch_id + int roi_batch_id = rois_batch_id_data[n]; + int input_channel = (c * pooled_height + ph) * pooled_width + pw; + int input_offset = + (roi_batch_id * input_channels + input_channel) * height * width; + T* offset_input_grad_data = input_grad_data + input_offset; + const T* offset_output_grad_data = output_grad_data + i; + + // [start, end) interval for spatial sampling + const T* offset_input_rois = input_rois + n * 4; + T roi_start_w = static_cast(offset_input_rois[0]) * spatial_scale; + T roi_start_h = static_cast(offset_input_rois[1]) * spatial_scale; + T roi_end_w = static_cast(offset_input_rois[2]) * spatial_scale; + T roi_end_h = static_cast(offset_input_rois[3]) * spatial_scale; + + T roi_width = max(roi_end_w - roi_start_w, static_cast(0.0)); + T roi_height = max(roi_end_h - roi_start_h, static_cast(0.0)); + + // Compute w and h at input feature map + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + T win_start_w = roi_start_w + bin_size_w * pw; + T win_start_h = roi_start_h + bin_size_h * ph; + T win_end_w = win_start_w + bin_size_w; + T win_end_h = win_start_h + bin_size_h; + + T win_size = max(static_cast(0.0), bin_size_w * bin_size_h); + int s_w = floor(win_start_w); + int e_w = ceil(win_end_w); + int s_h = floor(win_start_h); + int e_h = ceil(win_end_h); + + T sum_out = win_size == static_cast(0.) + ? static_cast(0.) 
+ : *offset_output_grad_data / win_size; + + for (int w_iter = s_w; w_iter < e_w; ++w_iter) { + for (int h_iter = s_h; h_iter < e_h; ++h_iter) { + PrRoIPoolingMatDistributeDiff( + offset_input_grad_data, sum_out, h_iter, w_iter, h_iter + 1, + w_iter + 1, max(win_start_h, static_cast(h_iter)), + max(win_start_w, static_cast(w_iter)), + min(win_end_h, static_cast(h_iter) + static_cast(1.0)), + min(win_end_w, static_cast(w_iter) + static_cast(1.0)), + height, width, PrRoIPoolingDistributeDiffCUDA); + } + } + } +} + +template +class GPUPRROIPoolOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* out = ctx.Output("Out"); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto output_channels = ctx.Attr("output_channels"); + auto spatial_scale = ctx.Attr("spatial_scale"); + + auto in_dims = in->dims(); + int batch_size = in_dims[0]; + int input_channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + + PADDLE_ENFORCE_EQ(input_channels, + output_channels * pooled_height * pooled_width, + "the channels of input X should equal the product of " + "output_channels x pooled_height x pooled_width"); + + int rois_num = rois->dims()[0]; + if (rois_num == 0) return; + + auto rois_lod = rois->lod().back(); + int rois_batch_size = rois_lod.size() - 1; + PADDLE_ENFORCE_EQ( + rois_batch_size, batch_size, + "The rois_batch_size and input(X) batch_size must be the same."); + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod, + "The rois_num from input and lod must be the same."); + + // set rois batch id + framework::Tensor rois_batch_id_list; + rois_batch_id_list.Resize({rois_num}); + int* rois_batch_id_data = + rois_batch_id_list.mutable_data(platform::CPUPlace()); + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } + } + + framework::Tensor rois_batch_id_list_gpu; + framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(), + ctx.device_context(), &rois_batch_id_list_gpu); + + int output_size = out->numel(); + int blocks = NumBlocks(output_size); + int threads = kNumCUDAThreads; + + // call cuda kernel function + GPUPRROIPoolForward< + T><<>>( + output_size, in->data(), rois->data(), spatial_scale, + input_channels, height, width, output_channels, pooled_height, + pooled_width, rois_batch_id_list_gpu.data(), + out->mutable_data(ctx.GetPlace())); + } +}; + +template +class GPUPRROIPoolGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + + auto* output_grad = ctx.Input(framework::GradVarName("Out")); + auto* input_grad = ctx.Output(framework::GradVarName("X")); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto output_channels = ctx.Attr("output_channels"); + auto spatial_scale = ctx.Attr("spatial_scale"); + + int rois_num = rois->dims()[0]; + int input_channels = in->dims()[1]; + int height = in->dims()[2]; + int width = in->dims()[3]; + + if (input_grad) { + // set roi batch id + framework::Tensor rois_batch_id_list; + rois_batch_id_list.Resize({rois_num}); + int* rois_batch_id_data = + rois_batch_id_list.mutable_data(platform::CPUPlace()); + auto rois_lod = rois->lod().back(); 
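+      // The ROI LoD partitions rois by image: entries in
+      // [rois_lod[n], rois_lod[n + 1]) belong to batch element n, which
+      // is what the batch-id list built below records for the CUDA kernel.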
+ int rois_batch_size = rois_lod.size() - 1; + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } + } + + framework::Tensor rois_batch_id_list_gpu; + framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(), + ctx.device_context(), &rois_batch_id_list_gpu); + + input_grad->mutable_data(ctx.GetPlace()); + math::SetConstant set_zero; + set_zero(ctx.cuda_device_context(), input_grad, static_cast(0)); + + int output_grad_size = output_grad->numel(); + int blocks = NumBlocks(output_grad_size); + int threads = kNumCUDAThreads; + + if (output_grad_size > 0) { + GPUPRROIPoolBackward< + T><<>>( + output_grad_size, rois->data(), output_grad->data(), + spatial_scale, input_channels, height, width, output_channels, + pooled_height, pooled_width, rois_batch_id_list_gpu.data(), + input_grad->mutable_data(ctx.GetPlace())); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(prroi_pool, ops::GPUPRROIPoolOpKernel, + ops::GPUPRROIPoolOpKernel); +REGISTER_OP_CUDA_KERNEL( + prroi_pool_grad, + ops::GPUPRROIPoolGradOpKernel, + ops::GPUPRROIPoolGradOpKernel); diff --git a/paddle/fluid/operators/prroi_pool_op.h b/paddle/fluid/operators/prroi_pool_op.h new file mode 100644 index 00000000..621e543f --- /dev/null +++ b/paddle/fluid/operators/prroi_pool_op.h @@ -0,0 +1,364 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +HOSTDEVICE T PrRoIPoolingGetData(const T* data, const int h, const int w, + const int height, const int width) { + bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width); + T retVal = overflow ? 
0.0f : data[h * width + w]; + return retVal; +} + +template +HOSTDEVICE T PrRoIPoolingMatCalculation(const T* this_data, const int s_h, + const int s_w, const int e_h, + const int e_w, const T y0, const T x0, + const T y1, const T x1, const int h0, + const int w0) { + T alpha, beta, lim_alpha, lim_beta, tmp; + T sum_out = 0; + + alpha = x0 - static_cast(s_w); + beta = y0 - static_cast(s_h); + lim_alpha = x1 - static_cast(s_w); + lim_beta = y1 - static_cast(s_h); + tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + + 0.5f * alpha * alpha) * + (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); + sum_out += PrRoIPoolingGetData(this_data, s_h, s_w, h0, w0) * tmp; + + alpha = static_cast(e_w) - x1; + lim_alpha = static_cast(e_w) - x0; + tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + + 0.5f * alpha * alpha) * + (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); + sum_out += PrRoIPoolingGetData(this_data, s_h, e_w, h0, w0) * tmp; + + alpha = x0 - static_cast(s_w); + beta = static_cast(e_h) - y1; + lim_alpha = x1 - static_cast(s_w); + lim_beta = static_cast(e_h) - y0; + tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + + 0.5f * alpha * alpha) * + (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); + sum_out += PrRoIPoolingGetData(this_data, e_h, s_w, h0, w0) * tmp; + + alpha = static_cast(e_w) - x1; + lim_alpha = static_cast(e_w) - x0; + tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + + 0.5f * alpha * alpha) * + (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); + sum_out += PrRoIPoolingGetData(this_data, e_h, e_w, h0, w0) * tmp; + + return sum_out; +} + +template +HOSTDEVICE void PrRoIPoolingDistributeDiff(T* diff, const T top_diff, + const int h, const int w, + const int height, const int width, + const T coeff) { + bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width); + if (!overflow) { + *(diff + h * width + w) = top_diff * coeff; + } +} + +template +HOSTDEVICE void PrRoIPoolingMatDistributeDiff( + T* diff, const T top_diff, const int s_h, const int s_w, const int e_h, + const int e_w, const T y0, const T x0, const T y1, const T x1, const int h0, + const int w0, Functor functor) { + T alpha, beta, lim_alpha, lim_beta, tmp; + + alpha = x0 - static_cast(s_w); + beta = y0 - static_cast(s_h); + lim_alpha = x1 - static_cast(s_w); + lim_beta = y1 - static_cast(s_h); + tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + + 0.5f * alpha * alpha) * + (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); + functor(diff, top_diff, s_h, s_w, h0, w0, tmp); + + alpha = static_cast(e_w) - x1; + lim_alpha = static_cast(e_w) - x0; + tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + + 0.5f * alpha * alpha) * + (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); + functor(diff, top_diff, s_h, e_w, h0, w0, tmp); + + alpha = x0 - static_cast(s_w); + beta = static_cast(e_h) - y1; + lim_alpha = x1 - static_cast(s_w); + lim_beta = static_cast(e_h) - y0; + tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + + 0.5f * alpha * alpha) * + (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); + functor(diff, top_diff, e_h, s_w, h0, w0, tmp); + + alpha = static_cast(e_w) - x1; + lim_alpha = static_cast(e_w) - x0; + tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + + 0.5f * alpha * alpha) * + (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); + functor(diff, top_diff, e_h, e_w, h0, w0, tmp); +} + +template +class 
CPUPRROIPoolOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* out = ctx.Output("Out"); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto spatial_scale = ctx.Attr("spatial_scale"); + auto output_channels = ctx.Attr("output_channels"); + + auto in_dims = in->dims(); + int batch_size = in_dims[0]; + int input_channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + int rois_num = rois->dims()[0]; + + auto in_stride = framework::stride(in_dims); + auto out_stride = framework::stride(out->dims()); + + const T* input_data = in->data(); + + framework::Tensor rois_batch_id_list; + rois_batch_id_list.Resize({rois_num}); + int* rois_batch_id_data = + rois_batch_id_list.mutable_data(ctx.GetPlace()); + + auto rois_lod = rois->lod().back(); + int rois_batch_size = rois_lod.size() - 1; + PADDLE_ENFORCE_EQ( + rois_batch_size, batch_size, + "the rois_batch_size and input(X) batch_size should be the same."); + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_ENFORCE_EQ(rois_num_with_lod, rois_num, + "the rois_num from input and lod must be the same"); + + PADDLE_ENFORCE_EQ(input_channels, + output_channels * pooled_height * pooled_width, + "the channels of input X should equal the product of " + "output_channels x pooled_height x pooled_width"); + + // calculate batch id index for each roi according to LoD + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } + } + + T* output_data = out->mutable_data(ctx.GetPlace()); + const T* input_rois = rois->data(); + + // calculate prroipooling, parallel processing can be implemented per ROI + for (int n = 0; n < rois_num; ++n) { + // set roi batch id + int roi_batch_id = rois_batch_id_data[n]; + + // [start, end) interval for spatial sampling + const T* offset_input_rois = input_rois + n * 4; + T roi_start_w = static_cast(offset_input_rois[0]) * spatial_scale; + T roi_start_h = static_cast(offset_input_rois[1]) * spatial_scale; + T roi_end_w = static_cast(offset_input_rois[2]) * spatial_scale; + T roi_end_h = static_cast(offset_input_rois[3]) * spatial_scale; + + T roi_width = std::max(roi_end_w - roi_start_w, static_cast(0.0)); + T roi_height = std::max(roi_end_h - roi_start_h, static_cast(0.0)); + + // Compute w and h at input feature map + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + T win_size = std::max(static_cast(0.0), bin_size_w * bin_size_h); + + // calculate each pixel of the output feature map. 
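+      // Unlike max or average RoI pooling, PrRoI pooling integrates the
+      // bilinearly interpolated feature over each (possibly fractional)
+      // bin window and divides by the bin area `win_size`, so the output
+      // is continuous in the ROI coordinates.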
+ int out_roi_offset = n * out_stride[0]; + for (int c = 0; c < output_channels; ++c) { + // per category + int out_plane_offset = out_roi_offset + c * out_stride[1]; + for (int ph = 0; ph < pooled_height; ++ph) { + int out_row_offset = out_plane_offset + ph * out_stride[2]; + for (int pw = 0; pw < pooled_width; ++pw) { + // calculate w and h at input feature map + T win_start_h = static_cast(ph) * bin_size_h + roi_start_h; + T win_start_w = static_cast(pw) * bin_size_w + roi_start_w; + T win_end_h = win_start_h + bin_size_h; + T win_end_w = win_start_w + bin_size_w; + // Add roi offsets and clip to input boundaries + int s_w = std::floor(win_start_w); + int e_w = std::ceil(win_end_w); + int s_h = std::floor(win_start_h); + int e_h = std::ceil(win_end_h); + + int output_index = out_row_offset + pw; + int input_channel = (c * pooled_height + ph) * pooled_width + pw; + int input_plane_offset = + roi_batch_id * in_stride[0] + input_channel * in_stride[1]; + const T* offset_input_data = input_data + input_plane_offset; + T sum_out = 0.; + + if (win_size > static_cast(0.0)) { + for (int w_iter = s_w; w_iter < e_w; ++w_iter) { + for (int h_iter = s_h; h_iter < e_h; ++h_iter) { + sum_out += PrRoIPoolingMatCalculation( + offset_input_data, h_iter, w_iter, h_iter + 1, w_iter + 1, + std::max(win_start_h, static_cast(h_iter)), + std::max(win_start_w, static_cast(w_iter)), + std::min(win_end_h, + static_cast(h_iter) + static_cast(1.0)), + std::min(win_end_w, + static_cast(w_iter) + static_cast(1.0)), + height, width); + } + } + + output_data[output_index] = sum_out / win_size; + } else { + output_data[output_index] = 0.; + } + } + } + } + } + } +}; + +template +class CPUPRROIPoolGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* output_grad = + ctx.Input(framework::GradVarName("Out")); + auto* input_grad = + ctx.Output(framework::GradVarName("X")); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto output_channels = ctx.Attr("output_channels"); + auto spatial_scale = ctx.Attr("spatial_scale"); + + if (input_grad) { + auto in_dims = in->dims(); + int input_channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + int rois_num = rois->dims()[0]; + + // set roi batch id + framework::Tensor rois_batch_id_list; + rois_batch_id_list.Resize({rois_num}); + int* rois_batch_id_data = + rois_batch_id_list.mutable_data(ctx.GetPlace()); + auto rois_lod = rois->lod().back(); + int rois_batch_size = rois_lod.size() - 1; + // calculate batch id index for each roi according to LoD + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } + } + + const T* input_rois = rois->data(); + const T* output_grad_data = output_grad->data(); + T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + + // set gradient of X to be 0. before backpropagate. 
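+      // Zero-fill dX first; the loop below then scatters each output
+      // bin's gradient back to the integer grid points of every 1x1 cell
+      // the bin overlaps, weighted by the same bilinear integration
+      // coefficients as the forward pass and scaled by 1 / win_size.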
+
+template <typename DeviceContext, typename T>
+class CPUPRROIPoolGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<framework::Tensor>("X");
+    auto* rois = ctx.Input<framework::LoDTensor>("ROIs");
+    auto* output_grad =
+        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* input_grad =
+        ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+
+    auto pooled_height = ctx.Attr<int>("pooled_height");
+    auto pooled_width = ctx.Attr<int>("pooled_width");
+    auto output_channels = ctx.Attr<int>("output_channels");
+    auto spatial_scale = ctx.Attr<float>("spatial_scale");
+
+    if (input_grad) {
+      auto in_dims = in->dims();
+      int input_channels = in_dims[1];
+      int height = in_dims[2];
+      int width = in_dims[3];
+      int rois_num = rois->dims()[0];
+
+      // set roi batch id
+      framework::Tensor rois_batch_id_list;
+      rois_batch_id_list.Resize({rois_num});
+      int* rois_batch_id_data =
+          rois_batch_id_list.mutable_data<int>(ctx.GetPlace());
+      auto rois_lod = rois->lod().back();
+      int rois_batch_size = rois_lod.size() - 1;
+      // calculate batch id index for each roi according to LoD
+      for (int n = 0; n < rois_batch_size; ++n) {
+        for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+          rois_batch_id_data[i] = n;
+        }
+      }
+
+      const T* input_rois = rois->data<T>();
+      const T* output_grad_data = output_grad->data<T>();
+      T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
+
+      // set gradient of X to 0 before backpropagation
+      math::SetConstant<DeviceContext, T> set_zero;
+      set_zero(ctx.template device_context<DeviceContext>(), input_grad,
+               static_cast<T>(0));
+
+      // backpropagate gradient per output pixel
+      int output_grad_size = output_grad->numel();
+      for (int i = 0; i < output_grad_size; ++i) {
+        // The output is in order (n, c, ph, pw)
+        int pw = i % pooled_width;
+        int ph = (i / pooled_width) % pooled_height;
+        int c = (i / pooled_width / pooled_height) % output_channels;
+        int n = i / pooled_width / pooled_height / output_channels;
+
+        // set roi_batch_id
+        int roi_batch_id = rois_batch_id_data[n];
+        int input_channel = (c * pooled_height + ph) * pooled_width + pw;
+        int input_offset =
+            (roi_batch_id * input_channels + input_channel) * height * width;
+        T* offset_input_grad_data = input_grad_data + input_offset;
+        const T* offset_output_grad_data = output_grad_data + i;
+
+        // [start, end) interval for spatial sampling
+        const T* offset_input_rois = input_rois + n * 4;
+        T roi_start_w = static_cast<T>(offset_input_rois[0]) * spatial_scale;
+        T roi_start_h = static_cast<T>(offset_input_rois[1]) * spatial_scale;
+        T roi_end_w = static_cast<T>(offset_input_rois[2]) * spatial_scale;
+        T roi_end_h = static_cast<T>(offset_input_rois[3]) * spatial_scale;
+
+        T roi_width = std::max(roi_end_w - roi_start_w, static_cast<T>(0.0));
+        T roi_height = std::max(roi_end_h - roi_start_h, static_cast<T>(0.0));
+
+        // Compute the bin size at the input feature map
+        T bin_size_h = roi_height / static_cast<T>(pooled_height);
+        T bin_size_w = roi_width / static_cast<T>(pooled_width);
+
+        T win_start_w = roi_start_w + bin_size_w * pw;
+        T win_start_h = roi_start_h + bin_size_h * ph;
+        T win_end_w = win_start_w + bin_size_w;
+        T win_end_h = win_start_h + bin_size_h;
+
+        T win_size = std::max(static_cast<T>(0.0), bin_size_w * bin_size_h);
+
+        T sum_out = win_size == static_cast<T>(0.)
+                        ? static_cast<T>(0.)
+                        : *offset_output_grad_data / win_size;
+
+        int s_w = std::floor(win_start_w);
+        int e_w = std::ceil(win_end_w);
+        int s_h = std::floor(win_start_h);
+        int e_h = std::ceil(win_end_h);
+
+        for (int w_iter = s_w; w_iter < e_w; ++w_iter) {
+          for (int h_iter = s_h; h_iter < e_h; ++h_iter) {
+            PrRoIPoolingMatDistributeDiff<T>(
+                offset_input_grad_data, sum_out, h_iter, w_iter, h_iter + 1,
+                w_iter + 1, std::max(win_start_h, static_cast<T>(h_iter)),
+                std::max(win_start_w, static_cast<T>(w_iter)),
+                std::min(win_end_h,
+                         static_cast<T>(h_iter) + static_cast<T>(1.0)),
+                std::min(win_end_w,
+                         static_cast<T>(w_iter) + static_cast<T>(1.0)),
+                height, width, PrRoIPoolingDistributeDiff<T>);
+          }
+        }
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
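Mirroring the forward sketch earlier: the backward helper PrRoIPoolingMatDistributeDiff distributes each window's gradient back through the same integrated bilinear weights. A matching standalone sketch, again with our own simplified single-cell names, obtained by differentiating the cell integral with respect to each corner feature:

// Distribute gradient g of one window's integral to the four corners of a
// unit cell; weights are the partial derivatives of CellIntegral above.
static void CellDistribute(float g, float y1, float x1, float y2, float x2,
                           float* g00, float* g01, float* g10, float* g11) {
  float my = 0.5f * (y1 + y2);
  float mx = 0.5f * (x1 + x2);
  float area = (y2 - y1) * (x2 - x1);
  *g00 += g * area * (1 - my) * (1 - mx);
  *g01 += g * area * (1 - my) * mx;
  *g10 += g * area * my * (1 - mx);
  *g11 += g * area * my * mx;
}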
diff --git a/paddle/fluid/operators/pull_box_sparse_op.cc b/paddle/fluid/operators/pull_box_sparse_op.cc
new file mode 100644
index 00000000..85326496
--- /dev/null
+++ b/paddle/fluid/operators/pull_box_sparse_op.cc
@@ -0,0 +1,121 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/pull_box_sparse_op.h"
+
+namespace paddle {
+namespace operators {
+
+class PullBoxSparseOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE_GE(ctx->Inputs("Ids").size(), 1UL,
+                      "Inputs(Ids) of PullBoxSparseOp should not be empty.");
+    PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL,
+                      "Outputs(Out) of PullBoxSparseOp should not be empty.");
+    auto hidden_size = static_cast<int64_t>(ctx->Attrs().Get<int>("size"));
+    auto all_ids_dim = ctx->GetInputsDim("Ids");
+    const size_t n_ids = all_ids_dim.size();
+    std::vector<framework::DDim> outs_dims;
+    outs_dims.resize(n_ids);
+    for (size_t i = 0; i < n_ids; ++i) {
+      const auto ids_dims = all_ids_dim[i];
+      int ids_rank = ids_dims.size();
+      PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1,
+                        "Shape error in %lu id, the last dimension of the "
+                        "'Ids' tensor must be 1.",
+                        i);
+      auto out_dim = framework::vectorize(
+          framework::slice_ddim(ids_dims, 0, ids_rank - 1));
+      out_dim.push_back(hidden_size);
+      outs_dims[i] = framework::make_ddim(out_dim);
+    }
+    ctx->SetOutputsDim("Out", outs_dims);
+    for (size_t i = 0; i < n_ids; ++i) {
+      ctx->ShareLoD("Ids", "Out", i, i);
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(framework::proto::VarType::FP32,
+                                   ctx.device_context());
+  }
+};
+
+class PullBoxSparseOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Ids",
+             "Input tensors with type int32 or int64 "
+             "containing the ids to be looked up in BoxPS. "
+             "The last dimension size must be 1.")
+        .AsDuplicable();
+    AddOutput("Out", "The lookup results tensors.").AsDuplicable();
+    AddAttr<int>("size", "(int, the embedding hidden size)").SetDefault(1);
+    AddComment(R"DOC(
+Pull Box Sparse Operator.
+
+This operator is used to perform lookups on the BoxPS table,
+and the looked-up embeddings are then concatenated into a dense tensor.
+
+The input Ids can carry the LoD (Level of Details) information,
+or not. The output only shares the LoD information with the input Ids.
+
+)DOC");
+  }
+};
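For a concrete shape example of the InferShape contract above: each Ids slot of shape [..., 1] yields an output with the same leading dimensions and the trailing 1 replaced by the size attribute. A hypothetical helper (illustration only, not part of the patch) that mirrors the rule:

#include <cstdint>
#include <vector>

// e.g. ids_dims = {6, 1} with hidden_size = 8 yields {6, 8}
static std::vector<int64_t> PullBoxSparseOutShape(
    const std::vector<int64_t>& ids_dims, int64_t hidden_size) {
  std::vector<int64_t> out(ids_dims.begin(), ids_dims.end() - 1);
  out.push_back(hidden_size);
  return out;
}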
+
+class PushBoxSparseOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("push_box_sparse");
+    op->SetInput("Ids", Input("Ids"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
+class PushBoxSparseOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {}
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        ctx.MultiInput<framework::Tensor>(framework::GradVarName("Out"))[0]
+            ->type(),
+        ctx.device_context());
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(pull_box_sparse, ops::PullBoxSparseOp,
+                  ops::PullBoxSparseOpMaker, ops::PushBoxSparseOpDescMaker);
+REGISTER_OPERATOR(push_box_sparse, ops::PushBoxSparseOp);
+REGISTER_OP_CPU_KERNEL(pull_box_sparse, ops::PullBoxSparseCPUKernel<float>)
+REGISTER_OP_CPU_KERNEL(push_box_sparse, ops::PushBoxSparseCPUKernel<float>)
diff --git a/paddle/fluid/operators/pull_box_sparse_op.cu b/paddle/fluid/operators/pull_box_sparse_op.cu
new file mode 100644
index 00000000..8bba9db5
--- /dev/null
+++ b/paddle/fluid/operators/pull_box_sparse_op.cu
@@ -0,0 +1,44 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/pull_box_sparse_op.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/gpu_info.h"
+
+namespace paddle {
+namespace operators {
+using platform::PADDLE_CUDA_NUM_THREADS;
+using LoDTensor = framework::LoDTensor;
+
+template <typename T>
+class PullBoxSparseCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PullBoxSparseFunctor<T>(ctx);
+  }
+};
+
+template <typename T>
+class PushBoxSparseCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PushBoxSparseFunctor<T>(ctx);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(pull_box_sparse, ops::PullBoxSparseCUDAKernel<float>)
+REGISTER_OP_CUDA_KERNEL(push_box_sparse, ops::PushBoxSparseCUDAKernel<float>)
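Both functors in pull_box_sparse_op.h (the next file) hand ids to BoxPS as uint64_t keys without copying. A minimal standalone sketch of that reinterpretation; the sample values are ours, and it assumes ids are non-negative so the bit pattern is unchanged:

#include <cstdint>
#include <vector>

int main() {
  std::vector<int64_t> ids = {101, 202, 303};  // one slot's feature ids
  // Same trick as PullBoxSparseFunctor: view the int64 buffer as uint64
  // keys in place instead of copying it.
  const uint64_t* keys = reinterpret_cast<const uint64_t*>(ids.data());
  return keys[0] == 101u ? 0 : 1;
}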
diff --git a/paddle/fluid/operators/pull_box_sparse_op.h b/paddle/fluid/operators/pull_box_sparse_op.h
new file mode 100644
index 00000000..48a9e4d9
--- /dev/null
+++ b/paddle/fluid/operators/pull_box_sparse_op.h
@@ -0,0 +1,90 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <memory>
+#include <vector>
+#include "paddle/fluid/framework/fleet/box_wrapper.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tensor.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+static void PullBoxSparseFunctor(const framework::ExecutionContext &ctx) {
+  auto inputs = ctx.MultiInput<framework::Tensor>("Ids");
+  auto outputs = ctx.MultiOutput<framework::Tensor>("Out");
+  auto hidden_size = ctx.Attr<int>("size");
+  const auto slot_size = inputs.size();
+  std::vector<const uint64_t *> all_keys(slot_size);
+  // BoxPS only supports float now
+  std::vector<float *> all_values(slot_size);
+  std::vector<int64_t> slot_lengths(slot_size);
+  for (size_t i = 0; i < slot_size; i++) {
+    const auto *slot = inputs[i];
+    const uint64_t *single_slot_keys =
+        reinterpret_cast<const uint64_t *>(slot->data<int64_t>());
+    all_keys[i] = single_slot_keys;
+    slot_lengths[i] = slot->numel();
+    auto *output = outputs[i]->mutable_data<T>(ctx.GetPlace());
+    all_values[i] = output;
+  }
+  auto box_ptr = paddle::framework::BoxWrapper::GetInstance();
+  box_ptr->PullSparse(ctx.GetPlace(), all_keys, all_values, slot_lengths,
+                      hidden_size);
+}
+
+template <typename T>
+static void PushBoxSparseFunctor(const framework::ExecutionContext &ctx) {
+  auto inputs = ctx.MultiInput<framework::LoDTensor>("Ids");
+  auto d_output =
+      ctx.MultiInput<framework::Tensor>(framework::GradVarName("Out"));
+  auto hidden_size = ctx.Attr<int>("size");
+  const auto slot_size = inputs.size();
+  std::vector<const uint64_t *> all_keys(slot_size);
+  std::vector<const float *> all_grad_values(slot_size);
+  std::vector<int64_t> slot_lengths(slot_size);
+  for (size_t i = 0; i < slot_size; i++) {
+    const auto *slot = inputs[i];
+    const uint64_t *single_slot_keys =
+        reinterpret_cast<const uint64_t *>(slot->data<int64_t>());
+    all_keys[i] = single_slot_keys;
+    slot_lengths[i] = slot->numel();
+    const float *grad_value = d_output[i]->data<float>();
+    all_grad_values[i] = grad_value;
+  }
+  auto box_ptr = paddle::framework::BoxWrapper::GetInstance();
+  box_ptr->PushSparseGrad(ctx.GetPlace(), all_keys, all_grad_values,
+                          slot_lengths, hidden_size);
+}
+
+using LoDTensor = framework::LoDTensor;
+template <typename T>
+class PullBoxSparseCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PullBoxSparseFunctor<T>(ctx);
+  }
+};
+
+template <typename T>
+class PushBoxSparseCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PushBoxSparseFunctor<T>(ctx);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/quantize_op.cc b/paddle/fluid/operators/quantize_op.cc
index bf70c08b..d8e20f4c 100644
--- a/paddle/fluid/operators/quantize_op.cc
+++ b/paddle/fluid/operators/quantize_op.cc
@@ -43,5 +43,4 @@ void QuantOpMaker::Make() {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(quantize, ops::QuantOp, ops::QuantOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(quantize, ops::QuantOp, ops::QuantOpMaker);
diff --git a/paddle/fluid/operators/random_crop_op.cc b/paddle/fluid/operators/random_crop_op.cc
index dad46ec6..65a8d603 100644
--- a/paddle/fluid/operators/random_crop_op.cc
+++ b/paddle/fluid/operators/random_crop_op.cc
@@ -56,7 +56,7 @@ class RandomCropOpInferShape : public framework::InferShapeBase {
     auto shape = ctx->Attrs().Get<std::vector<int>>("shape");
     auto x_dim = ctx->GetInputDim("X");
     PADDLE_ENFORCE_GT(x_dim.size(), static_cast<int>(shape.size()));
-    auto out_dim = framework::vectorize2int(x_dim);
+    auto out_dim = framework::vectorize<int>(x_dim);
     for (size_t i = 1; i <= shape.size(); ++i) {
       size_t x_i = x_dim.size() - i;
       size_t shape_i = shape.size() - i;
diff --git a/paddle/fluid/operators/random_crop_op.h b/paddle/fluid/operators/random_crop_op.h
index ee034b27..ae58358c 100644
--- a/paddle/fluid/operators/random_crop_op.h
+++ b/paddle/fluid/operators/random_crop_op.h
@@ -60,7 +60,16 @@ HOSTDEVICE inline void StridedMemcpy(const T* x, const size_t* x_dims, T* out,
     size_t offset_i = offsets[i];
 
     if (i == rank - 1) {
-      PADDLE_ASSERT(x_stride == 1 && out_stride == 1);
+      PADDLE_ENFORCE(x_stride == 1,
+                     "When i:%d == rank:%d - 1, x_stride of random_crop_op "
+                     "expected to be 1, but got %ld. Please check input "
+                     "value.",
+                     i, rank, x_stride);
+      PADDLE_ENFORCE(out_stride == 1,
+                     "When i:%d == rank:%d - 1, out_stride of random_crop_op "
+                     "expected to be 1, but got %ld. Please check input "
+                     "value.",
+                     i, rank, out_stride);
       x += offset_i;
       for (size_t j = 0; j < out_dim_i; ++j) {
         *out++ = *x++;
diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt
index 61690139..f61af333 100644
--- a/paddle/fluid/operators/reader/CMakeLists.txt
+++ b/paddle/fluid/operators/reader/CMakeLists.txt
@@ -20,14 +20,7 @@ endfunction()
 cc_library(py_reader SRCS py_reader.cc DEPS reader)
 cc_library(buffered_reader SRCS buffered_reader.cc DEPS reader simple_threadpool)
 
-reader_library(open_files_op SRCS open_files_op.cc DEPS buffered_reader)
-reader_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc)
-reader_library(create_shuffle_reader_op SRCS create_shuffle_reader_op.cc)
-reader_library(create_batch_reader_op SRCS create_batch_reader_op.cc)
-reader_library(create_recordio_file_reader_op SRCS create_recordio_file_reader_op.cc)
 reader_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc DEPS buffered_reader)
-reader_library(create_multi_pass_reader_op SRCS create_multi_pass_reader_op.cc)
-reader_library(create_custom_reader_op SRCS create_custom_reader_op.cc)
 reader_library(create_py_reader_op SRCS create_py_reader_op.cc DEPS py_reader)
 
 cc_test(reader_blocking_queue_test SRCS reader_blocking_queue_test.cc)
diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc
index 16cb08f4..b332450c 100644
--- a/paddle/fluid/operators/reader/buffered_reader.cc
+++ b/paddle/fluid/operators/reader/buffered_reader.cc
@@ -128,9 +128,18 @@ void BufferedReader::ReadAsync(size_t i) {
                      boost::get<platform::CUDAPinnedPlace>(cpu_place),
                      cpu_ptr, size, stream_);
       } else {
+        platform::CUDAPinnedPlace cuda_pinned_place;
+        framework::LoDTensor cuda_pinned_tensor;
+        cuda_pinned_tensor.Resize(cpu[i].dims());
+        auto cuda_pinned_ptr =
+            cuda_pinned_tensor.mutable_data(cuda_pinned_place, cpu[i].type());
+        memory::Copy(cuda_pinned_place, cuda_pinned_ptr,
+                     boost::get<platform::CPUPlace>(cpu_place), cpu_ptr,
+                     size);
         memory::Copy(boost::get<platform::CUDAPlace>(place_), gpu_ptr,
-                     boost::get<platform::CPUPlace>(cpu_place), cpu_ptr, size,
-                     stream_);
+                     cuda_pinned_place, cuda_pinned_ptr, size, stream_);
+
PADDLE_ENFORCE(cudaStreamSynchronize(stream_), + "cuda stream sync error."); } gpu[i].set_lod(cpu[i].lod()); } diff --git a/paddle/fluid/operators/reader/create_batch_reader_op.cc b/paddle/fluid/operators/reader/create_batch_reader_op.cc deleted file mode 100644 index f771cebd..00000000 --- a/paddle/fluid/operators/reader/create_batch_reader_op.cc +++ /dev/null @@ -1,151 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/reader/reader_op_registry.h" - -namespace paddle { -namespace operators { -namespace reader { - -class BatchReader : public framework::DecoratedReader { - public: - BatchReader(const std::shared_ptr& reader, int batch_size, - bool discard_leftover) - : DecoratedReader(reader), - batch_size_(static_cast(batch_size)), - discard_leftover_(discard_leftover) { - buffer_.reserve(batch_size_); - } - - void ReadNextImpl(std::vector* out) override; - - private: - size_t batch_size_; - bool discard_leftover_; - std::vector> buffer_; -}; - -class CreateBatchReaderOp : public framework::OperatorBase { - public: - using framework::OperatorBase::OperatorBase; - - private: - void RunImpl(const framework::Scope& scope, - const platform::Place& dev_place) const override { - auto* out = scope.FindVar(Output("Out")) - ->template GetMutable(); - if (out->Get() != nullptr) { - return; - } - const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) - ->Get(); - out->Reset(framework::MakeDecoratedReader( - underlying_reader, Attr("batch_size"), - Attr("discard_leftover"))); - } -}; - -class CreateBatchReaderOpMaker : public DecoratedReaderMakerBase { - protected: - void Apply() override { - AddAttr("batch_size", - "How many instances the batch reader yields each time.") - .GreaterThan(0); - AddAttr("discard_leftover", - "If true, the leftover instances that are not enough for a " - "new batch will be discarded.") - .SetDefault(true); - AddComment(R"DOC( - CreateBatchReader Operator - - A batch reader takes another reader as its 'underlying reader', - gathers the underlying reader's outputs and then yields them in batches. - )DOC"); - } -}; - -void BatchReader::ReadNextImpl(std::vector* out) { - buffer_.clear(); - buffer_.reserve(batch_size_); - for (size_t i = 0; i < batch_size_; ++i) { - buffer_.push_back(std::vector()); - reader_->ReadNext(&buffer_.back()); - if (buffer_.back().empty()) { - buffer_.pop_back(); - break; - } - } - if (discard_leftover_ && buffer_.size() < batch_size_) { - buffer_.clear(); - } - // Concat instances - out->clear(); - if (buffer_.empty()) { - // if buffer_ is empty, the 'out' will return as an empty vector. 
- return; - } - size_t out_num = buffer_[0].size(); - out->reserve(out_num); - for (size_t j = 0; j < out_num; ++j) { - // Merge shape and check date type - auto batch_type = buffer_[0][j].type(); - framework::DDim batch_shape = buffer_[0][j].dims(); - for (size_t i = 1; i < buffer_.size(); ++i) { - auto ins_type = buffer_[i][j].type(); - framework::DDim ins_shape = buffer_[i][j].dims(); - PADDLE_ENFORCE_EQ(batch_type, ins_type); - PADDLE_ENFORCE_EQ(slice_ddim(batch_shape, 1, batch_shape.size()), - slice_ddim(ins_shape, 1, ins_shape.size())); - PADDLE_ENFORCE_GT(ins_shape[0], 0); - batch_shape[0] += ins_shape[0]; - } - - framework::LoDTensor out_tensor; - out_tensor.Resize(batch_shape); - out_tensor.mutable_data(platform::CPUPlace(), batch_type); - int64_t dst_offset = 0; - - // Merge lod and data - framework::LoD batch_lod; - for (size_t i = 0; i < buffer_.size(); ++i) { - framework::DDim ins_shape = buffer_[i][j].dims(); - framework::LoD ins_lod = buffer_[i][j].lod(); - if (i == 0) { - batch_lod = ins_lod; - } else { - PADDLE_ENFORCE_EQ(batch_lod.size(), ins_lod.size()); - for (size_t level_idx = 0; level_idx < batch_lod.size(); ++level_idx) { - auto& lod_level = batch_lod[level_idx]; - for (size_t k = 1; k < ins_lod[level_idx].size(); ++k) { - lod_level.push_back(ins_lod[level_idx][k] + lod_level.back()); - } - } - } - auto dst = out_tensor.Slice(dst_offset, dst_offset + ins_shape[0]); - TensorCopy(buffer_[i][j], platform::CPUPlace(), &dst); - dst_offset += ins_shape[0]; - } - out_tensor.set_lod(batch_lod); - out->push_back(out_tensor); - } -} - -} // namespace reader -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators::reader; -REGISTER_DECORATED_READER_OPERATOR(create_batch_reader, - ops::CreateBatchReaderOp, - ops::CreateBatchReaderOpMaker); diff --git a/paddle/fluid/operators/reader/create_custom_reader_op.cc b/paddle/fluid/operators/reader/create_custom_reader_op.cc index fdc7b0f6..975f7b99 100644 --- a/paddle/fluid/operators/reader/create_custom_reader_op.cc +++ b/paddle/fluid/operators/reader/create_custom_reader_op.cc @@ -167,7 +167,7 @@ void CustomReader::ReadNextImpl(std::vector* out) { tensor->set_lod(underlying_outs[i].lod()); } // 2. Run the sub-block. - exe_.Run(program_, exe_scope, sub_block_id_, false, true); + exe_.Run(program_, exe_scope, sub_block_id_, false, true, {}, true); // 3. Copy LoDTensors from sink variables to out. out->resize(sink_var_names_.size()); for (size_t i = 0; i < sink_var_names_.size(); ++i) { diff --git a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc deleted file mode 100644 index 0a225597..00000000 --- a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/detail/safe_ref.h" -#include "paddle/fluid/operators/reader/reader_op_registry.h" - -namespace paddle { -namespace operators { -namespace reader { - -class MultiPassReader : public framework::DecoratedReader { - public: - MultiPassReader(const std::shared_ptr& reader, int pass_num) - : DecoratedReader(reader), pass_num_(pass_num), pass_count_(0) {} - - void ReadNextImpl(std::vector* out) override { - reader_->ReadNext(out); - if (out->empty() && pass_count_ < pass_num_ - 1) { - reader_->Shutdown(); - reader_->Start(); - reader_->ReadNext(out); - ++pass_count_; - } - } - - private: - void StartImpl() override { - pass_count_ = 0; - reader_->Start(); - } - - int pass_num_; - mutable int pass_count_; -}; - -class CreateMultiPassReaderOp : public framework::OperatorBase { - public: - using framework::OperatorBase::OperatorBase; - - private: - void RunImpl(const framework::Scope& scope, - const platform::Place& dev_place) const override { - auto* out = detail::Ref(scope.FindVar(Output("Out"))) - .GetMutable(); - if (out->Get() != nullptr) { - return; - } - const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) - ->Get(); - int pass_num = Attr("pass_num"); - out->Reset(framework::MakeDecoratedReader( - underlying_reader, pass_num)); - } -}; - -class CreateMultiPassReaderOpMaker : public DecoratedReaderMakerBase { - protected: - void Apply() override { - AddAttr("pass_num", "The number of pass to run.").GreaterThan(0); - AddComment(R"DOC( - CreateMultiPassReader Operator - - This operator creates a multi-pass reader. A multi-pass reader - is used to yield data for several pass training continuously. - It takes the number of passes to run as one of its attributes - ('pass_num'), and maintains a pass counter to record how many - passes it has completed. When the underlying reader reaches the - EOF, the multi-pass reader checks whether it has completed training - of the given number of pass. If not, the underlying reader will - be re-initialized and starts a new pass automatically. - )DOC"); - } -}; - -} // namespace reader -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators::reader; -REGISTER_DECORATED_READER_OPERATOR(create_multi_pass_reader, - ops::CreateMultiPassReaderOp, - ops::CreateMultiPassReaderOpMaker); diff --git a/paddle/fluid/operators/reader/create_random_data_generator_op.cc b/paddle/fluid/operators/reader/create_random_data_generator_op.cc deleted file mode 100644 index e5c116df..00000000 --- a/paddle/fluid/operators/reader/create_random_data_generator_op.cc +++ /dev/null @@ -1,107 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/reader/reader_op_registry.h" - -namespace paddle { -namespace operators { -namespace reader { - -template -class RandomDataGenerator : public framework::FileReader { - public: - RandomDataGenerator(const std::vector& shapes, float low, - float high) - : framework::FileReader(), low_(low), high_(high), shapes_(shapes) { - PADDLE_ENFORCE_LE(low, high, - "'low' shouldn't be greater than 'high'.(%f vs %f)", low, - high); - unsigned int seed = std::random_device()(); - engine_.seed(seed); - dist_ = std::uniform_real_distribution(low_, high_); - } - - void ReadNextImpl(std::vector* out) override { - out->clear(); - out->reserve(shapes_.size()); - for (const framework::DDim& shape : shapes_) { - PADDLE_ENFORCE_GE( - shape.size(), 2, - "The rank of reader's output data should be 2 at least.(Now it's %d)", - shape.size()); - framework::LoDTensor out_tensor; - out_tensor.Resize(shape); - T* data = out_tensor.mutable_data(platform::CPUPlace()); - int64_t numel = framework::product(shape); - for (int64_t i = 0; i < numel; ++i) { - data[i] = dist_(engine_); - } - out->push_back(out_tensor); - } - } - - private: - float low_; - float high_; - std::minstd_rand engine_; - std::uniform_real_distribution dist_; - std::vector shapes_; -}; - -template -class CreateRandomDataGeneratorOp : public framework::OperatorBase { - public: - using framework::OperatorBase::OperatorBase; - - private: - void RunImpl(const framework::Scope& scope, - const platform::Place& dev_place) const override { - const auto& shape_concat = Attr>("shape_concat"); - const auto& ranks = Attr>("ranks"); - PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty()); - PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0), - static_cast(shape_concat.size()), - "The accumulate of all ranks should be equal to the " - "shape concat's length."); - std::vector shapes = RestoreShapes(shape_concat, ranks); - auto* out = scope.FindVar(Output("Out")) - ->template GetMutable(); - out->Reset(std::make_shared>( - shapes, Attr("low"), Attr("high"))); - } -}; - -class CreateRandomDataGeneratorOpMaker : public FileReaderMakerBase { - protected: - void Apply() override { - AddAttr("low", "The lower bound of reader's uniform distribution."); - AddAttr("high", "The upper bound of reader's uniform distribution."); - AddComment(R"DOC( - CreateRandomDataGenerator Operator - - This Op creates a random reader. - The reader generates random data instead of really reading from files. - Generated data follow an uniform distribution between 'low' and 'high'. - )DOC"); - } -}; - -} // namespace reader -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators::reader; -REGISTER_FILE_READER_OPERATOR(create_random_data_generator, - ops::CreateRandomDataGeneratorOp, - ops::CreateRandomDataGeneratorOpMaker); diff --git a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc deleted file mode 100644 index d7a04825..00000000 --- a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/reader/reader_op_registry.h" -#include "paddle/fluid/platform/lock_guard_ptr.h" -#include "paddle/fluid/recordio/scanner.h" - -namespace paddle { -namespace operators { -namespace reader { -template -class RecordIOFileReader : public framework::FileReader { - public: - explicit RecordIOFileReader(const std::string& filename) - : scanner_(filename), - dev_ctx_(*platform::DeviceContextPool::Instance().Get( - platform::CPUPlace())) { - if (ThreadSafe) { - mutex_.reset(new std::mutex()); - } - LOG(INFO) << "Creating file reader" << filename; - } - - protected: - void ReadNextImpl(std::vector* out) override { - platform::LockGuardPtr guard(mutex_); - bool ok = framework::ReadFromRecordIO(&scanner_, dev_ctx_, out); - if (!ok) { - out->clear(); - } - } - - void StartImpl() override { scanner_.Reset(); } - - private: - std::unique_ptr mutex_; - recordio::Scanner scanner_; - const platform::DeviceContext& dev_ctx_; -}; - -class CreateRecordIOReaderOp : public framework::OperatorBase { - public: - using framework::OperatorBase::OperatorBase; - - private: - void RunImpl(const framework::Scope& scope, - const platform::Place& dev_place) const override { - std::string filename = Attr("filename"); - auto* out = scope.FindVar(Output("Out")) - ->template GetMutable(); - - out->Reset(std::make_shared>(filename)); - } -}; - -class CreateRecordIOReaderOpMaker : public FileReaderMakerBase { - protected: - void Apply() override { - AddAttr( - "filename", - "The filename of record file. This file will given to reader."); - AddComment(R"DOC( -Open a recordio file and return the reader object. The returned reader object -is thread-safe. - -NOTE: This is a very low-level API. It is used for debugging data file or -training. Please use `open_files` instead of this API for production usage. - )DOC"); - } -}; - -} // namespace reader -} // namespace operators -} // namespace paddle - -namespace reader = paddle::operators::reader; - -REGISTER_FILE_READER_OPERATOR(create_recordio_file_reader, - reader::CreateRecordIOReaderOp, - reader::CreateRecordIOReaderOpMaker); - -REGISTER_FILE_READER(recordio, reader::RecordIOFileReader); diff --git a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc deleted file mode 100644 index 3f72890a..00000000 --- a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc +++ /dev/null @@ -1,124 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include "glog/logging.h" -#include "paddle/fluid/operators/detail/safe_ref.h" -#include "paddle/fluid/operators/reader/reader_op_registry.h" - -namespace paddle { -namespace operators { -namespace reader { - -class ShuffleReader : public framework::DecoratedReader { - public: - ShuffleReader(const std::shared_ptr& reader, size_t buffer_size, - size_t seed = 0) - : DecoratedReader(reader), buffer_size_(buffer_size), seed_(seed) { - VLOG(10) << "Create shuffle reader of " << reader_; - if (seed_ == 0) { - std::random_device device; - seed_ = device(); - } - ReloadBuffer(); - } - - void ReadNextImpl(std::vector* out) override { - out->clear(); - if (iteration_pos_ >= buffer_.size()) { - VLOG(10) << "Resetting shuffle buffer"; - ReloadBuffer(); - if (buffer_.empty()) { - return; - } - } - *out = buffer_[iteration_pos_++]; - } - - private: - void ShutdownImpl() override { - reader_->Shutdown(); - buffer_.clear(); - iteration_pos_ = 0; - } - - void StartImpl() override { - reader_->Start(); - ReloadBuffer(); - } - - void ReloadBuffer() { - buffer_.clear(); - buffer_.reserve(buffer_size_); - iteration_pos_ = 0; - for (size_t i = 0; i < buffer_size_; ++i) { - std::vector ins; - reader_->ReadNext(&ins); - if (ins.empty()) { - break; - } - buffer_.emplace_back(ins); - } - std::mt19937 g(seed_); - std::shuffle(buffer_.begin(), buffer_.end(), g); - seed_ = g(); // update seed_; - VLOG(10) << "random buffer size = " << buffer_.size(); - } - - size_t buffer_size_; - std::vector> buffer_; - - size_t iteration_pos_; - size_t seed_; -}; - -class CreateShuffleReaderOp : public framework::OperatorBase { - public: - using framework::OperatorBase::OperatorBase; - - private: - void RunImpl(const framework::Scope& scope, - const platform::Place& dev_place) const override { - auto* out = detail::Ref(scope.FindVar(Output("Out"))) - .GetMutable(); - if (out->Get() != nullptr) { - return; - } - const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) - ->Get(); - out->Reset(framework::MakeDecoratedReader( - underlying_reader, static_cast(Attr("buffer_size")))); - } -}; - -class CreateShuffleReaderOpMaker : public DecoratedReaderMakerBase { - protected: - void Apply() override { - AddAttr("buffer_size", "The shuffle buffer size.").GreaterThan(0); - AddComment(R"DOC( - CreateShuffleReader Operator - - A shuffle reader takes another reader as its 'underlying reader' - and yields the underlying reader's outputs in a shuffled order. - )DOC"); - } -}; -} // namespace reader -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators::reader; -REGISTER_DECORATED_READER_OPERATOR(create_shuffle_reader, - ops::CreateShuffleReaderOp, - ops::CreateShuffleReaderOpMaker); diff --git a/paddle/fluid/operators/reader/open_files_op.cc b/paddle/fluid/operators/reader/open_files_op.cc deleted file mode 100644 index 38223e06..00000000 --- a/paddle/fluid/operators/reader/open_files_op.cc +++ /dev/null @@ -1,277 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include // NOLINT -#include "ThreadPool.h" -#include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/operators/reader/blocking_queue.h" -#include "paddle/fluid/operators/reader/buffered_reader.h" -#include "paddle/fluid/operators/reader/reader_op_registry.h" - -namespace paddle { -namespace operators { -namespace reader { - -class IReaderContainer { - public: - virtual ~IReaderContainer() {} - virtual void AppendReader( - std::unique_ptr&& readers) = 0; - virtual void Stop() = 0; - virtual void Start() = 0; - virtual void ReadNext(std::vector* out) = 0; -}; - -class OrderedReaderContainer : public IReaderContainer { - public: - void AppendReader(std::unique_ptr&& reader) override { - pending_.emplace(std::move(reader)); - } - - void Stop() override { - while (!pending_.empty()) { - MoveFrontPendingToDone(); - } - } - - void Start() override { std::swap(done_, pending_); } - - void ReadNext(std::vector* out) override { - if (!pending_.empty()) { - pending_.front()->ReadNext(out); - if (out->empty()) { - MoveFrontPendingToDone(); - ReadNext(out); - } - } else { - out->clear(); - } - } - - private: - void MoveFrontPendingToDone() { - pending_.front()->Shutdown(); - pending_.front()->Start(); - done_.emplace(move(pending_.front())); - pending_.pop(); - } - - std::queue> pending_; - std::queue> done_; -}; - -class PreemptiveReaderContainer : public IReaderContainer { - using ReaderList = std::list>; - - struct FutureItem { - std::vector data_; - ReaderList::iterator reader_it_; - std::exception_ptr exception_; - }; - - using FutureList = std::list>; - - public: - explicit PreemptiveReaderContainer(size_t thread_num) : pool_(thread_num) {} - - void Stop() override { - if (!pending_.empty()) { - for (auto& reader : pending_) { - reader->Shutdown(); - } - for (auto& fu : futures_) { - fu.wait(); - } - futures_.clear(); - for (auto& reader : pending_) { - reader->Start(); - done_.emplace_back(std::move(reader)); - } - pending_.clear(); - bool timeout; - complete_queue_.PopAll(1000, &timeout); - PADDLE_ENFORCE(!timeout); - } - } - - void Start() override { - for (auto& reader : done_) { - AppendReader(std::move(reader)); - } - done_.clear(); - } - - void ReadNext(std::vector* out) override { - if (!pending_.empty()) { - auto future_it = complete_queue_.Pop(); - FutureItem item = future_it->get(); - if (item.exception_) { - for (auto it = futures_.begin(); it != futures_.end(); ++it) { - if (it != future_it) { - it->wait(); // Wait all other threads complete. - } - } - std::rethrow_exception(item.exception_); - - } else if (item.data_.empty()) { // reader done. 
- done_.emplace_back(std::move(*item.reader_it_)); - pending_.erase(item.reader_it_); - futures_.erase(future_it); - ReadNext(out); - } else { - *out = item.data_; - // continue read async - ReadAsync(item.reader_it_, &future_it); - } - } else { - out->clear(); - } - } - - private: - void AppendReader(std::unique_ptr&& reader) override { - pending_.emplace_back(std::move(reader)); - auto reader_it = pending_.end(); - --reader_it; - - futures_.emplace_back(); - auto future_it = futures_.end(); - --future_it; - - ReadAsync(reader_it, &future_it); - } - - void ReadAsync(const ReaderList::iterator& reader_it, - FutureList::iterator* future_it_ptr) { - auto& future_it = *future_it_ptr; - *future_it = pool_.enqueue([reader_it, future_it, this] { - try { - FutureItem item; - item.reader_it_ = reader_it; - (*reader_it)->ReadNext(&item.data_); - if (item.data_.empty()) { - (*reader_it)->Shutdown(); - (*reader_it)->Start(); - } - complete_queue_.Push(future_it); - return item; - } catch (...) { - FutureItem item; - item.exception_ = std::current_exception(); - complete_queue_.Push(future_it); - return item; - } - }); - } - - FutureList futures_; - ThreadPool pool_; - framework::BlockingQueue complete_queue_; - std::list> pending_; - std::list> done_; -}; - -class MultiFileReader : public framework::ReaderBase { - public: - MultiFileReader(const std::vector& file_names, - std::unique_ptr&& container) - : container_(std::move(container)) { - for (auto& fn : file_names) { - container_->AppendReader(CreateReaderByFileName(fn)); - } - } - - ~MultiFileReader() { container_->Stop(); } - - protected: - void ReadNextImpl(std::vector* out) override { - container_->ReadNext(out); - } - void ShutdownImpl() override { container_->Stop(); } - void StartImpl() override { container_->Start(); } - - private: - std::unique_ptr container_; -}; - -class OpenFilesOp : public framework::OperatorBase { - public: - using framework::OperatorBase::OperatorBase; - - private: - void RunImpl(const framework::Scope& scope, - const platform::Place& dev_place) const override { - const auto& shape_concat = Attr>("shape_concat"); - const auto& ranks = Attr>("ranks"); - PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty()); - PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0), - static_cast(shape_concat.size()), - "The accumulate of all ranks should be equal to the " - "shape concat's length."); - const auto& file_names = Attr>("file_names"); - PADDLE_ENFORCE(!file_names.empty(), "No file to be read!"); - bool is_test = Attr("is_test"); - - auto* out = scope.FindVar(Output("Out")) - ->template GetMutable(); - std::unique_ptr container; - - if (is_test) { - container.reset(new OrderedReaderContainer()); - } else { - container.reset(new PreemptiveReaderContainer( - static_cast(Attr("thread_num")))); - } - - std::shared_ptr reader( - new MultiFileReader(file_names, std::move(container))); - auto buffer_size = Attr("buffer_size"); - if (buffer_size > 1) { - reader = framework::MakeDecoratedReader( - reader, platform::CPUPlace(), buffer_size); - } - out->Reset(reader); - } -}; - -class OpenFilesOpMaker : public FileReaderMakerBase { - protected: - void Apply() override { - AddAttr>("file_names", "Files to be read."); - AddAttr("is_test", "Used for testing data.").SetDefault(false); - - AddComment(R"DOC( - OpenFiles Operator - - An OpenFilesOp creates a MultiFileReader, which is able to - read data multi-threaded from multiple files. - )DOC"); - AddAttr("thread_num", - "The maximal concurrent prefetch thread number. 
Used only " - "when is_test = False"); - AddAttr("buffer_size", "The reading buffer of these files.") - .GreaterThan(0); - } -}; - -} // namespace reader -} // namespace operators -} // namespace paddle - -namespace reader = paddle::operators::reader; - -REGISTER_FILE_READER_OPERATOR(open_files, reader::OpenFilesOp, - reader::OpenFilesOpMaker); diff --git a/paddle/fluid/operators/reader/reader_op_registry.cc b/paddle/fluid/operators/reader/reader_op_registry.cc index 64a1f6b6..6a9506b5 100644 --- a/paddle/fluid/operators/reader/reader_op_registry.cc +++ b/paddle/fluid/operators/reader/reader_op_registry.cc @@ -38,21 +38,6 @@ std::unordered_map& FileReaderRegistry() { return regs; } -std::unique_ptr CreateReaderByFileName( - const std::string& file_name) { - size_t separator_pos = file_name.find_last_of(kFileFormatSeparator); - PADDLE_ENFORCE_NE(separator_pos, std::string::npos, - "File name illegal! A legal file name should be like: " - "[file_name].[file_format] (e.g., 'data_file.recordio')."); - std::string filetype = file_name.substr(separator_pos + 1); - - auto itor = FileReaderRegistry().find(filetype); - PADDLE_ENFORCE(itor != FileReaderRegistry().end(), - "No file reader registered for '%s' format.", filetype); - framework::ReaderBase* reader = (itor->second)(file_name); - return std::unique_ptr(reader); -} - void FileReaderMakerBase::Make() { AddOutput("Out", "(ReaderHolder): The created random reader.").AsDuplicable(); AddAttr>("shape_concat", "The concat of all data's shapes."); diff --git a/paddle/fluid/operators/reader/reader_op_registry.h b/paddle/fluid/operators/reader/reader_op_registry.h index 795a5806..de0c34ad 100644 --- a/paddle/fluid/operators/reader/reader_op_registry.h +++ b/paddle/fluid/operators/reader/reader_op_registry.h @@ -40,9 +40,6 @@ int RegisterFileReader(const std::string& filetype) { return 0; } -std::unique_ptr CreateReaderByFileName( - const std::string& file_name); - extern std::vector RestoreShapes( const std::vector& shape_concat, const std::vector& ranks); diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc index d26a85fb..91615a1b 100644 --- a/paddle/fluid/operators/recurrent_op.cc +++ b/paddle/fluid/operators/recurrent_op.cc @@ -54,20 +54,6 @@ static void ClearStepScopes(const platform::DeviceContext &dev_ctx, step_scopes->clear(); } -// StepScopes manages scopes inside RNN. -// StepScopes::CurScope() get the current scope -// StepScopes::ExScope() get the ex-scope, or scope in previous time step. -// StepScopes::Next() move to next time step. -// -// if is_train = False, then -// there are two scopes for the RNN and just support forward. -// else -// the len(scopes) == seq_len -// -// if is_backward = True, then -// reversely access scopes -// else -// access scopes from begin to end. StepScopes::StepScopes(const platform::DeviceContext &dev_ctx, const framework::Scope &parent, StepScopeVar *scopes, bool is_train, size_t seq_len, bool is_backward) @@ -76,8 +62,8 @@ StepScopes::StepScopes(const platform::DeviceContext &dev_ctx, is_train_(is_train), is_backward_(is_backward) { size_t num_step_scopes = is_train ? 
seq_len : 2;
-  PADDLE_ENFORCE(is_train || !is_backward,
-                 "Cannot backward when is not training");
+  PADDLE_ENFORCE_EQ(is_train || !is_backward, true,
+                    "Cannot backward when is not training");
   if (!is_backward_) {
     ClearStepScopes(dev_ctx, const_cast<framework::Scope *>(&parent), scopes);
     scopes->reserve(static_cast<size_t>(num_step_scopes));
@@ -94,12 +80,22 @@ framework::Scope &StepScopes::ExScope() {
   return scope;
 }
 
-void StepScopes::Next() {
-  if (is_backward_) {
-    --counter_;
-  } else {
-    ++counter_;
+void StepScopes::BackwardNext(const platform::DeviceContext &dev_ctx,
+                              framework::Scope *parent_scope) {
+  PADDLE_ENFORCE_EQ(is_backward_, true,
+                    "Cannot get backward next scope when is forward");
+  if (counter_ + 2 == scopes_->size()) {
+    parent_scope->DeleteScope((*scopes_)[counter_ + 1]);
+    scopes_->pop_back();
+    VLOG(3) << "Deleted scope at " << counter_ + 1;
   }
+  --counter_;
+}
+
+void StepScopes::ForwardNext() {
+  PADDLE_ENFORCE_EQ(is_backward_, false,
+                    "Cannot get forward next scope when is backward");
+  ++counter_;
 }
 
 framework::Scope &StepScopes::GetScope(size_t scope_id) const {
@@ -125,11 +121,11 @@ int64_t RecurrentBase::GetSequenceLength(const framework::Scope &scope) const {
   // Dim format SEQ_LEN, BATCH_SIZE, ...
   int64_t seq_len = -1;
   auto &all_inputs = Inputs(kInputs);
-  PADDLE_ENFORCE(!all_inputs.empty());
+  PADDLE_ENFORCE_EQ(all_inputs.empty(), false);
   for (auto &iname : all_inputs) {
     auto *var = scope.FindVar(iname);
-    PADDLE_ENFORCE(var != nullptr);
-    PADDLE_ENFORCE(var->IsType<framework::LoDTensor>());
+    PADDLE_ENFORCE_NOT_NULL(var);
+    PADDLE_ENFORCE_EQ(var->IsType<framework::LoDTensor>(), true);
     auto &dim = var->Get<framework::LoDTensor>().dims();
     if (seq_len == -1) {
       seq_len = dim[0];
@@ -220,29 +216,41 @@ void RecurrentOp::RunImpl(const framework::Scope &scope,
       }
     }
 
-    // Every inputs are linked now, execute!
+    // Link inside::output -> outside::output
+    //   outside::output[seq_offset: seq_offset + 1] = inside::output
+    executor.CreateVariables(ctx->prog_, &cur_scope, ctx->block_id_);
+    if (i > 0) {
+      LinkTensorWithCallback(scope, Outputs(kOutputs), cur_scope,
+                             Outputs(kOutputs),
+                             [&](const framework::LoDTensor &src_tensor,
+                                 framework::LoDTensor *dst_tensor) {
+                               framework::Tensor src_slice =
+                                   src_tensor.Slice(seq_offset, seq_offset + 1);
+                               dst_tensor->ShareDataWith(src_slice);
+                             });
+    }
+
+    // Linked now, execute!
     executor.RunPreparedContext(ctx.get(), &cur_scope,
                                 false /*create_local_scope*/,
-                                true /*create_vars*/, true /* keep_kids */);
-
-    // Copy inside::output -> outside::output
-    //   outside::output[seq_offset: seq_offset + 1] = inside::output
-    this->LinkTensorWithCallback(
-        cur_scope, Outputs(kOutputs), scope, Outputs(kOutputs),
-        [&](const framework::LoDTensor &src_tensor,
-            framework::LoDTensor *dst_tensor) {
-          if (i == 0) {  // create output tensor at begin
+                                false /*create_vars*/, true /* keep_kids */);
+    if (i == 0) {
+      LinkTensorWithCallback(
+          cur_scope, Outputs(kOutputs), scope, Outputs(kOutputs),
+          [&](const framework::LoDTensor &src_tensor,
+              framework::LoDTensor *dst_tensor) {
+            // create output tensor at begin
             dst_tensor->Resize(PrependDims(seq_len, src_tensor.dims()));
             dst_tensor->mutable_data(place, src_tensor.type());
-          }
-          auto dst_out = dst_tensor->Slice(seq_offset, seq_offset + 1);
-          // Explicit copy output since the local RNN scope can be destroyed
-          // early.
-          framework::TensorCopy(src_tensor, place, dev_ctx, &dst_out);
-        });
+            auto dst_out = dst_tensor->Slice(seq_offset, seq_offset + 1);
+            // Explicit copy output since the local RNN scope can be destroyed
+            // early.
+            framework::TensorCopy(src_tensor, place, dev_ctx, &dst_out);
+          });
+    }
 
-    scopes.Next();
+    scopes.ForwardNext();
   }
 }
@@ -250,7 +258,7 @@ StepScopes RecurrentOp::CreateStepScopes(const platform::DeviceContext &dev_ctx,
                                          const framework::Scope &scope,
                                          size_t seq_len) const {
   auto *var = scope.FindVar(Output(kStepScopes));
-  PADDLE_ENFORCE(var != nullptr);
+  PADDLE_ENFORCE_NOT_NULL(var);
   return StepScopes(dev_ctx, scope, var->GetMutable<StepScopeVar>(),
                     Attr<bool>(kIsTrain), seq_len);
 }
@@ -322,23 +330,42 @@ void RecurrentGradOp::RunImpl(const framework::Scope &scope,
         for (size_t i = 0; i < ex_state_grads.size(); ++i) {
           auto &cur_grad = cur_state_grads[i];
           auto &ex_grad = ex_state_grads[i];
-          auto &ex_tensor =
+          auto &ex_grad_tensor =
               ex_scope.FindVar(ex_grad)->Get<framework::LoDTensor>();
 
           VLOG(10) << " RNN link " << cur_grad << " from " << ex_grad;
           auto *cur_grad_var = cur_scope.Var(cur_grad);
-          auto cur_grad_tensor =
+          framework::LoDTensor *cur_grad_tensor =
               cur_grad_var->GetMutable<framework::LoDTensor>();
-          framework::TensorCopy(ex_tensor, place, dev_ctx, cur_grad_tensor);
+          cur_grad_tensor->ShareDataWith(ex_grad_tensor);
        }
      }
    }
 
+    // Link inside::output -> outside::output
+    //   outside::output[seq_offset: seq_offset + 1] = inside::output
+    executor.CreateVariables(ctx->prog_, &cur_scope, ctx->block_id_);
+    if (step_id > 0) {
+      LinkTensorWithCallback(scope, Outputs(kInputGrads), cur_scope,
+                             GradVarLists(Inputs(kInputs)),
+                             [&](const framework::LoDTensor &src_tensor,
+                                 framework::LoDTensor *dst_tensor) {
+                               if (src_tensor.memory_size() ==
+                                   0) {  // Inside Gradient is not created.
+                                 return;
+                               }
+                               framework::Tensor src_slice =
+                                   src_tensor.Slice(seq_offset, seq_offset + 1);
+                               dst_tensor->ShareDataWith(src_slice);
+                             },
+                             true /*is_backward*/);
+    }
+
     VLOG(5) << "Recurrent memory linking finished ";
     // Run step block with cur_scope
     executor.RunPreparedContext(ctx.get(), &cur_scope,
                                 false /*create_local_scope*/,
-                                true /*create_vars*/, true /* keep_kids */);
+                                false /*create_vars*/, true /* keep_kids */);
 
     VLOG(5) << "executor.Run finished ";
@@ -368,7 +395,7 @@ void RecurrentGradOp::RunImpl(const framework::Scope &scope,
           cur_scope.FindVar(inside_grad_name)->Get<framework::LoDTensor>();
       framework::AttributeMap attrs;
       attrs["dtype"] = inside_tensor.type();
-      attrs["shape"] = framework::vectorize2int(inside_tensor.dims());
+      attrs["shape"] = framework::vectorize<int>(inside_tensor.dims());
       attrs["value"] = 0.0f;
 
       auto zero_op = framework::OpRegistry::CreateOp(
@@ -393,21 +420,23 @@ void RecurrentGradOp::RunImpl(const framework::Scope &scope,
 
     // Copy input gradient from inside to outside
     //   outside::input_grad[seq_offset: seq_offset + 1] = inside::input_grad
-    LinkTensorWithCallback(
-        cur_scope, GradVarLists(Inputs(kInputs)), scope, Outputs(kInputGrads),
-        [&](const framework::LoDTensor &inside, framework::LoDTensor *outside) {
-          if (inside.memory_size() == 0) {  // IG is not created.
-            return;
-          }
-          if (step_id == 0) {  // alloc memory
+    if (step_id == 0) {
+      LinkTensorWithCallback(
+          cur_scope, GradVarLists(Inputs(kInputs)), scope,
+          Outputs(kInputGrads),
+          [&](const framework::LoDTensor &inside,
+              framework::LoDTensor *outside) {
+            if (inside.memory_size() == 0) {  // IG is not created.
+              return;
+            }
+            // Alloc outside memory
             outside->Resize(PrependDims(seq_len, inside.dims()));
             outside->mutable_data(place, inside.type());
-          }
-          auto dst = outside->Slice(seq_offset, seq_offset + 1);
-          framework::TensorCopy(inside, place, dev_ctx, &dst);
-        },
-        true /*is_backward*/);
+            auto dst = outside->Slice(seq_offset, seq_offset + 1);
+            framework::TensorCopy(inside, place, dev_ctx, &dst);
+          },
+          true /*is_backward*/);
+    }
 
     VLOG(5) << "Link outside gradient finished ";
 
     if (has_state) {
@@ -426,11 +455,11 @@ void RecurrentGradOp::RunImpl(const framework::Scope &scope,
       VLOG(5) << "Link initialize state gradient finished ";
     }
-    scopes.Next();
+    scopes.BackwardNext(dev_ctx, const_cast<framework::Scope *>(&scope));
   }
   // Delete the scope of StepScopes
   auto *var = scope.FindVar(Input(kStepScopes));
-  PADDLE_ENFORCE(var != nullptr);
+  PADDLE_ENFORCE_NOT_NULL(var);
   auto *step_scopes = var->GetMutable<StepScopeVar>();
   ClearStepScopes(dev_ctx, const_cast<framework::Scope *>(&scope),
                   step_scopes);
 }
@@ -439,7 +468,7 @@ StepScopes RecurrentGradOp::CreateStepScopes(
     const platform::DeviceContext &dev_ctx, const framework::Scope &scope,
     size_t seq_len) const {
   auto *var = scope.FindVar(Input(kStepScopes));
-  PADDLE_ENFORCE(var != nullptr);
+  PADDLE_ENFORCE_NOT_NULL(var);
   return StepScopes(dev_ctx, scope, var->GetMutable<StepScopeVar>(),
                     Attr<bool>(kIsTrain), seq_len, true /*is_backward*/);
 }
@@ -458,6 +487,7 @@ std::unordered_set<std::string> RecurrentGradOp::LocalVarNames(
     const framework::Scope &scope) const {
   return this->List2Set(scope.LocalVarNames());
 }
+
 std::vector<std::string> RecurrentGradOp::GradVarLists(
     const std::vector<std::string> &var_names) {
   std::vector<std::string> retv;
@@ -594,25 +624,25 @@ class RecurrentGradOpShapeInference : public framework::InferShapeBase {
                         0, "The Attr(%s) should be empty.",
                         RecurrentBase::kStates);
     }
-    PADDLE_ENFORCE(ctx->HasInputs(RecurrentBase::kInputs),
-                   "The input(%s) should not be empty.",
-                   RecurrentBase::kInputs);
-    PADDLE_ENFORCE(ctx->HasInputs(RecurrentBase::kOutputs),
-                   "The input(%s) should not be empty.",
-                   RecurrentBase::kOutputs);
+    PADDLE_ENFORCE_EQ(ctx->HasInputs(RecurrentBase::kInputs), true,
+                      "The input(%s) should not be empty.",
+                      RecurrentBase::kInputs);
+    PADDLE_ENFORCE_EQ(ctx->HasInputs(RecurrentBase::kOutputs), true,
+                      "The input(%s) should not be empty.",
+                      RecurrentBase::kOutputs);
 
     // In some case the kInitialStates is empty.
     if (ctx->HasInputs(RecurrentBase::kInitialStates)) {
-      PADDLE_ENFORCE(ctx->HasOutputs(
-                         framework::GradVarName(RecurrentBase::kInitialStates)),
-                     "The output of(%s) should not be empty.",
-                     framework::GradVarName(RecurrentBase::kInitialStates));
+      PADDLE_ENFORCE_EQ(ctx->HasOutputs(framework::GradVarName(
+                            RecurrentBase::kInitialStates)),
+                        true, "The output of(%s) should not be empty.",
+                        framework::GradVarName(RecurrentBase::kInitialStates));
       ctx->SetOutputsDim(framework::GradVarName(RecurrentBase::kInitialStates),
                          ctx->GetInputsDim(RecurrentBase::kInitialStates));
     }
 
-    PADDLE_ENFORCE(
-        ctx->HasOutputs(framework::GradVarName(RecurrentBase::kInputs)),
+    PADDLE_ENFORCE_EQ(
+        ctx->HasOutputs(framework::GradVarName(RecurrentBase::kInputs)), true,
         "The output of(%s) should not be empty.",
         framework::GradVarName(RecurrentBase::kInputs));
     ctx->SetOutputsDim(framework::GradVarName(RecurrentBase::kInputs),
@@ -620,9 +650,9 @@ class RecurrentGradOpShapeInference : public framework::InferShapeBase {
 
     // In some case the kParameters is empty.
     if (ctx->HasInputs(RecurrentBase::kParameters)) {
-      PADDLE_ENFORCE(
+      PADDLE_ENFORCE_EQ(
           ctx->HasOutputs(framework::GradVarName(RecurrentBase::kParameters)),
-          "The output of(%s) should not be empty.",
+          true, "The output of(%s) should not be empty.",
           framework::GradVarName(RecurrentBase::kParameters));
       ctx->SetOutputsDim(framework::GradVarName(RecurrentBase::kParameters),
                          ctx->GetInputsDim(RecurrentBase::kParameters));
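The recurrent_op.cc changes above replace several TensorCopy calls with ShareDataWith on a slice, so a step scope reads and writes the outside buffer directly instead of going through a copy. A toy model of that aliasing in plain C++ (the layout and values here are ours, for illustration only):

#include <cassert>
#include <vector>

int main() {
  // outside::output laid out as [seq_len = 3, step_width = 4]
  std::vector<float> outside(3 * 4, 0.f);
  // "ShareDataWith(outside.Slice(1, 2))": alias row 1 instead of copying it
  float* step_view = outside.data() + 1 * 4;
  step_view[0] = 7.f;         // the step block writes through the view
  assert(outside[4] == 7.f);  // visible outside with no TensorCopy
  return 0;
}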
if (ctx->HasInputs(RecurrentBase::kParameters)) { - PADDLE_ENFORCE( + PADDLE_ENFORCE_EQ( ctx->HasOutputs(framework::GradVarName(RecurrentBase::kParameters)), - "The output of(%s) should not be empty.", + true, "The output of(%s) should not be empty.", framework::GradVarName(RecurrentBase::kParameters)); ctx->SetOutputsDim(framework::GradVarName(RecurrentBase::kParameters), ctx->GetInputsDim(RecurrentBase::kParameters)); diff --git a/paddle/fluid/operators/recurrent_op.h b/paddle/fluid/operators/recurrent_op.h index 8da0fcac..a4b21448 100644 --- a/paddle/fluid/operators/recurrent_op.h +++ b/paddle/fluid/operators/recurrent_op.h @@ -25,20 +25,17 @@ limitations under the License. */ namespace paddle { namespace operators { -// StepScopes manages scopes inside RNN. -// StepScopes::CurScope() get the current scope -// StepScopes::ExScope() get the ex-scope, or scope in previous time step. -// StepScopes::Next() move to next time step. +// StepScopes manages the scopes inside Recurrent Op. // // if is_train = False, then -// there are two scopes for the RNN and just support forward. +// there are two scopes for the RNN and just support forward // else // the len(scopes) == seq_len // // if is_backward = True, then -// reversely access scopes +// reversely access scopes, delete useless ex-scope // else -// access scopes from begin to end. +// access scopes from beginning to end class StepScopes { public: StepScopes(const platform::DeviceContext &dev_ctx, @@ -46,11 +43,19 @@ class StepScopes { std::vector *scopes, bool is_train, size_t seq_len, bool is_backward = false); + // Get the current scope framework::Scope &CurScope(); + // Get the ex-scope, which is the scope in previous time step framework::Scope &ExScope(); - void Next(); + // Move to next time step when forwarding + void ForwardNext(); + + // Delete ex-scope after using it, then move to next time step when + // backwarding + void BackwardNext(const platform::DeviceContext &dev_ctx, + framework::Scope *parent_scope); private: framework::Scope &GetScope(size_t scope_id) const; @@ -154,7 +159,7 @@ class RecurrentBase : public framework::OperatorBase { if (is_backward && src_var == nullptr) { return; } - PADDLE_ENFORCE(src_var != nullptr, "%s is not found.", src_var_name); + PADDLE_ENFORCE_NOT_NULL(src_var, "%s is not found.", src_var_name); auto &src_tensor = src_var->Get(); auto *dst_var = dst_scope->Var(dst_var_name); @@ -173,9 +178,9 @@ class RecurrentBase : public framework::OperatorBase { return; } auto *src_var = src_scope.FindVar(src_var_name); - PADDLE_ENFORCE(src_var != nullptr, "%s is not found.", src_var_name); + PADDLE_ENFORCE_NOT_NULL(src_var, "%s is not found.", src_var_name); auto &src_tensor = src_var->Get(); - PADDLE_ENFORCE(dst_var != nullptr, "%s is not found.", dst_var_name); + PADDLE_ENFORCE_NOT_NULL(dst_var, "%s is not found.", dst_var_name); auto *dst_tensor = dst_var->GetMutable(); callback(src_tensor, dst_tensor); } diff --git a/paddle/fluid/operators/reduce_ops/cub_reduce.h b/paddle/fluid/operators/reduce_ops/cub_reduce.h index afd3922b..af56e85e 100644 --- a/paddle/fluid/operators/reduce_ops/cub_reduce.h +++ b/paddle/fluid/operators/reduce_ops/cub_reduce.h @@ -251,7 +251,7 @@ void TensorReduce(const framework::Tensor& x, framework::Tensor* y, std::vector origin_reduce_dims, const Ty& init, const ReduceOp& reducer, const TransformOp& transformer, cudaStream_t stream) { - auto x_dim = framework::vectorize2int(x.dims()); + auto x_dim = framework::vectorize(x.dims()); std::vector new_x_dim, new_reduce_dims; int 
is_reduced = 0; for (auto e : origin_reduce_dims) { diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc index d1b50879..e549d2bd 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc @@ -61,6 +61,8 @@ class ReduceMeanDoubleGradMaker : public framework::GradOpDescMakerBase { } }; +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ReduceMeanGradNoNeedBufferVarInference, + "X"); } // namespace operators } // namespace paddle @@ -73,7 +75,8 @@ class __reduce_meanMaker__ : public ops::ReduceOpMaker { REGISTER_OPERATOR(reduce_mean, ops::ReduceOp, __reduce_meanMaker__, ops::ReduceMeanOpGradDescMaker); REGISTER_OPERATOR(reduce_mean_grad, ops::ReduceGradOp, - ops::ReduceMeanDoubleGradMaker); + ops::ReduceMeanDoubleGradMaker, + ops::ReduceMeanGradNoNeedBufferVarInference); REGISTER_OP_CPU_KERNEL(reduce_mean, ops::ReduceKernel, @@ -83,12 +86,13 @@ REGISTER_OP_CPU_KERNEL(reduce_mean, int, ops::MeanFunctor>, ops::ReduceKernel); -REGISTER_OP_CPU_KERNEL(reduce_mean_grad, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); + +template +using CPUReduceMeanGradKernel = + ops::ReduceGradKernel; + +REGISTER_OP_CPU_KERNEL(reduce_mean_grad, CPUReduceMeanGradKernel, + CPUReduceMeanGradKernel, + CPUReduceMeanGradKernel, + CPUReduceMeanGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu index 9324ec1e..12eceb33 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.part.cu @@ -15,12 +15,12 @@ // .part used to speed up nvcc compile #include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h" -REGISTER_OP_CUDA_KERNEL( - reduce_mean_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); +template +using CUDAReduceMeanGradKernel = + ops::ReduceGradKernel; + +REGISTER_OP_CUDA_KERNEL(reduce_mean_grad, CUDAReduceMeanGradKernel, + CUDAReduceMeanGradKernel, + CUDAReduceMeanGradKernel, + CUDAReduceMeanGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index 67fd3e1d..838ac895 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -75,7 +75,8 @@ class ReduceKernel : public framework::OpKernel { } }; -template +template class ReduceGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -88,6 +89,17 @@ class ReduceGradKernel : public framework::OpKernel { auto* output = context.Output(framework::GradVarName("X")); output->mutable_data(context.GetPlace()); + // NOTE: EigenTensor::From() uses tensor->data(). + // If an op has a NoNeedBufferVarsInferer, the corresponding kNoNeedBufferX + // or kNoNeedBufferY should be set to true, + // and a fake var with the same dims is used in place of the freed buffer.
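This NOTE is the heart of the no-need-buffer optimization: for reduce_sum and reduce_mean the gradient is a pure broadcast of Out@GRAD over the reduced axes (scaled by 1/n for the mean), so the values of X are never read, only its dims. A standalone sketch of that math for a row-wise mean over an m x n matrix (illustrative reference code under those assumptions, not the kernel itself):

#include <cstddef>
#include <vector>

// dX for y = reduce_mean(x, axis=1) on an m x n row-major matrix:
// every element of row i receives dy[i] / n. Only x's shape (m, n)
// is consumed, never its data -- which is why X "needs no buffer".
std::vector<float> ReduceMeanGradRef(const std::vector<float>& dy,
                                     std::size_t m, std::size_t n) {
  std::vector<float> dx(m * n);
  for (std::size_t i = 0; i < m; ++i)
    for (std::size_t j = 0; j < n; ++j)
      dx[i * n + j] = dy[i] / static_cast<float>(n);
  return dx;
}

Because EigenTensor::From() still dereferences a buffer, the kernel swaps in a live tensor of identical dims, which is exactly what the branch below does.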
+ if (kNoNeedBufferX) { + input0 = output; + } + if (kNoNeedBufferY) { + input1 = input2; + } + // NOTE(dengkaipeng): Out is unnecessary in some reduce kernels and is // not set as Input in the grad maker; use Out_grad to replace it here if (!input1) input1 = input2; @@ -220,6 +232,14 @@ class ReduceGradOp : public framework::OperatorWithKernel { ctx->ShareLoD("X", /*->*/ x_grad_name); } } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.GetPlace()); + } }; class ReduceOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc index c7742f45..14bb2cf0 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc @@ -13,8 +13,47 @@ // limitations under the License. #include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h" +#include +#include + +namespace paddle { +namespace operators { + +// NOTE: Input(Out) is unnecessary in reduce_sum_grad, and Input(X) needs no +// buffer +class ReduceSumOpGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("reduce_sum_grad"); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetAttrMap(Attrs()); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + return op; + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ReduceSumGradNoNeedBufferVarInference, + "X"); + +} // namespace operators +} // namespace paddle + +class ReduceSumOpMaker : public ops::ReduceOpMaker { + protected: + virtual std::string GetName() const { return "reduce_sum"; } + virtual std::string GetOpType() const { return "Reduce reduce_sum"; } +}; + +REGISTER_OPERATOR(reduce_sum, ops::ReduceOp, ReduceSumOpMaker, + ops::ReduceSumOpGradDescMaker); +REGISTER_OPERATOR(reduce_sum_grad, ops::ReduceGradOp, + ops::ReduceSumGradNoNeedBufferVarInference); -REGISTER_REDUCE_OP(reduce_sum); REGISTER_OP_CPU_KERNEL( reduce_sum, ops::ReduceKernel, @@ -23,13 +62,13 @@ REGISTER_OP_CPU_KERNEL( ops::ReduceKernel, ops::ReduceKernel); -REGISTER_OP_CPU_KERNEL( - reduce_sum_grad, - ops::ReduceSumGradKernel, - ops::ReduceSumGradKernel, - ops::ReduceSumGradKernel, - ops::ReduceSumGradKernel); + +template +using CPUReduceSumGradKernel = + ops::ReduceSumGradKernel; + +REGISTER_OP_CPU_KERNEL(reduce_sum_grad, CPUReduceSumGradKernel, + CPUReduceSumGradKernel, + CPUReduceSumGradKernel, + CPUReduceSumGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.h b/paddle/fluid/operators/reduce_ops/reduce_sum_op.h index 26f59c72..7343d01e 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.h @@ -22,7 +22,8 @@ namespace paddle { namespace operators { // use for loop to speed up Eigen broadcast.
4 times faster than broadcast -template +template class ReduceSumGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -72,7 +73,7 @@ class ReduceSumGradKernel : public framework::OpKernel { } // default use Eigen broadcast - ReduceGradKernel kernel; + ReduceGradKernel kernel; kernel.Compute(context); } }; diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu index eb329573..0d689d71 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu @@ -15,12 +15,12 @@ #include "paddle/fluid/operators/reduce_ops/cub_reduce.h" #include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h" -REGISTER_OP_CUDA_KERNEL( - reduce_sum_grad, ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); +template +using CUDAReduceSumGradKernel = + ops::ReduceGradKernel; + +REGISTER_OP_CUDA_KERNEL(reduce_sum_grad, CUDAReduceSumGradKernel, + CUDAReduceSumGradKernel, + CUDAReduceSumGradKernel, + CUDAReduceSumGradKernel); diff --git a/paddle/fluid/operators/requantize_op.cc b/paddle/fluid/operators/requantize_op.cc index 08ba1470..d156ae20 100644 --- a/paddle/fluid/operators/requantize_op.cc +++ b/paddle/fluid/operators/requantize_op.cc @@ -42,5 +42,4 @@ void ReQuantOpMaker::Make() { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(requantize, ops::ReQuantOp, ops::ReQuantOpMaker, - paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(requantize, ops::ReQuantOp, ops::ReQuantOpMaker); diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 9750bc87..0059921c 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -50,29 +50,56 @@ class ReshapeOp : public framework::OperatorWithKernel { : OperatorWithKernel(type, inputs, outputs, attrs) {} void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of ReshapeOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of ReshapeOp should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, + "Input(X) of ReshapeOp should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, + "Output(Out) of ReshapeOp should not be null."); if (ctx->HasInputs("ShapeTensor")) { // top priority shape - auto inputs_name = ctx->Inputs("ShapeTensor"); - PADDLE_ENFORCE(inputs_name.size() > 0, "shape tensor size can't be zero"); - auto out_dims = std::vector(inputs_name.size(), -1); - ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); + auto ShapeTensor = ctx->Inputs("ShapeTensor"); + PADDLE_ENFORCE_GT(ShapeTensor.size(), 0, + "The size of Input(ShapeTensor) can't be zero"); + auto infer_shape = ctx->Attrs().Get>("shape"); + const int64_t copy_dim_val = 0; + auto in_dims = ctx->GetInputDim("X"); + for (size_t i = 0; i < infer_shape.size(); ++i) { + if (infer_shape[i] == copy_dim_val) { + PADDLE_ENFORCE_LT( + static_cast(i), in_dims.size(), + "The dimension of data to copy from input must be less " + "than the dimension of input."); + infer_shape[i] = in_dims[i]; + } + } + auto infer_out_dims = framework::make_ddim(infer_shape); + ctx->SetOutputDim("Out", infer_out_dims); + return; + } + const std::vector &shape = ctx->Attrs().Get>("shape"); + if (ctx->HasInput("Shape") && shape.empty()) { + auto shape_dims = ctx->GetInputDim("Shape"); + int num_ele = 1; + for (int i = 0; i < shape_dims.size(); ++i) { + num_ele *= shape_dims[i]; + } + auto vec_dims = std::vector(num_ele, -1); + auto out_dims = framework::make_ddim(vec_dims); + ctx->SetOutputDim("Out", out_dims); + ctx->ShareLoD("X", /*->*/ "Out"); return; } - if (ctx->HasInput("Shape") && ctx->IsRuntime()) { + + if (ctx->HasInput("Shape") && !shape.empty() && ctx->IsRuntime()) { // If true, set the shape of Output(Out) according to Input(Shape) in // ReshapeKernel with ExecutionContext. Also check LoD in ReshapeKernel. ctx->ShareLoD("X", /*->*/ "Out"); return; } - const std::vector &shape = ctx->Attrs().Get>("shape"); - PADDLE_ENFORCE(!shape.empty(), - "The shape information must be set by Attr(shape)."); + + PADDLE_ENFORCE_EQ(!shape.empty(), true, + "The shape information must be set by Attr(shape)."); auto x_dims = ctx->GetInputDim("X"); auto out_dims = ValidateShape(shape, x_dims); ctx->SetOutputDim("Out", out_dims); @@ -99,18 +126,18 @@ class ReshapeOp : public framework::OperatorWithKernel { int unk_dim_idx = -1; for (size_t i = 0; i < shape.size(); ++i) { if (shape[i] == unk_dim_val) { - PADDLE_ENFORCE( - unk_dim_idx == -1, + PADDLE_ENFORCE_EQ( + unk_dim_idx, -1, "Only one input dimension of Attr(shape) can be unknown."); unk_dim_idx = i; } else if (shape[i] == copy_dim_val) { - PADDLE_ENFORCE( - static_cast(i) < in_dims.size(), + PADDLE_ENFORCE_LT( + static_cast(i), in_dims.size(), "The index of dimension to copy from input shape must be less " "than the size of input shape."); } else { - PADDLE_ENFORCE( - shape[i] > 0, + PADDLE_ENFORCE_GT( + shape[i], 0, "Each input dimension of Attr(shape) must not be negative except " "one unknown dimension."); } @@ -231,9 +258,9 @@ class ReshapeGradOp : public framework::OperatorWithKernel { : OperatorWithKernel(type, inputs, outputs, attrs) {} void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null."); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) shouldn't be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, "Input(X) shouldn't be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, + "Input(Out@GRAD) shouldn't be null."); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } @@ -314,8 +341,8 @@ class Reshape2Op : public ReshapeOp { : ReshapeOp(type, inputs, outputs, attrs) {} void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasOutput("XShape"), - "Output(XShape) of ReshapeOp should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasOutput("XShape"), true, + "Output(XShape) of ReshapeOp should not be null."); const auto &x_dims = ctx->GetInputDim("X"); std::vector xshape_dims(x_dims.size() + 1); xshape_dims[0] = 0; @@ -365,9 +392,10 @@ class Reshape2GradOp : public framework::OperatorWithKernel { : OperatorWithKernel(type, inputs, outputs, attrs) {} void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("XShape"), "Input(XShape) shouldn't be null."); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) shouldn't be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("XShape"), true, + "Input(XShape) shouldn't be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, + "Input(Out@GRAD) shouldn't be null."); auto xshape_dims = ctx->GetInputDim("XShape"); auto x_dims = framework::slice_ddim(xshape_dims, 1,
xshape_dims.size()); ctx->SetOutputDim(framework::GradVarName("X"), x_dims); @@ -393,21 +421,10 @@ class Reshape2GradOp : public framework::OperatorWithKernel { } }; -class ReshapeOpInplaceInToOut : public framework::InplaceOpInference { - public: - std::unordered_map operator()( - const framework::OpDesc &op_desc, bool use_cuda) const override { - return {{"X", "Out"}}; - } -}; - -class ReshapeGradInplaceInToOut : public framework::InplaceOpInference { - public: - std::unordered_map operator()( - const framework::OpDesc &op_desc, bool use_cuda) const override { - return {{framework::GradVarName("Out"), framework::GradVarName("X")}}; - } -}; +DECLARE_INPLACE_OP_INFERER(ReshapeOpInplaceInToOut, {"X", "Out"}); +DECLARE_INPLACE_OP_INFERER(ReshapeGradInplaceInToOut, + {framework::GradVarName("Out"), + framework::GradVarName("X")}); } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/rnn_memory_helper_op.cc b/paddle/fluid/operators/rnn_memory_helper_op.cc index 9f652480..f360ae3c 100644 --- a/paddle/fluid/operators/rnn_memory_helper_op.cc +++ b/paddle/fluid/operators/rnn_memory_helper_op.cc @@ -107,7 +107,7 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase { framework::AttributeMap attrs; attrs["dtype"] = in_var_tensor.type(); - attrs["shape"] = framework::vectorize2int(in_var_tensor.dims()); + attrs["shape"] = framework::vectorize(in_var_tensor.dims()); attrs["value"] = 0.0f; auto zero_op = framework::OpRegistry::CreateOp( diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc index d0dd861a..21c3dd27 100644 --- a/paddle/fluid/operators/roi_align_op.cc +++ b/paddle/fluid/operators/roi_align_op.cc @@ -85,7 +85,7 @@ class ROIAlignGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), + return framework::OpKernelType(ctx.Input("ROIs")->type(), ctx.device_context()); } }; @@ -167,13 +167,16 @@ class ROIAlignGradDescMaker : public framework::SingleGradOpDescMaker { } }; +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(RoiAlignGradNoNeedBufVarsInferer, "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(roi_align, ops::ROIAlignOp, ops::ROIAlignOpMaker, ops::ROIAlignGradDescMaker); -REGISTER_OPERATOR(roi_align_grad, ops::ROIAlignGradOp); +REGISTER_OPERATOR(roi_align_grad, ops::ROIAlignGradOp, + ops::RoiAlignGradNoNeedBufVarsInferer); REGISTER_OP_CPU_KERNEL( roi_align, ops::CPUROIAlignOpKernel, diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu index 8d695fde..943c5c81 100644 --- a/paddle/fluid/operators/roi_align_op.cu +++ b/paddle/fluid/operators/roi_align_op.cu @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/roi_align_op.h" #include "paddle/fluid/platform/cuda_primitives.h" @@ -272,10 +272,8 @@ class GPUROIAlignOpKernel : public framework::OpKernel { } } auto& dev_ctx = ctx.cuda_device_context(); - auto& allocator = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); int bytes = roi_batch_id_list.numel() * sizeof(int); - auto roi_ptr = allocator.Allocate(bytes); + auto roi_ptr = memory::Alloc(dev_ctx, bytes); int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); const auto gplace = boost::get(ctx.GetPlace()); memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, @@ -322,9 +320,8 @@ class GPUROIAlignGradOpKernel : public framework::OpKernel { } } auto& dev_ctx = ctx.cuda_device_context(); - auto& allocator = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); - auto roi_ptr = allocator.Allocate(roi_batch_id_list.numel() * sizeof(int)); + auto roi_ptr = + memory::Alloc(dev_ctx, roi_batch_id_list.numel() * sizeof(int)); int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); int bytes = roi_batch_id_list.numel() * sizeof(int); const auto gplace = boost::get(ctx.GetPlace()); diff --git a/paddle/fluid/operators/roi_align_op.h b/paddle/fluid/operators/roi_align_op.h index 936b2f0e..78befea2 100644 --- a/paddle/fluid/operators/roi_align_op.h +++ b/paddle/fluid/operators/roi_align_op.h @@ -256,13 +256,15 @@ class CPUROIAlignGradOpKernel : public framework::OpKernel { auto spatial_scale = ctx.Attr("spatial_scale"); auto sampling_ratio = ctx.Attr("sampling_ratio"); auto in_dims = in->dims(); - if (!in_grad) { - return; - } + int channels = in_dims[1]; int height = in_dims[2]; int width = in_dims[3]; int rois_num = rois->dims()[0]; + + if (!in_grad) { + return; + } Tensor roi_batch_id_list; roi_batch_id_list.Resize({rois_num}); int* roi_batch_id_data = @@ -275,15 +277,21 @@ class CPUROIAlignGradOpKernel : public framework::OpKernel { roi_batch_id_data[i] = n; } } + in_grad->mutable_data(ctx.GetPlace()); + auto& dev_ctx = ctx.template device_context(); + math::SetConstant set_zero; + set_zero(dev_ctx, in_grad, static_cast(0)); + + int output_grad_size = out_grad->numel(); + + if ((!out_grad->IsInitialized()) || (output_grad_size <= 0)) { + return; + } const T* rois_data = rois->data(); const T* out_grad_data = out_grad->data(); T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); - auto& dev_ctx = ctx.template device_context(); - math::SetConstant set_zero; - set_zero(dev_ctx, in_grad, static_cast(0)); - auto in_stride = framework::stride(in->dims()); auto roi_stride = framework::stride(rois->dims()); auto out_stride = framework::stride(out_grad->dims()); diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu index ac3a4201..da8088d2 100644 --- a/paddle/fluid/operators/roi_pool_op.cu +++ b/paddle/fluid/operators/roi_pool_op.cu @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/roi_pool_op.h" #include "paddle/fluid/platform/cuda_primitives.h" @@ -170,10 +170,8 @@ class GPUROIPoolOpKernel : public framework::OpKernel { } auto& dev_ctx = ctx.cuda_device_context(); - auto& allocator = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); int bytes = roi_batch_id_list.numel() * sizeof(int); - auto roi_ptr = allocator.Allocate(bytes); + auto roi_ptr = memory::Alloc(dev_ctx, bytes); int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); const auto gplace = boost::get(ctx.GetPlace()); memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, @@ -221,10 +219,8 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel { } auto& dev_ctx = ctx.cuda_device_context(); - auto& allocator = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); int bytes = roi_batch_id_list.numel() * sizeof(int); - auto roi_ptr = allocator.Allocate(bytes); + auto roi_ptr = memory::Alloc(dev_ctx, bytes); int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); const auto gplace = boost::get(ctx.GetPlace()); memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, diff --git a/paddle/fluid/operators/row_conv_op.cc b/paddle/fluid/operators/row_conv_op.cc index 7e961167..1645c47e 100644 --- a/paddle/fluid/operators/row_conv_op.cc +++ b/paddle/fluid/operators/row_conv_op.cc @@ -1,5 +1,4 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at @@ -43,13 +42,7 @@ class RowConvOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("X"); auto filter_dims = ctx->GetInputDim("Filter"); - PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); PADDLE_ENFORCE_EQ(filter_dims.size(), 2, "Input(Y)'s rank should be 2."); - if (ctx->IsRuntime() || (x_dims[1] > 0 && filter_dims[1] > 0)) { - PADDLE_ENFORCE_EQ( - x_dims[1], filter_dims[1], - "The 2nd dimension of Input(X) and Input(Filter) should be same."); - } ctx->SetOutputDim("Out", x_dims); ctx->ShareLoD("X", "Out"); @@ -84,11 +77,12 @@ class RowConvOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", - "the input(X) is a LodTensor, which supports " + "the input(X) is a LodTensor or tensor, LodTensor(X) supports " "variable time-length input sequences. The underlying tensor " "in this LoDTensor is a matrix with shape (T x N), where T " "is the total time steps in this mini-batch and N is the input " - "data dimension."); + "data dimension. the shape of Tensor input(X) has shape " + "(B x T x N), B is batch size;"); AddInput("Filter", "the input(Filter) is a learnable parameter. 
It " "is a 2-D tensor with shape (future_context x N), where, " @@ -152,8 +146,26 @@ class RowConvKernel out->mutable_data(context.GetPlace()); - auto batch_indices = x->lod()[0]; - auto input_dim = x->dims()[1]; // 'in' is of size T x N + bool is_tensor = x->lod().empty(); + int batch_size = 0; + if (is_tensor) { + batch_size = x->dims()[0]; + } else { + batch_size = x->lod()[0].size() - 1; + } + framework::Vector batch_indices(batch_size + 1); + int input_dim = 0; + int timesteps = 0; + if (is_tensor) { + for (int i = 0; i < batch_size + 1; i++) { + batch_indices[i] = i; + } + input_dim = x->dims()[2]; + timesteps = x->dims()[1]; + } else { + batch_indices = x->lod()[0]; + input_dim = x->dims()[1]; + } size_t num_sequence = batch_indices.size() - 1; auto future_context = filter->dims()[0]; @@ -162,11 +174,23 @@ class RowConvKernel for (size_t i = 0; i < num_sequence; i++) { int start = static_cast(batch_indices[i]); int end = static_cast(batch_indices[i + 1]); - int current_timesteps = end - start; + int current_timesteps = 0; + if (is_tensor) { + current_timesteps = timesteps; + } else { + current_timesteps = end - start; + } + // int current_timesteps = end - start; Tensor cur_input_sequence = x->Slice(start, end); // Current input sequence + cur_input_sequence = + cur_input_sequence.Resize({current_timesteps, input_dim}); + Tensor cur_output_sequence = out->Slice(start, end); // Current output sequence + cur_output_sequence = + cur_output_sequence.Resize({current_timesteps, input_dim}); + auto cip_seq = EigenMatrix::From(cur_input_sequence); auto cot_seq = EigenMatrix::From(cur_output_sequence); @@ -198,11 +222,30 @@ class RowConvGradKernel auto *dx = context.Output(framework::GradVarName("X")); auto *d_filter = context.Output(framework::GradVarName("Filter")); - auto input_dim = x->dims()[1]; // 'x' is of size T x N - auto batch_indices = x->lod()[0]; + auto &x_lod = x->lod(); + bool is_tensor = x_lod.empty(); + int batch_size = 0; + if (is_tensor) { + batch_size = x->dims()[0]; + } else { + batch_size = x->lod()[0].size() - 1; + } + framework::Vector batch_indices(batch_size + 1); + int timesteps = 0; + int input_dim = 0; + if (is_tensor) { + for (int i = 0; i < batch_size + 1; i++) { + batch_indices[i] = i; + } + input_dim = x->dims()[2]; + timesteps = x->dims()[1]; + } else { + batch_indices = x->lod()[0]; + input_dim = x->dims()[1]; + } + size_t num_sequence = batch_indices.size() - 1; auto future_context = filter->dims()[0]; - if (d_filter) { d_filter->mutable_data(context.GetPlace()); auto dweights = @@ -213,14 +256,19 @@ class RowConvGradKernel int start = static_cast(batch_indices[i]); int end = static_cast(batch_indices[i + 1]); + int current_timesteps = 0; + if (is_tensor) { + current_timesteps = timesteps; + } else { + current_timesteps = end - start; + } Tensor cur_input = x->Slice(start, end); // Current input sequence + cur_input = cur_input.Resize({current_timesteps, input_dim}); Tensor cur_doutput = d_out->Slice(start, end); // Current output grad sequence - + cur_doutput = cur_doutput.Resize({current_timesteps, input_dim}); auto cur_ip = EigenMatrix::From(cur_input); auto cur_dout = EigenMatrix::From(cur_doutput); - int current_timesteps = end - start; - for (int k = 0; k < current_timesteps; k++) { // For different time steps in the same sequence for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); @@ -241,15 +289,23 @@ class RowConvGradKernel int start = static_cast(batch_indices[i]); int end = static_cast(batch_indices[i + 1]); + int 
current_timesteps = 0; + if (is_tensor) { + current_timesteps = timesteps; + } else { + current_timesteps = end - start; + } + Tensor cur_doutput = d_out->Slice(start, end); // Current output grad sequence + cur_doutput = cur_doutput.Resize({current_timesteps, input_dim}); Tensor cur_dinput = dx->Slice(start, end); // Current input grad sequence + cur_dinput = cur_dinput.Resize({current_timesteps, input_dim}); auto cur_dout = EigenMatrix::From(cur_doutput); auto cur_dip = EigenMatrix::From(cur_dinput); cur_dip.setZero(); - int current_timesteps = end - start; for (int k = 0; k < current_timesteps; k++) { // For different time steps in the same sequence diff --git a/paddle/fluid/operators/row_conv_op.cu b/paddle/fluid/operators/row_conv_op.cu index 9ae80da6..a7128788 100644 --- a/paddle/fluid/operators/row_conv_op.cu +++ b/paddle/fluid/operators/row_conv_op.cu @@ -1,5 +1,4 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at @@ -47,11 +46,11 @@ __global__ void RowConvForwardSharedMemory(const T *in, const T *wt, (d < input_dim) ? wt[thy * input_dim + d] : static_cast(0); } __syncthreads(); - for (size_t i = 0; i < num_sequence; i++) { int start = static_cast(batch_indices[i]); int end = static_cast(batch_indices[i + 1]); int current_timesteps = end - start; + for (int k = thy; k < current_timesteps; k += bly) { T sum = 0; for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); @@ -77,11 +76,11 @@ __global__ void RowConvForward(const T *in, const T *wt, int num_sequence, int thy = threadIdx.y; if (d >= input_dim) return; - for (size_t i = 0; i < num_sequence; i++) { int start = static_cast(batch_indices[i]); int end = static_cast(batch_indices[i + 1]); int current_timesteps = end - start; + for (int k = thy; k < current_timesteps; k += bly) { T sum = 0; for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); @@ -114,10 +113,12 @@ __global__ void RowConvGradInputSharedMemory(const T *dout, const T *wt, } __syncthreads(); + int current_timesteps = 0; for (int i = 0; i < num_sequence; i++) { int start = static_cast(batch_indices[i]); int end = static_cast(batch_indices[i + 1]); - int current_timesteps = end - start; + current_timesteps = end - start; + for (int k = thy; k < current_timesteps; k += bly) { T sum = 0; for (int w = 0; (w < future_context) && ((k - w) >= 0); w++) { @@ -142,10 +143,13 @@ __global__ void RowConvGradInput(const T *dout, const T *wt, int num_sequence, int thy = threadIdx.y; if (d >= input_dim) return; + int current_timesteps = 0; + for (int i = 0; i < num_sequence; i++) { int start = static_cast(batch_indices[i]); int end = static_cast(batch_indices[i + 1]); - int current_timesteps = end - start; + current_timesteps = end - start; + for (int k = thy; k < current_timesteps; k += bly) { T sum = 0; for (int w = 0; (w < future_context) && ((k - w) >= 0); w++) { @@ -175,7 +179,6 @@ __global__ void RowConvGradFilterImproved(const T *in, const T *dout, int xdim_sh_in = block_y; int xdim_sh_dout = block_y; - // int xdim_sh_dfilter = future_context; int ydim_sh_in = block_x; int ydim_sh_dout = block_x + future_context - 1; int ydim_sh_dfilter = block_y; @@ -197,6 +200,7 @@ __global__ void RowConvGradFilterImproved(const T *in, const T *dout, int start = static_cast(batch_indices[i]); int end = static_cast(batch_indices[i + 1]); int current_timesteps = end - start; + 
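All of these row_conv kernels, CPU and CUDA alike, evaluate the same per-sequence lookahead filter: out(t, d) = sum over w in [0, future_context) with t + w < T of filter(w, d) * in(t + w, d). A minimal standalone reference for a single sequence (an illustrative sketch with row-major buffers, not the Paddle kernel itself):

#include <cstddef>
#include <vector>

// `in` and `out` are T x N (row-major); `filter` is future_context x N.
// Each output step t mixes the current step and up to future_context - 1
// following steps, independently for every channel d.
void RowConvRef(const std::vector<float>& in,
                const std::vector<float>& filter,
                std::vector<float>* out, int T, int N, int future_context) {
  out->assign(static_cast<std::size_t>(T) * N, 0.0f);
  for (int t = 0; t < T; ++t)
    for (int w = 0; w < future_context && t + w < T; ++w)
      for (int d = 0; d < N; ++d)
        (*out)[t * N + d] += filter[w * N + d] * in[(t + w) * N + d];
}

The filter-gradient kernel resumes below; it first rounds current_timesteps up to the next multiple of block_x (scaled_cur_steps) so that the whole thread block executes the same number of tile iterations.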
int scaled_cur_steps = ((current_timesteps + block_x - 1) / block_x) * block_x; @@ -258,11 +262,11 @@ __global__ void RowConvGradFilter(const T *in, const T *dout, int num_sequence, // NOTE(zcd): temporary solution unsigned mask = 0u; CREATE_SHFL_MASK(mask, true); - for (int i = 0; i < num_sequence; i++) { int start = static_cast(batch_indices[i]); int end = static_cast(batch_indices[i + 1]); int current_timesteps = end - start; + int scaled_cur_steps = ((current_timesteps + block_x - 1) / block_x) * block_x; @@ -310,9 +314,26 @@ class RowConvKernel const T *in = X->data(); const T *weight = Filter->data(); T *out = Out->mutable_data(context.GetPlace()); + bool is_tensor = X->lod().empty(); + int batch_size = 0; + if (is_tensor) { + batch_size = X->dims()[0]; + } else { + batch_size = X->lod()[0].size() - 1; + } + int input_dim = 0; + framework::Vector batch_indices(batch_size + 1); + int timesteps = X->dims()[1]; + if (is_tensor) { + for (int i = 0; i < batch_size + 1; i++) { + batch_indices[i] = i * timesteps; + } + input_dim = X->dims()[2]; + } else { + batch_indices = X->lod()[0]; + input_dim = X->dims()[1]; + } - auto batch_indices = X->lod()[0]; - int input_dim = X->dims()[1]; int num_sequence = batch_indices.size() - 1; int future_context = Filter->dims()[0]; size_t *idx = batch_indices.CUDAMutableData(context.GetPlace()); @@ -348,9 +369,27 @@ class RowConvGradKernel Tensor *dX = context.Output(framework::GradVarName("X")); Tensor *dFilter = context.Output(framework::GradVarName("Filter")); + int batch_size = 0; + bool is_tensor = X->lod().empty(); + if (is_tensor) { + batch_size = X->dims()[0]; + } else { + batch_size = X->lod()[0].size() - 1; + } - auto batch_indices = X->lod()[0]; - int input_dim = X->dims()[1]; + int input_dim = 0; + framework::Vector batch_indices(batch_size + 1); + int timesteps = X->dims()[1]; + if (is_tensor) { + for (int i = 0; i < batch_size + 1; i++) { + batch_indices[i] = i * timesteps; + } + input_dim = X->dims()[2]; + } else { + batch_indices = X->lod()[0]; + input_dim = X->dims()[1]; + } + // int input_dim = X->dims()[1]; int num_sequence = batch_indices.size() - 1; int future_context = Filter->dims()[0]; size_t *idx = batch_indices.CUDAMutableData(context.GetPlace()); diff --git a/paddle/fluid/operators/sample_logits_op.h b/paddle/fluid/operators/sample_logits_op.h index b55a2486..18ef6c9d 100644 --- a/paddle/fluid/operators/sample_logits_op.h +++ b/paddle/fluid/operators/sample_logits_op.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" @@ -33,7 +34,8 @@ using EigenMatrix = framework::EigenMatrix; template struct TolerableValue { HOSTDEVICE T operator()(const T& x) const { - PADDLE_ASSERT(std::is_floating_point::value); + PADDLE_ENFORCE(std::is_floating_point::value, + "TolerableValue should be float in sample_logits_op."); const T kApproInf = 1e20; if (x == INFINITY) return kApproInf; if (x == -INFINITY) return -kApproInf; @@ -47,11 +49,12 @@ static void CPUTakeAlongD1(const platform::DeviceContext& ctx, const framework::Tensor& array, const framework::Tensor& index, framework::Tensor* value) { - PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace())); + PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true); // UNDERSTAND: check shape src(B, C), index(B, K), out should also be (B, K) - PADDLE_ENFORCE(index.dims().size() == 2 && array.dims().size() == 2 && - index.dims()[0] == array.dims()[0] && - index.dims() == value->dims()); + PADDLE_ENFORCE_EQ(index.dims().size(), 2); + PADDLE_ENFORCE_EQ(array.dims().size(), 2); + PADDLE_ENFORCE_EQ(index.dims()[0], array.dims()[0]); + PADDLE_ENFORCE_EQ(index.dims(), value->dims()); const auto batch_size = index.dims()[0]; const auto num_take = index.dims()[1]; @@ -86,11 +89,12 @@ static void CPUPutAlongD1(const platform::DeviceContext& ctx, framework::Tensor* array, const framework::Tensor& index, const framework::Tensor& value) { - PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace())); + PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true); // UNDERSTAND: check shape src(B, C), index(B, K), out should also be (B, K) - PADDLE_ENFORCE(index.dims().size() == 2 && array->dims().size() == 2 && - index.dims()[0] == array->dims()[0] && - index.dims() == value.dims()); + PADDLE_ENFORCE_EQ(index.dims().size(), 2); + PADDLE_ENFORCE_EQ(array->dims().size(), 2); + PADDLE_ENFORCE_EQ(index.dims()[0], array->dims()[0]); + PADDLE_ENFORCE_EQ(index.dims(), value.dims()); const auto batch_size = index.dims()[0]; const auto num_put = index.dims()[1]; auto array_dims = array->dims(); @@ -145,8 +149,8 @@ class SampleLogitsKernel : public framework::OpKernel { public: using Tensor = framework::Tensor; void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE(platform::is_cpu_place(context.GetPlace()), - "This kernel only runs on CPU."); + PADDLE_ENFORCE_EQ(platform::is_cpu_place(context.GetPlace()), true, + "This kernel only runs on CPU."); VLOG(3) << "Enter SampleLogitsKernel"; // get necessary inputs const Tensor* logits = context.Input("Logits"); diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index 4e4a015e..383e7940 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -108,5 +108,8 @@ REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker, REGISTER_OP_CPU_KERNEL( scale, ops::ScaleKernel, ops::ScaleKernel, + ops::ScaleKernel, + ops::ScaleKernel, + ops::ScaleKernel, ops::ScaleKernel, ops::ScaleKernel); diff --git a/paddle/fluid/operators/scale_op.cu b/paddle/fluid/operators/scale_op.cu index 349f3936..e1f20a73 100644 --- a/paddle/fluid/operators/scale_op.cu +++ b/paddle/fluid/operators/scale_op.cu @@ -20,6 +20,11 @@ REGISTER_OP_CUDA_KERNEL( scale, paddle::operators::ScaleKernel, paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel, 
paddle::operators::ScaleKernel, paddle::operators::ScaleKernel, diff --git a/paddle/fluid/operators/scatter.cu.h b/paddle/fluid/operators/scatter.cu.h index ce4af442..0e83219d 100644 --- a/paddle/fluid/operators/scatter.cu.h +++ b/paddle/fluid/operators/scatter.cu.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -14,8 +14,10 @@ limitations under the License. */ #pragma once #include +#include #include "math/math_function.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/place.h" @@ -57,6 +59,26 @@ __global__ void ScatterCUDAKernel(const T* params, const IndexT* indices, } } +template +__global__ void ScatterNdCUDAKernel(const T* update, const IndexT* indices, + T* output, const int* output_dims, + size_t remain_size, size_t slice_size, + size_t end_size) { + CUDA_1D_KERNEL_LOOP(i, remain_size * slice_size) { + int indices_i = i / slice_size; + int slice_i = i - indices_i * slice_size; // offset inside the slice + IndexT gather_i = 0; + int64_t temp = slice_size; + for (int64_t j = end_size - 1; j >= 0; --j) { + IndexT index_value = indices[indices_i * end_size + j]; + gather_i += (index_value * temp); + temp *= output_dims[j]; + } + IndexT output_i = gather_i + slice_i; + paddle::platform::CudaAtomicAdd(output + output_i, *(update + i)); + } +} + /** * A thin wrapper on gpu tensor * Return a new updated tensor from source tensor, scatter-assigned according to @@ -69,12 +91,16 @@ template void GPUScatterAssign(const framework::ExecutionContext& context, const Tensor& src, const Tensor& index, Tensor* output, bool overwrite = true) { - // PADDLE_ENFORCE(platform::is_gpu_place(place)); // check index of shape 1-D - const auto& ctx = context.device_context(); - PADDLE_ENFORCE(index.dims().size() == 1 || - (index.dims().size() == 2 && index.dims()[1] == 1)); + if (index.dims().size() == 2) { + PADDLE_ENFORCE_EQ(index.dims()[1], 1, + "index.dims()[1] should be 1 when index.dims().size() == " + "2 in scatter_op."); + } else { + PADDLE_ENFORCE_EQ(index.dims().size(), 1, + "index.dims().size() should be 1 or 2 in scatter_op."); + } int index_size = index.dims()[0]; auto src_dims = src.dims(); @@ -109,5 +135,58 @@ void GPUScatterAssign(const framework::ExecutionContext& context, p_src, p_index, p_output, index_size, slice_size, overwrite); } +template +void GPUScatterNdAdd(const framework::ExecutionContext& context, + const Tensor& update, const Tensor& index, + Tensor* output) { + auto index_dims = index.dims(); + auto index_dims_size = index_dims.size(); + + auto output_dims = output->dims(); + auto output_dims_size = output_dims.size(); + + const T* p_update = update.data(); + const IndexT* p_index = index.data(); + T* p_output = output->data(); + + // final dim + int64_t end_size = index_dims[index_dims_size - 1]; + // remain dim + auto remain_ddim = framework::slice_ddim(index_dims, 0, index_dims_size - 1); + int64_t remain_numel = framework::product(remain_ddim); + // slice size + int64_t slice_size = 1; + for (int64_t i = end_size; i < output_dims_size; ++i) { + slice_size *= output_dims[i]; + } + const size_t slice_bytes = slice_size * sizeof(T); + // put output_dims into CUDA device memory + // gplace and cplace + const auto& ctx = context.template 
device_context(); + const auto gplace = boost::get(ctx.GetPlace()); + auto cplace = platform::CPUPlace(); + + std::vector v_output_dims(output_dims_size); + for (int i = 0; i < output_dims_size; ++i) { + v_output_dims[i] = static_cast(output_dims[i]); + } + auto& dev_ctx = context.cuda_device_context(); + int bytes = output_dims_size * sizeof(int); + auto output_dims_ptr = memory::Alloc(dev_ctx, bytes); + int* g_output_dims = reinterpret_cast(output_dims_ptr->ptr()); + memory::Copy(gplace, g_output_dims, cplace, v_output_dims.data(), bytes, + ctx.stream()); + + int block = 512; + int n = slice_size * remain_numel; + int grid = (n + block - 1) / block; + + ScatterNdCUDAKernel<<< + grid, block, 0, + reinterpret_cast(ctx).stream()>>>( + p_update, p_index, p_output, g_output_dims, remain_numel, slice_size, + end_size); +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/scatter.h b/paddle/fluid/operators/scatter.h index 680dc282..2a88b96d 100644 --- a/paddle/fluid/operators/scatter.h +++ b/paddle/fluid/operators/scatter.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -73,10 +73,16 @@ elementwise_inner_add(const framework::ExecutionContext& ctx, template void ScatterAssign(const platform::DeviceContext& ctx, const Tensor& src, const Tensor& index, Tensor* output) { - PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace())); + PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true); // check index of shape 1-D - PADDLE_ENFORCE(index.dims().size() == 1 || - (index.dims().size() == 2 && index.dims()[1] == 1)); + if (index.dims().size() == 2) { + PADDLE_ENFORCE_EQ(index.dims()[1], 1, + "index.dims()[1] should be 1 when index.dims().size() == " + "2 in scatter_op."); + } else { + PADDLE_ENFORCE_EQ(index.dims().size(), 1, + "index.dims().size() should be 1 or 2 in scatter_op."); + } int index_size = index.dims()[0]; auto src_dims = src.dims(); @@ -88,7 +94,7 @@ void ScatterAssign(const platform::DeviceContext& ctx, const Tensor& src, // check src shape and dst shape should match for (int i = 1; i < src_dims.size(); i++) - PADDLE_ENFORCE(src_dims[i] == dst_dims[i]); + PADDLE_ENFORCE_EQ(src_dims[i], dst_dims[i]); // slice size size_t slice_size = 1; @@ -105,10 +111,12 @@ void ScatterAssign(const platform::DeviceContext& ctx, const Tensor& src, template void ScatterAssignAdd(const framework::ExecutionContext& ctx, const Tensor& src, const Tensor& index, Tensor* output) { - PADDLE_ENFORCE(platform::is_cpu_place(ctx.device_context().GetPlace())); + PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.device_context().GetPlace()), + true); // check index of shape 1-D PADDLE_ENFORCE(index.dims().size() == 1 || - (index.dims().size() == 2 && index.dims()[1] == 1)); + (index.dims().size() == 2 && index.dims()[1] == 1), + ""); int index_size = index.dims()[0]; auto src_dims = src.dims(); @@ -122,7 +130,7 @@ void ScatterAssignAdd(const framework::ExecutionContext& ctx, const Tensor& src, // check src shape and dst shape should match for (int i = 1; i < src_dims.size(); i++) - PADDLE_ENFORCE(src_dims[i] == dst_dims[i]); + PADDLE_ENFORCE_EQ(src_dims[i], dst_dims[i]); // slice size size_t slice_size = 1; @@ -136,6 +144,7 @@ void ScatterAssignAdd(const framework::ExecutionContext& ctx, const Tensor& src, memset(result_p_output + slice_size * index_, 0, 
slice_bytes); } + // if not in overwrite mode, need to init output data for (int i = 0; i < index_size; ++i) { const IndexT& index_ = p_index[i]; elementwise_inner_add(ctx, p_src, p_output, result_p_output, src, @@ -144,5 +153,49 @@ void ScatterAssignAdd(const framework::ExecutionContext& ctx, const Tensor& src, } } +template +void ScatterNdAdd(const framework::ExecutionContext& ctx, const Tensor& update, + const Tensor& index, Tensor* output) { + PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.device_context().GetPlace()), + true, "It should be running on the CPU"); + + // update.shape = index.shape[:-1] + output.shape[index.shape[-1]:] + auto index_dims = index.dims(); + auto index_dims_size = index_dims.size(); + + auto output_dims = output->dims(); + auto output_dims_size = output_dims.size(); + + const T* p_update = update.data(); + const IndexT* p_index = index.data(); + T* result_p_output = output->data(); + const T* p_output = output->data(); + + // final dim + int64_t end_size = index_dims[index_dims_size - 1]; + // remain dim + auto remain_ddim = framework::slice_ddim(index_dims, 0, index_dims_size - 1); + int64_t remain_numel = framework::product(remain_ddim); + // slice size + int64_t slice_size = 1; + for (int64_t i = end_size; i < output_dims_size; ++i) { + slice_size *= output_dims[i]; + } + const size_t slice_bytes = slice_size * sizeof(T); + + for (int64_t i = 0; i < remain_numel; ++i) { + IndexT index_ = 0; + IndexT temp = 1; + for (int64_t j = end_size - 1; j >= 0; --j) { + IndexT index_value = p_index[i * end_size + j]; + index_ += (index_value * temp); + temp *= output_dims[j]; + } + elementwise_inner_add(ctx, p_update, p_output, result_p_output, + update, output, i, index_, slice_size, + slice_bytes); + } +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/scatter_nd_add_op.cc b/paddle/fluid/operators/scatter_nd_add_op.cc new file mode 100644 index 00000000..41f18eae --- /dev/null +++ b/paddle/fluid/operators/scatter_nd_add_op.cc @@ -0,0 +1,186 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/scatter_nd_add_op.h" +#include +#include +#include "paddle/fluid/framework/ddim.h" + +namespace paddle { +namespace operators { + +class ScatterNdAddOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, + "Input(X) of ScatterNdAddOp should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true, + "Input(Index) of ScatterNdAddOp should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("Updates"), true, + "Input(Updates) of ScatterNdAddOp should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, + "Output(Out) of ScatterNdAddOp should not be null."); + + auto ref_dims = ctx->GetInputDim("X"); + auto ref_dims_size = ref_dims.size(); + auto index_dims = ctx->GetInputDim("Index"); + auto index_dims_size = index_dims.size(); + auto updates_dims = ctx->GetInputDim("Updates"); + auto updates_dims_size = updates_dims.size(); + + PADDLE_ENFORCE_LE( + index_dims[index_dims_size - 1], ref_dims_size, + "Input(Index).shape[-1] should be no greater than Input(X).rank"); + PADDLE_ENFORCE_GE(index_dims_size, 2UL, + "The rank of Input(Index) should be greater than 1"); + + // update.shape = index.shape[:-1] + output.shape[index.shape[-1]:] + std::vector r_updates_dims; + for (int64_t i = 0; i < index_dims_size - 1; ++i) { + r_updates_dims.emplace_back(index_dims[i]); + } + for (int64_t i = index_dims[index_dims_size - 1]; i < ref_dims_size; ++i) { + r_updates_dims.emplace_back(ref_dims[i]); + } + + PADDLE_ENFORCE_EQ(r_updates_dims.size(), updates_dims_size, + "Updates has wrong shape"); + + for (int64_t i = 0; i < updates_dims_size; ++i) { + PADDLE_ENFORCE_EQ(r_updates_dims[i], updates_dims[i], + "Updates has wrong shape"); + } + ctx->SetOutputDim("Out", ref_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE_EQ(ctx.Input("X")->type(), + ctx.Input("Updates")->type(), + "Ref and Updates must have same type"); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); + } +}; + +class ScatterNdAddGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + if (ctx->HasOutput(framework::GradVarName("Updates"))) { + ctx->SetOutputDim(framework::GradVarName("Updates"), + ctx->GetInputDim("Updates")); + } + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), + ctx->GetInputDim(framework::GradVarName("Out"))); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.device_context()); + } +}; + +class ScatterNdAddOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The source input of scatter_nd_add op"); + AddInput("Index", + "The index input of scatter_nd_add op where X will be updated"); + AddInput("Updates", "The updated value of scatter_nd_add op"); + AddOutput("Out", "The output of scatter_nd_add op"); + AddComment(R"DOC( +Scatter_nd_add Operator. + +Output is obtained by applying sparse addition to a single value or slice in a Variable. 
+ + Given: + * Case 1: + ref = [0, 1, 2, 3, 4, 5] + index = [[1], [2], [3], [1]] + updates = [9, 10, 11, 12] + + we get: + + output = [0, 22, 12, 14, 4, 5] + + * Case 2: + ref = [[65, 17], [-14, -25]] + index = [[], []] + updates = [[[-1, -2], [1, 2]], + [[3, 4], [-3, -4]]] + ref.shape = (2, 2) + index.shape = (2, 0) + updates.shape = (2, 2, 2) + + we get: + + output = [[67, 19], [-16, -27]] +)DOC"); + } +}; + +class ScatterNdAddGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("scatter_nd_add_grad"); + op->SetInput("Index", Input("Index")); + op->SetInput("Updates", Input("Updates")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Updates"), InputGrad("Updates")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ScatterNdAddGradNoNeedBufferVarsInference, + "Updates"); + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(scatter_nd_add, ops::ScatterNdAddOp, ops::ScatterNdAddOpMaker, + ops::ScatterNdAddGradDescMaker); + +REGISTER_OPERATOR(scatter_nd_add_grad, ops::ScatterNdAddGradOp, + ops::ScatterNdAddGradNoNeedBufferVarsInference); + +REGISTER_OP_CPU_KERNEL(scatter_nd_add, ops::ScatterNdAddOpKernel, + ops::ScatterNdAddOpKernel, + ops::ScatterNdAddOpKernel, + ops::ScatterNdAddOpKernel, + ops::ScatterNdAddOpKernel); + +REGISTER_OP_CPU_KERNEL(scatter_nd_add_grad, + ops::ScatterNdAddGradientOpKernel, + ops::ScatterNdAddGradientOpKernel, + ops::ScatterNdAddGradientOpKernel, + ops::ScatterNdAddGradientOpKernel, + ops::ScatterNdAddGradientOpKernel); diff --git a/paddle/fluid/operators/scatter_nd_add_op.cu b/paddle/fluid/operators/scatter_nd_add_op.cu new file mode 100644 index 00000000..ecd9beb1 --- /dev/null +++ b/paddle/fluid/operators/scatter_nd_add_op.cu @@ -0,0 +1,98 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/gather.cu.h" +#include "paddle/fluid/operators/gather_op.h" +#include "paddle/fluid/operators/scatter.cu.h" +#include "paddle/fluid/operators/scatter_nd_add_op.h" + +namespace paddle { +namespace operators { + +template +class ScatterNdAddOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, + "This kernel only runs on GPU device."); + auto *X = ctx.Input("X"); + auto *Ids = ctx.Input("Index"); + auto *Updates = ctx.Input("Updates"); + auto *Out = ctx.Output("Out"); + + framework::TensorCopySync(*X, ctx.GetPlace(), Out); + const auto &index_type = Ids->type(); + bool index_type_match = index_type == framework::proto::VarType::INT32 || + index_type == framework::proto::VarType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, true, + "Index holds the wrong type, it holds %s, but desires to be %s or %s", + paddle::framework::DataTypeToString(index_type), + paddle::framework::DataTypeToString(framework::proto::VarType::INT32), + paddle::framework::DataTypeToString(framework::proto::VarType::INT64)); + if (index_type == framework::proto::VarType::INT32) { + GPUScatterNdAdd(ctx, *Updates, *Ids, Out); + } else { + GPUScatterNdAdd(ctx, *Updates, *Ids, Out); + } + } +}; + +template +class ScatterNdAddGradOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true, + "This kernel only runs on GPU device."); + auto *dX = ctx.Output(framework::GradVarName("X")); + auto *dUpdates = ctx.Output(framework::GradVarName("Updates")); + auto *Ids = ctx.Input("Index"); + auto *dOut = ctx.Input(framework::GradVarName("Out")); + if (dX) { + // In place gradient: dX = dO + framework::TensorCopy(*dOut, ctx.GetPlace(), dX); + } + if (dUpdates) { + dUpdates->mutable_data(ctx.GetPlace()); + // Gradient by Gather + const auto &index_type = Ids->type(); + if (index_type == framework::proto::VarType::INT32) { + GPUGatherNd(ctx, *dOut, *Ids, dUpdates); + } else { + GPUGatherNd(ctx, *dOut, *Ids, dUpdates); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CUDA = paddle::platform::CUDADeviceContext; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(scatter_nd_add, + ops::ScatterNdAddOpCUDAKernel, + ops::ScatterNdAddOpCUDAKernel, + ops::ScatterNdAddOpCUDAKernel, + ops::ScatterNdAddOpCUDAKernel, + ops::ScatterNdAddOpCUDAKernel); + +REGISTER_OP_CUDA_KERNEL(scatter_nd_add_grad, + ops::ScatterNdAddGradOpCUDAKernel, + ops::ScatterNdAddGradOpCUDAKernel, + ops::ScatterNdAddGradOpCUDAKernel, + ops::ScatterNdAddGradOpCUDAKernel, + ops::ScatterNdAddGradOpCUDAKernel); diff --git a/paddle/fluid/operators/scatter_nd_add_op.h b/paddle/fluid/operators/scatter_nd_add_op.h new file mode 100644 index 00000000..4b90fa1c --- /dev/null +++ b/paddle/fluid/operators/scatter_nd_add_op.h @@ -0,0 +1,86 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/gather.h" +#include "paddle/fluid/operators/scatter.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class ScatterNdAddOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, + "This kernel only runs on CPU."); + auto *X = ctx.Input("X"); + auto *Ids = ctx.Input("Index"); + auto *Updates = ctx.Input("Updates"); + auto *Out = ctx.Output("Out"); + + // In place output: Out = X + framework::TensorCopySync(*X, ctx.GetPlace(), Out); + const auto &index_type = Ids->type(); + bool index_type_match = index_type == framework::proto::VarType::INT32 || + index_type == framework::proto::VarType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, true, + "Index holds the wrong type, it holds %s, but desires to be %s or %s", + paddle::framework::DataTypeToString(index_type), + paddle::framework::DataTypeToString(framework::proto::VarType::INT32), + paddle::framework::DataTypeToString(framework::proto::VarType::INT64)); + + if (index_type == framework::proto::VarType::INT32) { + ScatterNdAdd(ctx, *Updates, *Ids, Out); + } else { + ScatterNdAdd(ctx, *Updates, *Ids, Out); + } + } +}; + +template +class ScatterNdAddGradientOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, + "This kernel only runs on CPU."); + auto *dX = ctx.Output(framework::GradVarName("X")); + auto *dUpdates = ctx.Output(framework::GradVarName("Updates")); + auto *Ids = ctx.Input("Index"); + auto *dOut = ctx.Input(framework::GradVarName("Out")); + + if (dX) { + // In place gradient: dX = dO + framework::TensorCopySync(*dOut, ctx.GetPlace(), dX); + } + if (dUpdates) { + dUpdates->mutable_data(ctx.GetPlace()); + // Gradient by Gather: dUpdates = dO[Ids] + const auto &index_type = Ids->type(); + if (index_type == framework::proto::VarType::INT32) { + CPUGatherNd(ctx.device_context(), *dOut, *Ids, dUpdates); + } else { + CPUGatherNd(ctx.device_context(), *dOut, *Ids, dUpdates); + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/scatter_op.cu b/paddle/fluid/operators/scatter_op.cu index e17617b4..6c4da760 100644 --- a/paddle/fluid/operators/scatter_op.cu +++ b/paddle/fluid/operators/scatter_op.cu @@ -33,7 +33,22 @@ class ScatterOpCUDAKernel : public framework::OpKernel { bool overwrite = ctx.Attr("overwrite"); Out->ShareDataWith(*X); - GPUScatterAssign(ctx, *Updates, *Ids, Out, overwrite); + // use template class to support int32_t and int64_t + const auto &index_type = Ids->type(); + bool index_type_match = index_type == framework::proto::VarType::INT32 || + index_type == framework::proto::VarType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, true, + "scatter_op Index holds the wrong type, it holds %s, but desires to be " + "%s or %s", + 
paddle::framework::DataTypeToString(index_type), + paddle::framework::DataTypeToString(framework::proto::VarType::INT32), + paddle::framework::DataTypeToString(framework::proto::VarType::INT64)); + if (index_type == framework::proto::VarType::INT32) { + GPUScatterAssign(ctx, *Updates, *Ids, Out, overwrite); + } else { + GPUScatterAssign(ctx, *Updates, *Ids, Out, overwrite); + } } }; @@ -54,7 +69,23 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel { if (dUpdates) { dUpdates->mutable_data(ctx.GetPlace()); // Gradient by Gather: dUpdates = dO[Ids] - GPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); + const auto &index_type = Ids->type(); + bool index_type_match = index_type == framework::proto::VarType::INT32 || + index_type == framework::proto::VarType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, true, + "scatter_op Index holds the wrong type, it holds %s, but desires to " + "be %s or %s", + paddle::framework::DataTypeToString(index_type), + paddle::framework::DataTypeToString(framework::proto::VarType::INT32), + paddle::framework::DataTypeToString( + framework::proto::VarType::INT64)); + // Gradient by Gather: dUpdates = dO[Ids] + if (index_type == framework::proto::VarType::INT32) { + GPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); + } else { + GPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); + } } } }; diff --git a/paddle/fluid/operators/scatter_op.h b/paddle/fluid/operators/scatter_op.h index 3b6184de..97254f81 100644 --- a/paddle/fluid/operators/scatter_op.h +++ b/paddle/fluid/operators/scatter_op.h @@ -81,7 +81,22 @@ class ScatterGradientOpKernel : public framework::OpKernel { if (dUpdates) { dUpdates->mutable_data(ctx.GetPlace()); // Gradient by Gather: dUpdates = dO[Ids] - CPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); + const auto &index_type = Ids->type(); + bool index_type_match = index_type == framework::proto::VarType::INT32 || + index_type == framework::proto::VarType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, true, + "scatter_op index holds the wrong type, it holds %s, but desires to " + "be %s or %s", + paddle::framework::DataTypeToString(index_type), + paddle::framework::DataTypeToString(framework::proto::VarType::INT32), + paddle::framework::DataTypeToString( + framework::proto::VarType::INT64)); + if (index_type == framework::proto::VarType::INT32) { + CPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); + } else { + CPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); + } } } }; diff --git a/paddle/fluid/operators/search_compute.h b/paddle/fluid/operators/search_compute.h new file mode 100644 index 00000000..c795f1e3 --- /dev/null +++ b/paddle/fluid/operators/search_compute.h @@ -0,0 +1,138 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include +#include + +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/dynload/mklml.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +template +void call_gemm(const math::BlasT& blas, + const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const T alpha, const T* A, + const T* B, const T beta, T* C) { + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + blas.GEMM(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); +} + +template +void call_gemm(const framework::ExecutionContext& ctx, + const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const T alpha, const T* A, + const T* B, const T beta, T* C) { + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + auto blas = math::GetBlas(ctx); + blas.GEMM(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); +} + +template +void call_gemm_with_lda(const math::BlasT& blas, + const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, + const int K, const T alpha, const T* A, const T* B, + const T beta, T* C, int lda) { + int ldb = (TransB == CblasNoTrans) ? N : K; + + blas.GEMM(TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); +} + +template +void call_gemm_batched(const framework::ExecutionContext& ctx, + const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const int M, const int N, + const int K, const T alpha, const T** A, const T** B, + const T beta, T** C, const int batch) { + for (int i = 0; i < batch; ++i) { + call_gemm(ctx, TransA, TransB, M, N, K, alpha, A[i], B[i], beta, C[i]); + } +} + +#ifndef TYPE_USE_FLOAT +#define TYPE_USE_FLOAT +#endif +#ifndef USE_SSE +#define USE_SSE +#endif + +#if defined(TYPE_USE_FLOAT) + +#define __m256x __m256 +#define __m128x __m128 + +static const unsigned int AVX_STEP_SIZE = 8; +static const unsigned int SSE_STEP_SIZE = 4; +static const unsigned int AVX_CUT_LEN_MASK = 7U; +static const unsigned int SSE_CUT_LEN_MASK = 3U; + +#define _mm256_mul_px _mm256_mul_ps +#define _mm256_add_px _mm256_add_ps +#define _mm256_load_px _mm256_loadu_ps +#define _mm256_store_px _mm256_storeu_ps +#define _mm256_broadcast_sx _mm256_broadcast_ss + +#define _mm_add_px _mm_add_ps +#define _mm_mul_px _mm_mul_ps +#define _mm_load_px _mm_loadu_ps +#define _mm_store_px _mm_storeu_ps +#define _mm_load1_px _mm_load1_ps + +#endif + +template +inline void sse_axpy(const T* x, T* y, size_t len, const T alpha) { + unsigned int jjj, lll; + jjj = lll = 0; + +#if defined(USE_AVX) + lll = len & ~AVX_CUT_LEN_MASK; + __m256x mm_alpha = _mm256_broadcast_sx(&alpha); + for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) { + _mm256_store_px( + y + jjj, + _mm256_add_px(_mm256_load_px(y + jjj), + _mm256_mul_px(mm_alpha, _mm256_load_px(x + jjj)))); + } + +#elif defined(USE_SSE) + lll = len & ~SSE_CUT_LEN_MASK; + __m128x mm_alpha = _mm_load1_px(&alpha); + for (jjj = 0; jjj < lll; jjj += SSE_STEP_SIZE) { + _mm_store_px(y + jjj, + _mm_add_px(_mm_load_px(y + jjj), + _mm_mul_px(mm_alpha, _mm_load_px(x + jjj)))); + } + +#endif + for (; jjj < len; jjj++) { + y[jjj] += alpha * x[jjj]; + } +} + +} // namespace operators +} // namespace paddle diff --git 
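The blocking logic in sse_axpy above deserves a note: len & ~AVX_CUT_LEN_MASK rounds len down to a multiple of the 8-float AVX width (len & ~SSE_CUT_LEN_MASK rounds to a multiple of 4 for SSE), the vector loop covers that prefix, and the trailing scalar loop handles the remainder. A scalar sketch of the same split, for illustration only:

    #include <cstddef>

    void axpy_ref(const float* x, float* y, size_t len, float alpha) {
      const size_t kStep = 8;                // AVX width for float
      size_t main_len = len & ~(kStep - 1);  // largest multiple of kStep <= len
      for (size_t j = 0; j < main_len; j += kStep) {
        for (size_t k = 0; k < kStep; ++k) {  // stands in for one _mm256 step
          y[j + k] += alpha * x[j + k];
        }
      }
      for (size_t j = main_len; j < len; ++j) {  // scalar tail
        y[j] += alpha * x[j];
      }
    }

Since _mm256_load_px / _mm256_store_px map to the unaligned loadu/storeu intrinsics, callers do not need to align x or y.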
a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc index 33d24c11..a7225adb 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op.cc @@ -27,7 +27,7 @@ class SequenceMaskOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) must exist"); int maxlen = ctx->Attrs().Get("maxlen"); - auto dim = framework::vectorize2int(ctx->GetInputDim("X")); + auto dim = framework::vectorize(ctx->GetInputDim("X")); if (ctx->HasInputs("MaxLenTensor")) { dim.push_back(-1); diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op.h b/paddle/fluid/operators/sequence_ops/sequence_mask_op.h index a92c5de6..abddc685 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op.h @@ -89,7 +89,7 @@ class SequenceMaskKernel : public framework::OpKernel { maxlen = *max_len_tensor->data(); } - auto y_dim = framework::vectorize2int(x->dims()); + auto y_dim = framework::vectorize(x->dims()); y_dim.push_back(maxlen); y->Resize(framework::make_ddim(y_dim)); @@ -110,7 +110,7 @@ class SequenceMaskKernel : public framework::OpKernel { #else maxlen = static_cast(*std::max_element(x_data, x_data + x_numel)); #endif - auto y_dim = framework::vectorize2int(x->dims()); + auto y_dim = framework::vectorize(x->dims()); y_dim.push_back(maxlen); y->Resize(framework::make_ddim(y_dim)); } diff --git a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc index 5290d0e6..fcc49096 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc @@ -25,24 +25,25 @@ class SequencePadOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of SequencePadOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("PadValue"), - "Input(PadValue) of SequencePadOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of SequencePadOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Length"), - "Output(Length) of SequencePadOp should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, + "Input(X) of SequencePadOp should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("PadValue"), true, + "Input(PadValue) of SequencePadOp should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, + "Output(Out) of SequencePadOp should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Length"), true, + "Output(Length) of SequencePadOp should not be null."); auto x_dims = ctx->GetInputDim("X"); PADDLE_ENFORCE_GE(x_dims.size(), 2, "The rank of Input(X) can't be less than 2."); auto time_step_dims = framework::slice_ddim(x_dims, 1, x_dims.size()); auto pad_value_dims = ctx->GetInputDim("PadValue"); - PADDLE_ENFORCE(pad_value_dims == framework::make_ddim({1}) || - pad_value_dims == time_step_dims, - "The Input(PadValue) must be a scalar or a tensor whose " - "shape equals to time steps in sequences"); + PADDLE_ENFORCE_EQ(pad_value_dims == framework::make_ddim({1}) || + pad_value_dims == time_step_dims, + true, + "The Input(PadValue) must be a scalar or a tensor whose " + "shape equals to time steps in sequences"); int out_dim_0 = -1; @@ -52,7 +53,8 @@ class SequencePadOp : public framework::OperatorWithKernel { framework::Variable* x_var = 
boost::get(ctx->GetInputVarPtrs("X")[0]); const auto& x_lod = x_var->Get().lod(); - PADDLE_ENFORCE(!x_lod.empty(), "The Input(X) must hold lod info."); + PADDLE_ENFORCE_EQ(x_lod.empty(), false, + "The Input(X) must hold lod info."); const auto& x_lod_0 = x_lod[0]; PADDLE_ENFORCE_GE(x_lod_0.size(), 2, "The Input(X)'s lod info is corrupted."); @@ -80,8 +82,8 @@ class SequencePadOp : public framework::OperatorWithKernel { } std::vector out_dims_vec{out_dim_0, padded_length}; - std::vector len_dims_vec{out_dim_0, 1}; - auto time_step_dims_vec = framework::vectorize2int(time_step_dims); + std::vector len_dims_vec{out_dim_0}; + auto time_step_dims_vec = framework::vectorize(time_step_dims); out_dims_vec.insert(out_dims_vec.end(), time_step_dims_vec.begin(), time_step_dims_vec.end()); ctx->SetOutputDim("Out", framework::make_ddim(out_dims_vec)); @@ -143,7 +145,7 @@ class SequencePadOpMaker : public framework::OpProtoAndCheckerMaker { then we get LoDTensor: Out.data = [[a, b, 0, 0], [c, d, e, 0]] - Length.data = [[2], [3]] + Length.data = [2, 3] Case 2: @@ -157,7 +159,7 @@ class SequencePadOpMaker : public framework::OpProtoAndCheckerMaker { then we get LoDTensor: Out.data = [[[a1, a2], [b1, b2], [0, 0]], [[c1, c2], [d1, d2], [e1, e2]]] - Length.data = [[2], [3]] + Length.data = [2, 3] Case 3: @@ -171,7 +173,7 @@ class SequencePadOpMaker : public framework::OpProtoAndCheckerMaker { then we get LoDTensor: Out.data = [[[a1, a2], [b1, b2], [p1, p2]], [[c1, c2], [d1, d2], [e1, e2]]] - Length.data = [[2], [3]] + Length.data = [2, 3] )DOC"); } @@ -182,10 +184,11 @@ class SequencePadGradOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of SequencePadGradOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) of SequencePadGradOp should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, + "Input(X) of SequencePadGradOp should not be null."); + PADDLE_ENFORCE_EQ( + ctx->HasInput(framework::GradVarName("Out")), true, + "Input(Out@GRAD) of SequencePadGradOp should not be null."); if (ctx->HasOutput(framework::GradVarName("X"))) { ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc index f3193fdc..51e354dc 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc @@ -24,14 +24,15 @@ class SequencePoolOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of SequencePoolOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of SequencePoolOp should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, + "Input(X) of SequencePoolOp should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, + "Output(Out) of SequencePoolOp should not be null."); ctx->SetOutputDim("Out", ctx->GetInputDim("X")); if (ctx->Attrs().Get("pooltype") == "MAX") { - PADDLE_ENFORCE(ctx->HasOutput("MaxIndex"), - "Output(MaxIndex) of SequencePoolOp should not be null."); + PADDLE_ENFORCE_EQ( + ctx->HasOutput("MaxIndex"), true, + "Output(MaxIndex) of SequencePoolOp should not be null."); 
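The sequence_pad change above narrows Output(Length) from a [batch_size, 1] tensor to a rank-1 [batch_size] tensor, which is exactly what the updated Length.data = [2, 3] examples in the DOC string show. A small illustration using the Case 1 data:

    // X.lod  = {{0, 2, 5}}           -> two sequences, lengths 2 and 3
    // before: Length.dims = [2, 1],  Length.data = [[2], [3]]
    // after:  Length.dims = [2],     Length.data = [2, 3]

The matching rank-1 check on Input(Length) of sequence_unpad appears later in this patch.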
ctx->SetOutputDim("MaxIndex", ctx->GetInputDim("X")); } } @@ -102,9 +103,10 @@ class SequencePoolGradOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Gradient of Out should not be null."); - PADDLE_ENFORCE(ctx->HasInput("X"), "The input X should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, + "Gradient of Out should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, + "The input X should not be null."); auto og_dims = ctx->GetInputDim(framework::GradVarName("Out")); auto x_dims = ctx->GetInputDim("X"); PADDLE_ENFORCE_EQ(og_dims.size(), x_dims.size(), diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.h b/paddle/fluid/operators/sequence_ops/sequence_pool_op.h index c3273480..3eec4df1 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.h @@ -30,19 +30,30 @@ class SequencePoolKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto* in = context.Input("X"); - auto* out = context.Output("Out"); + auto* out = context.Output("Out"); std::string pooltype = context.Attr("pooltype"); T pad_value = static_cast(context.Attr("pad_value")); auto dims = in->dims(); auto lod = in->lod(); + auto lod_level = lod.size(); // InferShape by lod - PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now."); + PADDLE_ENFORCE_GE(lod_level, 1UL, + "The lod level of input shall be 1 at least."); + PADDLE_ENFORCE_LE(lod_level, 2UL, + "The lod level of input shall be no more than 2."); PADDLE_ENFORCE_GE( dims[0], - /*batch size = */ static_cast(lod[0].size() - 1), + /*batch size = */ static_cast(lod[lod_level - 1].size() - 1), "The first dimension of Input(X) must be large than batch size."); - dims[0] = lod[0].size() - 1; + if (lod_level > 1UL) { + PADDLE_ENFORCE_EQ(lod[0][lod[0].size() - 1], lod[1].size() - 1, + "The input lod information is illegal."); + framework::LoD out_lod; + out_lod.push_back(lod[0]); + out->set_lod(out_lod); + } + dims[0] = lod[lod_level - 1].size() - 1; out->Resize({dims}); out->mutable_data(context.GetPlace()); Tensor* index = nullptr; @@ -68,7 +79,7 @@ template class SequencePoolGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* out_g = context.Input(framework::GradVarName("Out")); + auto* out_g = context.Input(framework::GradVarName("Out")); auto* in_g = context.Output(framework::GradVarName("X")); std::string pooltype = context.Attr("pooltype"); const Tensor* index = nullptr; diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.h b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.h index ed49e947..0555e4ee 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.h @@ -97,6 +97,9 @@ class SequenceSoftmaxKernel : public framework::OpKernel { auto dims = x->dims(); const size_t level = lod.size() - 1; + PADDLE_ENFORCE_GT( + lod.size(), 0U, + "The LoD level of Input X should be larger than 0 (lod.size() > 0)."); PADDLE_ENFORCE_EQ(dims[0], static_cast(lod[level].back()), "The first dimension of Input(X) should be equal to the " "sum of all sequences' lengths."); diff --git 
a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc
new file mode 100644
index 00000000..232f324d
--- /dev/null
+++ b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.cc
@@ -0,0 +1,130 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h"
+#include
+#include
+
+namespace paddle {
+namespace operators {
+
+class SequenceTopkAvgPoolingOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE_EQ(
+        ctx->HasInput("X"), true,
+        "Input(X) of SequenceTopkAvgPoolingOp should not be null.");
+    PADDLE_ENFORCE_EQ(
+        ctx->HasInput("ROW"), true,
+        "Input(ROW) of SequenceTopkAvgPoolingOp should not be null.");
+    PADDLE_ENFORCE_EQ(
+        ctx->HasInput("COLUMN"), true,
+        "Input(COLUMN) of SequenceTopkAvgPoolingOp should not be null.");
+    PADDLE_ENFORCE_EQ(
+        ctx->HasOutput("Out"), true,
+        "Output(Out) of SequenceTopkAvgPoolingOp should not be null.");
+    PADDLE_ENFORCE_EQ(
+        ctx->HasOutput("pos"), true,
+        "Output(pos) of SequenceTopkAvgPoolingOp should not be null.");
+
+    auto attr = ctx->Attrs();
+    auto channel_num = attr.Get("channel_num");
+    auto topks = attr.Get>("topks");
+
+    auto row_dim = ctx->GetInputDim("ROW");
+
+    auto num_k = topks.size();
+    auto row_shape_0 = row_dim[0];
+
+    std::vector vec_out_shape;
+    vec_out_shape.push_back(row_shape_0);
+    vec_out_shape.push_back(channel_num * num_k);
+
+    ctx->SetOutputDim("Out", framework::make_ddim(vec_out_shape));
+    ctx->ShareLoD("X", "Out");
+  }
+};
+
+class SequenceTopkAvgPoolingOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput(
+        "X",
+        "(LoDTensor) The variable-length input of SequenceTopkAvgPoolingOp");
+    AddInput("ROW", "(LoDTensor) the row info");
+    AddInput("COLUMN", "(LoDTensor) the column info");
+    AddOutput(
+        "Out",
+        "(Tensor) The output of SequenceTopkAvgPoolingOp does not contain LoD "
+        "information.");
+    AddOutput("pos", "(Tensor) stores the top-k indices").AsIntermediate();
+    AddAttr>("topks", "topks");
+    AddAttr("channel_num", "channel number");
+    AddComment(R"DOC(
+    sequence topk average pooling op
+    )DOC");
+  }
+};
+
+class SequenceTopkAvgPoolingGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true,
+                      "Gradient of Out should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
+                      "The input X should not be null.");
+
+    ctx->ShareDim("X", /*->*/ framework::GradVarName("X"));
+    ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto data_type =
framework::GetDataTypeOfVar(ctx.InputVar("X")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class SequenceTopkAvgPoolGradOpMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op_desc_ptr = new framework::OpDesc(); + op_desc_ptr->SetType("sequence_topk_avg_pooling_grad"); + op_desc_ptr->SetInput("X", Input("X")); + op_desc_ptr->SetInput("ROW", Input("ROW")); + op_desc_ptr->SetInput("COLUMN", Input("COLUMN")); + op_desc_ptr->SetInput("pos", Output("pos")); + op_desc_ptr->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op_desc_ptr->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op_desc_ptr->SetAttrMap(Attrs()); + return std::unique_ptr(op_desc_ptr); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(sequence_topk_avg_pooling, ops::SequenceTopkAvgPoolingOp, + ops::SequenceTopkAvgPoolingOpMaker, + ops::SequenceTopkAvgPoolGradOpMaker); +REGISTER_OPERATOR(sequence_topk_avg_pooling_grad, + ops::SequenceTopkAvgPoolingGradOp); +REGISTER_OP_CPU_KERNEL(sequence_topk_avg_pooling, + ops::SequenceTopkAvgPoolingKernel< + paddle::platform::CPUDeviceContext, float>); +REGISTER_OP_CPU_KERNEL(sequence_topk_avg_pooling_grad, + ops::SequenceTopkAvgPoolingGradKernel< + paddle::platform::CPUDeviceContext, float>); diff --git a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h new file mode 100644 index 00000000..c6bfdea8 --- /dev/null +++ b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h @@ -0,0 +1,213 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +template +void get_topk_pos(const T* data, int length, int k, int* pos) { + size_t real_k = k < length ? 
k : length; + + std::vector v(data, data + length); + + std::vector topk_pos; + T min_val = std::numeric_limits::lowest(); + while (topk_pos.size() < real_k) { + T max_val = min_val; + int max_pos = -1; + for (int i = 0; i < length; ++i) { + if (v[i] > max_val) { + max_pos = i; + max_val = v[i]; + } + } + + assert(max_pos >= 0); + + topk_pos.push_back(max_pos); + v[max_pos] = min_val; + } + + assert(topk_pos.size() > 0); + while (topk_pos.size() < (size_t)k) { + topk_pos.push_back(-1); + } + + for (size_t i = 0; i < topk_pos.size(); ++i) { + pos[i] = topk_pos[i]; + } +} + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +template +class SequenceTopkAvgPoolingKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* row = context.Input("ROW"); + auto* col = context.Input("COLUMN"); + auto* out = context.Output("Out"); + auto* pos = context.Output("pos"); + + auto channel_num = context.Attr("channel_num"); + auto topks = context.Attr>("topks"); + auto k_num = topks.size(); + auto max_k = topks[topks.size() - 1]; + std::vector vec_pos_shape; + auto in_lod = in->lod()[0]; + + auto row_lod = row->lod()[0]; + auto col_lod = col->lod()[0]; + int batch_size = row_lod.size() - 1; + int pos_total_size = row_lod[batch_size] * channel_num * max_k; + vec_pos_shape.push_back(pos_total_size); + pos->Resize({framework::make_ddim(vec_pos_shape)}); + auto pos_data = pos->mutable_data(context.GetPlace()); + + int offset = 0; + framework::Vector vec_out_lod; + vec_out_lod.reserve(batch_size + 1); + for (int i = 0; i <= batch_size; ++i) { + offset = row_lod[i]; + vec_out_lod.push_back(offset); + } + + framework::LoD lod_temp; + lod_temp.push_back(vec_out_lod); + out->set_lod(lod_temp); + + auto din_data = in->data(); + auto dout_data = out->mutable_data(context.GetPlace()); + + T* sum_data = new T[max_k]; + for (int i = 0; i < batch_size; ++i) { + int total_size = in_lod[i + 1] - in_lod[i]; + int row_size = row_lod[i + 1] - row_lod[i]; + int col_size = col_lod[i + 1] - col_lod[i]; + PADDLE_ENFORCE_EQ(total_size, channel_num * row_size * col_size, + "size wrong in sequence_topk_avg_pooling_op!"); + + int feature_num = row_size * col_size; + for (int j = 0; j < channel_num; ++j) { + auto input_offset_feature_data = din_data + in_lod[i] + j * feature_num; + + for (int r = 0; r < row_size; ++r) { + auto row_data = input_offset_feature_data + r * col_size; + + auto pos_slice_data = pos_data + row_lod[i] * channel_num * max_k + + r * channel_num * max_k + j * max_k; + auto out_slice_data = dout_data + row_lod[i] * channel_num * k_num + + r * channel_num * k_num + j * k_num; + + get_topk_pos(row_data, col_size, max_k, pos_slice_data); + if (pos_slice_data[0] == -1) { + sum_data[0] = 0.0; + } else { + sum_data[0] = row_data[pos_slice_data[0]]; + } + for (int k = 1; k < max_k; ++k) { + if (pos_slice_data[k] == -1) { + sum_data[k] = sum_data[k - 1]; + } else { + sum_data[k] = sum_data[k - 1] + row_data[pos_slice_data[k]]; + } + } + for (size_t k = 0; k < k_num; ++k) { + out_slice_data[k] = sum_data[topks[k] - 1] / topks[k]; + } + } + } + } + delete[] sum_data; + } +}; + +template +class SequenceTopkAvgPoolingGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* d_out = context.Input(framework::GradVarName("Out")); + auto* d_in = 
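Condensing the forward loop above: for each (row, channel) slice the kernel takes the positions of the top max_k values, prefix-sums those values, and emits sum(top-k)/k for every requested k, where a position of -1 (row shorter than k) contributes nothing. A minimal per-row sketch reusing get_topk_pos from this header (TopkAvgRow is a name invented here; topks is assumed ascending, as the kernel's max_k = topks.back() implies):

    #include <vector>

    std::vector<float> TopkAvgRow(const std::vector<float>& row,
                                  const std::vector<int>& topks, int max_k) {
      std::vector<int> pos(max_k);
      get_topk_pos(row.data(), static_cast<int>(row.size()), max_k, pos.data());
      std::vector<float> prefix(max_k, 0.0f);  // running sum of top values
      prefix[0] = (pos[0] == -1) ? 0.0f : row[pos[0]];
      for (int k = 1; k < max_k; ++k) {
        prefix[k] = prefix[k - 1] + ((pos[k] == -1) ? 0.0f : row[pos[k]]);
      }
      std::vector<float> out;  // one average per requested k
      for (int k : topks) {
        out.push_back(prefix[k - 1] / k);
      }
      return out;
    }

The selection in get_topk_pos is O(max_k * length) per row, which is acceptable for the small k values this op targets; a partial sort would scale better if ever needed.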
context.Output(framework::GradVarName("X")); + auto* pos_input = context.Input("pos"); + auto* row_input = context.Input("ROW"); + auto* col_input = context.Input("COLUMN"); + auto* forward_input = context.Input("X"); + + int batch_size = row_input->lod()[0].size() - 1; + auto channel_num = context.Attr("channel_num"); + auto topks = context.Attr>("topks"); + auto k_num = topks.size(); + auto max_k = topks[k_num - 1]; + + auto out_lod = forward_input->lod(); + d_in->set_lod(out_lod); + + d_in->mutable_data(context.GetPlace()); + auto pos_data = pos_input->data(); + auto dout_data = d_out->data(); + + auto& dev_ctx = + context.template device_context(); + math::SetConstant zero; + zero(dev_ctx, d_in, static_cast(0.0)); + + auto din_data = d_in->data(); + + auto out_offset = out_lod[0]; + auto row_lod = row_input->lod()[0]; + auto col_lod = col_input->lod()[0]; + + for (int i = 0; i < batch_size; ++i) { + int row_size = row_lod[i + 1] - row_lod[i]; + int col_size = col_lod[i + 1] - col_lod[i]; + int feature_num = row_size * col_size; + + for (int j = 0; j < channel_num; ++j) { + auto in_offset_feature_data = + din_data + out_offset[i] + j * feature_num; + + for (int r = 0; r < row_size; r++) { + auto row_data = dout_data + row_lod[i] * channel_num * k_num + + r * channel_num * k_num + j * k_num; + auto pos_slice_data = pos_data + row_lod[i] * channel_num * max_k + + r * channel_num * max_k + j * max_k; + auto in_slice_data = in_offset_feature_data + r * col_size; + + for (size_t m = 0; m < k_num; ++m) { + for (int k = 0; k < topks[m]; ++k) { + if (pos_slice_data[k] == -1) { + break; + } else { + in_slice_data[pos_slice_data[k]] += row_data[m] / topks[m]; + } + } + } + } + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc index 6c98a3e8..23581c36 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc @@ -25,22 +25,22 @@ class SequenceUnpadOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of SequenceUnpadOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Length"), - "Input(Length) of SequenceUnpadOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of SequenceUnpadOp should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, + "Input(X) of SequenceUnpadOp should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("Length"), true, + "Input(Length) of SequenceUnpadOp should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, + "Output(Out) of SequenceUnpadOp should not be null."); auto x_dims = ctx->GetInputDim("X"); PADDLE_ENFORCE_GE(x_dims.size(), 2, "The rank of Input(X) can't be less than 2."); auto len_dims = ctx->GetInputDim("Length"); - PADDLE_ENFORCE(len_dims.size() == 2 && len_dims[1] == 1, - "The shape of Input(Length) should be [batch_size, 1]."); - PADDLE_ENFORCE( - len_dims[0] == x_dims[0], + PADDLE_ENFORCE_EQ(len_dims.size(), 1, + "The shape of Input(Length) should be [batch_size]."); + PADDLE_ENFORCE_EQ( + len_dims[0], x_dims[0], "Input(X) and Input(Length) should have the same first dimension."); int64_t out_dim_0 = -1; @@ -96,7 +96,7 @@ class SequenceUnpadOpMaker : public framework::OpProtoAndCheckerMaker { in which there are 3 sequences padded to length 5, and the acutal length 
specified by Input(Length): - Length.data = [[2], [3], [4]], + Length.data = [2, 3, 4], after unpadding, Output(Out) will be: @@ -112,10 +112,10 @@ class SequenceUnpadGradOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of SequenceUnpadGradOp should not be null."); - PADDLE_ENFORCE( - ctx->HasInput(framework::GradVarName("Out")), + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, + "Input(X) of SequenceUnpadGradOp should not be null."); + PADDLE_ENFORCE_EQ( + ctx->HasInput(framework::GradVarName("Out")), true, "Input(Out@GRAD) of SequenceUnpadGradOp should not be null."); if (ctx->HasOutput(framework::GradVarName("X"))) { diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc index 1c272645..c453b03d 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -168,6 +168,12 @@ class SigmoidCrossEntropyWithLogitsGradOpDescMaker } }; +DECLARE_INPLACE_OP_INFERER(SigmoidCrossEntropyWithLogitsInplaceInferer, + {"X", "Out"}); +DECLARE_INPLACE_OP_INFERER(SigmoidCrossEntropyWithLogitsGradInplaceInferer, + {framework::GradVarName("Out"), + framework::GradVarName("X")}); + } // namespace operators } // namespace paddle @@ -175,9 +181,11 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits, ops::SigmoidCrossEntropyWithLogitsOp, ops::SigmoidCrossEntropyWithLogitsOpMaker, - ops::SigmoidCrossEntropyWithLogitsGradOpDescMaker); + ops::SigmoidCrossEntropyWithLogitsGradOpDescMaker, + ops::SigmoidCrossEntropyWithLogitsInplaceInferer); REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits_grad, - ops::SigmoidCrossEntropyWithLogitsGradOp); + ops::SigmoidCrossEntropyWithLogitsGradOp, + ops::SigmoidCrossEntropyWithLogitsGradInplaceInferer); REGISTER_OP_CPU_KERNEL( sigmoid_cross_entropy_with_logits, ops::SigmoidCrossEntropyWithLogitsKernel { bool normalize = context.Attr("normalize"); // Temporary memory - auto &allocator = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); - auto cnt_ptr = allocator.Allocate(Labels->numel() * sizeof(T)); + auto cnt_ptr = memory::Alloc(dev_ctx, Labels->numel() * sizeof(T)); T *counts = reinterpret_cast(cnt_ptr->ptr()); int limit = Out->numel(); @@ -127,7 +126,7 @@ class GPUSigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { GPUSigmoidForward<<>>( X->data(), Labels->data(), ignore_index, limit, out_data, counts); if (normalize) { - auto norm_ptr = allocator.Allocate(sizeof(T)); + auto norm_ptr = memory::Alloc(dev_ctx, sizeof(T)); T *norm = reinterpret_cast(norm_ptr->ptr()); Sum<<<1, kNumCUDAThreads, 0, dev_ctx.stream()>>>( counts, limit, static_cast(1e-5), norm); @@ -152,9 +151,7 @@ class GPUSigmoidCrossEntropyWithLogitsGradKernel auto &dev_ctx = context.cuda_device_context(); // Temporary memory - auto &allocator = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); - auto cnt_ptr = allocator.Allocate(X->numel() * sizeof(T)); + auto cnt_ptr = memory::Alloc(dev_ctx, X->numel() * sizeof(T)); T *counts = reinterpret_cast(cnt_ptr->ptr()); int limit = dX->numel(); @@ -165,7 +162,7 @@ class GPUSigmoidCrossEntropyWithLogitsGradKernel dx_data, counts); bool normalize = context.Attr("normalize"); if (normalize) { - auto norm_ptr = allocator.Allocate(sizeof(T)); + auto norm_ptr = 
memory::Alloc(dev_ctx, sizeof(T)); T *norm = reinterpret_cast(norm_ptr->ptr()); Sum<<<1, kNumCUDAThreads, 0, dev_ctx.stream()>>>( counts, limit, static_cast(1e-5), norm); diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index 6eb61834..4cd7b33a 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/slice_op.h" #include #include +#include #include namespace paddle { @@ -26,44 +27,81 @@ class SliceOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("Input"), - "Input (Input) of slice op should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output (Out) of slice op should not be null."); + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true, + "Input (Input) of slice op should not be null."); + + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, + "Output (Out) of slice op should not be null."); auto in_dims = ctx->GetInputDim("Input"); - PADDLE_ENFORCE(in_dims.size() < 7, - "The rank of input should be less than 7."); + PADDLE_ENFORCE_LT(in_dims.size(), 7, + "The rank of input should be less than 7."); framework::DDim out_dims(in_dims); + auto axes = ctx->Attrs().Get>("axes"); auto starts = ctx->Attrs().Get>("starts"); auto ends = ctx->Attrs().Get>("ends"); + auto infer_flags = ctx->Attrs().Get>("infer_flags"); auto decrease_axis = ctx->Attrs().Get>("decrease_axis"); - PADDLE_ENFORCE_EQ(starts.size(), ends.size()); - PADDLE_ENFORCE_EQ(starts.size(), axes.size()); + auto starts_size = starts.size(); + auto ends_size = ends.size(); + if (infer_flags.empty()) { + // Initialize infer_flags with 1. + // To be compatible with other op tests in which infer_flags is not set. + infer_flags = std::vector(axes.size(), 1); + } + + if (ctx->HasInputs("StartsTensorList")) { + auto StartsTensorList = ctx->Inputs("StartsTensorList"); + PADDLE_ENFORCE_GT(StartsTensorList.size(), 0, + "StartsTensorList size can't be zero"); + starts_size = StartsTensorList.size(); + } + if (ctx->HasInputs("EndsTensorList")) { + auto EndsTensorList = ctx->Inputs("EndsTensorList"); + PADDLE_ENFORCE_GT(EndsTensorList.size(), 0, + "EndsTensorList size can't be zero"); + ends_size = EndsTensorList.size(); + } + + if (ctx->HasInput("StartsTensor") == false) { + PADDLE_ENFORCE_EQ( + starts_size, axes.size(), + "The size of starts must be equal to the size of axes."); + } + if (ctx->HasInput("EndsTensor") == false) { + PADDLE_ENFORCE_EQ(ends_size, axes.size(), + "The size of ends must be equal to the size of axes."); + } + int dim_value, start, end; for (size_t i = 0; i < axes.size(); ++i) { - dim_value = out_dims[axes[i]]; - if (dim_value > 0) { - start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i]; - end = ends[i] < 0 ? 
(ends[i] + dim_value) : ends[i]; - start = std::max(start, 0); - end = std::max(end, 0); - // start = std::min(start, dim_value); - end = std::min(end, dim_value); - // start = std::min(start, end); - PADDLE_ENFORCE_GT(end, start, "end should greater than start"); - out_dims[axes[i]] = end - start; + PADDLE_ENFORCE_LT(static_cast(axes[i]), in_dims.size(), + "The index of dimension in axes must be less " + "than the size of input shape."); + if (infer_flags[i] == -1) { + out_dims[axes[i]] = -1; + } else { + // infer out_dim shape + dim_value = out_dims[axes[i]]; + if (dim_value > 0) { + start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i]; + end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i]; + start = std::max(start, 0); + end = std::max(end, 0); + end = std::min(end, dim_value); + PADDLE_ENFORCE_GT(end, start, "end should greater than start"); + out_dims[axes[i]] = end - start; + } } } - // generate new shape if (decrease_axis.size() > 0) { std::vector new_out_shape; for (size_t i = 0; i < decrease_axis.size(); ++i) { - if (ctx->IsRuntime()) { + if (ctx->IsRuntime() && infer_flags[i] != -1) { PADDLE_ENFORCE_EQ(out_dims[decrease_axis[i]], 1, "decrease dim should be 1"); } @@ -81,7 +119,6 @@ class SliceOp : public framework::OperatorWithKernel { out_dims = framework::make_ddim(new_out_shape); } - ctx->SetOutputDim("Out", out_dims); if (axes[0] != 0) { ctx->ShareLoD("Input", /*->*/ "Out"); @@ -90,28 +127,67 @@ class SliceOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { + const framework::ExecutionContext &ctx) const override { return framework::OpKernelType(ctx.Input("Input")->type(), - ctx.Input("Input")->place()); + ctx.device_context()); + } + framework::OpKernelType GetKernelTypeForVar( + const std::string &var_name, const Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const override { + if (var_name == "StartsTensor" || var_name == "EndsTensor") { + return expected_kernel_type; + } + if (var_name == "StartsTensorList" || var_name == "EndsTensorList") { + return expected_kernel_type; + } + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); } }; class SliceOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("Input", "Tensor of data to extract slices from."); + AddInput("Input", "(Tensor) Tensor of data to extract slices from."); + AddInput("StartsTensor", + "(Tensor, optional) If provided, slice will use this." + "It has the highest priority of StartsTensor, StartsTensorList " + "and attr(starts).") + .AsDispensable(); + AddInput("EndsTensor", + "(Tensor, optional) If provided, slice will use this." + "It has the highest priority of EndsTensor, EndsTensorList and " + "attr(ends).") + .AsDispensable(); + AddInput( + "StartsTensorList", + "(vector>, optional) If provided, slice will use this." + "The shape of the tensor in vector MUST BE [1]." + "It has higher priority compare with attr(starts).") + .AsDuplicable() + .AsDispensable(); + AddInput( + "EndsTensorList", + "(vector>, optional) If provided, slice will use this." + "The shape of the tensor in vector MUST BE [1]." + "It has higher priority compare with attr(ends).") + .AsDuplicable() + .AsDispensable(); AddOutput("Out", "Sliced data tensor."); - AddAttr>( "axes", "(list) Axes that `starts` and `ends` apply to. It's optional." 
"If not present, will be treated as [0, 1, ..., len(`starts`) - 1]."); AddAttr>( "starts", - "(list) Starting indices of corresponding axis in `axes`"); + "(list) Starting indices of corresponding axis in `axes`") + .SetDefault({}); + AddAttr>( + "ends", "(list) Ending indices of corresponding axis in `axes`.") + .SetDefault({}); AddAttr>( - "ends", - "(list) Starting indices of corresponding axis in `axes`."); + "infer_flags", "(list) Flags of inferring dims in attributes.") + .SetDefault({}); AddAttr>("decrease_axis", "(list) decrease_axis") .SetDefault({}); AddComment(R"DOC( @@ -155,22 +231,33 @@ class SliceOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("Input"), "Input should not be null"); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) should not be null"); + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true, "Input should not be null"); + PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, + "Input(Out@GRAD) should not be null"); auto x_dims = ctx->GetInputDim("Input"); auto x_grad_name = framework::GradVarName("Input"); if (ctx->HasOutput(x_grad_name)) { ctx->SetOutputDim(x_grad_name, x_dims); } } - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { + const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( ctx.Input(framework::GradVarName("Out"))->type(), - ctx.GetPlace()); + ctx.device_context()); + } + framework::OpKernelType GetKernelTypeForVar( + const std::string &var_name, const Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const override { + if (var_name == "StartsTensor" || var_name == "EndsTensor") { + return expected_kernel_type; + } + if (var_name == "StartsTensorList" || var_name == "EndsTensorList") { + return expected_kernel_type; + } + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); } }; @@ -180,8 +267,12 @@ class SliceOpGradMaker : public framework::SingleGradOpDescMaker { protected: std::unique_ptr Apply() const override { - auto* bind = new framework::OpDesc(); + auto *bind = new framework::OpDesc(); bind->SetInput("Input", Input("Input")); + bind->SetInput("StartsTensor", Input("StartsTensor")); + bind->SetInput("EndsTensor", Input("EndsTensor")); + bind->SetInput("StartsTensorList", Input("StartsTensorList")); + bind->SetInput("EndsTensorList", Input("EndsTensorList")); bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); bind->SetOutput(framework::GradVarName("Input"), InputGrad("Input")); bind->SetAttrMap(Attrs()); diff --git a/paddle/fluid/operators/slice_op.cu b/paddle/fluid/operators/slice_op.cu index 24a564f9..4db8b837 100644 --- a/paddle/fluid/operators/slice_op.cu +++ b/paddle/fluid/operators/slice_op.cu @@ -65,6 +65,16 @@ class SliceGradKernel>("axes"); auto starts = ctx.Attr>("starts"); + auto list_new_starts_tensor = + ctx.MultiInput("StartsTensorList"); + + if (list_new_starts_tensor.size() > 0) { + starts = get_new_data_from_tensorlist(list_new_starts_tensor); + } else if (ctx.HasInput("StartsTensor")) { + auto* starts_tensor = ctx.Input("StartsTensor"); + starts = get_new_data_from_tensor(starts_tensor); + } + for (size_t i = 0; i < starts.size(); ++i) { if (starts[i] < 0) { starts[i] += 
in_dims[axes[i]]; @@ -84,9 +94,9 @@ class SliceGradKernel(out_dims); thrust::device_vector out_dims_vec(out_shape.begin(), out_shape.end()); - auto in_shape = framework::vectorize2int(in_dims); + auto in_shape = framework::vectorize(in_dims); thrust::device_vector in_dims_vec(in_shape.begin(), in_shape.end()); thrust::device_vector offsets_vec(offsets.begin(), offsets.end()); const int* out_dims_ptr = thrust::raw_pointer_cast(out_dims_vec.data()); diff --git a/paddle/fluid/operators/slice_op.h b/paddle/fluid/operators/slice_op.h index dadc4f13..5f687fed 100644 --- a/paddle/fluid/operators/slice_op.h +++ b/paddle/fluid/operators/slice_op.h @@ -20,6 +20,39 @@ limitations under the License. */ namespace paddle { namespace operators { +using Tensor = framework::Tensor; + +inline std::vector get_new_data_from_tensorlist( + const std::vector& list_new_data_tensor) { + // get tensor from + std::vector vec_new_data; + for (size_t i = 0; i < list_new_data_tensor.size(); ++i) { + auto tensor = list_new_data_tensor[i]; + PADDLE_ENFORCE_EQ(tensor->dims(), framework::make_ddim({1}), + "shape of dim tensor should be [1]"); + if (platform::is_gpu_place(tensor->place())) { + framework::Tensor temp; + TensorCopySync(*tensor, platform::CPUPlace(), &temp); + vec_new_data.push_back(static_cast(*temp.data())); + } else { + vec_new_data.push_back(static_cast(*tensor->data())); + } + } + return vec_new_data; +} +inline std::vector get_new_data_from_tensor( + const Tensor* new_data_tensor) { + std::vector vec_new_data; + auto* new_data = new_data_tensor->data(); + framework::Tensor cpu_starts_tensor; + if (platform::is_gpu_place(new_data_tensor->place())) { + TensorCopySync(*new_data_tensor, platform::CPUPlace(), &cpu_starts_tensor); + new_data = cpu_starts_tensor.data(); + } + vec_new_data = + std::vector(new_data, new_data + new_data_tensor->numel()); + return vec_new_data; +} template class SliceKernel : public framework::OpKernel { @@ -58,8 +91,90 @@ class SliceKernel : public framework::OpKernel { auto out_dims = out->dims(); auto in_dims = in->dims(); - // resize out_dims + auto axes = context.Attr>("axes"); + auto starts = context.Attr>("starts"); + auto ends = context.Attr>("ends"); auto decrease_axis = context.Attr>("decrease_axis"); + auto infer_flags = context.Attr>("infer_flags"); + + auto list_new_ends_tensor = + context.MultiInput("EndsTensorList"); + auto list_new_starts_tensor = + context.MultiInput("StartsTensorList"); + + bool need_infer = false; + if (context.HasInput("StartsTensor") || context.HasInput("EndsTensor")) { + need_infer = true; + } + if (list_new_starts_tensor.size() > 0 || list_new_ends_tensor.size() > 0) { + need_infer = true; + } + + if (need_infer) { + if (context.HasInput("StartsTensor")) { + auto* starts_tensor = context.Input("StartsTensor"); + starts = get_new_data_from_tensor(starts_tensor); + } else if (list_new_starts_tensor.size() > 0) { + starts = get_new_data_from_tensorlist(list_new_starts_tensor); + } + PADDLE_ENFORCE_EQ( + starts.size(), axes.size(), + "The size of starts must be equal to the size of axes."); + if (context.HasInput("EndsTensor")) { + auto* ends_tensor = context.Input("EndsTensor"); + ends = get_new_data_from_tensor(ends_tensor); + } else if (list_new_ends_tensor.size() > 0) { + ends = get_new_data_from_tensorlist(list_new_ends_tensor); + } + PADDLE_ENFORCE_EQ(ends.size(), axes.size(), + "The size of ends must be equal to the size of axes."); + out_dims = in_dims; + int dim_value, start, end; + for (size_t i = 0; i < axes.size(); ++i) { + dim_value 
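The slice kernels above resolve starts/ends from up to three sources; per the SliceOpMaker docs, the runtime StartsTensor/EndsTensor have the highest priority, then the tensor lists, then the compile-time attributes. A condensed sketch of the order used in SliceKernel::Compute (resolve_starts is a name invented here for illustration):

    // Priority: StartsTensor > StartsTensorList > attr(starts).
    std::vector<int> resolve_starts(const framework::ExecutionContext& ctx,
                                    std::vector<int> attr_starts) {
      auto list = ctx.MultiInput<framework::Tensor>("StartsTensorList");
      if (ctx.HasInput("StartsTensor")) {
        return get_new_data_from_tensor(
            ctx.Input<framework::Tensor>("StartsTensor"));
      } else if (!list.empty()) {
        return get_new_data_from_tensorlist(list);
      }
      return attr_starts;  // fall back to the attribute
    }

Each tensor in the list is enforced to have shape [1], and both helpers copy GPU-resident tensors to the host with TensorCopySync before reading them.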
= out_dims[axes[i]]; + if (dim_value > 0) { + // when end = start+1 and start == -1 + if (starts[i] == -1 && ends[i] == 0 && infer_flags[i] == -1) { + auto ret = + std::find(decrease_axis.begin(), decrease_axis.end(), axes[i]); + if (ret != decrease_axis.end()) { + ends[i] = 10000000; + } + } + + start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i]; + end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i]; + start = std::max(start, 0); + end = std::max(end, 0); + end = std::min(end, dim_value); + PADDLE_ENFORCE_GT(end, start, "end should greater than start"); + out_dims[axes[i]] = end - start; + } + } + out->Resize(out_dims); + // generate new shape + if (decrease_axis.size() > 0) { + std::vector new_out_shape; + for (size_t i = 0; i < decrease_axis.size(); ++i) { + PADDLE_ENFORCE_EQ(out_dims[decrease_axis[i]], 1, + "decrease dim should be 1"); + out_dims[decrease_axis[i]] = 0; + } + + for (int i = 0; i < out_dims.size(); ++i) { + if (out_dims[i] != 0) { + new_out_shape.push_back(out_dims[i]); + } + } + if (new_out_shape.size() == 0) { + new_out_shape.push_back(1); + } + + out_dims = framework::make_ddim(new_out_shape); + } + } + + // resize out_dims if (decrease_axis.size() > 0) { if (decrease_axis.size() == (size_t)in_dims.size()) { std::vector vec_origin_out_shape(decrease_axis.size(), 1); @@ -85,8 +200,6 @@ class SliceKernel : public framework::OpKernel { } out->mutable_data(context.GetPlace()); - auto axes = context.Attr>("axes"); - auto starts = context.Attr>("starts"); auto new_out_dims = out->dims(); auto offsets = Eigen::array(); @@ -157,6 +270,26 @@ class SliceGradKernel : public framework::OpKernel { auto in_dims = d_input->dims(); auto axes = context.Attr>("axes"); auto starts = context.Attr>("starts"); + auto ends = context.Attr>("ends"); + + auto list_new_ends_tensor = + context.MultiInput("EndsTensorList"); + auto list_new_starts_tensor = + context.MultiInput("StartsTensorList"); + + if (list_new_starts_tensor.size() > 0) { + starts = get_new_data_from_tensorlist(list_new_starts_tensor); + } else if (context.HasInput("StartsTensor")) { + auto* starts_tensor = context.Input("StartsTensor"); + starts = get_new_data_from_tensor(starts_tensor); + } + + if (list_new_ends_tensor.size() > 0) { + ends = get_new_data_from_tensorlist(list_new_ends_tensor); + } else if (context.HasInput("EndsTensor")) { + auto* ends_tensor = context.Input("EndsTensor"); + ends = get_new_data_from_tensor(ends_tensor); + } auto decrease_axis = context.Attr>("decrease_axis"); if (decrease_axis.size() > 0) { diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 70eec7af..9d73a191 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -47,10 +47,8 @@ class SoftmaxOp : public framework::OperatorWithKernel { "R is the rank of Input(X)."); auto use_cudnn = ctx->Attrs().Get("use_cudnn"); - auto use_mkldnn = ctx->Attrs().Get("use_mkldnn"); if (axis != rank_x - 1 && axis != -1) { PADDLE_ENFORCE(!use_cudnn, "CUDNN kernel only support axis as -1."); - PADDLE_ENFORCE(!use_mkldnn, "MKLDNN kernel only support axis as -1."); } ctx->SetOutputDim("Out", ctx->GetInputDim("X")); @@ -220,14 +218,33 @@ class SoftmaxOpGradMaker : public framework::SingleGradOpDescMaker { } }; +DECLARE_INPLACE_OP_INFERER(SoftmaxInplaceInferer, {"X", "Out"}); + +class SoftmaxGradInplaceInferer final : public framework::InplaceOpInference { + public: + using framework::InplaceOpInference::InplaceOpInference; + + std::unordered_map operator()( + const 
framework::OpDesc& op_desc, bool use_cuda) const final { + if (use_cuda) { + return {{"Out", framework::GradVarName("X")}}; + } else { + // NOTE(zjl): AVX implementation of SoftmaxGrad does not support in-place + return {}; + } + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, - ops::SoftmaxOpInferVarType, ops::SoftmaxOpGradMaker); -REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad); + ops::SoftmaxOpInferVarType, ops::SoftmaxOpGradMaker, + ops::SoftmaxInplaceInferer); +REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad, + ops::SoftmaxGradInplaceInferer); REGISTER_OP_CPU_KERNEL( softmax, ops::SoftmaxKernel, ops::SoftmaxKernel); diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc index 716826bf..8cde7292 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc @@ -255,23 +255,11 @@ class SoftmaxGradMaker : public framework::SingleGradOpDescMaker { } }; -class SoftmaxWithCrossEntropyInplaceInference - : public framework::InplaceOpInference { - public: - std::unordered_map operator()( - const framework::OpDesc& op_desc, bool use_cuda) const { - return {{"Logits", "Softmax"}}; - } -}; +DECLARE_INPLACE_OP_INFERER(SoftmaxWithCrossEntropyInplaceInference, + {"Logits", "Softmax"}); -class SoftmaxWithCrossEntropyGradInplaceInference - : public framework::InplaceOpInference { - public: - std::unordered_map operator()( - const framework::OpDesc& op_desc, bool use_cuda) const { - return {{"Softmax", framework::GradVarName("Logits")}}; - } -}; +DECLARE_INPLACE_OP_INFERER(SoftmaxWithCrossEntropyGradInplaceInference, + {"Softmax", framework::GradVarName("Logits")}); } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc index dc15df2c..7aeb1d96 100644 --- a/paddle/fluid/operators/squeeze_op.cc +++ b/paddle/fluid/operators/squeeze_op.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,26 +12,31 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/squeeze_op.h" +#include #include +#include #include #include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { -class SqueezeOpInferShape : public framework::InferShapeBase { +class SqueezeOp : public framework::OperatorWithKernel { public: - void operator()(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of Squeeze operator should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of Squeeze operator should not be null."); + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, + "Input(X) of Squeeze operator should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, + "Output(Out) of Squeeze operator should not be null."); const auto &x_dims = ctx->GetInputDim("X"); // Check input tensor dims (<6) Eigen limit. 
-    PADDLE_ENFORCE(x_dims.size() <= 6,
-                   "Invalid dimnesions, the rank of Input(X) "
-                   "should be in the range of [1, 6] (Eigen limit).");
+    PADDLE_ENFORCE_LE(x_dims.size(), 6,
+                      "Invalid dimensions, the rank of Input(X) "
+                      "should be in the range of [1, 6] (Eigen limit).");
     const auto &axes = ctx->Attrs().Get>("axes");
     for (int a : axes) {
@@ -40,7 +45,7 @@ class SqueezeOpInferShape : public framework::InferShapeBase {
                      "tensor's rank.");
     }
 
-    auto out_dims = GetOutputShape(axes, x_dims, false);
+    auto out_dims = GetOutputShape(axes, x_dims);
     ctx->SetOutputDim("Out", out_dims);
     if (x_dims[0] == out_dims[0]) {
       // Only pass LoD when the first dimension of output and Input(X)
@@ -50,8 +55,7 @@ class SqueezeOpInferShape : public framework::InferShapeBase {
   }
 
   static framework::DDim GetOutputShape(const std::vector squeeze_dims,
-                                        const framework::DDim &in_dims,
-                                        bool is_runtime) {
+                                        const framework::DDim &in_dims) {
     size_t num_squeeze_dims = squeeze_dims.size();
     int cnt_squeezed_dims = 0;
     bool should_squeeze[9] = {false};
@@ -70,14 +74,8 @@ class SqueezeOpInferShape : public framework::InferShapeBase {
       int current = squeeze_dims[idx] < 0 ? squeeze_dims[idx] + in_dims.size()
                                           : squeeze_dims[idx];
       // Check current index, the upper limit has been checked in line 36.
-      PADDLE_ENFORCE(current >= 0,
-                     "Invalid axis, the negative axis is out of range.");
-
-      if (is_runtime) {
-        PADDLE_ENFORCE(in_dims[current] == 1,
-                       "Invalid axis index, the axis that will be squeezed "
-                       "should be equal to 1.");
-      }
+      PADDLE_ENFORCE_GE(current, 0,
+                        "Invalid axis, the negative axis is out of range.");
 
       if (!(should_squeeze[current])) {
         ++cnt_squeezed_dims;
@@ -96,27 +94,30 @@ class SqueezeOpInferShape : public framework::InferShapeBase {
     return framework::make_ddim(output_shape);
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(ctx.Input("X")->type(),
+                                   ctx.device_context());
+  }
 };
 
-// TODO(paddle-dev): Should use OpKernel.
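GetOutputShape above implements a simple rule: each listed axis (negatives normalized by adding the rank) is dropped, and when axes is empty every dimension of extent 1 is dropped; the former is_runtime path, which enforced that each listed axis actually had extent 1, is removed here. A standalone sketch of the rule (SqueezeShape is a name invented for illustration):

    #include <cstdint>
    #include <vector>

    std::vector<int64_t> SqueezeShape(const std::vector<int64_t>& dims,
                                      const std::vector<int>& axes) {
      std::vector<bool> drop(dims.size(), false);
      if (axes.empty()) {  // no axes given: squeeze every size-1 dimension
        for (size_t i = 0; i < dims.size(); ++i) drop[i] = (dims[i] == 1);
      } else {
        for (int a : axes) {
          int cur = a < 0 ? a + static_cast<int>(dims.size()) : a;
          drop[static_cast<size_t>(cur)] = true;  // bounds checked by the op
        }
      }
      std::vector<int64_t> out;
      for (size_t i = 0; i < dims.size(); ++i) {
        if (!drop[i]) out.push_back(dims[i]);
      }
      return out;
    }

For example, SqueezeShape({4, 1, 8, 1}, {}) yields {4, 8}, and SqueezeShape({4, 1, 8}, {-2}) yields {4, 8}.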
-class SqueezeOp : public framework::OperatorBase {
+class SqueezeGradOp : public framework::OperatorWithKernel {
  public:
-  using OperatorBase::OperatorBase;
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &place) const override {
-    auto &axes = Attr<std::vector<int>>("axes");
-    auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
-    auto out_dims = SqueezeOpInferShape::GetOutputShape(axes, x_dims, true);
-
-    framework::AttributeMap attrs;
-    attrs["shape"] = framework::vectorize2int(out_dims);
-    // Invoke Reshape Op
-    auto reshape_op = framework::OpRegistry::CreateOp(
-        "reshape", {{"X", {Input("X")}}, {"Shape", {}}},
-        {{"Out", {Output("Out")}}}, attrs);
-    reshape_op->Run(scope, place);
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *context) const override {
+    context->SetOutputDim(framework::GradVarName("X"),
+                          context->GetInputDim("X"));
+    context->ShareLoD("X", framework::GradVarName("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(ctx.Input<framework::LoDTensor>("X")->type(),
+                                   ctx.device_context());
   }
 };
 
@@ -157,32 +158,70 @@ class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
   }
 };
 
-class SqueezeGradInferShape : public framework::InferShapeBase {
+class Squeeze2Op : public framework::OperatorWithKernel {
  public:
-  void operator()(framework::InferShapeContext *context) const override {
-    context->SetOutputDim(framework::GradVarName("X"),
-                          context->GetInputDim("X"));
-    context->ShareLoD("X", framework::GradVarName("X"));
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true,
+                      "Input(X) of Squeeze operator should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
+                      "Output(Out) of Squeeze operator should not be null.");
+
+    const auto &x_dims = ctx->GetInputDim("X");
+    // Check input tensor dims (<6) Eigen limit.
+    PADDLE_ENFORCE_LE(x_dims.size(), 6,
+                      "Invalid dimensions, the rank of Input(X) "
+                      "should be in the range of [1, 6] (Eigen limit).");
+
+    const auto &axes = ctx->Attrs().Get<std::vector<int>>("axes");
+    for (int a : axes) {
+      PADDLE_ENFORCE_LT(a, x_dims.size(),
+                        "The squeeze axis should be less than input "
+                        "tensor's rank.");
+    }
+
+    auto out_dims = SqueezeOp::GetOutputShape(axes, x_dims);
+    ctx->SetOutputDim("Out", out_dims);
+    if (x_dims[0] == out_dims[0]) {
+      // Only pass LoD when the first dimension of output and Input(X)
+      // are the same.
+ ctx->ShareLoD("X", "Out"); + } + + PADDLE_ENFORCE_EQ(ctx->HasOutput("XShape"), true, + "Output(XShape) of Squeeze operator should not be null."); + std::vector xshape_dims(x_dims.size() + 1); + xshape_dims[0] = 0; + for (int i = 0; i < x_dims.size(); ++i) { + xshape_dims[i + 1] = x_dims[i]; + } + ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims)); + ctx->ShareLoD("X", /*->*/ "XShape"); } }; -class SqueezeGradOp : public framework::OperatorBase { +class Squeeze2GradOp : public framework::OperatorWithKernel { public: - using OperatorBase::OperatorBase; - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - auto dx_name = Output(framework::GradVarName("X")); - auto dout_name = Input(framework::GradVarName("Out")); - auto x_dims = scope.FindVar(Input("X"))->Get().dims(); - framework::AttributeMap attrs; - attrs["shape"] = framework::vectorize2int(x_dims); - - auto reshape_op = framework::OpRegistry::CreateOp( - "reshape", {{"X", {dout_name}}, {"Shape", {}}}, {{"Out", {dx_name}}}, - attrs); - reshape_op->Run(scope, place); + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE_EQ(context->HasInput("XShape"), true, + "Input(XShape) shouldn't be null."); + PADDLE_ENFORCE_EQ(context->HasInput(framework::GradVarName("Out")), true, + "Input(Out@GRAD) shouldn't be null."); + auto xshape_dims = context->GetInputDim("XShape"); + auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + context->SetOutputDim(framework::GradVarName("X"), x_dims); + context->ShareLoD("XShape", framework::GradVarName("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.device_context()); } }; @@ -202,44 +241,6 @@ class Squeeze2OpMaker : public SqueezeOpMaker { } }; -class Squeeze2OpInferShape : public SqueezeOpInferShape { - public: - void operator()(framework::InferShapeContext *ctx) const override { - SqueezeOpInferShape::operator()(ctx); - PADDLE_ENFORCE(ctx->HasOutput("XShape"), - "Output(XShape) of Squeeze operator should not be null."); - const auto &x_dims = ctx->GetInputDim("X"); - std::vector xshape_dims(x_dims.size() + 1); - xshape_dims[0] = 0; - for (int i = 0; i < x_dims.size(); ++i) { - xshape_dims[i + 1] = x_dims[i]; - } - ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims)); - ctx->ShareLoD("X", /*->*/ "XShape"); - } -}; - -class Squeeze2Op : public framework::OperatorBase { - public: - using OperatorBase::OperatorBase; - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - auto &axes = Attr>("axes"); - auto x_dims = scope.FindVar(Input("X"))->Get().dims(); - auto out_dims = Squeeze2OpInferShape::GetOutputShape(axes, x_dims, true); - - framework::AttributeMap attrs; - attrs["shape"] = framework::vectorize2int(out_dims); - // Invoke Reshape Op - auto reshape_op = framework::OpRegistry::CreateOp( - "reshape2", {{"X", {Input("X")}}, {"Shape", {}}}, - {{"Out", {Output("Out")}}, {"XShape", {Output("XShape")}}}, attrs); - reshape_op->Run(scope, place); - } -}; - class Squeeze2GradOpMaker : public framework::SingleGradOpDescMaker { public: using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; @@ -255,57 +256,47 @@ class Squeeze2GradOpMaker : public framework::SingleGradOpDescMaker { } }; 
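// NOTE: a standalone sketch (standard library only) of the XShape
// convention used by squeeze2 above: the forward pass records the input
// shape as [0, x_dims...] (xshape_dims[0] = 0 marks the entry as shape-only
// metadata), and squeeze2_grad recovers the input shape by dropping that
// leading sentinel -- the same slice_ddim(xshape_dims, 1,
// xshape_dims.size()) step as in Squeeze2GradOp::InferShape, so the grad op
// needs neither X nor Out to be kept alive.
#include <cassert>
#include <vector>

static std::vector<int> MakeXShape(const std::vector<int> &x_dims) {
  std::vector<int> xshape(x_dims.size() + 1, 0);  // leading 0 = metadata mark
  for (size_t i = 0; i < x_dims.size(); ++i) xshape[i + 1] = x_dims[i];
  return xshape;
}

static std::vector<int> RecoverXDims(const std::vector<int> &xshape) {
  return std::vector<int>(xshape.begin() + 1, xshape.end());
}

int main() {
  const std::vector<int> x_dims = {5, 1, 10};
  assert(RecoverXDims(MakeXShape(x_dims)) == x_dims);
  return 0;
}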
-class Squeeze2GradInferShape : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *context) const override { - PADDLE_ENFORCE(context->HasInput("XShape"), - "Input(XShape) shouldn't be null."); - PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) shouldn't be null."); - auto xshape_dims = context->GetInputDim("XShape"); - auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); - context->SetOutputDim(framework::GradVarName("X"), x_dims); - context->ShareLoD("XShape", framework::GradVarName("X")); - } -}; - -class Squeeze2GradOp : public framework::OperatorBase { - public: - using OperatorBase::OperatorBase; - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - auto dx_name = Output(framework::GradVarName("X")); - auto dout_name = Input(framework::GradVarName("Out")); - auto xshape_name = Input("XShape"); - auto xshape_dims = - scope.FindVar(xshape_name)->Get().dims(); - auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); - - framework::AttributeMap attrs; - attrs["shape"] = framework::vectorize2int(x_dims); - - auto reshape_op = framework::OpRegistry::CreateOp( - "reshape2", {{"X", {dout_name}}, {"Shape", {}}}, - {{"Out", {dx_name}}, {"XShape", {xshape_name}}}, attrs); - reshape_op->Run(scope, place); - } -}; +DECLARE_INPLACE_OP_INFERER(SequeezeInplaceInferer, {"X", "Out"}); +DECLARE_INPLACE_OP_INFERER(SequeezeGradInplaceInferer, + {framework::GradVarName("Out"), + framework::GradVarName("X")}); } // namespace operators } // namespace paddle -// Tell linker to use reshape op -USE_OP(reshape); - namespace ops = paddle::operators; REGISTER_OPERATOR(squeeze, ops::SqueezeOp, ops::SqueezeOpMaker, - ops::SqueezeOpInferShape, paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(squeeze_grad, ops::SqueezeGradOp, ops::SqueezeGradInferShape); +REGISTER_OPERATOR(squeeze_grad, ops::SqueezeGradOp); REGISTER_OPERATOR(squeeze2, ops::Squeeze2Op, ops::Squeeze2OpMaker, - ops::Squeeze2OpInferShape, ops::Squeeze2GradOpMaker); + ops::Squeeze2GradOpMaker, ops::SequeezeInplaceInferer); REGISTER_OPERATOR(squeeze2_grad, ops::Squeeze2GradOp, - ops::Squeeze2GradInferShape); + ops::SequeezeGradInplaceInferer); + +REGISTER_OP_CPU_KERNEL( + squeeze, ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel); +REGISTER_OP_CPU_KERNEL( + squeeze_grad, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel); +REGISTER_OP_CPU_KERNEL( + squeeze2, ops::Squeeze2Kernel, + ops::Squeeze2Kernel, + ops::Squeeze2Kernel, + ops::Squeeze2Kernel, + ops::Squeeze2Kernel); +REGISTER_OP_CPU_KERNEL( + squeeze2_grad, + ops::Squeeze2GradKernel, + ops::Squeeze2GradKernel, + ops::Squeeze2GradKernel, + ops::Squeeze2GradKernel, + ops::Squeeze2GradKernel); diff --git a/paddle/fluid/operators/squeeze_op.cu.cc b/paddle/fluid/operators/squeeze_op.cu.cc new file mode 100644 index 00000000..50fee149 --- /dev/null +++ b/paddle/fluid/operators/squeeze_op.cu.cc @@ -0,0 +1,44 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/squeeze_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + squeeze, ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel, + ops::SqueezeKernel); +REGISTER_OP_CUDA_KERNEL( + squeeze_grad, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel, + ops::SqueezeGradKernel); +REGISTER_OP_CUDA_KERNEL( + squeeze2, ops::Squeeze2Kernel, + ops::Squeeze2Kernel, + ops::Squeeze2Kernel, + ops::Squeeze2Kernel, + ops::Squeeze2Kernel); +REGISTER_OP_CUDA_KERNEL( + squeeze2_grad, + ops::Squeeze2GradKernel, + ops::Squeeze2GradKernel, + ops::Squeeze2GradKernel, + ops::Squeeze2GradKernel, + ops::Squeeze2GradKernel); diff --git a/paddle/fluid/operators/squeeze_op.h b/paddle/fluid/operators/squeeze_op.h new file mode 100644 index 00000000..5aae1865 --- /dev/null +++ b/paddle/fluid/operators/squeeze_op.h @@ -0,0 +1,146 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/pooling.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +template +class SqueezeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *in = context.Input("X"); + auto *out = context.Output("Out"); + + auto &axes = context.Attr>("axes"); + auto x_dims = in->dims(); + auto out_dims = GetOutputShape(axes, x_dims); + + out->mutable_data(context.GetPlace(), in->type()); + framework::TensorCopy( + *in, context.GetPlace(), + context.template device_context(), out); + out->Resize(out_dims); + } + + static framework::DDim GetOutputShape(const std::vector squeeze_dims, + const framework::DDim &in_dims) { + size_t num_squeeze_dims = squeeze_dims.size(); + int cnt_squeezed_dims = 0; + bool should_squeeze[9] = {false}; + + // Determines number of dimensions of output tensor after squeeze. + // Mark and count the dimensions need to be squeezed + if (num_squeeze_dims == 0) { + for (int idx = 0; idx < in_dims.size(); ++idx) { + if (in_dims[idx] == 1) { + should_squeeze[idx] = true; + ++cnt_squeezed_dims; + } + } + } else { + for (size_t idx = 0; idx < num_squeeze_dims; ++idx) { + int current = squeeze_dims[idx] < 0 ? squeeze_dims[idx] + in_dims.size() + : squeeze_dims[idx]; + // Check current index, the upper limit has beed checked in line 36. 
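+      // (e.g. with in_dims = [5, 1, 10], squeeze_dims = {-2} normalizes to
+      // axis 1: -2 + 3 = 1, whose extent is 1 and may be squeezed)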
+ PADDLE_ENFORCE_GE(current, 0, + "Invalid axis, the negative axis is out of range."); + + PADDLE_ENFORCE_EQ(in_dims[current], 1, + "Invalid axis index, the axis that will be squeezed " + "should be equal to 1."); + + if (!(should_squeeze[current])) { + ++cnt_squeezed_dims; + } + should_squeeze[current] = true; + } + } + + // Make output dimensions + std::vector output_shape(in_dims.size() - cnt_squeezed_dims, 0); + for (int in_idx = 0, out_idx = 0; in_idx < in_dims.size(); ++in_idx) { + if (!should_squeeze[in_idx]) { + output_shape[out_idx++] = in_dims[in_idx]; + } + } + + return framework::make_ddim(output_shape); + } +}; + +template +class SqueezeGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *d_out = + ctx.Input(framework::GradVarName("Out")); + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto in_dims = ctx.Input("X")->dims(); + + d_x->mutable_data(ctx.GetPlace(), d_out->type()); + framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x); + d_x->Resize(in_dims); + } +}; + +template +class Squeeze2Kernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *out = context.Output("Out"); + auto *in = context.Input("X"); + + auto &axes = context.Attr>("axes"); + + auto x_dims = in->dims(); + auto out_dims = + SqueezeKernel::GetOutputShape(axes, x_dims); + + out->mutable_data(context.GetPlace(), in->type()); + framework::TensorCopy( + *in, context.GetPlace(), + context.template device_context(), out); + out->Resize(out_dims); + } +}; + +template +class Squeeze2GradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *d_out = + ctx.Input(framework::GradVarName("Out")); + auto *d_x = ctx.Output(framework::GradVarName("X")); + // auto in_dims = d_x->dims(); + + auto xshape_dims = ctx.Input("XShape")->dims(); + auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + + d_x->mutable_data(ctx.GetPlace(), d_out->type()); + framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x); + d_x->Resize(x_dims); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/stack_op.h b/paddle/fluid/operators/stack_op.h index 3d132e43..9ebf166d 100644 --- a/paddle/fluid/operators/stack_op.h +++ b/paddle/fluid/operators/stack_op.h @@ -14,6 +14,7 @@ #pragma once +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/for_range.h" @@ -50,7 +51,7 @@ class StackOp : public framework::OperatorWithKernel { "Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d", rank); if (axis < 0) axis += (rank + 1); - auto vec = framework::vectorize2int(input_dims[0]); + auto vec = framework::vectorize(input_dims[0]); vec.insert(vec.begin() + axis, input_dims.size()); ctx->SetOutputDim("Y", framework::make_ddim(vec)); } @@ -196,7 +197,7 @@ class StackOpGrad : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(ctx->Outputs(framework::GradVarName("X")).size(), static_cast(dy_dim[axis]), "Number of Outputs(X@Grad) is wrong"); - auto vec = framework::vectorize2int(dy_dim); + auto vec = framework::vectorize(dy_dim); vec.erase(vec.begin() + axis); ctx->SetOutputsDim( framework::GradVarName("X"), diff --git a/paddle/fluid/operators/strided_slice_op.cc b/paddle/fluid/operators/strided_slice_op.cc new file mode 100644 index 00000000..b6bbb071 --- /dev/null +++ b/paddle/fluid/operators/strided_slice_op.cc @@ 
-0,0 +1,272 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/strided_slice_op.h"
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/fluid/operators/slice_op.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+class StridedSliceOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true,
+                      "Input (Input) of slice op should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
+                      "Output (Out) of slice op should not be null.");
+
+    auto in_dims = ctx->GetInputDim("Input");
+    PADDLE_ENFORCE_LT(in_dims.size(), 7,
+                      "The rank of input should be less than 7.");
+    auto starts = ctx->Attrs().Get<std::vector<int>>("starts");
+    auto ends = ctx->Attrs().Get<std::vector<int>>("ends");
+    auto strides = ctx->Attrs().Get<std::vector<int>>("strides");
+    auto axes = ctx->Attrs().Get<std::vector<int>>("axes");
+    auto infer_flags = ctx->Attrs().Get<std::vector<int>>("infer_flags");
+
+    auto starts_size = starts.size();
+    auto ends_size = ends.size();
+    auto strides_size = strides.size();
+
+    if (ctx->HasInputs("StartsTensorList")) {
+      auto StartsTensorList = ctx->Inputs("StartsTensorList");
+      PADDLE_ENFORCE_GT(StartsTensorList.size(), 0,
+                        "StartsTensorList size can't be zero");
+      starts_size = StartsTensorList.size();
+    }
+    if (ctx->HasInputs("EndsTensorList")) {
+      auto EndsTensorList = ctx->Inputs("EndsTensorList");
+      PADDLE_ENFORCE_GT(EndsTensorList.size(), 0,
+                        "EndsTensorList size can't be zero");
+      ends_size = EndsTensorList.size();
+    }
+    if (ctx->HasInputs("StridesTensorList")) {
+      auto StridesTensorList = ctx->Inputs("StridesTensorList");
+      PADDLE_ENFORCE_GT(StridesTensorList.size(), 0,
+                        "StridesTensorList size can't be zero");
+      strides_size = StridesTensorList.size();
+    }
+
+    auto tensor_input = false;
+    if (ctx->HasInput("EndsTensor") || ctx->HasInput("StartsTensor") ||
+        ctx->HasInput("StridesTensor")) {
+      tensor_input = true;
+    }
+    if (ctx->HasInput("EndsTensor") == false) {
+      PADDLE_ENFORCE_EQ(ends_size, axes.size(),
+                        "The size of ends must be equal to the size of axes.");
+    }
+    if (ctx->HasInput("StartsTensor") == false) {
+      PADDLE_ENFORCE_EQ(
+          starts_size, axes.size(),
+          "The size of starts must be equal to the size of axes.");
+    }
+    if (ctx->HasInput("StridesTensor") == false) {
+      PADDLE_ENFORCE_EQ(
+          strides_size, axes.size(),
+          "The size of strides must be equal to the size of axes.");
+    }
+    // we need to check that the strided slice op is valid for
+    // the parameters we get from the Python front end
+    std::vector<int> out_dims_vector(in_dims.size(), -1);
+    if (!tensor_input) {
+      StridedSliceOutDims(starts, ends, strides, axes, infer_flags, in_dims,
+                          out_dims_vector.data(), axes.size(), true);
+    }
+    framework::DDim out_dims(framework::make_ddim(out_dims_vector));
+
+    ctx->SetOutputDim("Out", out_dims);
+    ctx->ShareLoD("Input", /*->*/ "Out");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(ctx.Input<Tensor>("Input")->type(),
+                                   ctx.Input<Tensor>("Input")->place());
+  }
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string &var_name, const Tensor &tensor,
+      const framework::OpKernelType &expected_kernel_type) const override {
+    if (var_name == "StartsTensor" || var_name == "EndsTensor" ||
+        var_name == "StridesTensor") {
+      return expected_kernel_type;
+    }
+    if (var_name == "StartsTensorList" || var_name == "EndsTensorList" ||
+        var_name == "StridesTensorList") {
+      return expected_kernel_type;
+    }
+    return framework::OpKernelType(expected_kernel_type.data_type_,
+                                   tensor.place(), tensor.layout());
+  }
+};
+
+class StridedSliceOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Input", "Tensor of data to extract slices from.");
+    AddOutput("Out", "Strided sliced data tensor.");
+
+    AddInput("StartsTensor",
+             "(Tensor, optional) If provided, slice will use this."
+             "It has the highest priority among StartsTensor, "
+             "StartsTensorList and attr(starts).")
+        .AsDispensable();
+    AddInput("EndsTensor",
+             "(Tensor, optional) If provided, slice will use this."
+             "It has the highest priority among EndsTensor, EndsTensorList "
+             "and attr(ends).")
+        .AsDispensable();
+    AddInput(
+        "StridesTensor",
+        "(Tensor, optional) If provided, slice will use this."
+        "It has the highest priority among StridesTensor, StridesTensorList "
+        "and attr(strides).")
+        .AsDispensable();
+    AddInput(
+        "StartsTensorList",
+        "(vector<Tensor<int32>>, optional) If provided, slice will use this."
+        "The shape of the tensor in vector MUST BE [1]."
+        "It has higher priority compared with attr(starts).")
+        .AsDuplicable()
+        .AsDispensable();
+    AddInput(
+        "EndsTensorList",
+        "(vector<Tensor<int32>>, optional) If provided, slice will use this."
+        "The shape of the tensor in vector MUST BE [1]."
+        "It has higher priority compared with attr(ends).")
+        .AsDuplicable()
+        .AsDispensable();
+    AddInput(
+        "StridesTensorList",
+        "(vector<Tensor<int32>>, optional) If provided, slice will use this."
+        "The shape of the tensor in vector MUST BE [1]."
+        "It has higher priority compared with attr(strides).")
+        .AsDuplicable()
+        .AsDispensable();
+    AddAttr<std::vector<int>>(
+        "axes", "(list) Axes that `starts` and `ends` apply to.");
+    AddAttr<std::vector<int>>(
+        "starts", "(list) Start indices for the strided slice start.")
+        .SetDefault({});
+    AddAttr<std::vector<int>>(
+        "ends", "(list) End indices for the strided slice end.")
+        .SetDefault({});
+    AddAttr<std::vector<int>>(
+        "strides", "(list) Stride step from the start to the end.")
+        .SetDefault({});
+    AddAttr<std::vector<int>>(
+        "infer_flags", "(list) Flags of inferring dims in attributes.")
+        .SetDefault({});
+    AddComment(R"DOC(
+Strided Slice Operator.
+Instead of calling this op directly, most users will want to use the
+NumPy-style slicing syntax.
+For Example: +data = fluid.layers.fill_constant(shape=[3, 3], value=0, dtype='int64') +y = fluid.layers.strided_slice(data, [0, 1], [1,0], [2, 3], [1, 1]) +)DOC"); + } +}; + +class StridedSliceOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true, "Input should not be null"); + PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, + "Input(Out@GRAD) should not be null"); + auto x_dims = ctx->GetInputDim("Input"); + auto x_grad_name = framework::GradVarName("Input"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.GetPlace()); + } + framework::OpKernelType GetKernelTypeForVar( + const std::string &var_name, const Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const override { + if (var_name == "StartsTensor" || var_name == "EndsTensor") { + return expected_kernel_type; + } + if (var_name == "StartsTensorList" || var_name == "EndsTensorList") { + return expected_kernel_type; + } + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } +}; + +class StridedSliceOpGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *bind = new framework::OpDesc(); + bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + bind->SetInput("Input", Input("Input")); + bind->SetInput("StartsTensor", Input("StartsTensor")); + bind->SetInput("EndsTensor", Input("EndsTensor")); + bind->SetInput("StridesTensor", Input("StridesTensor")); + bind->SetInput("StartsTensorList", Input("StartsTensorList")); + bind->SetInput("EndsTensorList", Input("EndsTensorList")); + bind->SetInput("StridesTensorList", Input("StridesTensorList")); + bind->SetOutput(framework::GradVarName("Input"), InputGrad("Input")); + bind->SetAttrMap(Attrs()); + bind->SetType("strided_slice_grad"); + return std::unique_ptr(bind); + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + StridedSliceOpGradNoNeedBufferVarsInference, "Input"); + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(strided_slice, ops::StridedSliceOp, ops::StridedSliceOpMaker, + ops::StridedSliceOpGradMaker); +REGISTER_OPERATOR(strided_slice_grad, ops::StridedSliceOpGrad, + ops::StridedSliceOpGradNoNeedBufferVarsInference); + +REGISTER_OP_CPU_KERNEL( + strided_slice, + ops::StridedSliceKernel, + ops::StridedSliceKernel, + ops::StridedSliceKernel, + ops::StridedSliceKernel); + +REGISTER_OP_CPU_KERNEL( + strided_slice_grad, + ops::StridedSliceGradKernel, + ops::StridedSliceGradKernel, + ops::StridedSliceGradKernel, + ops::StridedSliceGradKernel); diff --git a/paddle/fluid/operators/strided_slice_op.cu b/paddle/fluid/operators/strided_slice_op.cu new file mode 100644 index 00000000..f0c9d557 --- /dev/null +++ b/paddle/fluid/operators/strided_slice_op.cu @@ -0,0 +1,30 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/strided_slice_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    strided_slice,
+    ops::StridedSliceKernel,
+    ops::StridedSliceKernel,
+    ops::StridedSliceKernel,
+    ops::StridedSliceKernel);
+
+REGISTER_OP_CUDA_KERNEL(
+    strided_slice_grad,
+    ops::StridedSliceGradKernel,
+    ops::StridedSliceGradKernel,
+    ops::StridedSliceGradKernel,
+    ops::StridedSliceGradKernel);
diff --git a/paddle/fluid/operators/strided_slice_op.h b/paddle/fluid/operators/strided_slice_op.h
new file mode 100644
index 00000000..57d33f29
--- /dev/null
+++ b/paddle/fluid/operators/strided_slice_op.h
@@ -0,0 +1,350 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include <cstdlib>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/slice_op.h"
+namespace paddle {
+namespace operators {
+
+static void StridedSliceOutDims(
+    const std::vector<int>& starts, const std::vector<int>& ends,
+    const std::vector<int>& strides, const std::vector<int>& axes,
+    const std::vector<int>& infer_flags, const framework::DDim in_dims,
+    int* out_dims_vector, const size_t size, bool infer_shape) {
+  for (int i = 0; i < in_dims.size(); i++) {
+    out_dims_vector[i] = in_dims[i];
+  }
+  int stride_index, start_index, end_index;
+  for (size_t i = 0; i < size; i++) {
+    int axes_index = axes[i];
+    if (infer_shape && infer_flags[i] == -1) {
+      out_dims_vector[axes_index] = -1;
+      continue;
+    }
+
+    PADDLE_ENFORCE_NE(strides[i], 0, "stride must not be zero");
+    start_index = starts[i];
+    end_index = ends[i];
+    stride_index = strides[i];
+    int axis_size = in_dims[axes_index];
+    if (axis_size < 0) {
+      continue;
+    }
+
+    if (start_index < 0) {
+      start_index = start_index + axis_size;
+    }
+    if (end_index < 0) {
+      end_index = end_index + axis_size;
+    }
+
+    if (stride_index < 0) {
+      start_index = start_index + 1;
+      end_index = end_index + 1;
+    }
+
+    bool zero_dim_condition =
+        ((stride_index < 0 && (start_index <= end_index)) ||
+         (stride_index > 0 && (start_index >= end_index)));
+    PADDLE_ENFORCE_EQ(zero_dim_condition, false,
+                      "starts and ends must define a non-empty slice for "
+                      "the given stride direction");
+    int left = std::max(0, std::min(start_index, end_index));
+    int right = std::min(axis_size, std::max(start_index, end_index));
+    int step = std::abs(stride_index);
+    auto out_dims_index = (std::abs(right - left) + step - 1) / step;
+
+    out_dims_vector[axes_index] = out_dims_index;
+  }
+}
+
+static void StridedSliceFunctor(int* starts, int* ends, int* strides, int*
axes, + int* reverse_axis, const framework::DDim dims, + const size_t size) { + for (size_t axis = 0; axis < size; axis++) { + int axis_size = dims[axes[axis]]; + int axis_index = axis; + if (axis_size < 0) { + starts[axis_index] = 0; + ends[axis_index] = 1; + strides[axis_index] = 1; + } + // stride must not be zero + if (starts[axis_index] < 0) { + starts[axis_index] = starts[axis_index] + axis_size; + } + + if (ends[axis_index] < 0) { + ends[axis_index] = ends[axis_index] + axis_size; + } + if (strides[axis_index] < 0) { + reverse_axis[axis_index] = 1; + strides[axis_index] = -strides[axis_index]; + if (starts[axis_index] > ends[axis_index]) { + // swap the reverse + starts[axis_index] = starts[axis_index] + 1; + ends[axis_index] = ends[axis_index] + 1; + } + std::swap(starts[axis_index], ends[axis_index]); + } else { + reverse_axis[axis_index] = 0; + strides[axis_index] = strides[axis_index]; + } + } +} + +template +class StridedSliceKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + int rank = ctx.Input("Input")->dims().size(); + switch (rank) { + case 1: + StridedSliceCompute<1>(ctx); + break; + case 2: + StridedSliceCompute<2>(ctx); + break; + case 3: + StridedSliceCompute<3>(ctx); + break; + case 4: + StridedSliceCompute<4>(ctx); + break; + case 5: + StridedSliceCompute<5>(ctx); + break; + case 6: + StridedSliceCompute<6>(ctx); + break; + } + } + + private: + template + void StridedSliceCompute(const framework::ExecutionContext& context) const { + auto& place = + *context.template device_context().eigen_device(); + auto in = context.Input("Input"); + auto out = context.Output("Out"); + auto in_dims = in->dims(); + + auto starts = context.Attr>("starts"); + auto ends = context.Attr>("ends"); + auto strides = context.Attr>("strides"); + auto axes = context.Attr>("axes"); + auto infer_flags = context.Attr>("infer_flags"); + + auto starts_indices = Eigen::DSizes(); + auto ends_indices = Eigen::DSizes(); + auto strides_indices = Eigen::DSizes(); + auto reverse_axis = Eigen::array(); + + auto list_new_ends_tensor = + context.MultiInput("EndsTensorList"); + auto list_new_starts_tensor = + context.MultiInput("StartsTensorList"); + auto list_new_strides_tensor = + context.MultiInput("StridesTensorList"); + + if (list_new_starts_tensor.size() > 0) { + starts = get_new_data_from_tensorlist(list_new_starts_tensor); + } else if (context.HasInput("StartsTensor")) { + auto* starts_tensor = context.Input("StartsTensor"); + starts = get_new_data_from_tensor(starts_tensor); + } + + if (list_new_ends_tensor.size() > 0) { + ends = get_new_data_from_tensorlist(list_new_ends_tensor); + } else if (context.HasInput("EndsTensor")) { + auto* ends_tensor = context.Input("EndsTensor"); + ends = get_new_data_from_tensor(ends_tensor); + } + + if (list_new_strides_tensor.size() > 0) { + strides = get_new_data_from_tensorlist(list_new_strides_tensor); + } else if (context.HasInput("StridesTensor")) { + auto* strides_tensor = context.Input("StridesTensor"); + strides = get_new_data_from_tensor(strides_tensor); + } + + std::vector out_dims_vector(in_dims.size(), -1); + StridedSliceOutDims(starts, ends, strides, axes, infer_flags, in_dims, + out_dims_vector.data(), axes.size(), false); + framework::DDim out_dims(framework::make_ddim(out_dims_vector)); + + std::vector reverse_vector(starts.size(), 0); + StridedSliceFunctor(starts.data(), ends.data(), strides.data(), axes.data(), + reverse_vector.data(), in_dims, starts.size()); + + for (size_t 
axis = 0; axis < D; axis++) { + starts_indices[axis] = 0; + ends_indices[axis] = out_dims[axis]; + strides_indices[axis] = 1; + reverse_axis[axis] = false; + } + for (size_t axis = 0; axis < axes.size(); axis++) { + int axis_index = axes[axis]; + starts_indices[axis_index] = starts[axis]; + ends_indices[axis_index] = ends[axis]; + strides_indices[axis_index] = strides[axis]; + reverse_axis[axis_index] = (reverse_vector[axis] == 1) ? true : false; + } + + framework::Tensor tmp; + tmp.mutable_data(out_dims, context.GetPlace()); + + out->Resize(out_dims); + out->mutable_data(context.GetPlace()); + auto in_t = + framework::EigenTensor::From( + *in); + auto tmp_t = + framework::EigenTensor::From( + tmp); + auto out_t = + framework::EigenTensor::From( + *out, out_dims); + tmp_t.device(place) = + in_t.stridedSlice(starts_indices, ends_indices, strides_indices); + out_t.device(place) = tmp_t.reverse(reverse_axis); + } +}; + +template +class StridedSliceGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + size_t rank = ctx.Input("Input")->dims().size(); + switch (rank) { + case 1: + StridedSliceGradCompute<1>(ctx); + break; + case 2: + StridedSliceGradCompute<2>(ctx); + break; + case 3: + StridedSliceGradCompute<3>(ctx); + break; + case 4: + StridedSliceGradCompute<4>(ctx); + break; + case 5: + StridedSliceGradCompute<5>(ctx); + break; + case 6: + StridedSliceGradCompute<6>(ctx); + break; + } + } + + private: + template + void StridedSliceGradCompute( + const framework::ExecutionContext& context) const { + auto& place = + *context.template device_context().eigen_device(); + auto* d_input = + context.Input(framework::GradVarName("Out")); + auto* d_out = + context.Output(framework::GradVarName("Input")); + d_out->mutable_data(context.GetPlace()); + + auto& dev_ctx = context.template device_context(); + math::SetConstant set_zero; + set_zero(dev_ctx, d_out, static_cast(0)); + auto out_dims = d_out->dims(); + auto in_dims = d_input->dims(); + auto starts = context.Attr>("starts"); + auto ends = context.Attr>("ends"); + auto strides = context.Attr>("strides"); + auto axes = context.Attr>("axes"); + + auto list_new_ends_tensor = + context.MultiInput("EndsTensorList"); + auto list_new_starts_tensor = + context.MultiInput("StartsTensorList"); + auto list_new_strides_tensor = + context.MultiInput("StridesTensorList"); + + if (list_new_starts_tensor.size() > 0) { + starts = get_new_data_from_tensorlist(list_new_starts_tensor); + } else if (context.HasInput("StartsTensor")) { + auto* starts_tensor = context.Input("StartsTensor"); + starts = get_new_data_from_tensor(starts_tensor); + } + + if (list_new_ends_tensor.size() > 0) { + ends = get_new_data_from_tensorlist(list_new_ends_tensor); + } else if (context.HasInput("EndsTensor")) { + auto* ends_tensor = context.Input("EndsTensor"); + ends = get_new_data_from_tensor(ends_tensor); + } + + if (list_new_strides_tensor.size() > 0) { + strides = get_new_data_from_tensorlist(list_new_strides_tensor); + } else if (context.HasInput("StridesTensor")) { + auto* strides_tensor = context.Input("StridesTensor"); + strides = get_new_data_from_tensor(strides_tensor); + } + + auto starts_indices = Eigen::DSizes(); + auto ends_indices = Eigen::DSizes(); + auto strides_indices = Eigen::DSizes(); + + auto reverse_axis = Eigen::array(); + std::vector reverse_vector(starts.size(), 0); + + StridedSliceFunctor(starts.data(), ends.data(), strides.data(), axes.data(), + reverse_vector.data(), out_dims, 
starts.size()); + + for (size_t axis = 0; axis < D; axis++) { + starts_indices[axis] = 0; + ends_indices[axis] = out_dims[axis]; + strides_indices[axis] = 1; + } + for (size_t axis = 0; axis < axes.size(); axis++) { + int axis_index = axes[axis]; + starts_indices[axis_index] = starts[axis]; + ends_indices[axis_index] = ends[axis]; + strides_indices[axis_index] = strides[axis]; + reverse_axis[axis_index] = (reverse_vector[axis] == 1) ? true : false; + } + + framework::Tensor reverse_input; + reverse_input.mutable_data(in_dims, context.GetPlace()); + + auto in_t = + framework::EigenTensor::From( + *d_input); + auto reverse_in_t = + framework::EigenTensor::From( + reverse_input); + auto out_t = + framework::EigenTensor::From( + *d_out, out_dims); + + reverse_in_t.device(place) = in_t.reverse(reverse_axis); + out_t.stridedSlice(starts_indices, ends_indices, strides_indices) + .device(place) = reverse_in_t; + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index e6c87726..37204fd7 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -141,7 +141,7 @@ class SumOp : public framework::OperatorWithKernel { for (auto& x_var : x_vars) { auto& array = x_var->Get(); for (auto& each : array) { - if (each.numel() != 0) { + if (each.numel() != 0 && each.IsInitialized()) { return framework::OpKernelType(each.type(), ctx.device_context(), layout, library); } @@ -238,13 +238,7 @@ class SumGradMaker : public framework::GradOpDescMakerBase { } }; -class SumInplace : public framework::InplaceOpInference { - public: - std::unordered_map operator()( - const framework::OpDesc& op_desc, bool use_cuda) const override { - return {{"X", "Out"}}; - } -}; +DECLARE_INPLACE_OP_INFERER(SumInplace, {"X", "Out"}); } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu index ba874549..3564ed0c 100644 --- a/paddle/fluid/operators/sum_op.cu +++ b/paddle/fluid/operators/sum_op.cu @@ -11,6 +11,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/sum_op.h" #include "paddle/fluid/platform/float16.h" @@ -38,18 +39,14 @@ __global__ void SumArrayCUDAKernel(T **in, T *out, int64_t N, size_t in_size, bool read_dst) { int id = blockIdx.x * blockDim.x + threadIdx.x; while (id < N) { - T total(0); + T total(read_dst ? out[id] : static_cast(0)); for (int i = 0; i < in_size; ++i) { const T *tmp = in[i]; if (tmp) { total += tmp[id]; } } - if (read_dst) { - out[id] += total; - } else { - out[id] = total; - } + out[id] = total; id += blockDim.x * gridDim.x; } } @@ -201,8 +198,7 @@ void SumToLoDTensor(const framework::ExecutionContext &context) { } if (!sr_in_out_data.empty()) { auto tmp_sr_in_out_array = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( - sr_in_out_data.size() * sizeof(T *)); + memory::Alloc(dev_ctx, sr_in_out_data.size() * sizeof(T *)); memory::Copy(boost::get(dev_ctx.GetPlace()), tmp_sr_in_out_array->ptr(), platform::CPUPlace(), @@ -220,9 +216,7 @@ void SumToLoDTensor(const framework::ExecutionContext &context) { } // if indata not null, merge into one kernel call. 
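   // (a host-side std::vector of per-input device pointers is staged into
   // one temporary device allocation via memory::Alloc + memory::Copy, so a
   // single SumArrayCUDAKernel launch can accumulate every input)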
if (!in_data.empty()) { - auto tmp_in_array = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( - in_data.size() * sizeof(T *)); + auto tmp_in_array = memory::Alloc(dev_ctx, in_data.size() * sizeof(T *)); memory::Copy(boost::get(dev_ctx.GetPlace()), tmp_in_array->ptr(), platform::CPUPlace(), diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h index 7a3fecac..3b7f4292 100644 --- a/paddle/fluid/operators/sum_op.h +++ b/paddle/fluid/operators/sum_op.h @@ -92,21 +92,21 @@ void LodTensorArrayCompute(const framework::ExecutionContext &context) { bool in_place = out_var == in_vars[0]; auto &out_array = *out_var->GetMutable(); for (size_t i = in_place ? 1 : 0; i < in_vars.size(); ++i) { - PADDLE_ENFORCE(in_vars[i]->IsType(), - "Only support all inputs are TensorArray"); + PADDLE_ENFORCE_EQ(in_vars[i]->IsType(), true, + "Only support all inputs are TensorArray"); auto &in_array = in_vars[i]->Get(); for (size_t i = 0; i < in_array.size(); ++i) { - if (in_array[i].numel() != 0) { + if (in_array[i].IsInitialized() && (in_array[i].numel() != 0)) { if (i >= out_array.size()) { out_array.resize(i + 1); } - if (out_array[i].numel() == 0) { + if (!out_array[i].IsInitialized() || (out_array[i].numel() == 0)) { framework::TensorCopy(in_array[i], in_array[i].place(), context.device_context(), &out_array[i]); out_array[i].set_lod(in_array[i].lod()); } else { - PADDLE_ENFORCE(out_array[i].lod() == in_array[i].lod()); + PADDLE_ENFORCE_EQ(out_array[i].lod(), in_array[i].lod()); auto in = EigenVector::Flatten(in_array[i]); auto result = EigenVector::Flatten(out_array[i]); result.device(*context.template device_context() diff --git a/paddle/fluid/operators/sync_batch_norm_op.cu b/paddle/fluid/operators/sync_batch_norm_op.cu index a5984bfa..fb4ae48e 100644 --- a/paddle/fluid/operators/sync_batch_norm_op.cu +++ b/paddle/fluid/operators/sync_batch_norm_op.cu @@ -12,13 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +// clang-format off #include #include +#include #include #include #include "cub/cub.cuh" #include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/batch_norm_op.h" +#include "paddle/fluid/operators/norm_utils.h" #include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/nccl_helper.h" @@ -30,24 +34,27 @@ using Tensor = framework::Tensor; using DataLayout = framework::DataLayout; template using CudnnDataType = platform::CudnnDataType; +template +using BatchNormParamType = typename CudnnDataType::BatchNormParamType; template -__global__ void KeLocalStats(const T *x, int N, int M, int C, T *mean_var) { - typedef cub::BlockReduce BlockReduce; +__global__ void KeLocalStats(const T *x, int N, int M, int C, + BatchNormParamType *mean_var) { + typedef cub::BlockReduce, BlockDim> BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; for (int k = blockIdx.x; k < C; k += gridDim.x) { - T x_sum = 0; - T x2_sum = 0; + BatchNormParamType x_sum = 0.; + BatchNormParamType x2_sum = 0.; for (int i = threadIdx.x; i < N * M; i += BlockDim) { int id = layout == framework::DataLayout::kNCHW ? 
(i / M) * C * M + k * M + i % M : i * C + k; - T x_in = x[id]; + auto x_in = static_cast>(x[id]); x_sum += x_in; x2_sum += x_in * x_in; } __syncthreads(); - T out = BlockReduce(temp_storage).Reduce(x_sum, cub::Sum()); + auto out = BlockReduce(temp_storage).Reduce(x_sum, cub::Sum()); __syncthreads(); if (threadIdx.x == 0) { mean_var[k] = out / (N * M); @@ -59,22 +66,24 @@ __global__ void KeLocalStats(const T *x, int N, int M, int C, T *mean_var) { } } if (blockIdx.x == 0 && threadIdx.x == 0) { - mean_var[2 * C] = static_cast(1.0); + mean_var[2 * C] = static_cast>(1.0); } } template -__global__ void KeSyncAndMovingStats(T *means, T *variances, T *num_dev, - const int C, const T momentum, - const double epsilon, T *sv_mean_data, - T *sv_inv_var_data, T *moving_means, - T *moving_variances) { +__global__ void KeSyncAndMovingStats( + BatchNormParamType *means, BatchNormParamType *variances, + BatchNormParamType *num_dev, const int C, + const BatchNormParamType momentum, const double epsilon, + BatchNormParamType *sv_mean_data, BatchNormParamType *sv_inv_var_data, + BatchNormParamType *moving_means, + BatchNormParamType *moving_variances) { // sync stats across multi-devices int gid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; for (int i = gid; i < C; i += stride) { - T mean = means[i] / (*num_dev); - T var = variances[i] / (*num_dev); + auto mean = means[i] / (*num_dev); + auto var = variances[i] / (*num_dev); var = var - mean * mean; // sync stats @@ -90,15 +99,21 @@ __global__ void KeSyncAndMovingStats(T *means, T *variances, T *num_dev, } template -static __global__ void KeNormAffine(const T *x, const T *scale, const T *bias, - const T *mean, const T *variance, +static __global__ void KeNormAffine(const T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const BatchNormParamType *mean, + const BatchNormParamType *variance, const double epsilon, const int C, const int M, const int num, T *y) { int gid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; for (int i = gid; i < num; i += stride) { const int c = layout == framework::DataLayout::kNCHW ? 
(i / M) % C : i % C; - y[i] = (x[i] - mean[c]) / sqrt(variance[c] + epsilon) * scale[c] + bias[c]; + auto x_i = static_cast>(x[i]); + auto y_i = + (x_i - mean[c]) / sqrt(variance[c] + epsilon) * scale[c] + bias[c]; + y[i] = static_cast(y_i); } } @@ -126,14 +141,14 @@ class SyncBatchNormKernel : public framework::OpKernel { int x_numel = x->numel(); const T *x_d = x->data(); - const T *s_d = ctx.Input("Scale")->data(); - const T *b_d = ctx.Input("Bias")->data(); + const auto *s_d = ctx.Input("Scale")->data>(); + const auto *b_d = ctx.Input("Bias")->data>(); auto *y = ctx.Output("Y"); T *y_d = y->mutable_data(ctx.GetPlace()); - const T *mean_data = nullptr; - const T *var_data = nullptr; + const BatchNormParamType *mean_data = nullptr; + const BatchNormParamType *var_data = nullptr; auto &dev_ctx = ctx.cuda_device_context(); auto stream = dev_ctx.stream(); @@ -146,53 +161,53 @@ class SyncBatchNormKernel : public framework::OpKernel { if (is_test) { const auto *est_mean = ctx.Input("Mean"); const auto *est_var = ctx.Input("Variance"); - mean_data = est_mean->data(); - var_data = est_var->data(); + mean_data = est_mean->data>(); + var_data = est_var->data>(); } else { - auto &allocator = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); // x, x^2, 1, here 1 is used to calc device num // device num also can be got from platform::DeviceContextPool - const int bytes = (C * 2 + 1) * sizeof(T); - alloc_ptr = allocator.Allocate(bytes); + const int bytes = (C * 2 + 1) * sizeof(BatchNormParamType); + alloc_ptr = memory::Alloc(dev_ctx, bytes); - T *stats = reinterpret_cast(alloc_ptr->ptr()); + auto *stats = reinterpret_cast *>(alloc_ptr->ptr()); const int threads = 256; int grid = std::min(C, (max_threads + threads - 1) / threads); if (layout == framework::DataLayout::kNCHW) { - KeLocalStats< - T, threads, - framework::DataLayout::kNCHW><<>>( - x_d, N, H * W * D, C, stats); + KeLocalStats + <<>>(x_d, N, H * W * D, C, stats); } else { - KeLocalStats< - T, threads, - framework::DataLayout::kNHWC><<>>( - x_d, N, H * W * D, C, stats); + KeLocalStats + <<>>(x_d, N, H * W * D, C, stats); } + // moving mean/variance + auto *mean_out = ctx.Output("MeanOut"); + auto *variance_out = ctx.Output("VarianceOut"); + auto *est_mean_data = + mean_out->mutable_data>(ctx.GetPlace()); + auto *est_var_data = + variance_out->mutable_data>(ctx.GetPlace()); + + auto *saved_mean = ctx.Output("SavedMean"); + auto *saved_inv_variance = ctx.Output("SavedVariance"); + auto *sv_mean_data = + saved_mean->mutable_data>(ctx.GetPlace()); + auto *sv_inv_var_data = + saved_inv_variance->mutable_data>( + ctx.GetPlace()); + Tensor c_g_st; - T *c_g_st_d = c_g_st.mutable_data({2 * C + 1}, platform::CPUPlace()); + auto *c_g_st_d = c_g_st.mutable_data>( + {2 * C + 1}, platform::CPUPlace()); auto gplace = boost::get(ctx.GetPlace()); memory::Copy(platform::CPUPlace(), c_g_st_d, gplace, stats, bytes, 0); - int dtype = platform::ToNCCLDataType(x->type()); + int dtype = platform::ToNCCLDataType(mean_out->type()); // In-place operation - PADDLE_ENFORCE(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( stats, stats, 2 * C + 1, static_cast(dtype), ncclSum, comm, stream)); - // moving mean/variance - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); - T *est_mean_data = mean_out->mutable_data(ctx.GetPlace()); - T *est_var_data = variance_out->mutable_data(ctx.GetPlace()); - - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_inv_variance = 
ctx.Output("SavedVariance"); - T *sv_mean_data = saved_mean->mutable_data(ctx.GetPlace()); - T *sv_inv_var_data = saved_inv_variance->mutable_data(ctx.GetPlace()); - // Note, Input('Mean')/Input('Variance') share variable with // Output('MeanOut')/Output('VarianceOut') KeSyncAndMovingStats<<<(C + block - 1) / block, block, 0, stream>>>( @@ -205,39 +220,40 @@ class SyncBatchNormKernel : public framework::OpKernel { int grid2 = (std::min(x_numel, max_threads) + block - 1) / block; if (layout == framework::DataLayout::kNCHW) { - KeNormAffine<<>>( - x_d, s_d, b_d, mean_data, var_data, epsilon, C, H * W * D, x_numel, - y_d); + KeNormAffine + <<>>(x_d, s_d, b_d, mean_data, var_data, + epsilon, C, H * W * D, x_numel, y_d); } else { - KeNormAffine<<>>( - x_d, s_d, b_d, mean_data, var_data, epsilon, C, H * W * D, x_numel, - y_d); + KeNormAffine + <<>>(x_d, s_d, b_d, mean_data, var_data, + epsilon, C, H * W * D, x_numel, y_d); } } }; template -__global__ void KeBackwardLocalStats(const T *dy, const T *x, const T *means, - int N, int M, int C, T *sum_dy_prod) { - typedef cub::BlockReduce BlockReduce; +__global__ void KeBackwardLocalStats(const T *dy, const T *x, + const BatchNormParamType *means, int N, + int M, int C, + BatchNormParamType *sum_dy_prod) { + typedef cub::BlockReduce, BlockDim> BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; for (int k = blockIdx.x; k < C; k += gridDim.x) { - T sum1 = 0; - T sum2 = 0; - T mean = means[k]; + BatchNormParamType sum1 = 0.; + BatchNormParamType sum2 = 0.; + auto mean = means[k]; for (int i = threadIdx.x; i < N * M; i += blockDim.x) { int id = layout == framework::DataLayout::kNCHW ? (i / M) * C * M + k * M + i % M : i * C + k; - T g = dy[id]; + auto g = static_cast>(dy[id]); sum1 += g; - sum2 += g * (x[id] - mean); + auto x_i = static_cast>(x[id]); + sum2 += g * (x_i - mean); } __syncthreads(); - T out = BlockReduce(temp_storage).Reduce(sum1, cub::Sum()); + auto out = BlockReduce(temp_storage).Reduce(sum1, cub::Sum()); __syncthreads(); if (threadIdx.x == 0) { sum_dy_prod[k] = out; @@ -249,72 +265,75 @@ __global__ void KeBackwardLocalStats(const T *dy, const T *x, const T *means, } } if (blockIdx.x == 0 && threadIdx.x == 0) { - sum_dy_prod[2 * C] = static_cast(1.0); + sum_dy_prod[2 * C] = 1.0; } } template -static __global__ void KeBNBackwardScaleBias(const T *dy, const T *x, - const T *mean, - const T *inv_variance, - const double epsilon, const int N, - const int C, const int HxW, - T *dscale, T *dbias) { +static __global__ void KeBNBackwardScaleBias( + const T *dy, const T *x, const BatchNormParamType *mean, + const BatchNormParamType *inv_variance, const double epsilon, + const int N, const int C, const int HxW, BatchNormParamType *dscale, + BatchNormParamType *dbias) { const int outer_size = C; const int inner_size = N * HxW; - typedef cub::BlockReduce BlockReduce; + typedef cub::BlockReduce, BlockDim> BlockReduce; __shared__ typename BlockReduce::TempStorage temp_storage; for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { - T ds_sum = static_cast(0); - T db_sum = static_cast(0); + BatchNormParamType ds_sum = 0.; + BatchNormParamType db_sum = 0.; - T inv_var_i = inv_variance[i]; - T mean_i = mean[i]; + auto inv_var_i = inv_variance[i]; + auto mean_i = mean[i]; for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { const int id = layout == framework::DataLayout::kNCHW ? 
((j / HxW) * C + i) * HxW + (j % HxW) : j * outer_size + i; - ds_sum += dy[id] * (x[id] - mean_i); - db_sum += dy[id]; + auto x_i = static_cast>(x[id]); + auto dy_i = static_cast>(dy[id]); + ds_sum += dy_i * (x_i - mean_i); + db_sum += dy_i; } __syncthreads(); - double os = BlockReduce(temp_storage) - .Reduce(static_cast(ds_sum), cub::Sum()); + auto os = BlockReduce(temp_storage).Reduce(ds_sum, cub::Sum()); __syncthreads(); - double ob = BlockReduce(temp_storage) - .Reduce(static_cast(db_sum), cub::Sum()); + auto ob = BlockReduce(temp_storage).Reduce(db_sum, cub::Sum()); __syncthreads(); if (threadIdx.x == 0) { - dscale[i] = static_cast(os * inv_var_i); - dbias[i] = static_cast(ob); + dscale[i] = os * inv_var_i; + dbias[i] = ob; } __syncthreads(); } } template -static __global__ void KeBNBackwardData(const T *dy, const T *x, const T *beta, - const T *mean, const T *inv_variance, - const T *g_sum_dy, - const T *g_sum_dy_prod, - const T *num_dev, const double epsilon, - const int C, const int HxW, - const int num, T *dx) { +static __global__ void KeBNBackwardData( + const T *dy, const T *x, const BatchNormParamType *gamma, + const BatchNormParamType *mean, + const BatchNormParamType *inv_variance, + const BatchNormParamType *g_sum_dy, + const BatchNormParamType *g_sum_dy_prod, + const BatchNormParamType *num_dev, const double epsilon, const int C, + const int HxW, const int num, T *dx) { int gid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; - T scale = static_cast(C) / num; - T dev_num = num_dev[0]; + auto scale = static_cast>(C) / num; + auto dev_num = num_dev[0]; for (int i = gid; i < num; i += stride) { const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C; - T inv_var = inv_variance[c]; - T s_d = beta[c]; - T gvar = -1.0 * (g_sum_dy_prod[c] / dev_num) * s_d * inv_var * - (inv_var * inv_var); - T gmean = -1.0 * (g_sum_dy[c] / dev_num) * s_d * inv_var; - - dx[i] = - dy[i] * s_d * inv_var + gmean * scale + gvar * scale * (x[i] - mean[c]); + auto inv_var = inv_variance[c]; + auto s_d = gamma[c]; + auto gvar = + -((g_sum_dy_prod[c] / dev_num) * s_d * inv_var * (inv_var * inv_var)); + auto gmean = -((g_sum_dy[c] / dev_num) * s_d * inv_var); + + auto x_i = static_cast>(x[i]); + auto dy_i = static_cast>(dy[i]); + auto dx_i = + dy_i * s_d * inv_var + gmean * scale + gvar * scale * (x_i - mean[c]); + dx[i] = static_cast(dx_i); } } @@ -348,8 +367,8 @@ class SyncBatchNormGradKernel : public framework::OpKernel { d_x->mutable_data(ctx.GetPlace()); if (d_scale && d_bias) { - d_scale->mutable_data(ctx.GetPlace()); - d_bias->mutable_data(ctx.GetPlace()); + d_scale->mutable_data>(ctx.GetPlace()); + d_bias->mutable_data>(ctx.GetPlace()); } PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL); PADDLE_ENFORCE_EQ(scale->dims()[0], C); @@ -371,13 +390,13 @@ class SyncBatchNormGradKernel : public framework::OpKernel { auto stream = dev_ctx.stream(); auto *comm = dev_ctx.nccl_comm(); - const T *saved_mean = ctx.Input("SavedMean")->data(); - const T *saved_inv_var = ctx.Input("SavedVariance")->data(); - auto &allocator = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); - const int bytes = (C * 2 + 1) * sizeof(T); - auto alloc_ptr = allocator.Allocate(bytes); - T *stats = reinterpret_cast(alloc_ptr->ptr()); + const auto *saved_mean = + ctx.Input("SavedMean")->data>(); + const auto *saved_inv_var = + ctx.Input("SavedVariance")->data>(); + const int bytes = (C * 2 + 1) * sizeof(BatchNormParamType); + auto alloc_ptr = memory::Alloc(dev_ctx, bytes); + auto 
*stats = reinterpret_cast *>(alloc_ptr->ptr()); const int threads = 256; int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); @@ -386,19 +405,17 @@ class SyncBatchNormGradKernel : public framework::OpKernel { int fsize = H * W * D; if (layout == framework::DataLayout::kNCHW) { - KeBackwardLocalStats< - T, threads, - framework::DataLayout::kNCHW><<>>( - dy_d, x_d, saved_mean, N, fsize, C, stats); + KeBackwardLocalStats + <<>>(dy_d, x_d, saved_mean, N, fsize, C, + stats); } else { - KeBackwardLocalStats< - T, threads, - framework::DataLayout::kNHWC><<>>( - dy_d, x_d, saved_mean, N, fsize, C, stats); + KeBackwardLocalStats + <<>>(dy_d, x_d, saved_mean, N, fsize, C, + stats); } - int dtype = platform::ToNCCLDataType(x->type()); + int dtype = platform::ToNCCLDataType(scale->type()); // In-place operation - PADDLE_ENFORCE(platform::dynload::ncclAllReduce( + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( stats, stats, 2 * C + 1, static_cast(dtype), ncclSum, comm, stream)); @@ -406,33 +423,33 @@ class SyncBatchNormGradKernel : public framework::OpKernel { int grid2 = (std::min(x_numel, max_threads) + block - 1) / block; if (layout == framework::DataLayout::kNCHW) { if (d_scale && d_bias) { - KeBNBackwardScaleBias< - T, threads, - framework::DataLayout::kNCHW><<>>( - dy_d, x_d, saved_mean, saved_inv_var, epsilon, N, C, fsize, - d_scale->data(), d_bias->data()); + KeBNBackwardScaleBias + <<>>( + dy_d, x_d, saved_mean, saved_inv_var, epsilon, N, C, fsize, + d_scale->data>(), + d_bias->data>()); } if (d_x) { - KeBNBackwardData< - T, framework::DataLayout::kNCHW><<>>( - dy_d, x_d, scale->data(), saved_mean, saved_inv_var, stats, - stats + C, stats + 2 * C, epsilon, C, fsize, x->numel(), - d_x->data()); + KeBNBackwardData + <<>>( + dy_d, x_d, scale->data>(), saved_mean, + saved_inv_var, stats, stats + C, stats + 2 * C, epsilon, C, + fsize, x->numel(), d_x->data()); } } else { if (d_scale && d_bias) { - KeBNBackwardScaleBias< - T, threads, - framework::DataLayout::kNHWC><<>>( - dy_d, x_d, saved_mean, saved_inv_var, epsilon, N, C, fsize, - d_scale->data(), d_bias->data()); + KeBNBackwardScaleBias + <<>>( + dy_d, x_d, saved_mean, saved_inv_var, epsilon, N, C, fsize, + d_scale->data>(), + d_bias->data>()); } if (d_x) { - KeBNBackwardData< - T, framework::DataLayout::kNHWC><<>>( - dy_d, x_d, scale->data(), saved_mean, saved_inv_var, stats, - stats + C, stats + 2 * C, epsilon, C, fsize, x->numel(), - d_x->data()); + KeBNBackwardData + <<>>( + dy_d, x_d, scale->data>(), saved_mean, + saved_inv_var, stats, stats + C, stats + 2 * C, epsilon, C, + fsize, x->numel(), d_x->data()); } } } @@ -445,8 +462,12 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( sync_batch_norm, ops::SyncBatchNormKernel, - ops::SyncBatchNormKernel); + ops::SyncBatchNormKernel, + ops::SyncBatchNormKernel); REGISTER_OP_CUDA_KERNEL( sync_batch_norm_grad, ops::SyncBatchNormGradKernel, - ops::SyncBatchNormGradKernel); + ops::SyncBatchNormGradKernel, + ops::SyncBatchNormGradKernel); + +// clang-format on diff --git a/paddle/fluid/operators/temporal_shift_op.cc b/paddle/fluid/operators/temporal_shift_op.cc index 3b7d90b7..f2a8ae9a 100644 --- a/paddle/fluid/operators/temporal_shift_op.cc +++ b/paddle/fluid/operators/temporal_shift_op.cc @@ -26,10 +26,10 @@ class TemporalShiftOp : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of TemporalShiftOp should not be 
null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of TemporalShiftOp should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, + "Input(X) of TemporalShiftOp should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, + "Output(Out) of TemporalShiftOp should not be null."); auto dim_x = ctx->GetInputDim("X"); PADDLE_ENFORCE_EQ(dim_x.size(), 4, @@ -38,9 +38,10 @@ class TemporalShiftOp : public framework::OperatorWithKernel { int seg_num = ctx->Attrs().Get("seg_num"); float shift_ratio = ctx->Attrs().Get("shift_ratio"); PADDLE_ENFORCE_GT(seg_num, 0, "Attr(seg_num) should be greater than 0."); - PADDLE_ENFORCE(shift_ratio > 0 || shift_ratio < .5, - "Attr(shift_ratio) should be greater than 0 and less " - "than 0.5."); + PADDLE_ENFORCE_GT(shift_ratio, 0., + "Attr(shift_ratio) should be greater than 0"); + PADDLE_ENFORCE_LT(shift_ratio, 0.5, + "Attr(shift_ratio) should be less than 0.5"); if (ctx->IsRuntime()) { PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/operators/tensor_array_to_tensor_op.cc b/paddle/fluid/operators/tensor_array_to_tensor_op.cc index 2b83c42f..8cba4961 100644 --- a/paddle/fluid/operators/tensor_array_to_tensor_op.cc +++ b/paddle/fluid/operators/tensor_array_to_tensor_op.cc @@ -105,15 +105,7 @@ class LoDTensorArray2TensorOp : public framework::OperatorBase { auto out_inx_dim = out_inx.dims(); out_inx_dim[0] = inx.size(); out_inx.Resize(out_inx_dim); - - auto &local_scope = scope.NewScope(); - std::string var_name = "out_index"; - framework::Variable *tmp_index_var = local_scope.Var(var_name); - auto &tmp_index_tensor = - *(tmp_index_var->GetMutable()); - tmp_index_tensor.Resize(out_inx_dim); - int *tmp_index_data = - tmp_index_tensor.mutable_data(platform::CPUPlace()); + int *tmp_index_data = out_inx.mutable_data(platform::CPUPlace()); auto out_dims = inx[0].dims(); size_t out_dim_sum = 0; @@ -122,18 +114,17 @@ class LoDTensorArray2TensorOp : public framework::OperatorBase { out_dim_sum += inx_dims[axis]; tmp_index_data[index] = inx_dims[axis]; } - out_inx.ShareDataWith(tmp_index_tensor); // get input array items' dims out_dims[axis] = out_dim_sum; out.Resize(out_dims); - LodTensorArray2LodTensorVector(local_scope, base_name, Input("X"), &names); + LodTensorArray2LodTensorVector(scope, base_name, Input("X"), &names); // Invoke concat Op auto concat_op = framework::OpRegistry::CreateOp( "concat", {{"X", names}}, {{"Out", {Output("Out")}}}, attrs); - concat_op->Run(local_scope, place); + concat_op->Run(scope, place); } }; diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 79c9f759..22c0c9e9 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -48,12 +48,14 @@ class TensorRTEngineOp : public framework::OperatorBase { int workspace_size_; std::unique_ptr calibrator_; bool enable_int8_; + bool enable_fp16_; bool use_calib_mode_; std::string calibration_data_; std::string engine_key_; bool calibration_mode_; int predictor_id_; int device_id_; + AnalysisConfig::Precision precision_mode_; public: TensorRTEngineOp(const std::string &type, @@ -66,6 +68,7 @@ class TensorRTEngineOp : public framework::OperatorBase { workspace_size_ = Attr("workspace_size"); device_id_ = Attr("gpu_id"); enable_int8_ = Attr("enable_int8"); + enable_fp16_ = Attr("enable_fp16"); use_calib_mode_ = Attr("use_calib_mode"); calibration_data_ = Attr("calibration_data"); engine_key_ = Attr("engine_key"); @@ -93,6 +96,13 @@ class 
TensorRTEngineOp : public framework::OperatorBase { inference::Singleton::Global() .Get(engine_key_ + std::to_string(predictor_id_)); } + precision_mode_ = AnalysisConfig::Precision::kFloat32; + if (enable_int8_) { + precision_mode_ = AnalysisConfig::Precision::kInt8; + } + if (enable_fp16_) { + precision_mode_ = AnalysisConfig::Precision::kHalf; + } } protected: @@ -141,7 +151,7 @@ class TensorRTEngineOp : public framework::OperatorBase { calib_buffers, runtime_batch, engine_key_, dev_place)); calib_res->thr_.reset(new std::thread([&]() { calib_res->engine_.reset(new TensorRTEngine( - max_batch_size_, workspace_size_, enable_int8_, + max_batch_size_, workspace_size_, precision_mode_, calib_res->calib_.get(), boost::get(dev_place).device)); VLOG(3) << "start the calib trt engine thread"; @@ -173,7 +183,8 @@ class TensorRTEngineOp : public framework::OperatorBase { auto stream = reinterpret_cast(dev_ctx).stream(); - PADDLE_ENFORCE(!input_names_.empty(), "should pass more than one inputs"); + PADDLE_ENFORCE_EQ(input_names_.empty(), false, + "should pass at least one input"); std::vector output_maps = Attr>("output_name_mapping"); @@ -193,7 +204,21 @@ class TensorRTEngineOp : public framework::OperatorBase { // convert input and copy to TRT engine's buffer auto &t = inference::analysis::GetFromScope(scope, x); - auto t_shape = framework::vectorize(t.dims()); + auto t_shape = framework::vectorize(t.dims()); + // check if the input shapes are consistent with model. + if (HasAttr(x + "_shape")) { + std::vector i_shape = Attr>(x + "_shape"); + std::vector model_input_shape(i_shape.begin() + 1, + i_shape.end()); + std::vector runtime_input_shape(t_shape.begin() + 1, + t_shape.end()); + PADDLE_ENFORCE_EQ(model_input_shape == runtime_input_shape, true, + "Input shapes are inconsistent with the model. TRT 5 " + "or lower version " + "does not support dynamic input shapes. 
Please check " + "your input shapes."); + } + runtime_batch = t_shape[0]; const int bind_index = engine->engine()->getBindingIndex(x.c_str()); @@ -241,7 +266,7 @@ class TensorRTEngineOp : public framework::OperatorBase { trt_engine_ = inference::Singleton::Global() .Create(engine_key_ + std::to_string(predictor_id_), - max_batch_size_, workspace_size_, enable_int8_, + max_batch_size_, workspace_size_, precision_mode_, calibrator_.get(), device_id_); PrepareTRTEngine(scope, trt_engine_); } diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index efc50fc0..e813e9ca 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -105,6 +105,7 @@ TEST(TensorRTEngineOp, manual) { engine_op_desc.SetAttr("predictor_id", 1); engine_op_desc.SetAttr("calibration_data", std::string("")); engine_op_desc.SetAttr("enable_int8", static_cast(false)); + engine_op_desc.SetAttr("enable_fp16", static_cast(false)); engine_op_desc.SetAttr("use_calib_mode", static_cast(false)); engine_op_desc.SetAttr("output_name_mapping", std::vector({"z0"})); @@ -205,6 +206,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { engine_op_desc.SetAttr("predictor_id", 1); engine_op_desc.SetAttr("calibration_data", std::string("")); engine_op_desc.SetAttr("enable_int8", static_cast(false)); + engine_op_desc.SetAttr("enable_fp16", static_cast(false)); engine_op_desc.SetAttr("use_calib_mode", static_cast(false)); engine_op_desc.SetAttr("output_name_mapping", std::vector({"z3"})); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass.h b/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.cc similarity index 56% rename from paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass.h rename to paddle/fluid/operators/test_leaky_relu_grad_grad_functor.cc index c898be69..9a06a9a2 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_brelu_mkldnn_fuse_pass.h +++ b/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.cc @@ -12,28 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. -#pragma once - -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" -#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h" namespace paddle { -namespace framework { -namespace ir { +namespace operators { -/* - * Fuse the CONV and ReLU6 to a ConvReLU6Op. 
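
// A minimal standalone sketch (not part of the patch) of the precision
// selection added to TensorRTEngineOp above. Because the fp16 branch is
// assigned after the int8 branch, enable_fp16 wins when both attributes are
// set. The Precision enum below is a hypothetical stand-in for
// AnalysisConfig::Precision.

#include <cassert>

enum class Precision { kFloat32, kInt8, kHalf };

// Mirrors the order of the two if-statements in the TensorRTEngineOp ctor.
static Precision SelectPrecision(bool enable_int8, bool enable_fp16) {
  Precision p = Precision::kFloat32;
  if (enable_int8) p = Precision::kInt8;
  if (enable_fp16) p = Precision::kHalf;  // later assignment overrides int8
  return p;
}

int main() {
  assert(SelectPrecision(false, false) == Precision::kFloat32);
  assert(SelectPrecision(true, false) == Precision::kInt8);
  assert(SelectPrecision(true, true) == Precision::kHalf);
  return 0;
}
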
- */ -class ConvBReLUFusePass : public FusePassBase { - public: - virtual ~ConvBReLUFusePass() {} +TEST(leaky_relu_grad_grad, test_cpu) { + ASSERT_TRUE( + TestLeakyReluGradGradMain({32, 64}, platform::CPUPlace(), 0.02)); +} - protected: - void ApplyImpl(ir::Graph* graph) const override; -}; +TEST(leaky_relu_grad_grad, test_cpu_zero_alpha) { + ASSERT_TRUE( + TestLeakyReluGradGradMain({32, 64}, platform::CPUPlace(), 0.0)); +} -} // namespace ir -} // namespace framework +} // namespace operators } // namespace paddle diff --git a/paddle/fluid/recordio/header_test.cc b/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.cu similarity index 52% rename from paddle/fluid/recordio/header_test.cc rename to paddle/fluid/operators/test_leaky_relu_grad_grad_functor.cu index 00f1887d..6f0f840b 100644 --- a/paddle/fluid/recordio/header_test.cc +++ b/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -12,18 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/recordio/header.h" +#include "paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h" -#include +namespace paddle { +namespace operators { -#include "gtest/gtest.h" +TEST(leaky_relu_grad_grad, test_gpu) { + ASSERT_TRUE( + TestLeakyReluGradGradMain({32, 64}, platform::CUDAPlace(0), 0.15)); +} -TEST(Recordio, ChunkHead) { - paddle::recordio::Header hdr(0, 1, paddle::recordio::Compressor::kGzip, 3); - std::stringstream ss; - hdr.Write(ss); - ss.seekg(0, std::ios::beg); - paddle::recordio::Header hdr2; - hdr2.Parse(ss); - EXPECT_TRUE(hdr == hdr2); +TEST(leaky_relu_grad_grad, test_gpu_zero_alpha) { + ASSERT_TRUE( + TestLeakyReluGradGradMain({32, 64}, platform::CUDAPlace(0), 0.0)); } + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h b/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h new file mode 100644 index 00000000..f416aa6e --- /dev/null +++ b/paddle/fluid/operators/test_leaky_relu_grad_grad_functor.h @@ -0,0 +1,124 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
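
// A standalone sketch (not from the patch) of the element-wise reference that
// the leaky_relu_grad_grad tests above validate: the second-order gradient is
// pass-through where out > 0 and scaled by alpha elsewhere, exactly what
// LeakyReluGradGradEachElementFunctor encodes.

#include <cassert>
#include <vector>

static std::vector<float> LeakyReluDDOut(const std::vector<float>& ddx,
                                         const std::vector<float>& out,
                                         float alpha) {
  std::vector<float> ddout(ddx.size());
  for (size_t i = 0; i < ddx.size(); ++i) {
    ddout[i] = out[i] > 0.f ? ddx[i] : ddx[i] * alpha;
  }
  return ddout;
}

int main() {
  // Same alpha as the CPU test case above.
  auto ddout = LeakyReluDDOut({1.f, 1.f}, {2.f, -2.f}, 0.02f);
  assert(ddout[0] == 1.f);    // positive side: identity
  assert(ddout[1] == 0.02f);  // negative side: scaled by alpha
  return 0;
}
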
+ +#pragma once + +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +template +static void InitRandom(framework::Tensor *tensor, + const platform::Place &place) { + framework::Tensor cpu_tensor; + auto *cpu_ptr = + cpu_tensor.mutable_data(tensor->dims(), platform::CPUPlace()); + int64_t numel = cpu_tensor.numel(); + std::mt19937 engine; + std::uniform_real_distribution dist(static_cast(-2.0), + static_cast(2.0)); + for (int64_t i = 0; i < numel; ++i) { + cpu_ptr[i] = dist(engine); + } + framework::TensorCopySync(cpu_tensor, place, tensor); +} + +template +struct LeakyReluGradGradEachElementFunctor { + LeakyReluGradGradEachElementFunctor(const T *ddx, const T *out, T alpha, + T *ddout) + : ddx_(ddx), out_(out), alpha_(alpha), ddout_(ddout) {} + + HOSTDEVICE void operator()(int idx) { + if (out_[idx] > 0) { + ddout_[idx] = ddx_[idx]; + } else { + ddout_[idx] = ddx_[idx] * alpha_; + } + } + + const T *ddx_; + const T *out_; + T alpha_; + T *ddout_; +}; + +template +static bool TestLeakyReluGradGradMain(const framework::DDim &dim, + const platform::Place &place, + float alpha) { + LeakyReluGradGradFunctor functor; + functor.alpha = alpha; + auto &dev_ctx = *platform::DeviceContextPool::Instance().Get(place); + framework::Tensor *x = nullptr; + framework::Tensor *dout = nullptr; + framework::Tensor *dx = nullptr; + + framework::Tensor out; + out.Resize(dim); + InitRandom(&out, place); + + framework::Tensor ddx; + ddx.Resize(dim); + InitRandom(&ddx, place); + + framework::Tensor ddout; + ddout.Resize(dim); + InitRandom(&ddout, place); + + framework::Tensor ddout_actual; + ddout_actual.mutable_data(dim, place); + LeakyReluGradGradEachElementFunctor actual_functor( + ddx.data(), out.data(), static_cast(alpha), + ddout_actual.data()); + + int64_t limit = out.numel(); + +#ifdef __NVCC__ + if (platform::is_gpu_place(place)) { + auto &cuda_dev_ctx = dynamic_cast(dev_ctx); + functor(cuda_dev_ctx, x, &out, &ddx, &ddout, dout, dx); + platform::ForRange for_range(cuda_dev_ctx, + limit); + for_range(actual_functor); + } else { +#endif + auto &cpu_dev_ctx = dynamic_cast(dev_ctx); + functor(cpu_dev_ctx, x, &out, &ddx, &ddout, dout, dx); + platform::ForRange for_range(cpu_dev_ctx, + limit); + for_range(actual_functor); +#ifdef __NVCC__ + } +#endif + + dev_ctx.Wait(); + + framework::Tensor ddout_cpu, ddout_actual_cpu; + framework::TensorCopySync(ddout, platform::CPUPlace(), &ddout_cpu); + framework::TensorCopySync(ddout_actual, platform::CPUPlace(), + &ddout_actual_cpu); + + bool is_equal = std::equal(ddout_cpu.data(), ddout_cpu.data() + limit, + ddout_actual_cpu.data()); + return is_equal; +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index c27039dd..fe243a3b 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -14,7 +14,6 @@ limitations under the License. 
*/
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/top_k_op.h"
-#include "paddle/fluid/platform/assert.h"
 #include "paddle/fluid/platform/cuda_device_function.h"
 #include "paddle/fluid/platform/float16.h"
diff --git a/paddle/fluid/operators/uniform_random_batch_size_like_op.cc b/paddle/fluid/operators/uniform_random_batch_size_like_op.cc
index 7260fe25..598c9042 100644
--- a/paddle/fluid/operators/uniform_random_batch_size_like_op.cc
+++ b/paddle/fluid/operators/uniform_random_batch_size_like_op.cc
@@ -56,6 +56,14 @@ with random values sampled from a uniform distribution.
              "Note that if seed is not 0, this operator will always "
              "generate the same random numbers every time.")
         .SetDefault(0);
+    AddAttr<int>("diag_num",
+                 "The number of diag elements. Note that if "
+                 "diag_num is 0, it means without diag init.[default 0].")
+        .SetDefault(0);
+    AddAttr<int>("diag_step", "The step between two diag element.[default 0].")
+        .SetDefault(0);
+    AddAttr<float>("diag_val", "The value of diag element. [default 1.0].")
+        .SetDefault(1.0f);
     AddAttr<int>("dtype", "(int, default 5(FP32)) Output tensor data type")
         .SetDefault(framework::proto::VarType::FP32);
   }
diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc
index bb6a1c5b..35fa0d7f 100644
--- a/paddle/fluid/operators/uniform_random_op.cc
+++ b/paddle/fluid/operators/uniform_random_op.cc
@@ -53,6 +53,19 @@ class CPUUniformRandomKernel : public framework::OpKernel<T> {
     for (int64_t i = 0; i < size; ++i) {
       data[i] = dist(engine);
     }
+    unsigned int diag_num =
+        static_cast<unsigned int>(ctx.Attr<int>("diag_num"));
+    unsigned int diag_step =
+        static_cast<unsigned int>(ctx.Attr<int>("diag_step"));
+    auto diag_val = static_cast<T>(ctx.Attr<float>("diag_val"));
+    if (diag_num > 0) {
+      PADDLE_ENFORCE_GT(size, (diag_num - 1) * (diag_step + 1),
+                        "The index of diagonal elements is out of bounds");
+      for (int64_t i = 0; i < diag_num; ++i) {
+        int64_t pos = i * diag_step + i;
+        data[pos] = diag_val;
+      }
+    }
   }
 };
 
@@ -61,13 +74,17 @@ class UniformRandomOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of UniformRandomOp should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true,
+                      "Output(Out) of UniformRandomOp should not be null.");
 
-    PADDLE_ENFORCE(
-        ctx->Attrs().Get<float>("min") < ctx->Attrs().Get<float>("max"),
-        "uniform_random's min must less then max");
+    PADDLE_ENFORCE_LT(ctx->Attrs().Get<float>("min"),
+                      ctx->Attrs().Get<float>("max"),
+                      "uniform_random's min must less then max");
     auto &shape = ctx->Attrs().Get<std::vector<int>>("shape");
+    PADDLE_ENFORCE_GE(ctx->Attrs().Get<int>("diag_num"), 0,
+                      "diag_num must greater than or equal 0");
+    PADDLE_ENFORCE_GE(ctx->Attrs().Get<int>("diag_step"), 0,
+                      "diag_step must greater than or equal 0");
     std::vector<int64_t> temp;
     temp.reserve(shape.size());
     for (auto dim : shape) {
@@ -105,6 +122,14 @@ uniform distribution. The random result is in set [min, max].
              "Note that if seed is not 0, this operator will always "
              "generate the same random numbers every time. [default 0].")
         .SetDefault(0);
+    AddAttr<int>("diag_num",
+                 "The number of diag elements. Note that if "
+                 "diag_num is 0, it means without diag init.[default 0].")
+        .SetDefault(0);
+    AddAttr<int>("diag_step", "The step between two diag element.[default 0].")
+        .SetDefault(0);
+    AddAttr<float>("diag_val", "The value of diag element. [default 1.0].")
+        .SetDefault(1.0f);
     AddAttr<int>("dtype", "Output tensor data type.
[default 5(FP32)].") .SetDefault(framework::proto::VarType::FP32); } diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu index 2bb0ecc1..a9f10d8b 100644 --- a/paddle/fluid/operators/uniform_random_op.cu +++ b/paddle/fluid/operators/uniform_random_op.cu @@ -23,16 +23,29 @@ template struct UniformGenerator { T min_, max_; unsigned int seed_; - - __host__ __device__ UniformGenerator(T min, T max, int seed) - : min_(min), max_(max), seed_(seed) {} + T diag_val_; + unsigned int diag_num_; + unsigned int diag_step_; + __host__ __device__ UniformGenerator(T min, T max, int seed, int diag_num, + int diag_step, T diag_val) + : min_(min), + max_(max), + seed_(seed), + diag_num_(diag_num), + diag_step_(diag_step), + diag_val_(diag_val) {} __host__ __device__ T operator()(const unsigned int n) const { thrust::minstd_rand rng; rng.seed(seed_); thrust::uniform_real_distribution dist(min_, max_); rng.discard(n); - return dist(rng); + T out = dist(rng); + unsigned int remainder = n % (diag_step_ + 1); + if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { + out = diag_val_; + } + return out; } }; @@ -64,11 +77,17 @@ class GPUUniformRandomKernel : public framework::OpKernel { } T min = static_cast(context.Attr("min")); T max = static_cast(context.Attr("max")); + unsigned int diag_num = + static_cast(context.Attr("diag_num")); + unsigned int diag_step = + static_cast(context.Attr("diag_step")); + T diag_val = static_cast(context.Attr("diag_val")); thrust::counting_iterator index_sequence_begin(0); int64_t size = tensor->numel(); - thrust::transform(index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - UniformGenerator(min, max, seed)); + thrust::transform( + index_sequence_begin, index_sequence_begin + size, + thrust::device_ptr(data), + UniformGenerator(min, max, seed, diag_num, diag_step, diag_val)); } }; diff --git a/paddle/fluid/operators/unique_op.h b/paddle/fluid/operators/unique_op.h index b6e41347..4b492e9c 100644 --- a/paddle/fluid/operators/unique_op.h +++ b/paddle/fluid/operators/unique_op.h @@ -28,10 +28,12 @@ struct UniqueOpFunctor { framework::Tensor* out_; framework::Tensor* index_; const framework::Tensor* in_; + framework::Tensor* count_; UniqueOpFunctor(framework::Tensor* out, framework::Tensor* index, - const framework::Tensor* in) - : out_(out), index_(index), in_(in) {} + const framework::Tensor* in, + framework::Tensor* count = nullptr) + : out_(out), index_(index), in_(in), count_(count) {} template void apply() const { @@ -50,8 +52,8 @@ struct UniqueOpFunctor { for (auto i = 0; i < in_->numel(); i++) { auto it = dict.find(in_data[i]); if (it == dict.end()) { - dict.insert(std::make_pair(in_data[i], j)); - uniq.push_back(in_data[i]); + dict.emplace(std::make_pair(in_data[i], j)); + uniq.emplace_back(in_data[i]); index_data[i] = static_cast(j); j++; } else { @@ -59,6 +61,37 @@ struct UniqueOpFunctor { } } + if (count_ != nullptr) { + // Resize the count tensor dims to allocate the memory + count_->Resize(framework::make_ddim({static_cast(uniq.size())})); + IndexT* count_data = count_->mutable_data(platform::CPUPlace()); + // init count_data to 0 + memset(count_data, 0, uniq.size() * sizeof(IndexT)); + + const auto& index_type = index_->type(); + bool index_type_match = index_type == framework::proto::VarType::INT32 || + index_type == framework::proto::VarType::INT64; + PADDLE_ENFORCE( + index_type_match, + "Index holds the wrong type, it holds %s, but desires to be %s or %s", + 
paddle::framework::DataTypeToString(index_type), + paddle::framework::DataTypeToString(framework::proto::VarType::INT32), + paddle::framework::DataTypeToString( + framework::proto::VarType::INT64)); + + if (index_type == framework::proto::VarType::INT32) { + for (auto i = 0; i < in_->numel(); ++i) { + const IndexT& index = index_data[i]; + count_data[static_cast(index)] += static_cast(1); + } + } else { + for (auto i = 0; i < in_->numel(); ++i) { + const IndexT& index = index_data[i]; + count_data[static_cast(index)] += static_cast(1); + } + } + } + out_->Resize(framework::make_ddim({static_cast(uniq.size())})); auto out_data = out_->mutable_data(platform::CPUPlace()); std::memcpy(out_data, uniq.data(), uniq.size() * sizeof(InT)); diff --git a/paddle/fluid/operators/unique_with_counts_op.cc b/paddle/fluid/operators/unique_with_counts_op.cc new file mode 100644 index 00000000..770bbefe --- /dev/null +++ b/paddle/fluid/operators/unique_with_counts_op.cc @@ -0,0 +1,71 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/unique_with_counts_op.h" + +namespace paddle { +namespace operators { + +class UniqueWithCountsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of UniqueWithCountsOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of UniqueWithCountsOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Index"), + "Output(Index) of UniqueWithCountsOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Count"), + "Output(Count) of UniqueWithCountsOp should not be null."); + + auto in_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE(in_dims.size() == 1, + "The op of fluid.layers.unique_with_counts, Input(X) should " + "be a vector."); + + ctx->SetOutputDim("Out", {-1}); + ctx->SetOutputDim("Index", in_dims); + ctx->SetOutputDim("Count", {-1}); + } +}; + +class UniqueWithCountsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Input tensor. It should be a 1-D tensor."); + AddAttr("dtype", "data type for output index"); + AddOutput("Out", "A unique subsequence for input tensor."); + AddOutput("Index", + "An index tensor pointing to unique subsequence, which has " + "identical shape with input tensor and the data type is set by " + "the attr `dtype`"); + AddOutput("Count", "A subsequence for the count of unique index"); + AddComment(R"DOC( + Return a unique subsequence for 1-D input tensor, index tensor pointing to this unique subsequence, + and the subsequence for the count of unique index. 
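
// A standalone sketch (not from the patch) of what Out/Index/Count hold for a
// 1-D input, mirroring the hash-map pass in UniqueOpFunctor with the new
// Count output. E.g. x = {2, 3, 3, 1, 5, 3} gives out = {2, 3, 1, 5},
// index = {0, 1, 1, 2, 3, 1} and count = {1, 3, 1, 1}.

#include <cassert>
#include <cstdint>
#include <unordered_map>
#include <vector>

static void UniqueWithCounts(const std::vector<int64_t>& x,
                             std::vector<int64_t>* out,
                             std::vector<int64_t>* index,
                             std::vector<int64_t>* count) {
  std::unordered_map<int64_t, int64_t> dict;  // value -> position in out
  for (auto v : x) {
    auto it = dict.find(v);
    if (it == dict.end()) {
      dict.emplace(v, static_cast<int64_t>(out->size()));
      index->push_back(static_cast<int64_t>(out->size()));
      out->push_back(v);
      count->push_back(1);  // first occurrence
    } else {
      index->push_back(it->second);
      ++(*count)[it->second];  // repeated occurrence bumps its counter
    }
  }
}

int main() {
  std::vector<int64_t> out, index, count;
  UniqueWithCounts({2, 3, 3, 1, 5, 3}, &out, &index, &count);
  assert(out == (std::vector<int64_t>{2, 3, 1, 5}));
  assert(count == (std::vector<int64_t>{1, 3, 1, 1}));
  return 0;
}
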
+)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(unique_with_counts, ops::UniqueWithCountsOp, + ops::UniqueWithCountsOpMaker); +REGISTER_OP_CPU_KERNEL(unique_with_counts, ops::UniqueWithCountsKernel, + ops::UniqueWithCountsKernel, + ops::UniqueWithCountsKernel, + ops::UniqueWithCountsKernel); diff --git a/paddle/fluid/operators/unique_with_counts_op.h b/paddle/fluid/operators/unique_with_counts_op.h new file mode 100644 index 00000000..f61bac7c --- /dev/null +++ b/paddle/fluid/operators/unique_with_counts_op.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/unique_op.h" + +namespace paddle { +namespace operators { + +template +class UniqueWithCountsKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto data_type = static_cast( + context.Attr("dtype")); + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + auto* index = context.Output("Index"); + auto* count = context.Output("Count"); + framework::VisitDataType(data_type, + UniqueOpFunctor(out, index, x, count)); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc index 405943ad..fc849e73 100644 --- a/paddle/fluid/operators/unsqueeze_op.cc +++ b/paddle/fluid/operators/unsqueeze_op.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/unsqueeze_op.h" +#include #include #include #include "paddle/fluid/framework/op_registry.h" @@ -19,20 +21,22 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -class UnsqueezeOpInferShape : public framework::InferShapeBase { +class UnsqueezeOp : public framework::OperatorWithKernel { public: - void operator()(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of Unsqueeze operator should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of Unsqueeze operator should not be null."); + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, + "Input(X) of Unsqueeze operator should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, + "Output(Out) of Unsqueeze operator should not be null."); const auto &axes = ctx->Attrs().Get>("axes"); const auto &x_dims = ctx->GetInputDim("X"); // Validity Check: input tensor dims (<6). - PADDLE_ENFORCE(x_dims.size() <= 6, - "Invalid dimensions, the rank of Input(X) " - "should be in the range of [1, 6] (Eigen limit)"); + PADDLE_ENFORCE_LE(x_dims.size(), 6, + "Invalid dimensions, the rank of Input(X) " + "should be in the range of [1, 6] (Eigen limit)"); auto out_dims = GetOutputShape(axes, x_dims); ctx->SetOutputDim("Out", out_dims); if (x_dims[0] == out_dims[0]) { @@ -49,15 +53,14 @@ class UnsqueezeOpInferShape : public framework::InferShapeBase { std::vector output_shape(output_size, 0); // Validity Check: rank range. - PADDLE_ENFORCE(output_size <= 6, - "The output tensor's rank should be less than 6."); + PADDLE_ENFORCE_LE(output_size, 6, + "The output tensor's rank should be less than 6."); for (int axis : unsqz_dims) { int cur = axis < 0 ? axis + cur_output_size + 1 : axis; // Vaildity Check: the axis bound - PADDLE_ENFORCE( - cur >= 0 && cur <= cur_output_size, - "The unsqueeze dims must be within range of current rank."); + PADDLE_ENFORCE_GE(cur, 0); + PADDLE_ENFORCE_LE(cur, cur_output_size); // Move old axis, and insert new axis for (int i = cur_output_size; i >= cur; --i) { if (output_shape[i] == 1) { @@ -82,27 +85,6 @@ class UnsqueezeOpInferShape : public framework::InferShapeBase { } }; -class UnsqueezeOp : public framework::OperatorBase { - public: - using OperatorBase::OperatorBase; - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - auto &axes = Attr>("axes"); - auto x_dims = scope.FindVar(Input("X"))->Get().dims(); - auto out_dims = UnsqueezeOpInferShape::GetOutputShape(axes, x_dims); - - framework::AttributeMap attrs; - attrs["shape"] = framework::vectorize2int(out_dims); - // Invoke Reshape op. - auto reshape_op = framework::OpRegistry::CreateOp( - "reshape", {{"X", {Input("X")}}, {"Shape", {}}}, - {{"Out", {Output("Out")}}}, attrs); - reshape_op->Run(scope, place); - } -}; - class UnsqueezeOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -112,17 +94,17 @@ class UnsqueezeOpMaker : public framework::OpProtoAndCheckerMaker { "(std::vector). List of integers," " indicating the dimensions to be inserted") .AddCustomChecker([](const std::vector &axes) { - PADDLE_ENFORCE(!axes.empty(), - "Invalid axes, The unsqueeze axes is empty."); + PADDLE_ENFORCE_EQ(!axes.empty(), true, + "Invalid axes, The unsqueeze axes is empty."); // Validity Check: axes dims (<6). 
- PADDLE_ENFORCE(static_cast(axes.size()) < 6, - "Invalid dimensions, dynamic dimensions should be " - "within [1, 6] dimensions (Eigen limit)."); + PADDLE_ENFORCE_LT(static_cast(axes.size()), 6, + "Invalid dimensions, dynamic dimensions should be " + "within [1, 6] dimensions (Eigen limit)."); // Validity Check: the range of unsqueeze aixs. for (int axis : axes) { - PADDLE_ENFORCE(axis < 6, - "Invalid dimensions, input axis should be" - " within [1, 6] dimensions (Eigen limit)."); + PADDLE_ENFORCE_LT(axis, 6, + "Invalid dimensions, input axis should be" + " within [1, 6] dimensions (Eigen limit)."); } }); AddComment(R"DOC( @@ -139,47 +121,47 @@ class UnsqueezeOpMaker : public framework::OpProtoAndCheckerMaker { } }; -class UnsqueezeGradInferShape : public framework::InferShapeBase { +class UnsqueezeGradOp : public framework::OperatorWithKernel { public: - void operator()(framework::InferShapeContext *ctx) const override { + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); ctx->ShareLoD("X", framework::GradVarName("X")); } }; -class UnsqueezeGradOp : public framework::OperatorBase { - public: - using OperatorBase::OperatorBase; - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - auto dx_name = Output(framework::GradVarName("X")); - auto dout_name = Input(framework::GradVarName("Out")); - auto x_dims = scope.FindVar(Input("X"))->Get().dims(); - - framework::AttributeMap attrs; - attrs["shape"] = framework::vectorize2int(x_dims); - - auto reshape_op = framework::OpRegistry::CreateOp( - "reshape", {{"X", {dout_name}}, {"Shape", {}}}, {{"Out", {dx_name}}}, - attrs); - reshape_op->Run(scope, place); - } -}; - // FIXME(zcd): unsqueeze2 adds an intermediate output(XShape) based on // unsqueeze, the XShape is used to carry the shape and lod of X which // will be used in unsqueeze_grad, in this way, the framework can reuse // the memory of X immediately the unsqueeze2_op is finished. // Considering compatibility issues, we could not fix unsqueeze2_op -class Unsqueeze2OpInferShape : public UnsqueezeOpInferShape { +class Unsqueeze2Op : public framework::OperatorWithKernel { public: - void operator()(framework::InferShapeContext *ctx) const override { - UnsqueezeOpInferShape::operator()(ctx); - PADDLE_ENFORCE(ctx->HasOutput("XShape"), - "Output(XShape) of Unsqueeze operator should not be null."); + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, + "Input(X) of Unsqueeze operator should not be null."); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, + "Output(Out) of Unsqueeze operator should not be null."); + + const auto &axes = ctx->Attrs().Get>("axes"); const auto &x_dims = ctx->GetInputDim("X"); + // Validity Check: input tensor dims (<6). + PADDLE_ENFORCE_LE(x_dims.size(), 6, + "Invalid dimensions, the rank of Input(X) " + "should be in the range of [1, 6] (Eigen limit)"); + auto out_dims = UnsqueezeOp::GetOutputShape(axes, x_dims); + ctx->SetOutputDim("Out", out_dims); + if (x_dims[0] == out_dims[0]) { + // Only pass LoD when the first dimension of output and Input(X) + // are the same. 
+ ctx->ShareLoD("X", "Out"); + } + + PADDLE_ENFORCE_EQ( + ctx->HasOutput("XShape"), true, + "Output(XShape) of Unsqueeze operator should not be null."); std::vector xshape_dims(x_dims.size() + 1); xshape_dims[0] = 0; for (int i = 0; i < x_dims.size(); ++i) { @@ -201,27 +183,6 @@ class Unsqueeze2OpMaker : public UnsqueezeOpMaker { } }; -class Unsqueeze2Op : public framework::OperatorBase { - public: - using OperatorBase::OperatorBase; - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - auto &axes = Attr>("axes"); - auto x_dims = scope.FindVar(Input("X"))->Get().dims(); - auto out_dims = Unsqueeze2OpInferShape::GetOutputShape(axes, x_dims); - - framework::AttributeMap attrs; - attrs["shape"] = framework::vectorize2int(out_dims); - // Invoke Reshape op. - auto reshape_op = framework::OpRegistry::CreateOp( - "reshape2", {{"X", {Input("X")}}, {"Shape", {}}}, - {{"Out", {Output("Out")}}, {"XShape", {Output("XShape")}}}, attrs); - reshape_op->Run(scope, place); - } -}; - class Unsqueeze2GradOpMaker : public framework::SingleGradOpDescMaker { public: using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; @@ -237,57 +198,70 @@ class Unsqueeze2GradOpMaker : public framework::SingleGradOpDescMaker { } }; -class Unsqueeze2GradInferShape : public framework::InferShapeBase { +class Unsqueeze2GradOp : public framework::OperatorWithKernel { public: - void operator()(framework::InferShapeContext *context) const override { - PADDLE_ENFORCE(context->HasInput("XShape"), - "Input(XShape) shouldn't be null."); - PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) shouldn't be null."); + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE_EQ(context->HasInput("XShape"), true, + "Input(XShape) shouldn't be null."); + PADDLE_ENFORCE_EQ(context->HasInput(framework::GradVarName("Out")), true, + "Input(Out@GRAD) shouldn't be null."); auto xshape_dims = context->GetInputDim("XShape"); auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); context->SetOutputDim(framework::GradVarName("X"), x_dims); context->ShareLoD("XShape", framework::GradVarName("X")); } -}; -class Unsqueeze2GradOp : public framework::OperatorBase { - public: - using OperatorBase::OperatorBase; - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - auto dx_name = Output(framework::GradVarName("X")); - auto dout_name = Input(framework::GradVarName("Out")); - auto xshape_name = Input("XShape"); - auto xshape_dims = - scope.FindVar(xshape_name)->Get().dims(); - auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); - - framework::AttributeMap attrs; - attrs["shape"] = framework::vectorize2int(x_dims); - - auto reshape_op = framework::OpRegistry::CreateOp( - "reshape2", {{"X", {dout_name}}, {"Shape", {}}}, - {{"Out", {dx_name}}, {"XShape", {xshape_name}}}, attrs); - reshape_op->Run(scope, place); + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.device_context()); } }; + +DECLARE_INPLACE_OP_INFERER(UnsqueezeInplaceInferer, {"X", "Out"}); +DECLARE_INPLACE_OP_INFERER(UnsqueezeGradInplaceInferer, + {framework::GradVarName("Out"), + framework::GradVarName("X")}); } // namespace operators } // namespace paddle -// Tell 
linker to use reshape op. -USE_OP(reshape); - namespace ops = paddle::operators; REGISTER_OPERATOR(unsqueeze, ops::UnsqueezeOp, ops::UnsqueezeOpMaker, - ops::UnsqueezeOpInferShape, paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(unsqueeze_grad, ops::UnsqueezeGradOp, - ops::UnsqueezeGradInferShape); +REGISTER_OPERATOR(unsqueeze_grad, ops::UnsqueezeGradOp); REGISTER_OPERATOR(unsqueeze2, ops::Unsqueeze2Op, ops::Unsqueeze2OpMaker, - ops::Unsqueeze2OpInferShape, ops::Unsqueeze2GradOpMaker); + ops::Unsqueeze2GradOpMaker, ops::UnsqueezeInplaceInferer); REGISTER_OPERATOR(unsqueeze2_grad, ops::Unsqueeze2GradOp, - ops::Unsqueeze2GradInferShape); + ops::UnsqueezeGradInplaceInferer); + +REGISTER_OP_CPU_KERNEL( + unsqueeze, ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel); +REGISTER_OP_CPU_KERNEL( + unsqueeze_grad, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel); +REGISTER_OP_CPU_KERNEL( + unsqueeze2, + ops::Unsqueeze2Kernel, + ops::Unsqueeze2Kernel, + ops::Unsqueeze2Kernel, + ops::Unsqueeze2Kernel, + ops::Unsqueeze2Kernel); +REGISTER_OP_CPU_KERNEL( + unsqueeze2_grad, + ops::Unsqueeze2GradKernel, + ops::Unsqueeze2GradKernel, + ops::Unsqueeze2GradKernel, + ops::Unsqueeze2GradKernel, + ops::Unsqueeze2GradKernel); diff --git a/paddle/fluid/operators/unsqueeze_op.cu.cc b/paddle/fluid/operators/unsqueeze_op.cu.cc new file mode 100644 index 00000000..fbdec5af --- /dev/null +++ b/paddle/fluid/operators/unsqueeze_op.cu.cc @@ -0,0 +1,45 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/unsqueeze_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + unsqueeze, ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel, + ops::UnsqueezeKernel); +REGISTER_OP_CUDA_KERNEL( + unsqueeze_grad, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel, + ops::UnsqueezeGradKernel); +REGISTER_OP_CUDA_KERNEL( + unsqueeze2, + ops::Unsqueeze2Kernel, + ops::Unsqueeze2Kernel, + ops::Unsqueeze2Kernel, + ops::Unsqueeze2Kernel, + ops::Unsqueeze2Kernel); +REGISTER_OP_CUDA_KERNEL( + unsqueeze2_grad, + ops::Unsqueeze2GradKernel, + ops::Unsqueeze2GradKernel, + ops::Unsqueeze2GradKernel, + ops::Unsqueeze2GradKernel, + ops::Unsqueeze2GradKernel); diff --git a/paddle/fluid/operators/unsqueeze_op.h b/paddle/fluid/operators/unsqueeze_op.h new file mode 100644 index 00000000..68f0cbe8 --- /dev/null +++ b/paddle/fluid/operators/unsqueeze_op.h @@ -0,0 +1,137 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/pooling.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace operators { + +template +class UnsqueezeKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto &axes = context.Attr>("axes"); + auto *in = context.Input("X"); + auto *out = context.Output("Out"); + auto x_dims = in->dims(); + auto out_dims = GetOutputShape(axes, x_dims); + + out->mutable_data(context.GetPlace(), in->type()); + framework::TensorCopy( + *in, context.GetPlace(), + context.template device_context(), out); + out->Resize(out_dims); + } + + static framework::DDim GetOutputShape(const std::vector unsqz_dims, + const framework::DDim &in_dims) { + int output_size = in_dims.size() + static_cast(unsqz_dims.size()); + int cur_output_size = in_dims.size(); + std::vector output_shape(output_size, 0); + + // Validity Check: rank range. + PADDLE_ENFORCE_LE(output_size, 6, + "The output tensor's rank should be less than 6."); + + for (int axis : unsqz_dims) { + int cur = axis < 0 ? axis + cur_output_size + 1 : axis; + // Vaildity Check: the axis bound + PADDLE_ENFORCE_GE(cur, 0); + PADDLE_ENFORCE_LE(cur, cur_output_size); + // Move old axis, and insert new axis + for (int i = cur_output_size; i >= cur; --i) { + if (output_shape[i] == 1) { + // Move axis + output_shape[i + 1] = 1; + output_shape[i] = 0; + } + } + output_shape[cur] = 1; + // Add the output size. 
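
// A standalone sketch (not from the patch) of the axis-insertion scheme used
// by GetOutputShape below: each requested axis is normalized (negative axes
// wrap around) and a 1 is inserted at that output position, growing the rank
// as it goes. E.g. dims {3, 4} with axes {0, 2} -> {1, 3, 1, 4}.

#include <cassert>
#include <vector>

static std::vector<int> UnsqueezeShape(const std::vector<int>& in,
                                       const std::vector<int>& axes) {
  std::vector<int> out = in;
  int cur_rank = static_cast<int>(in.size());
  for (int axis : axes) {
    int cur = axis < 0 ? axis + cur_rank + 1 : axis;
    assert(cur >= 0 && cur <= cur_rank);  // same bound check as the op
    out.insert(out.begin() + cur, 1);
    ++cur_rank;
  }
  return out;
}

int main() {
  assert(UnsqueezeShape({3, 4}, {0, 2}) == (std::vector<int>{1, 3, 1, 4}));
  assert(UnsqueezeShape({3, 4}, {-1}) == (std::vector<int>{3, 4, 1}));
  return 0;
}
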
+ cur_output_size++; + } + + // Make output shape + for (int in_idx = 0, out_idx = 0; out_idx < output_size; ++out_idx) { + if (output_shape[out_idx] == 0) { + output_shape[out_idx] = in_dims[in_idx++]; + } + } + + return framework::make_ddim(output_shape); + } +}; + +template +class UnsqueezeGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *d_out = + ctx.Input(framework::GradVarName("Out")); + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto in_dims = ctx.Input("X")->dims(); + + d_x->mutable_data(ctx.GetPlace(), d_out->type()); + framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x); + d_x->Resize(in_dims); + } +}; + +template +class Unsqueeze2Kernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *out = context.Output("Out"); + auto *in = context.Input("X"); + + auto &axes = context.Attr>("axes"); + + auto x_dims = in->dims(); + auto out_dims = + UnsqueezeKernel::GetOutputShape(axes, x_dims); + + out->mutable_data(context.GetPlace(), in->type()); + framework::TensorCopy( + *in, context.GetPlace(), + context.template device_context(), out); + out->Resize(out_dims); + } +}; + +template +class Unsqueeze2GradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *d_out = + ctx.Input(framework::GradVarName("Out")); + auto *d_x = ctx.Output(framework::GradVarName("X")); + // auto in_dims = d_x->dims(); + + auto xshape_dims = ctx.Input("XShape")->dims(); + auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + + d_x->mutable_data(ctx.GetPlace(), d_out->type()); + framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x); + d_x->Resize(x_dims); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/unstack_op.cc b/paddle/fluid/operators/unstack_op.cc index 4ff3249c..204aa1fa 100644 --- a/paddle/fluid/operators/unstack_op.cc +++ b/paddle/fluid/operators/unstack_op.cc @@ -1,26 +1,140 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #include "paddle/fluid/operators/unstack_op.h" +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +class UnStackOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, "Input(X) must exist."); + + int axis = ctx->Attrs().Get("axis"); + int num = ctx->Attrs().Get("num"); + auto x_dim = ctx->GetInputDim("X"); + int rank = x_dim.size(); + PADDLE_ENFORCE_GE( + axis, -rank, "Attr(axis) must be inside [-rank, rank), where rank = %d", + rank); + PADDLE_ENFORCE_LT( + axis, rank, "Attr(axis) must be inside [-rank, rank), where rank = %d", + rank); + if (axis < 0) axis += rank; + + PADDLE_ENFORCE_EQ(ctx->Outputs("Y").size(), static_cast(num), + "Number of Outputs(Y) is wrong"); + if (x_dim[axis] > 0) { + PADDLE_ENFORCE_EQ(num, x_dim[axis], "Number of Outputs(Y) is wrong"); + } + auto vec = framework::vectorize(x_dim); + vec.erase(vec.begin() + axis); + ctx->SetOutputsDim("Y", std::vector( // NOLINT + x_dim[axis], framework::make_ddim(vec))); + } +}; + +class UnStackOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input of unstack op."); + AddOutput("Y", "The output of unstack op.").AsDuplicable(); + AddAttr("axis", "The axis along which Input(X) should be unstacked.") + .SetDefault(0); + AddAttr("num", "The number of outputs(Y).").GreaterThan(0); + AddComment(R"DOC( + UnStack Operator. + + UnStack Input(X) into several tensors along Attr(axis). + )DOC"); + } +}; + +class UnStackGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("unstack_grad"); + op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +class UnStackGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_GT(ctx->Inputs(framework::GradVarName("Y")).size(), 0, + "Number of Inputs(Y@Grad) must be larger than 0"); + PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("X")), true, + "Output(X@Grad) must exist."); + + auto input_dims = ctx->GetInputsDim(framework::GradVarName("Y")); + for (size_t i = 1; i < input_dims.size(); ++i) { + PADDLE_ENFORCE_EQ(input_dims[i], input_dims[0], + "Dims of all Inputs(Y@Grad) must be the same"); + } + + int axis = ctx->Attrs().Get("axis"); + int rank = input_dims[0].size(); + PADDLE_ENFORCE_GE( + axis, -(rank + 1), + "Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d", rank); + PADDLE_ENFORCE_LT( + axis, rank + 1, + "Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d", rank); + if (axis < 0) axis += (rank + 1); + + auto vec = framework::vectorize(input_dims[0]); + vec.insert(vec.begin() + axis, input_dims.size()); + ctx->SetOutputDim(framework::GradVarName("X"), framework::make_ddim(vec)); + } +}; + +} // namespace operators +} // namespace paddle namespace plat = paddle::platform; namespace ops = paddle::operators; -USE_OP(stack); - REGISTER_OPERATOR(unstack, 
ops::UnStackOp, ops::UnStackOpMaker, - ops::UnStackOpInferShape, ops::UnStackGradOpDescMaker); + ops::UnStackGradOpDescMaker); + +REGISTER_OPERATOR(unstack_grad, ops::UnStackGradOp); + +REGISTER_OP_CPU_KERNEL(unstack, + ops::UnStackKernel, + ops::UnStackKernel, + ops::UnStackKernel, + ops::UnStackKernel); -REGISTER_OPERATOR(unstack_grad, ops::UnStackGradOp, - ops::UnStackOpGradInferShape); +REGISTER_OP_CPU_KERNEL(unstack_grad, + ops::UnStackGradKernel, + ops::UnStackGradKernel, + ops::UnStackGradKernel, + ops::UnStackGradKernel); diff --git a/paddle/fluid/operators/unstack_op.cu b/paddle/fluid/operators/unstack_op.cu new file mode 100644 index 00000000..b591898a --- /dev/null +++ b/paddle/fluid/operators/unstack_op.cu @@ -0,0 +1,32 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/unstack_op.h" + +namespace plat = paddle::platform; +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + unstack, ops::UnStackKernel, + ops::UnStackKernel, + ops::UnStackKernel, + ops::UnStackKernel, + ops::UnStackKernel); + +REGISTER_OP_CUDA_KERNEL( + unstack_grad, ops::UnStackGradKernel, + ops::UnStackGradKernel, + ops::UnStackGradKernel, + ops::UnStackGradKernel, + ops::UnStackGradKernel); diff --git a/paddle/fluid/operators/unstack_op.h b/paddle/fluid/operators/unstack_op.h index 348a1038..6344ea16 100644 --- a/paddle/fluid/operators/unstack_op.h +++ b/paddle/fluid/operators/unstack_op.h @@ -1,133 +1,173 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #pragma once +#include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/for_range.h" + +#ifdef __NVCC__ +#include +#include "paddle/fluid/framework/array.h" +#endif namespace paddle { namespace operators { -class UnStackOpInferShape : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must exist."); - - int axis = ctx->Attrs().Get("axis"); - int num = ctx->Attrs().Get("num"); - auto x_dim = ctx->GetInputDim("X"); - int rank = x_dim.size(); - PADDLE_ENFORCE(axis >= -rank && axis < rank, - "Attr(axis) must be inside [-rank, rank), where rank = %d", - rank); - if (axis < 0) axis += rank; - - PADDLE_ENFORCE_EQ(ctx->Outputs("Y").size(), static_cast(num), - "Number of Outputs(Y) is wrong"); - if (x_dim[axis] > 0) { - PADDLE_ENFORCE_EQ(num, x_dim[axis], "Number of Outputs(Y) is wrong"); - } - auto vec = framework::vectorize2int(x_dim); - vec.erase(vec.begin() + axis); - ctx->SetOutputsDim("Y", std::vector( // NOLINT - x_dim[axis], framework::make_ddim(vec))); - } -}; +template +struct StackFunctor { + HOSTDEVICE StackFunctor(const VecXType &x, T *y, int n, int post) + : x_(x), y_(y), n_(n), post_(post) {} -class UnStackOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "The input of unstack op."); - AddOutput("Y", "The output of unstack op.").AsDuplicable(); - AddAttr("axis", "The axis along which Input(X) should be unstacked.") - .SetDefault(0); - AddAttr("num", "The number of outputs(Y).").GreaterThan(0); - AddComment(R"DOC( - UnStack Operator. - - UnStack Input(X) into several tensors along Attr(axis). - )DOC"); + HOSTDEVICE void operator()(int idx) { + int i = idx / (n_ * post_); + int which_x = idx / post_ - i * n_; + int x_index = i * post_ + idx % post_; + y_[idx] = x_[which_x][x_index]; } -}; - -class UnStackOp : public framework::OperatorBase { - public: - using OperatorBase::OperatorBase; private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - auto stack_grad_op = framework::OpRegistry::CreateOp( - "stack_grad", {{framework::GradVarName("Y"), {Input("X")}}}, - {{framework::GradVarName("X"), Outputs("Y")}}, Attrs()); - stack_grad_op->Run(scope, place); - } + VecXType x_; + T *y_; + int n_; + int post_; }; -class UnStackOpGradInferShape : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_GT(ctx->Inputs(framework::GradVarName("Y")).size(), 0, - "Number of Inputs(Y@Grad) must be larger than 0"); - PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), - "Output(X@Grad) must exist."); - - auto input_dims = ctx->GetInputsDim(framework::GradVarName("Y")); - for (size_t i = 1; i < input_dims.size(); ++i) { - PADDLE_ENFORCE_EQ(input_dims[i], input_dims[0], - "Dims of all Inputs(Y@Grad) must be the same"); - } - - int axis = ctx->Attrs().Get("axis"); - int rank = input_dims[0].size(); - PADDLE_ENFORCE( - axis >= -(rank + 1) && axis < rank + 1, - "Attr(axis) must be inside [-(rank+1), rank+1), where rank = %d", rank); - if (axis < 0) axis += (rank + 1); +template +struct StackGradFunctor { + HOSTDEVICE StackGradFunctor(const VecDxType &dx, const T *dy, int n, int post) + : dx_(dx), dy_(dy), n_(n), post_(post) {} - auto vec = framework::vectorize2int(input_dims[0]); - vec.insert(vec.begin() + axis, input_dims.size()); - ctx->SetOutputDim(framework::GradVarName("X"), 
framework::make_ddim(vec)); + HOSTDEVICE void operator()(int idx) { + int i = idx / (n_ * post_); + int which_x = idx / post_ - i * n_; + int x_index = i * post_ + idx % post_; + dx_[which_x][x_index] = dy_[idx]; } + + private: + VecDxType dx_; + const T *dy_; + int n_; + int post_; }; -class UnStackGradOpDescMaker : public framework::SingleGradOpDescMaker { +template +static inline void StackFunctorForRange(const DeviceContext &ctx, + const VecXType &x, T *y, int total_num, + int n, int post) { + platform::ForRange for_range(ctx, total_num); + for_range(StackFunctor(x, y, n, post)); +} + +template +static inline void StackGradFunctorForRange(const DeviceContext &ctx, + const VecDxType &dx, const T *dy, + int total_num, int n, int post) { + platform::ForRange for_range(ctx, total_num); + for_range(StackGradFunctor(dx, dy, n, post)); +} + +template +class UnStackGradKernel : public framework::OpKernel { + using Tensor = framework::LoDTensor; + public: - using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; - - protected: - std::unique_ptr Apply() const override { - std::unique_ptr op(new framework::OpDesc()); - op->SetType("unstack_grad"); - op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); - op->SetOutput(framework::GradVarName("X"), InputGrad("X")); - op->SetAttrMap(Attrs()); - return op; + void Compute(const framework::ExecutionContext &ctx) const override { + auto x = ctx.MultiInput(framework::GradVarName("Y")); + auto *y = ctx.Output(framework::GradVarName("X")); + + int axis = ctx.Attr("axis"); + if (axis < 0) axis += (x[0]->dims().size() + 1); + + int n = static_cast(x.size()); + auto *y_data = y->mutable_data(ctx.GetPlace()); + std::vector x_datas(n); + for (int i = 0; i < n; i++) x_datas[i] = x[i]->data(); + + int pre = 1; + int post = 1; + auto &dim = x[0]->dims(); + for (auto i = 0; i < axis; ++i) pre *= dim[i]; + for (auto i = axis; i < dim.size(); ++i) post *= dim[i]; + +#ifdef __NVCC__ + int total_num = pre * n * post; + auto &dev_ctx = ctx.template device_context(); + + thrust::device_vector device_x_vec(x_datas); + auto x_data_arr = device_x_vec.data().get(); + + StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post); + + // Wait() must be called because device_x_vec may be destructed before + // kernel ends + dev_ctx.Wait(); +#else + auto x_data_arr = x_datas.data(); + + size_t x_offset = 0; + size_t y_offset = 0; + for (int i = 0; i < pre; i++) { + for (int j = 0; j < n; j++) { + std::memcpy(y_data + y_offset, x_data_arr[j] + x_offset, + post * sizeof(T)); + y_offset += post; + } + x_offset += post; + } +#endif } }; -class UnStackGradOp : public framework::OperatorBase { - public: - using OperatorBase::OperatorBase; +template +class UnStackKernel : public framework::OpKernel { + using Tensor = framework::LoDTensor; - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - auto stack_op = framework::OpRegistry::CreateOp( - "stack", {{"X", Inputs(framework::GradVarName("Y"))}}, - {{"Y", {Output(framework::GradVarName("X"))}}}, Attrs()); - stack_op->Run(scope, place); + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *dy = ctx.Input("X"); + auto dx = ctx.MultiOutput("Y"); + int axis = ctx.Attr("axis"); + if (axis < 0) axis += dy->dims().size(); + + int n = dy->dims()[axis]; + std::vector dx_datas(n); // NOLINT + for (int i = 0; i < n; i++) { + dx_datas[i] = dx[i]->mutable_data(ctx.GetPlace()); + } + auto dy_data = dy->data(); + + int pre = 1; + for (int i 
= 0; i < axis; ++i) pre *= dy->dims()[i];
+    int total_num = dy->numel();
+    int post = total_num / (n * pre);
+
+    auto &dev_ctx = ctx.template device_context<DeviceContext>();
+#ifdef __NVCC__
+    thrust::device_vector<T*> device_dx_vec(dx_datas);
+    auto dx_data_arr = device_dx_vec.data().get();
+#else
+    auto dx_data_arr = dx_datas.data();
+#endif
+    StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n, post);
+#ifdef __NVCC__
+    // Wait() must be called because device_dx_vec may be destructed before
+    // kernel ends
+    dev_ctx.Wait();
+#endif
+  }
+};
diff --git a/paddle/fluid/operators/var_conv_2d_op.cc b/paddle/fluid/operators/var_conv_2d_op.cc
new file mode 100644
index 00000000..23207520
--- /dev/null
+++ b/paddle/fluid/operators/var_conv_2d_op.cc
@@ -0,0 +1,431 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/var_conv_2d_op.h"
+#include <vector>
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/platform/dynload/mklml.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using LoD = framework::LoD;
+
+void VarConv2dOpMaker::Make() {
+  AddInput("X",
+           "X (LoDTensor, default LoDTensor<float>) Input variable which "
+           "should contain lod information.");
+  AddInput("ROW", "(LoDTensor) the row variable provides lod information");
+  AddInput("COLUMN",
+           "(LoDTensor) the column variable provides lod information");
+  AddInput("W", "W (Tensor), the filter.");
+  AddAttr<int>("InputChannel", "the input filter num").SetDefault(1);
+  AddAttr<int>("OutputChannel", "the output filter num").SetDefault(1);
+  AddAttr<int>("StrideH", "the height of Stride").SetDefault(1);
+  AddAttr<int>("StrideW", "the width of Stride").SetDefault(1);
+  AddAttr<int>("KernelH", "the height of Kernel").SetDefault(1);
+  AddAttr<int>("KernelW", "the width of Kernel").SetDefault(1);
+
+  AddOutput("Out", "(LoDTensor, default LoDTensor<float>) Output variable");
+  AddOutput("Col",
+            "(LoDTensor, default LoDTensor<float>) the intermediate result "
+            "variable");
+
+  AddComment(R"DOC(
+    Var Size Conv Operator
+
+    This operator calculates Out = \sigma \left ( W * X + b \right ),
+    and only supports a 2-D input for X.
+
+    NOTE: only the 'float32' data type is supported now.
+ + )DOC"); +} + +void VarConv2dOP::InferShape(framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasInput("X"), + "X(Input) of VarConv2dOP should not be null."); + PADDLE_ENFORCE(ctx->HasInput("W"), + "W(Input) of VarConv2dOP should not be null."); + PADDLE_ENFORCE(ctx->HasInput("ROW"), + "Input(ROW) of VarConv2dOP should not be null."); + PADDLE_ENFORCE(ctx->HasInput("COLUMN"), + "Input(COLUMN) of VarConv2dOP should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Out(Output) of VarConv2dOP should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Col"), + "Col(Output) of VarConv2dOP should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, + "The rank of X(Input) can't be less than 2."); + + auto w_dims = ctx->GetInputDim("W"); + + PADDLE_ENFORCE_EQ(w_dims.size(), 2, "W should be 2-D tensor"); + int output_channel = ctx->Attrs().Get("OutputChannel"); + int input_channel = ctx->Attrs().Get("InputChannel"); + int kernel_h = ctx->Attrs().Get("KernelH"); + int kernel_w = ctx->Attrs().Get("KernelW"); + PADDLE_ENFORCE_EQ(w_dims[0], output_channel, + "W dim[0] should be equal to OutputChannel"); + PADDLE_ENFORCE_EQ( + w_dims[1], input_channel * kernel_h * kernel_w, + "W dim[1] should be equal to InputChannel * StrideH * StrideW"); + + if (ctx->IsRuntime()) { + framework::Variable* x_var = + boost::get(ctx->GetInputVarPtrs("X")[0]); + const auto& x_lod = x_var->Get().lod(); + PADDLE_ENFORCE(!x_lod.empty(), "The Input(X) must hold lod info."); + + PADDLE_ENFORCE_GE(x_lod.size(), 1, "The Input(X)'s lod info is corrupted."); + PADDLE_ENFORCE_EQ( + x_dims[0], static_cast(x_lod[0].back()), + "The Input(X)'s lod info mismatches the actual tensor shape."); + + framework::Variable* row_var = + boost::get(ctx->GetInputVarPtrs("ROW")[0]); + const auto& row_lod = row_var->Get().lod(); + PADDLE_ENFORCE(!row_lod.empty(), "The Input(ROW) must hold lod info."); + + framework::Variable* col_var = + boost::get(ctx->GetInputVarPtrs("COLUMN")[0]); + const auto& col_lod = col_var->Get().lod(); + PADDLE_ENFORCE(!col_lod.empty(), "The Input(COLUMN) must hold lod info."); + } else { + std::vector out_dims_vec{-1}; + out_dims_vec.push_back(1); + std::vector col_dims_vec{-1}; + col_dims_vec.push_back(1); + ctx->SetOutputDim("Out", framework::make_ddim(out_dims_vec)); + ctx->SetOutputDim("Col", framework::make_ddim(col_dims_vec)); + } +} + +template +class CPUVarConv2dOPKernel : public framework::OpKernel { + public: + void Im2Col(const framework::ExecutionContext& ctx, const LoDTensor& input, + LoDTensor* col) const { + int input_channel = ctx.Attr("InputChannel"); + auto* in_row = ctx.Input("ROW"); + auto* in_col = ctx.Input("COLUMN"); + int kernel_h = ctx.Attr("KernelH"); + int kernel_w = ctx.Attr("KernelW"); + int stride_h = ctx.Attr("StrideH"); + int stride_w = ctx.Attr("StrideW"); + + int batch = input.lod()[0].size() - 1; + const auto& bottom_offset = input.lod()[0]; + // 2-D lod info. 
+ const auto& offset_x = in_col->lod()[0]; + const auto& offset_y = in_row->lod()[0]; + + // top offset is the whole size of each data sample + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_x = top_im_y * top_im_x; + int top_y = input_channel * kernel_h * kernel_w; + top_size += top_y * top_x; + top_offset.push_back(top_size); + } + framework::LoD col_lod; + col_lod.push_back(top_offset); + col->set_lod(col_lod); + std::vector col_dims_vec{top_size}; + col_dims_vec.push_back(1); + auto* top_data = col->mutable_data(framework::make_ddim(col_dims_vec), + ctx.GetPlace()); + auto* bottom_data = input.data(); + + int kernel_win_size = kernel_h * kernel_w; + int half_kernel_h = kernel_h / 2; + int half_kernel_w = kernel_w / 2; + for (int b = 0; b < batch; ++b) { + int t_offset = top_offset[b]; + int b_offset = bottom_offset[b]; + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + if (width == 0 || height == 0) { + continue; + } + int top_im_x = (width - 1) / stride_w + 1; + int top_im_y = (height - 1) / stride_h + 1; + int top_x = top_im_y * top_im_x; + for (int z = 0; z < input_channel; ++z) { + int row_offset = kernel_win_size * z; + int im_offset = z * width * height; + for (int y = 0; y < height; y += stride_h) { + for (int x = 0; x < width; x += stride_w) { + int col_offset = x / stride_w + y / stride_h * top_im_x; + for (int ky = 0; ky < kernel_h; ++ky) { + for (int kx = 0; kx < kernel_w; ++kx) { + int im_y = y + ky - half_kernel_h; + int im_x = x + kx - half_kernel_w; + if (im_x >= 0 && im_x < width && im_y >= 0 && im_y < height) { + top_data[t_offset + + (row_offset + ky * kernel_w + kx) * top_x + + col_offset] = + bottom_data[b_offset + im_offset + im_y * width + im_x]; + } else { + top_data[t_offset + + (row_offset + ky * kernel_w + kx) * top_x + + col_offset] = 0; + } + } + } + } + } + } + } + } + + void Compute(const framework::ExecutionContext& ctx) const override { + auto* bottom = ctx.Input("X"); + auto* in_row = ctx.Input("ROW"); + auto* in_col = ctx.Input("COLUMN"); + auto* w = ctx.Input("W"); + auto* top = ctx.Output("Out"); + auto* col = ctx.Output("Col"); + + int output_channel = ctx.Attr("OutputChannel"); + int input_channel = ctx.Attr("InputChannel"); + int kernel_h = ctx.Attr("KernelH"); + int kernel_w = ctx.Attr("KernelW"); + int stride_h = ctx.Attr("StrideH"); + int stride_w = ctx.Attr("StrideW"); + + Im2Col(ctx, *bottom, col); + int batch = bottom->lod()[0].size() - 1; + const auto& col_offset = col->lod()[0]; + const auto& offset_x = in_col->lod()[0]; + const auto& offset_y = in_row->lod()[0]; + std::vector top_offset; + int top_size = 0; + top_offset.push_back(top_size); + for (int b = 0; b < batch; ++b) { + int width = offset_x[b + 1] - offset_x[b]; + int height = offset_y[b + 1] - offset_y[b]; + int top_im_x = 0; + if (width == 0) { + top_im_x = 0; + } else { + top_im_x = (width - 1) / stride_w + 1; + } + int top_im_y = 0; + if (height == 0) { + top_im_y = 0; + } else { + top_im_y = (height - 1) / stride_h + 1; + } + int top_im_size = top_im_y * top_im_x; + top_size += output_channel * top_im_size; + top_offset.push_back(top_size); + } + + 
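// [Editor's sketch, not part of the patch] The per-sample GEMM that follows
// computes, for each sample b,
//   Out_b[output_channel x top_im_size] =
//       W[output_channel x (input_channel * kernel_h * kernel_w)]
//       * Col_b[(input_channel * kernel_h * kernel_w) x top_im_size],
// where top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel
// and Col_b starts at col_data + col_offset[b]; samples whose width or height
// is zero produce an empty feature map and are skipped.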
framework::LoD top_lod;
+    top_lod.push_back(top_offset);
+
+    top->set_lod(top_lod);
+    std::vector<int64_t> top_dims_vec{top_size};
+    top_dims_vec.push_back(1);
+    auto* top_data = top->mutable_data<T>(framework::make_ddim(top_dims_vec),
+                                          ctx.GetPlace());
+
+    auto* w_data = w->data<T>();
+    auto* col_data = col->data<T>();
+
+    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
+    for (int b = 0; b < batch; ++b) {
+      int top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel;
+      if (top_im_size == 0) {
+        continue;
+      }
+
+      blas.GEMM(CblasNoTrans, CblasNoTrans, output_channel, top_im_size,
+                input_channel * kernel_h * kernel_w, 1.0, w_data,
+                col_data + col_offset[b], 0.0, top_data + top_offset[b]);
+    }
+  }
+};
+
+void VarConv2dOpGrad::InferShape(framework::InferShapeContext* ctx) const {
+  PADDLE_ENFORCE(ctx->HasInput("X"),
+                 "Input(X) of VarConv2dOpGrad should not be null.");
+  PADDLE_ENFORCE(ctx->HasInput("W"),
+                 "Input(W) of VarConv2dOpGrad should not be null.");
+  PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                 "Input(Out@GRAD) of VarConv2dOpGrad should not be null.");
+
+  if (ctx->HasOutput(framework::GradVarName("X"))) {
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
+  }
+  if (ctx->HasOutput(framework::GradVarName("W"))) {
+    ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W"));
+  }
+}
+
+template <typename DeviceContext, typename T>
+class CPUVarConv2dOPGradKernel : public framework::OpKernel<T> {
+ public:
+  void Im2ColGrad(const framework::ExecutionContext& ctx, T* top_diff) const {
+    auto* x = ctx.Input<LoDTensor>("X");
+    auto* in_row = ctx.Input<LoDTensor>("ROW");
+    auto* in_col = ctx.Input<LoDTensor>("COLUMN");
+    auto* col = ctx.Input<LoDTensor>("Col");
+
+    int input_channel = ctx.Attr<int>("InputChannel");
+    int kernel_h = ctx.Attr<int>("KernelH");
+    int kernel_w = ctx.Attr<int>("KernelW");
+    int stride_h = ctx.Attr<int>("StrideH");
+    int stride_w = ctx.Attr<int>("StrideW");
+
+    auto* dx = ctx.Output<LoDTensor>(framework::GradVarName("X"));
+
+    auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
+    memset(dx_data, 0, x->dims()[0] * x->dims()[1] * sizeof(T));
+
+    const auto& bottom_offset = x->lod()[0];
+    const auto& offset_x = in_col->lod()[0];
+    const auto& offset_y = in_row->lod()[0];
+    const auto& top_offset = col->lod()[0];
+    int batch = x->lod()[0].size() - 1;
+    int kernel_win_size = kernel_h * kernel_w;
+    int half_kernel_h = kernel_h / 2;
+    int half_kernel_w = kernel_w / 2;
+    for (int b = 0; b < batch; ++b) {
+      int t_offset = top_offset[b];
+      int b_offset = bottom_offset[b];
+      int width = offset_x[b + 1] - offset_x[b];
+      int height = offset_y[b + 1] - offset_y[b];
+      if (width == 0 || height == 0) {
+        continue;
+      }
+      int top_im_x = (width - 1) / stride_w + 1;
+      int top_im_y = (height - 1) / stride_h + 1;
+      int top_x = top_im_y * top_im_x;
+      for (int z = 0; z < input_channel; ++z) {
+        int row_offset = kernel_win_size * z;
+        int im_offset = z * width * height;
+        for (int y = 0; y < height; y += stride_h) {
+          for (int x = 0; x < width; x += stride_w) {
+            int col_offset = x / stride_w + y / stride_h * top_im_x;
+            for (int ky = 0; ky < kernel_h; ++ky) {
+              for (int kx = 0; kx < kernel_w; ++kx) {
+                int im_y = y + ky - half_kernel_h;
+                int im_x = x + kx - half_kernel_w;
+                if (im_x >= 0 && im_x < width && im_y >= 0 && im_y < height) {
+                  dx_data[b_offset + im_offset + im_y * width + im_x] +=
+                      top_diff[t_offset +
+                               (row_offset + ky * kernel_w + kx) * top_x +
+                               col_offset];
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<LoDTensor>("X");
+    auto*
w = ctx.Input("W"); + auto* col = ctx.Input("Col"); + auto* out = ctx.Input("Out"); + + int output_channel = ctx.Attr("OutputChannel"); + int input_channel = ctx.Attr("InputChannel"); + int kernel_h = ctx.Attr("KernelH"); + int kernel_w = ctx.Attr("KernelW"); + + auto* d_out = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* d_w = ctx.Output(framework::GradVarName("W")); + + Tensor col_grad; + col_grad.Resize(col->dims()); + auto* col_diff = col_grad.mutable_data(ctx.GetPlace()); + auto* dx_data = dx->mutable_data(ctx.GetPlace()); + auto* w_diff = d_w->mutable_data(ctx.GetPlace()); + + memset(dx_data, 0.0, x->dims()[0] * x->dims()[1] * sizeof(T)); + memset(w_diff, 0.0, w->dims()[0] * w->dims()[1] * sizeof(T)); + memset(col_diff, 0.0, col->dims()[0] * col->dims()[1] * sizeof(T)); + auto* top_diff = d_out->data(); + auto* w_data = w->data(); + auto* col_data = col->data(); + int batch = x->lod()[0].size() - 1; + const auto& top_offset = out->lod()[0]; + const auto& col_offset = col->lod()[0]; + auto blas = math::GetBlas(ctx); + for (int b = 0; b < batch; ++b) { + int top_im_size = (top_offset[b + 1] - top_offset[b]) / output_channel; + if (top_im_size == 0) { + continue; + } + + blas.GEMM(CblasTrans, CblasNoTrans, input_channel * kernel_h * kernel_w, + top_im_size, output_channel, 1.0, w_data, + top_diff + top_offset[b], 1.0, col_diff + col_offset[b]); + + blas.GEMM(CblasNoTrans, CblasTrans, output_channel, + input_channel * kernel_h * kernel_w, top_im_size, 1.0, + top_diff + top_offset[b], col_data + col_offset[b], 1.0, + w_diff); + } + Im2ColGrad(ctx, col_diff); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plt = paddle::platform; +namespace frm = paddle::framework; +REGISTER_OPERATOR(var_conv_2d, ops::VarConv2dOP, ops::VarConv2dOpMaker, + frm::DefaultGradOpDescMaker); +REGISTER_OPERATOR(var_conv_2d_grad, ops::VarConv2dOpGrad); + +REGISTER_OP_CPU_KERNEL(var_conv_2d, + ops::CPUVarConv2dOPKernel); +// ops::CPUVarConv2dOPKernel +REGISTER_OP_CPU_KERNEL( + var_conv_2d_grad, + ops::CPUVarConv2dOPGradKernel); +// ops::CPUVarConv2dOPGradKernel diff --git a/paddle/fluid/operators/var_conv_2d_op.h b/paddle/fluid/operators/var_conv_2d_op.h new file mode 100644 index 00000000..b8d5de06 --- /dev/null +++ b/paddle/fluid/operators/var_conv_2d_op.h @@ -0,0 +1,45 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using LoD = framework::LoD; + +class VarConv2dOP : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; +}; + +class VarConv2dOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; +}; + +class VarConv2dOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/warpctc_cudnn_op.cu.cc b/paddle/fluid/operators/warpctc_cudnn_op.cu.cc deleted file mode 100644 index 2a744f66..00000000 --- a/paddle/fluid/operators/warpctc_cudnn_op.cu.cc +++ /dev/null @@ -1,197 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/mixed_vector.h" -#include "paddle/fluid/operators/math/softmax.h" -#include "paddle/fluid/operators/warpctc_op.h" -#include "paddle/fluid/platform/cudnn_helper.h" - -namespace paddle { -namespace operators { - -#if CUDNN_VERSION >= 7001 -using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; -using ScopedCTCLossDescriptor = platform::ScopedCTCLossDescriptor; -using DataLayout = platform::DataLayout; - -template -class CudnnCTCKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - // =====================Copied code from warpctc=========================== - auto* logits = ctx.Input("Logits"); - auto* label = ctx.Input("Label"); - auto* warpctc_grad = ctx.Output("WarpCTCGrad"); - auto* loss = ctx.Output("Loss"); - - const size_t level = 0; - - auto logits_lod = framework::ToAbsOffset(logits->lod()); - auto logits_dims = logits->dims(); - PADDLE_ENFORCE_EQ(logits_dims[0], - static_cast(logits_lod[level].back()), - "The first dimension of Input(Logits) should be equal to " - "the sum of all sequences' lengths."); - - auto label_lod = framework::ToAbsOffset(label->lod()); - auto label_dims = label->dims(); - PADDLE_ENFORCE_EQ( - label_dims[0], label->numel(), - "The width of each timestep in Input(Label) should be 1."); - - const size_t num_sequences = logits_lod[level].size() - 1; - PADDLE_ENFORCE_EQ(num_sequences, label_lod[level].size() - 1, - "The number of sequences of Input(Logits) should be " - "equal to that of Input(Label)."); - PADDLE_ENFORCE_LE(num_sequences, 256, - "The labelLengths must less than 256 for cudnn call."); - - const size_t sequence_width = logits->numel() / logits_dims[0]; - auto loss_dims = - framework::make_ddim({static_cast(num_sequences), 1}); - - // NOTE: cudnn takes softmax input, calculate softmax first, then do padding - auto& dev_ctx = 
ctx.template device_context(); - LoDTensor softmax_logits; - softmax_logits.mutable_data(logits->dims(), ctx.GetPlace()); - softmax_logits.set_lod(logits_lod); - int rank = logits->dims().size(); - int axis_dim = logits->dims()[rank - 1]; - Tensor in_2d = framework::ReshapeToMatrix(*logits, rank - 1); - Tensor out_2d = framework::ReshapeToMatrix(softmax_logits, rank - 1); - math::SoftmaxFunctor()(dev_ctx, axis_dim, &in_2d, - &out_2d); - - // ctc needs sequences data stored in transposed padding format - // logits and grad using padding data of layout 'TNC' - // T: max_sequence_length - // N: batch_size (num_sequences) - // C: width - LoDTensor warpctc_logits; - const size_t max_sequence_length = - math::MaximumSequenceLength(logits_lod[level]); - auto warpctc_logits_dims = - framework::make_ddim({static_cast(max_sequence_length), - static_cast(num_sequences), - static_cast(sequence_width)}); - warpctc_logits.mutable_data(warpctc_logits_dims, ctx.GetPlace()); - - LoDTensor cpu_pad_value; - T* pad_value_data = - cpu_pad_value.mutable_data({1}, platform::CPUPlace()); - *pad_value_data = static_cast(0); - LoDTensor pad_value; - if (platform::is_cpu_place(ctx.GetPlace())) { - pad_value = cpu_pad_value; - } else { - TensorCopySync(cpu_pad_value, ctx.GetPlace(), &pad_value); - } - - math::PaddingLoDTensorFunctor()( - ctx.template device_context(), softmax_logits, - &warpctc_logits, pad_value, -1, 0, false /* norm_by_times */, - math::kLengthBatchWidth); - const T* warpctc_logits_data = warpctc_logits.data(); - - std::vector warpctc_label_lengths(num_sequences); - std::vector warpctc_logits_lengths(num_sequences); - - for (size_t i = 0; i < num_sequences; ++i) { - warpctc_label_lengths[i] = label_lod[level][i + 1] - label_lod[level][i]; - warpctc_logits_lengths[i] = - logits_lod[level][i + 1] - logits_lod[level][i]; - } - - T* warpctc_grad_data = - warpctc_grad->mutable_data(warpctc_logits.dims(), ctx.GetPlace()); - - math::SetConstant()( - ctx.template device_context(), warpctc_grad, - static_cast(0)); - - Tensor warpctc_label; - TensorCopySync(*label, platform::CPUPlace(), &warpctc_label); - const int* warpctc_label_data = warpctc_label.data(); - // ======================================================================== - - ScopedTensorDescriptor logits_desc; - ScopedTensorDescriptor grad_desc; - ScopedCTCLossDescriptor ctcloss_desc; - // layout here doesn't have effect. 
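// [Editor's note, not part of the patch] The (now removed) cuDNN path below
// follows the standard cudnnCTCLoss protocol: set up the tensor and CTC-loss
// descriptors, query the scratch size, then run the loss inside a workspace
// of that size, roughly (sketch):
//   size_t ws = 0;
//   cudnnGetCTCLossWorkspaceSize(handle, probs_desc, grads_desc, labels,
//                                label_lengths, input_lengths,
//                                CUDNN_CTC_LOSS_ALGO_DETERMINISTIC,
//                                ctc_desc, &ws);
//   // allocate ws bytes, then call cudnnCTCLoss(...) with that buffer,
// which is exactly what the workspace_handle.RunFunc() call encapsulates.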
- DataLayout layout = DataLayout::kNCHW; - - auto cu_logits_desc = logits_desc.descriptor( - layout, framework::vectorize2int(warpctc_logits.dims())); - auto cu_grad_desc = grad_desc.descriptor( - layout, framework::vectorize2int(warpctc_grad->dims())); - auto cu_ctcloss_desc = ctcloss_desc.descriptor(); - - auto handle = dev_ctx.cudnn_handle(); - size_t workspace_size; - - CUDNN_ENFORCE(platform::dynload::cudnnGetCTCLossWorkspaceSize( - handle, cu_logits_desc, cu_grad_desc, warpctc_label_data, - warpctc_label_lengths.data(), warpctc_logits_lengths.data(), - CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, cu_ctcloss_desc, &workspace_size)); - - T* loss_data = loss->mutable_data(loss_dims, ctx.GetPlace()); - - auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - auto cudnn_func = [&](void* cudnn_workspace) { - CUDNN_ENFORCE(platform::dynload::cudnnCTCLoss( - handle, cu_logits_desc, warpctc_logits_data, warpctc_label_data, - warpctc_label_lengths.data(), warpctc_logits_lengths.data(), - loss_data, cu_grad_desc, warpctc_grad_data, - CUDNN_CTC_LOSS_ALGO_DETERMINISTIC, cu_ctcloss_desc, cudnn_workspace, - workspace_size)); - }; - workspace_handle.RunFunc(cudnn_func, workspace_size); - } -}; - -template -class CudnnCTCGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* warpctc_grad = ctx.Input("WarpCTCGrad"); - auto* logits_grad = ctx.Output(framework::GradVarName("Logits")); - const Tensor* loss_grad = ctx.Input(framework::GradVarName("Loss")); - - logits_grad->mutable_data(ctx.GetPlace()); - bool norm_by_times = ctx.Attr("norm_by_times"); - math::UnpaddingLoDTensorFunctor()( - ctx.template device_context(), *warpctc_grad, - logits_grad, -1, 0, norm_by_times, math::kLengthBatchWidth); - - const T* loss_grad_data = loss_grad->data(); - math::ScaleLoDTensorFunctor()( - ctx.template device_context(), loss_grad_data, - logits_grad); - } -}; - -#endif -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -#if CUDNN_VERSION >= 7001 -REGISTER_OP_KERNEL( - warpctc, CUDNN, plat::CUDAPlace, - ops::CudnnCTCKernel); -REGISTER_OP_KERNEL( - warpctc_grad, CUDNN, plat::CUDAPlace, - ops::CudnnCTCGradKernel); -#endif diff --git a/paddle/fluid/operators/warpctc_op.cc b/paddle/fluid/operators/warpctc_op.cc index deb5681f..7033d55a 100644 --- a/paddle/fluid/operators/warpctc_op.cc +++ b/paddle/fluid/operators/warpctc_op.cc @@ -38,12 +38,19 @@ class WarpCTCOp : public framework::OperatorWithKernel { "Output(Loss) of WarpCTCOp should not be null."); auto logits_dims = ctx->GetInputDim("Logits"); - int sequence_width = - static_cast(framework::product(logits_dims) / logits_dims[0]); int blank = ctx->Attrs().Get("blank"); + int sequence_width = 0; + + if (ctx->HasInput("LogitsLength")) { + sequence_width = logits_dims[2]; + } else { + sequence_width = + static_cast(framework::product(logits_dims) / logits_dims[0]); + } PADDLE_ENFORCE((blank >= 0) && (blank < sequence_width), "The value of Attr(blank) should be in interval [0, %d).", sequence_width); + // TODO(liuyiqun): it is tricky to set the wrong dimension here. 
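// [Editor's note, not part of the patch] What "wrong dimension" means here:
// when LogitsLength is fed, Logits is the padded 3-D tensor
// [max_logit_length, batch_size, num_classes + 1], so logits_dims[0] is the
// maximum time length rather than the batch size, yet Loss is still declared
// as {logits_dims[0], 1} below. The kernels later allocate Loss with the
// correct shape {num_sequences, 1} at run time, which is why this is only
// "tricky" rather than fatal.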
ctx->SetOutputDim("Loss", {logits_dims[0], 1}); } @@ -52,20 +59,6 @@ class WarpCTCOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { framework::LibraryType library_{framework::LibraryType::kPlain}; -#ifdef PADDLE_WITH_CUDA - if (platform::CanCUDNNBeUsed(ctx)) { -#if CUDA_VERSION >= 9000 - LOG(WARNING) - << "The cudnnCTCLoss of CUDNN7 have some diff between " - "CUDA9/CUDA10 and CUDA8. You can close use_cudnn option to " - "use " - "baidu-research/warp-ctc(https://github.com/baidu-research/" - "warp-ctc)"; -#endif - - library_ = framework::LibraryType::kCUDNN; - } -#endif framework::DataLayout layout_ = framework::DataLayout::kAnyLayout; return framework::OpKernelType(ctx.Input("Logits")->type(), ctx.device_context(), layout_, library_); @@ -76,17 +69,32 @@ class WarpCTCOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("Logits", - "(LodTensor, default: LoDTensor), the unscaled " - "probabilities of variable-length sequences, which is a 2-D " - "Tensor with LoD information. It's shape is " - "[Lp, num_classes + 1], where Lp is the sum of all input " - "sequences' length and num_classes is the true number of classes " - "(not including the blank label)."); + "(2-D LoDTensor) or (3-D Tensor), the " + "unscaled probabilities of variable-length sequences." + "When is a 2-D Tensor with LoD information, " + "it's shape is [Lp, num_classes + 1], " + "where Lp is the sum of all input sequences' length " + "and num_classes is the true number of classes " + "(not including the blank label)." + "When it is 3-D Tensor, it's shape is " + "[max_logit_length, batch_size, num_classes + 1], " + "where max_logit_length is the length of the longest " + "logit sequence."); AddInput("Label", - "(LodTensor, default: LoDTensor), the ground truth " - "of variable-length sequence, which is a 2-D Tensor with LoD " - "information. It is of the shape [Lg, 1], where Lg is th sum of " - "all labels' length."); + "(2-D LoDTensor) or (2-D Tensor), the " + "ground truth of variable-length sequence. " + "When it is a 2-D Tensor with LoD information, " + "it is of the shape [Lg, 1], where Lg is th sum of " + "all labels' length." + "When it is a 2-D Tensor, it's shape is also [Lg, 1]."); + AddInput("LogitsLength", + "1-D Tensor. " + "Input sequence length for Logits when Logits is a 3-D tensor.") + .AsDispensable(); + AddInput("LabelLength", + "1-D Tensor. 
" + "Target sequence length for Label when Label is a 2-D tensor.") + .AsDispensable(); AddOutput("WarpCTCGrad", "(Tensor, default: Tensor), a temporary " "output Tensor to store the gradients of warp-ctc, which is " @@ -107,10 +115,6 @@ class WarpCTCOpMaker : public framework::OpProtoAndCheckerMaker { "normalize the gradients by the number of time-step, " "which is also the sequence's length.") .SetDefault(false); - AddAttr("use_cudnn", - "(bool, default: false), whether to " - "use cudnn kernel.") - .SetDefault(false); AddComment(R"DOC( An operator integrating the open-source [warp-ctc](https://github.com/baidu-research/warp-ctc) library, which is used in @@ -143,6 +147,8 @@ class WarpCTCGradOpDescMaker : public framework::SingleGradOpDescMaker { op->SetInput("Logits", Input("Logits")); op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss")); + op->SetInput("LogitsLength", Input("LogitsLength")); + op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits")); op->SetAttrMap(Attrs()); diff --git a/paddle/fluid/operators/warpctc_op.h b/paddle/fluid/operators/warpctc_op.h index 444265f5..1859c748 100644 --- a/paddle/fluid/operators/warpctc_op.h +++ b/paddle/fluid/operators/warpctc_op.h @@ -128,63 +128,93 @@ class WarpCTCKernel : public framework::OpKernel { auto* warpctc_grad = ctx.Output("WarpCTCGrad"); auto* loss = ctx.Output("Loss"); - const size_t level = 0; - - auto logits_lod = framework::ToAbsOffset(logits->lod()); - auto logits_dims = logits->dims(); - PADDLE_ENFORCE_EQ(logits_dims[0], - static_cast(logits_lod[level].back()), - "The first dimension of Input(Logits) should be equal to " - "the sum of all sequences' lengths."); - - auto label_lod = framework::ToAbsOffset(label->lod()); - auto label_dims = label->dims(); - PADDLE_ENFORCE_EQ( - label_dims[0], label->numel(), - "The width of each timestep in Input(Label) should be 1."); - - const size_t num_sequences = logits_lod[level].size() - 1; - PADDLE_ENFORCE_EQ(num_sequences, label_lod[level].size() - 1, - "The number of sequences of Input(Logits) should be " - "equal to that of Input(Label)."); - - const size_t sequence_width = logits->numel() / logits_dims[0]; + size_t num_sequences, sequence_width, max_sequence_length; + framework::Vector logits_lod; + framework::Vector label_lod; + + if (ctx.HasInput("LogitsLength") && ctx.HasInput("LabelLength")) { + num_sequences = logits->dims()[1]; + sequence_width = logits->dims()[2]; + max_sequence_length = logits->dims()[0]; + + auto* logits_length = ctx.Input("LogitsLength"); + auto* labels_length = ctx.Input("LabelLength"); + framework::Tensor logits_length_cpu; + framework::Tensor labels_length_cpu; + framework::TensorCopy(*logits_length, platform::CPUPlace(), + &logits_length_cpu); + framework::TensorCopy(*labels_length, platform::CPUPlace(), + &labels_length_cpu); + + logits_lod.push_back(0); + label_lod.push_back(0); + for (auto i = 0; i < num_sequences; i++) { + logits_lod.push_back(logits_lod[i] + + logits_length_cpu.data()[i]); + label_lod.push_back(label_lod[i] + + labels_length_cpu.data()[i]); + } + } else { + logits_lod = framework::ToAbsOffset(logits->lod())[0]; + auto logits_dims = logits->dims(); + PADDLE_ENFORCE_EQ( + logits_dims[0], static_cast(logits_lod.back()), + "The first dimension of Input(Logits) should be equal to " + "the sum of all sequences' lengths."); + + label_lod = framework::ToAbsOffset(label->lod())[0]; + auto label_dims = label->dims(); + PADDLE_ENFORCE_EQ( + label_dims[0], label->numel(), + "The width of each timestep in Input(Label) 
should be 1."); + + num_sequences = logits_lod.size() - 1; + PADDLE_ENFORCE_EQ(num_sequences, label_lod.size() - 1, + "The number of sequences of Input(Logits) should be " + "equal to that of Input(Label)."); + + sequence_width = logits->numel() / logits_dims[0]; + max_sequence_length = math::MaximumSequenceLength(logits_lod); + } + auto loss_dims = framework::make_ddim({static_cast(num_sequences), 1}); // warpctc needs sequences data stored in transposed padding format LoDTensor warpctc_logits; - const size_t max_sequence_length = - math::MaximumSequenceLength(logits_lod[level]); auto warpctc_logits_dims = framework::make_ddim({static_cast(max_sequence_length), static_cast(num_sequences), static_cast(sequence_width)}); warpctc_logits.mutable_data(warpctc_logits_dims, ctx.GetPlace()); - LoDTensor cpu_pad_value; - T* pad_value_data = - cpu_pad_value.mutable_data({1}, platform::CPUPlace()); - *pad_value_data = static_cast(0); - LoDTensor pad_value; - if (platform::is_cpu_place(ctx.GetPlace())) { - pad_value = cpu_pad_value; + if (ctx.HasInput("LogitsLength")) { + TensorCopySync(*logits, ctx.GetPlace(), &warpctc_logits); } else { - TensorCopySync(cpu_pad_value, ctx.GetPlace(), &pad_value); + LoDTensor cpu_pad_value; + T* pad_value_data = + cpu_pad_value.mutable_data({1}, platform::CPUPlace()); + *pad_value_data = static_cast(0); + LoDTensor pad_value; + if (platform::is_cpu_place(ctx.GetPlace())) { + pad_value = cpu_pad_value; + } else { + TensorCopySync(cpu_pad_value, ctx.GetPlace(), &pad_value); + } + + math::PaddingLoDTensorFunctor()( + ctx.template device_context(), *logits, + &warpctc_logits, pad_value, -1, 0, false /* norm_by_times */, + math::kLengthBatchWidth); } - - math::PaddingLoDTensorFunctor()( - ctx.template device_context(), *logits, &warpctc_logits, - pad_value, -1, 0, false /* norm_by_times */, math::kLengthBatchWidth); const T* warpctc_logits_data = warpctc_logits.data(); std::vector warpctc_label_lengths(num_sequences); std::vector warpctc_logits_lengths(num_sequences); for (size_t i = 0; i < num_sequences; ++i) { - warpctc_label_lengths[i] = label_lod[level][i + 1] - label_lod[level][i]; - warpctc_logits_lengths[i] = - logits_lod[level][i + 1] - logits_lod[level][i]; + warpctc_label_lengths[i] = label_lod[i + 1] - label_lod[i]; + warpctc_logits_lengths[i] = logits_lod[i + 1] - logits_lod[i]; } // warpctc computes loss and gradient in one call, gradient data also stored @@ -199,6 +229,7 @@ class WarpCTCKernel : public framework::OpKernel { // warpctc accesses labels in CPU memory Tensor warpctc_label; TensorCopySync(*label, platform::CPUPlace(), &warpctc_label); + const int* warpctc_label_data = warpctc_label.data(); // warpctc stores loss in CPU memory Tensor warpctc_loss; @@ -227,14 +258,53 @@ class WarpCTCGradKernel : public framework::OpKernel { logits_grad->mutable_data(ctx.GetPlace()); bool norm_by_times = ctx.Attr("norm_by_times"); - math::UnpaddingLoDTensorFunctor()( - ctx.template device_context(), *warpctc_grad, - logits_grad, -1, 0, norm_by_times, math::kLengthBatchWidth); - - const T* loss_grad_data = loss_grad->data(); - math::ScaleLoDTensorFunctor()( - ctx.template device_context(), loss_grad_data, - logits_grad); + + if (ctx.HasInput("LogitsLength")) { + size_t max_seq_length = warpctc_grad->dims()[0]; + size_t num_sequences = warpctc_grad->dims()[1]; + size_t seq_width = warpctc_grad->dims()[2]; + + LoDTensor logits_grad_with_lod; + auto logits_grad_dims = + framework::make_ddim({static_cast(max_seq_length), + static_cast(num_sequences), + 
static_cast(seq_width)}); + T* logits_grad_cpu_data = logits_grad_with_lod.mutable_data( + logits_grad_dims, platform::CPUPlace()); + + TensorCopySync(*warpctc_grad, platform::CPUPlace(), + &logits_grad_with_lod); + + Tensor loss_grad_cpu; + loss_grad_cpu.mutable_data(loss_grad->dims(), platform::CPUPlace()); + TensorCopySync(*loss_grad, platform::CPUPlace(), &loss_grad_cpu); + + LoDTensor scaled_logits; + T* scaled_logits_data = + scaled_logits.mutable_data(logits_grad_dims, platform::CPUPlace()); + + const T* loss_grad_data = loss_grad_cpu.data(); + for (size_t i = 0; i < max_seq_length; ++i) { + for (size_t j = 0; j < num_sequences; ++j) { + for (size_t k = 0; k < seq_width; ++k) { + size_t idx = i * (num_sequences * seq_width) + j * seq_width + k; + scaled_logits_data[idx] = + logits_grad_cpu_data[idx] * loss_grad_data[j]; + } + } + } + + TensorCopySync(scaled_logits, ctx.GetPlace(), logits_grad); + } else { + math::UnpaddingLoDTensorFunctor()( + ctx.template device_context(), *warpctc_grad, + logits_grad, -1, 0, norm_by_times, math::kLengthBatchWidth); + + const T* loss_grad_data = loss_grad->data(); + math::ScaleLoDTensorFunctor()( + ctx.template device_context(), loss_grad_data, + logits_grad); + } } }; diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 575eed35..a84f521f 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -20,10 +20,12 @@ add_custom_command(TARGET profiler_py_proto POST_BUILD WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif(NOT WIN32) +cc_library(flags SRCS flags.cc DEPS gflags) + if(WITH_GPU) - nv_library(enforce SRCS enforce.cc) + nv_library(enforce SRCS enforce.cc DEPS flags) else() - cc_library(enforce SRCS enforce.cc) + cc_library(enforce SRCS enforce.cc DEPS flags) endif() cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece enforce) @@ -59,8 +61,6 @@ ELSE() set(MKLDNN_CTX_DEPS) ENDIF() -cc_library(temp_allocator SRCS temporary_allocator.cc DEPS allocator_facade) - nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) IF(WITH_GPU) set(STREAM_CALLBACK_DEPS stream_callback_manager) @@ -70,9 +70,9 @@ ENDIF() # memcpy depends on device_context, here add deps individually for # avoiding cycle dependencies -cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc ${STREAM_CALLBACK_DEPS} +cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS} place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} - temp_allocator ${dgc_deps}) + ${dgc_deps}) if (WITH_DISTRIBUTE) cc_library(collective_helper SRCS collective_helper.cc DEPS framework_proto device_context enforce) @@ -115,8 +115,9 @@ cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor) nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info) -if(WITH_GPU) - nv_test(temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor operator) -else() - cc_test(temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor operator) +if(NOT APPLE AND NOT WIN32) + cc_library(device_code SRCS device_code.cc DEPS device_context) + if(WITH_GPU) + cc_test(device_code_test SRCS device_code_test.cc DEPS device_code lod_tensor) + endif() endif() diff --git a/paddle/fluid/platform/assert.h b/paddle/fluid/platform/assert.h deleted file mode 100644 index e3884a98..00000000 --- 
a/paddle/fluid/platform/assert.h +++ /dev/null @@ -1,48 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#define STRINGIFY(x) #x -#define TOSTRING(x) STRINGIFY(x) - -// For cuda, the assertions can affect performance and it is therefore -// recommended to disable them in production code -// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#assertion -#if defined(__CUDA_ARCH__) -#include -#define EXIT() asm("trap;") -#else -#include -#define EXIT() throw std::runtime_error("Exception encounter.") -#endif - -#define PADDLE_ASSERT(_IS_NOT_ERROR) \ - do { \ - if (!(_IS_NOT_ERROR)) { \ - printf("Exception: %s:%d Assertion `%s` failed.\n", __FILE__, __LINE__, \ - TOSTRING(_IS_NOT_ERROR)); \ - EXIT(); \ - } \ - } while (0) - -// NOTE: PADDLE_ASSERT is mainly used in CUDA Kernel or HOSTDEVICE function. -#define PADDLE_ASSERT_MSG(_IS_NOT_ERROR, __MSG, __VAL) \ - do { \ - if (!(_IS_NOT_ERROR)) { \ - printf("Exception: %s:%d Assertion `%s` failed (%s %ld).\n", __FILE__, \ - __LINE__, TOSTRING(_IS_NOT_ERROR), __MSG, __VAL); \ - EXIT(); \ - } \ - } while (0) diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index ddd242cd..2025e534 100644 --- a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -53,46 +53,88 @@ class NCCLCommImpl : public NCCLComm { std::unique_ptr dev_ctx_; }; -// NOTE: not thread-safe NCCLComm* NCCLCommContext::CreateNCCLComm(ncclUniqueId* nccl_id, int nranks, int rank, int dev_id, int ring_id) { PADDLE_ENFORCE_NOT_NULL(nccl_id); PADDLE_ENFORCE_GT(nranks, 1); - PADDLE_ENFORCE(rank >= 0 && rank < nranks, - "Expected rank id range [0, %d), but get %d", nranks, rank); + PADDLE_ENFORCE_GE(rank, 0); + PADDLE_ENFORCE_LT(rank, nranks); PADDLE_ENFORCE_GE(dev_id, 0); - if (dev_ctx_map_.count(dev_id) == 0) { - dev_ctx_map_.emplace(dev_id, std::unique_ptr( - new CUDADeviceContext(CUDAPlace(dev_id)))); - } - ncclComm_t comm = nullptr; - PADDLE_ENFORCE(cudaSetDevice(dev_id)); - PADDLE_ENFORCE( + PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(dev_id)); + PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::ncclCommInitRank(&comm, nranks, *nccl_id, rank)); std::unique_ptr dev_ctx( new CUDADeviceContext(CUDAPlace(dev_id))); dev_ctx->set_nccl_comm(comm); - NCCLCommImpl* communicator = new NCCLCommImpl; - communicator->set_ring_id(ring_id); - communicator->set_nranks(nranks); - communicator->set_rank(rank); - communicator->set_dev_ctx(std::move(dev_ctx)); + NCCLCommImpl* c = new NCCLCommImpl; + c->set_ring_id(ring_id); + c->set_nranks(nranks); + c->set_rank(rank); + c->set_dev_ctx(std::move(dev_ctx)); + + comm_map_mutex_.lock(); + if (comm_map_.count(ring_id) == 0) { + comm_map_.emplace(ring_id, std::map>()); + } + auto& dev2comm = comm_map_[ring_id]; - comm_map_.emplace(ring_id, std::unique_ptr(communicator)); + dev2comm.emplace(dev_id, std::unique_ptr(c)); + comm_map_mutex_.unlock(); - VLOG(0) << "nccl communicator of rank " << 
rank << " in ring " << ring_id + VLOG(1) << "nccl communicator of rank " << rank << " in ring " << ring_id << " has been created"; - return comm_map_.at(ring_id).get(); + std::call_once(once_flag_, []() { + std::atexit([]() { NCCLCommContext::Instance().ReleaseNCCLComms(); }); + }); + + return comm_map_[ring_id][dev_id].get(); +} + +void NCCLCommContext::CreateAllNCCLComms(const std::vector& dev_ids, + int ring_id) { + PADDLE_ENFORCE_GT(dev_ids.size(), 0); + + const int kDevices = dev_ids.size(); + ncclComm_t comms[kDevices]; + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitAll( + comms, dev_ids.size(), dev_ids.data())); + + PADDLE_ENFORCE_EQ(comm_map_.count(ring_id), 0); + comm_map_.emplace(ring_id, std::map>()); + + auto& dev2comm = comm_map_[ring_id]; + for (size_t i = 0; i < dev_ids.size(); ++i) { + std::unique_ptr dev_ctx( + new CUDADeviceContext(CUDAPlace(dev_ids[i]))); + dev_ctx->set_nccl_comm(comms[i]); + + NCCLCommImpl* c = new NCCLCommImpl; + c->set_ring_id(ring_id); + c->set_nranks(dev_ids.size()); + c->set_rank(i); + c->set_dev_ctx(std::move(dev_ctx)); + + dev2comm.emplace(dev_ids[i], std::unique_ptr(c)); + } + + std::call_once(once_flag_, []() { + std::atexit([]() { NCCLCommContext::Instance().ReleaseNCCLComms(); }); + }); } -NCCLCommContext::~NCCLCommContext() { +void NCCLCommContext::ReleaseNCCLComms() { + // CUDADeviceContext maintain the lifetime of nccl_comm_t, so we should not + // destroy nccl_comm_t explicitly. Please refer to + // platform::CUDADeviceContext::~CUDADeviceContext() for (auto& p : comm_map_) { - PADDLE_ENFORCE(platform::dynload::ncclCommDestroy(p.second->comm())); + for (auto& q : p.second) { + q.second.reset(); + } } } diff --git a/paddle/fluid/platform/collective_helper.h b/paddle/fluid/platform/collective_helper.h index 7479ebaf..747e8400 100644 --- a/paddle/fluid/platform/collective_helper.h +++ b/paddle/fluid/platform/collective_helper.h @@ -15,9 +15,9 @@ #pragma once #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#include #include #include -#include #include #include "boost/variant.hpp" @@ -58,37 +58,57 @@ class NCCLComm { virtual ~NCCLComm() = default; }; -// a singleton NCCL communicator context reserves communication ring ids -// Assume multiprocessing mode +// A singleton NCCL communicator context reserves communication ring ids class NCCLCommContext { public: static NCCLCommContext& Instance() { static NCCLCommContext comm_ctx; return comm_ctx; } - ~NCCLCommContext(); NCCLComm* CreateNCCLComm(ncclUniqueId* nccl_id, int nranks, int rank, int dev_id, int ring_id = 0); - // retrieve a communicator by the ring id + void CreateAllNCCLComms(const std::vector& dev_ids, int ring_id = 0); + + // retrieve a communicator by the ring id in multiprocessing mode NCCLComm* Get(int ring_id) const { - PADDLE_ENFORCE(comm_map_.count(ring_id), - "comunicator in ring id %d has not been initialized", - ring_id); - return comm_map_.at(ring_id).get(); + PADDLE_ENFORCE_GT(comm_map_.count(ring_id), 0, + "comunicator in ring id %d has not been initialized", + ring_id); + PADDLE_ENFORCE_EQ(comm_map_.at(ring_id).size(), 1, + "you should specify a device id to retrieve from " + "multiple communicators"); + return comm_map_.at(ring_id).begin()->second.get(); + } + + // retrieve a communicator by the ring id and the device id + NCCLComm* Get(int ring_id, int dev_id) const { + PADDLE_ENFORCE_GT(comm_map_.count(ring_id), 0, + "comunicator of ring id %d has not been initialized", + ring_id); + PADDLE_ENFORCE_GT( + comm_map_.at(ring_id).count(dev_id), 0, + 
"comunicator at device id %d has not been initialized in ring %d", + dev_id, ring_id); + return comm_map_.at(ring_id).at(dev_id).get(); + } + + // retrieve a communicator by the ring id and place + NCCLComm* Get(int ring_id, Place place) const { + return Get(ring_id, boost::get(place).device); } private: - // ring id to NCCLComm - std::unordered_map> comm_map_; + std::once_flag once_flag_; + std::mutex comm_map_mutex_; + // ring id to dev-NCCLComm + std::map>> comm_map_; - // device id to CUDADeviceContext - std::unordered_map> dev_ctx_map_; + void ReleaseNCCLComms(); NCCLCommContext() = default; - NCCLCommContext(const NCCLCommContext& other) = delete; - NCCLCommContext& operator=(const NCCLCommContext& other) = delete; + DISABLE_COPY_AND_ASSIGN(NCCLCommContext); }; } // namespace platform diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index bdfe2607..b7ed66bd 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -32,16 +32,9 @@ limitations under the License. */ #include #include "gflags/gflags.h" -DEFINE_double(fraction_of_cpu_memory_to_use, 1, - "Default use 100% of CPU memory for PaddlePaddle," - "reserve the rest for page tables, etc"); -DEFINE_uint64(initial_cpu_memory_in_mb, 500ul, - "Initial CPU memory for PaddlePaddle, in MD unit."); - -DEFINE_double( - fraction_of_cuda_pinned_memory_to_use, 0.5, - "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle," - "reserve the rest for page tables, etc"); +DECLARE_double(fraction_of_cpu_memory_to_use); +DECLARE_uint64(initial_cpu_memory_in_mb); +DECLARE_double(fraction_of_cuda_pinned_memory_to_use); // If use_pinned_memory is true, CPUAllocator calls mlock, which // returns pinned and locked memory as staging areas for data exchange diff --git a/paddle/fluid/platform/cuda_helper.h b/paddle/fluid/platform/cuda_helper.h index 122de72e..c21924ae 100644 --- a/paddle/fluid/platform/cuda_helper.h +++ b/paddle/fluid/platform/cuda_helper.h @@ -29,17 +29,19 @@ namespace platform { class CublasHandleHolder { public: CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) { - PADDLE_ENFORCE(dynload::cublasCreate(&handle_)); - PADDLE_ENFORCE(dynload::cublasSetStream(handle_, stream)); + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cublasCreate(&handle_)); + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cublasSetStream(handle_, stream)); #if CUDA_VERSION >= 9000 if (math_type == CUBLAS_TENSOR_OP_MATH) { - PADDLE_ENFORCE( + PADDLE_ENFORCE_CUDA_SUCCESS( dynload::cublasSetMathMode(handle_, CUBLAS_TENSOR_OP_MATH)); } #endif } - ~CublasHandleHolder() { PADDLE_ENFORCE(dynload::cublasDestroy(handle_)); } + ~CublasHandleHolder() { + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cublasDestroy(handle_)); + } template inline void Call(Callback &&callback) const { diff --git a/paddle/fluid/platform/cudnn_desc.h b/paddle/fluid/platform/cudnn_desc.h index 39a50b3b..f503dfb3 100644 --- a/paddle/fluid/platform/cudnn_desc.h +++ b/paddle/fluid/platform/cudnn_desc.h @@ -102,7 +102,7 @@ class TensorDescriptor { T* desc() { return desc_.get(); } T* desc() const { return desc_.get(); } void set(const Tensor& tensor, const int groups = 1) { - auto dims = framework::vectorize2int(tensor.dims()); + auto dims = framework::vectorize(tensor.dims()); std::vector strides(dims.size()); strides[dims.size() - 1] = 1; for (int i = dims.size() - 2; i >= 0; i--) { @@ -142,7 +142,7 @@ class FilterDescriptor { void set(const Tensor& tensor, const cudnnTensorFormat_t format, const int groups = 1) { - auto dims = 
framework::vectorize2int(tensor.dims()); + auto dims = framework::vectorize(tensor.dims()); if (groups > 1) { dims[1] = dims[1] / groups; } diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h index 61a25064..8c124e71 100644 --- a/paddle/fluid/platform/cudnn_helper.h +++ b/paddle/fluid/platform/cudnn_helper.h @@ -221,10 +221,10 @@ inline cudnnTensorFormat_t GetCudnnTensorFormat( class ScopedTensorDescriptor { public: ScopedTensorDescriptor() { - PADDLE_ENFORCE(dynload::cudnnCreateTensorDescriptor(&desc_)); + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateTensorDescriptor(&desc_)); } ~ScopedTensorDescriptor() { - PADDLE_ENFORCE(dynload::cudnnDestroyTensorDescriptor(desc_)); + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyTensorDescriptor(desc_)); } inline cudnnTensorDescriptor_t descriptor(const cudnnTensorFormat_t format, @@ -243,7 +243,7 @@ class ScopedTensorDescriptor { if (groups > 1) { dims_with_group[1] = dims_with_group[1] / groups; } - PADDLE_ENFORCE(dynload::cudnnSetTensorNdDescriptor( + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetTensorNdDescriptor( desc_, type, dims_with_group.size(), dims_with_group.data(), strides.data())); return desc_; @@ -265,10 +265,10 @@ class ScopedTensorDescriptor { class ScopedFilterDescriptor { public: ScopedFilterDescriptor() { - PADDLE_ENFORCE(dynload::cudnnCreateFilterDescriptor(&desc_)); + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateFilterDescriptor(&desc_)); } ~ScopedFilterDescriptor() { - PADDLE_ENFORCE(dynload::cudnnDestroyFilterDescriptor(desc_)); + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyFilterDescriptor(desc_)); } inline cudnnFilterDescriptor_t descriptor(const cudnnTensorFormat_t format, @@ -284,7 +284,7 @@ class ScopedFilterDescriptor { kernel_with_group[0] /= groups; // NOTE: input filter(C) of the filter is already asserted to be C/groups. } - PADDLE_ENFORCE(dynload::cudnnSetFilterNdDescriptor( + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetFilterNdDescriptor( desc_, type, format, kernel_with_group.size(), kernel_with_group.data())); return desc_; @@ -306,10 +306,12 @@ class ScopedFilterDescriptor { class ScopedConvolutionDescriptor { public: ScopedConvolutionDescriptor() { - PADDLE_ENFORCE(dynload::cudnnCreateConvolutionDescriptor(&desc_)); + PADDLE_ENFORCE_CUDA_SUCCESS( + dynload::cudnnCreateConvolutionDescriptor(&desc_)); } ~ScopedConvolutionDescriptor() { - PADDLE_ENFORCE(dynload::cudnnDestroyConvolutionDescriptor(desc_)); + PADDLE_ENFORCE_CUDA_SUCCESS( + dynload::cudnnDestroyConvolutionDescriptor(desc_)); } inline cudnnConvolutionDescriptor_t descriptor( @@ -332,7 +334,7 @@ class ScopedConvolutionDescriptor { cudnnDataType_t compute_type = (type == CUDNN_DATA_DOUBLE) ? 
CUDNN_DATA_DOUBLE : CUDNN_DATA_FLOAT; - PADDLE_ENFORCE(dynload::cudnnSetConvolutionNdDescriptor( + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetConvolutionNdDescriptor( desc_, pads.size(), pads.data(), strides.data(), dilations.data(), CUDNN_CROSS_CORRELATION, compute_type)); return desc_; @@ -353,10 +355,10 @@ class ScopedConvolutionDescriptor { class ScopedPoolingDescriptor { public: ScopedPoolingDescriptor() { - PADDLE_ENFORCE(dynload::cudnnCreatePoolingDescriptor(&desc_)); + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreatePoolingDescriptor(&desc_)); } ~ScopedPoolingDescriptor() { - PADDLE_ENFORCE(dynload::cudnnDestroyPoolingDescriptor(desc_)); + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyPoolingDescriptor(desc_)); } inline cudnnPoolingDescriptor_t descriptor(const PoolingMode& mode, @@ -365,7 +367,7 @@ class ScopedPoolingDescriptor { const std::vector& strides) { PADDLE_ENFORCE_EQ(kernel.size(), pads.size()); PADDLE_ENFORCE_EQ(kernel.size(), strides.size()); - PADDLE_ENFORCE(dynload::cudnnSetPoolingNdDescriptor( + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetPoolingNdDescriptor( desc_, (GetPoolingMode(mode)), CUDNN_PROPAGATE_NAN, // Always propagate nans. kernel.size(), kernel.data(), pads.data(), strides.data())); @@ -380,16 +382,18 @@ class ScopedPoolingDescriptor { class ScopedSpatialTransformerDescriptor { public: ScopedSpatialTransformerDescriptor() { - PADDLE_ENFORCE(dynload::cudnnCreateSpatialTransformerDescriptor(&desc_)); + PADDLE_ENFORCE_CUDA_SUCCESS( + dynload::cudnnCreateSpatialTransformerDescriptor(&desc_)); } ~ScopedSpatialTransformerDescriptor() { - PADDLE_ENFORCE(dynload::cudnnDestroySpatialTransformerDescriptor(desc_)); + PADDLE_ENFORCE_CUDA_SUCCESS( + dynload::cudnnDestroySpatialTransformerDescriptor(desc_)); } template inline cudnnSpatialTransformerDescriptor_t descriptor(const int nbDims, const int dimA[]) { - PADDLE_ENFORCE(dynload::cudnnSetSpatialTransformerNdDescriptor( + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetSpatialTransformerNdDescriptor( desc_, CUDNN_SAMPLER_BILINEAR, CudnnDataType::type, nbDims, dimA)); return desc_; } @@ -402,10 +406,12 @@ class ScopedSpatialTransformerDescriptor { class ScopedActivationDescriptor { public: ScopedActivationDescriptor() { - PADDLE_ENFORCE(dynload::cudnnCreateActivationDescriptor(&desc_)); + PADDLE_ENFORCE_CUDA_SUCCESS( + dynload::cudnnCreateActivationDescriptor(&desc_)); } ~ScopedActivationDescriptor() { - PADDLE_ENFORCE(dynload::cudnnDestroyActivationDescriptor(desc_)); + PADDLE_ENFORCE_CUDA_SUCCESS( + dynload::cudnnDestroyActivationDescriptor(desc_)); } template @@ -467,15 +473,15 @@ inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) { class ScopedCTCLossDescriptor { public: ScopedCTCLossDescriptor() { - PADDLE_ENFORCE(dynload::cudnnCreateCTCLossDescriptor(&desc_)); + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreateCTCLossDescriptor(&desc_)); } ~ScopedCTCLossDescriptor() { - PADDLE_ENFORCE(dynload::cudnnDestroyCTCLossDescriptor(desc_)); + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyCTCLossDescriptor(desc_)); } template inline cudnnCTCLossDescriptor_t descriptor() { - PADDLE_ENFORCE( + PADDLE_ENFORCE_CUDA_SUCCESS( dynload::cudnnSetCTCLossDescriptor(desc_, CudnnDataType::type)); return desc_; } diff --git a/paddle/fluid/platform/cudnn_workspace_helper.h b/paddle/fluid/platform/cudnn_workspace_helper.h index 58f76e31..29abdc72 100644 --- a/paddle/fluid/platform/cudnn_workspace_helper.h +++ b/paddle/fluid/platform/cudnn_workspace_helper.h @@ -17,7 +17,7 @@ namespace paddle { namespace platform 
{ -static constexpr int kDefaultConvWorkspaceSizeLimitMB = 4096; +static constexpr int kDefaultConvWorkspaceSizeLimitMB = 512; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device_code.cc b/paddle/fluid/platform/device_code.cc new file mode 100644 index 00000000..24421b5c --- /dev/null +++ b/paddle/fluid/platform/device_code.cc @@ -0,0 +1,123 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/device_code.h" +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { + +#ifdef PADDLE_WITH_CUDA +inline bool is_error(nvrtcResult stat) { return stat != NVRTC_SUCCESS; } + +inline void throw_on_error(nvrtcResult stat, const std::string& msg) { +#ifndef REPLACE_ENFORCE_GLOG + throw std::runtime_error(dynload::nvrtcGetErrorString(stat) + msg); +#else + LOG(FATAL) << dynload::nvrtcGetErrorString(stat) << msg; +#endif +} + +CUDADeviceCode::CUDADeviceCode(const Place& place, const std::string& name, + const std::string& kernel) { + if (!is_gpu_place(place)) { + PADDLE_THROW("CUDADeviceCode can only launch on GPU place."); + } + + place_ = place; + name_ = name; + kernel_ = kernel; +} + +void CUDADeviceCode::Compile() { + nvrtcProgram program; + PADDLE_ENFORCE_EQ(dynload::nvrtcCreateProgram(&program, + kernel_.c_str(), // buffer + name_.c_str(), // name + 0, // numHeaders + nullptr, // headers + nullptr), // includeNames + NVRTC_SUCCESS, + "nvrtcCreateProgram failed."); + + // Compile the program for specified compute_capability + auto* dev_ctx = reinterpret_cast( + DeviceContextPool::Instance().Get(place_)); + int compute_capability = dev_ctx->GetComputeCapability(); + std::string compute_flag = + "--gpu-architecture=compute_" + std::to_string(compute_capability); + const std::vector options = {"--std=c++11", + compute_flag.c_str()}; + nvrtcResult compile_result = + dynload::nvrtcCompileProgram(program, // program + options.size(), // numOptions + options.data()); // options + if (compile_result == NVRTC_ERROR_COMPILATION) { + // Obtain compilation log from the program + size_t log_size; + PADDLE_ENFORCE_EQ(dynload::nvrtcGetProgramLogSize(program, &log_size), + NVRTC_SUCCESS, "nvrtcGetProgramLogSize failed."); + std::vector log; + log.resize(log_size + 1); + PADDLE_ENFORCE_EQ(dynload::nvrtcGetProgramLog(program, log.data()), + NVRTC_SUCCESS, "nvrtcGetProgramLog failed."); + LOG(FATAL) << "JIT compiling of CUDA code failed:\n" << log.data(); + } + + // Obtain PTX from the program + size_t ptx_size; + PADDLE_ENFORCE_EQ(dynload::nvrtcGetPTXSize(program, &ptx_size), NVRTC_SUCCESS, + "nvrtcGetPTXSize failed."); + ptx_.resize(ptx_size + 1); + PADDLE_ENFORCE_EQ(dynload::nvrtcGetPTX(program, ptx_.data()), NVRTC_SUCCESS, + "nvrtcGetPTX failed."); + + PADDLE_ENFORCE_EQ(dynload::nvrtcDestroyProgram(&program), NVRTC_SUCCESS, + "nvrtcDestroyProgram failed."); + + PADDLE_ENFORCE_EQ( + dynload::cuModuleLoadData(&module_, ptx_.data()), CUDA_SUCCESS, + "Fail to load 
PTX of %s (in cuModuleLoadData.)", name_.c_str()); + PADDLE_ENFORCE_EQ( + dynload::cuModuleGetFunction(&function_, module_, name_.c_str()), + CUDA_SUCCESS, "Fail to get function of %s (in cuModuleGetFunction.)", + name_.c_str()); + + max_threads_ = dev_ctx->GetMaxPhysicalThreadCount(); +} + +void CUDADeviceCode::Launch(const size_t n, std::vector* args) const { + int max_blocks = std::max(max_threads_ / num_threads_, 1); + int workload_per_block = workload_per_thread_ * num_threads_; + int num_blocks = + std::min(max_blocks, (static_cast(n) + workload_per_block - 1) / + workload_per_block); + + auto* dev_ctx = reinterpret_cast( + DeviceContextPool::Instance().Get(place_)); + PADDLE_ENFORCE_EQ( + dynload::cuLaunchKernel(function_, num_blocks, 1, 1, // grid dim + num_threads_, 1, 1, // block dim + 0, // shared memory + dev_ctx->stream(), // stream + args->data(), // arguments + nullptr), + CUDA_SUCCESS, "Fail to launch kernel %s (in cuLaunchKernel.)", + name_.c_str()); +} +#endif + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device_code.h b/paddle/fluid/platform/device_code.h new file mode 100644 index 00000000..19adb070 --- /dev/null +++ b/paddle/fluid/platform/device_code.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/platform/device_context.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/dynload/cuda_driver.h" +#include "paddle/fluid/platform/dynload/nvrtc.h" +#endif + +namespace paddle { +namespace platform { + +class DeviceCode { + public: + virtual ~DeviceCode() {} + virtual void Compile() = 0; + virtual void Launch(const size_t n, std::vector* args) const = 0; + + protected: + Place place_; + std::string name_; + std::string kernel_; +}; + +#ifdef PADDLE_WITH_CUDA +class CUDADeviceCode : public DeviceCode { + public: + explicit CUDADeviceCode(const Place& place, const std::string& name, + const std::string& kernel); + void Compile() override; + void Launch(const size_t n, std::vector* args) const override; + + void SetNumThreads(int num_threads) { num_threads_ = num_threads; } + void SetWorkloadPerThread(int workload_per_thread) { + workload_per_thread_ = workload_per_thread; + } + + private: + int max_threads_{0}; + int num_threads_{1024}; + int workload_per_thread_{1}; + std::vector ptx_; + CUmodule module_; + CUfunction function_; +}; +#endif + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device_code_test.cc b/paddle/fluid/platform/device_code_test.cc new file mode 100644 index 00000000..3b63ed4e --- /dev/null +++ b/paddle/fluid/platform/device_code_test.cc @@ -0,0 +1,78 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/device_code.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/platform/init.h" + +constexpr auto saxpy_code = R"( +extern "C" __global__ +void saxpy_kernel(float a, float *x, float* y, float* z, size_t n) { + for (size_t tid = blockIdx.x * blockDim.x + threadIdx.x; tid < n; + tid += blockDim.x * gridDim.x) { + z[tid] = a * x[tid] + y[tid]; + } +} +)"; + +#ifdef PADDLE_WITH_CUDA +TEST(device_code, cuda) { + paddle::framework::InitDevices(false, {0}); + paddle::platform::CUDAPlace place = paddle::platform::CUDAPlace(0); + paddle::platform::CUDADeviceCode code(place, "saxpy_kernel", saxpy_code); + + paddle::framework::Tensor cpu_x; + paddle::framework::Tensor cpu_y; + paddle::framework::Tensor cpu_z; + + float scale = 2; + auto dims = paddle::framework::make_ddim( + {static_cast(256), static_cast(1024)}); + cpu_x.mutable_data(dims, paddle::platform::CPUPlace()); + cpu_y.mutable_data(dims, paddle::platform::CPUPlace()); + + size_t n = cpu_x.numel(); + for (size_t i = 0; i < n; ++i) { + cpu_x.data()[i] = static_cast(i); + } + for (size_t i = 0; i < n; ++i) { + cpu_y.data()[i] = static_cast(0.5); + } + + paddle::framework::Tensor x; + paddle::framework::Tensor y; + paddle::framework::Tensor z; + + float* x_data = x.mutable_data(dims, place); + float* y_data = y.mutable_data(dims, place); + float* z_data = z.mutable_data(dims, place); + + TensorCopySync(cpu_x, place, &x); + TensorCopySync(cpu_y, place, &y); + + code.Compile(); + + std::vector args = {&scale, &x_data, &y_data, &z_data, &n}; + code.SetNumThreads(1024); + code.SetWorkloadPerThread(1); + code.Launch(n, &args); + + TensorCopySync(z, paddle::platform::CPUPlace(), &cpu_z); + for (size_t i = 0; i < n; i++) { + PADDLE_ENFORCE_EQ(cpu_z.data()[i], + static_cast(i) * scale + 0.5); + } +} +#endif diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index c9ce7ed1..31665933 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -18,11 +18,39 @@ limitations under the License. 
*/
 #include "paddle/fluid/memory/memory.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/framework/rw_lock.h"
+#include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
 #include "glog/logging.h"
+namespace paddle {
+namespace memory {
+
+AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) {
+  auto place = dev_ctx.GetPlace();
+#ifdef PADDLE_WITH_CUDA
+  if (size == 0 || !platform::is_gpu_place(place)) {
+    return Alloc(place, size);
+  }
+  auto* default_dev_ctx = static_cast<platform::CUDADeviceContext*>(
+      platform::DeviceContextPool::Instance().Get(place));
+  auto& desired_dev_ctx =
+      static_cast<const platform::CUDADeviceContext&>(dev_ctx);
+  if (default_dev_ctx->stream() == desired_dev_ctx.stream()) {
+    return Alloc(place, size);
+  } else {
+    return allocation::CUDADeviceContextAllocatorPool::Instance().Alloc(
+        desired_dev_ctx, size);
+  }
+#else
+  return Alloc(place, size);
+#endif
+}
+
+}  // namespace memory
+}  // namespace paddle
+
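The overload above routes a stream-bound request to CUDADeviceContextAllocatorPool only when the caller's stream differs from the default context's stream. A minimal usage sketch (illustrative only, not part of this patch; `dev_ctx` and `num_bytes` are assumed to come from the surrounding kernel code):

// Sketch: allocating scratch memory tied to a specific device context.
// If dev_ctx runs on a non-default stream, the request is served by
// CUDADeviceContextAllocatorPool; otherwise it falls back to the plain
// per-place allocator, exactly as in Alloc() above.
paddle::memory::AllocationPtr scratch =
    paddle::memory::Alloc(dev_ctx, num_bytes);
void* ptr = scratch->ptr();  // safe to use on dev_ctx's stream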
 namespace paddle {
 namespace platform {

@@ -32,8 +60,10 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
   auto it = device_contexts_.find(place);
   if (it == device_contexts_.end()) {
     PADDLE_THROW(
-        "Place %s is not supported, Please re-compile with WITH_GPU "
-        "option",
+        "Place %s is not supported. Please check that Paddle is compiled "
+        "with the WITH_GPU option, or that your training process holds the "
+        "correct gpu_id if you use Executor",
         place);
   }
   return it->second.get().get();
@@ -87,47 +117,6 @@ DeviceContextPool::DeviceContextPool(
   }
 }

-DeviceTemporaryAllocator* DeviceTemporaryAllocator::allocators = nullptr;
-
-#ifdef PADDLE_WITH_CUDA
-platform::TemporaryAllocator& DeviceTemporaryAllocator::Get(
-    const platform::Place& place, const cudaStream_t& stream) {
-  PADDLE_ENFORCE(platform::is_gpu_place(place));
-  auto place_stream = std::make_pair(place, stream);
-  std::unique_lock<std::mutex> lock(mtx_);
-  auto it = device_allocator_.find(place_stream);
-  if (it == device_allocator_.end()) {
-    auto tmp_allocator = new TemporaryAllocator(place);
-    tmp_allocator->SetCallback([stream]() {
-      PADDLE_ENFORCE(cudaStreamSynchronize(stream));
-      PADDLE_ENFORCE(cudaGetLastError());
-    });
-    device_allocator_[place_stream].reset(tmp_allocator);
-    return *tmp_allocator;
-  } else {
-    return *it->second;
-  }
-}
-
-template <>
-platform::TemporaryAllocator& DeviceTemporaryAllocator::Get(
-    const platform::CUDADeviceContext& dev_ctx) {
-  return Get(dev_ctx.GetPlace(), dev_ctx.stream());
-}
-#endif
-
-template <>
-platform::TemporaryAllocator& DeviceTemporaryAllocator::Get(
-    const platform::CPUDeviceContext& dev_ctx) {
-  return cpu_allocator_;
-}
-
-platform::TemporaryAllocator& DeviceTemporaryAllocator::Get(
-    const platform::Place& place) {
-  PADDLE_ENFORCE(platform::is_cpu_place(place), "You should pass CPUPlace");
-  return cpu_allocator_;
-}
-
 CPUDeviceContext::CPUDeviceContext() {
   eigen_device_.reset(new Eigen::DefaultDevice());
 }
@@ -167,7 +156,9 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
     if (UNLIKELY(num_bytes == 0)) {
       return nullptr;
     }
-    auto buf = paddle::memory::Alloc(place_, num_bytes);
+    auto buf = memory::Alloc(place_, num_bytes);
+    VLOG(4) << "Eigen allocated at " << buf->ptr() << ", size " << buf->size()
+            << ", requested " << num_bytes;
     void* retv = buf->ptr();
     {
       std::lock_guard<std::mutex> lock(mtx_);
@@ -195,7 +186,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
       char* scratch =
           static_cast<char*>(scratchpad()) + Eigen::kCudaScratchSize;
       semaphore_ = reinterpret_cast<unsigned int*>(scratch);
-      PADDLE_ENFORCE(
+      PADDLE_ENFORCE_CUDA_SUCCESS(
           cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_));
     }
     return semaphore_;
@@ -211,36 +202,21 @@
   mutable std::unordered_map<void*, memory::AllocationPtr> allocations_;
 };

-CudnnHolder::CudnnHolder(const cudaStream_t* stream, const CUDAPlace& place)
-    : workspace_(nullptr), stream_(stream), place_(place) {
-  PADDLE_ENFORCE(cudaSetDevice(place_.device));
-  PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
-  PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, *stream_));
-}
-
-CudnnHolder::~CudnnHolder() {
-  PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_));
-}
-
-void CudnnHolder::ReallocateWorkspace(size_t required_workspace_len) {
-  if (required_workspace_len <= WorkspaceSize()) {
+void CudnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) {
+  if (required_workspace_bytes <= WorkspaceSize()) {
     return;
   }
-  if (workspace_ != nullptr) {
-    // Maybe someone is using the current workspace
-    PADDLE_ENFORCE(cudaStreamSynchronize(*stream_));
-    workspace_.reset();
-  }
-  workspace_ = paddle::memory::Alloc(place_, required_workspace_len);
+  // Reset the allocation first, before re-allocating, to save memory.
+  allocation_.reset();
+  allocation_ = memory::Alloc(device_context_, required_workspace_bytes);
 }

-CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
-    : place_(place), cudnn_holder_(nullptr) {
+CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
   CUDADeviceGuard guard(place_.device);
   compute_capability_ = GetCUDAComputeCapability(place_.device);
   multi_process_ = GetCUDAMultiProcessors(place_.device);
   max_threads_per_mp_ = GetCUDAMaxThreadsPerMultiProcessor(place_.device);
-  PADDLE_ENFORCE(cudaStreamCreate(&stream_));
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream_));
   eigen_stream_.reset(new EigenCudaStreamDevice());
   eigen_stream_->Reinitialize(&stream_, place);
   eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
@@ -300,6 +276,14 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
           << "Please recompile or reinstall Paddle with compatible CUDNN "
              "version.";
     }
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        dynload::cudnnCreate(&cudnn_handle_),
+        "Failed to create Cudnn handle in DeviceContext");
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        dynload::cudnnSetStream(cudnn_handle_, stream_),
+        "Failed to set stream for Cudnn handle in DeviceContext");
+  } else {
+    cudnn_handle_ = nullptr;
   }
 }
@@ -314,10 +298,14 @@ CUDADeviceContext::~CUDADeviceContext() {
   cublas_tensor_core_handle_.reset();
   eigen_stream_.reset();
   eigen_device_.reset();
-  PADDLE_ENFORCE(cudaStreamDestroy(stream_));
+  PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream_));
+  if (cudnn_handle_) {
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroy(cudnn_handle_),
+                                "Failed to destroy Cudnn handle");
+  }
 #if !defined(_WIN32)
   if (nccl_comm_) {
-    PADDLE_ENFORCE(dynload::ncclCommDestroy(nccl_comm_));
+    PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclCommDestroy(nccl_comm_));
   }
 #endif
 }
@@ -325,21 +313,17 @@ Place CUDADeviceContext::GetPlace() const { return place_; }

 void CUDADeviceContext::Wait() const {
-  auto& allocator =
-      DeviceTemporaryAllocator::Instance().Get(*this);
-  allocator.Release([this]() {
-    cudaError_t e_sync = cudaStreamSynchronize(stream_);
-    if (e_sync != 0) {
-      LOG(FATAL) << "cudaStreamSynchronize " << cudaGetErrorString(e_sync)
-                 << " errno:" << e_sync;
-    }
+  cudaError_t e_sync = cudaStreamSynchronize(stream_);
+  if (e_sync != 0) {
+    LOG(FATAL) <<
"cudaStreamSynchronize " << cudaGetErrorString(e_sync) + << " errno: " << e_sync; + } - cudaError_t e_get = cudaGetLastError(); - if (e_get != 0) { - LOG(FATAL) << "cudaGetLastError " << cudaGetErrorString(e_get) - << " errno:" << e_get; - } - }); + cudaError_t e_get = cudaGetLastError(); + if (e_get != 0) { + LOG(FATAL) << "cudaGetLastError " << cudaGetErrorString(e_get) + << " errno: " << e_get; + } } int CUDADeviceContext::GetComputeCapability() const { @@ -358,21 +342,10 @@ bool CUDADeviceContext::tensor_core_available() const { return cublas_tensor_core_handle_ != nullptr; } -CudnnHolder* CUDADeviceContext::cudnn_holder() const { - std::call_once(init_cudnn_, [&]() { - if (dynload::HasCUDNN()) { - cudnn_holder_.reset(new CudnnHolder(&stream_, place_)); - } - }); - return cudnn_holder_.get(); -} - -cudnnHandle_t CUDADeviceContext::cudnn_handle() const { - return cudnn_holder()->cudnn_handle(); -} +cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; } CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const { - return CudnnWorkspaceHandle(cudnn_holder()); + return CudnnWorkspaceHandle(*this, &cudnn_handle_mtx_); } cudaStream_t CUDADeviceContext::stream() const { return stream_; } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index a17a0bdf..3504f62b 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -18,7 +18,6 @@ limitations under the License. */ #include #include #include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/platform/temporary_allocator.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/dynload/cublas.h" @@ -45,71 +44,6 @@ limitations under the License. */ namespace paddle { namespace platform { -/*! \brief device temporary allocator singleton. - * - * Some operator needs temporary memory during computation, for example, - * conv_gemm, which needs use col to store the result of im2col. If we - * create a stack memory which is used by CUDA Kernel, before the - * Computation(...) returns, we should add ctx->Wait(), because the - * execution of CUDA is async, if there doesn't have ctx->Wait(), - * the temporary memory will be released before the CUDA Kernel uses - * it. - * - * DeviceTemporaryAllocator is a singleton, which contains a - * `TemporaryAllocator` for each . And the TemporaryAllocator - * contains a temp_allocation_queue which is used to store the temporary - * allocations. The allocation, which is allocated by TemporaryAllocator, - * is a unique_ptr, and when it is not held by any variable, it will be - * pushed into the temp_allocation_queue. There are two opportunities to free - * the allocations of temp_allocation_queue: - * - when the Stream calls cudaStreamSynchronize; - * - when the allocation size of opportunities exceeds a certain threshold - * (defined by FLAGS_limit_of_tmp_allocation). - * - * */ -class DeviceTemporaryAllocator { - public: - static DeviceTemporaryAllocator& Instance() { - PADDLE_ENFORCE_NOT_NULL(allocators, - "Need to Create DeviceTemporaryAllocator first!"); - return *allocators; - } - - static DeviceTemporaryAllocator& Init() { - if (allocators == nullptr) { - allocators = new DeviceTemporaryAllocator(); - } - return *allocators; - } - -/*! \brief Return handle of single temporary allocator. 
*/ -#ifdef PADDLE_WITH_CUDA - platform::TemporaryAllocator& Get(const platform::Place& place, - const cudaStream_t& stream); -#endif - template - platform::TemporaryAllocator& Get(const DeviceContext& dev_ctx); - - platform::TemporaryAllocator& Get(const platform::Place& place); - - private: - DeviceTemporaryAllocator() : cpu_allocator_(platform::CPUPlace()) {} - - static DeviceTemporaryAllocator* allocators; - - platform::TemporaryAllocator cpu_allocator_; - -#ifdef PADDLE_WITH_CUDA - std::map, - std::unique_ptr> - device_allocator_; -#endif - - std::mutex mtx_; - - DISABLE_COPY_AND_ASSIGN(DeviceTemporaryAllocator); -}; - class DeviceContext { public: virtual ~DeviceContext() {} @@ -143,102 +77,7 @@ struct DefaultDeviceContextType { #ifdef PADDLE_WITH_CUDA class EigenCudaStreamDevice; -class CudnnHolder { - public: - CudnnHolder(const cudaStream_t* stream, const CUDAPlace& place); - ~CudnnHolder(); - cudnnHandle_t cudnn_handle() const { return cudnn_handle_; } - - private: - friend class CudnnWorkspaceHandle; - void ReallocateWorkspace(size_t required_workspace_len); - - template - void RunFuncImpl(Callback&& cudnn_func, size_t required_workspace_len) { - if (required_workspace_len > WorkspaceSize()) { - ReallocateWorkspace(required_workspace_len); - } - VLOG(2) << "Cudnn workspace size: " - << static_cast(WorkspaceSize()) / (1 << 20) << " MB"; - cudnn_func(WorkspacePtr()); - } - - /*! \brief Reset workspace thus release the memory */ - inline void ResetWorkspace() { - if (workspace_) { - // Maybe someone is using the current workspace - PADDLE_ENFORCE(cudaStreamSynchronize(*stream_)); - workspace_ = nullptr; - } - } - - inline void* WorkspacePtr() { - if (workspace_) { - return workspace_->ptr(); - } else { - return nullptr; - } - } - - inline size_t WorkspaceSize() { - if (workspace_) { - return workspace_->size(); - } else { - return 0; - } - } - - std::mutex& Mutex() { return mtx_; } - - cudnnHandle_t cudnn_handle_; - memory::AllocationPtr workspace_; - - const cudaStream_t* stream_; // not owned; - const CUDAPlace place_; - - std::mutex mtx_; -}; - -class CudnnWorkspaceHandle { - public: - /*! \brief The lock would not be acquired when constructor calls. - * The lock would be acquired when RunFunc() is called first time. */ - inline explicit CudnnWorkspaceHandle(CudnnHolder* holder) : holder_(holder) {} - - /*! \brief Thread which call RunFunc() would acquire the lock first - * before invoking cudnn functions. */ - template - inline void RunFunc(Callback&& cudnn_func, size_t required_workspace_len) { - if (!guard_) { - guard_.reset(new std::lock_guard(holder_->Mutex())); - } - holder_->RunFuncImpl(std::forward(cudnn_func), - required_workspace_len); - } - - /*! \brief Thread which call RunFuncSync() would acquire the lock first - * before invoking cudnn function and release gpu memory after running - * the function. 
Currently this function is only used when cudnn - * exhaustive searching and callers have to guarantee that the input function - * is host blocking */ - template - inline void RunFuncSync(Callback&& cudnn_func, - size_t required_workspace_len) { - if (!guard_) { - guard_.reset(new std::lock_guard(holder_->Mutex())); - } - holder_->RunFuncImpl(std::forward(cudnn_func), - required_workspace_len); - holder_->ResetWorkspace(); - } - - CudnnWorkspaceHandle(CudnnWorkspaceHandle&&) = default; - CudnnWorkspaceHandle& operator=(CudnnWorkspaceHandle&&) = delete; - - private: - CudnnHolder* holder_; // not own - std::unique_ptr> guard_; -}; +class CudnnWorkspaceHandle; class CUDADeviceContext : public DeviceContext { public: @@ -306,7 +145,7 @@ class CUDADeviceContext : public DeviceContext { template void RecordEvent(cudaEvent_t ev, Callback callback) { callback(); - PADDLE_ENFORCE(cudaEventRecord(ev, stream_)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(ev, stream_)); } template @@ -323,9 +162,11 @@ class CUDADeviceContext : public DeviceContext { std::unique_ptr eigen_device_; std::unique_ptr eigen_stream_; - mutable std::unique_ptr cudnn_holder_; cudaStream_t stream_; + cudnnHandle_t cudnn_handle_; + mutable std::mutex cudnn_handle_mtx_; + std::unique_ptr cublas_handle_; std::unique_ptr cublas_tensor_core_handle_; @@ -346,11 +187,59 @@ class CUDADeviceContext : public DeviceContext { // StreamCallbackManager is thread-safe std::unique_ptr callback_manager_; - CudnnHolder* cudnn_holder() const; DISABLE_COPY_AND_ASSIGN(CUDADeviceContext); }; +class CudnnWorkspaceHandle { + public: + inline CudnnWorkspaceHandle(const CUDADeviceContext& dev_ctx, std::mutex* mtx) + : device_context_(dev_ctx), mtx_(mtx) {} + + template + inline void RunFunc(Callback&& cudnn_func, size_t required_workspace_bytes) { + if (required_workspace_bytes > WorkspaceSize()) { + ReallocWorkspace(required_workspace_bytes); + } + VLOG(2) << "Cudnn workspace size at RunFunc: " + << static_cast(WorkspaceSize()) / (1 << 20) << " MB"; + { + std::lock_guard guard(*mtx_); + cudnn_func(allocation_ ? allocation_->ptr() : nullptr); + } + } + + /*! \brief Thread which call RunFuncSync() would release gpu memory after + * running the function. Currently this function is only used when cudnn + * exhaustive searching and callers have to guarantee that the input function + * is host blocking */ + template + inline void RunFuncSync(Callback&& cudnn_func, + size_t required_workspace_bytes) { + RunFunc(cudnn_func, required_workspace_bytes); + ResetWorkspace(); + } + + void ReallocWorkspace(size_t required_workspace_bytes); + + inline void ResetWorkspace() { allocation_ = nullptr; } + + inline size_t WorkspaceSize() { + if (allocation_ == nullptr) { + return 0; + } + return allocation_->size(); + } + + CudnnWorkspaceHandle(CudnnWorkspaceHandle&&) = default; + CudnnWorkspaceHandle& operator=(CudnnWorkspaceHandle&&) = delete; + + private: + memory::allocation::AllocationPtr allocation_; + const CUDADeviceContext& device_context_; + std::mutex* mtx_; +}; + template <> struct DefaultDeviceContextType { using TYPE = CUDADeviceContext; diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 35dbdc5a..81312111 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -3,8 +3,9 @@ cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce) list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc) # There is no macOS version of NCCL. 
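The CudnnWorkspaceHandle defined above is created per call and owns no cached state beyond its current allocation, so callers hand it a lambda that receives the raw workspace pointer. A minimal usage sketch (illustrative only, not part of this patch; `dev_ctx` and `workspace_bytes` are assumed to come from the surrounding operator, e.g. a cudnnGetConvolutionForwardWorkspaceSize() query):

// Sketch: running a cuDNN call through the new workspace handle.
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
workspace_handle.RunFunc(
    [&](void* workspace_ptr) {
      // The cuDNN call that consumes the workspace goes here, e.g.
      // cudnnConvolutionForward(..., workspace_ptr, workspace_bytes, ...);
    },
    workspace_bytes);
// RunFuncSync() behaves the same but calls ResetWorkspace() afterwards,
// releasing the allocation once the host-blocking cuDNN call has finished.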
+# Disable the nvrtc and cuda_driver APIs on macOS and Windows, and only do an early test on Linux.
 if (NOT APPLE AND NOT WIN32)
-  list(APPEND CUDA_SRCS nccl.cc)
+  list(APPEND CUDA_SRCS nccl.cc nvrtc.cc cuda_driver.cc)
 endif()

 if (TENSORRT_FOUND)
@@ -16,12 +17,8 @@ if (CUPTI_FOUND)
   list(APPEND CUDA_SRCS cupti.cc)
 endif(CUPTI_FOUND)
 nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader)
-configure_file(warpctc_lib_path.h.in ${CMAKE_CURRENT_BINARY_DIR}/warpctc_lib_path.h)
 cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
 if (WITH_MKLML)
   cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml)
 endif()
-if (WITH_CUSTOM_TRAINER)
-  cc_library(dynload_custom_trainer SRCS custom_trainer.cc DEPS dynamic_loader)
-endif()
 # TODO(TJ): add iomp, mkldnn?
diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h
index ced789b9..ed9b9133 100644
--- a/paddle/fluid/platform/dynload/cublas.h
+++ b/paddle/fluid/platform/dynload/cublas.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at

-       http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0

-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */

 #pragma once
diff --git a/paddle/fluid/platform/dynload/cuda_driver.cc b/paddle/fluid/platform/dynload/cuda_driver.cc
new file mode 100644
index 00000000..2c2edb2c
--- /dev/null
+++ b/paddle/fluid/platform/dynload/cuda_driver.cc
@@ -0,0 +1,30 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "paddle/fluid/platform/dynload/cuda_driver.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag cuda_dso_flag; +void* cuda_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +CUDA_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/cuda_driver.h b/paddle/fluid/platform/dynload/cuda_driver.h new file mode 100644 index 00000000..89479772 --- /dev/null +++ b/paddle/fluid/platform/dynload/cuda_driver.h @@ -0,0 +1,79 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include // NOLINT +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag cuda_dso_flag; +extern void* cuda_dso_handle; + +#ifdef PADDLE_USE_DSO + +#define DECLARE_DYNAMIC_LOAD_CUDA_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + using cuda_func = decltype(&::__name); \ + std::call_once(cuda_dso_flag, []() { \ + cuda_dso_handle = paddle::platform::dynload::GetCUDADsoHandle(); \ + }); \ + static void* p_##__name = dlsym(cuda_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern struct DynLoad__##__name __name + +#else + +#define DECLARE_DYNAMIC_LOAD_CUDA_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + inline auto operator()(Args... args) { \ + return ::__name(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#endif + +/** + * include all needed cuda driver functions + **/ +#define CUDA_ROUTINE_EACH(__macro) \ + __macro(cuGetErrorString); \ + __macro(cuModuleLoadData); \ + __macro(cuModuleGetFunction); \ + __macro(cuModuleUnload); \ + __macro(cuOccupancyMaxActiveBlocksPerMultiprocessor); \ + __macro(cuLaunchKernel); \ + __macro(cuCtxCreate); \ + __macro(cuCtxGetCurrent); \ + __macro(cuDeviceGet); \ + __macro(cuDevicePrimaryCtxGetState) + +CUDA_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDA_WRAP); + +#undef DECLARE_DYNAMIC_LOAD_CUDA_WRAP + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index e6811ddc..4a1cd5a8 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -20,7 +20,6 @@ limitations under the License. */ #include "gflags/gflags.h" #include "glog/logging.h" #include "paddle/fluid/platform/dynload/cupti_lib_path.h" -#include "paddle/fluid/platform/dynload/warpctc_lib_path.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/port.h" @@ -34,8 +33,6 @@ DEFINE_string(cuda_dir, "", "libcurand. For instance, /usr/local/cuda/lib64. 
If default, " "dlopen will search cuda from LD_LIBRARY_PATH"); -DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so."); - DEFINE_string(nccl_dir, "", "Specify path for loading nccl library, such as libcublas, " "libcurand. For instance, /usr/local/cuda/lib64. If default, " @@ -48,13 +45,19 @@ DEFINE_string( "Specify path for loading tensorrt library, such as libnvinfer.so."); DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so."); -DEFINE_string(custom_trainer_dir, "", "Specify path for loading custom_trainer.so."); namespace paddle { namespace platform { namespace dynload { + +struct PathNode { + PathNode() {} + std::string path = ""; +}; + static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH; -static constexpr char warpctc_lib_path[] = WARPCTC_LIB_PATH; + +static PathNode s_py_site_pkg_path; #if defined(_WIN32) && defined(PADDLE_WITH_CUDA) static constexpr char* win_cublas_lib = "cublas64_" PADDLE_CUDA_BINVER ".dll"; @@ -79,6 +82,11 @@ static inline std::string join(const std::string& part1, return ret; } +void SetPaddleLibPath(const std::string& py_site_pkg_path) { + s_py_site_pkg_path.path = py_site_pkg_path; + VLOG(3) << "Set paddle lib path : " << py_site_pkg_path; +} + static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path, int dynload_flags) { VLOG(3) << "Try to find library: " << dso_path @@ -214,10 +222,26 @@ void* GetCurandDsoHandle() { #endif } +void* GetNVRTCDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.dylib"); +#else + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvrtc.so"); +#endif +} + +void* GetCUDADsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.dylib"); +#else + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcuda.so"); +#endif +} + void* GetWarpCTCDsoHandle() { - std::string warpctc_dir = warpctc_lib_path; - if (!FLAGS_warpctc_dir.empty()) { - warpctc_dir = FLAGS_warpctc_dir; + std::string warpctc_dir = ""; + if (!s_py_site_pkg_path.path.empty()) { + warpctc_dir = s_py_site_pkg_path.path; } #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(warpctc_dir, "libwarpctc.dylib"); @@ -239,6 +263,8 @@ void* GetNCCLDsoHandle() { void* GetTensorRtDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.dylib"); +#elif defined(_WIN32) + return GetDsoHandleFromSearchPath(FLAGS_mklml_dir, "nvinfer.dll"); #else return GetDsoHandleFromSearchPath(FLAGS_tensorrt_dir, "libnvinfer.so"); #endif @@ -254,10 +280,6 @@ void* GetMKLMLDsoHandle() { #endif } -void* GetCustomTrainerDsoHandle() { - return GetDsoHandleFromSearchPath(FLAGS_custom_trainer_dir, "custom_trainer.so"); -} - } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h index b820c132..df101474 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.h +++ b/paddle/fluid/platform/dynload/dynamic_loader.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include namespace paddle { namespace platform { @@ -28,12 +29,14 @@ void* GetCublasDsoHandle(); void* GetCUDNNDsoHandle(); void* GetCUPTIDsoHandle(); void* GetCurandDsoHandle(); +void* GetNVRTCDsoHandle(); +void* GetCUDADsoHandle(); void* GetWarpCTCDsoHandle(); void* GetNCCLDsoHandle(); void* GetTensorRtDsoHandle(); void* GetMKLMLDsoHandle(); -void* GetCustomTrainerDsoHandle(); +void SetPaddleLibPath(const std::string&); } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/mklml.cc b/paddle/fluid/platform/dynload/mklml.cc index 0f61a5e0..020c02d9 100644 --- a/paddle/fluid/platform/dynload/mklml.cc +++ b/paddle/fluid/platform/dynload/mklml.cc @@ -25,6 +25,11 @@ void* mklml_dso_handle = nullptr; MKLML_ROUTINE_EACH(DEFINE_WRAP); +#if !defined(_WIN32) +DEFINE_WRAP(mkl_scsrmm); +DEFINE_WRAP(mkl_dcsrmm); +#endif + } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h index a5b846f5..5070be43 100644 --- a/paddle/fluid/platform/dynload/mklml.h +++ b/paddle/fluid/platform/dynload/mklml.h @@ -92,6 +92,11 @@ extern void* mklml_dso_handle; MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP); +#if !defined(_WIN32) +DYNAMIC_LOAD_MKLML_WRAP(mkl_scsrmm); +DYNAMIC_LOAD_MKLML_WRAP(mkl_dcsrmm); +#endif + #undef DYNAMIC_LOAD_MKLML_WRAP } // namespace dynload diff --git a/paddle/fluid/platform/dynload/nvrtc.cc b/paddle/fluid/platform/dynload/nvrtc.cc new file mode 100644 index 00000000..793b5b8d --- /dev/null +++ b/paddle/fluid/platform/dynload/nvrtc.cc @@ -0,0 +1,30 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/nvrtc.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag nvrtc_dso_flag; +void* nvrtc_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +NVRTC_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/nvrtc.h b/paddle/fluid/platform/dynload/nvrtc.h new file mode 100644 index 00000000..20647aff --- /dev/null +++ b/paddle/fluid/platform/dynload/nvrtc.h @@ -0,0 +1,77 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include // NOLINT +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag nvrtc_dso_flag; +extern void* nvrtc_dso_handle; + +#ifdef PADDLE_USE_DSO + +#define DECLARE_DYNAMIC_LOAD_NVRTC_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + using nvrtc_func = decltype(&::__name); \ + std::call_once(nvrtc_dso_flag, []() { \ + nvrtc_dso_handle = paddle::platform::dynload::GetNVRTCDsoHandle(); \ + }); \ + static void* p_##__name = dlsym(nvrtc_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern struct DynLoad__##__name __name + +#else + +#define DECLARE_DYNAMIC_LOAD_NVRTC_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + inline auto operator()(Args... args) { \ + return ::__name(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#endif + +/** + * include all needed nvrtc functions + **/ +#define NVRTC_ROUTINE_EACH(__macro) \ + __macro(nvrtcGetErrorString); \ + __macro(nvrtcCompileProgram); \ + __macro(nvrtcCreateProgram); \ + __macro(nvrtcDestroyProgram); \ + __macro(nvrtcGetPTX); \ + __macro(nvrtcGetPTXSize); \ + __macro(nvrtcGetProgramLog); \ + __macro(nvrtcGetProgramLogSize) + +NVRTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVRTC_WRAP); + +#undef DECLARE_DYNAMIC_LOAD_NVRTC_WRAP + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/tensorrt.h b/paddle/fluid/platform/dynload/tensorrt.h index 751aa54b..4c7ba0f0 100644 --- a/paddle/fluid/platform/dynload/tensorrt.h +++ b/paddle/fluid/platform/dynload/tensorrt.h @@ -14,7 +14,9 @@ limitations under the License. */ #pragma once #include +#if !defined(_WIN32) #include +#endif #include // NOLINT @@ -34,7 +36,7 @@ extern void* tensorrt_dso_handle; struct DynLoad__##__name { \ template \ auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ - using tensorrt_func = decltype(__name(args...)) (*)(Args...); \ + using tensorrt_func = decltype(&::__name); \ std::call_once(tensorrt_dso_flag, []() { \ tensorrt_dso_handle = \ paddle::platform::dynload::GetTensorRtDsoHandle(); \ diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 127be445..f9ae4113 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -26,7 +26,9 @@ limitations under the License. 
*/ #include #endif // PADDLE_WITH_CUDA +#include #include +#include #include #include #include @@ -64,63 +66,63 @@ inline std::string demangle(std::string name) { inline std::string demangle(std::string name) { return name; } #endif +template +inline std::string GetTraceBackString(StrType&& what, const char* file, + int line) { + static constexpr int TRACE_STACK_LIMIT = 100; + std::ostringstream sout; + + sout << string::Sprintf("%s at [%s:%d]", std::forward(what), file, + line) + << std::endl; + sout << "PaddlePaddle Call Stacks: " << std::endl; +#if !defined(_WIN32) + void* call_stack[TRACE_STACK_LIMIT]; + auto size = backtrace(call_stack, TRACE_STACK_LIMIT); + auto symbols = backtrace_symbols(call_stack, size); + Dl_info info; + for (int i = 0; i < size; ++i) { + if (dladdr(call_stack[i], &info) && info.dli_sname) { + auto demangled = demangle(info.dli_sname); + auto addr_offset = static_cast(call_stack[i]) - + static_cast(info.dli_saddr); + sout << string::Sprintf("%-3d %*0p %s + %zd\n", i, 2 + sizeof(void*) * 2, + call_stack[i], demangled, addr_offset); + } else { + sout << string::Sprintf("%-3d %*0p\n", i, 2 + sizeof(void*) * 2, + call_stack[i]); + } + } + free(symbols); +#else + sout << "Windows not support stack backtrace yet."; +#endif + return sout.str(); +} + struct EnforceNotMet : public std::exception { std::string err_str_; - EnforceNotMet(std::exception_ptr e, const char* f, int l) { + EnforceNotMet(std::exception_ptr e, const char* file, int line) { try { std::rethrow_exception(e); } catch (std::exception& e) { - Init(e.what(), f, l); + err_str_ = GetTraceBackString(e.what(), file, line); } } - EnforceNotMet(const std::string& str, const char* f, int l) { - Init(str, f, l); - } + EnforceNotMet(const std::string& str, const char* file, int line) + : err_str_(GetTraceBackString(str, file, line)) {} const char* what() const noexcept override { return err_str_.c_str(); } - - private: - template - inline void Init(StrType what, const char* f, int l) { - static constexpr int TRACE_STACK_LIMIT = 100; - std::ostringstream sout; - - sout << string::Sprintf("%s at [%s:%d]", what, f, l) << std::endl; - sout << "PaddlePaddle Call Stacks: " << std::endl; -#if !defined(_WIN32) - void* call_stack[TRACE_STACK_LIMIT]; - auto size = backtrace(call_stack, TRACE_STACK_LIMIT); - auto symbols = backtrace_symbols(call_stack, size); - Dl_info info; - for (int i = 0; i < size; ++i) { - if (dladdr(call_stack[i], &info) && info.dli_sname) { - auto demangled = demangle(info.dli_sname); - auto addr_offset = static_cast(call_stack[i]) - - static_cast(info.dli_saddr); - sout << string::Sprintf("%-3d %*0p %s + %zd\n", i, - 2 + sizeof(void*) * 2, call_stack[i], demangled, - addr_offset); - } else { - sout << string::Sprintf("%-3d %*0p\n", i, 2 + sizeof(void*) * 2, - call_stack[i]); - } - } - free(symbols); -#else - sout << "Windows not support stack backtrace yet."; -#endif - err_str_ = sout.str(); - } }; struct EOFException : public std::exception { std::string err_str_; - EOFException(const char* err_msg, const char* f, int l) { - err_str_ = string::Sprintf("%s at [%s:%d]", err_msg, f, l); + EOFException(const char* err_msg, const char* file, int line) { + err_str_ = string::Sprintf("%s at [%s:%d]", err_msg, file, line); } - const char* what() const noexcept { return err_str_.c_str(); } + const char* what() const noexcept override { return err_str_.c_str(); } }; // Because most enforce conditions would evaluate to true, we can use @@ -236,12 +238,50 @@ inline void throw_on_error(ncclResult_t stat, const 
std::string& msg) { #endif // __APPLE__ and windows #endif // PADDLE_WITH_CUDA +#ifdef PADDLE_WITH_CUDA +namespace details { + +template +struct CudaStatusType {}; + +#define DEFINE_CUDA_STATUS_TYPE(type, success_value) \ + template <> \ + struct CudaStatusType { \ + using Type = type; \ + static constexpr Type kSuccess = success_value; \ + } + +DEFINE_CUDA_STATUS_TYPE(cudaError_t, cudaSuccess); +DEFINE_CUDA_STATUS_TYPE(curandStatus_t, CURAND_STATUS_SUCCESS); +DEFINE_CUDA_STATUS_TYPE(cudnnStatus_t, CUDNN_STATUS_SUCCESS); +DEFINE_CUDA_STATUS_TYPE(cublasStatus_t, CUBLAS_STATUS_SUCCESS); + +#if !defined(__APPLE__) && !defined(_WIN32) +DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess); +#endif + +} // namespace details +#endif + #define PADDLE_THROW(...) \ do { \ throw ::paddle::platform::EnforceNotMet( \ ::paddle::string::Sprintf(__VA_ARGS__), __FILE__, __LINE__); \ } while (0) +#if defined(__CUDA_ARCH__) +// For cuda, the assertions can affect performance and it is therefore +// recommended to disable them in production code +// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#assertion +#define PADDLE_ENFORCE(_IS_NOT_ERROR, __FORMAT, ...) \ + do { \ + if (!(_IS_NOT_ERROR)) { \ + printf("Exception: %s:%d Assertion `%s` failed. " __FORMAT "\n", \ + __FILE__, __LINE__, #_IS_NOT_ERROR, ##__VA_ARGS__); \ + asm("trap;"); \ + } \ + } while (0) +#else #define PADDLE_ENFORCE(COND, ...) \ do { \ auto __cond__ = (COND); \ @@ -255,12 +295,41 @@ inline void throw_on_error(ncclResult_t stat, const std::string& msg) { } \ } \ } while (0) +#endif + +#ifdef PADDLE_WITH_CUDA +#define PADDLE_ENFORCE_CUDA_SUCCESS(COND, ...) \ + do { \ + auto __cond__ = (COND); \ + using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ + constexpr auto __success_type__ = \ + ::paddle::platform::details::CudaStatusType< \ + __CUDA_STATUS_TYPE__>::kSuccess; \ + if (UNLIKELY(__cond__ != __success_type__)) { \ + try { \ + ::paddle::platform::throw_on_error( \ + __cond__, ::paddle::string::Sprintf(__VA_ARGS__)); \ + } catch (...) { \ + throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ + __FILE__, __LINE__); \ + } \ + } \ + } while (0) + +#undef DEFINE_CUDA_STATUS_TYPE +#endif #define PADDLE_THROW_EOF() \ do { \ throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \ __LINE__); \ - } while (false) + } while (0) + +#define PADDLE_THROW_BAD_ALLOC(...) \ + do { \ + throw ::paddle::memory::allocation::BadAlloc( \ + ::paddle::string::Sprintf(__VA_ARGS__), __FILE__, __LINE__); \ + } while (0) /* * Some enforce helpers here, usage: @@ -319,28 +388,72 @@ using CommonType1 = typename std::add_lvalue_reference< template using CommonType2 = typename std::add_lvalue_reference< typename std::add_const::Type2>::type>::type; + +// Here, we use SFINAE to check whether T can be converted to std::string +template +struct CanToString { + private: + using YesType = uint8_t; + using NoType = uint16_t; + + template + static YesType Check(decltype(std::cout << std::declval())) { + return 0; + } + + template + static NoType Check(...) 
{ + return 0; + } + + public: + static constexpr bool kValue = + std::is_same(std::cout))>::value; +}; + +template +struct BinaryCompareMessageConverter { + template + static std::string Convert(const char* expression, const T& value) { + return expression + std::string(":") + string::to_string(value); + } +}; + +template <> +struct BinaryCompareMessageConverter { + template + static const char* Convert(const char* expression, const T& value) { + return expression; + } +}; + } // namespace details -#define __PADDLE_BINARY_COMPARE(__VAL1, __VAL2, __CMP, __INV_CMP, ...) \ - do { \ - auto __val1 = (__VAL1); \ - auto __val2 = (__VAL2); \ - using __TYPE1__ = decltype(__val1); \ - using __TYPE2__ = decltype(__val2); \ - using __COMMON_TYPE1__ = \ - ::paddle::platform::details::CommonType1<__TYPE1__, __TYPE2__>; \ - using __COMMON_TYPE2__ = \ - ::paddle::platform::details::CommonType2<__TYPE1__, __TYPE2__>; \ - bool __is_not_error = (static_cast<__COMMON_TYPE1__>(__val1))__CMP( \ - static_cast<__COMMON_TYPE2__>(__val2)); \ - if (UNLIKELY(!__is_not_error)) { \ - PADDLE_THROW("Enforce failed. Expected %s " #__CMP \ - " %s, but received %s:%s " #__INV_CMP " %s:%s.\n%s", \ - #__VAL1, #__VAL2, #__VAL1, \ - ::paddle::string::to_string(__val1), #__VAL2, \ - ::paddle::string::to_string(__val2), \ - ::paddle::string::Sprintf(__VA_ARGS__)); \ - } \ +#define __PADDLE_BINARY_COMPARE(__VAL1, __VAL2, __CMP, __INV_CMP, ...) \ + do { \ + auto __val1 = (__VAL1); \ + auto __val2 = (__VAL2); \ + using __TYPE1__ = decltype(__val1); \ + using __TYPE2__ = decltype(__val2); \ + using __COMMON_TYPE1__ = \ + ::paddle::platform::details::CommonType1<__TYPE1__, __TYPE2__>; \ + using __COMMON_TYPE2__ = \ + ::paddle::platform::details::CommonType2<__TYPE1__, __TYPE2__>; \ + bool __is_not_error = (static_cast<__COMMON_TYPE1__>(__val1))__CMP( \ + static_cast<__COMMON_TYPE2__>(__val2)); \ + if (UNLIKELY(!__is_not_error)) { \ + constexpr bool __kCanToString__ = \ + ::paddle::platform::details::CanToString<__TYPE1__>::kValue && \ + ::paddle::platform::details::CanToString<__TYPE2__>::kValue; \ + PADDLE_THROW("Enforce failed. Expected %s " #__CMP \ + " %s, but received %s " #__INV_CMP " %s.\n%s", \ + #__VAL1, #__VAL2, \ + ::paddle::platform::details::BinaryCompareMessageConverter< \ + __kCanToString__>::Convert(#__VAL1, __val1), \ + ::paddle::platform::details::BinaryCompareMessageConverter< \ + __kCanToString__>::Convert(#__VAL2, __val2), \ + ::paddle::string::Sprintf(__VA_ARGS__)); \ + } \ } while (0) #define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \ diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index adcc9536..4e34f3cb 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -11,7 +11,9 @@ limitations under the License. 
*/ #include #include +#include #include +#include #include "gtest/gtest.h" #include "paddle/fluid/platform/enforce.h" @@ -253,3 +255,107 @@ TEST(EOF_EXCEPTION, THROW_EOF) { } EXPECT_TRUE(caught_eof); } + +#ifdef PADDLE_WITH_CUDA +template +bool CheckCudaStatusSuccess(T value, const std::string& msg = "success") { + PADDLE_ENFORCE_CUDA_SUCCESS(value, msg); + return true; +} + +template +bool CheckCudaStatusFailure( + T value, const std::string& msg = "self-defined cuda status failed") { + try { + PADDLE_ENFORCE_CUDA_SUCCESS(value, msg); + return false; + } catch (paddle::platform::EnforceNotMet& error) { + std::string ex_msg = error.what(); + return ex_msg.find(msg) != std::string::npos; + } +} + +TEST(enforce, cuda_success) { + EXPECT_TRUE(CheckCudaStatusSuccess(cudaSuccess)); + EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorInvalidValue)); + EXPECT_TRUE(CheckCudaStatusFailure(cudaErrorMemoryAllocation)); + + EXPECT_TRUE(CheckCudaStatusSuccess(CURAND_STATUS_SUCCESS)); + EXPECT_TRUE(CheckCudaStatusFailure(CURAND_STATUS_VERSION_MISMATCH)); + EXPECT_TRUE(CheckCudaStatusFailure(CURAND_STATUS_NOT_INITIALIZED)); + + EXPECT_TRUE(CheckCudaStatusSuccess(CUDNN_STATUS_SUCCESS)); + EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_NOT_INITIALIZED)); + EXPECT_TRUE(CheckCudaStatusFailure(CUDNN_STATUS_ALLOC_FAILED)); + + EXPECT_TRUE(CheckCudaStatusSuccess(CUBLAS_STATUS_SUCCESS)); + EXPECT_TRUE(CheckCudaStatusFailure(CUBLAS_STATUS_NOT_INITIALIZED)); + EXPECT_TRUE(CheckCudaStatusFailure(CUBLAS_STATUS_INVALID_VALUE)); +#if !defined(__APPLE__) && !defined(_WIN32) + EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess)); + EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError)); + EXPECT_TRUE(CheckCudaStatusFailure(ncclSystemError)); +#endif +} +#endif + +struct CannotToStringType { + explicit CannotToStringType(int num) : num_(num) {} + + bool operator==(const CannotToStringType& other) const { + return num_ == other.num_; + } + + bool operator!=(const CannotToStringType& other) const { + return num_ != other.num_; + } + + private: + int num_; +}; + +TEST(enforce, cannot_to_string_type) { + static_assert( + !paddle::platform::details::CanToString::kValue, + "CannotToStringType must not be converted to string"); + static_assert(paddle::platform::details::CanToString::kValue, + "int can be converted to string"); + CannotToStringType obj1(3), obj2(4), obj3(3); + + PADDLE_ENFORCE_NE(obj1, obj2, "Object 1 is not equal to Object 2"); + PADDLE_ENFORCE_EQ(obj1, obj3, "Object 1 is equal to Object 3"); + + std::string msg = "Compare obj1 with obj2"; + try { + PADDLE_ENFORCE_EQ(obj1, obj2, msg); + } catch (paddle::platform::EnforceNotMet& error) { + std::string ex_msg = error.what(); + LOG(INFO) << ex_msg; + EXPECT_TRUE(ex_msg.find(msg) != std::string::npos); + EXPECT_TRUE( + ex_msg.find("Expected obj1 == obj2, but received obj1 != obj2") != + std::string::npos); + } + + msg = "Compare x with y"; + try { + int x = 3, y = 2; + PADDLE_ENFORCE_EQ(x, y, msg); + } catch (paddle::platform::EnforceNotMet& error) { + std::string ex_msg = error.what(); + LOG(INFO) << ex_msg; + EXPECT_TRUE(ex_msg.find(msg) != std::string::npos); + EXPECT_TRUE(ex_msg.find("Expected x == y, but received x:3 != y:2") != + std::string::npos); + } + + std::set set; + PADDLE_ENFORCE_EQ(set.begin(), set.end()); + set.insert(3); + PADDLE_ENFORCE_NE(set.begin(), set.end()); + + std::list list; + PADDLE_ENFORCE_EQ(list.begin(), list.end()); + list.push_back(4); + PADDLE_ENFORCE_NE(list.begin(), list.end()); +} diff --git a/paddle/fluid/platform/flags.cc 
b/paddle/fluid/platform/flags.cc
new file mode 100644
index 00000000..e40d0cf1
--- /dev/null
+++ b/paddle/fluid/platform/flags.cc
@@ -0,0 +1,453 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "gflags/gflags.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/cudnn_workspace_helper.h"
+#endif
+
+/**
+ * NOTE(paddle-dev): This file is designed to define all public FLAGS.
+ */
+
+/**
+ * Paddle initialization related FLAG
+ * Name: FLAGS_paddle_num_threads
+ * Since Version: 0.15.0
+ * Value Range: int32, default=1
+ * Example: FLAGS_paddle_num_threads=2, set the maximum thread number per
+ *          instance to 2
+ * Note:
+ */
+DEFINE_int32(paddle_num_threads, 1,
+             "Number of threads for each paddle instance.");
+
+/**
+ * Operator related FLAG
+ * Name: FLAGS_check_nan_inf
+ * Since Version: 0.13.0
+ * Value Range: bool, default=false
+ * Example:
+ * Note: Used for debugging. Checks whether an operator produces NAN/INF or
+ *       not.
+ */
+DEFINE_bool(check_nan_inf, false,
+            "Checks whether an operator produces NAN/INF or not. It will be "
+            "extremely slow, so please use this flag wisely.");
+
+#ifdef PADDLE_WITH_CUDA
+
+/**
+ * CUDA related FLAG
+ * Name: FLAGS_enable_cublas_tensor_op_math
+ * Since Version: 1.2.0
+ * Value Range: bool, default=false
+ * Example:
+ * Note: whether to use Tensor Cores; faster, but it may lose precision.
+ */
+DEFINE_bool(
+    enable_cublas_tensor_op_math, false,
+    "The enable_cublas_tensor_op_math indicates whether to use Tensor Cores, "
+    "but it may lose precision. Currently, there are two CUDA libraries that "
+    "use Tensor Cores: cuBLAS and cuDNN. cuBLAS uses Tensor Cores to speed up "
+    "GEMM computations (the matrices must be either half precision or single "
+    "precision); cuDNN uses Tensor Cores to speed up both convolutions (the "
+    "input and output must be half precision) and recurrent neural networks "
+    "(RNNs).");
+
+/**
+ * CUDA related FLAG
+ * Name: FLAGS_selected_gpus
+ * Since Version: 1.3.0
+ * Value Range: integer list separated by comma, default empty list
+ * Example: FLAGS_selected_gpus=0,1,2,3,4,5,6,7 to train or predict with 0~7
+ *          gpu cards
+ * Note: A list of device ids separated by comma, like: 0,1,2,3
+ */
+DEFINE_string(selected_gpus, "",
+              "A list of device ids separated by comma, like: 0,1,2,3. "
+              "This option is useful when doing multi-process training and "
+              "each process has only one device (GPU). If you want to use "
+              "all visible devices, set this to an empty string. NOTE: the "
+              "reason for doing this is that we want to use P2P communication "
+              "between GPU devices, and with CUDA_VISIBLE_DEVICES only "
+              "shared-memory communication can be used.");
+#endif
+
+#ifdef PADDLE_WITH_CUDA
+
+/**
+ * CUDNN related FLAG
+ * Name: FLAGS_cudnn_deterministic
+ * Since Version: 0.13.0
+ * Value Range: bool, default=false
+ * Example:
+ * Note: whether to use a deterministic algorithm in cudnn.
+ *       If true, it will slow down some operators such as conv and pooling.
+ */
+DEFINE_bool(cudnn_deterministic, false,
+            "Whether to allow using an autotuning algorithm for the "
+            "convolution operator. The autotuning algorithm may be "
+            "non-deterministic. If true, the algorithm is deterministic.");
+
+/**
+ * CUDNN related FLAG
+ * Name: FLAGS_conv_workspace_size_limit
+ * Since Version: 0.13.0
+ * Value Range: uint64, default=512 (MB)
+ * Example:
+ * Note: The internal function of cuDNN obtains the fastest matching algorithm
+ *       within this memory limit. Usually, faster algorithms can be chosen in
+ *       larger workspaces, but memory usage can also increase significantly.
+ *       Users need to balance memory and speed.
+ */
+DEFINE_uint64(conv_workspace_size_limit,
+              paddle::platform::kDefaultConvWorkspaceSizeLimitMB,
+              "cuDNN convolution workspace limit in MB unit.");
+
+/**
+ * CUDNN related FLAG
+ * Name: FLAGS_cudnn_exhaustive_search
+ * Since Version: 1.2.0
+ * Value Range: bool, default=false
+ * Example:
+ * Note: Represents whether an exhaustive search method is used to
+ *       select a convolution algorithm. There are two search methods in
+ *       cuDNN, heuristic search and exhaustive search. Exhaustive search
+ *       attempts all cuDNN algorithms to select the fastest. This method is
+ *       very time-consuming, and the selected algorithm will be cached for a
+ *       given layer specification. Once you change the layer specification
+ *       (such as batch size or feature map size), it will search again.
+ */
+DEFINE_bool(cudnn_exhaustive_search, false,
+            "Whether to enable exhaustive search for cuDNN convolution or "
+            "not; default is False.");
+
+/**
+ * CUDNN related FLAG
+ * Name: FLAGS_cudnn_exhaustive_search_times
+ * Since Version:
+ * Value Range:
+ * Example:
+ * Note: only used in prediction, intended for advanced developers
+ */
+DEFINE_int64(cudnn_exhaustive_search_times, -1,
+             "Exhaustive search times for cuDNN convolution; "
+             "default is -1, meaning no exhaustive search.");
+
+/**
+ * CUDNN related FLAG
+ * Name: FLAGS_cudnn_batchnorm_spatial_persistent
+ * Since Version: 1.4.0
+ * Value Range: bool, default=false
+ * Example:
+ * Note: CUDNN_BATCHNORM_SPATIAL_PERSISTENT in batchnorm. This mode can be
+ *       faster in some tasks because an optimized path may be selected for
+ *       the CUDNN_DATA_FLOAT and CUDNN_DATA_HALF data types on compute
+ *       capability 6.0 or higher. The reason we set it to false by default
+ *       is that this mode may use a scaled atomic integer reduction that may
+ *       cause numerical overflow for certain input data ranges.
+ */
+DEFINE_bool(cudnn_batchnorm_spatial_persistent, false,
+            "Whether to enable the CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode "
+            "for cudnn batch_norm; default is False.");
+#endif
+
+#ifdef PADDLE_WITH_CUDA
+
+/**
+ * NCCL related FLAG
+ * Name: FLAGS_sync_nccl_allreduce
+ * Since Version:
+ * Value Range:
+ * Example:
+ * Note: asynchronous versus synchronous nccl allreduce; see the issue:
+ *       https://github.com/PaddlePaddle/Paddle/issues/15049
+ *       If you want to change this default value, explain why. (gongwb)
+ */
+DEFINE_bool(
+    sync_nccl_allreduce, true,
+    "If set true, will call `cudaStreamSynchronize(nccl_stream)` after "
+    "allreduce; this mode can get better performance in some scenarios.");
+#endif
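All of these knobs are plain gflags definitions, so any translation unit can pick one up with the matching DECLARE_* macro. A minimal consumption sketch (illustrative only, not part of this patch; note that conv_workspace_size_limit is only defined under PADDLE_WITH_CUDA):

#include "gflags/gflags.h"

DECLARE_uint64(conv_workspace_size_limit);  // matches the DEFINE_uint64 above

// The flag is documented in MB; convert to bytes before handing it to cuDNN.
static size_t ConvWorkspaceLimitBytes() {
  return static_cast<size_t>(FLAGS_conv_workspace_size_limit) << 20;
}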
+#ifdef PADDLE_WITH_DISTRIBUTE
+/**
+ * Distributed related FLAG
+ * Name: FLAGS_communicator_max_merge_var_num
+ * Since Version: 1.5.0
+ * Value Range: int32, default=20
+ * Example:
+ * Note: The maximum number of gradients to be merged into one gradient and
+ *       sent through the communicator. The trainer puts all the gradients
+ *       into the queue, and then the communicator takes the gradients out
+ *       of the queue and sends them after merging.
+ */
+DEFINE_int32(communicator_max_merge_var_num, 20,
+             "max var num to merge and send");
+
+/**
+ * Distributed related FLAG
+ * Name: FLAGS_communicator_send_queue_size
+ * Since Version: 1.5.0
+ * Value Range: int32, default=20
+ * Example:
+ * Note: Size of each gradient queue. The trainer puts the gradient into
+ *       the queue, and then the communicator takes it out of the queue and
+ *       sends it out. When the communicator is slow, the queue may be full,
+ *       and the trainer will block until the queue has space. It is used to
+ *       keep training from running much faster than communication, so that
+ *       too many gradients do not pile up waiting to be sent.
+ */
+DEFINE_int32(communicator_send_queue_size, 20,
+             "queue size for receiving gradients before sending them");
+#endif
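The two communicator flags describe a bounded producer/consumer queue: the trainer pushes gradients and blocks when the queue holds communicator_send_queue_size entries, while the communicator pops and merges at most communicator_max_merge_var_num of them per send. A self-contained sketch of that scheme, with invented types and names, not the actual communicator code:

    #include <condition_variable>
    #include <mutex>
    #include <queue>
    #include <utility>
    #include <vector>

    struct Gradient {
      std::vector<float> data;
    };

    class BoundedGradQueue {
     public:
      explicit BoundedGradQueue(size_t capacity) : capacity_(capacity) {}

      // Trainer side: blocks while the queue is full.
      void Push(Gradient g) {
        std::unique_lock<std::mutex> lock(mu_);
        not_full_.wait(lock, [this] { return queue_.size() < capacity_; });
        queue_.push(std::move(g));
        not_empty_.notify_one();
      }

      // Communicator side: takes out up to max_merge gradients per send.
      std::vector<Gradient> PopForMerge(size_t max_merge) {
        std::unique_lock<std::mutex> lock(mu_);
        not_empty_.wait(lock, [this] { return !queue_.empty(); });
        std::vector<Gradient> merged;
        while (!queue_.empty() && merged.size() < max_merge) {
          merged.push_back(std::move(queue_.front()));
          queue_.pop();
        }
        not_full_.notify_all();
        return merged;
      }

     private:
      size_t capacity_;
      std::queue<Gradient> queue_;
      std::mutex mu_;
      std::condition_variable not_empty_, not_full_;
    };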
+/**
+ * Distributed related FLAG
+ * Name: FLAGS_dist_threadpool_size
+ * Since Version: 1.0.0
+ * Value Range: int32, default=0
+ * Example:
+ * Note: Controls the number of threads used by distributed modules.
+ *       If it is not set, the number of hardware threads is used.
+ */
+DEFINE_int32(dist_threadpool_size, 0,
+             "number of threads used for distributed execution.");
+
+/**
+ * Garbage collector related FLAG
+ * Name: FLAGS_eager_delete_tensor_gb
+ * Since Version: 1.0.0
+ * Value Range: double, default=kDefaultEagerDeleteTensorGB
+ * Example: FLAGS_eager_delete_tensor_gb=0.0, release memory garbage once it is
+ *          no longer used.
+ *          FLAGS_eager_delete_tensor_gb=1.0, release memory garbage when
+ *          garbage occupies 1.0GB of memory.
+ *          FLAGS_eager_delete_tensor_gb=-1.0, disable the garbage collection
+ *          policy.
+ * Note: Represents whether a garbage collection strategy is used to optimize
+ *       network memory usage. It is recommended that users set
+ *       FLAGS_eager_delete_tensor_gb=0.0 to enable the garbage collection
+ *       strategy when training large networks.
+ */
+// Disable gc by default when inference library is built
+#ifdef PADDLE_ON_INFERENCE
+static const double kDefaultEagerDeleteTensorGB = -1;
+#else
+static const double kDefaultEagerDeleteTensorGB = 0;
+#endif
+
+DEFINE_double(
+    eager_delete_tensor_gb, kDefaultEagerDeleteTensorGB,
+    "Memory size threshold (GB) when the garbage collector clears tensors. "
+    "Disabled when this value is less than 0.");
+
+/**
+ * Memory related FLAG
+ * Name: FLAGS_fast_eager_deletion_mode
+ * Since Version: 1.3.0
+ * Value Range: bool, default=true
+ * Example:
+ * Note: Whether to use the fast garbage collection strategy.
+ *       If not set, the GPU memory is released at the end of the CUDA kernel.
+ *       Otherwise, the GPU memory will be released before the CUDA kernel
+ *       has finished, which makes the garbage collection strategy faster.
+ *       Only works when the garbage collection strategy is enabled.
+ */
+DEFINE_bool(fast_eager_deletion_mode, true,
+            "Fast eager deletion mode. If enabled, memory is released "
+            "immediately, without waiting for the GPU kernel to end.");
+
+/**
+ * Memory related FLAG
+ * Name: FLAGS_memory_fraction_of_eager_deletion
+ * Since Version: 1.4
+ * Value Range: double [0.0, 1.0], default=1.0
+ * Example:
+ * Note: The fraction of variables, by memory size, that the garbage
+ *       collection policy releases.
+ *       If FLAGS_memory_fraction_of_eager_deletion = 1.0,
+ *       all temporary variables in the network will be released.
+ *       If FLAGS_memory_fraction_of_eager_deletion = 0.0,
+ *       no temporary variables in the network are released.
+ *       If 0.0 < FLAGS_memory_fraction_of_eager_deletion < 1.0,
+ *       all temporary variables will be sorted in descending order
+ *       according to their memory size, and only the variables making up the
+ *       largest FLAGS_memory_fraction_of_eager_deletion ratio will be
+ *       released.
+ *       The flag is only valid when running data-parallel compiled programs
+ *       (e.g. with the parallel executor).
+ */
+DEFINE_double(memory_fraction_of_eager_deletion, 1.0,
+              "Fraction of eager deletion. If less than 1.0, all variables in "
+              "the program would be sorted according to their memory size, and "
+              "only the FLAGS_memory_fraction_of_eager_deletion of the largest "
+              "variables would be deleted.");
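The fractional case in the note above amounts to a sort-and-truncate over the candidate variables. A small sketch of that selection rule, assuming a toy VarInfo record with a byte size; the real garbage collector works on the program's variable list, not this type:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    struct VarInfo {
      std::size_t bytes;
    };

    // fraction plays the role of FLAGS_memory_fraction_of_eager_deletion.
    std::vector<VarInfo> PickVarsToRelease(std::vector<VarInfo> vars,
                                           double fraction) {
      std::sort(vars.begin(), vars.end(),
                [](const VarInfo& a, const VarInfo& b) { return a.bytes > b.bytes; });
      auto keep = static_cast<std::size_t>(vars.size() * fraction);
      vars.resize(keep);  // release only the largest `fraction` of variables
      return vars;
    }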
+/**
+ * Allocator related FLAG
+ * Name: FLAGS_allocator_strategy
+ * Since Version: 1.2
+ * Value Range: string, {naive_best_fit, auto_growth}, default=naive_best_fit
+ * Example:
+ * Note: The allocator policy PaddlePaddle selects.
+ *       The allocator strategy is under development, and the non-legacy
+ *       allocator is not yet stable.
+ */
+DEFINE_string(allocator_strategy, "naive_best_fit",
+              "The allocation strategy. naive_best_fit means the original best "
+              "fit allocator of Fluid. "
+              "auto_growth means the experimental auto-growth allocator. "
+              "Enum in [naive_best_fit, auto_growth].");
+
+/**
+ * Memory related FLAG
+ * Name: FLAGS_fraction_of_cpu_memory_to_use
+ * Since Version: 0.12.0
+ * Value Range: double, [0.0, 1.0], default=1
+ * Example:
+ * Note: Represents the proportion of allocated CPU memory blocks
+ *       to the total memory size of the CPU. Future CPU memory usage
+ *       will be allocated from this memory block. If the memory block does
+ *       not have enough CPU memory, new memory blocks of the same size
+ *       will be requested from the CPU until the CPU does not have enough
+ *       memory.
+ */
+DEFINE_double(fraction_of_cpu_memory_to_use, 1,
+              "Default use 100% of CPU memory for PaddlePaddle, "
+              "reserving the rest for page tables, etc.");
+
+/**
+ * Memory related FLAG
+ * Name: FLAGS_initial_cpu_memory_in_mb
+ * Since Version: 0.14.0
+ * Value Range: uint64, default=500 (MB)
+ * Example:
+ * Note: The CPU memory block size of the initial allocator in MB.
+ *       The allocator takes the minimum of
+ *       FLAGS_initial_cpu_memory_in_mb and
+ *       FLAGS_fraction_of_cpu_memory_to_use*(total physical memory)
+ *       as the memory block size.
+ */
+DEFINE_uint64(initial_cpu_memory_in_mb, 500ul,
+              "Initial CPU memory for PaddlePaddle, in MB.");
+
+/**
+ * Memory related FLAG
+ * Name: FLAGS_fraction_of_cuda_pinned_memory_to_use
+ * Since Version: 0.12.0
+ * Value Range: double, [0.0, 1.0], default=0.5
+ * Example:
+ * Note: Represents the proportion of allocated CUDA pinned memory blocks
+ *       to the total memory size of the CPU. Future CUDA pinned memory usage
+ *       will be allocated from this memory block. If the memory block does
+ *       not have enough CUDA pinned memory, new memory blocks of the same
+ *       size will be requested as CUDA pinned memory until the CPU does not
+ *       have enough memory.
+ */
+DEFINE_double(
+    fraction_of_cuda_pinned_memory_to_use, 0.5,
+    "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle, "
+    "reserving the rest for page tables, etc.");
+
+#ifdef PADDLE_WITH_CUDA
+
+/**
+ * Memory related FLAG
+ * Name: FLAGS_fraction_of_gpu_memory_to_use
+ * Since Version: 1.2.0
+ * Value Range: double, default=0.5 if win32, 0.92 else
+ * Example:
+ * Note: Represents the proportion of allocated memory blocks to the total
+ *       memory size of the GPU. Future memory usage will be allocated from
+ *       this memory block. If the memory block does not have enough GPU
+ *       memory, new memory blocks of the same size will be requested from
+ *       the GPU until the GPU does not have enough memory.
+ */
+
+#ifndef _WIN32
+constexpr static float fraction_of_gpu_memory_to_use = 0.92f;
+#else
+// fraction_of_gpu_memory_to_use cannot be too high on windows,
+// since the win32 graphic sub-system can occupy some GPU memory
+// which may lead to insufficient memory left for paddle
+constexpr static float fraction_of_gpu_memory_to_use = 0.5f;
+#endif
+DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
+              "Allocate a chunk of gpu memory that is this fraction of the "
+              "total gpu memory size. Future memory usage will be allocated "
+              "from the chunk. If the chunk doesn't have enough gpu memory, "
+              "additional chunks of the same size will be requested from gpu "
+              "until the gpu has no memory left for another chunk.");
+
+/**
+ * Memory related FLAG
+ * Name: FLAGS_initial_gpu_memory_in_mb
+ * Since Version: 1.4.0
+ * Value Range: uint64, default=0 (MB)
+ * Example:
+ * Note: Allocate a specified size of GPU memory block. Later memory usage
+ *       will be allocated from that memory block. If the memory block does
+ *       not have enough GPU memory, a memory block of the size
+ *       FLAGS_reallocate_gpu_memory_in_mb will be requested from the GPU
+ *       until the GPU has no remaining memory.
+ */
+DEFINE_uint64(
+    initial_gpu_memory_in_mb, 0ul,
+    "Allocate a chunk of gpu memory whose byte size is specified by "
+    "the flag. Future memory usage will be allocated from the "
+    "chunk. If the chunk doesn't have enough gpu memory, additional "
+    "chunks of gpu memory will be requested from gpu with the size "
+    "specified by FLAGS_reallocate_gpu_memory_in_mb until the gpu has "
+    "no memory left for the additional chunk. Note: if you set this "
+    "flag, the memory size set by "
+    "FLAGS_fraction_of_gpu_memory_to_use will be overridden by this "
+    "flag. If you don't set this flag, PaddlePaddle will use "
+    "FLAGS_fraction_of_gpu_memory_to_use to allocate gpu memory.");
+
+/**
+ * Memory related FLAG
+ * Name: FLAGS_reallocate_gpu_memory_in_mb
+ * Since Version: 1.4.0
+ * Value Range: uint64, default=0 (MB)
+ * Example:
+ * Note: If the allocated GPU memory blocks are exhausted,
+ *       additional GPU memory blocks are reallocated
+ */
+DEFINE_uint64(reallocate_gpu_memory_in_mb, 0ul,
+              "If this flag is set, Paddle will reallocate gpu memory with "
+              "the size specified by this flag. Otherwise, Paddle will "
+              "reallocate by FLAGS_fraction_of_gpu_memory_to_use.");
+
+#endif
+
+/**
+ * Scope related FLAG
+ * Name: local_exe_sub_scope_limit
+ * Since Version: 1.6.0
+ * Value Range: double, default=256 (MB)
+ * Example:
+ * Note:
+ */
+DEFINE_double(local_exe_sub_scope_limit, 256.0,  // MBytes
+              "The memory upper limit of sub-scopes of the local execution "
+              "scope for each CUDAPlace. If you don't need to limit the "
+              "memory, you should set FLAGS_local_exe_sub_scope_limit=-1. "
+              "The default value is 256 MBytes.");
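Taken together, the three GPU memory flags define the chunk-growth policy that the rewritten GpuAllocSize() in the gpu_info.cc hunk below implements: an explicit MB flag wins, otherwise a fraction of the allocatable memory is used. Reduced to a sketch, with an illustrative helper rather than the allocator itself:

    #include <cstdint>

    // flag_mb is FLAGS_initial_gpu_memory_in_mb for the first chunk and
    // FLAGS_reallocate_gpu_memory_in_mb afterwards; available_bytes is the
    // allocatable GPU memory, fraction is FLAGS_fraction_of_gpu_memory_to_use.
    std::uint64_t ChunkBytes(std::uint64_t flag_mb, std::uint64_t available_bytes,
                             double fraction) {
      if (flag_mb > 0) return flag_mb << 20;  // explicit size overrides fraction
      return static_cast<std::uint64_t>(available_bytes * fraction);
    }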
diff --git a/paddle/fluid/platform/float16_test.cc b/paddle/fluid/platform/float16_test.cc
index 3a937dfa..f411c386 100644
--- a/paddle/fluid/platform/float16_test.cc
+++ b/paddle/fluid/platform/float16_test.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/init.h"
 namespace paddle {
@@ -144,7 +145,7 @@ TEST(float16, lod_tensor_cpu) {
 TEST(float16, floating) {
   // compile time assert.
-  PADDLE_ASSERT(std::is_floating_point::value);
+  PADDLE_ENFORCE_EQ(std::is_floating_point::value, true);
 }
 TEST(float16, print) {
diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu
index 14cad927..bf203841 100644
--- a/paddle/fluid/platform/float16_test.cu
+++ b/paddle/fluid/platform/float16_test.cu
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/platform/enforce.h"
 #define ARITHMETIC_KERNEL(op_type, sign)                                  \
   __global__ void op_type(const half* in1, const half* in2, half* out) { \
@@ -260,8 +261,8 @@ TEST(float16, typeid) {
   int b(0);
   // compile time assert
-  PADDLE_ASSERT(functor(a) == true);
-  PADDLE_ASSERT(functor2(b) == false);
+  PADDLE_ENFORCE_EQ(functor(a), true);
+  PADDLE_ENFORCE_EQ(functor2(b), false);
 }
 // GPU test
diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc
index 24209662..8191d688 100644
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -21,61 +21,14 @@ limitations under the License. */
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/string/split.h"
-#ifndef _WIN32
-constexpr static float fraction_of_gpu_memory_to_use = 0.92f;
-#else
-// fraction_of_gpu_memory_to_use cannot be too high on windows,
-// since the win32 graphic sub-system can occupy some GPU memory
-// which may lead to insufficient memory left for paddle
-constexpr static float fraction_of_gpu_memory_to_use = 0.5f;
-#endif
+DECLARE_double(fraction_of_gpu_memory_to_use);
+DECLARE_uint64(initial_gpu_memory_in_mb);
+DECLARE_uint64(reallocate_gpu_memory_in_mb);
+DECLARE_bool(enable_cublas_tensor_op_math);
+DECLARE_string(selected_gpus);
 constexpr static float fraction_reserve_gpu_memory = 0.05f;
-DEFINE_double(fraction_of_gpu_memory_to_use, fraction_of_gpu_memory_to_use,
-              "Allocate a trunk of gpu memory that is this fraction of the "
-              "total gpu memory size. Future memory usage will be allocated "
-              "from the trunk. If the trunk doesn't have enough gpu memory, "
-              "additional trunks of the same size will be requested from gpu "
-              "until the gpu has no memory left for another trunk.");
-
-DEFINE_uint64(
-    initial_gpu_memory_in_mb, 0ul,
-    "Allocate a trunk of gpu memory whose byte size is specified by "
-    "the flag. Future memory usage will be allocated from the "
-    "trunk. If the trunk doesn't have enough gpu memory, additional "
-    "trunks of the gpu memory will be requested from gpu with size "
-    "specified by FLAGS_reallocate_gpu_memory_in_mb until the gpu has "
-    "no memory left for the additional trunk. Note: if you set this "
-    "flag, the memory size set by "
-    "FLAGS_fraction_of_gpu_memory_to_use will be overrided by this "
-    "flag. If you don't set this flag, PaddlePaddle will use "
-    "FLAGS_fraction_of_gpu_memory_to_use to allocate gpu memory");
-
-DEFINE_uint64(reallocate_gpu_memory_in_mb, 0ul,
-              "If this flag is set, Paddle will reallocate the gpu memory with "
-              "size specified by this flag. Else Paddle will reallocate by "
-              "FLAGS_fraction_of_gpu_memory_to_use");
-
-DEFINE_bool(
-    enable_cublas_tensor_op_math, false,
-    "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, "
-    "but it may loss precision. Currently, There are two CUDA libraries that"
-    " use Tensor Cores, cuBLAS and cuDNN. cuBLAS uses Tensor Cores to speed up"
-    " GEMM computations(the matrices must be either half precision or single "
-    "precision); cuDNN uses Tensor Cores to speed up both convolutions(the "
-    "input and output must be half precision) and recurrent neural networks "
-    "(RNNs).");
-
-DEFINE_string(selected_gpus, "",
-              "A list of device ids separated by comma, like: 0,1,2,3. "
-              "This option is useful when doing multi process training and "
-              "each process have only one device (GPU). If you want to use "
-              "all visible devices, set this to empty string. NOTE: the "
-              "reason of doing this is that we want to use P2P communication"
-              "between GPU devices, use CUDA_VISIBLE_DEVICES can only use"
-              "share-memory only.");
-
 namespace paddle {
 namespace platform {
@@ -227,47 +180,47 @@ void GpuMemoryUsage(size_t *available, size_t *total) {
                  error_code, CudaErrorWebsite());
 }
-size_t GpuMaxAllocSize() {
-  return std::max(GpuInitAllocSize(), GpuReallocSize());
-}
-
-size_t GpuInitAllocSize() {
-  if (FLAGS_initial_gpu_memory_in_mb > 0ul) {
-    // Initial memory will be allocated by FLAGS_initial_gpu_memory_in_mb
-    return static_cast(FLAGS_initial_gpu_memory_in_mb << 20);
-  }
-
-  // FLAGS_initial_gpu_memory_in_mb is 0, initial memory will be allocated by
-  // fraction
+size_t GpuAvailableMemToAlloc() {
   size_t total = 0;
   size_t available = 0;
   GpuMemoryUsage(&available, &total);
-  size_t reserving = static_cast(fraction_reserve_gpu_memory * total);
-
-  return static_cast((total - reserving) *
-                     FLAGS_fraction_of_gpu_memory_to_use);
+  size_t reserving =
+      static_cast(fraction_reserve_gpu_memory * available);
+  // If the available size is less than the minimum chunk size, no usable
+  // memory exists
+  size_t available_to_alloc = available - reserving;
+  size_t min_chunk_size = GpuMinChunkSize();
+  if (available_to_alloc < min_chunk_size) {
+    available_to_alloc = 0;
+  }
+  VLOG(10) << "GPU usage " << (available >> 20) << "M/" << (total >> 20)
+           << "M, " << (available_to_alloc >> 20) << "M available to allocate";
+  return available_to_alloc;
 }
-size_t GpuReallocSize() {
-  if (FLAGS_reallocate_gpu_memory_in_mb > 0ul) {
-    // Additional memory will be allocated by
-    // FLAGS_reallocate_gpu_memory_in_mb
-    return static_cast(FLAGS_reallocate_gpu_memory_in_mb << 20);
-  }
+size_t GpuMaxAllocSize() {
+  return std::max(GpuInitAllocSize(), GpuReallocSize());
+}
-  // FLAGS_reallocate_gpu_memory_in_mb is 0, additional memory will be
-  // allocated
-  // by fraction
-  size_t total = 0;
-  size_t available = 0;
+static size_t GpuAllocSize(bool realloc) {
+  size_t available_to_alloc = GpuAvailableMemToAlloc();
+  PADDLE_ENFORCE_GT(available_to_alloc, 0, "Not enough available GPU memory");
+  // If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be
+  // allocated by fraction
+  size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb
+                           : FLAGS_initial_gpu_memory_in_mb;
+  size_t alloc_bytes =
+      (flag_mb > 0ul ? flag_mb << 20
                     : available_to_alloc * FLAGS_fraction_of_gpu_memory_to_use);
+  PADDLE_ENFORCE_GE(available_to_alloc, alloc_bytes,
+                    "Not enough available GPU memory");
+  VLOG(10) << "Alloc size is " << (alloc_bytes >> 20)
+           << " MiB, is it Re-alloc: " << realloc;
+  return alloc_bytes;
+}
-  GpuMemoryUsage(&available, &total);
-  size_t reserving = static_cast(fraction_reserve_gpu_memory * total);
+size_t GpuInitAllocSize() { return GpuAllocSize(/* realloc = */ false); }
-  return static_cast((total - reserving) *
-                     FLAGS_fraction_of_gpu_memory_to_use);
-}
+size_t GpuReallocSize() { return GpuAllocSize(/* realloc = */ true); }
 size_t GpuMinChunkSize() {
   // Allow to allocate the minimum chunk size is 256 bytes.
@@ -275,24 +228,9 @@ size_t GpuMinChunkSize() {
 }
 size_t GpuMaxChunkSize() {
-  size_t total = 0;
-  size_t available = 0;
-
-  GpuMemoryUsage(&available, &total);
-  VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/"
-           << total / 1024 / 1024 << "M";
-  size_t reserving = static_cast(fraction_reserve_gpu_memory * total);
-  // If available less than minimum chunk size, no usable memory exists.
-  available =
-      std::min(std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(),
-               total - reserving);
-
-  size_t allocating = GpuMaxAllocSize();
-
-  PADDLE_ENFORCE_LE(allocating, available,
-                    "Insufficient GPU memory to allocation.");
-
-  return allocating;
+  size_t max_chunk_size = GpuMaxAllocSize();
+  VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
+  return max_chunk_size;
 }
 void GpuMemcpyAsync(void *dst, const void *src, size_t count,
diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h
index d4be7ac9..e468c4aa 100644
--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
@@ -57,6 +57,10 @@ void SetDeviceId(int device_id);
 //! Get the memory usage of current GPU device.
 void GpuMemoryUsage(size_t *available, size_t *total);
+//! Get the available memory to allocate, which is the size of available gpu
+//! memory minus the reserved size.
+size_t GpuAvailableMemToAlloc();
+
 //! Get the maximum allocation size of current GPU device.
 size_t GpuMaxAllocSize();
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index 9b7b2120..be6519b1 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -36,8 +36,7 @@ limitations under the License. */
 #include "dgc/dgc.h"
 #endif
-DEFINE_int32(paddle_num_threads, 1,
-             "Number of threads for each paddle instance.");
+DECLARE_int32(paddle_num_threads);
 DEFINE_int32(multiple_of_cupti_buffer_size, 1,
              "Multiple of the CUPTI device buffer size. 
If the timestamps have " "been dropped when you are profiling, try increasing this value."); @@ -45,6 +44,10 @@ DEFINE_int32(multiple_of_cupti_buffer_size, 1, namespace paddle { namespace framework { +#ifdef _WIN32 +#define strdup _strdup +#endif + std::once_flag gflags_init_flag; std::once_flag p2p_init_flag; @@ -150,7 +153,6 @@ void InitDevices(bool init_p2p, const std::vector devices) { } places.emplace_back(platform::CPUPlace()); platform::DeviceContextPool::Init(places); - platform::DeviceTemporaryAllocator::Init(); #ifndef PADDLE_WITH_MKLDNN platform::SetNumThreads(FLAGS_paddle_num_threads); @@ -204,9 +206,10 @@ void InitDevices(bool init_p2p, const std::vector devices) { } #ifndef _WIN32 -static void SignalHandle(const char *data, int size) { +void SignalHandle(const char *data, int size) { auto file_path = string::Sprintf("/tmp/paddle.%d.dump_info", ::getpid()); try { + LOG(WARNING) << std::string(data, size); std::ofstream dump_info; dump_info.open(file_path, std::ios::app); dump_info << std::string(data, size); diff --git a/paddle/fluid/platform/init.h b/paddle/fluid/platform/init.h index 01d66f57..d25e79e7 100644 --- a/paddle/fluid/platform/init.h +++ b/paddle/fluid/platform/init.h @@ -32,5 +32,9 @@ void InitDevices(bool init_p2p, const std::vector devices); void InitDGC(); +#ifndef _WIN32 +void SignalHandle(const char *data, int size); +#endif + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/platform/init_test.cc b/paddle/fluid/platform/init_test.cc index eef1470a..3f911843 100644 --- a/paddle/fluid/platform/init_test.cc +++ b/paddle/fluid/platform/init_test.cc @@ -38,3 +38,10 @@ TEST(InitDevices, CUDA) { ASSERT_EQ(pool.size(), 1U + static_cast(count)); #endif } + +#ifndef _WIN32 +TEST(SignalHandle, SignalHandle) { + std::string msg = "Signal raises"; + paddle::framework::SignalHandle(msg.c_str(), msg.size()); +} +#endif diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index dafdb4ea..1ff568ce 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -17,11 +17,14 @@ limitations under the License. 
*/ #include #include #include +#include #include #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/place.h" - namespace paddle { +#ifdef PADDLE_WITH_MKLDNN +using MKLDNNMemoryFormat = mkldnn::memory::format; +#endif namespace platform { using MKLDNNStream = mkldnn::stream; @@ -70,7 +73,7 @@ tf_pd MKLDNNBwdPrimitiveDesc(const Engine& e, const Primitive& p, inline mkldnn::memory::desc MKLDNNMemDesc(const std::vector& dims, mkldnn::memory::data_type data_type, - mkldnn::memory::format format) { + MKLDNNMemoryFormat format) { mkldnn::memory::dims tz = dims; return mkldnn::memory::desc({tz}, data_type, format); } @@ -82,22 +85,24 @@ inline bool CanMKLDNNBeUsed(const framework::ExecutionContext& ctx) { template mkldnn::memory::data_type MKLDNNGetDataType() { - return mkldnn::memory::data_undef; + return mkldnn::memory::data_type::data_undef; } template <> inline mkldnn::memory::data_type MKLDNNGetDataType() { - return mkldnn::memory::f32; + return mkldnn::memory::data_type::f32; +} +template <> +inline mkldnn::memory::data_type MKLDNNGetDataType() { + return mkldnn::memory::data_type::s32; } - template <> inline mkldnn::memory::data_type MKLDNNGetDataType() { - return mkldnn::memory::s8; + return mkldnn::memory::data_type::s8; } - template <> inline mkldnn::memory::data_type MKLDNNGetDataType() { - return mkldnn::memory::u8; + return mkldnn::memory::data_type::u8; } inline void Reorder(const mkldnn::memory& src, const mkldnn::memory& dst) { @@ -107,66 +112,104 @@ inline void Reorder(const mkldnn::memory& src, const mkldnn::memory& dst) { mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); } -inline mkldnn::memory::format GetMKLDNNFormat(const mkldnn::memory memory) { - return static_cast( +inline MKLDNNMemoryFormat GetMKLDNNFormat(const mkldnn::memory memory) { + return static_cast( memory.get_primitive_desc().desc().data.format); } -inline mkldnn::memory::format GetMKLDNNFormat( +inline MKLDNNMemoryFormat GetMKLDNNFormat( const mkldnn::sum::primitive_desc& memory) { - return static_cast( + return static_cast( memory.dst_primitive_desc().desc().data.format); } -inline mkldnn::memory::format MKLDNNFormatForSize( - size_t dims_size, mkldnn::memory::format data_format) { +inline MKLDNNMemoryFormat MKLDNNFormatForSize(size_t dims_size, + MKLDNNMemoryFormat data_format) { if (dims_size == 1) { - return mkldnn::memory::format::x; + return MKLDNNMemoryFormat::x; } else if (dims_size == 2) { - return mkldnn::memory::format::nc; + return MKLDNNMemoryFormat::nc; } else if (dims_size == 3) { - if (data_format == mkldnn::memory::format::nchw) { - return mkldnn::memory::format::ncw; - } else if (data_format == mkldnn::memory::format::nhwc) { - return mkldnn::memory::format::nwc; + if (data_format == MKLDNNMemoryFormat::nchw) { + return MKLDNNMemoryFormat::ncw; + } else if (data_format == MKLDNNMemoryFormat::nhwc) { + return MKLDNNMemoryFormat::nwc; + } + } else if (dims_size == 4) { + if (data_format == MKLDNNMemoryFormat::goihw) { + return MKLDNNMemoryFormat::oihw; } } else if (dims_size == 5) { - if (data_format == mkldnn::memory::format::nchw) { - return mkldnn::memory::format::ncdhw; - } else if (data_format == mkldnn::memory::format::nhwc) { - return mkldnn::memory::format::ndhwc; + if (data_format == MKLDNNMemoryFormat::goidhw) { + return MKLDNNMemoryFormat::oidhw; + } + if (data_format == MKLDNNMemoryFormat::nchw) { + return MKLDNNMemoryFormat::ncdhw; + } else if (data_format == MKLDNNMemoryFormat::nhwc) { + return MKLDNNMemoryFormat::ndhwc; } } return 
data_format; } -inline mkldnn::memory::format data_format_to_memory_format( +inline MKLDNNMemoryFormat data_format_to_memory_format( const std::string& data_format) { switch (framework::StringToDataLayout(data_format)) { case framework::DataLayout::kNHWC: - return mkldnn::memory::format::nhwc; + return MKLDNNMemoryFormat::nhwc; case framework::DataLayout::kNCHW: - return mkldnn::memory::format::nchw; + return MKLDNNMemoryFormat::nchw; default: - return mkldnn::memory::format::any; + return MKLDNNMemoryFormat::any; } } -inline mkldnn::memory::format StringToMKLDNNFormat(std::string* format) { +inline MKLDNNMemoryFormat StringToMKLDNNFormat(std::string* format) { std::transform(format->begin(), format->end(), format->begin(), ::tolower); if (!format->compare("nchw")) { - return mkldnn::memory::format::nchw; + return MKLDNNMemoryFormat::nchw; } else if (!format->compare("nchw16c")) { - return mkldnn::memory::format::nChw16c; + return MKLDNNMemoryFormat::nChw16c; } else if (!format->compare("nchw8c")) { - return mkldnn::memory::format::nChw8c; + return MKLDNNMemoryFormat::nChw8c; } else if (!format->compare("nhwc")) { - return mkldnn::memory::format::nhwc; + return MKLDNNMemoryFormat::nhwc; } else { - return mkldnn::memory::format::any; + return MKLDNNMemoryFormat::any; } } +inline std::string ThreadIDasStr(void) { + return std::to_string( + std::hash()(std::this_thread::get_id())); +} + +template +inline void AppendKey(std::string* key, const T& num) { + key->append(std::to_string(num)); +} + +inline void AppendKey(std::string* key, const std::string& str) { + key->append(str); +} + +inline void AppendKey(std::string* key, const char* str) { key->append(str); } + +inline void AppendKey(std::string* key, const std::vector& dims) { + for (size_t i = 0; i < dims.size(); i++) { + AppendKey(key, std::to_string(dims[i])); + } +} + +template +inline std::string CreateKey(ArgTypes&&... args) { + std::string key; + key.reserve(256); + using expand_type = int[]; + expand_type{0, (AppendKey(&key, std::forward(args)), 0)...}; + return key; +} + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 9f277d68..7396b90e 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include +#include #include #include "boost/optional.hpp" #include "paddle/fluid/framework/data_layout_transform.h" @@ -29,41 +30,172 @@ namespace platform { using user_function = std::function(const float*)>; using memory = mkldnn::memory; -class MKLDNNHandler { +template +class MKLDNNHandlerT { public: - MKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, - const std::string& base_key) - : dev_ctx_(dev_ctx), engine_(engine), key_common_(base_key) { - // TODO(jczaja): Make it faster - auto tid = std::this_thread::get_id(); - std::stringstream ss; - ss << tid; - key_ = key_common_ + "-t:" + ss.str(); + MKLDNNHandlerT(const MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, + platform::Place cpu_place, const std::string& base_key) + : dev_ctx_(dev_ctx), + engine_(engine), + place_(cpu_place), + key_common_(base_key), + fwd_pd_(nullptr), + bwd_pd_(nullptr) { if (platform::get_cur_mkldnn_session_id() != platform::kMKLDNNSessionID_Default) { key_ = key_common_; + } else { + key_ = key_common_ + "-t:" + ThreadIDasStr(); } } + template + std::shared_ptr AcquireForwardPrimitive(Args&&... 
args) {
+    const std::string key_p = key_ + "@forward_p";
+    auto forward_p =
+        std::static_pointer_cast(dev_ctx_.GetBlob(key_p));
+    if (forward_p == nullptr) {
+      forward_p =
+          std::make_shared(*fwd_pd_, std::forward(args)...);
+      dev_ctx_.SetBlob(key_p, forward_p);
+    }
+    return forward_p;
+  }
+
+  template
+  std::shared_ptr AcquireBackwardPrimitive(Args&&... args) {
+    const std::string key_p = key_ + "@backward_p";
+    auto backward_p =
+        std::static_pointer_cast(dev_ctx_.GetBlob(key_p));
+    if (backward_p == nullptr) {
+      backward_p =
+          std::make_shared(*bwd_pd_, std::forward(args)...);
+      dev_ctx_.SetBlob(key_p, backward_p);
+    }
+    return backward_p;
+  }
+
   std::shared_ptr AcquireSrcMemory(
-      const mkldnn::memory::desc& md, void* ptr) {
-    return this->AcquireMemory(md, ptr, "@user_src_mem_p");
+      const framework::Tensor* input) {
+    const T* input_data = input->data();
+    return this->AcquireMemoryFromPrimitive(fwd_pd_->src_primitive_desc(),
+                                            to_void_cast(input_data),
+                                            "@src_mem_p");
   }
-  std::shared_ptr AcquireSecondSrcMemory(
-      const mkldnn::memory::desc& md, void* ptr) {
-    return this->AcquireMemory(md, ptr, "@user_src2_mem_p");
+  std::shared_ptr AcquireDstMemory(framework::Tensor* output) {
+    T* ptr = output->mutable_data(place_,
+                                  fwd_pd_->dst_primitive_desc().get_size());
+    return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_primitive_desc(), ptr,
+                                            "@dst_mem_p");
   }
-  std::shared_ptr AcquireWeightsMemory(
-      const mkldnn::memory::desc& md, void* ptr,
-      user_function custom_func = {}) {
-    return this->AcquireMemory(md, ptr, "@user_weights_mem_p", custom_func);
+  std::shared_ptr AcquireDstMemory(
+      const framework::Tensor* output) {
+    const T* output_data = output->data();
+    return this->AcquireMemoryFromPrimitive(bwd_pd_->dst_primitive_desc(),
+                                            to_void_cast(output_data),
+                                            "@bwd-dst_mem_p");
   }
-  std::shared_ptr AcquireBiasMemory(
+  std::shared_ptr AcquireDiffDstMemory(
+      const framework::Tensor* diffdst) {
+    const T* ptr = diffdst->data();
+    return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_dst_primitive_desc(),
+                                            to_void_cast(ptr),
+                                            "@diff_dst_mem_p");
+  }
+
+  std::shared_ptr AcquireDiffSrcMemory(
+      framework::Tensor* diffsrc) {
+    T* ptr = diffsrc->mutable_data(
+        place_, bwd_pd_->diff_src_primitive_desc().get_size());
+    return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_src_primitive_desc(),
+                                            ptr, "@diff_src_mem_p");
+  }
+
+ protected:
+  template
+  void AcquireForwardPrimitiveDescriptor(Args&&... args) {
+    // Forward PD has to be passed to the Grad op that
+    // may be executed by a different thread, hence
+    // for that one we use a key that does not contain the TID
+    const std::string key_pd = key_common_ + "@forward_pd";
+    fwd_pd_ = std::static_pointer_cast(
+        dev_ctx_.GetBlob(key_pd));
+    if (fwd_pd_ == nullptr) {
+      static std::mutex acquire_barrier;
+      std::lock_guard block_threads_until_finish_this_job(
+          acquire_barrier);
+      fwd_pd_ = std::static_pointer_cast(
+          dev_ctx_.GetBlob(key_pd));
+      if (fwd_pd_ == nullptr) {
+        auto fwd_desc = typename TForward::desc(std::forward(args)...);
+        fwd_pd_ = std::make_shared(fwd_desc,
+                                   engine_);
+        dev_ctx_.SetBlob(key_pd, fwd_pd_);
+      }
+    }
+  }
+
+  template
+  void AcquireBackwardPrimitiveDescriptor(Args&&... 
args) { + const std::string key_fwd_pd = key_common_ + "@forward_pd"; + fwd_pd_ = std::static_pointer_cast( + dev_ctx_.GetBlob(key_fwd_pd)); + PADDLE_ENFORCE_NOT_NULL(fwd_pd_); + const std::string key_pd = key_ + "@backward_pd"; + bwd_pd_ = std::static_pointer_cast( + dev_ctx_.GetBlob(key_pd)); + if (bwd_pd_ == nullptr) { + auto bwd_desc = typename TBackward::desc(std::forward(args)...); + bwd_pd_ = std::make_shared( + bwd_desc, engine_, *fwd_pd_); + dev_ctx_.SetBlob(key_pd, bwd_pd_); + } + } + + std::shared_ptr AcquireMemoryFromPrimitive( + mkldnn::memory::primitive_desc mdp, void* ptr, + const std::string& suffix) { + auto local_key = key_ + suffix; + auto mem_p = + std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + if (mem_p == nullptr) { + mem_p = std::make_shared(mdp, ptr); + dev_ctx_.SetBlob(local_key, mem_p); + } else { + mem_p->set_data_handle(ptr); + } + return mem_p; + } + + const MKLDNNDeviceContext& dev_ctx_; + mkldnn::engine engine_; + platform::Place place_; + std::string key_; + std::string key_common_; + std::shared_ptr fwd_pd_; + std::shared_ptr bwd_pd_; +}; + +// TODO(grygielski) this class will be deleted later. +class MKLDNNHandler { + public: + MKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, + const std::string& base_key) + : dev_ctx_(dev_ctx), engine_(engine), key_common_(base_key) { + if (platform::get_cur_mkldnn_session_id() != + platform::kMKLDNNSessionID_Default) { + key_ = key_common_; + } else { + key_ = key_common_ + "-t:" + ThreadIDasStr(); + } + } + + std::shared_ptr AcquireSrcMemory( const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_bias_mem_p"); + return this->AcquireMemory(md, ptr, "@user_src_mem_p"); } std::shared_ptr AcquireDstMemory( @@ -71,14 +203,14 @@ class MKLDNNHandler { return this->AcquireMemory(md, ptr, "@user_dst_mem_p"); } - std::shared_ptr AcquireDiffDstMemory( + std::shared_ptr AcquireDiffSrcMemory( const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_diff_dst_mem_p"); + return this->AcquireMemory(md, ptr, "@user_diff_src_mem_p"); } - std::shared_ptr AcquireDiffSrcMemory( + std::shared_ptr AcquireDiffDstMemory( const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_diff_src_mem_p"); + return this->AcquireMemory(md, ptr, "@user_diff_dst_mem_p"); } std::shared_ptr AcquireMemoryFromPrimitive( @@ -123,13 +255,20 @@ class MKLDNNHandler { } std::shared_ptr AcquireMemory( - const mkldnn::memory::primitive_desc& mpd, const std::string& suffix) { + const std::vector& dims, const mkldnn::memory::data_type dtype, + const MKLDNNMemoryFormat& fmt, void* ptr, const std::string& suffix) { + /*Generate key*/ auto local_key = key_ + suffix; auto mem_p = std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); if (mem_p == nullptr) { - mem_p = std::make_shared(mpd); + auto md = mkldnn::memory::desc(dims, dtype, fmt); + + mem_p = std::make_shared( + mkldnn::memory::primitive_desc{md, engine_}, ptr); dev_ctx_.SetBlob(local_key, mem_p); + } else { + mem_p->set_data_handle(ptr); } return mem_p; } @@ -205,71 +344,11 @@ class MKLDNNHandler { return target_memory_p; } - static std::string GetHash(mkldnn::memory::dims& operand_dims, // NOLINT - const std::string& suffix) { - return dims2str(operand_dims) + suffix; - } - - static void AppendKey( - std::string* key, const mkldnn::memory::dims& input_dims, - const mkldnn::memory::dims& weights_dims, const std::vector& strides, - const std::vector& paddings, const std::vector& 
dilations, - const int& groups, const mkldnn::memory::data_type& srcdt, - const mkldnn::memory::format& format, const bool& relu, - const bool& residual, const bool& brelu, const std::string& suffix) { - AppendKeyDims(key, input_dims); - - AppendKeyDims(key, weights_dims); - - AppendKeyVec(key, strides); - - AppendKeyVec(key, paddings); - - AppendKeyVec(key, dilations); - - AppendKey(key, std::to_string(groups)); - AppendKey(key, std::to_string(srcdt)); - AppendKey(key, std::to_string(format)); - AppendKey(key, std::to_string(relu)); - AppendKey(key, std::to_string(residual)); - AppendKey(key, std::to_string(brelu)); - AppendKey(key, suffix); - } - - static void AppendKeyDims(std::string* key, - const mkldnn::memory::dims& dims) { - for (unsigned int i = 0; i < dims.size(); i++) { - AppendKey(key, std::to_string(dims[i])); - } - } - - static void AppendKeyVec(std::string* key, const std::vector& dims) { - for (unsigned int i = 0; i < dims.size(); i++) { - AppendKey(key, std::to_string(dims[i])); - } - } - - static void AppendKey(std::string* key, const std::string& s) { - key->append(s); - } - - protected: - static std::string dims2str(const mkldnn::memory::dims& operand_dims) { - std::string dstr = ""; - for (size_t i = 0; i < operand_dims.size(); ++i) { - dstr += std::to_string(operand_dims[i]) + "-"; - } - return dstr; - } - protected: const MKLDNNDeviceContext& dev_ctx_; mkldnn::engine engine_; std::string key_; std::string key_common_; - - public: - static constexpr int MaxKeyLength = 256; }; class SumMKLDNNHandler : public MKLDNNHandler { @@ -304,6 +383,11 @@ class SumMKLDNNHandler : public MKLDNNHandler { "@dst_mem_p"); } + std::shared_ptr AcquireSecondSrcMemory( + const mkldnn::memory::desc& md, void* ptr) { + return this->AcquireMemory(md, ptr, "@user_src2_mem_p"); + } + std::shared_ptr AcquireSum( std::shared_ptr dst_memory, std::vector* inputs) { @@ -321,463 +405,210 @@ class SumMKLDNNHandler : public MKLDNNHandler { std::shared_ptr sum_pd_; }; -class ActivationMKLDNNHandler : public MKLDNNHandler { +template +class ActivationMKLDNNHandler + : public MKLDNNHandlerT { public: - ActivationMKLDNNHandler(const platform::MKLDNNDeviceContext& dev_ctx, - mkldnn::engine engine, const std::string& base_key) - : platform::MKLDNNHandler(dev_ctx, engine, base_key) {} - - std::shared_ptr - AcquireActivationPrimitiveDescriptor(mkldnn::prop_kind prop_kind, - mkldnn::algorithm algorithm, - const mkldnn::memory::desc& md, - float alpha, float beta) { - // Activation PD has to be passed to Grad op that - // may be executed by diffrent thread, hence - // for that one we use key that does not contain TID - const std::string key_activation_pd = key_common_ + "@activation_pd"; - activation_pd_ = - std::static_pointer_cast( - dev_ctx_.GetBlob(key_activation_pd)); - if (activation_pd_ == nullptr) { - static std::mutex acquire_barrier; - std::lock_guard block_threads_until_finish_this_job( - acquire_barrier); - - activation_pd_ = - std::static_pointer_cast( - dev_ctx_.GetBlob(key_activation_pd)); - if (activation_pd_ == nullptr) { - auto activation_desc = mkldnn::eltwise_forward::desc( - prop_kind, algorithm, md, alpha, beta); - - activation_pd_.reset(new mkldnn::eltwise_forward::primitive_desc( - activation_desc, engine_)); - dev_ctx_.SetBlob(key_activation_pd, activation_pd_); - } - } - return activation_pd_; - } - - std::shared_ptr - AcquireActivationBackwardPrimitiveDescriptor( - mkldnn::algorithm algorithm, const mkldnn::memory::desc& diff_dst_md, - const mkldnn::memory::desc& src_md, float alpha, 
float beta) { - const std::string key_activation_pd = key_common_ + "@activation_pd"; - const std::string key_activation_bwd_pd = key_ + "@activation_bwd_pd"; - activation_bwd_pd_ = - std::static_pointer_cast( - dev_ctx_.GetBlob(key_activation_bwd_pd)); - if (activation_bwd_pd_ == nullptr) { - activation_pd_ = - std::static_pointer_cast( - dev_ctx_.GetBlob(key_activation_pd)); - // PD from FWD op has to exist. - PADDLE_ENFORCE(activation_pd_ != nullptr, - "Eltwise MKL-DNN not found in cache!"); - auto backward_desc = mkldnn::eltwise_backward::desc( - algorithm, diff_dst_md, src_md, alpha, beta); - activation_bwd_pd_.reset(new mkldnn::eltwise_backward::primitive_desc( - backward_desc, engine_, *activation_pd_)); - dev_ctx_.SetBlob(key_activation_bwd_pd, activation_bwd_pd_); - } - return activation_bwd_pd_; - } - - std::shared_ptr AcquireActivation( - std::shared_ptr dst_memory_p, - std::shared_ptr src_memory_p) { - /*Generate key*/ - auto prim_key = key_ + "@eltwise_p"; - - auto eltwise_p = std::static_pointer_cast( - dev_ctx_.GetBlob(prim_key)); - if (eltwise_p == nullptr) { - eltwise_p = std::make_shared( - *activation_pd_, *(src_memory_p), *(dst_memory_p)); - dev_ctx_.SetBlob(prim_key, eltwise_p); - } - - return eltwise_p; - } - - // TODO(jczaja): Merge all AcquireDstMemoryFromPrimitive into one - std::shared_ptr AcquireDstMemoryFromPrimitive(void* ptr) { - return this->AcquireMemoryFromPrimitive( - activation_pd_->dst_primitive_desc(), ptr, "@dst_mem_p"); - } - - std::shared_ptr AcquireDiffSrcMemoryFromPrimitive(void* ptr) { - return this->AcquireMemoryFromPrimitive( - activation_bwd_pd_->diff_src_primitive_desc(), ptr, "@diff_src_mem_p"); - } - - std::shared_ptr AcquireActivationBackward( - std::shared_ptr diff_src_memory_p, - std::shared_ptr diff_dst_memory_p, - std::shared_ptr src_memory_p) { - /*Generate key*/ - auto prim_key = key_ + "@eltwise_bwd_p"; - - auto eltwise_bwd_p = std::static_pointer_cast( - dev_ctx_.GetBlob(prim_key)); - if (eltwise_bwd_p == nullptr) { - eltwise_bwd_p = std::make_shared( - *activation_bwd_pd_, *(src_memory_p), *(diff_dst_memory_p), - *(diff_src_memory_p)); - dev_ctx_.SetBlob(prim_key, eltwise_bwd_p); - } - - return eltwise_bwd_p; + ActivationMKLDNNHandler(const std::vector& dims, + mkldnn::algorithm algorithm, float alpha, float beta, + const MKLDNNMemoryFormat fmt, bool is_test, + const platform::MKLDNNDeviceContext& dev_ctx, + platform::Place cpu_place, + const std::string& unique_name) + + : platform::MKLDNNHandlerT( + dev_ctx, dev_ctx.GetEngine(), cpu_place, + platform::CreateKey(dims, algorithm, fmt, alpha, beta, + unique_name)) { + auto md = mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), fmt); + + this->AcquireForwardPrimitiveDescriptor( + is_test ? 
mkldnn::prop_kind::forward_inference + : mkldnn::prop_kind::forward_training, + algorithm, md, alpha, beta); + } + + ActivationMKLDNNHandler(const std::vector& dims, + mkldnn::algorithm algorithm, float alpha, float beta, + const MKLDNNMemoryFormat fmt, + const MKLDNNMemoryFormat diff_fmt, + const platform::MKLDNNDeviceContext& dev_ctx, + platform::Place cpu_place, + const std::string& unique_name) + + : platform::MKLDNNHandlerT( + dev_ctx, dev_ctx.GetEngine(), cpu_place, + platform::CreateKey(dims, algorithm, fmt, alpha, beta, + unique_name)) { + auto diff_dst_md = platform::MKLDNNMemDesc( + dims, platform::MKLDNNGetDataType(), diff_fmt); + auto src_md = + platform::MKLDNNMemDesc(dims, platform::MKLDNNGetDataType(), fmt); + + this->AcquireBackwardPrimitiveDescriptor(algorithm, diff_dst_md, src_md, + alpha, beta); + } + + std::shared_ptr AcquireBackwardSrcMemory( + const framework::Tensor* input) { + const T* input_data = input->data(); + return this->AcquireMemoryFromPrimitive(this->bwd_pd_->src_primitive_desc(), + to_void_cast(input_data), + "@bwd-src_mem_p"); } - - private: - std::shared_ptr activation_pd_; - std::shared_ptr activation_bwd_pd_; }; -class LRNMKLDNNHandler : public MKLDNNHandler { +template +class LRNMKLDNNHandler + : public MKLDNNHandlerT { public: - LRNMKLDNNHandler(bool is_test, const platform::MKLDNNDeviceContext& dev_ctx, - mkldnn::engine engine, const std::string& base_key) - : platform::MKLDNNHandler(dev_ctx, engine, base_key), is_test_(is_test) {} - - std::shared_ptr - AcquireLRNPrimitiveDescriptor(const mkldnn::memory::desc& src_md, const int n, - const float alpha, const float beta, - const float k) { - // LRN PD has to be passed to Grad op that - // may be executed by diffrent thread, hence - // for that one we use key that does not contain TID - const std::string key_lrn_pd = key_common_ + "@lrn_pd"; - fwd_pd_ = std::static_pointer_cast( - dev_ctx_.GetBlob(key_lrn_pd)); - if (fwd_pd_ == nullptr) { - static std::mutex acquire_barrier; - std::lock_guard block_threads_until_finish_this_job( - acquire_barrier); - fwd_pd_ = std::static_pointer_cast( - dev_ctx_.GetBlob(key_lrn_pd)); - if (fwd_pd_ == nullptr) { - auto forward_desc = mkldnn::lrn_forward::desc{ - is_test_ ? mkldnn::prop_kind::forward_inference - : mkldnn::prop_kind::forward_training, - mkldnn::lrn_across_channels, src_md, n, alpha, beta, k}; - fwd_pd_.reset( - new mkldnn::lrn_forward::primitive_desc(forward_desc, engine_)); - dev_ctx_.SetBlob(key_lrn_pd, fwd_pd_); - } - } - return fwd_pd_; - } - - std::shared_ptr AcquireWorkspaceMemory(void) { - // workspace has to be passed to Grad op that - // may be executed by diffrent thread, hence - // for that one we use key that does not contain TID - auto local_key = key_common_ + "@workspace"; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - if (mem_p == nullptr) { - static std::mutex acquire_barrier; - std::lock_guard block_threads_until_finish_this_job( - acquire_barrier); - mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - if (mem_p == nullptr) { - const std::string key_lrn_pd = key_common_ + "@lrn_pd"; - fwd_pd_ = std::static_pointer_cast( - dev_ctx_.GetBlob(key_lrn_pd)); - // PD from FWD op has to exist. 
- PADDLE_ENFORCE(fwd_pd_ != nullptr, - "LRN PD MKL-DNN not found in cache!"); - mkldnn::memory::primitive_desc workspace_mpd = - fwd_pd_->workspace_primitive_desc(); - mem_p = std::make_shared(workspace_mpd); - dev_ctx_.SetBlob(local_key, mem_p); - } - } - return mem_p; - } - - std::shared_ptr AcquireLRN( - std::shared_ptr dst_memory, - std::shared_ptr src_memory) { - auto prim_key = key_ + "@lrn_p"; - - auto lrn_p = std::static_pointer_cast( - dev_ctx_.GetBlob(prim_key)); - if (lrn_p == nullptr) { - if (is_test_) { - lrn_p = std::make_shared(*fwd_pd_, *(src_memory), - *(dst_memory)); - } else { - // For training we need to create workspace - // to store indices from backward - auto workspace_memory = this->AcquireWorkspaceMemory(); - - lrn_p = std::make_shared( - *fwd_pd_, *src_memory, *workspace_memory, *dst_memory); - } - dev_ctx_.SetBlob(prim_key, lrn_p); - } - return lrn_p; - } - - std::shared_ptr - AcquireLRNBackwardPrimitiveDescriptor(const mkldnn::memory::desc& src_md, - const mkldnn::memory::desc& diff_md, - const int n, const float alpha, - const float beta, const float k) { - const std::string key_lrn_pd = key_common_ + "@lrn_pd"; - const std::string key_lrn_bwd_pd = key_ + "@lrn_bwd_pd"; - bwd_pd_ = std::static_pointer_cast( - dev_ctx_.GetBlob(key_lrn_bwd_pd)); - if (bwd_pd_ == nullptr) { - fwd_pd_ = std::static_pointer_cast( - dev_ctx_.GetBlob(key_lrn_pd)); - // PD from FWD op has to exist. - PADDLE_ENFORCE(fwd_pd_ != nullptr, "LRN MKL-DNN not found in cache!"); - - auto backward_desc = mkldnn::lrn_backward::desc{ - mkldnn::lrn_across_channels, src_md, diff_md, n, alpha, beta, k}; - bwd_pd_.reset(new mkldnn::lrn_backward::primitive_desc( - backward_desc, engine_, *fwd_pd_)); - dev_ctx_.SetBlob(key_lrn_bwd_pd, bwd_pd_); - } - return bwd_pd_; - } - - std::shared_ptr AcquireLRNBackward( - std::shared_ptr src_memory, - std::shared_ptr diff_dst_memory, - std::shared_ptr workspace, - std::shared_ptr diff_src_memory) { - auto prim_key = key_ + "@lrn_bwd_p"; - - auto lrn_bwd_p = std::static_pointer_cast( - dev_ctx_.GetBlob(prim_key)); - if (lrn_bwd_p == nullptr) { - lrn_bwd_p = std::make_shared( - *bwd_pd_, *src_memory, *diff_dst_memory, *workspace, - *diff_src_memory); - dev_ctx_.SetBlob(prim_key, lrn_bwd_p); - } - - return lrn_bwd_p; + LRNMKLDNNHandler(const std::vector& dims, const int n, const float alpha, + const float beta, const float k, + const MKLDNNMemoryFormat fmt, bool is_test, + const platform::MKLDNNDeviceContext& dev_ctx, + platform::Place cpu_place, const std::string& unique_name) + + : platform::MKLDNNHandlerT( + dev_ctx, dev_ctx.GetEngine(), cpu_place, + platform::CreateKey(dims, n, alpha, beta, k, fmt, unique_name)) { + auto src_md = + mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), fmt); + this->AcquireForwardPrimitiveDescriptor( + is_test ? 
mkldnn::prop_kind::forward_inference + : mkldnn::prop_kind::forward_training, + mkldnn::lrn_across_channels, src_md, n, alpha, beta, k); + } + + LRNMKLDNNHandler(const std::vector& dims, const int n, const float alpha, + const float beta, const float k, + const MKLDNNMemoryFormat fmt, + const MKLDNNMemoryFormat diff_fmt, + const platform::MKLDNNDeviceContext& dev_ctx, + platform::Place cpu_place, const std::string& unique_name) + + : platform::MKLDNNHandlerT( + dev_ctx, dev_ctx.GetEngine(), cpu_place, + platform::CreateKey(dims, n, alpha, beta, k, fmt, unique_name)) { + auto src_md = + mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), fmt); + auto diff_md = + mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), diff_fmt); + + this->AcquireBackwardPrimitiveDescriptor( + mkldnn::lrn_across_channels, src_md, diff_md, n, alpha, beta, k); + } + + std::shared_ptr AcquireWorkspaceMemory( + framework::Tensor* workspace) { + T* ptr = workspace->mutable_data( + this->place_, this->fwd_pd_->dst_primitive_desc().get_size()); + return this->AcquireMemoryFromPrimitive( + this->fwd_pd_->workspace_primitive_desc(), ptr, "@wrk_mem_p"); } - static std::string GetHash(const memory::dims& input_dims, const int n, - const float alpha, const float beta, const float k, - const memory::format& fmt, - const std::string& suffix) { - std::string key; - key.reserve(platform::MKLDNNHandler::MaxKeyLength); - platform::MKLDNNHandler::AppendKeyDims(&key, input_dims); - platform::MKLDNNHandler::AppendKey(&key, std::to_string(n)); - platform::MKLDNNHandler::AppendKey(&key, std::to_string(alpha)); - platform::MKLDNNHandler::AppendKey(&key, std::to_string(beta)); - platform::MKLDNNHandler::AppendKey(&key, std::to_string(k)); - platform::MKLDNNHandler::AppendKey(&key, std::to_string(fmt)); - platform::MKLDNNHandler::AppendKey(&key, suffix); - return key; + std::shared_ptr AcquireBackwardWorkspaceMemory( + const framework::Tensor* workspace) { + const T* workspace_data = workspace->data(); + return this->AcquireMemoryFromPrimitive( + this->fwd_pd_->workspace_primitive_desc(), + to_void_cast(workspace_data), "@bwd-wrk_mem_p"); } - - private: - bool is_test_; - std::shared_ptr fwd_pd_; - std::shared_ptr bwd_pd_; }; -class PoolingMKLDNNHandler : public MKLDNNHandler { +template +class PoolingMKLDNNHandler : public MKLDNNHandlerT { public: - PoolingMKLDNNHandler(const std::string& pooling_type, - mkldnn::memory::data_type dt, bool is_test, - const platform::MKLDNNDeviceContext& dev_ctx, - mkldnn::engine engine, const std::string& base_key) - : platform::MKLDNNHandler(dev_ctx, engine, base_key), - dt_(dt), - pooling_type_(pooling_type), - is_test_(is_test) {} - - std::shared_ptr - AcquirePoolingPrimitiveDescriptor( - const std::vector& src_tz, const std::vector& dst_tz, - const mkldnn::memory::desc& src_md, const mkldnn::memory::desc& dst_md, + PoolingMKLDNNHandler( + const std::vector& src_dims, const std::vector& dst_dims, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, bool ceil_mode) { - // Pooling PD has to be passed to Grad op that - // may be executed by diffrent thread, hence - // for that one we use key that does not contain TID - const std::string key_pooling_pd = key_common_ + "@pooling_pd"; - fwd_pd_ = std::static_pointer_cast( - dev_ctx_.GetBlob(key_pooling_pd)); - if (fwd_pd_ == nullptr) { - static std::mutex acquire_barrier; - std::lock_guard block_threads_until_finish_this_job( - acquire_barrier); - fwd_pd_ = - std::static_pointer_cast( - 
dev_ctx_.GetBlob(key_pooling_pd)); - if (fwd_pd_ == nullptr) { - std::vector padding_left_top(paddings); - std::vector padding_right_bottom(paddings); - if (ceil_mode) { - CorrectOutputSize(src_tz, dst_tz, ksize, paddings, strides, - padding_right_bottom); - } - auto mkldnn_forward_prop_kind = - is_test_ ? mkldnn::prop_kind::forward_inference - : mkldnn::prop_kind::forward_training; - auto pooling_desc = mkldnn::pooling_forward::desc( - mkldnn_forward_prop_kind, - pooling_type_ == "max" ? mkldnn::algorithm::pooling_max - : mkldnn::algorithm::pooling_avg, - src_md, dst_md, strides, ksize, padding_left_top, - padding_right_bottom, mkldnn::padding_kind::zero); - - fwd_pd_.reset( - new mkldnn::pooling_forward::primitive_desc(pooling_desc, engine_)); - dev_ctx_.SetBlob(key_pooling_pd, fwd_pd_); - } + const std::vector& paddings, const std::string& pooling_type, + bool ceil_mode, const MKLDNNMemoryFormat fmt, + mkldnn::memory::data_type dt, bool is_test, + const platform::MKLDNNDeviceContext& dev_ctx, platform::Place cpu_place, + const std::string& unique_name) + : platform::MKLDNNHandlerT( + dev_ctx, dev_ctx.GetEngine(), cpu_place, + platform::CreateKey(src_dims, pooling_type, ksize, strides, + paddings, dt, fmt, unique_name)) { + auto src_md = mkldnn::memory::desc(src_dims, dt, fmt); + /* create memory descriptor for pooling without specified format + * ('any') which lets a primitive (pooling in this case) choose + * the memory format preferred for best performance + */ + auto dst_md = + platform::MKLDNNMemDesc(dst_dims, dt, MKLDNNMemoryFormat::any); + + std::vector padding_left_top(paddings); + std::vector padding_right_bottom(paddings); + if (ceil_mode) { + CorrectOutputSize(src_dims, dst_dims, ksize, paddings, strides, + padding_right_bottom); } - return fwd_pd_; - } - std::shared_ptr AcquireDstMemoryFromPrimitive(void* ptr) { - return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_primitive_desc(), ptr, - "@dst_mem_p"); + this->AcquireForwardPrimitiveDescriptor( + is_test ? mkldnn::prop_kind::forward_inference + : mkldnn::prop_kind::forward_training, + pooling_type == "max" ? mkldnn::algorithm::pooling_max + : mkldnn::algorithm::pooling_avg, + src_md, dst_md, strides, ksize, padding_left_top, padding_right_bottom, + mkldnn::padding_kind::zero); + } + + PoolingMKLDNNHandler( + const std::vector& diff_dst_dims, + const std::vector& diff_src_dims, const std::vector& ksize, + const std::vector& strides, const std::vector& paddings, + const std::string& pooling_type, bool ceil_mode, + const MKLDNNMemoryFormat fmt, const MKLDNNMemoryFormat diff_dst_fmt, + mkldnn::memory::data_type dt, + const platform::MKLDNNDeviceContext& dev_ctx, platform::Place cpu_place, + const std::string& unique_name) + : platform::MKLDNNHandlerT( + dev_ctx, dev_ctx.GetEngine(), cpu_place, + platform::CreateKey(diff_src_dims, pooling_type, ksize, strides, + paddings, dt, fmt, unique_name)) { + auto diff_dst_md = mkldnn::memory::desc( + diff_dst_dims, platform::MKLDNNGetDataType(), diff_dst_fmt); + auto diff_src_md = + mkldnn::memory::desc(diff_src_dims, platform::MKLDNNGetDataType(), + MKLDNNMemoryFormat::any); + + this->AcquireBackwardPrimitiveDescriptor( + pooling_type == "max" ? mkldnn::algorithm::pooling_max + : mkldnn::algorithm::pooling_avg, + diff_src_md, diff_dst_md, strides, ksize, paddings, paddings, + mkldnn::padding_kind::zero); } std::shared_ptr AcquireWorkspaceMemory(void) { mkldnn::memory::primitive_desc workspace_mpd = - pooling_type_ == "max" - ? 
fwd_pd_->workspace_primitive_desc() - : mkldnn::memory::primitive_desc( - {{}, dt_, mkldnn::memory::format::nchw}, engine_); + this->fwd_pd_->workspace_primitive_desc(); // Pooling PD has to be passed to Grad op that // may be executed by diffrent thread, hence // for that one we use key that does not contain TID - auto local_key = key_common_ + "@workspace"; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + auto local_key = this->key_common_ + "@workspace"; + auto mem_p = std::static_pointer_cast( + this->dev_ctx_.GetBlob(local_key)); if (mem_p == nullptr) { static std::mutex acquire_barrier; std::lock_guard block_threads_until_finish_this_job( acquire_barrier); - mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + mem_p = std::static_pointer_cast( + this->dev_ctx_.GetBlob(local_key)); if (mem_p == nullptr) { mem_p = std::make_shared(workspace_mpd); - dev_ctx_.SetBlob(local_key, mem_p); + this->dev_ctx_.SetBlob(local_key, mem_p); } } return mem_p; } - std::shared_ptr AcquirePooling( - std::shared_ptr dst_memory, - std::shared_ptr src_memory) { - auto prim_key = key_ + "@pooling_p"; - - auto pooling_p = std::static_pointer_cast( - dev_ctx_.GetBlob(prim_key)); - if (pooling_p == nullptr) { - if (is_test_) { - pooling_p = std::make_shared( - *fwd_pd_, *(src_memory), *(dst_memory)); - } else { - // For training we need to create workspace - // to store indices from backward - auto workspace_memory = this->AcquireWorkspaceMemory(); - - pooling_p = std::make_shared( - *fwd_pd_, *src_memory, *dst_memory, *workspace_memory); - } - dev_ctx_.SetBlob(prim_key, pooling_p); - } - return pooling_p; - } - - std::shared_ptr - AcquirePoolingBackwardPrimitiveDescriptor( - const mkldnn::memory::desc& diff_dst_md, - const mkldnn::memory::desc& diff_src_md, const std::vector& ksize, - const std::vector& strides, const std::vector& paddings) { - const std::string key_pooling_pd = key_common_ + "@pooling_pd"; - const std::string key_pooling_bwd_pd = key_ + "@pooling_bwd_pd"; - bwd_pd_ = - std::static_pointer_cast( - dev_ctx_.GetBlob(key_pooling_bwd_pd)); - if (bwd_pd_ == nullptr) { - fwd_pd_ = - std::static_pointer_cast( - dev_ctx_.GetBlob(key_pooling_pd)); - // PD from FWD op has to exist. - PADDLE_ENFORCE(fwd_pd_ != nullptr, "Pooling MKL-DNN not found in cache!"); - - auto backward_desc = mkldnn::pooling_backward::desc( - pooling_type_ == "max" ? 
mkldnn::algorithm::pooling_max - : mkldnn::algorithm::pooling_avg, - diff_src_md, diff_dst_md, strides, ksize, paddings, paddings, - mkldnn::padding_kind::zero); - bwd_pd_.reset(new mkldnn::pooling_backward::primitive_desc( - backward_desc, engine_, *fwd_pd_)); - - dev_ctx_.SetBlob(key_pooling_bwd_pd, bwd_pd_); - } - return bwd_pd_; - } - - std::shared_ptr AcquireDiffDstMemoryFromDataPrimitive( - const std::shared_ptr user_memory_p, - std::vector& pipeline) { // NOLINT - auto diff_dst_pd = bwd_pd_->diff_dst_primitive_desc(); - auto user_pd = user_memory_p->get_primitive_desc(); - return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p, - "@diff_dst_mem_p", pipeline); - } - - std::shared_ptr AcquireDiffSrcMemoryFromPrimitive(void* ptr) { - return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_src_primitive_desc(), - ptr, "@diff_src_mem_p"); - } - - std::shared_ptr AcquirePoolingBackward( - std::shared_ptr diff_dst_memory, - std::shared_ptr workspace, - std::shared_ptr diff_src_memory) { - auto prim_key = key_ + "@pooling_bwd_p"; - - auto pooling_bwd_p = std::static_pointer_cast( - dev_ctx_.GetBlob(prim_key)); - if (pooling_bwd_p == nullptr) { - pooling_bwd_p = std::make_shared( - *bwd_pd_, *diff_dst_memory, *workspace, *diff_src_memory); - dev_ctx_.SetBlob(prim_key, pooling_bwd_p); - } - - return pooling_bwd_p; - } - - static std::string GetHash( - const memory::dims& input_dims, const std::string& pooling_type, - const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, const memory::data_type& dt, - const memory::format& fmt, const std::string& suffix) { - std::string key; - key.reserve(platform::MKLDNNHandler::MaxKeyLength); - platform::MKLDNNHandler::AppendKeyDims(&key, input_dims); - platform::MKLDNNHandler::AppendKey(&key, pooling_type); - platform::MKLDNNHandler::AppendKeyVec(&key, ksize); - platform::MKLDNNHandler::AppendKeyVec(&key, strides); - platform::MKLDNNHandler::AppendKeyVec(&key, paddings); - platform::MKLDNNHandler::AppendKey(&key, std::to_string(dt)); - platform::MKLDNNHandler::AppendKey(&key, std::to_string(fmt)); - platform::MKLDNNHandler::AppendKey(&key, suffix); - return key; - } - private: static inline int ComputeCeiledOutput(int input_size, int kernel_size, int padding, int stride) { @@ -797,13 +628,6 @@ class PoolingMKLDNNHandler : public MKLDNNHandler { } } } - - private: - mkldnn::memory::data_type dt_; - std::string pooling_type_; - bool is_test_; - std::shared_ptr fwd_pd_; - std::shared_ptr bwd_pd_; }; class TransposeMKLDNNHandler : public MKLDNNHandler { @@ -818,7 +642,7 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { logical_axis_(dims.size(), 0) {} std::shared_ptr AcquireSrcMemory( - const mkldnn::memory::format& fmt, void* ptr) { + const MKLDNNMemoryFormat& fmt, void* ptr) { auto local_key = key_ + "@user_src_mem_p"; auto mem_p = std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); @@ -828,7 +652,7 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { for (size_t i = 0; i < logical_axis_.size(); ++i) { logical_axis_[i] = i; } - auto src_md = fmt != mkldnn::memory::format::nchw + auto src_md = fmt != MKLDNNMemoryFormat::nchw ? 
platform::MKLDNNMemDesc( dims_, platform::MKLDNNGetDataType(), fmt) : Axis2MemoryDesc(dims_, logical_axis_); @@ -875,12 +699,6 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { return transpose_p; } - static std::string GetHash(std::vector& shape, // NOLINT - std::vector& axis, // NOLINT - const std::string& suffix) { - return dims2str(shape) + dims2str(axis) + suffix; - } - protected: mkldnn_memory_desc_t Axis2MemoryDesc(std::vector& nchw_tz, // NOLINT std::vector& axis // NOLINT @@ -930,23 +748,12 @@ class ReorderMKLDNNHandler : public MKLDNNHandler { dtype_(dtype) {} std::shared_ptr AcquireSrcMemory( - const mkldnn::memory::format& fmt, void* ptr) { - auto local_key = key_ + "@user_src_mem_p"; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - if (mem_p == nullptr) { - auto src_md = platform::MKLDNNMemDesc(dims_, dtype_, fmt); - mem_p = std::make_shared( - mkldnn::memory::primitive_desc{src_md, engine_}, ptr); - dev_ctx_.SetBlob(local_key, mem_p); - } else { - mem_p->set_data_handle(ptr); - } - return mem_p; + const MKLDNNMemoryFormat& fmt, void* ptr) { + return this->AcquireMemory(dims_, dtype_, fmt, ptr, "@user_src_mem_p"); } std::shared_ptr AcquireDstMemory( - framework::Tensor* output, const mkldnn::memory::format& fmt, + framework::Tensor* output, const MKLDNNMemoryFormat& fmt, platform::Place place) { auto local_key = key_ + "@user_dst_mem_p"; auto mem_p = @@ -980,14 +787,6 @@ class ReorderMKLDNNHandler : public MKLDNNHandler { return reorder_p; } - static std::string GetHash(std::vector& shape, // NOLINT - mkldnn::memory::format in_fmt, - mkldnn::memory::format out_fmt, - const std::string& suffix) { - return dims2str(shape) + std::to_string(in_fmt) + "->" + - std::to_string(out_fmt) + "#" + suffix; - } - private: std::vector dims_; framework::proto::VarType::Type vtype_; @@ -1015,15 +814,6 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { mkldnn::engine engine, const std::string& base_key) : platform::MKLDNNHandler(dev_ctx, engine, base_key) {} - // TODO(jczaja): remove after conv int8 is adapted - ConvMKLDNNTemplateHandler( - std::shared_ptr conv_pd, - const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, - const std::string& base_key) - : platform::MKLDNNHandler(dev_ctx, engine, base_key) { - conv_pd_ = conv_pd; - } - ConvMKLDNNTemplateHandler( std::shared_ptr conv_pd, std::shared_ptr @@ -1045,8 +835,8 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { return conv_pd_->dst_primitive_desc().get_size(); } - mkldnn::memory::format GetDstFormat() const { - return static_cast( + MKLDNNMemoryFormat GetDstFormat() const { + return static_cast( conv_pd_->dst_primitive_desc().desc().data.format); } @@ -1135,6 +925,17 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { pipeline); } + std::shared_ptr AcquireWeightsMemory( + const mkldnn::memory::desc& md, void* ptr, + user_function custom_func = {}) { + return this->AcquireMemory(md, ptr, "@user_weights_mem_p", custom_func); + } + + std::shared_ptr AcquireBiasMemory( + const mkldnn::memory::desc& md, void* ptr) { + return this->AcquireMemory(md, ptr, "@user_bias_mem_p"); + } + std::shared_ptr AcquireWeightsMemoryFromPrimitive( const std::shared_ptr user_weights_memory_p, std::vector& pipeline, // NOLINT @@ -1160,35 +961,37 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { scale_data, mask); } - mkldnn::primitive_attr CreatePostOps(bool fuse_relu, bool fuse_residual_conn, - bool fuse_brelu, - float fuse_brelu_threshold) const { + mkldnn::primitive_attr 
CreatePostOps( + std::string fuse_activation, float fuse_alpha, float fuse_beta, + bool fuse_residual_conn, const std::vector output_shift_scale = {}, + float sum_scale = 1.0f) const { mkldnn::primitive_attr conv_attr; mkldnn::post_ops post_operations; + if (output_shift_scale.size() > 0) { + int mask = output_shift_scale.size() > 1 ? 1 << 1 : 0; + conv_attr.set_output_scales(mask, output_shift_scale); + } // Fusion with Elementwise layer relies on adding a sum post-operation with // the scale parameter. It is assumed that when fuse_residual_connection is // true, the output tensor contains the data coming from residual // connection. The result of this post_op is: // Output = scale * Output + Conv_Out. if (fuse_residual_conn) { - post_operations.append_sum(1.0f); + post_operations.append_sum(sum_scale); } // Fusion with ReLU layer is executed through the PostOps feature. Create a // PostOps object and configure it to execute an eltwise relu operation. - if (fuse_relu) { + if (fuse_activation == "relu" || fuse_activation == "leaky_relu") { constexpr float scale = 1.0f; - constexpr float negative_slope = 0.0f; - constexpr float placeholder = 0.0f; post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu, - negative_slope, placeholder); + fuse_alpha, fuse_beta); } - if (fuse_brelu) { + if (fuse_activation == "relu6") { constexpr float scale = 1.0f; - constexpr float placeholder = 0.0f; post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_bounded_relu, - fuse_brelu_threshold, placeholder); + fuse_alpha, fuse_beta); } conv_attr.set_post_ops(post_operations); return conv_attr; @@ -1200,9 +1003,10 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { boost::optional bias, const mkldnn::memory::desc& dst, const std::vector& strides, const std::vector& paddings, const mkldnn::engine& engine, - const bool fuse_relu, const bool fuse_residual_conn, - const bool fuse_brelu, const float fuse_brelu_threshold, - mkldnn::prop_kind fwd_prop_kind) { + const std::string& fuse_activation, float fuse_alpha, float fuse_beta, + const bool fuse_residual_conn, mkldnn::prop_kind fwd_prop_kind, + const std::vector output_shift_scale = {}, + const float sum_scale = 1.0f) { // Conv PD has to be passed to Grad op that // may be executed by a different thread, hence // for that one we use key that does not contain TID @@ -1232,8 +1036,9 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { src, weights, dst, stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); - mkldnn::primitive_attr conv_attr = CreatePostOps( - fuse_relu, fuse_residual_conn, fuse_brelu, fuse_brelu_threshold); + mkldnn::primitive_attr conv_attr = + CreatePostOps(fuse_activation, fuse_alpha, fuse_beta, + fuse_residual_conn, output_shift_scale, sum_scale); conv_pd_.reset(new typename forward_t::primitive_desc( conv_desc, conv_attr, engine)); @@ -1312,35 +1117,6 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { return conv_bwd_data_p; } - // Generate keys for storing/retriving primitives for this operator - // TODO(jczaja): Make hashing function more optimial - static std::string GetHash(mkldnn::memory::dims& input_dims, // NOLINT - mkldnn::memory::dims& weights_dims, // NOLINT - const bool& fuse_relu, // NOLINT - const bool& fuse_brelu, // NOLINT - std::vector& strides, // NOLINT - std::vector& paddings, // NOLINT - std::vector& dilations, // NOLINT - int groups, const std::string& suffix) { - return dims2str(input_dims) + dims2str(weights_dims) + - std::to_string(fuse_relu) + 
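The activation dispatch in CreatePostOps above reduces to a small table: "relu" and "leaky_relu" both map to eltwise_relu with fuse_alpha as the negative slope (zero for plain relu), while "relu6" maps to eltwise_bounded_relu with fuse_alpha as the saturation bound. That mapping in isolation, as a sketch with a stand-in struct instead of mkldnn::post_ops:

#include <stdexcept>
#include <string>

// Stand-in for the (algorithm, alpha, beta) triple appended to post_ops.
struct EltwisePostOp {
  enum class Alg { relu, bounded_relu } alg;
  float alpha;  // negative slope for relu; upper bound for bounded_relu
  float beta;   // unused by these two algorithms
};

EltwisePostOp MakeActivationPostOp(const std::string& fuse_activation,
                                   float fuse_alpha, float fuse_beta) {
  if (fuse_activation == "relu" || fuse_activation == "leaky_relu") {
    // Plain relu is leaky_relu with a zero negative slope.
    return {EltwisePostOp::Alg::relu, fuse_alpha, fuse_beta};
  }
  if (fuse_activation == "relu6") {
    // fuse_alpha carries the saturation threshold (6.0 for relu6).
    return {EltwisePostOp::Alg::bounded_relu, fuse_alpha, fuse_beta};
  }
  throw std::invalid_argument("unsupported fuse_activation: " +
                              fuse_activation);
}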
std::to_string(fuse_brelu) + - dims2str(strides) + dims2str(paddings) + dims2str(dilations) + - std::to_string(groups) + suffix; - } - - // Generate keys for storing/retriving primitives for this operator - // TODO(jczaja): Make hashing function more optimial - static std::string GetHash(mkldnn::memory::dims& input_dims, // NOLINT - mkldnn::memory::dims& weights_dims, // NOLINT - std::vector& strides, // NOLINT - std::vector& paddings, // NOLINT - std::vector& dilations, // NOLINT - int groups, const std::string& suffix) { - return dims2str(input_dims) + dims2str(weights_dims) + dims2str(strides) + - dims2str(paddings) + dims2str(dilations) + std::to_string(groups) + - suffix; - } - private: std::shared_ptr conv_pd_; std::shared_ptr @@ -1358,47 +1134,6 @@ using ConvTransposeMKLDNNHandler = mkldnn::deconvolution_backward_data, mkldnn::deconvolution_backward_weights>; -template -static std::shared_ptr SetDstMemory( - const framework::ExecutionContext& ctx, framework::Tensor* output, - const std::shared_ptr& handler) { - T* output_data = - output->mutable_data(ctx.GetPlace(), handler->GetDstMemorySize()); - std::shared_ptr dst_memory_p = - handler->AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); - return dst_memory_p; -} - -template -static std::shared_ptr SetDstMemory( - const framework::ExecutionContext& ctx, framework::Tensor* output, - const framework::Tensor* residual_param, - const mkldnn::memory::desc& user_residual_md, - const std::shared_ptr& handler, - std::vector* pipeline) { - const T* residual_param_data = residual_param->data(); - PADDLE_ENFORCE(residual_param_data != nullptr, - "Provide data if you want MKLDNN conv+elementwise_add fusion"); - std::shared_ptr user_residual_memory_p = - handler->AcquireResidualDataMemory(user_residual_md, - to_void_cast(residual_param_data)); - T* output_data = output->mutable_data(ctx.GetPlace()); - std::shared_ptr dst_memory_p = - handler->AcquireDstMemoryFromResidualDataMemory( - user_residual_memory_p, to_void_cast(output_data), *pipeline); - return dst_memory_p; -} - -template -static void SetDstMemoryHandler( - const framework::ExecutionContext& ctx, framework::Tensor* output, - const std::shared_ptr& handler, - std::shared_ptr* dst_memory_p) { - T* output_data = - output->mutable_data(ctx.GetPlace(), handler->GetDstMemorySize()); - (*dst_memory_p)->set_data_handle(to_void_cast(output_data)); -} - template static void SetDstMemoryQuantized( const framework::ExecutionContext& ctx, framework::Tensor* output, @@ -1407,10 +1142,10 @@ static void SetDstMemoryQuantized( std::shared_ptr& dst_memory) { // NOLINT T* output_data = output->mutable_data(ctx.GetPlace()); const size_t dst_dims = dst_tz.size(); - memory::format dst_fmt; - PADDLE_ENFORCE(dst_dims <= 5, - "Dst memory for quantization can not have dims > 5"); - dst_fmt = platform::MKLDNNFormatForSize(dst_dims, memory::format::nhwc); + MKLDNNMemoryFormat dst_fmt; + PADDLE_ENFORCE_LE(dst_dims, 5, + "Dst memory for quantization can not have dims > 5"); + dst_fmt = platform::MKLDNNFormatForSize(dst_dims, MKLDNNMemoryFormat::nhwc); auto dst_md = platform::MKLDNNMemDesc( {dst_tz}, paddle::framework::ToMKLDNNDataType( diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index 4680b070..a843a768 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -63,11 +63,11 @@ class NCCLGroupGuard { inline NCCLGroupGuard() { NCCLMutex().lock(); - PADDLE_ENFORCE(dynload::ncclGroupStart()); + 
PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupStart()); } inline ~NCCLGroupGuard() { - PADDLE_ENFORCE(dynload::ncclGroupEnd()); + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupEnd()); NCCLMutex().unlock(); } }; @@ -94,7 +94,7 @@ struct NCCLContextMap { explicit NCCLContextMap(const std::vector &places, ncclUniqueId *nccl_id = nullptr, size_t num_trainers = 1, size_t trainer_id = 0) { - PADDLE_ENFORCE(!places.empty()); + PADDLE_ENFORCE_EQ(!places.empty(), true); order_.reserve(places.size()); for (auto &p : places) { int dev_id = boost::get(p).device; @@ -109,7 +109,7 @@ struct NCCLContextMap { // if num_trainers == 1, should create a new nccl id for local comms. if (num_trainers == 1 && nccl_id == nullptr) { std::lock_guard guard(NCCLGroupGuard::NCCLMutex()); - PADDLE_ENFORCE(platform::dynload::ncclCommInitAll( + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitAll( comms.get(), static_cast(order_.size()), order_.data())); } else { PADDLE_ENFORCE_NOT_NULL(nccl_id); @@ -126,8 +126,8 @@ struct NCCLContextMap { } VLOG(1) << "init nccl rank:" << rank << ", nranks:" << nranks << ", gpu_id:" << gpu_id << ", dev_id:" << order_[i]; - PADDLE_ENFORCE(cudaSetDevice(gpu_id)); - PADDLE_ENFORCE(platform::dynload::ncclCommInitRank( + PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(gpu_id)); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitRank( comms.get() + i, nranks, *nccl_id, rank)); } } @@ -249,13 +249,13 @@ class NCCLCommunicator { size_t trainers_num, size_t trainer_id, size_t inter_trainers_num, size_t exter_trainers_num) { - PADDLE_ENFORCE(trainers_num == inter_trainers_num * exter_trainers_num, - "trainers_num:%llu != inter_trainers_num:%llu * " - "exter_trainers_num:%llu", - trainers_num, inter_trainers_num, exter_trainers_num); + PADDLE_ENFORCE_EQ(trainers_num, inter_trainers_num * exter_trainers_num, + "trainers_num:%llu != inter_trainers_num:%llu * " + "exter_trainers_num:%llu", + trainers_num, inter_trainers_num, exter_trainers_num); - PADDLE_ENFORCE(inter_trainers_num > 1, "inter_trainers_num:%llu must > 1", - inter_trainers_num); + PADDLE_ENFORCE_GT(inter_trainers_num, 1, "inter_trainers_num:%llu must > 1", + inter_trainers_num); int inter_trainer_id = trainer_id % inter_trainers_num; for (size_t i = 0; i < inter_nccl_ids.size(); i++) { diff --git a/paddle/fluid/platform/profiler.cu b/paddle/fluid/platform/profiler.cu index aed276b1..d4db6506 100644 --- a/paddle/fluid/platform/profiler.cu +++ b/paddle/fluid/platform/profiler.cu @@ -35,13 +35,13 @@ void DummyKernelAndEvent() { ForEachDevice([](int d) { platform::SetDeviceId(d); cudaStream_t stream; - PADDLE_ENFORCE(cudaStreamCreate(&stream)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream)); Mark("_cuda_startup_"); int *ptr; - PADDLE_ENFORCE(cudaMalloc(&ptr, sizeof(int))); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc(&ptr, sizeof(int))); DummyKernel<<<1, 1, 0, stream>>>(ptr); - PADDLE_ENFORCE(cudaStreamSynchronize(stream)); - PADDLE_ENFORCE(cudaFree(ptr)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaFree(ptr)); }); } } diff --git a/paddle/fluid/platform/temporary_allocator.cc b/paddle/fluid/platform/temporary_allocator.cc deleted file mode 100644 index 6177b024..00000000 --- a/paddle/fluid/platform/temporary_allocator.cc +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
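NCCLGroupGuard, whose calls are upgraded to PADDLE_ENFORCE_CUDA_SUCCESS above, is an RAII bracket: the constructor takes a process-wide mutex and opens an NCCL group, the destructor closes the group and releases the mutex, so grouped collective launches are serialized and always paired. The shape of that guard as a compilable sketch (BeginGroup/EndGroup are placeholders for the checked ncclGroupStart/ncclGroupEnd calls):

#include <mutex>

static void BeginGroup() {}  // placeholder for ncclGroupStart()
static void EndGroup() {}    // placeholder for ncclGroupEnd()

class GroupGuard {
 public:
  static std::mutex& Mutex() {
    static std::mutex m;  // one mutex for the whole process
    return m;
  }
  GroupGuard() {
    Mutex().lock();  // serialize group sections across threads
    BeginGroup();
  }
  ~GroupGuard() {
    EndGroup();      // group closed even on early return
    Mutex().unlock();
  }
};

int main() {
  {
    GroupGuard guard;
    // ... enqueue per-device collectives here; they run as one group ...
  }  // guard destroyed: group ended, mutex released
  return 0;
}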
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/platform/temporary_allocator.h" -#include -#include "paddle/fluid/memory/allocation/allocator_facade.h" - -DEFINE_int64(limit_of_tmp_allocation, -1, - "The up limit of temporary_allocation size."); -DEFINE_double(times_excess_than_required_tmp_allocation, 2, - "times_excess_than_required_tmp_allocation indicates the " - "max size the TemporaryAllocator can return. For example, " - "if the required memory size is N, and " - "times_excess_than_required_tmp_allocation is 2.0, " - "the TemporaryAllocator will return the available allocation " - "that the range of size is N ~ 2*N."); - -namespace paddle { -namespace platform { -namespace alloc = memory::allocation; - -TemporaryAllocator::TemporaryAllocator(platform::Place place) : place_(place) { - temp_mem_map_.reset(new std::multimap()); -} - -bool TemporaryAllocator::IsAllocThreadSafe() const { return true; } - -void TemporaryAllocator::Release(const std::function &callback) { - std::unique_ptr> t_allocations; - { - std::unique_lock lock(mtx_); - callback(); - t_allocations.swap(temp_mem_map_); - temp_mem_map_.reset(new std::multimap()); - wait_delete_mem_ = 0; - } - - alloc::AllocationDeleter deleter; - for (auto tmp : *t_allocations) { - VLOG(10) << "Delete temporary allocation " << tmp.second->ptr() - << " size: " << tmp.second->size(); - deleter(tmp.second); - } -} - -void TemporaryAllocator::FreeImpl(alloc::Allocation *temp_allocation) { - if (platform::is_gpu_place(temp_allocation->place())) { - PADDLE_ENFORCE(platform::is_same_place(temp_allocation->place(), place_), - "The place should be the same."); - size_t wait_delete_mem = 0; - { - std::unique_lock lock(mtx_); - temp_mem_map_->emplace(temp_allocation->size(), temp_allocation); - wait_delete_mem_ += temp_allocation->size(); - wait_delete_mem = wait_delete_mem_; - VLOG(10) << "Move temporary allocation: " << temp_allocation->ptr() - << " to delete queue: " << temp_allocation->size() << "; " - << "wait_delete_mem: " << wait_delete_mem; - } - - if (FLAGS_limit_of_tmp_allocation >= 0 && - wait_delete_mem >= static_cast(FLAGS_limit_of_tmp_allocation)) { - PADDLE_ENFORCE(callback_ != nullptr, "The callback is non-initialized."); - Release(callback_); - } - return; - } - VLOG(10) << "Delete temporary allocation " << temp_allocation->ptr() - << " size: " << temp_allocation->size(); - alloc::AllocationDeleter()(temp_allocation); -} - -size_t TemporaryAllocator::TemporaryAllocationQueueSize() { - std::unique_lock lock(mtx_); - return temp_mem_map_ ? temp_mem_map_->size() : 0; -} - -void TemporaryAllocator::SetCallback(const std::function &callback) { - callback_ = callback; -} - -alloc::Allocation *TemporaryAllocator::AllocateImpl(size_t size) { - { - // Find available allocation in temp_mem_map. - std::unique_lock lock(mtx_); - if (temp_mem_map_->size()) { - auto it = temp_mem_map_->lower_bound(size); - // FIXME(zcd): Not sure the best value of excess fraction. 
- if (it != temp_mem_map_->end() && - it->first < - static_cast( - size * FLAGS_times_excess_than_required_tmp_allocation)) { - auto tmp_ptr = it->second; - temp_mem_map_->erase(it); - wait_delete_mem_ -= tmp_ptr->size(); - VLOG(10) << "Reuse temporary allocation: " << tmp_ptr->ptr() << ": " - << tmp_ptr->size(); - return tmp_ptr; - } - } - } - // If not find the the available allocation, get allocation from - // AllocatorFacadeInstance. - auto temp_mem = alloc::AllocatorFacade::Instance().Alloc(place_, size); - VLOG(10) << "Alloc temporary allocation: " << temp_mem->ptr() << ": " << size; - return temp_mem.release(); -} - -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/temporary_allocator.h b/paddle/fluid/platform/temporary_allocator.h deleted file mode 100644 index 41f0e4a8..00000000 --- a/paddle/fluid/platform/temporary_allocator.h +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include // NOLINT -#include -#include -#include -#include // NOLINT -#include "paddle/fluid/memory/allocation/allocator.h" -#include "paddle/fluid/platform/lock_guard_ptr.h" -namespace paddle { -namespace platform { - -/*! \brief the TemporaryAllocator is used to alloc the temporary allocation - * which used by CUDA's async operation. - * - * The TemporaryAllocator contains a temp_allocation_queue which - * is used to store the temporary allocations. The allocation, which is - * allocated by TemporaryAllocator, is a unique_ptr, and when it is not held - * by any variable, it will be pushed into the temp_allocation_queue. - * - * There is one opportunity to free the allocations of temp_allocation_queue: - * - when the allocation size of opportunities exceeds a certain threshold - * (defined by FLAGS_limit_of_tmp_allocation). - * - * */ -class TemporaryAllocator : public memory::allocation::Allocator { - public: - explicit TemporaryAllocator(platform::Place place); - - void Release(const std::function &callback); - - size_t TemporaryAllocationQueueSize(); - - bool IsAllocThreadSafe() const override; - - void SetCallback(const std::function &callback); - - protected: - void FreeImpl(memory::allocation::Allocation *allocation) override; - - memory::allocation::Allocation *AllocateImpl(size_t size) override; - - private: - platform::Place place_; - // When the allocation is not held by any variable, it should be placed - // to temp_mem_map immediately. - std::unique_ptr> - temp_mem_map_{nullptr}; - std::mutex mtx_; - size_t wait_delete_mem_{0}; - std::function callback_; -}; - -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/temporary_allocator_test.cc b/paddle/fluid/platform/temporary_allocator_test.cc deleted file mode 100644 index a5068eff..00000000 --- a/paddle/fluid/platform/temporary_allocator_test.cc +++ /dev/null @@ -1,222 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
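The deleted AllocateImpl above implements the policy stated in the FLAGS_times_excess_than_required_tmp_allocation definition: reuse the smallest cached block at least as large as the request, but only if it stays below size * factor; otherwise fall through to a fresh allocation. The lookup in isolation (a sketch; the real multimap stores Allocation pointers, not ints):

#include <cstddef>
#include <cstdio>
#include <map>

// Cached free blocks keyed by size; values stand in for Allocation*.
static std::multimap<size_t, int> g_free_blocks;

// Returns a reusable block id, or -1 to signal "allocate fresh".
int FindReusable(size_t size, double excess_factor) {
  auto it = g_free_blocks.lower_bound(size);  // smallest block >= size
  if (it != g_free_blocks.end() &&
      it->first < static_cast<size_t>(size * excess_factor)) {
    int id = it->second;
    g_free_blocks.erase(it);  // a reused block leaves the free list
    return id;
  }
  return -1;  // nothing in [size, size * excess_factor)
}

int main() {
  g_free_blocks.emplace(256, 1);
  g_free_blocks.emplace(1024, 2);
  std::printf("%d\n", FindReusable(200, 2.0));  // 1: 256 < 200 * 2
  std::printf("%d\n", FindReusable(600, 1.5));  // -1: 1024 >= 600 * 1.5
  return 0;
}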
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/platform/temporary_allocator.h" -#include -#include -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/tensor_util.h" - -DECLARE_int64(limit_of_tmp_allocation); -DECLARE_double(times_excess_than_required_tmp_allocation); - -namespace paddle { -namespace platform { - -class DummyOp : public framework::OperatorBase { - public: - DummyOp(const std::string& type, const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - protected: - void RunImpl(const framework::Scope& scope, - const platform::Place& place) const override {} -}; - -TEST(temporary_allocator, test_base_function) { - platform::CPUPlace cpu_place; - TemporaryAllocator alloc(cpu_place); - alloc.Allocate(100); - -#ifdef PADDLE_WITH_CUDA - platform::CUDAPlace gpu_place(0); - TemporaryAllocator gpu_alloc(gpu_place); - - auto allocation = gpu_alloc.Allocate(101); - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); - gpu_alloc.Release([]() {}); - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); - - { - auto allocation = gpu_alloc.Allocate(102); - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); - } - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1); - gpu_alloc.Release([]() {}); - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); -#endif -} - -TEST(temporary_allocator, test_flags_function) { -#ifdef PADDLE_WITH_CUDA - const int64_t limit = FLAGS_limit_of_tmp_allocation; - FLAGS_limit_of_tmp_allocation = 10; - platform::CUDAPlace gpu_place(0); - TemporaryAllocator gpu_alloc(gpu_place); - - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto* dev_ctx = - static_cast(pool.Get(gpu_place)); - auto stream = dev_ctx->stream(); - bool deleted = false; - gpu_alloc.SetCallback([stream, &deleted]() { - PADDLE_ENFORCE(cudaStreamSynchronize(stream)); - PADDLE_ENFORCE(cudaGetLastError()); - deleted = true; - }); - { gpu_alloc.Allocate(100); } - PADDLE_ENFORCE(deleted); - FLAGS_limit_of_tmp_allocation = limit; -#endif -} - -TEST(temporary_allocator, test_reuse_tmp_allocation) { -#ifdef PADDLE_WITH_CUDA - platform::CUDAPlace gpu_place(0); - TemporaryAllocator gpu_alloc(gpu_place); - gpu_alloc.SetCallback([]() {}); - - void* tmp_allocation_ptr1 = nullptr; - { - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); - auto tmp_allocation1 = gpu_alloc.Allocate(200); - tmp_allocation_ptr1 = tmp_allocation1->ptr(); - } - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1); - auto tmp_allocation2 = gpu_alloc.Allocate(200); - void* tmp_allocation_ptr2 = tmp_allocation2->ptr(); - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); - PADDLE_ENFORCE_EQ(tmp_allocation_ptr1, tmp_allocation_ptr2); - - auto tmp_allocation3 = gpu_alloc.Allocate(200); - void* tmp_allocation_ptr3 = tmp_allocation2->ptr(); 
- PADDLE_ENFORCE_EQ(tmp_allocation_ptr1, tmp_allocation_ptr3); -#endif -} - -TEST(temporary_allocator, test_times_excess_than_required_tmp_allocation) { -#ifdef PADDLE_WITH_CUDA - platform::CUDAPlace gpu_place(0); - TemporaryAllocator gpu_alloc(gpu_place); - gpu_alloc.SetCallback([]() {}); - double excess_fraction = FLAGS_times_excess_than_required_tmp_allocation; - void* tmp_allocation_ptr1 = nullptr; - { - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); - auto tmp_allocation1 = - gpu_alloc.Allocate(static_cast(200 * excess_fraction - 1)); - tmp_allocation_ptr1 = tmp_allocation1->ptr(); - } - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1); - auto tmp_allocation2 = gpu_alloc.Allocate(200 * excess_fraction - 10); - void* tmp_allocation_ptr2 = tmp_allocation2->ptr(); - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); - PADDLE_ENFORCE_EQ(tmp_allocation_ptr1, tmp_allocation_ptr2); -#endif -} - -TEST(temporary_allocator, create_tensor_with_allocationptr) { - framework::VariableNameMap dummy_vars; - framework::AttributeMap dummy_attrs; - DummyOp op("dummy", dummy_vars, dummy_vars, dummy_attrs); - framework::Scope scope; - framework::VariableValueMap vars; - framework::RuntimeContext run_ctx(vars, vars); - size_t memory_size = 300; - { - platform::CPUPlace cpu_place; - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto* dev_ctx = - static_cast(pool.Get(cpu_place)); - framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx, nullptr); - - int numel = memory_size / sizeof(float); - framework::Tensor tensor = - ctx.AllocateTmpTensor( - framework::make_ddim({numel}), *dev_ctx); - PADDLE_ENFORCE_EQ(tensor.numel(), numel); - } - -#ifdef PADDLE_WITH_CUDA - { - platform::CUDAPlace gpu_place(0); - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto* dev_ctx = - static_cast(pool.Get(gpu_place)); - framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx, nullptr); - int numel = memory_size / sizeof(float); - framework::Tensor tensor = - ctx.AllocateTmpTensor( - framework::make_ddim({numel}), *dev_ctx); - PADDLE_ENFORCE_EQ(tensor.numel(), numel); - } -#endif -} - -TEST(temporary_allocator, create_tensor_with_allocationptr2) { - framework::VariableNameMap dummy_vars; - framework::AttributeMap dummy_attrs; - DummyOp op("dummy", dummy_vars, dummy_vars, dummy_attrs); - framework::Scope scope; - framework::VariableValueMap vars; - framework::RuntimeContext run_ctx(vars, vars); - size_t memory_size = 400; - { - platform::CPUPlace cpu_place; - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto* dev_ctx = - static_cast(pool.Get(cpu_place)); - framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx, nullptr); - int numel = memory_size / sizeof(float); - - framework::Tensor out_side_tensor; - { - framework::Tensor tensor = - ctx.AllocateTmpTensor( - framework::make_ddim({numel}), *dev_ctx); - PADDLE_ENFORCE_EQ(tensor.numel(), numel); - - out_side_tensor.ShareDataWith(tensor); - } - PADDLE_ENFORCE_EQ(out_side_tensor.numel(), numel); - } - -#ifdef PADDLE_WITH_CUDA - { - platform::CUDAPlace gpu_place(0); - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto* dev_ctx = - static_cast(pool.Get(gpu_place)); - framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx, nullptr); - - size_t memory_size = 500; - int numel = memory_size / sizeof(float); - framework::Tensor out_side_tensor; - { - framework::Tensor tensor = - 
ctx.AllocateTmpTensor( - framework::make_ddim({numel}), *dev_ctx); - PADDLE_ENFORCE_EQ(tensor.numel(), numel); - - out_side_tensor.ShareDataWith(tensor); - } - PADDLE_ENFORCE_EQ(out_side_tensor.numel(), numel); - } -#endif -} - -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/pybind/.gitignore b/paddle/fluid/pybind/.gitignore new file mode 100644 index 00000000..8f222791 --- /dev/null +++ b/paddle/fluid/pybind/.gitignore @@ -0,0 +1 @@ +pybind.h diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 8ee03c79..cb3493b6 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,6 +1,6 @@ -set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper nccl_wrapper prune - feed_fetch_method pass_builder parallel_executor profiler layer scope_pool - tracer analysis_predictor imperative_profiler nccl_context) +set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper nccl_wrapper prune + feed_fetch_method pass_builder parallel_executor profiler layer tracer engine scope_pool + analysis_predictor imperative_profiler nccl_context imperative_flag) if(WITH_PYTHON) list(APPEND PYBIND_DEPS py_func_op) @@ -15,9 +15,9 @@ set(PYBIND_SRCS exception.cc protobuf.cc const_value.cc - recordio.cc reader_py.cc fleet_wrapper_py.cc + box_helper_py.cc nccl_wrapper_py.cc data_set_py.cc imperative.cc diff --git a/paddle/fluid/pybind/box_helper_py.cc b/paddle/fluid/pybind/box_helper_py.cc new file mode 100644 index 00000000..13aec9aa --- /dev/null +++ b/paddle/fluid/pybind/box_helper_py.cc @@ -0,0 +1,50 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include + +#ifdef _POSIX_C_SOURCE +#undef _POSIX_C_SOURCE +#endif + +#ifdef _XOPEN_SOURCE +#undef _XOPEN_SOURCE +#endif + +#include +#include +#include + +#include "paddle/fluid/framework/data_feed.h" +#include "paddle/fluid/framework/data_feed.pb.h" +#include "paddle/fluid/framework/fleet/box_wrapper.h" +#include "paddle/fluid/pybind/box_helper_py.h" + +namespace py = pybind11; + +namespace paddle { +namespace pybind { +void BindBoxHelper(py::module* m) { + py::class_>( + *m, "BoxPS") + .def(py::init([](paddle::framework::Dataset* dataset) { + return std::make_shared(dataset); + })) + .def("begin_pass", &framework::BoxHelper::BeginPass) + .def("end_pass", &framework::BoxHelper::EndPass) + .def("wait_feed_pass_done", &framework::BoxHelper::WaitFeedPassDone) + .def("preload_into_memory", &framework::BoxHelper::PreLoadIntoMemory) + .def("load_into_memory", &framework::BoxHelper::LoadIntoMemory); +} // end BoxHelper +} // end namespace pybind +} // end namespace paddle diff --git a/paddle/fluid/pybind/recordio.h b/paddle/fluid/pybind/box_helper_py.h similarity index 87% rename from paddle/fluid/pybind/recordio.h rename to paddle/fluid/pybind/box_helper_py.h index 2555f9b7..33072dd5 100644 --- a/paddle/fluid/pybind/recordio.h +++ b/paddle/fluid/pybind/box_helper_py.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include "pybind11/pybind11.h" #include "pybind11/stl.h" @@ -21,7 +22,7 @@ namespace py = pybind11; namespace paddle { namespace pybind { -void BindRecordIOWriter(py::module* m); +void BindBoxHelper(py::module* m); } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc index 633e3259..71eeaf3b 100644 --- a/paddle/fluid/pybind/const_value.cc +++ b/paddle/fluid/pybind/const_value.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/pybind/const_value.h" -#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/operator.h" @@ -34,7 +33,6 @@ void BindConstValue(pybind11::module* m) { m->def("kControlDepVarName", [] { return framework::ir::Node::kControlDepVarName; }); m->def("kNewGradSuffix", [] { return framework::kNewGradSuffix; }); - m->def("kMemOptSkipVars", [] { return framework::ir::kMemOptSkipVars; }); auto op_proto_and_checker_maker = m->def_submodule("op_proto_and_checker_maker"); diff --git a/paddle/fluid/pybind/data_set_py.cc b/paddle/fluid/pybind/data_set_py.cc index 0e88027e..dd513d4b 100644 --- a/paddle/fluid/pybind/data_set_py.cc +++ b/paddle/fluid/pybind/data_set_py.cc @@ -21,6 +21,8 @@ limitations under the License. 
*/ #endif #include #include +#include +#include #include #include "google/protobuf/io/zero_copy_stream_impl.h" #include "google/protobuf/text_format.h" @@ -41,10 +43,147 @@ namespace pd = paddle::framework; namespace paddle { namespace pybind { -void BindDataset(py::module* m) { +class IterableDatasetWrapper { + public: + IterableDatasetWrapper(framework::Dataset *dataset, + const std::vector &slots, + const std::vector &places, + size_t batch_size, bool drop_last) + : dataset_(dataset), + slots_(slots), + places_(places), + batch_size_(batch_size), + drop_last_(drop_last) { +#if defined _WIN32 + PADDLE_THROW("Dataset is not supported on Windows"); +#elif defined __APPLE__ + PADDLE_THROW("Dataset is not supported on MAC"); +#else + size_t device_num = places_.size(); + PADDLE_ENFORCE_GT(device_num, 0, "thread_num must be larger than 0"); + PADDLE_ENFORCE_GT(slots_.size(), 0, "slot_num cannot be 0"); + scopes_.reserve(device_num); + tensors_.reserve(device_num); + for (size_t i = 0; i < device_num; ++i) { + scopes_.emplace_back(new framework::Scope()); + tensors_.emplace_back(); + for (auto &var_name : slots_) { + auto *var = scopes_.back()->Var(var_name); + auto *t = var->GetMutable(); + tensors_.back().emplace_back(t); + } + } + + is_exhaustive_.resize(device_num); + exhaustive_num_ = 0; +#endif + } + + void Start() { + PADDLE_ENFORCE_EQ(is_started_, false, "Reader has been started"); + data_feeds_ = dataset_->GetReaders(); + PADDLE_ENFORCE_EQ(data_feeds_.size(), places_.size(), + "Device number does not match reader number"); + for (size_t i = 0; i < places_.size(); ++i) { + data_feeds_[i]->AssignFeedVar(*scopes_[i]); + data_feeds_[i]->SetPlace(platform::CPUPlace()); + PADDLE_ENFORCE_EQ(data_feeds_[i]->Start(), true, "Reader start failed"); + } + is_started_ = true; + + is_exhaustive_.assign(places_.size(), false); + exhaustive_num_ = 0; + } + + std::vector> Next() { + PADDLE_ENFORCE_EQ(is_started_, true, "Reader must be started"); + size_t device_num = places_.size(); + + std::vector> result( + device_num); + + size_t read_num = 0; + while (read_num < device_num && exhaustive_num_ < device_num) { + for (size_t i = 0; i < data_feeds_.size(); ++i) { + if (is_exhaustive_[i]) { + continue; + } + + bool is_success = (data_feeds_[i]->Next() > 0); + if (!is_success) { + is_exhaustive_[i] = true; + ++exhaustive_num_; + continue; + } + + for (size_t j = 0; j < slots_.size(); ++j) { + if (!IsValidLoDTensor(*tensors_[i][j])) { + is_success = false; + break; + } + + if (tensors_[i][j]->place() == places_[read_num]) { + result[read_num].emplace(slots_[j], std::move(*tensors_[i][j])); + } else { + framework::TensorCopy(std::move(*tensors_[i][j]), places_[read_num], + &result[read_num][slots_[j]]); + } + } + + if (!is_success) { + is_exhaustive_[i] = true; + ++exhaustive_num_; + continue; + } + + ++read_num; + if (read_num == device_num) { + break; + } + } + } + + if (UNLIKELY(read_num != device_num)) { + is_started_ = false; + throw py::stop_iteration(); + } + + return result; + } + + private: + bool IsValidLoDTensor(const framework::LoDTensor &tensor) const { + auto &lod = tensor.lod(); + PADDLE_ENFORCE_LE(lod.size(), 1, "lod level must be not larger than 1"); + if (!drop_last_) return true; + + if (lod.empty()) { + return static_cast(tensor.dims()[0]) == batch_size_; + } else { + return lod[0].size() == batch_size_ + 1; + } + } + + private: + framework::Dataset *dataset_; + std::vector slots_; + std::vector places_; + size_t batch_size_; + bool drop_last_; + + std::vector data_feeds_; + std::vector 
is_exhaustive_; + size_t exhaustive_num_; + + std::vector> scopes_; + std::vector> tensors_; + bool is_started_{false}; +}; + +void BindDataset(py::module *m) { py::class_>(*m, "Dataset") - .def(py::init([](const std::string& name = "MultiSlotDataset") { + .def(py::init([](const std::string &name = "MultiSlotDataset") { return framework::DatasetFactory::CreateDataset(name); })) .def("set_filelist", &framework::Dataset::SetFileList, @@ -100,11 +239,41 @@ void BindDataset(py::module* m) { py::call_guard()) .def("set_queue_num", &framework::Dataset::SetChannelNum, py::call_guard()) + .def("set_parse_ins_id", &framework::Dataset::SetParseInsId, + py::call_guard()) + .def("set_parse_content", &framework::Dataset::SetParseContent, + py::call_guard()) .def("set_merge_by_lineid", &framework::Dataset::SetMergeByInsId, py::call_guard()) .def("merge_by_lineid", &framework::Dataset::MergeByInsId, + py::call_guard()) + .def("slots_shuffle", &framework::Dataset::SlotsShuffle, + py::call_guard()) + .def("set_fea_eval", &framework::Dataset::SetFeaEval, + py::call_guard()) + .def("set_preload_thread_num", &framework::Dataset::SetPreLoadThreadNum, + py::call_guard()) + .def("create_preload_readers", &framework::Dataset::CreatePreLoadReaders, + py::call_guard()) + .def("destroy_preload_readers", + &framework::Dataset::DestroyPreLoadReaders, + py::call_guard()) + .def("dynamic_adjust_channel_num", + &framework::Dataset::DynamicAdjustChannelNum, + py::call_guard()) + .def("dynamic_adjust_readers_num", + &framework::Dataset::DynamicAdjustReadersNum, + py::call_guard()) + .def("set_fleet_send_sleep_seconds", + &framework::Dataset::SetFleetSendSleepSeconds, py::call_guard()); + + py::class_(*m, "IterableDatasetWrapper") + .def(py::init &, + const std::vector &, size_t, bool>()) + .def("_start", &IterableDatasetWrapper::Start) + .def("_next", &IterableDatasetWrapper::Next); } -} // end namespace pybind -} // end namespace paddle +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/fleet_wrapper_py.cc b/paddle/fluid/pybind/fleet_wrapper_py.cc index e9625879..e7c7750c 100644 --- a/paddle/fluid/pybind/fleet_wrapper_py.cc +++ b/paddle/fluid/pybind/fleet_wrapper_py.cc @@ -43,12 +43,17 @@ void BindFleetWrapper(py::module* m) { py::class_(*m, "Fleet") .def(py::init()) .def("push_dense", &framework::FleetWrapper::PushDenseVarsSync) + .def("pull_dense", &framework::FleetWrapper::PullDenseVarsSync) .def("init_server", &framework::FleetWrapper::InitServer) .def("run_server", &framework::FleetWrapper::RunServer) .def("init_worker", &framework::FleetWrapper::InitWorker) .def("init_model", &framework::FleetWrapper::PushDenseParamSync) .def("save_model", &framework::FleetWrapper::SaveModel) + .def("get_cache_threshold", &framework::FleetWrapper::GetCacheThreshold) + .def("cache_shuffle", &framework::FleetWrapper::CacheShuffle) + .def("save_cache", &framework::FleetWrapper::SaveCache) .def("load_model", &framework::FleetWrapper::LoadModel) + .def("clear_model", &framework::FleetWrapper::ClearModel) .def("stop_server", &framework::FleetWrapper::StopServer) .def("gather_servers", &framework::FleetWrapper::GatherServers) .def("gather_clients", &framework::FleetWrapper::GatherClients) @@ -60,7 +65,9 @@ void BindFleetWrapper(py::module* m) { .def("client_flush", &framework::FleetWrapper::ClientFlush) .def("load_from_paddle_model", &framework::FleetWrapper::LoadFromPaddleModel) - .def("load_model_one_table", &framework::FleetWrapper::LoadModelOneTable); + .def("load_model_one_table", 
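IterableDatasetWrapper::Next, bound as _next above, keeps pulling from readers that are not yet exhausted until every device has received one batch, and raises StopIteration once the feeds run dry mid-round. Its control flow reduced to a compilable sketch, with a FakeReader standing in for DataFeed and the tensor plumbing dropped:

#include <cstdio>
#include <vector>

struct FakeReader {
  int batches_left;
  bool Next() { return batches_left-- > 0; }  // stand-in for DataFeed::Next()
};

// Fills one "batch" per device; false means the round could not be completed
// (the pybind code then resets state and throws py::stop_iteration).
bool NextRound(std::vector<FakeReader>* readers, std::vector<bool>* exhausted,
               size_t* exhausted_num, size_t device_num) {
  size_t read_num = 0;
  while (read_num < device_num && *exhausted_num < device_num) {
    for (size_t i = 0; i < readers->size(); ++i) {
      if ((*exhausted)[i]) continue;
      if (!(*readers)[i].Next()) {  // reader i ran dry
        (*exhausted)[i] = true;
        ++*exhausted_num;
        continue;
      }
      ++read_num;  // this batch goes to device read_num - 1
      if (read_num == device_num) break;
    }
  }
  return read_num == device_num;
}

int main() {
  std::vector<FakeReader> readers{{3}, {1}};  // uneven feeds, 2 devices
  std::vector<bool> exhausted(2, false);
  size_t exhausted_num = 0;
  int rounds = 0;
  while (NextRound(&readers, &exhausted, &exhausted_num, 2)) ++rounds;
  std::printf("full rounds: %d\n", rounds);  // 2: live readers back-fill
  return 0;
}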
&framework::FleetWrapper::LoadModelOneTable) + .def("set_client2client_config", + &framework::FleetWrapper::SetClient2ClientConfig); } // end FleetWrapper } // end namespace pybind } // end namespace paddle diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index aaae26cd..63e3e7e8 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -20,11 +20,13 @@ limitations under the License. */ #include #include #include +#include #include #include - -#include "paddle/fluid/framework/block_desc.h" +#include +#include "paddle/fluid/imperative/backward_strategy.h" #include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/imperative/nccl_context.h" #include "paddle/fluid/imperative/profiler.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/imperative/type_defs.h" @@ -44,16 +46,27 @@ class Layer : public imperative::Layer { const std::vector> &inputs) override { PYBIND11_OVERLOAD(std::vector>, Layer, - Forward, - inputs); // NOLINT + Forward, inputs); // NOLINT } }; -class PYBIND11_HIDDEN PyOpBase : public imperative::OpBase { +// warper for pyobject to avoid imperative module depend on python +// TODO(jiabin) Add OpBase's pybind interface back to enable backward hook +class PYBIND11_HIDDEN PyCallableObject { public: - using imperative::OpBase::OpBase; // Inherit constructors + PyCallableObject(std::shared_ptr py_obj_ptr) + : py_obj_ptr_(std::move(py_obj_ptr)) {} + ~PyCallableObject() { + py::call_guard(); + py_obj_ptr_.reset(); + } + void operator()() { + py::call_guard(); + py_obj_ptr_->operator()(this); + } - PyOpBase(const std::string &name) : OpBase(name) {} + private: + std::shared_ptr py_obj_ptr_; }; // Function like obj.attr_name in Python. @@ -125,33 +138,43 @@ GetVarBaseListFromPyHandle(const py::handle &handle) { } } else { PADDLE_THROW( - "unsupported type %s, must be Variable, List[Variable] or " + "unsupported type %s, must be Variable, list[Variable] or " "tuple[Variable]", py::str(handle)); } - PADDLE_ENFORCE(PyErr_Occurred() == nullptr, - py::str(py::handle(PyErr_Occurred()))); - return result; } -using PyVarBaseMap = std::unordered_map; +using PyNameVarBaseMap = std::unordered_map; -static imperative::VarBasePtrMap ConvertToVarBasePtrMap( - const PyVarBaseMap &map) { - imperative::VarBasePtrMap result; +static imperative::NameVarBaseMap ConvertToNameVarBaseMap( + const PyNameVarBaseMap &map) { + imperative::NameVarBaseMap result; for (auto &pair : map) { auto var_vec = GetVarBaseListFromPyHandle(pair.second); if (!var_vec.empty()) { result.emplace(pair.first, std::move(var_vec)); } } + + PADDLE_ENFORCE_EQ(PyErr_Occurred() == nullptr, true, + py::str(py::handle(PyErr_Occurred()))); return result; } +static std::string GetTypeName(const imperative::VarBase &var) { + if (var.Type() == framework::proto::VarType::RAW) { + return "RAW"; + } else if (!var.Var().IsInitialized()) { + return "nullptr"; + } else { + return framework::ToTypeName(var.Var().Type()); + } +} + // Bind Methods -void BindImperative(pybind11::module *m_ptr) { +void BindImperative(py::module *m_ptr) { auto &m = *m_ptr; py::class_ backward_strategy( @@ -200,68 +223,86 @@ void BindImperative(pybind11::module *m_ptr) { m.def("_dygraph_debug_level", []() { return imperative::GetDebugLevel(); }); py::class_>( - m, "VarBase", R"DOC()DOC") + m, "VarBase", + R"DOC()DOC") .def_static("_alive_vars", &imperative::VarBase::AliveVarNames) - .def( - py::init, const paddle::platform::CPUPlace, - bool, bool>()) - .def( - py::init, - const 
paddle::platform::CUDAPlace, bool, bool>()) + .def("__init__", + [](imperative::VarBase &self, const std::string &name, + framework::proto::VarType::Type type, + framework::proto::VarType::Type dtype, + const std::vector &dims, bool persistable) { + new (&self) imperative::VarBase(name); + self.SetPersistable(persistable); + self.SetType(type); + self.SetDataType(dtype); + if (type == framework::proto::VarType::LOD_TENSOR) { + auto *tensor = + self.MutableVar()->GetMutable(); + tensor->Resize(framework::make_ddim(dims)); + } + }) .def("_run_backward", [](imperative::VarBase &self, - const imperative::detail::BackwardStrategy &bckst) { - self.RunBackward(bckst); - }) - .def("_grad_name", &imperative::VarBase::GradName) - .def("_grad_value", &imperative::VarBase::GradValue) + const imperative::detail::BackwardStrategy &bckst, + const imperative::Tracer &tracer) { + // TODO(jiabin): when we impl more backward execution we can select + // them + + imperative::Engine *engine = tracer.GetDefaultEngine(); + VLOG(3) << "Start backward"; + engine->Init(&self, bckst); + engine->Execute(); + VLOG(3) << "Finish backward"; + }, + py::call_guard()) + .def("_grad_name", &imperative::VarBase::GradVarName) + .def("_grad_value", + [](imperative::VarBase &self) { + return self.MutableGradVar()->Get(); + }, + py::return_value_policy::reference) .def("_clear_gradient", &imperative::VarBase::ClearGradient) .def("_grad_ivar", - [](const imperative::VarBase &self) { return self.grads_; }, - py::return_value_policy::reference) + [](const imperative::VarBase &self) { + auto &grad_var = self.GradVarBase(); + if (grad_var && grad_var->Var().IsInitialized()) { + return grad_var; + } else { + return std::shared_ptr(nullptr); + } + }, + py::return_value_policy::copy) .def("_copy_to", [](const imperative::VarBase &self, const platform::CPUPlace &place, - bool blocking) { - return self.NewVarBase(place, blocking).release(); - }, - py::return_value_policy::take_ownership) + bool blocking) { return self.NewVarBase(place, blocking); }, + py::return_value_policy::copy) .def("_copy_to", [](const imperative::VarBase &self, const platform::CUDAPlace &place, - bool blocking) { - return self.NewVarBase(place, blocking).release(); - }, - py::return_value_policy::take_ownership) - .def("value", - [](const imperative::VarBase &self) { return self.var_.get(); }, + bool blocking) { return self.NewVarBase(place, blocking); }, + py::return_value_policy::copy) + .def("value", [](imperative::VarBase &self) { return self.MutableVar(); }, py::return_value_policy::reference) .def_property("name", &imperative::VarBase::Name, &imperative::VarBase::SetName) - .def_property_readonly("shape", &imperative::VarBase::Shape) + .def_property_readonly( + "shape", + [](imperative::VarBase &self) { + if (self.Var().IsType()) { + return framework::vectorize( + self.Var().Get().dims()); + } else { + VLOG(2) << "It is meaningless to get shape of variable type " + << GetTypeName(self); + return std::vector(); + } + }) + .def_property_readonly("type", &imperative::VarBase::Type) .def_property_readonly("dtype", &imperative::VarBase::DataType) - .def_property("persistable", &imperative::VarBase::IsPersistable, + .def_property("persistable", &imperative::VarBase::Persistable, &imperative::VarBase::SetPersistable) - .def_property("stop_gradient", &imperative::VarBase::IsStopGradient, - &imperative::VarBase::SetStopGradient); - - py::class_(m, "OpBase", R"DOC()DOC") - .def(py::init()) - .def("register_backward_hooks", - [](imperative::OpBase &self, const py::object 
&callable) { - self.RegisterBackwardHooks(callable); - }) - .def_property("_trace_id", - [](const imperative::OpBase &self) { - py::gil_scoped_release release; - return self.trace_id_; - }, - [](imperative::OpBase &self, int trace_id) { - py::gil_scoped_release release; - self.trace_id_ = trace_id; - }, - py::return_value_policy::reference) - .def_property_readonly("type", &imperative::OpBase::Type); + .def_property("stop_gradient", + &imperative::VarBase::OverridedStopGradient, + &imperative::VarBase::SetOverridedStopGradient); py::class_ layer(m, "Layer"); layer.def(py::init<>()) @@ -271,42 +312,35 @@ void BindImperative(pybind11::module *m_ptr) { return self.Forward(inputs); }); - // NOTE(zjl): Tracer use PyVarBaseMap as its parameter but not VarBasePtrMap. - // We call Python C-API to convert PyVarBaseMap to VarBasePtrMap, instead - // making conversion in Python code. This speed up Tracer.trace() about 6% - // in ptb model and make time cost in Python to be nearly zero. py::class_(m, "Tracer", "") .def("__init__", - [](imperative::Tracer &self, framework::BlockDesc *root_block) { - new (&self) imperative::Tracer(root_block); - }) + [](imperative::Tracer &self) { new (&self) imperative::Tracer(); }) .def("trace", - [](imperative::Tracer &self, imperative::OpBase *op, - const PyVarBaseMap &inputs, const PyVarBaseMap &outputs, - framework::AttributeMap attrs_map, - const platform::CPUPlace expected_place, - const bool stop_gradient = false) { - auto ins = ConvertToVarBasePtrMap(inputs); - auto outs = ConvertToVarBasePtrMap(outputs); + [](imperative::Tracer &self, const std::string &type, + const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, + framework::AttributeMap attrs, const platform::CUDAPlace &place, + bool trace_backward) { + auto ins_map = ConvertToNameVarBaseMap(ins); + auto outs_map = ConvertToNameVarBaseMap(outs); { py::gil_scoped_release release; - self.Trace(op, std::move(ins), &outs, attrs_map, expected_place, - stop_gradient); + self.TraceOp(type, std::move(ins_map), std::move(outs_map), + std::move(attrs), place, trace_backward); } }) - .def("trace", [](imperative::Tracer &self, imperative::OpBase *op, - const PyVarBaseMap &inputs, const PyVarBaseMap &outputs, - framework::AttributeMap attrs_map, - const platform::CUDAPlace expected_place, - const bool stop_gradient = false) { - auto ins = ConvertToVarBasePtrMap(inputs); - auto outs = ConvertToVarBasePtrMap(outputs); - { - py::gil_scoped_release release; - self.Trace(op, std::move(ins), &outs, attrs_map, expected_place, - stop_gradient); - } - }); + .def("trace", + [](imperative::Tracer &self, const std::string &type, + const PyNameVarBaseMap &ins, const PyNameVarBaseMap &outs, + framework::AttributeMap attrs, const platform::CPUPlace &place, + bool trace_backward) { + auto ins_map = ConvertToNameVarBaseMap(ins); + auto outs_map = ConvertToNameVarBaseMap(outs); + { + py::gil_scoped_release release; + self.TraceOp(type, std::move(ins_map), std::move(outs_map), + std::move(attrs), place, trace_backward); + } + }); // define parallel context py::class_ parallel_strategy( diff --git a/paddle/fluid/pybind/imperative.h b/paddle/fluid/pybind/imperative.h index cfe185bb..0e3e9851 100644 --- a/paddle/fluid/pybind/imperative.h +++ b/paddle/fluid/pybind/imperative.h @@ -14,10 +14,6 @@ limitations under the License. 
*/ #pragma once #include -#include -#include -#include "paddle/fluid/imperative/layer.h" -#include "paddle/fluid/imperative/nccl_context.h" #include "pybind11/pybind11.h" #include "pybind11/stl.h" diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index d8664425..f7a59022 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/pybind/inference_api.h" +#include #include #include #include @@ -20,6 +21,7 @@ #include #include #include +#include #include #include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" @@ -37,20 +39,97 @@ using paddle::NativeConfig; using paddle::NativePaddlePredictor; using paddle::AnalysisPredictor; -static void BindPaddleDType(py::module *m); -static void BindPaddleBuf(py::module *m); -static void BindPaddleTensor(py::module *m); -static void BindPaddlePlace(py::module *m); -static void BindPaddlePredictor(py::module *m); -static void BindNativeConfig(py::module *m); -static void BindNativePredictor(py::module *m); -static void BindAnalysisConfig(py::module *m); -static void BindAnalysisPredictor(py::module *m); +namespace { +void BindPaddleDType(py::module *m); +void BindPaddleBuf(py::module *m); +void BindPaddleTensor(py::module *m); +void BindPaddlePlace(py::module *m); +void BindPaddlePredictor(py::module *m); +void BindNativeConfig(py::module *m); +void BindNativePredictor(py::module *m); +void BindAnalysisConfig(py::module *m); +void BindAnalysisPredictor(py::module *m); #ifdef PADDLE_WITH_MKLDNN -static void BindMkldnnQuantizerConfig(py::module *m); +void BindMkldnnQuantizerConfig(py::module *m); #endif +template +PaddleBuf PaddleBufCreate(py::array_t data) { + PaddleBuf buf(data.size() * sizeof(T)); + std::copy_n(static_cast(data.mutable_data()), data.size(), + static_cast(buf.data())); + return buf; +} + +template +void PaddleBufReset(PaddleBuf &buf, py::array_t data) { // NOLINT + buf.Resize(data.size() * sizeof(T)); + std::copy_n(static_cast(data.mutable_data()), data.size(), + static_cast(buf.data())); +} + +template +PaddleDType PaddleTensorGetDType(); + +template <> +PaddleDType PaddleTensorGetDType() { + return PaddleDType::INT32; +} + +template <> +PaddleDType PaddleTensorGetDType() { + return PaddleDType::INT64; +} + +template <> +PaddleDType PaddleTensorGetDType() { + return PaddleDType::FLOAT32; +} + +template +PaddleTensor PaddleTensorCreate( + py::array_t data, const std::string name = "", + const std::vector> &lod = {}, bool copy = true) { + PaddleTensor tensor; + + if (copy) { + PaddleBuf buf(data.size() * sizeof(T)); + std::copy_n(static_cast(data.mutable_data()), data.size(), + static_cast(buf.data())); + tensor.data = std::move(buf); + } else { + tensor.data = PaddleBuf(data.mutable_data(), data.size() * sizeof(T)); + } + + tensor.dtype = PaddleTensorGetDType(); + tensor.name = name; + tensor.lod = lod; + tensor.shape.resize(data.ndim()); + std::copy_n(data.shape(), data.ndim(), tensor.shape.begin()); + + return tensor; +} + +py::array PaddleTensorGetData(PaddleTensor &tensor) { // NOLINT + py::dtype dt; + switch (tensor.dtype) { + case PaddleDType::INT32: + dt = py::dtype::of(); + break; + case PaddleDType::INT64: + dt = py::dtype::of(); + break; + case PaddleDType::FLOAT32: + dt = py::dtype::of(); + break; + default: + LOG(FATAL) << "unsupported dtype"; + } + return py::array(dt, {tensor.shape}, tensor.data.data()); +} +} // 
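PaddleTensorCreate above mirrors what a C++ caller assembles by hand: size a PaddleBuf, copy the raw bytes, then fill dtype, name, shape, and lod explicitly. A hedged sketch of the equivalent direct construction against the public inference header included earlier in this patch:

#include <cstring>
#include <string>
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

paddle::PaddleTensor MakeFloatTensor(const std::vector<float>& data,
                                     const std::vector<int>& shape,
                                     const std::string& name) {
  paddle::PaddleTensor tensor;
  // Copy semantics, matching PaddleTensorCreate(..., copy = true).
  paddle::PaddleBuf buf(data.size() * sizeof(float));
  std::memcpy(buf.data(), data.data(), buf.length());
  tensor.data = std::move(buf);
  tensor.dtype = paddle::PaddleDType::FLOAT32;
  tensor.name = name;
  tensor.shape = shape;
  // tensor.lod stays empty for dense (non-sequence) inputs.
  return tensor;
}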
namespace + void BindInferenceApi(py::module *m) { BindPaddleDType(m); BindPaddleBuf(m); @@ -71,6 +150,7 @@ void BindInferenceApi(py::module *m) { m->def("paddle_dtype_size", &paddle::PaddleDtypeSize); } +namespace { void BindPaddleDType(py::module *m) { py::enum_(*m, "PaddleDType") .value("FLOAT32", PaddleDType::FLOAT32) @@ -86,23 +166,39 @@ void BindPaddleBuf(py::module *m) { std::memcpy(buf.data(), static_cast(data.data()), buf.length()); return buf; })) - .def(py::init([](std::vector &data) { - auto buf = PaddleBuf(data.size() * sizeof(int64_t)); - std::memcpy(buf.data(), static_cast(data.data()), buf.length()); - return buf; - })) + .def(py::init(&PaddleBufCreate)) + .def(py::init(&PaddleBufCreate)) + .def(py::init(&PaddleBufCreate)) .def("resize", &PaddleBuf::Resize) .def("reset", [](PaddleBuf &self, std::vector &data) { self.Resize(data.size() * sizeof(float)); std::memcpy(self.data(), data.data(), self.length()); }) - .def("reset", - [](PaddleBuf &self, std::vector &data) { - self.Resize(data.size() * sizeof(int64_t)); - std::memcpy(self.data(), data.data(), self.length()); - }) + .def("reset", &PaddleBufReset) + .def("reset", &PaddleBufReset) + .def("reset", &PaddleBufReset) .def("empty", &PaddleBuf::empty) + .def("tolist", + [](PaddleBuf &self, const std::string &dtype) -> py::list { + py::list l; + if (dtype == "int32") { + auto *data = static_cast(self.data()); + auto size = self.length() / sizeof(int32_t); + l = py::cast(std::vector(data, data + size)); + } else if (dtype == "int64") { + auto *data = static_cast(self.data()); + auto size = self.length() / sizeof(int64_t); + l = py::cast(std::vector(data, data + size)); + } else if (dtype == "float32") { + auto *data = static_cast(self.data()); + auto size = self.length() / sizeof(float); + l = py::cast(std::vector(data, data + size)); + } else { + LOG(FATAL) << "unsupported dtype"; + } + return l; + }) .def("float_data", [](PaddleBuf &self) -> std::vector { auto *data = static_cast(self.data()); @@ -124,6 +220,19 @@ void BindPaddleBuf(py::module *m) { void BindPaddleTensor(py::module *m) { py::class_(*m, "PaddleTensor") .def(py::init<>()) + .def(py::init(&PaddleTensorCreate), py::arg("data"), + py::arg("name") = "", + py::arg("lod") = std::vector>(), + py::arg("copy") = true) + .def(py::init(&PaddleTensorCreate), py::arg("data"), + py::arg("name") = "", + py::arg("lod") = std::vector>(), + py::arg("copy") = true) + .def(py::init(&PaddleTensorCreate), py::arg("data"), + py::arg("name") = "", + py::arg("lod") = std::vector>(), + py::arg("copy") = true) + .def("as_ndarray", &PaddleTensorGetData) .def_readwrite("name", &PaddleTensor::name) .def_readwrite("shape", &PaddleTensor::shape) .def_readwrite("data", &PaddleTensor::data) @@ -199,6 +308,7 @@ void BindAnalysisConfig(py::module *m) { py::enum_(analysis_config, "Precision") .value("Float32", AnalysisConfig::Precision::kFloat32) .value("Int8", AnalysisConfig::Precision::kInt8) + .value("Half", AnalysisConfig::Precision::kHalf) .export_values(); analysis_config.def(py::init()) @@ -226,6 +336,9 @@ void BindAnalysisConfig(py::module *m) { .def("switch_ir_optim", &AnalysisConfig::SwitchIrOptim, py::arg("x") = true) .def("ir_optim", &AnalysisConfig::ir_optim) + .def("enable_memory_optim", &AnalysisConfig::EnableMemoryOptim) + .def("enable_profile", &AnalysisConfig::EnableProfile) + .def("set_optim_cache_dir", &AnalysisConfig::SetOptimCacheDir) .def("switch_use_feed_fetch_ops", &AnalysisConfig::SwitchUseFeedFetchOps, py::arg("x") = true) .def("use_feed_fetch_ops_enabled", @@ -311,6 
+424,6 @@ void BindAnalysisPredictor(py::module *m) { .def("SaveOptimModel", &AnalysisPredictor::SaveOptimModel, py::arg("dir")); } - +} // namespace } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 6dd6c842..64413685 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -46,10 +46,12 @@ limitations under the License. */ #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/pybind/box_helper_py.h" #include "paddle/fluid/pybind/const_value.h" #include "paddle/fluid/pybind/data_set_py.h" #include "paddle/fluid/pybind/exception.h" @@ -61,13 +63,12 @@ limitations under the License. */ #ifndef _WIN32 #include "paddle/fluid/pybind/nccl_wrapper_py.h" #endif +#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/pybind/protobuf.h" #include "paddle/fluid/pybind/pybind.h" // NOLINT #include "paddle/fluid/pybind/reader_py.h" -#include "paddle/fluid/pybind/recordio.h" #include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/string/to_string.h" - #ifdef PADDLE_WITH_CUDA #ifndef _WIN32 #include "paddle/fluid/operators/nccl/nccl_gpu_common.h" @@ -85,6 +86,10 @@ limitations under the License. */ DEFINE_bool(reader_queue_speed_test_mode, false, "If set true, the queue.pop will only get data from queue but not " "remove the data from queue for speed testing"); +DECLARE_bool(use_mkldnn); +#ifdef PADDLE_WITH_NGRAPH +DECLARE_bool(use_ngraph); +#endif // disable auto conversion to list in Python PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray); @@ -188,6 +193,8 @@ PYBIND11_MODULE(core_noavx, m) { m.add_object("_cleanup", py::capsule([]() { ScopePool::Instance().Clear(); })); + m.def("_set_paddle_lib_path", &paddle::platform::dynload::SetPaddleLibPath); + BindImperative(&m); py::class_(m, "Tensor", py::buffer_protocol()) @@ -212,6 +219,10 @@ PYBIND11_MODULE(core_noavx, m) { [](Tensor &self, paddle::platform::CPUPlace &place) { self.mutable_data(place); }) + .def("_alloc_double", + [](Tensor &self, paddle::platform::CPUPlace &place) { + self.mutable_data(place); + }) .def("_alloc_int", [](Tensor &self, paddle::platform::CPUPlace &place) { self.mutable_data(place); @@ -331,8 +342,8 @@ PYBIND11_MODULE(core_noavx, m) { recursive_sequence_lengths.end(), std::back_inserter(new_lod)); LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod); - PADDLE_ENFORCE( - CheckLoD(new_offset_lod, -1), + PADDLE_ENFORCE_EQ( + CheckLoD(new_offset_lod, -1), true, "the provided recursive_sequence_lengths info is invalid"); new (&instance) LoDTensor(new_offset_lod); }) @@ -348,8 +359,9 @@ PYBIND11_MODULE(core_noavx, m) { LoD new_lod; new_lod.reserve(lod.size()); std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); - PADDLE_ENFORCE(CheckLoD(new_lod, vectorize(self.dims()).front()), - "the provided lod info is invalid"); + PADDLE_ENFORCE_EQ( + CheckLoD(new_lod, vectorize(self.dims()).front()), true, + "the provided lod info is invalid"); self.set_lod(new_lod); }, py::arg("lod"), R"DOC( @@ -379,8 +391,8 @@ PYBIND11_MODULE(core_noavx, m) { recursive_sequence_lengths.end(), std::back_inserter(new_lod)); LoD new_offset_lod = 
ConvertToOffsetBasedLoD(new_lod); - PADDLE_ENFORCE( - CheckLoD(new_offset_lod, vectorize(self.dims()).front()), + PADDLE_ENFORCE_EQ( + CheckLoD(new_offset_lod, vectorize(self.dims()).front()), true, "the provided recursive_sequence_lengths info is invalid"); self.set_lod(new_offset_lod); }, @@ -487,10 +499,24 @@ PYBIND11_MODULE(core_noavx, m) { Returns: out (Tensor): new Tensor(NOT LoDTensor). )DOC") - .def("__str__", [](const LoDTensor &self) { - std::stringstream ostr; - ostr << self; - return ostr.str(); + .def("__str__", + [](const LoDTensor &self) { + std::stringstream ostr; + ostr << self; + return ostr.str(); + }) + .def("_copy", [](const LoDTensor &self, const platform::Place &place) { + // follow fetch_op's implementation + LoDTensor dst; + if (self.IsInitialized() && self.numel() > 0) { + TensorCopySync(self, place, &dst); + } else { + // Do not copy if the src tensor is empty. + dst.clear(); + dst.Resize({0}); + } + dst.set_lod(self.lod()); + return dst; }); py::class_(m, "SelectedRows") @@ -567,7 +593,7 @@ All parameter, weight, gradient are variables in Paddle. #endif .def("get_reader", [](Variable &self) -> framework::ReaderHolder * { - PADDLE_ENFORCE(self.IsType()); + PADDLE_ENFORCE_EQ(self.IsType(), true); return self.GetMutable(); }, py::return_value_policy::reference); @@ -692,8 +718,8 @@ All parameter, weight, gradient are variables in Paddle. auto &info = iter.second; if (info.HasOpProtoAndChecker()) { std::string str; - PADDLE_ENFORCE( - info.Proto().SerializeToString(&str), + PADDLE_ENFORCE_EQ( + info.Proto().SerializeToString(&str), true, "Serialize OpProto Error. This could be a bug of Paddle."); ret_values.emplace_back(str); } @@ -716,16 +742,32 @@ All parameter, weight, gradient are variables in Paddle. [](std::unique_ptr &p) { return p.release(); }); return std::make_pair(grad_op_desc_ptrs, grad_to_var); }); + m.def("has_grad_op_maker", [](const std::string op_type) { + return framework::OpInfoMap::Instance().Get(op_type).HasGradOpMaker(); + }); + m.def("has_infer_inplace", [](const std::string op_type) { + return framework::OpInfoMap::Instance().Get(op_type).HasInferInplace(); + }); + m.def("get_flags_use_mkldnn", []() { return FLAGS_use_mkldnn; }); +#ifdef PADDLE_WITH_NGRAPH + m.def("get_flags_use_ngraph", []() { return FLAGS_use_ngraph; }); +#endif + m.def("prune", [](const ProgramDesc &origin, + const std::set &feeded_var_names, const std::vector> &targets) { ProgramDesc prog_with_targets(origin); + for (const auto &t : targets) { prog_with_targets.MutableBlock(t[0])->Op(t[1])->SetIsTarget(true); } proto::ProgramDesc pruned_desc; - Prune(*prog_with_targets.Proto(), &pruned_desc); + Prune(*prog_with_targets.Proto(), feeded_var_names, &pruned_desc); return new ProgramDesc(pruned_desc); }); + m.def("prune_backward", [](const framework::ProgramDesc &program) { + return PruneBackward(program); + }); m.def("empty_var_name", []() { return std::string(framework::kEmptyVarName); }); m.def("grad_var_suffix", @@ -908,16 +950,17 @@ All parameter, weight, gradient are variables in Paddle. 
}); py::class_(m, "Operator") - .def_static("create", - [](py::bytes protobin) { - proto::OpDesc desc; - PADDLE_ENFORCE(desc.ParsePartialFromString(protobin), - "Cannot parse user input to OpDesc"); - PADDLE_ENFORCE(desc.IsInitialized(), - "User OpDesc is not initialized, reason %s", - desc.InitializationErrorString()); - return OpRegistry::CreateOp(desc); - }) + .def_static( + "create", + [](py::bytes protobin) { + proto::OpDesc desc; + PADDLE_ENFORCE_EQ(desc.ParsePartialFromString(protobin), true, + "Cannot parse user input to OpDesc"); + PADDLE_ENFORCE_EQ(desc.IsInitialized(), true, + "User OpDesc is not initialized, reason %s", + desc.InitializationErrorString()); + return OpRegistry::CreateOp(desc); + }) .def("run", [](OperatorBase &self, const Scope &scope, const platform::CPUPlace &place) { self.Run(scope, place); }) @@ -1065,10 +1108,17 @@ All parameter, weight, gradient are variables in Paddle. t = fluid.LoDTensor() t.set(np.ndarray([5, 30]), fluid.CPUPlace()) arr.append(t) - )DOC"); - - m.def("IsInplace", - [](std::string op) -> bool { return operators::IsInplace(op); }); + )DOC") + .def("_move_to_list", + [](LoDTensorArray &self) -> py::list { + py::list res(self.size()); + for (size_t i = 0; i < self.size(); ++i) { + res[i] = py::cast(std::move(self[i])); + } + self.clear(); + return res; + }, + py::return_value_policy::take_ownership); m.def("op_support_gpu", OpSupportGPU); #ifdef PADDLE_WITH_CUDA @@ -1106,6 +1156,11 @@ All parameter, weight, gradient are variables in Paddle. return std::shared_ptr(std::move(pass)); }); + m.def("size_of_dtype", framework::SizeOfType); + + using VarQuantScale = + std::unordered_map>; + py::class_> pass(m, "Pass"); pass.def(py::init()) .def("has", &ir::Pass::Has) @@ -1120,6 +1175,20 @@ All parameter, weight, gradient are variables in Paddle. }) .def("set", [](ir::Pass &self, const std::string &name, int val) { self.Set(name, new int(val)); }) + .def("set", + [](ir::Pass &self, const std::string &name, + std::unordered_set set) { + self.Set(name, new std::unordered_set(set)); + }) + .def("set", + [](ir::Pass &self, const std::string &name, + std::unordered_set set) { + self.Set(name, new std::unordered_set(set)); + }) + .def("set", + [](ir::Pass &self, const std::string &name, VarQuantScale scales) { + self.Set(name, new VarQuantScale(scales)); + }) .def("type", &ir::Pass::Type) .def("apply", [](ir::Pass &self, std::shared_ptr graph) { self.Apply(graph.get()); @@ -1280,15 +1349,17 @@ All parameter, weight, gradient are variables in Paddle. "reduce_strategy", [](const BuildStrategy &self) { return self.reduce_; }, [](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) { - PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); + PADDLE_ENFORCE_EQ(!self.IsFinalized(), true, + "BuildStrategy is finalized."); self.reduce_ = strategy; }, - R"DOC(The type is STR, there are two reduce strategies in ParallelExecutor, - 'AllReduce' and 'Reduce'. If you want that all the parameters' - optimization are done on all devices independently, you should choose 'AllReduce'; - if you choose 'Reduce', all the parameters' optimization will be evenly distributed - to different devices, and then broadcast the optimized parameter to other devices. - In some models, `Reduce` is faster. Default 'AllReduce'. + R"DOC(The type is fluid.BuildStrategy.ReduceStrategy, there are two reduce + strategies in ParallelExecutor, AllReduce and Reduce. 
If you want + all the parameters' optimization to be done on all devices independently, + you should choose AllReduce; if you choose Reduce, all the parameters' + optimization will be evenly distributed to different devices, and the + optimized parameters will then be broadcast to other devices. + Default 'AllReduce'. Examples: .. code-block:: python @@ -1302,30 +1373,73 @@ All parameter, weight, gradient are variables in Paddle. [](const BuildStrategy &self) { return self.gradient_scale_; }, [](BuildStrategy &self, BuildStrategy::GradientScaleStrategy strategy) { - PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); + PADDLE_ENFORCE_EQ(!self.IsFinalized(), true, + "BuildStrategy is finalized."); self.gradient_scale_ = strategy; }, - R"DOC(The type is STR, there are three ways of defining :math:`loss@grad` in - ParallelExecutor, 'CoeffNumDevice', 'One' and 'Customized'. By default, - ParallelExecutor sets the :math:`loss@grad` according to the number of devices. - If you want to customize :math:`loss@grad`, you can choose 'Customized'. - Default 'CoeffNumDevice'. + R"DOC(The type is fluid.BuildStrategy.GradientScaleStrategy, there are three + ways of defining :math:`loss@grad` in ParallelExecutor, CoeffNumDevice, + One and Customized. By default, ParallelExecutor sets the :math:`loss@grad` + according to the number of devices. If you want to customize :math:`loss@grad`, + you can choose Customized. Default 'CoeffNumDevice'. Examples: .. code-block:: python import paddle.fluid as fluid + import paddle.fluid.compiler as compiler + import numpy + import os + + use_cuda = True + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + + # NOTE: If you use CPU to run the program, you need + # to specify the CPU_NUM; otherwise, fluid will use + # all the logical cores as the CPU_NUM. In that case, + # the batch size of the input should be greater than + # CPU_NUM; if not, the process will fail with + # an exception. 
+ if not use_cuda: + os.environ['CPU_NUM'] = str(2) + places = fluid.cpu_places() + else: + places = fluid.cuda_places() + + data = fluid.layers.data(name='X', shape=[1], dtype='float32') + hidden = fluid.layers.fc(input=data, size=10) + loss = fluid.layers.mean(hidden) + fluid.optimizer.SGD(learning_rate=0.01).minimize(loss) + + fluid.default_startup_program().random_seed=1 + exe.run(fluid.default_startup_program()) + build_strategy = fluid.BuildStrategy() - build_strategy.gradient_scale_strategy = True + build_strategy.gradient_scale_strategy = \ + fluid.BuildStrategy.GradientScaleStrategy.Customized + compiled_prog = compiler.CompiledProgram( + fluid.default_main_program()).with_data_parallel( + loss_name=loss.name, build_strategy=build_strategy, + places = places) + + dev_count = len(places) + x = numpy.random.random(size=(10, 1)).astype('float32') + loss_grad = numpy.ones((dev_count)).astype("float32") * 0.01 + loss_grad_name = loss.name+"@GRAD" + loss_data = exe.run(compiled_prog, + feed={"X": x, loss_grad_name : loss_grad}, + fetch_list=[loss.name, loss_grad_name]) )DOC") .def_property( "debug_graphviz_path", [](const BuildStrategy &self) { return self.debug_graphviz_path_; }, [](BuildStrategy &self, const std::string &path) { - PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); + PADDLE_ENFORCE_EQ(!self.IsFinalized(), true, + "BuildStrategy is finalized."); self.debug_graphviz_path_ = path; }, - R"DOC(The type is STR, debug_graphviz_path indicate the path that + R"DOC(The type is STR, debug_graphviz_path indicates the path that writing the SSA Graph to file in the form of graphviz. It is useful for debugging. Default "" @@ -1334,7 +1448,8 @@ All parameter, weight, gradient are variables in Paddle. import paddle.fluid as fluid build_strategy = fluid.BuildStrategy() - build_strategy.debug_graphviz_path = "" + build_strategy.debug_graphviz_path = "./graph" + )DOC") .def_property( "enable_sequential_execution", [](const BuildStrategy &self) { return self.enable_sequential_execution_; }, [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); + PADDLE_ENFORCE_EQ(!self.IsFinalized(), true, + "BuildStrategy is finalized."); self.enable_sequential_execution_ = b; }, - R"DOC(The type is BOOL. If set True, the execution order of ops would be the same as what is in the program. Default False. + R"DOC(The type is BOOL. If set True, the execution order of ops would + be the same as what is in the program. Default False. Examples: .. code-block:: python @@ -1360,10 +1477,12 @@ All parameter, weight, gradient are variables in Paddle. return self.remove_unnecessary_lock_; }, [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); + PADDLE_ENFORCE_EQ(!self.IsFinalized(), true, + "BuildStrategy is finalized."); self.remove_unnecessary_lock_ = b; }, - R"DOC(The type is BOOL. If set True, some locks in GPU ops would be released and ParallelExecutor would run faster. Default True. + R"DOC(The type is BOOL. If set True, some locks in GPU ops would be + released and ParallelExecutor would run faster. Default True. Examples: .. code-block:: python @@ -1420,7 +1539,8 @@ All parameter, weight, gradient are variables in Paddle. 
return self.fuse_elewise_add_act_ops_; }, [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); + PADDLE_ENFORCE_EQ(!self.IsFinalized(), true, + "BuildStrategy is finalized."); self.fuse_elewise_add_act_ops_ = b; }, R"DOC(The type is BOOL, fuse_elewise_add_act_ops indicate whether @@ -1440,7 +1560,8 @@ All parameter, weight, gradient are variables in Paddle. return self.fuse_relu_depthwise_conv_; }, [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); + PADDLE_ENFORCE_EQ(!self.IsFinalized(), true, + "BuildStrategy is finalized."); self.fuse_relu_depthwise_conv_ = b; }, R"DOC(The type is BOOL, fuse_relu_depthwise_conv indicate whether @@ -1456,14 +1577,17 @@ All parameter, weight, gradient are variables in Paddle. build_strategy = fluid.BuildStrategy() build_strategy.fuse_relu_depthwise_conv = True )DOC") - .def_property( - "fuse_broadcast_ops", - [](const BuildStrategy &self) { return self.fuse_broadcast_ops_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); - self.fuse_broadcast_ops_ = b; - }, - R"DOC(The type is BOOL, fuse_broadcast_op indicates whether + .def_property("fuse_broadcast_ops", + [](const BuildStrategy &self) { + return self.fuse_broadcast_ops_ == true || + self.fuse_broadcast_ops_ == boost::none; + }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE_EQ(!self.IsFinalized(), true, + "BuildStrategy is finalized."); + self.fuse_broadcast_ops_ = b; + }, + R"DOC(The type is BOOL, fuse_broadcast_ops indicates whether to fuse the broadcast ops. Note that, in Reduce mode, fusing broadcast ops may make the program faster. Because fusing broadcast OP equals delaying the execution of all @@ -1471,18 +1595,20 @@ All parameter, weight, gradient are variables in Paddle. for NCCLReduce operations for a period of time. Default False.)DOC") .def_property("fuse_all_optimizer_ops", [](const BuildStrategy &self) { - return self.fuse_all_optimizer_ops_; + return self.fuse_all_optimizer_ops_ == true || + self.fuse_all_optimizer_ops_ == boost::none; }, [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE(!self.IsFinalized(), - "BuildStrategy is finlaized."); + PADDLE_ENFORCE_EQ(!self.IsFinalized(), true, + "BuildStrategy is finalized."); self.fuse_all_optimizer_ops_ = b; }) .def_property( "sync_batch_norm", [](const BuildStrategy &self) { return self.sync_batch_norm_; }, [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); + PADDLE_ENFORCE_EQ(!self.IsFinalized(), true, + "BuildStrategy is finalized."); self.sync_batch_norm_ = b; }, R"DOC(The type is BOOL, sync_batch_norm indicates whether to use @@ -1503,17 +1629,31 @@ All parameter, weight, gradient are variables in Paddle. 
)DOC") .def_property( "memory_optimize", - [](const BuildStrategy &self) { return self.memory_optimize_; }, - [](BuildStrategy &self, bool b) { self.memory_optimize_ = b; }, - R"DOC(The type is BOOL, memory opitimize aims to save total memory + [](const BuildStrategy &self) -> py::object { + if (self.memory_optimize_) { + return py::cast(self.memory_optimize_.get()); + } else { + return py::cast(nullptr); + } + }, + [](BuildStrategy &self, const py::handle &value) { + auto *py_obj = value.ptr(); + if (py_obj == nullptr || py_obj == Py_None) { + self.memory_optimize_ = boost::none; + } else if (PyBool_Check(py_obj)) { + self.memory_optimize_ = (py_obj == Py_True); + } else { + PADDLE_THROW( + "BuildStrategy.memory_optimize must be None, False or True"); + } + }, + R"DOC(The type is BOOL or None, memory opitimize aims to save total memory consumption, set to True to enable it. - Memory Optimize is our experimental feature, some variables - may be reused/removed by optimize strategy. If you need to - fetch some variable values when using this feature, please - set the persistable property of the variables to True. - - Default False)DOC") + Default None. None means framework would choose to use or not use + this strategy automatically. Currently, None means that it is + enabled when GC is disabled, and disabled when GC is enabled. + True means enabling and False means disabling. Default None.)DOC") .def_property( "is_distribution", [](const BuildStrategy &self) { return self.is_distribution_; }, @@ -1533,16 +1673,12 @@ All parameter, weight, gradient are variables in Paddle. "enable_inplace", [](const BuildStrategy &self) { return self.enable_inplace_; }, [](BuildStrategy &self, bool b) { self.enable_inplace_ = b; }) - .def_property("_use_legacy_memory_optimize_strategy", - [](const BuildStrategy &self) { - return self.use_legacy_memory_optimize_strategy_; - }, - [](BuildStrategy &self, bool b) { - self.use_legacy_memory_optimize_strategy_ = b; - }) .def_property( "fuse_all_reduce_ops", - [](const BuildStrategy &self) { return self.fuse_all_reduce_ops_; }, + [](const BuildStrategy &self) { + return self.fuse_all_reduce_ops_ == true || + self.fuse_all_reduce_ops_ == boost::none; + }, [](BuildStrategy &self, bool b) { self.fuse_all_reduce_ops_ = b; }) .def_property("enable_backward_optimizer_op_deps", [](const BuildStrategy &self) { @@ -1593,14 +1729,13 @@ All parameter, weight, gradient are variables in Paddle. .def("feed_and_split_tensor_into_local_scopes", &ParallelExecutor::FeedAndSplitTensorIntoLocalScopes) .def("run", [](ParallelExecutor &self, - const std::vector &fetch_tensors, - const std::string &fetched_var_name) { + const std::vector &fetch_tensors) { pybind11::gil_scoped_release release; - self.Run(fetch_tensors, fetched_var_name); + return self.Run(fetch_tensors); }); - BindRecordIOWriter(&m); BindFleetWrapper(&m); + BindBoxHelper(&m); #ifndef _WIN32 BindNCCLWrapper(&m); #endif diff --git a/paddle/fluid/pybind/pybind.h b/paddle/fluid/pybind/pybind.h deleted file mode 100644 index 40ed4ea0..00000000 --- a/paddle/fluid/pybind/pybind.h +++ /dev/null @@ -1,553 +0,0 @@ -#ifdef PYBIND_AVX_MKLML -// Generated by the paddle/fluid/operator/CMakeLists.txt. DO NOT EDIT! 
- -USE_NO_KERNEL_OP(feed); -USE_NO_KERNEL_OP(while); -USE_NO_KERNEL_OP(get_places); -USE_NO_KERNEL_OP(fetch); -USE_NO_KERNEL_OP(conditional_block_infer); -USE_NO_KERNEL_OP(conditional_block); -USE_OP(less_than); -USE_OP(logical_and); -USE_NO_KERNEL_OP(read_from_array); -USE_CPU_ONLY_OP(bipartite_match); -USE_OP(box_coder); -USE_OP(iou_similarity); -USE_CPU_ONLY_OP(mine_hard_examples); -USE_CPU_ONLY_OP(multiclass_nms); -USE_OP(prior_box); -USE_OP(density_prior_box); -USE_OP(anchor_generator); -USE_OP(target_assign); -USE_OP(polygon_box_transform); -USE_CPU_ONLY_OP(rpn_target_assign); -USE_CPU_ONLY_OP(generate_proposal_labels); -USE_OP(box_clip); -USE_CPU_ONLY_OP(yolov3_loss); -USE_OP(yolo_box); -USE_OP(box_decoder_and_assign); -USE_OP(sigmoid_focal_loss); -USE_CPU_ONLY_OP(retinanet_detection_output); -USE_CPU_ONLY_OP(generate_proposals); -USE_CPU_ONLY_OP(distribute_fpn_proposals); -USE_CPU_ONLY_OP(collect_fpn_proposals); -USE_OP(roi_perspective_transform); -USE_CPU_ONLY_OP(generate_mask_labels); -USE_OP(elementwise_mod); -USE_OP(elementwise_floordiv); -USE_OP(elementwise_max); -USE_OP(elementwise_pow); -USE_OP(elementwise_sub_grad); -USE_OP(elementwise_add_grad); -USE_OP(elementwise_min); -USE_OP(elementwise_div); -USE_OP(elementwise_mul); -USE_CPU_ONLY_OP(fusion_squared_mat_sub); -USE_CPU_ONLY_OP(fusion_seqpool_concat); -USE_CPU_ONLY_OP(fused_embedding_fc_lstm); -USE_CPU_ONLY_OP(fusion_seqexpand_concat_fc); -USE_CPU_ONLY_OP(fused_embedding_seq_pool); -USE_CPU_ONLY_OP(fusion_seqconv_eltadd_relu); -USE_CPU_ONLY_OP(fusion_gru); -USE_CPU_ONLY_OP(fusion_repeated_fc_relu); -USE_CPU_ONLY_OP(fusion_lstm); -USE_OP(fused_elemwise_activation); -USE_OP(accuracy); -USE_CPU_ONLY_OP(precision_recall); -USE_CPU_ONLY_OP(auc); -USE_OP(adamax); -USE_OP(sgd); -USE_OP(lars_momentum); -USE_OP(adagrad); -USE_OP(ftrl); -USE_OP(momentum); -USE_OP(adadelta); -USE_OP(rmsprop); -USE_OP(lamb); -USE_OP(proximal_gd); -USE_OP(proximal_adagrad); -USE_OP(adam); -USE_OP(decayed_adagrad); -USE_OP(reduce_all); -USE_OP(reduce_min); -USE_OP(reduce_sum); -USE_OP(reduce_any); -USE_OP(reduce_max); -USE_OP(reduce_mean); -USE_OP(reduce_prod); -USE_OP(sequence_erase); -USE_OP(sequence_unpad); -USE_OP(sequence_mask); -USE_OP(sequence_expand); -USE_OP(sequence_pad); -USE_OP(sequence_enumerate); -USE_OP(sequence_slice); -USE_OP(sequence_softmax); -USE_OP(sequence_expand_as); -USE_OP(sequence_pool); -USE_OP(sequence_reverse); -USE_CPU_ONLY_OP(sequence_scatter); -USE_OP(sequence_conv); -USE_OP(sequence_concat); -USE_OP(sequence_reshape); -USE_NO_KERNEL_OP(open_files); -USE_NO_KERNEL_OP(create_random_data_generator); -USE_NO_KERNEL_OP(create_shuffle_reader); -USE_NO_KERNEL_OP(create_batch_reader); -USE_NO_KERNEL_OP(create_recordio_file_reader); -USE_NO_KERNEL_OP(create_double_buffer_reader); -USE_NO_KERNEL_OP(create_multi_pass_reader); -USE_NO_KERNEL_OP(create_custom_reader); -USE_NO_KERNEL_OP(create_py_reader); -USE_NO_KERNEL_OP(read); -USE_OP(increment); -USE_OP(stack); -USE_CPU_ONLY_OP(fc); -USE_NO_KERNEL_OP(assign); -USE_OP(load); -USE_NO_KERNEL_OP(fill); -USE_NO_KERNEL_OP(reorder_lod_tensor_by_rank); -USE_OP(conv_shift); -USE_OP(fill_zeros_like); -USE_CPU_ONLY_OP(hash); -USE_NO_KERNEL_OP(dequantize); -USE_OP(fake_quantize_abs_max); -USE_OP(size); -USE_OP(scatter); -USE_OP(uniform_random); -USE_OP(beam_search); -USE_NO_KERNEL_OP(beam_search_decode); -USE_OP(dropout); -USE_OP(bilinear_interp); -USE_OP(sampling_id); -USE_OP(lstm); -USE_OP(modified_huber_loss); -USE_OP(temporal_shift); -USE_OP(sum); -USE_OP(arg_min); -USE_OP(psroi_pool); 
-USE_NO_KERNEL_OP(uniform_random_batch_size_like); -USE_NO_KERNEL_OP(rnn_memory_helper); -USE_CPU_ONLY_OP(crf_decoding); -USE_OP(where); -USE_OP(fake_dequantize_max_abs); -USE_OP(mean_iou); -USE_OP(roi_align); -USE_OP(range); -USE_OP(edit_distance); -USE_OP(multiplex); -USE_OP(clip); -USE_OP(gaussian_random); -USE_OP(norm); -USE_OP(rank_loss); -USE_CPU_ONLY_OP(detection_map); -USE_OP(lstm_unit); -USE_OP(shard_index); -USE_OP(shape); -USE_OP(arg_max); -USE_OP(average_accumulates); -USE_NO_KERNEL_OP(requantize); -USE_OP(conv2d); -USE_CPU_ONLY_OP(add_position_encoding); -USE_OP(gru_unit); -USE_OP(batch_norm); -USE_CPU_ONLY_OP(chunk_eval); -USE_NO_KERNEL_OP(lod_rank_table); -USE_NO_KERNEL_OP(unsqueeze); -USE_CPU_ONLY_OP(positive_negative_pair); -USE_OP(im2sequence); -USE_OP(margin_rank_loss); -USE_OP(hinge_loss); -USE_CPU_ONLY_OP(cvm); -USE_OP(huber_loss); -USE_OP(crop); -USE_OP(relu_grad); -USE_CPU_ONLY_OP(hierarchical_sigmoid); -USE_OP(unfold); -USE_NO_KERNEL_OP(max_sequence_len); -USE_OP(mul); -USE_CPU_ONLY_OP(attention_lstm); -USE_OP(top_k); -USE_OP(group_norm); -USE_OP(selu); -USE_OP(lstmp); -USE_NO_KERNEL_OP(merge_lod_tensor); -USE_OP(truncated_gaussian_random); -USE_OP(label_smooth); -USE_CPU_ONLY_OP(matmul); -USE_OP(spp); -USE_NO_KERNEL_OP(unstack); -USE_OP(conv2d_transpose); -USE_OP(diag); -USE_OP(unpool); -USE_NO_KERNEL_OP(lod_array_length); -USE_OP(affine_channel); -USE_OP(log_loss); -USE_OP(concat); -USE_NO_KERNEL_OP(lod_tensor_to_array); -USE_OP(gru); -USE_CPU_ONLY_OP(coalesce_tensor); -USE_OP(fsp); -USE_OP(linspace); -USE_OP(reverse); -USE_NO_KERNEL_OP(recurrent); -USE_OP(split_selected_rows); -USE_OP(dgc_clip_by_norm); -USE_OP(scale); -USE_OP(save); -USE_OP(load_combine); -USE_OP(merge_selected_rows); -USE_OP(split); -USE_OP(cumsum); -USE_OP(deformable_psroi_pooling); -USE_CPU_ONLY_OP(teacher_student_sigmoid_loss); -USE_OP(transpose); -USE_OP(fill_constant_batch_size_like); -USE_OP(sigmoid_cross_entropy_with_logits); -USE_OP(shuffle_channel); -USE_CPU_ONLY_OP(affine_grid); -USE_NO_KERNEL_OP(split_lod_tensor); -USE_CPU_ONLY_OP(grid_sampler); -USE_OP(lookup_table); -USE_OP(cos_sim); -USE_NO_KERNEL_OP(quantize); -USE_OP(spectral_norm); -USE_OP(cross_entropy); -USE_NO_KERNEL_OP(print); -USE_OP(lrn); -USE_CPU_ONLY_OP(nce); -USE_CPU_ONLY_OP(similarity_focus); -USE_CPU_ONLY_OP(get_tensor_from_selected_rows); -USE_OP(squared_l2_distance); -USE_OP(cudnn_lstm); -USE_OP(tree_conv); -USE_OP(one_hot); -USE_NO_KERNEL_OP(lookup_sparse_table); -USE_CPU_ONLY_OP(unique); -USE_OP(mean); -USE_OP(prelu); -USE_NO_KERNEL_OP(delete_var); -USE_OP(ctc_align); -USE_OP(argsort); -USE_CPU_ONLY_OP(data_norm); -USE_OP(minus); -USE_NO_KERNEL_OP(shrink_rnn_memory); -USE_OP(lod_reset); -USE_OP(l1_norm); -USE_NO_KERNEL_OP(gaussian_random_batch_size_like); -USE_OP(is_empty); -USE_OP(bilinear_tensor_product); -USE_OP(kldiv_loss); -USE_NO_KERNEL_OP(squeeze); -USE_OP(softmax); -USE_OP(clip_by_norm); -USE_OP(max_pool2d_with_index); -USE_OP(linear_chain_crf); -USE_CPU_ONLY_OP(reshape); -USE_OP(fill_constant); -USE_OP(space_to_depth); -USE_OP(gather); -USE_OP(softmax_with_cross_entropy); -USE_OP(slice); -USE_OP(sign); -USE_OP(expand); -USE_OP(smooth_l1_loss); -USE_NO_KERNEL_OP(tensor_array_to_tensor); -USE_OP(row_conv); -USE_OP(pad2d); -USE_OP(pixel_shuffle); -USE_OP(assign_value); -USE_OP(random_crop); -USE_OP(squared_l2_norm); -USE_OP(save_combine); -USE_OP(pool2d); -USE_OP(cast); -USE_NO_KERNEL_OP(array_to_lod_tensor); -USE_OP(fill_any_like); -USE_NO_KERNEL_OP(flatten); -USE_OP(sample_logits); -USE_OP(pad); 
-USE_CPU_ONLY_OP(bpr_loss); -USE_OP(roi_pool); -USE_OP(pad_constant_like); -USE_OP(isfinite); -USE_OP(layer_norm); -USE_OP(maxout); -USE_OP(warpctc); -#elif defined PYBIND_NOAVX_OPENBLAS -// Generated by the paddle/fluid/operator/CMakeLists.txt. DO NOT EDIT! - -USE_NO_KERNEL_OP(feed); -USE_NO_KERNEL_OP(while); -USE_NO_KERNEL_OP(get_places); -USE_NO_KERNEL_OP(fetch); -USE_NO_KERNEL_OP(conditional_block_infer); -USE_NO_KERNEL_OP(conditional_block); -USE_OP(less_than); -USE_OP(logical_and); -USE_NO_KERNEL_OP(read_from_array); -USE_CPU_ONLY_OP(bipartite_match); -USE_OP(box_coder); -USE_OP(iou_similarity); -USE_CPU_ONLY_OP(mine_hard_examples); -USE_CPU_ONLY_OP(multiclass_nms); -USE_OP(prior_box); -USE_OP(density_prior_box); -USE_OP(anchor_generator); -USE_OP(target_assign); -USE_OP(polygon_box_transform); -USE_CPU_ONLY_OP(rpn_target_assign); -USE_CPU_ONLY_OP(generate_proposal_labels); -USE_OP(box_clip); -USE_CPU_ONLY_OP(yolov3_loss); -USE_OP(yolo_box); -USE_OP(box_decoder_and_assign); -USE_OP(sigmoid_focal_loss); -USE_CPU_ONLY_OP(retinanet_detection_output); -USE_CPU_ONLY_OP(generate_proposals); -USE_CPU_ONLY_OP(distribute_fpn_proposals); -USE_CPU_ONLY_OP(collect_fpn_proposals); -USE_OP(roi_perspective_transform); -USE_CPU_ONLY_OP(generate_mask_labels); -USE_OP(elementwise_mod); -USE_OP(elementwise_floordiv); -USE_OP(elementwise_max); -USE_OP(elementwise_pow); -USE_OP(elementwise_sub_grad); -USE_OP(elementwise_add_grad); -USE_OP(elementwise_min); -USE_OP(elementwise_div); -USE_OP(elementwise_mul); -USE_CPU_ONLY_OP(fusion_squared_mat_sub); -USE_CPU_ONLY_OP(fusion_seqpool_concat); -USE_CPU_ONLY_OP(fused_embedding_fc_lstm); -USE_CPU_ONLY_OP(fusion_seqexpand_concat_fc); -USE_CPU_ONLY_OP(fused_embedding_seq_pool); -USE_CPU_ONLY_OP(fusion_seqconv_eltadd_relu); -USE_CPU_ONLY_OP(fusion_gru); -USE_CPU_ONLY_OP(fusion_repeated_fc_relu); -USE_CPU_ONLY_OP(fusion_lstm); -USE_OP(fused_elemwise_activation); -USE_OP(accuracy); -USE_CPU_ONLY_OP(precision_recall); -USE_CPU_ONLY_OP(auc); -USE_OP(adamax); -USE_OP(sgd); -USE_OP(lars_momentum); -USE_OP(adagrad); -USE_OP(ftrl); -USE_OP(momentum); -USE_OP(adadelta); -USE_OP(rmsprop); -USE_OP(lamb); -USE_OP(proximal_gd); -USE_OP(proximal_adagrad); -USE_OP(adam); -USE_OP(decayed_adagrad); -USE_OP(reduce_all); -USE_OP(reduce_min); -USE_OP(reduce_sum); -USE_OP(reduce_any); -USE_OP(reduce_max); -USE_OP(reduce_mean); -USE_OP(reduce_prod); -USE_OP(sequence_erase); -USE_OP(sequence_unpad); -USE_OP(sequence_mask); -USE_OP(sequence_expand); -USE_OP(sequence_pad); -USE_OP(sequence_enumerate); -USE_OP(sequence_slice); -USE_OP(sequence_softmax); -USE_OP(sequence_expand_as); -USE_OP(sequence_pool); -USE_OP(sequence_reverse); -USE_CPU_ONLY_OP(sequence_scatter); -USE_OP(sequence_conv); -USE_OP(sequence_concat); -USE_OP(sequence_reshape); -USE_NO_KERNEL_OP(open_files); -USE_NO_KERNEL_OP(create_random_data_generator); -USE_NO_KERNEL_OP(create_shuffle_reader); -USE_NO_KERNEL_OP(create_batch_reader); -USE_NO_KERNEL_OP(create_recordio_file_reader); -USE_NO_KERNEL_OP(create_double_buffer_reader); -USE_NO_KERNEL_OP(create_multi_pass_reader); -USE_NO_KERNEL_OP(create_custom_reader); -USE_NO_KERNEL_OP(create_py_reader); -USE_NO_KERNEL_OP(read); -USE_OP(increment); -USE_OP(stack); -USE_CPU_ONLY_OP(fc); -USE_NO_KERNEL_OP(assign); -USE_OP(load); -USE_NO_KERNEL_OP(fill); -USE_NO_KERNEL_OP(reorder_lod_tensor_by_rank); -USE_OP(conv_shift); -USE_OP(fill_zeros_like); -USE_CPU_ONLY_OP(hash); -USE_NO_KERNEL_OP(dequantize); -USE_OP(fake_quantize_abs_max); -USE_OP(size); -USE_OP(scatter); 
-USE_OP(uniform_random); -USE_OP(beam_search); -USE_NO_KERNEL_OP(beam_search_decode); -USE_OP(dropout); -USE_OP(bilinear_interp); -USE_OP(sampling_id); -USE_OP(lstm); -USE_OP(modified_huber_loss); -USE_OP(temporal_shift); -USE_OP(sum); -USE_OP(arg_min); -USE_OP(psroi_pool); -USE_NO_KERNEL_OP(uniform_random_batch_size_like); -USE_NO_KERNEL_OP(rnn_memory_helper); -USE_CPU_ONLY_OP(crf_decoding); -USE_OP(where); -USE_OP(fake_dequantize_max_abs); -USE_OP(mean_iou); -USE_OP(roi_align); -USE_OP(range); -USE_OP(edit_distance); -USE_OP(multiplex); -USE_OP(clip); -USE_OP(gaussian_random); -USE_OP(norm); -USE_OP(rank_loss); -USE_CPU_ONLY_OP(detection_map); -USE_OP(lstm_unit); -USE_OP(shard_index); -USE_OP(shape); -USE_OP(arg_max); -USE_OP(average_accumulates); -USE_NO_KERNEL_OP(requantize); -USE_OP(conv2d); -USE_CPU_ONLY_OP(add_position_encoding); -USE_OP(gru_unit); -USE_OP(batch_norm); -USE_CPU_ONLY_OP(chunk_eval); -USE_NO_KERNEL_OP(lod_rank_table); -USE_NO_KERNEL_OP(unsqueeze); -USE_CPU_ONLY_OP(positive_negative_pair); -USE_OP(im2sequence); -USE_OP(margin_rank_loss); -USE_OP(hinge_loss); -USE_CPU_ONLY_OP(cvm); -USE_OP(huber_loss); -USE_OP(crop); -USE_OP(relu_grad); -USE_CPU_ONLY_OP(hierarchical_sigmoid); -USE_OP(unfold); -USE_NO_KERNEL_OP(max_sequence_len); -USE_OP(mul); -USE_CPU_ONLY_OP(attention_lstm); -USE_OP(top_k); -USE_OP(group_norm); -USE_OP(selu); -USE_OP(lstmp); -USE_NO_KERNEL_OP(merge_lod_tensor); -USE_OP(truncated_gaussian_random); -USE_OP(label_smooth); -USE_CPU_ONLY_OP(matmul); -USE_OP(spp); -USE_NO_KERNEL_OP(unstack); -USE_OP(conv2d_transpose); -USE_OP(diag); -USE_OP(unpool); -USE_NO_KERNEL_OP(lod_array_length); -USE_OP(affine_channel); -USE_OP(log_loss); -USE_OP(concat); -USE_NO_KERNEL_OP(lod_tensor_to_array); -USE_OP(gru); -USE_CPU_ONLY_OP(coalesce_tensor); -USE_OP(fsp); -USE_OP(linspace); -USE_OP(reverse); -USE_NO_KERNEL_OP(recurrent); -USE_OP(split_selected_rows); -USE_OP(dgc_clip_by_norm); -USE_OP(scale); -USE_OP(save); -USE_OP(load_combine); -USE_OP(merge_selected_rows); -USE_OP(split); -USE_OP(cumsum); -USE_OP(deformable_psroi_pooling); -USE_CPU_ONLY_OP(teacher_student_sigmoid_loss); -USE_OP(transpose); -USE_OP(fill_constant_batch_size_like); -USE_OP(sigmoid_cross_entropy_with_logits); -USE_OP(shuffle_channel); -USE_CPU_ONLY_OP(affine_grid); -USE_NO_KERNEL_OP(split_lod_tensor); -USE_CPU_ONLY_OP(grid_sampler); -USE_OP(lookup_table); -USE_OP(cos_sim); -USE_NO_KERNEL_OP(quantize); -USE_OP(spectral_norm); -USE_OP(cross_entropy); -USE_NO_KERNEL_OP(print); -USE_OP(lrn); -USE_CPU_ONLY_OP(nce); -USE_CPU_ONLY_OP(similarity_focus); -USE_CPU_ONLY_OP(get_tensor_from_selected_rows); -USE_OP(squared_l2_distance); -USE_OP(cudnn_lstm); -USE_OP(tree_conv); -USE_OP(one_hot); -USE_NO_KERNEL_OP(lookup_sparse_table); -USE_CPU_ONLY_OP(unique); -USE_OP(mean); -USE_OP(prelu); -USE_NO_KERNEL_OP(delete_var); -USE_OP(ctc_align); -USE_OP(argsort); -USE_CPU_ONLY_OP(data_norm); -USE_OP(minus); -USE_NO_KERNEL_OP(shrink_rnn_memory); -USE_OP(lod_reset); -USE_OP(l1_norm); -USE_NO_KERNEL_OP(gaussian_random_batch_size_like); -USE_OP(is_empty); -USE_OP(bilinear_tensor_product); -USE_OP(kldiv_loss); -USE_NO_KERNEL_OP(squeeze); -USE_OP(softmax); -USE_OP(clip_by_norm); -USE_OP(max_pool2d_with_index); -USE_OP(linear_chain_crf); -USE_CPU_ONLY_OP(reshape); -USE_OP(fill_constant); -USE_OP(space_to_depth); -USE_OP(gather); -USE_OP(softmax_with_cross_entropy); -USE_OP(slice); -USE_OP(sign); -USE_OP(expand); -USE_OP(smooth_l1_loss); -USE_NO_KERNEL_OP(tensor_array_to_tensor); -USE_OP(row_conv); -USE_OP(pad2d); 
-USE_OP(pixel_shuffle); -USE_OP(assign_value); -USE_OP(random_crop); -USE_OP(squared_l2_norm); -USE_OP(save_combine); -USE_OP(pool2d); -USE_OP(cast); -USE_NO_KERNEL_OP(array_to_lod_tensor); -USE_OP(fill_any_like); -USE_NO_KERNEL_OP(flatten); -USE_OP(sample_logits); -USE_OP(pad); -USE_CPU_ONLY_OP(bpr_loss); -USE_OP(roi_pool); -USE_OP(pad_constant_like); -USE_OP(isfinite); -USE_OP(layer_norm); -USE_OP(maxout); -USE_OP(warpctc); -#endif diff --git a/paddle/fluid/pybind/reader_py.cc b/paddle/fluid/pybind/reader_py.cc index 4c304e86..4009bcf2 100644 --- a/paddle/fluid/pybind/reader_py.cc +++ b/paddle/fluid/pybind/reader_py.cc @@ -18,6 +18,7 @@ #include #include #include +#include "Python.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/operators/reader/buffered_reader.h" #include "paddle/fluid/operators/reader/py_reader.h" @@ -27,6 +28,14 @@ namespace paddle { namespace pybind { +namespace py = pybind11; + +static void RaiseStopIterationException() { + VLOG(2) << "Raise StopIteration Exception in Python"; + py::gil_scoped_acquire guard; + throw py::stop_iteration(); +} + class MultiDeviceFeedReader { public: using ResultDictList = @@ -69,6 +78,7 @@ class MultiDeviceFeedReader { bool success = WaitFutures(); if (!success) { + RaiseStopIterationException(); return {}; } @@ -85,6 +95,7 @@ class MultiDeviceFeedReader { ResultList ReadNextList() { bool success = WaitFutures(); if (!success) { + RaiseStopIterationException(); return {}; } @@ -144,8 +155,6 @@ class MultiDeviceFeedReader { std::vector> ret_; }; -namespace py = pybind11; - void BindReader(py::module *module) { auto &m = *module; diff --git a/paddle/fluid/pybind/recordio.cc b/paddle/fluid/pybind/recordio.cc deleted file mode 100644 index 32caf4be..00000000 --- a/paddle/fluid/pybind/recordio.cc +++ /dev/null @@ -1,88 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
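With RaiseStopIterationException wired into ReadNext and ReadNextList in the reader_py.cc hunk above, an exhausted MultiDeviceFeedReader now surfaces in Python as an ordinary StopIteration instead of an empty result. A minimal consumption sketch; the read_next method name follows the reader bindings, while the setup that produces `reader` is elided and assumed:

.. code-block:: python

    # `reader` is assumed to be the object BindReader exposes on top of
    # MultiDeviceFeedReader (e.g. obtained through fluid's PyReader machinery).
    while True:
        try:
            batch = reader.read_next()
        except StopIteration:
            break  # the pass over the dataset is finished
        # ... feed `batch` to the executor here ...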
- -#include "paddle/fluid/pybind/recordio.h" - -#include -#include -#include - -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/recordio/writer.h" - -namespace paddle { -namespace pybind { - -namespace { - -class RecordIOWriter { - public: - RecordIOWriter(const std::string& filename, recordio::Compressor compressor, - size_t max_num_record) - : closed_(false), - stream_(filename, std::ios::binary), - writer_(&stream_, compressor, max_num_record) {} - - void AppendTensor(const framework::LoDTensor& tensor) { - tensors_.push_back(tensor); - } - - void CompleteAppendTensor() { - auto& ctx = - *platform::DeviceContextPool::Instance().Get(platform::CPUPlace()); - framework::WriteToRecordIO(&writer_, tensors_, ctx); - tensors_.clear(); - } - - void Close() { - PADDLE_ENFORCE(tensors_.empty()); - writer_.Flush(); - stream_.close(); - closed_ = true; - } - - ~RecordIOWriter() { - if (!closed_) { - Close(); - } - } - - private: - bool closed_; - std::vector tensors_; - std::ofstream stream_; - recordio::Writer writer_; -}; - -} // namespace - -void BindRecordIOWriter(py::module* m) { - py::class_ writer(*m, "RecordIOWriter", ""); - py::enum_(writer, "Compressor", "") - .value("Snappy", recordio::Compressor::kSnappy) - .value("NoCompress", recordio::Compressor::kNoCompress); - - writer - .def("__init__", - [](RecordIOWriter& self, const std::string& filename, - recordio::Compressor compressor, size_t max_num_record) { - new (&self) RecordIOWriter(filename, compressor, max_num_record); - }) - .def("append_tensor", &RecordIOWriter::AppendTensor) - .def("complete_append_tensor", &RecordIOWriter::CompleteAppendTensor) - .def("close", &RecordIOWriter::Close); -} - -} // namespace pybind -} // namespace paddle diff --git a/paddle/fluid/recordio/CMakeLists.txt b/paddle/fluid/recordio/CMakeLists.txt deleted file mode 100644 index 92e97a6c..00000000 --- a/paddle/fluid/recordio/CMakeLists.txt +++ /dev/null @@ -1,9 +0,0 @@ -# internal library. -cc_library(header SRCS header.cc) -cc_test(header_test SRCS header_test.cc DEPS header) -cc_library(chunk SRCS chunk.cc DEPS snappystream snappy header zlib) -cc_test(chunk_test SRCS chunk_test.cc DEPS chunk) -cc_library(writer SRCS writer.cc DEPS chunk) -cc_library(scanner SRCS scanner.cc DEPS chunk) -cc_test(writer_scanner_test SRCS writer_scanner_test.cc DEPS writer scanner) -cc_library(recordio DEPS chunk header writer scanner) diff --git a/paddle/fluid/recordio/README.md b/paddle/fluid/recordio/README.md deleted file mode 100644 index ef99c0cf..00000000 --- a/paddle/fluid/recordio/README.md +++ /dev/null @@ -1,13 +0,0 @@ -## Background - -The RecordIO file format is a container for records. This package is a C++ implementation of https://github.com/paddlepaddle/recordio, which originates from https://github.com/wangkuiyi/recordio. - -## Fault-tolerant Writing - -For the initial design purpose of RecordIO within Google, which was logging, RecordIO groups record into *chunks*, whose header contains an MD5 hash of the chunk. A process that writes logs is supposed to call the Writer interface to add records. Once the writer accumulates a handful of them, it groups a chunk, put the MD5 into the chunk header, and appends the chunk to the file. In the event the process crashes unexpected, the last chunk in the RecordIO file could be incomplete/corrupt. The RecordIO reader is able to recover from these errors when the process restarts by identifying incomplete chucks and skipping over them. 
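The recovery scheme this README describes is simple to state: each chunk carries a header with a record count and a checksum of its payload, so a reader that hits a truncated or corrupt trailing chunk can detect the mismatch and stop or skip cleanly. A toy sketch of that idea, assuming a simplified header layout (count, CRC32, payload size) rather than the exact on-disk format implemented in the C++ sources below:

.. code-block:: python

    import struct
    import zlib

    def write_chunk(stream, records):
        # Frame records as length-prefixed blobs; checksum the whole payload.
        payload = b"".join(struct.pack("<I", len(r)) + r for r in records)
        crc = zlib.crc32(payload) & 0xFFFFFFFF
        stream.write(struct.pack("<III", len(records), crc, len(payload)))
        stream.write(payload)

    def read_chunk(stream):
        # Returns the records, or None on EOF / an incomplete or corrupt chunk.
        header = stream.read(12)
        if len(header) < 12:
            return None
        num_records, crc, size = struct.unpack("<III", header)
        payload = stream.read(size)
        if len(payload) < size or zlib.crc32(payload) & 0xFFFFFFFF != crc:
            return None  # the crash-truncated chunk the README talks about
        records, offset = [], 0
        for _ in range(num_records):
            (rec_len,) = struct.unpack_from("<I", payload, offset)
            records.append(payload[offset + 4:offset + 4 + rec_len])
            offset += 4 + rec_len
        return records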
- -## Reading Ranges - -A side-effect of chunks is to make it easy to indexing records while reading, thus allows us to read a range of successive records. This is good for distributed log process, where each MapReduce task handles only part of records in a big RecordIO file. - -The procedure that creates the index starts from reading the header of the first chunk. It indexes the offset (0) and the size of the chunk, and skips to the header of the next chunk by calling the `fseek` API. Please be aware that most distributed filesystems and all POSIX-compatible local filesystem provides `fseek`, and makes sure that `fseek` runs much faster than `fread`. This procedure generates a map from chunks to their offsets, which allows the readers is to locate and read a range of records. diff --git a/paddle/fluid/recordio/chunk.cc b/paddle/fluid/recordio/chunk.cc deleted file mode 100644 index 6c65d916..00000000 --- a/paddle/fluid/recordio/chunk.cc +++ /dev/null @@ -1,174 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/recordio/chunk.h" - -#include -#include -#include -#include - -#include "paddle/fluid/platform/enforce.h" -#include "snappystream.hpp" - -namespace paddle { -namespace recordio { -constexpr size_t kMaxBufSize = 1024; - -/** - * Read Stream by a fixed sized buffer. - * @param in input stream - * @param limit read at most `limit` bytes from input stream. 0 means no limit - * @param callback A function object with (const char* buf, size_t size) -> void - * as its type. - */ -template -static void ReadStreamByBuf(std::istream& in, size_t limit, Callback callback) { - char buf[kMaxBufSize]; - std::streamsize actual_size; - size_t counter = 0; - size_t actual_max; - while (!in.eof() || - (limit != 0 && counter >= limit)) { // End of file or reach limit - actual_max = - limit != 0 ? std::min(limit - counter, kMaxBufSize) : kMaxBufSize; - in.read(buf, actual_max); - actual_size = in.gcount(); - if (actual_size == 0) { - break; - } - callback(buf, actual_size); - if (limit != 0) { - counter += actual_size; - } - } - in.clear(); // unset eof state -} - -/** - * Copy stream in to another stream - */ -static void PipeStream(std::istream& in, std::ostream& os) { - ReadStreamByBuf(in, 0, - [&os](const char* buf, size_t len) { os.write(buf, len); }); -} - -/** - * Calculate CRC32 from an input stream. - */ -static uint32_t Crc32Stream(std::istream& in, size_t limit = 0) { - uint32_t crc = static_cast(crc32(0, nullptr, 0)); - ReadStreamByBuf(in, limit, [&crc](const char* buf, size_t len) { - crc = static_cast(crc32(crc, reinterpret_cast(buf), - static_cast(len))); - }); - return crc; -} - -bool Chunk::Write(std::ostream& os, Compressor ct) const { - // NOTE(dzhwinter): don't check records.numBytes instead, because - // empty records are allowed. 
- if (records_.empty()) { - return false; - } - std::stringstream sout; - std::unique_ptr compressed_stream; - switch (ct) { - case Compressor::kNoCompress: - break; - case Compressor::kSnappy: - compressed_stream.reset(new snappy::oSnappyStream(sout)); - break; - default: - PADDLE_THROW("Not implemented"); - } - - std::ostream& buf_stream = compressed_stream ? *compressed_stream : sout; - - for (auto& record : records_) { - size_t sz = record.size(); - buf_stream.write(reinterpret_cast(&sz), sizeof(uint32_t)) - .write(record.data(), record.size()); - } - - if (compressed_stream) { - compressed_stream.reset(); - } - - sout.seekg(0, std::ios::end); - uint32_t len = static_cast(sout.tellg()); - sout.seekg(0, std::ios::beg); - uint32_t crc = Crc32Stream(sout); - Header hdr(static_cast(records_.size()), crc, ct, len); - hdr.Write(os); - sout.seekg(0, std::ios::beg); - sout.clear(); - PipeStream(sout, os); - return true; -} - -bool Chunk::Parse(std::istream& sin) { - ChunkParser parser(sin); - if (!parser.Init()) { - return false; - } - Clear(); - while (parser.HasNext()) { - Add(parser.Next()); - } - return true; -} - -ChunkParser::ChunkParser(std::istream& sin) : in_(sin) {} -bool ChunkParser::Init() { - pos_ = 0; - bool ok = header_.Parse(in_); - if (!ok) { - return ok; - } - auto beg_pos = in_.tellg(); - uint32_t crc = Crc32Stream(in_, header_.CompressSize()); - PADDLE_ENFORCE_EQ(header_.Checksum(), crc); - in_.seekg(beg_pos, in_.beg); - - switch (header_.CompressType()) { - case Compressor::kNoCompress: - break; - case Compressor::kSnappy: - compressed_stream_.reset(new snappy::iSnappyStream(in_)); - break; - default: - PADDLE_THROW("Not implemented"); - } - return true; -} - -bool ChunkParser::HasNext() const { return pos_ < header_.NumRecords(); } - -std::string ChunkParser::Next() { - if (!HasNext()) { - return ""; - } - ++pos_; - std::istream& stream = compressed_stream_ ? *compressed_stream_ : in_; - uint32_t rec_len; - stream.read(reinterpret_cast(&rec_len), sizeof(uint32_t)); - std::string buf; - buf.resize(rec_len); - stream.read(&buf[0], rec_len); - PADDLE_ENFORCE_EQ(rec_len, stream.gcount()); - return buf; -} -} // namespace recordio -} // namespace paddle diff --git a/paddle/fluid/recordio/chunk.h b/paddle/fluid/recordio/chunk.h deleted file mode 100644 index cfb954a5..00000000 --- a/paddle/fluid/recordio/chunk.h +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include - -#include "paddle/fluid/platform/macros.h" -#include "paddle/fluid/recordio/header.h" - -namespace paddle { -namespace recordio { - -// A Chunk contains the Header and optionally compressed records. -class Chunk { - public: - Chunk() : num_bytes_(0) {} - void Add(const std::string& buf) { - num_bytes_ += buf.size(); - records_.emplace_back(buf); - } - // dump the chunk into w, and clears the chunk and makes it ready for - // the next add invocation. 
- bool Write(std::ostream& fo, Compressor ct) const; - void Clear() { - records_.clear(); - num_bytes_ = 0; - } - - // returns true if ok, false if eof - bool Parse(std::istream& sin); - size_t NumBytes() const { return num_bytes_; } - size_t NumRecords() const { return records_.size(); } - const std::string& Record(int i) const { return records_[i]; } - - bool Empty() const { return records_.empty(); } - - private: - std::vector records_; - // sum of record lengths in bytes. - size_t num_bytes_; - DISABLE_COPY_AND_ASSIGN(Chunk); -}; - -class ChunkParser { - public: - explicit ChunkParser(std::istream& sin); - - bool Init(); - std::string Next(); - bool HasNext() const; - - private: - Header header_; - uint32_t pos_{0}; - std::istream& in_; - std::unique_ptr compressed_stream_; -}; - -} // namespace recordio -} // namespace paddle diff --git a/paddle/fluid/recordio/chunk_test.cc b/paddle/fluid/recordio/chunk_test.cc deleted file mode 100644 index 5177475c..00000000 --- a/paddle/fluid/recordio/chunk_test.cc +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/recordio/chunk.h" - -#include - -#include "gtest/gtest.h" - -TEST(Chunk, SaveLoad) { - paddle::recordio::Chunk ch; - ch.Add(std::string("12345", 6)); - ch.Add(std::string("123", 4)); - std::stringstream ss; - ch.Write(ss, paddle::recordio::Compressor::kNoCompress); - ss.seekg(0); - ch.Parse(ss); - ASSERT_EQ(ch.NumBytes(), 10U); -} - -TEST(Chunk, Compressor) { - paddle::recordio::Chunk ch; - ch.Add(std::string("12345", 6)); - ch.Add(std::string("123", 4)); - ch.Add(std::string("123", 4)); - ch.Add(std::string("123", 4)); - std::stringstream ss; - ch.Write(ss, paddle::recordio::Compressor::kSnappy); - std::stringstream ss2; - ch.Write(ss2, paddle::recordio::Compressor::kNoCompress); - ASSERT_LE(ss.tellp(), ss2.tellp()); // Compress should contain less data; - - ch.Clear(); - ch.Parse(ss); - ASSERT_EQ(ch.NumBytes(), 18ul); -} diff --git a/paddle/fluid/recordio/header.cc b/paddle/fluid/recordio/header.cc deleted file mode 100644 index c4822329..00000000 --- a/paddle/fluid/recordio/header.cc +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/recordio/header.h" - -#include - -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace recordio { - -Header::Header() - : num_records_(0), - checksum_(0), - compressor_(Compressor::kNoCompress), - compress_size_(0) {} - -Header::Header(uint32_t num, uint32_t sum, Compressor c, uint32_t cs) - : num_records_(num), checksum_(sum), compressor_(c), compress_size_(cs) {} - -bool Header::Parse(std::istream& is) { - uint32_t magic; - is.read(reinterpret_cast(&magic), sizeof(uint32_t)); - size_t read_size = is.gcount(); - if (read_size < sizeof(uint32_t)) { - return false; - } - PADDLE_ENFORCE_EQ(magic, kMagicNumber); - - is.read(reinterpret_cast(&num_records_), sizeof(uint32_t)) - .read(reinterpret_cast(&checksum_), sizeof(uint32_t)) - .read(reinterpret_cast(&compressor_), sizeof(uint32_t)) - .read(reinterpret_cast(&compress_size_), sizeof(uint32_t)); - return true; -} - -void Header::Write(std::ostream& os) const { - os.write(reinterpret_cast(&kMagicNumber), sizeof(uint32_t)) - .write(reinterpret_cast(&num_records_), sizeof(uint32_t)) - .write(reinterpret_cast(&checksum_), sizeof(uint32_t)) - .write(reinterpret_cast(&compressor_), sizeof(uint32_t)) - .write(reinterpret_cast(&compress_size_), sizeof(uint32_t)); -} - -std::ostream& operator<<(std::ostream& os, Header h) { - os << "Header: " << h.NumRecords() << ", " << h.Checksum() << ", " - << static_cast(h.CompressType()) << ", " << h.CompressSize(); - return os; -} - -bool operator==(Header l, Header r) { - return l.NumRecords() == r.NumRecords() && l.Checksum() == r.Checksum() && - l.CompressType() == r.CompressType() && - l.CompressSize() == r.CompressSize(); -} - -} // namespace recordio -} // namespace paddle diff --git a/paddle/fluid/recordio/header.h b/paddle/fluid/recordio/header.h deleted file mode 100644 index 24542599..00000000 --- a/paddle/fluid/recordio/header.h +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -namespace paddle { -namespace recordio { - -// MagicNumber for memory checking -constexpr uint32_t kMagicNumber = 0x01020304; - -enum class Compressor : uint32_t { - // NoCompression means writing raw chunk data into files. - // With other choices, chunks are compressed before written. - kNoCompress = 0, - // Snappy had been the default compressing algorithm widely - // used in Google. It compromises between speech and - // compression ratio. - kSnappy = 1, - // Gzip is a well-known compression algorithm. It is - // recommmended only you are looking for compression ratio. 
- kGzip = 2, -}; - -// Header is the metadata of Chunk -class Header { - public: - Header(); - Header(uint32_t num, uint32_t sum, Compressor ct, uint32_t cs); - - void Write(std::ostream& os) const; - - // returns true if OK, false on EOF - bool Parse(std::istream& is); - - uint32_t NumRecords() const { return num_records_; } - uint32_t Checksum() const { return checksum_; } - Compressor CompressType() const { return compressor_; } - uint32_t CompressSize() const { return compress_size_; } - - private: - uint32_t num_records_; - uint32_t checksum_; - Compressor compressor_; - uint32_t compress_size_; -}; - -// Make Header loggable -std::ostream& operator<<(std::ostream& os, Header h); -bool operator==(Header l, Header r); - -} // namespace recordio -} // namespace paddle diff --git a/paddle/fluid/recordio/scanner.cc b/paddle/fluid/recordio/scanner.cc deleted file mode 100644 index b06c274a..00000000 --- a/paddle/fluid/recordio/scanner.cc +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/recordio/scanner.h" - -#include -#include - -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace recordio { - -Scanner::Scanner(std::unique_ptr<std::istream>&& stream) - : stream_(std::move(stream)), parser_(*stream_) { - Reset(); -} - -Scanner::Scanner(const std::string &filename) - : stream_(new std::ifstream(filename, std::ios::in | std::ios::binary)), - parser_(*stream_) { - PADDLE_ENFORCE(static_cast<bool>(*stream_), "Cannot open file %s", filename); - Reset(); -} - -void Scanner::Reset() { - stream_->clear(); - stream_->seekg(0, std::ios::beg); - parser_.Init(); -} - -std::string Scanner::Next() { - if (stream_->eof()) { - return ""; - } - - auto res = parser_.Next(); - if (!parser_.HasNext() && HasNext()) { - parser_.Init(); - } - return res; -} - -bool Scanner::HasNext() const { return !stream_->eof(); } -} // namespace recordio -} // namespace paddle diff --git a/paddle/fluid/recordio/writer.cc b/paddle/fluid/recordio/writer.cc deleted file mode 100644 index 8046f4ff..00000000 --- a/paddle/fluid/recordio/writer.cc +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License.
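For reference, the `Header` deleted here serializes as five raw `uint32_t` fields (magic, record count, checksum, compressor, compressed size), 20 bytes per chunk header. A hedged round-trip sketch against the deleted API, buildable only while these recordio sources are still in the tree; the field values are made up:

```cpp
// Round-trip sketch for the recordio Header deleted above.
#include <sstream>

#include "paddle/fluid/recordio/header.h"

int main() {
  using paddle::recordio::Compressor;
  using paddle::recordio::Header;

  Header out(/*num=*/3, /*sum=*/0xABCD, Compressor::kSnappy, /*cs=*/128);
  std::stringstream ss;
  out.Write(ss);  // writes kMagicNumber + 4 fields = 20 raw bytes
  ss.seekg(0);

  Header in;
  in.Parse(ss);              // re-reads the fields, checks kMagicNumber
  return out == in ? 0 : 1;  // the round trip preserves every field
}
```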
-#include "paddle/fluid/recordio/writer.h" - -#include - -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace recordio { - -void Writer::Write(const std::string& record) { - cur_chunk_.Add(record); - if (cur_chunk_.NumRecords() >= max_num_records_in_chunk_) { - Flush(); - } -} - -void Writer::Flush() { - cur_chunk_.Write(stream_, compressor_); - cur_chunk_.Clear(); -} - -Writer::~Writer() { - PADDLE_ENFORCE(cur_chunk_.Empty(), "Writer must be flushed when destroy."); -} - -} // namespace recordio -} // namespace paddle diff --git a/paddle/fluid/recordio/writer.h b/paddle/fluid/recordio/writer.h deleted file mode 100644 index ac7e50ee..00000000 --- a/paddle/fluid/recordio/writer.h +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#pragma once - -#include - -#include "paddle/fluid/recordio/chunk.h" -namespace paddle { -namespace recordio { - -class Writer { - public: - Writer(std::ostream* sout, Compressor compressor, - size_t max_num_records_in_chunk = 1000) - : stream_(*sout), - max_num_records_in_chunk_(max_num_records_in_chunk), - compressor_(compressor) {} - - void Write(const std::string& record); - - void Flush(); - - ~Writer(); - - private: - std::ostream& stream_; - size_t max_num_records_in_chunk_; - Chunk cur_chunk_; - Compressor compressor_; -}; - -} // namespace recordio -} // namespace paddle diff --git a/paddle/fluid/recordio/writer_scanner_test.cc b/paddle/fluid/recordio/writer_scanner_test.cc deleted file mode 100644 index 6583df21..00000000 --- a/paddle/fluid/recordio/writer_scanner_test.cc +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/recordio/scanner.h" -#include "paddle/fluid/recordio/writer.h" - -TEST(WriterScanner, Normal) { - std::stringstream* stream = new std::stringstream(); - - { - paddle::recordio::Writer writer(stream, - paddle::recordio::Compressor::kSnappy); - writer.Write("ABC"); - writer.Write("BCD"); - writer.Write("CDE"); - writer.Flush(); - } - - { - stream->seekg(0, std::ios::beg); - std::unique_ptr stream_ptr(stream); - paddle::recordio::Scanner scanner(std::move(stream_ptr)); - ASSERT_TRUE(scanner.HasNext()); - ASSERT_EQ(scanner.Next(), "ABC"); - ASSERT_EQ("BCD", scanner.Next()); - ASSERT_TRUE(scanner.HasNext()); - ASSERT_EQ("CDE", scanner.Next()); - ASSERT_FALSE(scanner.HasNext()); - } -} - -TEST(WriterScanner, TinyChunk) { - std::stringstream* stream = new std::stringstream(); - { - paddle::recordio::Writer writer( - stream, paddle::recordio::Compressor::kNoCompress, 2 /*max chunk num*/); - writer.Write("ABC"); - writer.Write("BCD"); - writer.Write("CDE"); - writer.Write("DEFG"); - writer.Flush(); - } - - { - stream->seekg(0, std::ios::beg); - std::unique_ptr stream_ptr(stream); - paddle::recordio::Scanner scanner(std::move(stream_ptr)); - ASSERT_TRUE(scanner.HasNext()); - ASSERT_EQ(scanner.Next(), "ABC"); - ASSERT_EQ(scanner.Next(), "BCD"); - ASSERT_EQ(scanner.Next(), "CDE"); - ASSERT_EQ(scanner.Next(), "DEFG"); - ASSERT_FALSE(scanner.HasNext()); - } -} diff --git a/paddle/fluid/string/CMakeLists.txt b/paddle/fluid/string/CMakeLists.txt index 49a8fb82..a465f590 100644 --- a/paddle/fluid/string/CMakeLists.txt +++ b/paddle/fluid/string/CMakeLists.txt @@ -1,6 +1,6 @@ -cc_library(stringpiece SRCS piece.cc) -cc_library(pretty_log SRCS pretty_log.cc) -cc_library(string_helper SRCS string_helper.cc DEPS boost) +cc_library(stringpiece SRCS piece.cc DEPS flags) +cc_library(pretty_log SRCS pretty_log.cc DEPS flags) +cc_library(string_helper SRCS string_helper.cc DEPS boost flags) cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags) cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags) cc_test(to_string_test SRCS to_string_test.cc) diff --git a/paddle/fluid/string/piece.cc b/paddle/fluid/string/piece.cc index 8e8cfb0e..e60eb0d6 100644 --- a/paddle/fluid/string/piece.cc +++ b/paddle/fluid/string/piece.cc @@ -20,6 +20,13 @@ #include #include +#define CHAR_POINTER_CMP(a, b) \ + do { \ + if (!a && !b) return 0; \ + if (!a) return -1; \ + if (!b) return 1; \ + } while (0) + namespace paddle { namespace string { @@ -40,6 +47,7 @@ char Piece::operator[](size_t n) const { } int Compare(Piece a, Piece b) { + CHAR_POINTER_CMP(a.data(), b.data()); const size_t min_len = (a.len() < b.len()) ? a.len() : b.len(); int r = memcmp(a.data(), b.data(), min_len); if (r == 0) { @@ -52,8 +60,10 @@ int Compare(Piece a, Piece b) { } bool operator==(Piece x, Piece y) { - return ((x.len() == y.len()) && - (x.data() == y.data() || memcmp(x.data(), y.data(), x.len()) == 0)); + return (!x.len() && !y.len()) ? true + : ((x.len() == y.len()) && + (x.data() == y.data() || + memcmp(x.data(), y.data(), x.len()) == 0)); } bool operator!=(Piece x, Piece y) { return !(x == y); } @@ -65,12 +75,14 @@ bool operator<=(Piece x, Piece y) { return Compare(x, y) <= 0; } bool operator>=(Piece x, Piece y) { return Compare(x, y) >= 0; } bool HasPrefix(Piece s, Piece x) { - return ((s.len() >= x.len()) && (memcmp(s.data(), x.data(), x.len()) == 0)); + return !x.len() ? 
true : ((s.len() >= x.len()) && + (memcmp(s.data(), x.data(), x.len()) == 0)); } bool HasSuffix(Piece s, Piece x) { - return ((s.len() >= x.len()) && - (memcmp(s.data() + (s.len() - x.len()), x.data(), x.len()) == 0)); + return !x.len() ? true : ((s.len() >= x.len()) && + (memcmp(s.data() + (s.len() - x.len()), x.data(), + x.len()) == 0)); } Piece SkipPrefix(Piece s, size_t n) { diff --git a/paddle/fluid/string/string_helper.cc b/paddle/fluid/string/string_helper.cc old mode 100755 new mode 100644 index 8feff87b..27708b8e --- a/paddle/fluid/string/string_helper.cc +++ b/paddle/fluid/string/string_helper.cc @@ -24,6 +24,26 @@ namespace paddle { namespace string { +inline size_t count_spaces(const char* s) { + size_t count = 0; + + while (*s != 0 && isspace(*s++)) { + count++; + } + + return count; +} + +inline size_t count_nonspaces(const char* s) { + size_t count = 0; + + while (*s != 0 && !isspace(*s++)) { + count++; + } + + return count; +} + // remove leading and tailing spaces std::string trim_spaces(const std::string& str) { const char* p = str.c_str(); diff --git a/paddle/fluid/string/string_helper.h b/paddle/fluid/string/string_helper.h old mode 100755 new mode 100644 index 3f5c893b..cc09088c --- a/paddle/fluid/string/string_helper.h +++ b/paddle/fluid/string/string_helper.h @@ -26,25 +26,9 @@ namespace paddle { namespace string { -inline size_t count_spaces(const char* s) { - size_t count = 0; +inline size_t count_spaces(const char* s); - while (*s != 0 && isspace(*s++)) { - count++; - } - - return count; -} - -inline size_t count_nonspaces(const char* s) { - size_t count = 0; - - while (*s != 0 && !isspace(*s++)) { - count++; - } - - return count; -} +inline size_t count_nonspaces(const char* s); template void format_string_append(std::string& str, const char* fmt, // NOLINT @@ -152,18 +136,6 @@ std::string join_strings(const Container& strs, char delim) { return str; } - - static inline bool end_with(const std::string& main_str, const std::string& str) { - return main_str.length() >= str.length() && - strncmp(main_str.c_str() + main_str.length() - str.length(), str.c_str(), str.length()) == - 0; - } - - static inline bool begin_with(const std::string& main_str, const std::string& str) { - return main_str.length() >= str.length() && - strncmp(main_str.c_str(), str.c_str(), str.length()) == 0; - } - // A helper class for reading lines from file. A line buffer is maintained. It // doesn't need to know the maximum possible length of a line. diff --git a/paddle/fluid/string/to_string.h b/paddle/fluid/string/to_string.h index 9378a0fe..8caf1494 100644 --- a/paddle/fluid/string/to_string.h +++ b/paddle/fluid/string/to_string.h @@ -15,7 +15,6 @@ limitations under the License. 
*/ #pragma once #include #include -#include #include namespace paddle { @@ -32,15 +31,6 @@ inline std::string to_string(T v) { return sout.str(); } -template -inline std::string to_string(const std::vector& v_list) { - std::ostringstream sout; - for (const auto& v : v_list) { - sout << v << " "; - } - return sout.str(); -} - template <> inline std::string to_string(std::type_index t) { return t.name(); diff --git a/paddle/fluid/train/CMakeLists.txt b/paddle/fluid/train/CMakeLists.txt index 59d62d45..7b0bc669 100644 --- a/paddle/fluid/train/CMakeLists.txt +++ b/paddle/fluid/train/CMakeLists.txt @@ -29,7 +29,3 @@ endfunction(train_test) if(WITH_TESTING) train_test(recognize_digits ARGS mlp conv) endif() - -if(WITH_CUSTOM_TRAINER) - add_subdirectory(custom_trainer) -endif() diff --git a/paddle/fluid/train/custom_trainer/CMakeLists.txt b/paddle/fluid/train/custom_trainer/CMakeLists.txt deleted file mode 100644 index 4fd9a3a9..00000000 --- a/paddle/fluid/train/custom_trainer/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -add_subdirectory(feed) diff --git a/paddle/fluid/train/custom_trainer/feed/.clang-format b/paddle/fluid/train/custom_trainer/feed/.clang-format deleted file mode 100644 index 50fee4ce..00000000 --- a/paddle/fluid/train/custom_trainer/feed/.clang-format +++ /dev/null @@ -1,33 +0,0 @@ -BasedOnStyle: Google -AccessModifierOffset: -4 -AlignAfterOpenBracket: AlwaysBreak -AlignOperands: false -AllowAllParametersOfDeclarationOnNextLine: false -AllowShortBlocksOnASingleLine: false -AllowShortCaseLabelsOnASingleLine: false -AllowShortFunctionsOnASingleLine: Empty -AllowShortIfStatementsOnASingleLine: false -AllowShortLoopsOnASingleLine: false -AlwaysBreakAfterReturnType: None -AlwaysBreakTemplateDeclarations: true -BinPackArguments: false -BinPackParameters: false -BreakConstructorInitializers: AfterColon -ColumnLimit: 100 -ConstructorInitializerIndentWidth: 8 -ContinuationIndentWidth: 8 -DerivePointerAlignment: true -FixNamespaceComments: true -IndentCaseLabels: false -IndentWidth: 4 -MaxEmptyLinesToKeep: 1 -NamespaceIndentation: None -PenaltyBreakAssignment: 2 -PenaltyBreakBeforeFirstCallParameter: 1 -PenaltyBreakComment: 500 -PenaltyBreakFirstLessLess: 120 -PenaltyBreakString: 1000 -PenaltyExcessCharacter: 1000000 -PenaltyReturnTypeOnItsOwnLine: 400 -PointerAlignment: Left -SortIncludes: false diff --git a/paddle/fluid/train/custom_trainer/feed/CMakeLists.txt b/paddle/fluid/train/custom_trainer/feed/CMakeLists.txt deleted file mode 100644 index f88f2009..00000000 --- a/paddle/fluid/train/custom_trainer/feed/CMakeLists.txt +++ /dev/null @@ -1,19 +0,0 @@ -add_subdirectory(common) -add_subdirectory(process) -cc_library(custom_trainer_main SRCS main.cc DEPS custom_trainer_process custom_trainer_common) - - -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") - -set(ARCHIVE_START "-Wl,--whole-archive") -set(ARCHIVE_END "-Wl,--no-whole-archive") -set(EXTERNAL_LIB "-lrt -ldl -lpthread") - -add_executable(feed_trainer main.cc) -target_link_libraries(feed_trainer - ${MACOS_LD_FLAGS} - ${ARCHIVE_START} - ${ARCHIVE_END} - glog gflags protobuf snappystream snappy z xxhash yaml-cpp - paddle_fluid custom_trainer_common custom_trainer_process - ${EXTERNAL_LIB}) diff --git a/paddle/fluid/train/custom_trainer/feed/accessor/accessor.h b/paddle/fluid/train/custom_trainer/feed/accessor/accessor.h deleted file mode 100644 index a09c62e7..00000000 --- a/paddle/fluid/train/custom_trainer/feed/accessor/accessor.h +++ /dev/null @@ -1,19 +0,0 @@ -#pragma once -#include 
"paddle/fluid/train/custom_trainer/feed/common/registerer.h" -#include "paddle/fluid/train/custom_trainer/feed/trainer_context.h" - -namespace paddle { -namespace custom_trainer { -namespace feed { - -class Accessor { -public: - Accessor() {} - virtual ~Accessor() {} - virtual int initialize(YAML::Node config, - std::shared_ptr context_ptr) = 0; -}; - -} // namespace feed -} // namespace custom_trainer -} // namespace paddle diff --git a/paddle/fluid/train/custom_trainer/feed/accessor/dense_input_accessor.cc b/paddle/fluid/train/custom_trainer/feed/accessor/dense_input_accessor.cc deleted file mode 100644 index 0cda14f6..00000000 --- a/paddle/fluid/train/custom_trainer/feed/accessor/dense_input_accessor.cc +++ /dev/null @@ -1,191 +0,0 @@ -#include -#include "gflags/gflags.h" -#include "paddle/fluid/train/custom_trainer/feed/accessor/input_data_accessor.h" - -namespace paddle { -namespace custom_trainer { -namespace feed { - -DEFINE_string(feed_trainer_debug_dense_name, "", "open dense debug for specif layer_name"); - -int DenseInputAccessor::initialize(YAML::Node config, - std::shared_ptr context_ptr) { - CHECK(DataInputAccessor::initialize(config, context_ptr) == 0); - _total_dim = 0; - _pull_request_num.store(0); - for (const auto& input : config["input"]) { - DenseInputVariable variable; - variable.name = input["name"].as(); - variable.gradient_name = paddle::framework::GradVarName(variable.name); - variable.shape = input["shape"].as>(); - variable.dim = 1; - for (int i = 0; i < variable.shape.size(); ++i) { - if (variable.shape[i] <= 0) { - variable.shape[i] = 1; - } - variable.dim *= variable.shape[i]; - } - _total_dim += variable.dim; - _x_variables.emplace_back(variable); - } - if (config["async_pull"] && config["async_pull"].as()) { - _need_async_pull = true; - } - _data_buffer_list.resize(6); // 6 buffer顺序循环使用, 降低更新时的写冲突 - for (auto*& buffer : _data_buffer_list) { - buffer = new float[_total_dim]; - } - return 0; -} - -int32_t DenseInputAccessor::create(::paddle::framework::Scope* scope) { - size_t data_buffer_idx = 0; - std::vector regions; - for (auto& variable : _x_variables) { - auto* tensor = scope->Var(variable.name)-> - GetMutable(); - auto* data = tensor->data(); - regions.emplace_back(data, variable.dim); - if (FLAGS_feed_trainer_debug_dense_name == variable.name) - VLOG(2) << "[Debug][CreateDense]" << ScopeHelper::to_string(scope, variable.name); - } - auto* ps_client = _trainer_context->pslib->ps_client(); - auto push_status = ps_client->push_dense_param(regions.data(), regions.size(), _table_id); - return push_status.get(); -} - -// rpc拉取数据,需保证单线程运行 -int32_t DenseInputAccessor::pull_dense(size_t table_id) { - size_t data_buffer_idx = 0; - float* data_buffer = backend_data_buffer(); - std::vector regions; - for (auto& variable : _x_variables) { - regions.emplace_back(data_buffer + data_buffer_idx, variable.dim); - data_buffer_idx += variable.dim; - } - auto* ps_client = _trainer_context->pslib->ps_client(); - auto push_status = ps_client->pull_dense(regions.data(), regions.size(), table_id); - int32_t ret = push_status.get(); - switch_data_buffer(); - _is_data_buffer_init = true; - return ret; -} - -int32_t DenseInputAccessor::forward(SampleInstance* samples, size_t num, - paddle::framework::Scope* scope) { - collect_persistables(scope); - - if (_need_async_pull) { - ++_pull_request_num; - } - return 0; -} - -int32_t DenseInputAccessor::collect_persistables(paddle::framework::Scope* scope) { - // 首次同步pull,之后异步pull - if (!_is_data_buffer_init) { - _pull_mutex.lock(); - 
if (!_is_data_buffer_init) { - CHECK(pull_dense(_table_id) == 0); - _async_pull_thread = std::make_shared( - [this]() { - while (_need_async_pull) { - if (_pull_request_num > 0) { - pull_dense(_table_id); - _pull_request_num = 0; - } else { - usleep(10000); - } - } - }); - } - _pull_mutex.unlock(); - } - size_t data_buffer_idx = 0; - auto* data_buff = data_buffer(); - for (auto& variable : _x_variables) { - auto* shape_ptr = &(variable.shape[0]); - paddle::framework::DDim ddim(shape_ptr, variable.shape.size()); - auto* tensor = ScopeHelper::resize_lod_tensor(scope, variable.name, ddim); - auto* grad_tensor = ScopeHelper::resize_lod_tensor(scope, variable.gradient_name, ddim); - VLOG(5) << "fill scope variable:" << variable.name << ", " << variable.gradient_name - << ", data_buffer: " << data_buff + data_buffer_idx - << ", dim: " << variable.dim * sizeof(float); - auto* var_data = tensor->mutable_data(_trainer_context->cpu_place); - memcpy(var_data, data_buff + data_buffer_idx, variable.dim * sizeof(float)); - data_buffer_idx += variable.dim; - } - if (!FLAGS_feed_trainer_debug_dense_name.empty()) { - for (auto& variable : _x_variables) { - if (variable.name != FLAGS_feed_trainer_debug_dense_name) { - continue; - } - VLOG(2) << "[Debug][PullDense]" << ScopeHelper::to_string(scope, variable.name); - } - } - return 0; -} - -int32_t DenseInputAccessor::collect_persistables_name(std::vector& persistables) { - for (auto& variable : _x_variables) { - persistables.push_back(variable.name); - } - return 0; -} - -std::future DenseInputAccessor::backward(SampleInstance* samples, size_t num, - paddle::framework::Scope* scope) { - std::future ret; - if (!_need_gradient) { - return ret; - } - size_t data_buffer_idx = 0; - std::vector regions; - for (auto& variable : _x_variables) { - auto* tensor = scope->Var(variable.gradient_name)-> - GetMutable(); - auto* grad_data = tensor->mutable_data(_trainer_context->cpu_place); - regions.emplace_back(grad_data, variable.dim); - } - auto* ps_client = _trainer_context->pslib->ps_client(); - ps_client->push_dense(regions.data(), regions.size(), _table_id); - if (!FLAGS_feed_trainer_debug_dense_name.empty()) { - for (auto& variable : _x_variables) { - if (variable.name != FLAGS_feed_trainer_debug_dense_name) { - continue; - } - VLOG(2) << "[Debug][PushDense]" << ScopeHelper::to_string(scope, variable.gradient_name); - } - } - // not wait dense push - return ret; -} - -int32_t EbdVariableInputAccessor::forward(SampleInstance* samples, size_t num, - paddle::framework::Scope* scope) { - CHECK(_x_variables.size() == 1); - CHECK(_x_variables[0].shape.size() == 1); - auto& variable = _x_variables[0]; - auto* tensor = ScopeHelper::resize_lod_tensor(scope, - variable.name, {num, variable.shape[0]}); - auto* var_data = tensor->mutable_data(_trainer_context->cpu_place); - for (size_t i = 0; i < num; ++i) { - auto& sample = samples[i]; - CHECK(sample.embedx.size() == variable.dim); - memcpy(var_data, sample.embedx.data(), variable.dim * sizeof(float)); - var_data += variable.dim; - } - return 0; -} -std::future EbdVariableInputAccessor::backward(SampleInstance* samples, size_t num, - paddle::framework::Scope* scope) { - std::future ret; - return ret; -} - -REGIST_CLASS(DataInputAccessor, DenseInputAccessor); -REGIST_CLASS(DataInputAccessor, EbdVariableInputAccessor); - -} // namespace feed -} // namespace custom_trainer -} // namespace paddle diff --git a/paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.cc 
b/paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.cc deleted file mode 100755 index 1773c056..00000000 --- a/paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.cc +++ /dev/null @@ -1,187 +0,0 @@ -#include "paddle/fluid/string/string_helper.h" -#include "paddle/fluid/train/custom_trainer/feed/io/file_system.h" -#include "paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.h" - -namespace paddle { -namespace custom_trainer { -namespace feed { - int EpochAccessor::initialize(YAML::Node config, - std::shared_ptr context_ptr) { - _model_root_path = config["model_root_path"].as(); - - _trainer_context = context_ptr.get(); - if (context_ptr->file_system == nullptr) { - VLOG(0) << "file_system is not initialized"; - return -1; - } - auto fs = _trainer_context->file_system.get(); - _done_file_path = fs->path_join(_model_root_path, config["donefile"].as("epoch_donefile.txt")); - if (!fs->exists(_done_file_path)) { - VLOG(0) << "missing done file, path:" << _done_file_path; - return -1; - } - std::string done_text = fs->tail(_done_file_path); - _done_status = paddle::string::split_string(done_text, std::string("\t")); - _last_done_epoch_id = get_status(EpochStatusFiled::EpochIdField); - _last_checkpoint_epoch_id = get_status(EpochStatusFiled::CheckpointIdField); - // 训练需要从上一个checkpoint对应的epoch开始 - _current_epoch_id = _last_checkpoint_epoch_id; - _last_checkpoint_path = get_status(EpochStatusFiled::CheckpointPathField); - _inference_base_model_key = get_status(EpochStatusFiled::InferenceBaseKeyField); - _inference_model_path = fs->path_join(_model_root_path, config["inference_model_dir"].as("xbox")); - _inference_model_base_done_path = fs->path_join(_inference_model_path, - config["inference_base_done_name"].as("xbox_base_done.txt")); - _inference_model_delta_done_path = fs->path_join(_inference_model_path, - config["inference_delta_done_name"].as("xbox_patch_done.txt")); - return 0; - } - - int32_t EpochAccessor::epoch_done(uint64_t epoch_id) { - struct timeval now; - gettimeofday(&now, NULL); - if (need_save_model(epoch_id, ModelSaveWay::ModelSaveTrainCheckpoint)) { - _last_checkpoint_epoch_id = epoch_id; - _last_checkpoint_path = model_save_path(epoch_id, ModelSaveWay::ModelSaveTrainCheckpoint); - } - set_status(EpochStatusFiled::EpochIdField, epoch_id); - set_status(EpochStatusFiled::TimestampField, now.tv_sec); - set_status(EpochStatusFiled::CheckpointIdField, _last_checkpoint_epoch_id); - set_status(EpochStatusFiled::CheckpointPathField, _last_checkpoint_path); - set_status(EpochStatusFiled::DateField, format_timestamp(epoch_id, "%Y%m%d-%H%M")); - set_status(EpochStatusFiled::InferenceBaseKeyField, _inference_base_model_key); - - auto* env = _trainer_context->environment.get(); - if (env->is_master_node(EnvironmentRole::WORKER)) { - if (epoch_id > _last_done_epoch_id) { - // 保留末尾1000数据 - auto fs = _trainer_context->file_system.get(); - std::string done_str = paddle::string::join_strings(_done_status, '\t'); - fs->append_line(_done_file_path, done_str, 1000); - } - } - return 0; - } - - int EpochAccessor::update_model_donefile( - uint64_t epoch_id, ModelSaveWay save_way) { - auto* env = _trainer_context->environment.get(); - // 非主节点不做done状态持久化 - if (!env->is_master_node(EnvironmentRole::WORKER)) { - return 0; - } - std::string done_str; - std::string donefile; - auto fs = _trainer_context->file_system.get(); - auto model_path = model_save_path(epoch_id, save_way); - std::string 
inference_done_format("{\"id\":\"%lu\",\"key\":\"%lu\",\"input\":\"%s/000\",\"record_count\":\"1\",\"file_format\":\"pb\",\"schema_version\":\"2\",\"partition_type\":\"1\",\"job_name\":\"%s\",\"job_id\":\"%s\",\"mpi_size\":\"%d\",\"monitor_data\":\"%s\"}"); - - auto id = time(NULL); - switch (save_way) { - case ModelSaveWay::ModelSaveInferenceDelta: - donefile = _inference_model_delta_done_path; - done_str = string::format_string(inference_done_format.c_str(), id, _inference_base_model_key, - model_path.c_str(), env->job_name().c_str(), env->job_id().c_str(), - env->node_num(EnvironmentRole::PSERVER), _trainer_context->monitor_ssm.str().c_str()); - fs->append_line(donefile, done_str, 1000); - break; - case ModelSaveWay::ModelSaveInferenceBase: - donefile = _inference_model_base_done_path; - _inference_base_model_key = id; - done_str = string::format_string(inference_done_format.c_str(), id, id, - model_path.c_str(), env->job_name().c_str(), env->job_id().c_str(), - env->node_num(EnvironmentRole::PSERVER), _trainer_context->monitor_ssm.str().c_str()); - fs->append_line(donefile, done_str, 1000); - break; - } - return 0; - } - - int TimelyEpochAccessor::initialize(YAML::Node config, - std::shared_ptr context_ptr) { - _time_zone_seconds = config["time_zone_seconds"].as(); - _train_time_interval = config["train_time_interval"].as(); - _checkpoint_time_interval = config["checkpoint_time_interval"].as(3600 * 18); // 默认每18小时dump一个 - CHECK(_train_time_interval > 0 && (_train_time_interval % SecondsPerMin) == 0); - _train_num_per_day = SecondsPerDay / _train_time_interval; - return EpochAccessor::initialize(config, context_ptr); - } - - void TimelyEpochAccessor::next_epoch() { - _current_epoch_id = next_epoch_id(_current_epoch_id); - } - - std::string TimelyEpochAccessor::text(uint64_t epoch_id) { - auto delta = delta_id(epoch_id); - std::string date = format_timestamp(epoch_id, "%Y%m%d%H%M"); - return string::format_string("%s delta-%d", date.c_str(), delta); - } - - uint64_t TimelyEpochAccessor::next_epoch_id(uint64_t epoch_id) { - if (epoch_id == 0) { - struct timeval now; - gettimeofday(&now, NULL); - // 归整到零点 - return now.tv_sec / SecondsPerDay * SecondsPerDay; - } - return epoch_id + _train_time_interval; - } - - bool TimelyEpochAccessor::is_last_epoch(uint64_t epoch_id) { - auto delta = delta_id(epoch_id); - return delta == 0; // 最后一个delta恰好整除 - } - - uint64_t TimelyEpochAccessor::epoch_time_interval() { - return _train_time_interval; - } - - uint64_t TimelyEpochAccessor::epoch_timestamp(uint64_t epoch_id) { - return epoch_id; - } - - bool TimelyEpochAccessor::need_save_model(uint64_t epoch_id, ModelSaveWay save_way) { - if (epoch_id == 0) { - return false; - } - switch (save_way) { - case ModelSaveWay::ModelSaveInferenceDelta: - // 重启训练后,中间的delta不重复dump - return epoch_id > _last_done_epoch_id && - delta_id(epoch_id) % 6 == 0; - case ModelSaveWay::ModelSaveInferenceBase: - return is_last_epoch(epoch_id); - case ModelSaveWay::ModelSaveTrainCheckpoint: - if (is_last_epoch(epoch_id)) { - return true; - } - return delta_id(epoch_id) % (_checkpoint_time_interval / _train_time_interval) == 0; - case ModelSaveWay::ModelSaveTrainCheckpointBase: - return is_last_epoch(epoch_id); - } - return false; - } - - std::string TimelyEpochAccessor::model_save_path(uint64_t epoch_id, ModelSaveWay save_way) { - int32_t delta = delta_id(epoch_id); - std::string date = format_timestamp(epoch_id, "%Y%m%d"); - std::string date_with_hour = format_timestamp(epoch_id, "%Y%m%d%H"); - switch (save_way) { - case 
ModelSaveWay::ModelSaveInferenceDelta: - return _trainer_context->file_system->path_join(_inference_model_path, - string::format_string("%s/delta-%d", date.c_str(), delta)); - case ModelSaveWay::ModelSaveInferenceBase: - return _trainer_context->file_system->path_join(_inference_model_path, - string::format_string("%s/base", date.c_str())); - case ModelSaveWay::ModelSaveTrainCheckpointBase: - case ModelSaveWay::ModelSaveTrainCheckpoint: - return _trainer_context->file_system->path_join(_model_root_path, - string::format_string("batch_model/%s", date_with_hour.c_str())); - } - return ""; - } - - REGIST_CLASS(EpochAccessor, TimelyEpochAccessor); - -} // namespace feed -} // namespace custom_trainer -} // namespace paddle diff --git a/paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.h b/paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.h deleted file mode 100644 index 53df71f3..00000000 --- a/paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.h +++ /dev/null @@ -1,111 +0,0 @@ -#pragma once -#include -#include "paddle/fluid/string/to_string.h" -#include "paddle/fluid/string/string_helper.h" -#include "paddle/fluid/train/custom_trainer/feed/accessor/accessor.h" -#include "paddle/fluid/train/custom_trainer/feed/common/registerer.h" - -namespace paddle { -namespace custom_trainer { -namespace feed { - -enum class EpochStatusFiled { - DateField = 0, - TimestampField = 1, - CheckpointPathField = 2, - EpochIdField = 3, - CheckpointIdField = 4, - InferenceBaseKeyField = 5 -}; - -class EpochAccessor : public Accessor { -public: - EpochAccessor() {} - virtual ~EpochAccessor() {} - virtual int initialize(YAML::Node config, - std::shared_ptr context_ptr); - - virtual uint64_t current_epoch_id() { - return _current_epoch_id; - } - - virtual const std::string& checkpoint_path() { - return _last_checkpoint_path; - } - - virtual int32_t epoch_done(uint64_t epoch_id); - - template <class T> - T get_status(EpochStatusFiled field) { - auto status = paddle::string::trim_spaces(_done_status[static_cast<int>(field)]); - return boost::lexical_cast<T>(status.c_str()); - } - template <class T> - void set_status(EpochStatusFiled field, const T& status) { - auto str_status = paddle::string::to_string(status); - _done_status[static_cast<int>(field)] = str_status; - return; - } - virtual std::string model_root_path() { - return _model_root_path; - } - - virtual void next_epoch() = 0; - - virtual std::string text(uint64_t epoch_id) = 0; - virtual uint64_t next_epoch_id(uint64_t epoch_id) = 0; - virtual bool is_last_epoch(uint64_t epoch_id) = 0; - - // data time interval between epochs (seconds) - virtual uint64_t epoch_time_interval() = 0; - // get the sample-data timestamp of an epoch - virtual uint64_t epoch_timestamp(uint64_t epoch_id) = 0; - - virtual bool need_save_model(uint64_t epoch_id, ModelSaveWay save_way) = 0; - virtual std::string model_save_path(uint64_t epoch_id, ModelSaveWay save_way) = 0; - virtual int update_model_donefile(uint64_t epoch_id, ModelSaveWay save_way); -protected: - TrainerContext* _trainer_context; - std::string _done_file_path; - std::string _model_root_path; - std::string _inference_model_path; - std::string _inference_model_base_done_path; - std::string _inference_model_delta_done_path; - uint64_t _current_epoch_id = 0; - std::string _last_checkpoint_path; - uint64_t _last_done_epoch_id = 0; - uint64_t _last_checkpoint_epoch_id = 0; - std::vector<std::string> _done_status; // current done status, stored uniformly as strings - uint64_t _inference_base_model_key = 0; // base key of the inference model - -}; -REGIST_REGISTERER(EpochAccessor); - -class TimelyEpochAccessor : public EpochAccessor {
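`EpochAccessor` above persists progress as one tab-separated line per epoch and reads back only the tail line of the done file. A standalone sketch of that line format, with illustrative values and a hypothetical `split_tsv` helper; the field order follows `EpochStatusFiled`:

```cpp
// Sketch of the done-file line consumed above; field order follows
// EpochStatusFiled (Date, Timestamp, CheckpointPath, EpochId, CheckpointId,
// InferenceBaseKey). split_tsv and the sample values are illustrative.
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

static std::vector<std::string> split_tsv(const std::string& line) {
  std::vector<std::string> fields;
  std::stringstream ss(line);
  std::string f;
  while (std::getline(ss, f, '\t')) fields.push_back(f);
  return fields;
}

int main() {
  std::string tail =
      "20191111-1300\t1573448400\t/model/batch_model/2019111113"
      "\t1573448400\t1573441200\t42";
  auto status = split_tsv(tail);
  std::cout << "epoch_id=" << status[3]
            << " checkpoint=" << status[2] << "\n";
  return 0;
}
```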
-public: - TimelyEpochAccessor() {} - virtual ~TimelyEpochAccessor() {} - virtual int initialize(YAML::Node config, - std::shared_ptr context_ptr); - virtual void next_epoch(); - virtual std::string text(uint64_t epoch_id); - virtual uint64_t next_epoch_id(uint64_t epoch_id); - virtual bool is_last_epoch(uint64_t epoch_id); - virtual uint64_t epoch_time_interval(); - virtual uint64_t epoch_timestamp(uint64_t epoch_id); - virtual bool need_save_model(uint64_t epoch_id, ModelSaveWay save_way); - virtual std::string model_save_path(uint64_t epoch_id, ModelSaveWay save_way); - -private: - inline size_t delta_id(uint64_t epoch_id) { - return ((epoch_id + _time_zone_seconds) % SecondsPerDay) / _train_time_interval; - } - uint32_t _time_zone_seconds; // offset relative to UTC (seconds) - uint32_t _train_time_interval; // training time interval (seconds) - uint32_t _train_num_per_day; // total training rounds per day - uint32_t _checkpoint_time_interval; // dump a checkpoint every n seconds -}; - -} // namespace feed -} // namespace custom_trainer -} // namespace paddle diff --git a/paddle/fluid/train/custom_trainer/feed/accessor/input_data_accessor.h b/paddle/fluid/train/custom_trainer/feed/accessor/input_data_accessor.h deleted file mode 100644 index 8918231b..00000000 --- a/paddle/fluid/train/custom_trainer/feed/accessor/input_data_accessor.h +++ /dev/null @@ -1,208 +0,0 @@ -#pragma once -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/train/custom_trainer/feed/accessor/accessor.h" -#include "paddle/fluid/train/custom_trainer/feed/dataset/data_reader.h" -#include "paddle/fluid/train/custom_trainer/feed/common/scope_helper.h" - -namespace paddle { -namespace custom_trainer { -namespace feed { - -class DataInputAccessor : public Accessor { -public: - DataInputAccessor() {} - virtual ~DataInputAccessor() {} - - virtual int initialize(YAML::Node config, - std::shared_ptr context_ptr) { - _trainer_context = context_ptr.get(); - _table_id = config["table_id"].as(); - _need_gradient = config["need_gradient"].as(); - return 0; - } - - // creation; typically the random initialization for a cold-start model - virtual int32_t create(::paddle::framework::Scope* scope) { - return 0; - } - // shrink; used for model pruning, invoked on base dumps - virtual int32_t shrink() { - return 0; - } - - // forward; typically fills inputs, called before the training network runs - virtual int32_t forward(SampleInstance* samples, size_t num, - ::paddle::framework::Scope* scope) = 0; - - // backward; typically pushes gradients, called after the training network runs. - // Since backward is usually asynchronous, a future is returned here. - // TODO: switch the forward interface to returning a future as well, for consistency - virtual std::future backward(SampleInstance* samples, size_t num, - ::paddle::framework::Scope* scope) = 0; - - // collect the names of persistable variables and copy their values into the Scope - virtual int32_t collect_persistables_name(std::vector<std::string>& persistables) {return 0;} - - // fill the values of persistable variables, used when saving - virtual int32_t collect_persistables(paddle::framework::Scope* scope) {return 0;} -protected: - size_t _table_id = 0; - bool _need_gradient = false; - TrainerContext* _trainer_context = nullptr; -}; -REGIST_REGISTERER(DataInputAccessor); - -struct LabelInputVariable { - std::string label_name; - std::string output_name; - size_t label_dim = 0; -}; -class LabelInputAccessor : public DataInputAccessor { -public: - LabelInputAccessor() {} - virtual ~LabelInputAccessor() {} - - virtual int initialize(YAML::Node config, - std::shared_ptr context_ptr); - - virtual int32_t forward(SampleInstance* samples, size_t num, - ::paddle::framework::Scope* scope); - - virtual std::future backward(SampleInstance* samples, size_t num, - ::paddle::framework::Scope* scope); -protected: - size_t _label_total_dim = 0; - std::vector<LabelInputVariable> _labels; -}; - -struct SparseInputVariable {
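To make `delta_id` concrete: epoch ids are UNIX timestamps, and `delta_id` numbers the training interval within the local day. A worked standalone sketch, assuming an illustrative UTC+8 offset and a 10-minute interval:

```cpp
// Standalone sketch of the delta_id arithmetic used by TimelyEpochAccessor.
// The UTC+8 offset and 10-minute interval are illustrative assumptions.
#include <cstdint>
#include <iostream>

constexpr uint64_t kSecondsPerDay = 86400;

int main() {
  uint64_t time_zone_seconds = 8 * 3600;  // UTC+8
  uint64_t train_time_interval = 600;     // 10 minutes -> 144 deltas per day

  // 2019-11-11 00:10:00 UTC+8 == 1573402200 UTC epoch seconds.
  uint64_t epoch_id = 1573402200;
  uint64_t delta =
      ((epoch_id + time_zone_seconds) % kSecondsPerDay) / train_time_interval;
  std::cout << "delta=" << delta << "\n";  // prints delta=1
  return 0;
}
```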
- size_t slot_dim; // number of dims per slot - size_t total_dim; // slot_dim * slot_num - std::string name; // name of the corresponding parameter variable - std::string gradient_name; // name of the corresponding gradient variable - std::vector slot_idx; // reverse lookup from slot_id to the slot's idx in the parameter layer - std::vector slot_list; // list of slot_ids -}; - -struct SparseVarRuntimeData { - uint32_t row_size; // batch_size - uint32_t total_size; // batch_size * input_dim - float* variable_data; // parameters - float* gradient_data; // gradients - SparseInputVariable* sparse_var_metas; // metas -}; - -class BaseSparseInputAccessor : public DataInputAccessor { -public: - BaseSparseInputAccessor() {} - virtual ~BaseSparseInputAccessor() {} - - virtual int initialize(YAML::Node config, - std::shared_ptr context_ptr); - - // fills the inputs during the forward pass - virtual int32_t forward(SampleInstance* samples, size_t num, - paddle::framework::Scope* scope); - - // given the pulled value of a single sparse key, fills in the corresponding sparse value - virtual void fill_input(float* var_data, const float* pull_raw, - paddle::ps::ValueAccessor&, SparseInputVariable&, SampleInstance&) = 0; - - // called after all sparse values are filled, for further global post-processing - virtual void post_process_input(float* var_data, SparseInputVariable&, SampleInstance*, size_t num) = 0; - - // pushes the gradients during the backward pass - virtual std::future backward(SampleInstance* samples, size_t num, - paddle::framework::Scope* scope); - - // called once per sparse gradient value, to arrange the gradients for push - virtual void fill_gradient(float* push_value, const float* gradient_raw, - paddle::ps::ValueAccessor&, SparseInputVariable&, - SampleInstance&, FeatureItem&) = 0; - -protected: - // list of input-layer variables - std::vector<SparseInputVariable> _x_variables; -}; - -struct DenseInputVariable { - size_t dim; - std::string name; - std::vector shape; - std::string gradient_name; -}; - -class DenseInputAccessor : public DataInputAccessor { -public: - DenseInputAccessor() {} - virtual ~DenseInputAccessor() { - for (float* buffer : _data_buffer_list) { - delete[] buffer; - } - _need_async_pull = false; - if (_async_pull_thread) { - _async_pull_thread->join(); - } - } - - // returns the dense buffer currently available for reading - inline float* data_buffer() { - return _data_buffer_list[_current_buffer_idx]; - } - inline float* backend_data_buffer() { - return _data_buffer_list[next_buffer_idx()]; - } - inline void switch_data_buffer() { - _current_buffer_idx = next_buffer_idx(); - } - inline size_t next_buffer_idx() { - auto buffer_idx = _current_buffer_idx + 1; - if (buffer_idx >= _data_buffer_list.size()) { - return 0; - } - return buffer_idx; - } - - virtual int initialize(YAML::Node config, - std::shared_ptr context_ptr); - - virtual int32_t create(::paddle::framework::Scope* scope); - - virtual int32_t forward(SampleInstance* samples, size_t num, - paddle::framework::Scope* scope); - - virtual std::future backward(SampleInstance* samples, size_t num, - paddle::framework::Scope* scope); - - - virtual int32_t collect_persistables_name(std::vector<std::string>& persistables); - - virtual int32_t collect_persistables(paddle::framework::Scope* scope); -protected: - virtual int32_t pull_dense(size_t table_id); - size_t _total_dim = 0; - std::mutex _pull_mutex; - bool _need_async_pull = false; - bool _is_data_buffer_init = false; - std::vector<float*> _data_buffer_list; - size_t _current_buffer_idx = 0; - std::atomic _pull_request_num; - std::vector<DenseInputVariable> _x_variables; - std::shared_ptr<std::thread> _async_pull_thread; -}; - -class EbdVariableInputAccessor : public DenseInputAccessor { -public: - EbdVariableInputAccessor() {} - virtual ~EbdVariableInputAccessor() {} - - virtual int32_t forward(SampleInstance* samples, size_t num, - paddle::framework::Scope* scope); - - virtual std::future backward(SampleInstance*
samples, size_t num, - paddle::framework::Scope* scope); -}; - -} // namespace feed -} // namespace custom_trainer -} // namespace paddle diff --git a/paddle/fluid/train/custom_trainer/feed/accessor/label_input_accessor.cc b/paddle/fluid/train/custom_trainer/feed/accessor/label_input_accessor.cc deleted file mode 100644 index 5a38d8ee..00000000 --- a/paddle/fluid/train/custom_trainer/feed/accessor/label_input_accessor.cc +++ /dev/null @@ -1,80 +0,0 @@ -#include "paddle/fluid/train/custom_trainer/feed/accessor/input_data_accessor.h" - -namespace paddle { -namespace custom_trainer { -namespace feed { - -int LabelInputAccessor::initialize(YAML::Node config, - std::shared_ptr context_ptr) { - _trainer_context = context_ptr.get(); - _label_total_dim = 0; - for (const auto& input : config["input"]) { - LabelInputVariable variable; - variable.label_name = input["label_name"].as(); - variable.output_name = input["output_name"].as(); - auto shape = input["shape"].as>(); - variable.label_dim = 0; - for (auto dim : shape) { - variable.label_dim += (dim > 0 ? dim : 0); - } - _label_total_dim += variable.label_dim; - _labels.emplace_back(variable); - } - return 0; -} - -int32_t LabelInputAccessor::forward(SampleInstance* samples, size_t num, - paddle::framework::Scope* scope) { - if (num < 1) { - return 0; - } - size_t sample_label_data_idx = 0; - for (auto& label : _labels) { - auto* tensor = ScopeHelper::resize_lod_tensor(scope, label.label_name, {num, label.label_dim}); - auto* res_tens = ScopeHelper::resize_lod_tensor(scope, label.output_name, {num, label.label_dim}); - auto* var_data = tensor->mutable_data(_trainer_context->cpu_place); - for (size_t i = 0; i < num; ++i) { - auto& sample = samples[i]; - CHECK(sample.labels.size() > sample_label_data_idx); - float* sample_label_buffer = sample.labels.data(); - memcpy(var_data + i * label.label_dim, - sample_label_buffer + sample_label_data_idx, label.label_dim * sizeof(float)); - } - sample_label_data_idx += label.label_dim; - } - return 0; -} - -std::future LabelInputAccessor::backward(SampleInstance* samples, size_t num, - paddle::framework::Scope* scope) { - std::future ret; - if (num < 1) { - return ret; - } - for (size_t i = 0; i < num; ++i) { - auto& sample = samples[i]; - sample.predicts.resize(_label_total_dim); - size_t sample_predict_data_idx = 0; - float* sample_predict_buffer = sample.predicts.data(); - for (auto& label : _labels) { - auto* tensor = scope->Var(label.output_name)-> - GetMutable(); - auto* var_data = tensor->mutable_data(_trainer_context->cpu_place); - memcpy(sample_predict_buffer + sample_predict_data_idx, - var_data + i * label.label_dim, label.label_dim * sizeof(float)); - sample_predict_data_idx += label.label_dim; - } - } - /* for debug - for (auto& label : _labels) { - VLOG(2) << "[Debug][Lable]" << ScopeHelper::to_string(scope, label.label_name) << ScopeHelper::to_string(scope, label.output_name); - } - */ - return ret; -} - -REGIST_CLASS(DataInputAccessor, LabelInputAccessor); - -} // namespace feed -} // namespace custom_trainer -} // namespace paddle diff --git a/paddle/fluid/train/custom_trainer/feed/accessor/sparse_input_accessor.cc b/paddle/fluid/train/custom_trainer/feed/accessor/sparse_input_accessor.cc deleted file mode 100644 index 0f1408d9..00000000 --- a/paddle/fluid/train/custom_trainer/feed/accessor/sparse_input_accessor.cc +++ /dev/null @@ -1,286 +0,0 @@ -#include -#include -#include -#include -#include "gflags/gflags.h" -#include "paddle/fluid/string/string_helper.h" -#include 
"paddle/fluid/train/custom_trainer/feed/common/scope_helper.h" -#include "paddle/fluid/train/custom_trainer/feed/accessor/input_data_accessor.h" - -DEFINE_int32(feed_trainer_debug_sparse_slot, 0, "open sparse debug for specif slot"); - -namespace paddle { -namespace custom_trainer { -namespace feed { - -int BaseSparseInputAccessor::initialize(YAML::Node config, - std::shared_ptr context_ptr) { - CHECK(DataInputAccessor::initialize(config, context_ptr) == 0); - for (const auto& input : config["input"]) { - SparseInputVariable variable; - variable.name = input["name"].as(); - variable.gradient_name = paddle::framework::GradVarName(variable.name); - auto slots = input["slots"].as>(); - variable.slot_idx.resize(UINT16_MAX, -1); - for (int i = 0; i < slots.size(); ++i) { - uint16_t slot = (uint16_t)slots[i]; - variable.slot_idx[slot] = i; - variable.slot_list.push_back(slot); - } - variable.slot_dim = input["slot_dim"].as(); - variable.total_dim = variable.slot_list.size() * variable.slot_dim; - _x_variables.push_back(variable); - } - return 0; -} - -// 取sparse数据 -int32_t BaseSparseInputAccessor::forward(SampleInstance* samples, - size_t num, paddle::framework::Scope* scope) { - CHECK(num > 0); - auto* ps_client = _trainer_context->pslib->ps_client(); - auto* value_accessor = ps_client->table_accessor(_table_id); - size_t key_num = 0; - for (size_t i = 0; i < num; ++i) { - key_num += samples[i].features.size(); - } - std::vector keys(key_num); - float** pull_values = new float*[key_num]; - auto pull_value_dim = value_accessor->select_dim(); - - // 填入sparseKey Request - size_t key_idx = 0; - for (size_t i = 0; i < num; ++i) { - auto& features = samples[i].features; - for (auto& feature_item : features) { - feature_item.weights.resize(pull_value_dim, 0.0); - keys[key_idx] = feature_item.sign(); - pull_values[key_idx++] = &(feature_item.weights[0]); - } - } - auto pull_status = ps_client->pull_sparse(pull_values, _table_id, keys.data(), key_num); - auto ret = pull_status.get(); - delete[] pull_values; - if (ret != 0) { - VLOG(0) << "pull sparse failed, table_id:" << _table_id << ", key_num:" << key_num << ", ret:" << ret; - return ret; - } - - auto* runtime_data_ptr = new std::vector(); - auto& var_runtime_data = *runtime_data_ptr; - var_runtime_data.resize(_x_variables.size()); - int64_t runtime_data_for_scope = (int64_t)runtime_data_ptr; - ScopeHelper::fill_value(scope, _trainer_context->cpu_place, - "sparse_runtime_data", runtime_data_for_scope); - // Variable空间初始化 - for (size_t i = 0; i < _x_variables.size(); ++i) { - const auto& variable = _x_variables[i]; - var_runtime_data[i].row_size = num; - var_runtime_data[i].total_size = num * variable.total_dim; - var_runtime_data[i].sparse_var_metas = &(_x_variables[i]); - auto* tensor = ScopeHelper::resize_lod_tensor( - scope, variable.name, {num, variable.total_dim}); - auto* grad_tensor = ScopeHelper::resize_lod_tensor( - scope, variable.gradient_name, {num, variable.total_dim}); - VLOG(5) << "fill scope variable:" << variable.name << ", " << variable.gradient_name; - var_runtime_data[i].variable_data = tensor->mutable_data(_trainer_context->cpu_place); - var_runtime_data[i].gradient_data = grad_tensor->mutable_data(_trainer_context->cpu_place); - memset((void*) var_runtime_data[i].variable_data, 0, var_runtime_data[i].total_size * sizeof(float)); - memset((void*) var_runtime_data[i].gradient_data, 0, var_runtime_data[i].total_size * sizeof(float)); - } - // 参数填入Variable - for (size_t samp_idx = 0; samp_idx < num; ++samp_idx) { - auto& features = 
samples[samp_idx].features; - for (auto& feature_item : features) { - for (size_t i = 0; i < _x_variables.size(); ++i) { - auto& variable = _x_variables[i]; - auto slot_idx = variable.slot_idx[feature_item.slot()]; - if (slot_idx < 0) { - continue; - } - float* item_data = var_runtime_data[i].variable_data + - samp_idx * variable.total_dim + variable.slot_dim * slot_idx; - fill_input(item_data, &(feature_item.weights[0]), *value_accessor, variable, samples[samp_idx]); - } - } - } - if (FLAGS_feed_trainer_debug_sparse_slot) { - std::stringstream ssm; - for (size_t samp_idx = 0; samp_idx < num; ++samp_idx) { - ssm.str(""); - auto& features = samples[samp_idx].features; - for (auto& feature_item : features) { - for (size_t i = 0; i < _x_variables.size(); ++i) { - auto& variable = _x_variables[i]; - if (feature_item.slot() != FLAGS_feed_trainer_debug_sparse_slot) { - continue; - } - if (variable.slot_idx[feature_item.slot()] < 0) { - continue; - } - ssm << "(" << feature_item.sign() << "," << feature_item.slot(); - for (auto weight : feature_item.weights) { - ssm << "," << weight; - } - ssm << ")"; - } - } - VLOG(2) << "[DEBUG][sparse_slot_pull]" << ssm.str(); - } - } - // Variable后置处理 - for (size_t i = 0; i < _x_variables.size(); ++i) { - auto& variable = _x_variables[i]; - post_process_input(var_runtime_data[i].variable_data, variable, samples, num); - } - return 0; -} - -// 更新spare数据 -std::future BaseSparseInputAccessor::backward(SampleInstance* samples, - size_t num, paddle::framework::Scope* scope) { - std::future ret; - int64_t runtime_data_for_scope = *ScopeHelper::get_value( - scope, _trainer_context->cpu_place, "sparse_runtime_data"); - auto* runtime_data_ptr = (std::vector*)runtime_data_for_scope; - auto& var_runtime_data = *runtime_data_ptr; - DoneGuard gurad([runtime_data_ptr](){ - delete runtime_data_ptr; - }); - if (!_need_gradient) { - return ret; - } - auto* ps_client = _trainer_context->pslib->ps_client(); - auto* value_accessor = ps_client->table_accessor(_table_id); - - size_t key_num = 0; - for (size_t i = 0; i < num; ++i) { - key_num += samples[i].features.size(); - } - std::vector keys(key_num); - float** push_values = new float*[key_num]; - auto push_value_dim = value_accessor->update_dim(); - - size_t key_idx = 0; - for (size_t samp_idx = 0; samp_idx < num; ++samp_idx) { - auto& features = samples[samp_idx].features; - for (auto& feature_item : features) { - feature_item.gradients.resize(push_value_dim, 0.0); - for (size_t i = 0; i < _x_variables.size(); ++i) { - auto& variable = _x_variables[i]; - auto slot_idx = variable.slot_idx[feature_item.slot()]; - if (slot_idx < 0) { - continue; - } - const float* grad_data = var_runtime_data[i].gradient_data + - samp_idx * variable.total_dim + variable.slot_dim * slot_idx; - fill_gradient(&(feature_item.gradients[0]), grad_data, - *value_accessor, variable, samples[samp_idx], feature_item); - keys[key_idx] = feature_item.sign(); - push_values[key_idx++] = &(feature_item.gradients[0]); - } - } - } - if (FLAGS_feed_trainer_debug_sparse_slot) { - size_t key_idx = 0; - std::stringstream ssm; - for (size_t samp_idx = 0; samp_idx < num; ++samp_idx) { - ssm.str(""); - auto& features = samples[samp_idx].features; - for (auto& feature_item : features) { - for (size_t i = 0; i < _x_variables.size(); ++i) { - auto& variable = _x_variables[i]; - if (feature_item.slot() != FLAGS_feed_trainer_debug_sparse_slot) { - continue; - } - if (variable.slot_idx[feature_item.slot()] < 0) { - continue; - } - ssm << "(" << feature_item.sign() << "," << 
feature_item.slot(); - for (auto weight : feature_item.gradients) { - ssm << "," << weight; - } - ssm << ")"; - } - } - VLOG(2) << "[DEBUG][sparse_slot_push]" << ssm.str(); - } - } - ret = ps_client->push_sparse(_table_id, - keys.data(), (const float**)push_values, key_idx); - delete[] push_values; - return ret; -} - -class AbacusSparseJoinAccessor : public BaseSparseInputAccessor { -public: - AbacusSparseJoinAccessor() {} - virtual ~AbacusSparseJoinAccessor() {} - virtual void fill_input(float* var_data, const float* pull_raw, - paddle::ps::ValueAccessor& value_accessor, - SparseInputVariable& variable, SampleInstance& sample) { - for (size_t i = 0; i < variable.slot_dim; ++i) { - var_data[i] += pull_raw[i]; - } - } - - virtual void post_process_input(float* var_data, - SparseInputVariable& variable, SampleInstance* samples, size_t num) { - for (size_t i = 0; i < num * variable.slot_list.size(); ++i) { - var_data[0] = log(var_data[0] + 1); // show - var_data[1] = log(var_data[1] + 1) - var_data[0]; // ctr - var_data += variable.slot_dim; - } - } - - virtual void fill_gradient(float* push_value, const float* gradient_raw, - paddle::ps::ValueAccessor& value_accessor, SparseInputVariable& variable, - SampleInstance& sample, FeatureItem& feature) { - // join阶段不回填梯度 - CHECK(false); - return; - } -}; -REGIST_CLASS(DataInputAccessor, AbacusSparseJoinAccessor); - -class AbacusSparseUpdateAccessor : public BaseSparseInputAccessor { -public: - AbacusSparseUpdateAccessor() {} - virtual ~AbacusSparseUpdateAccessor() {} - virtual void fill_input(float* var_data, const float* pull_raw, - paddle::ps::ValueAccessor& value_accessor, - SparseInputVariable& variable, SampleInstance& sample) { - for (size_t i = 0; i < variable.slot_dim; ++i) { - var_data[i] += pull_raw[i + 2]; - } - } - - // 裁剪,用于模型裁剪,base级调用 - virtual int32_t shrink() { - auto* ps_client = _trainer_context->pslib->ps_client(); - auto status = ps_client->shrink(_table_id); - return status.get(); - } - - virtual void post_process_input(float* var_data, - SparseInputVariable& variable, SampleInstance* samples, size_t num) { - return; - } - - virtual void fill_gradient(float* push_value, const float* gradient_raw, - paddle::ps::ValueAccessor& value_accessor, SparseInputVariable& variable, - SampleInstance& sample, FeatureItem& feature) { - push_value[0] = feature.slot(); - push_value[1] += 1; - push_value[2] += sample.labels[0]; - for (size_t i = 0; i < variable.slot_dim; ++i) { - push_value[i + 3] += gradient_raw[i]; - } - return; - } -}; -REGIST_CLASS(DataInputAccessor, AbacusSparseUpdateAccessor); - -} // namespace feed -} // namespace custom_trainer -} // namespace paddle diff --git a/paddle/fluid/train/custom_trainer/feed/accessor/weights_input_accessor.cc b/paddle/fluid/train/custom_trainer/feed/accessor/weights_input_accessor.cc deleted file mode 100644 index 7d5ca170..00000000 --- a/paddle/fluid/train/custom_trainer/feed/accessor/weights_input_accessor.cc +++ /dev/null @@ -1,74 +0,0 @@ -#include "paddle/fluid/train/custom_trainer/feed/accessor/input_data_accessor.h" - -namespace paddle { -namespace custom_trainer { -namespace feed { - -class WeightsAdjustAccessor : public DataInputAccessor { -public: - WeightsAdjustAccessor() {} - virtual ~WeightsAdjustAccessor() {} - - virtual int initialize(YAML::Node config, - std::shared_ptr context_ptr) { - _trainer_context = context_ptr.get(); - _slot_id = config["slot_id"].as(); - _input_name = config["input"].as(); - _adjw_ratio = config["adjw_ratio"].as(); - _adjw_threshold = 
config["adjw_threshold"].as(); - return 0; - } - - virtual int32_t forward(SampleInstance* samples, size_t num, - ::paddle::framework::Scope* scope) { - int64_t runtime_data_for_scope = *ScopeHelper::get_value( - scope, _trainer_context->cpu_place, "sparse_runtime_data"); - auto* runtime_data_ptr = (std::vector*)runtime_data_for_scope; - auto& var_runtime_data = *runtime_data_ptr; - - int slot_idx = -1; - SparseVarRuntimeData* sparse_var_data = nullptr; - for (auto& sparse_var : var_runtime_data) { - slot_idx = sparse_var.sparse_var_metas->slot_idx[_slot_id]; - if (slot_idx >= 0) { - sparse_var_data = &sparse_var; - break; - } - } - CHECK(slot_idx >= 0) << "Not Found this Slot in slot_list. slot_id:" << _slot_id; - - auto* tensor = ScopeHelper::resize_lod_tensor(scope, _input_name, {num, 1}); - auto* weights_data = tensor->mutable_data(_trainer_context->cpu_place); - - float* sparse_input_data = sparse_var_data->variable_data; - size_t sparse_slot_dim = sparse_var_data->sparse_var_metas->slot_dim; - size_t sparse_input_col = sparse_var_data->sparse_var_metas->total_dim; - for (int i = 0; i < num; ++i) { - float show = sparse_input_data[i * sparse_input_col + slot_idx * sparse_slot_dim]; - show = pow(M_E, show) - 1; // show在fill时算过log,这里恢复原值 - weights_data[i] = 1.0; - if (show >= 0 && show < _adjw_threshold) { - weights_data[i] = log(M_E + (_adjw_threshold - show) / _adjw_threshold * _adjw_ratio); - } - } - return 0; - } - - virtual std::future backward(SampleInstance* samples, size_t num, - ::paddle::framework::Scope* scope) { - std::future ret; - return ret; - } -protected: - size_t _slot_id; - float _adjw_ratio; - float _adjw_threshold; - std::string _input_name; -}; - -REGIST_CLASS(DataInputAccessor, WeightsAdjustAccessor); - - -} // namespace feed -} // namespace custom_trainer -} // namespace paddle diff --git a/paddle/fluid/train/custom_trainer/feed/common/CMakeLists.txt b/paddle/fluid/train/custom_trainer/feed/common/CMakeLists.txt deleted file mode 100644 index b2744984..00000000 --- a/paddle/fluid/train/custom_trainer/feed/common/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -cc_library(custom_trainer_common SRCS registerer.cc DEPS memory) diff --git a/paddle/fluid/train/custom_trainer/feed/common/bthread_task_runner.cc b/paddle/fluid/train/custom_trainer/feed/common/bthread_task_runner.cc deleted file mode 100644 index 857b8040..00000000 --- a/paddle/fluid/train/custom_trainer/feed/common/bthread_task_runner.cc +++ /dev/null @@ -1,22 +0,0 @@ -#include "paddle/fluid/train/custom_trainer/feed/common/bthread_task_runner.h" - -namespace paddle { -namespace custom_trainer { -namespace feed { - -void* execute_bthread_task(void* args) { - auto* param = reinterpret_cast<::std::tuple*, google::protobuf::Closure*>*>(args); - auto* task = ::std::get<0>(*param); - auto* closure = ::std::get<1>(*param); - (*task)(); - if (closure != NULL) { - closure->Run(); - } - delete task; - delete param; - return NULL; -} - -} // namespace feed -} // namespace custom_trainer -} // namespace paddle diff --git a/paddle/fluid/train/custom_trainer/feed/common/bthread_task_runner.h b/paddle/fluid/train/custom_trainer/feed/common/bthread_task_runner.h deleted file mode 100644 index f0b646d9..00000000 --- a/paddle/fluid/train/custom_trainer/feed/common/bthread_task_runner.h +++ /dev/null @@ -1,50 +0,0 @@ -#pragma once - -#ifndef BUTIL_LOGGING_H_ -#define BUTIL_LOGGING_H_ -#endif - -#include -#include -#include -#include -#include "glog/logging.h" -#include "google/protobuf/stubs/callback.h" -#include 
"bthread/bthread.h" -#include "bthread/mutex.h" - -namespace paddle { -namespace custom_trainer { -namespace feed { - -void* execute_bthread_task(void* args); - -class BthreadTaskRunner { -public: - static BthreadTaskRunner& instance() { - static BthreadTaskRunner runner; - return runner; - } - - template - int add_task(Callable &&func, Args &&... args) { - bthread_t th; - auto* task = new std::packaged_task( - std::bind(std::forward(func), std::forward(args)...)); - auto* param = new ::std::tuple*, google::protobuf::Closure*>( - ::std::move(task), NULL); - if (0 != bthread_start_background(&th, NULL, execute_bthread_task, param)) { - delete task; - delete param; - return -1; - } - return 0; - } -private: - BthreadTaskRunner() {} - ~BthreadTaskRunner() {} -}; - -} // namespace feed -} // namespace custom_trainer -} // namespace paddle diff --git a/paddle/fluid/train/custom_trainer/feed/common/pipeline.h b/paddle/fluid/train/custom_trainer/feed/common/pipeline.h deleted file mode 100644 index e1fe5c42..00000000 --- a/paddle/fluid/train/custom_trainer/feed/common/pipeline.h +++ /dev/null @@ -1,161 +0,0 @@ -#pragma once -#include -#include "paddle/fluid/framework/archive.h" -#include "paddle/fluid/train/custom_trainer/feed/common/bthread_task_runner.h" - -namespace paddle { -namespace custom_trainer { -namespace feed { - -class DoneGuard { -public: - DoneGuard(std::function func) : _func(func) {} - virtual ~DoneGuard() { _func(); } -private: - std::function _func; -}; - -class PipelineOptions { -public: - PipelineOptions() = default; - uint32_t batch_size = 10; // pipe输出的batch大小 - float input_output_rate = 1; // 输入/输出 qps流量比 - uint32_t buffer_batch_count = 4; // pipe预存count组batch数据 - bool need_hold_input_data = false; // 是否保存input流数据,否则消费后释放 -}; - -/* - * 数据流管道,管道内可对流入数据进行格式转换,再流出 - * - * |---------------Pipeline---------------| - * Channel -> Converter -> Channel - * 多个管道可通过connect_to方法进行级联 - * - * 使用initialize 或 connect_to 初始化管道 - */ -template -class Pipeline { -public: - Pipeline() {} - Pipeline(Pipeline&&) = delete; - Pipeline(const Pipeline&) = delete; - typedef std::function PipeDataConverter; - - int initialize(const PipelineOptions& options, - ::paddle::framework::Channel input_channel, - PipeDataConverter data_converter) { - CHECK(_inited == false); - CHECK(options.batch_size > 0); - _inited = true; - _options = options; - _is_read_end = false; - _converter = data_converter; - _input_channel = input_channel; - _output_channel = ::paddle::framework::MakeChannel(); - _output_channel->SetBlockSize(options.batch_size); - size_t input_batch_size = options.batch_size * options.input_output_rate; - _input_channel->SetBlockSize(input_batch_size); - _output_channel->SetCapacity(options.batch_size * options.buffer_batch_count); - if (_options.need_hold_input_data) { - _input_channel_backup = ::paddle::framework::MakeChannel(); - _input_channel_backup->SetBlockSize(input_batch_size); - } - CHECK(_input_channel != nullptr) << " Input Channel is null"; - _convert_thread = std::make_shared([this](){ - async_convert_data(); - }); - return 0; - } - - template - int connect_to(Pipeline& pre_pipeline, - PipelineOptions& options, PipeDataConverter data_converter) { - // 保证全局batch一致 - options.batch_size = pre_pipeline.options().batch_size / options.input_output_rate; - return initialize(options, pre_pipeline.output_chnnel(), data_converter); - } - - virtual ~Pipeline() { - _is_read_end = true; - if (_convert_thread != nullptr) { - _convert_thread->join(); - } - } - - inline size_t read(std::vector& p) { 
- p.clear(); - size_t num = _output_channel->Read(p); - return num; - } - - inline const PipelineOptions& options() { - return _options; - } - - inline ::paddle::framework::Channel<TypeOut> output_chnnel() { - return _output_channel; - } - - // returns the consumed backup of input_channel - inline ::paddle::framework::Channel<TypeIn> backup_channel() { - return _input_channel_backup; - } -private: - void async_convert_data() { - size_t input_batch_size = _options.batch_size * _options.input_output_rate; - size_t input_data_max = input_batch_size * _options.buffer_batch_count; - std::atomic<size_t> parsing_num(0); - while (!_is_read_end) { - while (!_is_read_end && parsing_num < input_data_max) { - auto input_data_buffer = std::make_shared<std::vector<TypeIn>>(input_batch_size); - size_t read_size = _input_channel->Read(input_batch_size, input_data_buffer->data()); - if (read_size == 0) { - _is_read_end = true; - break; - } - parsing_num += read_size; - BthreadTaskRunner::instance().add_task( - [this, &parsing_num, read_size, input_data_buffer]() { - size_t write_size = 0; - std::vector<TypeOut> output_data_buffer(_options.batch_size); - _converter(input_data_buffer->data(), read_size, - &output_data_buffer[0], &write_size, 0); - _output_channel->WriteMove(write_size, &output_data_buffer[0]); - if (_input_channel_backup) { - _input_channel_backup->WriteMove(read_size, input_data_buffer->data()); - } - parsing_num -= read_size; - }); - } - // Offline scenario: the buffer between batch consumptions is ample, so a fairly large gap is tolerable. - // sleep is cheaper here than a condition variable or yield, and does not hurt throughput. - // A config option could be added for other scenarios; only the offline case is implemented here. - while (!_is_read_end && parsing_num >= input_data_max) { - usleep(50000); // 50ms - } - } - while (parsing_num > 0) { - usleep(100000); // 100ms - } - _output_channel->Close(); - if (_input_channel_backup) { - _input_channel_backup->Close(); - } - } - - -private: - bool _inited = false; // initialization flag - bool _is_read_end = false; // set once the input stream has been fully read - PipelineOptions _options; // pipe options - PipeDataConverter _converter; // converter - std::shared_ptr<std::thread> _convert_thread; // async conversion thread - ::paddle::framework::Channel<TypeIn> _input_channel; // input stream - ::paddle::framework::Channel<TypeIn> _input_channel_backup; // backup of the raw input stream - ::paddle::framework::Channel<TypeOut> _output_channel; // output stream -}; - -} // namespace feed -} // namespace custom_trainer -} // namespace paddle diff --git a/paddle/fluid/train/custom_trainer/feed/common/pslib_warpper.cc b/paddle/fluid/train/custom_trainer/feed/common/pslib_warpper.cc deleted file mode 100644 index ff07bf4f..00000000 --- a/paddle/fluid/train/custom_trainer/feed/common/pslib_warpper.cc +++ /dev/null @@ -1,89 +0,0 @@ -#include -#include -#include -#include "json2pb/json_to_pb.h" -#include -#include -#include "paddle/fluid/train/custom_trainer/feed/common/pslib_warpper.h" -#include "paddle/fluid/train/custom_trainer/feed/common/runtime_environment.h" - -namespace paddle { -namespace custom_trainer { -namespace feed { - -int PSlib::initialize(const std::string& conf_path, - RuntimeEnvironment* environment) { - _environment = environment; - init_gflag(); - int file_descriptor = open(conf_path.c_str(), O_RDONLY); - if (file_descriptor == -1) { - LOG(ERROR) << "FATAL: can't open " << conf_path; - return -1; - } - google::protobuf::io::FileInputStream fileInput(file_descriptor); - if (!google::protobuf::TextFormat::Parse(&fileInput, &_ps_param)) { - LOG(ERROR) << "FATAL: fail to parse " << conf_path; - return -1; - } - close(file_descriptor); - init_server(); - init_client(); - return 0; -} - -int PSlib::init_server() { - if (_environment->is_role(EnvironmentRole::PSERVER)) { - _server_ptr.reset(paddle::ps::PSServerFactory::create(_ps_param)); - 
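// configure() binds the freshly created server to the shared PS environment
// and to this node's PSERVER rank, and start() brings the RPC service up;
// the barrier and gather_ps_servers() that follow publish every server
// endpoint to all nodes before any client tries to connect.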
_server_ptr->configure(_ps_param, *(_environment->ps_environment()), - _environment->rank_id(EnvironmentRole::PSERVER)); - _server_ptr->start(); - } - _environment->barrier(EnvironmentRole::ALL); - _environment->ps_environment()->gather_ps_servers(); - return 0; -} - -int PSlib::init_client() { - // 所有节点都启动psclient - _client_ptr.reset(paddle::ps::PSClientFactory::create(_ps_param)); - _client_ptr->configure(_ps_param, *(_environment->ps_environment()), - _environment->rank_id(EnvironmentRole::ALL)); - - _environment->barrier(EnvironmentRole::ALL); - _environment->ps_environment()->gather_ps_clients(); - _client_ptr->create_client2client_connection(); - return 0; -} - -paddle::ps::PSServer* PSlib::ps_server() { - return _server_ptr.get(); -} - -paddle::ps::PSClient* PSlib::ps_client() { - return _client_ptr.get(); -} - -paddle::PSParameter* PSlib::get_param() { - return &_ps_param; -} - -void PSlib::init_gflag() { - int cnt = 4; - char** params_ptr = new char*[cnt]; - char p0[] = "exe default"; - char p1[] = "-max_body_size=314217728"; - char p2[] = "-bthread_concurrency=40"; - char p3[] = "-socket_max_unwritten_bytes=2048000000"; - params_ptr[0] = p0; - params_ptr[1] = p1; - params_ptr[2] = p2; - params_ptr[3] = p3; - // ParseCommandLineFlags would change param_ptr, so copy it - char** params_ptrp = params_ptr; - ::google::ParseCommandLineFlags(&cnt, ¶ms_ptrp, true); - delete[] params_ptr; -} - -} // namespace feed -} // namespace custom_trainer -} // namespace paddle diff --git a/paddle/fluid/train/custom_trainer/feed/common/pslib_warpper.h b/paddle/fluid/train/custom_trainer/feed/common/pslib_warpper.h deleted file mode 100644 index 87f6af59..00000000 --- a/paddle/fluid/train/custom_trainer/feed/common/pslib_warpper.h +++ /dev/null @@ -1,43 +0,0 @@ -#pragma once - -// Hide BLOG -#ifndef BUTIL_LOGGING_H_ -#define BUTIL_LOGGING_H_ -#endif -#ifndef COMPACT_GOOGLE_LOG_NOTICE -#define COMPACT_GOOGLE_LOG_NOTICE COMPACT_GOOGLE_LOG_INFO -#endif - -#include "communicate/ps_server.h" -#include "communicate/ps_client.h" - -namespace paddle { -namespace custom_trainer { -namespace feed { - - -class RuntimeEnvironment; -class PSlib { -public: - PSlib() {} - virtual ~PSlib() {} - int initialize(const std::string& conf_path, - RuntimeEnvironment* environment); - - virtual paddle::ps::PSServer* ps_server(); - virtual paddle::ps::PSClient* ps_client(); - virtual paddle::PSParameter* get_param(); -private: - void init_gflag(); - virtual int init_server(); - virtual int init_client(); - - paddle::PSParameter _ps_param; - RuntimeEnvironment* _environment; - std::shared_ptr _server_ptr; - std::shared_ptr _client_ptr; -}; - -} // namespace feed -} // namespace custom_trainer -} // namespace paddle diff --git a/paddle/fluid/train/custom_trainer/feed/common/registerer.cc b/paddle/fluid/train/custom_trainer/feed/common/registerer.cc deleted file mode 100644 index c2dff151..00000000 --- a/paddle/fluid/train/custom_trainer/feed/common/registerer.cc +++ /dev/null @@ -1,18 +0,0 @@ -#include "paddle/fluid/train/custom_trainer/feed/common/registerer.h" -namespace paddle { -namespace custom_trainer { -namespace feed { - -BaseClassMap& global_reg_factory_map() { - static BaseClassMap *base_class = new BaseClassMap(); - return *base_class; -} -BaseClassMap& global_reg_factory_map_cpp() { - return global_reg_factory_map(); -} - -}// feed -}// namespace custom_trainer -}// namespace paddle -/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ - diff --git a/paddle/fluid/train/custom_trainer/feed/common/registerer.h 
b/paddle/fluid/train/custom_trainer/feed/common/registerer.h deleted file mode 100644 index b5399fdc..00000000 --- a/paddle/fluid/train/custom_trainer/feed/common/registerer.h +++ /dev/null @@ -1,114 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include - -namespace paddle { -namespace custom_trainer { -namespace feed { - -class Any { -public: - Any() : content_(NULL) {} - - template - Any(const ValueType &value) : content_(new Holder(value)) {} - - Any(const Any &other) : content_(other.content_ ? other.content_->clone() : NULL) {} - - ~Any() { - delete content_; - } - - template ValueType *any_cast() { - return content_ ? &static_cast *>(content_)->held_ : NULL; - } - -private: - class PlaceHolder { - public: - virtual ~PlaceHolder() {} - virtual PlaceHolder *clone() const = 0; - }; - - template - class Holder : public PlaceHolder { - public: - explicit Holder(const ValueType &value) : held_(value) {} - virtual PlaceHolder *clone() const { - return new Holder(held_); - } - - ValueType held_; - }; - - PlaceHolder *content_; -}; - -class ObjectFactory { -public: - ObjectFactory() {} - virtual ~ObjectFactory() {} - virtual Any NewInstance() { - return Any(); - } -private: -}; - -typedef std::map FactoryMap; -typedef std::map BaseClassMap; -#ifdef __cplusplus -extern "C" { -#endif -BaseClassMap& global_reg_factory_map(); -#ifdef __cplusplus -} -#endif - -BaseClassMap& global_reg_factory_map_cpp(); - -#define REGIST_REGISTERER(base_class) \ - class base_class ## Registerer { \ - public: \ - static base_class *CreateInstanceByName(const ::std::string &name) { \ - if (global_reg_factory_map_cpp().find(#base_class) \ - == global_reg_factory_map_cpp().end()) { \ - LOG(ERROR) << "Can't Find BaseClass For CreateClass with:" << #base_class; \ - return NULL; \ - } \ - FactoryMap &map = global_reg_factory_map_cpp()[#base_class]; \ - FactoryMap::iterator iter = map.find(name); \ - if (iter == map.end()) { \ - LOG(ERROR) << "Can't Find Class For Create with:" << name; \ - return NULL; \ - } \ - Any object = iter->second->NewInstance(); \ - return *(object.any_cast()); \ - } \ - }; - -#define REGIST_CLASS(clazz, name) \ - class ObjectFactory##name : public ObjectFactory { \ - public: \ - Any NewInstance() { \ - return Any(new name()); \ - } \ - }; \ - void register_factory_##name() { \ - FactoryMap &map = global_reg_factory_map_cpp()[#clazz]; \ - if (map.find(#name) == map.end()) { \ - map[#name] = new ObjectFactory##name(); \ - } \ - } \ - void register_factory_##name() __attribute__((constructor)); - -#define CREATE_INSTANCE(base_class, name) \ - base_class##Registerer::CreateInstanceByName(name) - -}//namespace feed -}//namespace custom_trainer -}//namespace paddle -/* vim: set expandtab ts=4 sw=4 sts=4 tw=100: */ diff --git a/paddle/fluid/train/custom_trainer/feed/common/runtime_environment.cc b/paddle/fluid/train/custom_trainer/feed/common/runtime_environment.cc deleted file mode 100755 index ccf56207..00000000 --- a/paddle/fluid/train/custom_trainer/feed/common/runtime_environment.cc +++ /dev/null @@ -1,236 +0,0 @@ -#include -#include "paddle/fluid/train/custom_trainer/feed/common/runtime_environment.h" - -namespace paddle { -namespace custom_trainer { -namespace feed { - -template -struct mpi_type_trait { -}; -template<> -struct mpi_type_trait { - static MPI_Datatype type() { - return MPI_DOUBLE; - } -}; -template<> -struct mpi_type_trait { - static MPI_Datatype type() { - return MPI_FLOAT; - } -}; -template<> -struct mpi_type_trait { - static MPI_Datatype type() { - return 
MPI_INT; - } -}; -template<> -struct mpi_type_trait { - static MPI_Datatype type() { - return MPI_UNSIGNED; - } -}; -template<> -struct mpi_type_trait { - static MPI_Datatype type() { - return MPI_LONG_LONG; - } -}; -template<> -struct mpi_type_trait { - static MPI_Datatype type() { - return MPI_UNSIGNED_LONG_LONG; - } -}; -template<> -struct mpi_type_trait { - static MPI_Datatype type() { - return MPI_LONG_LONG; - } -}; -template<> -struct mpi_type_trait { - static MPI_Datatype type() { - return MPI_UNSIGNED_LONG_LONG; - } -}; -RuntimeEnvironment::RuntimeEnvironment() {} -RuntimeEnvironment::~RuntimeEnvironment() {} -bool RuntimeEnvironment::is_master_node(EnvironmentRole role) { - return rank_id(role) == 0; -} -std::string format_timestamp(time_t time, const char* format) { - std::string result; - struct tm p = *localtime(&time); - char time_str_buffer[64]; - int size = strftime (time_str_buffer, 64, format, &p); - if (size > 0) { - result.assign(time_str_buffer, size); - } - return result; -} - -struct MpiNodeInfo { - int rank_id = -1; - int node_num = 0; - MPI_Comm mpi_comm; -}; - -class MPIRuntimeEnvironment : public RuntimeEnvironment { -public: - MPIRuntimeEnvironment() {} - virtual ~MPIRuntimeEnvironment() {} - virtual int initialize(YAML::Node config) { - return 0; - } - virtual int wireup() { - int argc = 0; - char** argv = NULL; - int hr = MPI_Init(&argc, &argv); - if (MPI_SUCCESS != hr) { - LOG(FATAL) << "MPI_init failed with error code" << hr; - return -1; - } - _roles_node_info.resize(static_cast(EnvironmentRole::ALL) + 1); - add_role(EnvironmentRole::ALL); - - char* value = getenv("JOB_ID"); - if (value) { - _job_id = value; - } - value = getenv("JOB_NAME"); - if (value) { - _job_name = value; - } - return 0; - } - - virtual paddle::ps::PSEnvironment* ps_environment() { - static paddle::ps::MpiPSEnvironment ps_environment; - return &ps_environment; - } - - virtual uint32_t rank_id(EnvironmentRole role) { - return mpi_node_info(role).rank_id; - } - virtual uint32_t node_num(EnvironmentRole role) { - return mpi_node_info(role).node_num; - } - virtual int add_role(EnvironmentRole role) { - auto& node_info = mpi_node_info(role); - if (node_info.rank_id < 0) { - if (role == EnvironmentRole::ALL) { - node_info.mpi_comm = MPI_COMM_WORLD; - } else { - MPI_Comm_split(MPI_COMM_WORLD, static_cast(role), - mpi_node_info(EnvironmentRole::ALL).rank_id, &(node_info.mpi_comm)); - } - MPI_Comm_rank(node_info.mpi_comm, &(node_info.rank_id)); - MPI_Comm_size(node_info.mpi_comm, &(node_info.node_num)); - } - _role_set.insert(role); - return 0; - } - virtual bool is_role(EnvironmentRole role) { - return _role_set.count(role) > 0; - } - - virtual void barrier(EnvironmentRole role) { - MPI_Barrier(mpi_node_info(role).mpi_comm); - } - - virtual void bcast(paddle::framework::BinaryArchive& ar, int root_id, EnvironmentRole role) { - auto& node_info = mpi_node_info(role); - int len = (int)ar.Length(); - MPI_Bcast(&len, 1, MPI_INT, root_id, node_info.mpi_comm); - ar.Resize(len); - ar.SetCursor(ar.Buffer()); - MPI_Bcast(ar.Buffer(), len, MPI_BYTE, root_id, node_info.mpi_comm); - } - virtual void all_reduce_in_place(double* x, int n, ReduceOperator op, EnvironmentRole role) { - auto& node_info = mpi_node_info(role); - if (op == ReduceOperator::SUM) { - MPI_Allreduce(MPI_IN_PLACE, x, n, MPI_DOUBLE, MPI_SUM, node_info.mpi_comm); - } else { - CHECK(false) << "unsupport operator"; - } - } - -protected: - virtual void print_log(EnvironmentRole role, EnvironmentLogType type, - EnvironmentLogLevel level, const 
std::string& log_str) { - if (type == EnvironmentLogType::MASTER_LOG) { - if (is_master_node(role)) { - fprintf(stdout, "%s", log_str.c_str()); - fprintf(stdout, "\n"); - fflush(stdout); - } - return; - } - VLOG(static_cast<int>(level)) << log_str; - /* - static std::mutex mtx; - std::lock_guard guard(mtx); - std::err << log_str; - */ - } - - inline MpiNodeInfo& mpi_node_info(EnvironmentRole role) { - return _roles_node_info[static_cast<size_t>(role)]; - } - -private: - std::set<EnvironmentRole> _role_set; - std::vector<MpiNodeInfo> _roles_node_info; -}; -REGIST_CLASS(RuntimeEnvironment, MPIRuntimeEnvironment); - -// for local single-node training -class LocalRuntimeEnvironment : public RuntimeEnvironment { -public: - LocalRuntimeEnvironment() {} - virtual ~LocalRuntimeEnvironment() {} - virtual int initialize(YAML::Node config) { - return 0; - } - virtual int wireup() { - return 0; - } - virtual paddle::ps::PSEnvironment* ps_environment() { - static paddle::ps::LocalPSEnvironment ps_environment; - return &ps_environment; - } - virtual uint32_t rank_id(EnvironmentRole role) { - return 0; - } - virtual uint32_t node_num(EnvironmentRole role) { - return 1; - } - virtual int add_role(EnvironmentRole role) { - return 0; - } - virtual bool is_role(EnvironmentRole role) { - return true; - } - virtual void barrier(EnvironmentRole role) { - return; - } - virtual void bcast(paddle::framework::BinaryArchive& ar, int root_id, EnvironmentRole role) { - return; - } - virtual void all_reduce_in_place(double* x, int n, ReduceOperator op, EnvironmentRole role) { - return; - } -protected: - virtual void print_log(EnvironmentRole role, EnvironmentLogType type, - EnvironmentLogLevel level, const std::string& log_str) { - VLOG(static_cast<int>(level)) << log_str; - } -}; -REGIST_CLASS(RuntimeEnvironment, LocalRuntimeEnvironment); - -} // namespace feed -} // namespace custom_trainer -} // namespace paddle diff --git a/paddle/fluid/train/custom_trainer/feed/common/runtime_environment.h b/paddle/fluid/train/custom_trainer/feed/common/runtime_environment.h deleted file mode 100755 index 164e70b0..00000000 --- a/paddle/fluid/train/custom_trainer/feed/common/runtime_environment.h +++ /dev/null @@ -1,125 +0,0 @@ -/* - *Author: xiexionghang - *Runtime environment that hides the differences between running under MPI and running locally. - *To stay compatible with every backend, an Env interface call must satisfy the strictest constraint across all environments (sum(limit(env[n]))). - *E.g. under MPI the write interfaces may only be called from a single thread, so that restriction is enforced for every Env by default. - */ -#pragma once -#include -#include "communicate/ps_env.h" -#include "paddle/fluid/framework/archive.h" -#include "paddle/fluid/string/string_helper.h" -#include "paddle/fluid/train/custom_trainer/feed/common/registerer.h" - -namespace paddle { -namespace custom_trainer { -namespace feed { - -enum class EnvironmentLogLevel { - FATAL = 0, - ERROR = 1, - NOTICE = 2, - DEBUG = 3 -}; - -enum class EnvironmentLogType { - MASTER_LOG = 0, // only the master node emits output - ALL_LOG = 1 // every node emits output -}; - -// keep these enum values contiguous and increasing, with ALL last -enum class EnvironmentRole { - WORKER = 0, // training worker - PSERVER = 1, // parameter server - - ALL = 2 // all roles; keep it at the end of the enum -}; - -// reduce operator type -enum class ReduceOperator { - SUM = 0 // summation -}; - -class RuntimeEnvironment { -public: - RuntimeEnvironment(); - virtual ~RuntimeEnvironment(); - // initialize from config - virtual int initialize(YAML::Node config) = 0; - - // job info - virtual std::string job_id() { - return _job_id; - } - virtual std::string job_name() { - return _job_name; - } - - // register a role for this node - virtual int add_role(EnvironmentRole role) = 0; - // check whether this node plays the given role - virtual bool is_role(EnvironmentRole role) = 0; - // environment wire-up; called after all dependent modules have been initialized - virtual int wireup() = 0; - - // interfaces callable from multiple threads: start - // rank id of this node within the role group - virtual uint32_t rank_id(EnvironmentRole role) = 0; - 
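// Illustrative use of the reduce interface declared further below (env is a
// RuntimeEnvironment*; the names and values are assumptions for the example):
//   double local_auc = ...;  // metric computed on this worker
//   double sum = env->all_reduce(local_auc, ReduceOperator::SUM, EnvironmentRole::WORKER);
//   double avg = sum / env->node_num(EnvironmentRole::WORKER);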
// number of nodes in the environment - virtual uint32_t node_num(EnvironmentRole role) = 0; - // whether this node is the master of the role group - virtual bool is_master_node(EnvironmentRole role); - //For PS - virtual paddle::ps::PSEnvironment* ps_environment() = 0; - - // environment-specific logging - template<class... ARGS> - void log(EnvironmentRole role, EnvironmentLogType type, - EnvironmentLogLevel level, ARGS && ... args) { - print_log(role, type, level, paddle::string::format_string(args...)); - } - // interfaces callable from multiple threads: end - - - // interfaces that may only be called from the main thread: start - // barrier across the nodes of the given role - virtual void barrier(EnvironmentRole role) = 0; - // broadcast - virtual void bcast(paddle::framework::BinaryArchive& ar, int root_id, EnvironmentRole role) = 0; - // global reduce; returns the reduced result - virtual double all_reduce(double x, ReduceOperator op, EnvironmentRole role) { - double result = x; - all_reduce_in_place(&result, 1, op, role); - return result; - } - // global reduce, performed in place - virtual void all_reduce_in_place(double* x, int n, - ReduceOperator op, EnvironmentRole role) = 0; - // interfaces that may only be called from the main thread: end -protected: - virtual void print_log(EnvironmentRole role, EnvironmentLogType type, - EnvironmentLogLevel level, const std::string& log_str) = 0; - - std::string _debug_verion; - std::string _job_id = "default_job_id"; - std::string _job_name = "default_job_name"; -}; -REGIST_REGISTERER(RuntimeEnvironment); - -#define ENVLOG_WORKER_ALL_NOTICE(...) \ -environment->log(EnvironmentRole::WORKER, EnvironmentLogType::ALL_LOG, EnvironmentLogLevel::NOTICE, __VA_ARGS__); -#define ENVLOG_WORKER_MASTER_NOTICE(...) \ -environment->log(EnvironmentRole::WORKER, EnvironmentLogType::MASTER_LOG, EnvironmentLogLevel::NOTICE, __VA_ARGS__); -#define ENVLOG_WORKER_ALL_ERROR(...) \ -environment->log(EnvironmentRole::WORKER, EnvironmentLogType::ALL_LOG, EnvironmentLogLevel::ERROR, __VA_ARGS__); -#define ENVLOG_WORKER_MASTER_ERROR(...) 
\ -environment->log(EnvironmentRole::WORKER, EnvironmentLogType::MASTER_LOG, EnvironmentLogLevel::ERROR, __VA_ARGS__); - -std::string format_timestamp(time_t time, const char* format); -inline std::string format_timestamp(time_t time, const std::string& format) { - return format_timestamp(time, format.c_str()); -} - -} // namespace feed -} // namespace custom_trainer -} // namespace paddle diff --git a/paddle/fluid/train/custom_trainer/feed/common/scope_helper.h b/paddle/fluid/train/custom_trainer/feed/common/scope_helper.h deleted file mode 100644 index 399fd4f5..00000000 --- a/paddle/fluid/train/custom_trainer/feed/common/scope_helper.h +++ /dev/null @@ -1,80 +0,0 @@ -#pragma once -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/lod_tensor.h" - -namespace paddle { -namespace custom_trainer { -namespace feed { - -class ScopeHelper { -public: - //直接取var - template - static const T& var(paddle::framework::Scope* scope, const std::string& name) { - return scope->Var(name)->Get(); - } - template - static T* mutable_var(paddle::framework::Scope* scope, const std::string& name) { - return scope->Var(name)->GetMutable(); - } - - template - static T* resize_variable(paddle::framework::Scope* scope, - const std::string& name, const paddle::framework::DDim& dim) { - auto* tensor = scope->Var(name)->GetMutable(); - tensor->Resize(dim); - return tensor; - } - - static paddle::framework::LoDTensor* resize_lod_tensor( - paddle::framework::Scope* scope, - const std::string& name, const paddle::framework::DDim& dim) { - return resize_variable(scope, name, dim); - } - - template - static void fill_value(paddle::framework::Scope* scope, - paddle::platform::Place place, const std::string& name, T& value) { - auto* tensor = resize_variable(scope, name, { 1 }); - T* data = tensor->mutable_data(place); - *data = value; - return; - } - - template - static T* get_value(paddle::framework::Scope* scope, - paddle::platform::Place place, const std::string& name) { - auto* tensor = scope->Var(name)->GetMutable(); - return tensor->mutable_data(place); - } - - static std::string to_string(paddle::framework::Scope* scope, const std::string& name) { - CHECK(scope->FindVar(name) != nullptr) << "Var named:" << name << " is not exists in scope"; - auto& tensor = scope->Var(name)->Get(); - auto& ddim = tensor.dims(); - thread_local std::stringstream ssm; - ssm.str(""); - ssm << "[" << name << "]["; - for (auto i = 0; i < ddim.size(); ++i) { - if (i > 0) ssm << "X"; - ssm << ddim.at(i); - } - ssm << "]["; - auto last_dim = ddim.at(ddim.size() - 1); - auto sample_rate = last_dim > 100 ? 
last_dim / 100 : 1; // 保证最后一层 最多只打100个 - auto* data = tensor.data(); - for (auto i = 0; i < tensor.numel(); i += last_dim) { - auto* dim_data = data + i; - for (auto j = 0; j < last_dim; j += sample_rate, dim_data += sample_rate) { - ssm << *dim_data << " "; - } - } - ssm << "]"; - return ssm.str(); - } - -}; - -} // namespace feed -} // namespace custom_trainer -} // namespace paddle diff --git a/paddle/fluid/train/custom_trainer/feed/common/yaml_helper.h b/paddle/fluid/train/custom_trainer/feed/common/yaml_helper.h deleted file mode 100644 index 71c38a29..00000000 --- a/paddle/fluid/train/custom_trainer/feed/common/yaml_helper.h +++ /dev/null @@ -1,32 +0,0 @@ -#pragma once -#include -#include - -namespace paddle { -namespace custom_trainer { -namespace feed { - -class YamlHelper { -public: - // 直接使用node["key"]判断,会导致node数据被加入key键 - static bool has_key(const YAML::Node& node, const std::string& key) { - CHECK(node.Type() == YAML::NodeType::Map); - for (const auto& itr : node) { - if (key == itr.first.as()) { - return true; - } - } - return false; - } - template - static T get_with_default(YAML::Node node, const std::string& key, const T& default_v) { - if (has_key(node, key)) { - return node[key].as(); - } - return default_v; - } -}; - -} // namespace feed -} // namespace custom_trainer -} // namespace paddle diff --git a/paddle/fluid/train/custom_trainer/feed/conf/env.conf b/paddle/fluid/train/custom_trainer/feed/conf/env.conf deleted file mode 100644 index f97cf557..00000000 --- a/paddle/fluid/train/custom_trainer/feed/conf/env.conf +++ /dev/null @@ -1,19 +0,0 @@ -HPC_HOME=/home/work/xiexionghang/trainer/paddle_trainer/feed_muye/smart_client -HADOOP_HOME=/home/work/xiexionghang/trainer/paddle_trainer/feed_muye/hadoop-client/hadoop/ - -#===============Job-related config====================== -MPI_JOB_NAME=feed_smfw_shoubai_video_cupai_new_arch -MPI_QUEUE=feed5 -MPI_PRIORITY=high -MPI_NODE_NUM=100 -MPI_WALL_TIME=700:00:00 -MPI_NODE_MEM=100000 -MPI_RESOURCE=full - -#===========MPI cluster Server(nmg-off/10g/hlan)========== -MPI_SERVER=yq01-hpc-lvliang01-smart-master.dmop.baidu.com - -#===========Cluster-related (HDFS/MPI Server)============== -HDFS_ROOT=/user/feed/mlarch/mio_temp/$(date +%Y%m%d-%H%M%S-%N) -HADOOP_FS=afs://xingtian.afs.baidu.com:9902 -HADOOP_UGI=mlarch,Fv1M87 diff --git a/paddle/fluid/train/custom_trainer/feed/conf/gflags.conf b/paddle/fluid/train/custom_trainer/feed/conf/gflags.conf deleted file mode 100644 index 96e03e71..00000000 --- a/paddle/fluid/train/custom_trainer/feed/conf/gflags.conf +++ /dev/null @@ -1,5 +0,0 @@ --log_dir=log --v=2 --logbufsecs=0 --pslib_push_dense_merge_limit=1 --pslib_push_sparse_merge_limit=1 diff --git a/paddle/fluid/train/custom_trainer/feed/conf/ps_table_config b/paddle/fluid/train/custom_trainer/feed/conf/ps_table_config deleted file mode 100644 index d21b1e11..00000000 --- a/paddle/fluid/train/custom_trainer/feed/conf/ps_table_config +++ /dev/null @@ -1,120 +0,0 @@ -server_param { - downpour_server_param { - downpour_table_param { - table_id: 0 - table_class: "DownpourSparseTable" - shard_num: 1950 - accessor { - accessor_class: "DownpourCtrAccessor" - sparse_sgd_param { - learning_rate: 0.05 - initial_g2sum: 3.0 - initial_range: 0.0001 - weight_bounds: -10.0 - weight_bounds: 10.0 - } - fea_dim: 11 - embedx_dim: 8 - embedx_threshold: 10 - downpour_accessor_param { - nonclk_coeff: 0.1 - click_coeff: 1 - base_threshold: 1.5 - delta_threshold: 0.25 - delta_keep_days: 16 - delete_after_unseen_days: 30 - show_click_decay_rate: 0.98 - 
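# Hedged reading of the eviction knobs above and delete_threshold below
# (inferred from the field names, not documented in this file): show/click
# counters decay by show_click_decay_rate over time; a feature whose score
# nonclk_coeff * (show - click) + click_coeff * click stays under
# base_threshold and that has gone unseen for delete_after_unseen_days is
# dropped when the sparse table is shrunk.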
delete_threshold: 0.8 - } - table_accessor_save_param { - param: 1 - converter: "(tool/xbox_compressor_mf.py | tool/xbox_pb_converter)" - deconverter: "(tool/xbox_pb_deconverter | tool/xbox_decompressor_mf.awk)" - } - table_accessor_save_param { - param: 2 - converter: "(tool/xbox_compressor_mf.py | tool/xbox_pb_converter)" - deconverter: "(tool/xbox_pb_deconverter | tool/xbox_decompressor_mf.awk)" - } - } - type: PS_SPARSE_TABLE - compress_in_save: true - } - downpour_table_param { - table_id: 1 - table_class: "DownpourDenseTable" - accessor { - accessor_class: "DownpourDenseValueAccessor" - dense_sgd_param { - name: "adam" - adam { - learning_rate: 5e-06 - avg_decay_rate: 0.999993 - ada_decay_rate: 0.9999 - ada_epsilon: 1e-08 - mom_decay_rate: 0.99 - } - naive { - learning_rate: 0.0002 - } - } - fea_dim: 2571127 - } - type: PS_DENSE_TABLE - compress_in_save: true - } - downpour_table_param { - table_id: 2 - table_class: "DownpourDenseDoubleTable" - accessor { - accessor_class: "DownpourDenseValueDoubleAccessor" - dense_sgd_param { - name: "summarydouble" - summary { - summary_decay_rate: 0.999999 - } - } - fea_dim: 13464 - } - type: PS_DENSE_TABLE - compress_in_save: true - } - downpour_table_param { - table_id: 3 - table_class: "DownpourDenseTable" - accessor { - accessor_class: "DownpourDenseValueAccessor" - dense_sgd_param { - name: "adam" - adam { - learning_rate: 5e-06 - avg_decay_rate: 0.999993 - ada_decay_rate: 0.9999 - ada_epsilon: 1e-08 - mom_decay_rate: 0.99 - } - naive { - learning_rate: 0.0002 - } - } - fea_dim: 2072615 - } - type: PS_DENSE_TABLE - compress_in_save: true - } - service_param { - server_class: "DownpourBrpcPsServer" - client_class: "DownpourBrpcPsClient" - service_class: "DownpourPsService" - start_server_port: 0 - server_thread_num: 12 - } - } -} - -fs_client_param { - uri: "afs://xingtian.afs.baidu.com:9902" - user: "mlarch" - passwd: "Fv1M87" - hadoop_bin: "$HADOOP_HOME/bin/hadoop" -} diff --git a/paddle/fluid/train/custom_trainer/feed/conf/trainer.yaml b/paddle/fluid/train/custom_trainer/feed/conf/trainer.yaml deleted file mode 100644 index 219aede2..00000000 --- a/paddle/fluid/train/custom_trainer/feed/conf/trainer.yaml +++ /dev/null @@ -1,58 +0,0 @@ -train_thread_num: 10 - -environment: - environment_class: LocalRuntimeEnvironment - ps: ./conf/ps_table_config - - - -io: - file_systems: - afs: - class: HadoopFileSystem - buffer_size: 1024000 - ugis: - 'default': 'feed_video,D3a0z8' - 'xingtian.afs.baidu.com:9902': 'feed_video,D3a0z8' - default: - class: LocalFileSystem - buffer_size: 1024000 -dataset: - data_list: - train_sample: - prefetch_num: 2 - root_path : [./sample] - data_spit_interval: 300 - data_path_formater: '%Y%m%d/%H%M' - data_reader: LineDataReader - done_file: to.hadoop.done - filename_prefix: part - pipeline_cmd: './tool/ins_weight.py | awk -f ./tool/format_newcate_hotnews.awk' - parser: - class: AbacusTextDataParser - shuffler: - class: LocalShuffler - -epoch: - epoch_class: TimelyEpochAccessor - model_root_path: ./model/ - train_time_interval: 600 - time_zone_seconds: 28800 - -executor: -- name: join - class: SimpleExecutor - train_data_name: train_sample - train_batch_size: 32 - input_parse_thread_num: 10 - push_gradient_thread_num: 16 - train_thread_num: 12 - need_dump_all_model: true -- name: update - class: SimpleExecutor - train_data_name: train_sample - train_batch_size: 32 - input_parse_thread_num: 10 - push_gradient_thread_num: 16 - train_thread_num: 12 - need_dump_all_model: false diff --git 
a/paddle/fluid/train/custom_trainer/feed/dataset/abacus_data_reader.cc b/paddle/fluid/train/custom_trainer/feed/dataset/abacus_data_reader.cc deleted file mode 100755 index 55ce6398..00000000 --- a/paddle/fluid/train/custom_trainer/feed/dataset/abacus_data_reader.cc +++ /dev/null @@ -1,76 +0,0 @@ -#include "paddle/fluid/train/custom_trainer/feed/dataset/data_reader.h" - -#include -#include - -#include -#include - -#include "paddle/fluid/train/custom_trainer/feed/io/file_system.h" - -namespace paddle { -namespace custom_trainer { -namespace feed { - -/* Parses plain-text feasigns in the Abacus format. - */ -class AbacusTextDataParser : public LineDataParser { -public: - AbacusTextDataParser() {} - virtual ~AbacusTextDataParser() {} - - virtual int parse_to_sample(const DataItem& data, SampleInstance& instance) const { - instance.id = data.id; - instance.labels.resize(1); - size_t len = data.data.size(); - const char* str = data.data.c_str(); - const char* line_end = str + len; - - char* cursor = NULL; - int show = (int)strtol(str, &cursor, 10); - str = cursor; - instance.labels[0] = (float)strtol(str, &cursor, 10); // click - str = cursor; - - while (*(str += paddle::string::count_nonspaces(str)) != 0) { - if (*str == '*') { - str++; - size_t len = paddle::string::count_nonspaces(str); - str += len; - } else if (*str == '$') { - str++; - CHECK(((int)strtol(str, &cursor, 10), cursor != str)) << " sample type parse err:" << str; - str = cursor; - } else if (*str == '#') { - str++; - break; - } else if (*str == '@') { - str++; - size_t len = paddle::string::count_nonspaces(str); - std::string all_str(str, str + len); - str += len; - } else { - FeatureItem feature_item; - feature_item.sign() = (uint64_t)strtoull(str, &cursor, 10); - if (cursor == str) { // FIXME: this case does not occur in abacus data - str++; - continue; - } - str = cursor; - CHECK(*str++ == ':'); - CHECK(!isspace(*str)); - CHECK((feature_item.slot() = (int) strtol(str, &cursor, 10), cursor != str)) << " format error: " << str; - str = cursor; - instance.features.emplace_back(feature_item); - } - } - VLOG(5) << "parse sample success, id:" << instance.id << ", fea_sum:" - << instance.features.size() << ", label:" << instance.labels[0]; - return 0; - } -}; -REGIST_CLASS(DataParser, AbacusTextDataParser); - -} // namespace feed -} // namespace custom_trainer -} // namespace paddle diff --git a/paddle/fluid/train/custom_trainer/feed/dataset/archive_data_reader.cc b/paddle/fluid/train/custom_trainer/feed/dataset/archive_data_reader.cc deleted file mode 100755 index a6c6e6e9..00000000 --- a/paddle/fluid/train/custom_trainer/feed/dataset/archive_data_reader.cc +++ /dev/null @@ -1,390 +0,0 @@ -#include "paddle/fluid/train/custom_trainer/feed/dataset/data_reader.h" - -#include -#include - -#include -#include - -#include "paddle/fluid/train/custom_trainer/feed/io/file_system.h" -#include "paddle/fluid/platform/timer.h" - -namespace paddle { -namespace custom_trainer { -namespace feed { - -/******************************** - * feasign compression format - * Case 1: slot : one hot sign - * |4b|4b|4b|4b|4b| 28b | - * |slot |0 |sign | - * Case 2: slot : n hot signs - * |4b|4b|4b|4b|4b|4b|4b|4b|32b*n| - * |slot |1 |0 |len |sign | - * Case 3: slot : one cold sign - * |4b|4b|4b|4b|4b|4b| 64b | - * |slot |2 |0 |sign | - * Case 4: slot : n cold signs - * |4b|4b|4b|4b|4b|4b|4b|4b|64b*n| - * |slot |3 |0 |len |sign | - ********************************/ -class ArchiveDataParse : public DataParser { -public: - static const uint8_t HOT_SIGN_SIZE = 4; - static const uint8_t COLD_SIGN_SIZE = 8; - -public: - ArchiveDataParse() {} - virtual ~ArchiveDataParse() {} - -private: - 
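// Worked example of the layout above: a hot slot holding 3 signs is Case 2,
// i.e. 2 bytes of slot id + 1 flag byte (0x10) + 1 len byte + 3 * 4 bytes of
// 28-bit signs = 16 bytes; a slot with a single cold sign is Case 3:
// 2 + 1 + 8 = 11 bytes. calc_compress_feas_lens() below reproduces exactly
// this arithmetic so the archive buffer can be sized before serializing.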
struct Record { - int show, clk; - std::string tags; - std::map> vec_feas; - int sample_type; - std::map> auc_category_info_map; //为细维度计算auc准备的数据 - std::vector hot_feas, cold_feas; //冷(int32_t)热(uint64_t)feasign - - void clear() { - show = 0; - clk = 0; - tags.clear(); - vec_feas.clear(); - sample_type = 0; - auc_category_info_map.clear(); - hot_feas.clear(); - cold_feas.clear(); - } - - uint32_t calc_compress_feas_lens() const { - uint32_t hot_len = hot_feas.size(); - uint32_t cold_len = cold_feas.size(); - uint32_t cursor = 0; - int32_t pre_slot = -1; - uint32_t k = 0; - //热编码 - if (hot_len > 0) { - pre_slot = hot_feas[0].slot(); - - for (uint32_t i = 0; i < hot_len + 1; ++i) { - if (i == hot_len || pre_slot != hot_feas[i].slot()) { - cursor += 2; - //情形2 - if (i - k > 1) { - cursor += 2; - } - //情形1/2 - cursor += (HOT_SIGN_SIZE * (i - k)); - k = i; - } - pre_slot = hot_feas[i].slot(); - } - } - //冷编码 - if (cold_len > 0) { - pre_slot = cold_feas[0].slot(); - k = 0; - - for (uint32_t i = 0; i < cold_len + 1; ++i) { - if (i == cold_len || pre_slot != cold_feas[i].slot()) { - cursor += 2; - //情形4 - if (i - k > 1) { - cursor += 2; - } else { //情形3 - cursor++; - } - //情形3/4 - cursor += (COLD_SIGN_SIZE * (i - k)); - k = i; - } - pre_slot = cold_feas[i].slot(); - } - } - return cursor; - } - - void serialize_to_compress_feas(char* buffer) const { - if (buffer == nullptr) { - return ; - } - uint32_t cursor = 0; - uint32_t hot_len = hot_feas.size(); - uint32_t cold_len = cold_feas.size(); - int32_t pre_slot = -1; - int32_t hot_sign; - uint16_t slot; - uint8_t flag = 0, len = 0; - uint32_t k = 0; - //热编码 - if (hot_len > 0) { - pre_slot = hot_feas[0].slot(); - - for (uint32_t i = 0; i < hot_len + 1; ++i) { - if (i == hot_len || pre_slot != hot_feas[i].slot()) { - memcpy(buffer + cursor, &pre_slot, 2); - cursor += 2; - //情形2 - if (i - k > 1) { - flag = 0x10; - memcpy(buffer + cursor, &flag, 1); - cursor++; - len = i - k; - memcpy(buffer + cursor, &len, 1); - cursor++; - } - //情形1/2 - for (uint32_t j = k; j < i; ++j) { - hot_sign = (int32_t) hot_feas[j].sign(); - for (uint8_t b = 0; b < HOT_SIGN_SIZE; ++b) { - flag = (hot_sign >> ((HOT_SIGN_SIZE - b - 1) * 8)) & 0xFF; - memcpy(buffer + cursor, &flag, 1); - cursor++; - } - } - k = i; - } - pre_slot = hot_feas[i].slot(); - } - } - //冷编码 - if (cold_len > 0) { - pre_slot = cold_feas[0].slot(); - k = 0; - - for (uint32_t i = 0; i < cold_len + 1; ++i) { - if (i == cold_len || pre_slot != cold_feas[i].slot()) { - memcpy(buffer + cursor, &pre_slot, 2); - cursor += 2; - //情形4 - if (i - k > 1) { - flag = 0x30; - memcpy(buffer + cursor, &flag, 1); - cursor++; - len = i - k; - memcpy(buffer + cursor, &len, 1); - cursor++; - } - //情形3/4 - for (uint32_t j = k; j < i; ++j) { - if (i - k == 1) { - flag = 0x20; - memcpy(buffer + cursor, &flag, 1); - cursor++; - } - memcpy(buffer + cursor, &cold_feas[j].sign(), COLD_SIGN_SIZE); - cursor += COLD_SIGN_SIZE; - } - k = i; - } - pre_slot = cold_feas[i].slot(); - } - } - } - }; - - void deserialize_feas_to_ins(char* buffer, uint32_t len, std::vector& ins) const { - if (buffer == nullptr) { - return ; - } - - uint32_t cursor = 0; - uint16_t slot; - uint8_t flag; - while (cursor < len) { - memcpy(&slot, buffer + cursor, 2); - cursor += 2; - - memcpy(&flag, buffer + cursor, 1); - flag &= 0xF0; - - CHECK(flag == 0x00 || flag == 0x10|| flag == 0x20 || flag == 0x30); - - if (flag == 0x00 || flag == 0x10) { - uint8_t len = 1; - if (flag == 0x10) { - cursor++; - memcpy(&len, buffer + cursor, 1); - cursor++; - } - for (uint8_t i = 
0; i < len; ++i) { - int32_t sign; - for (uint8_t j = 0; j < HOT_SIGN_SIZE; ++j) { - memcpy((char*)&sign + HOT_SIGN_SIZE-j-1, buffer + cursor, 1); - cursor++; - } - - uint64_t sign64 = sign & 0x0FFFFFFF; - sign64 = _index->index2sign((int32_t)sign64); - ins.emplace_back(sign64, slot); - } - } - - if (flag == 0x20 || flag == 0x30) { - uint8_t len = 1; - cursor++; - if (flag == 0x30) { - memcpy(&len, buffer + cursor, 1); - cursor++; - } - - for (uint8_t i = 0; i < len; ++i) { - uint64_t sign64; - memcpy(&sign64, buffer + cursor, COLD_SIGN_SIZE); - cursor += COLD_SIGN_SIZE; - ins.emplace_back(sign64, slot); - } - } - } - } - -public: - virtual int initialize(const YAML::Node& config, std::shared_ptr context) { - _index = context->cache_dict; - - return 0; - } - - virtual int parse(const char* str, size_t len, DataItem& data) const { - size_t pos = paddle::string::count_nonspaces(str); - if (pos >= len) { - VLOG(2) << "fail to parse line: " << std::string(str, len) << ", strlen: " << len; - return -1; - } - VLOG(5) << "getline: " << str << " , pos: " << pos << ", len: " << len; - data.id.assign(str, pos); - str += pos; - - static thread_local std::vector vec_feas; - static thread_local Record rec; - rec.clear(); - - const char* line_end = str + len; - char* cursor = NULL; - CHECK((rec.show = (int)strtol(str, &cursor, 10), cursor != str)); - str = cursor; - CHECK((rec.clk = (int)strtol(str, &cursor, 10), cursor != str)); - str = cursor; - CHECK(rec.show >= 1 && rec.clk >= 0 && rec.clk <= rec.show); - - while (*(str += paddle::string::count_nonspaces(str)) != 0) { - if (*str == '*') { - str++; - size_t len = paddle::string::count_nonspaces(str); - std::string tag(str, str + len); - rec.tags = tag; - str += len; - } else if (*str == '$') { - str++; - CHECK((rec.sample_type = (int)strtol(str, &cursor, 10), cursor != str))<<" sample type parse err:" << str; - str = cursor; - } else if (*str == '#') { - str++; - size_t len = std::find_if_not(str, line_end, - [](char c) { return std::isalnum(c) != 0 || c == '_';}) - str; - CHECK(len > 0 && *(str + len) == ':'); - std::string name(str, len); - str += len; - vec_feas.clear(); - while (*str == ':') { - float val = 0; - CHECK((val = strtof(str + 1, &cursor), cursor > str)); - vec_feas.push_back(val); - str = cursor; - } - CHECK(rec.vec_feas.insert({name, vec_feas}).second); - } else if (*str == '@') { - str++; - size_t len = paddle::string::count_nonspaces(str); - std::string all_str(str, str + len); - str += len; - //category_name1=value1,value2,value3|category_name2=value1,value2|.... 
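// e.g. (illustrative values) "@sex=male|age=20,30" parses into
// auc_category_info_map = { "sex": ["male"], "age": ["20", "30"] },
// which later lets AUC be computed per category bucket.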
- std::vector all_category_vec = paddle::string::split_string(all_str, "|"); - for (size_t i = 0; i < all_category_vec.size(); ++i) { - std::string& single_category_str = all_category_vec[i]; - std::vector str_vec = paddle::string::split_string(single_category_str, "="); - CHECK(str_vec.size() == 2); - std::string category_name = str_vec[0]; - std::vector category_info_vec = paddle::string::split_string(str_vec[1], ","); - CHECK(category_info_vec.size() > 0); - - CHECK(rec.auc_category_info_map.insert({category_name, category_info_vec}).second); - } - } else { - uint64_t sign = 0; - int slot = -1; - sign = (uint64_t)strtoull(str, &cursor, 10); - if (cursor == str) { //FIXME abacus没有这种情况 - str++; - continue; - } - //CHECK((sign = (uint64_t)strtoull(str, &cursor, 10), cursor != str)); - str = cursor; - CHECK(*str++ == ':'); - CHECK(!isspace(*str)); - CHECK((slot = (int) strtol(str, &cursor, 10), cursor != str)) << " format error: " << str; - CHECK((uint16_t) slot == slot); - str = cursor; - - int32_t compress_sign = _index->sign2index(sign); - if (compress_sign < 0) { - rec.cold_feas.emplace_back(sign, (uint16_t)slot); - } else { - rec.hot_feas.emplace_back(compress_sign, (uint16_t)slot); - } - } - } - - paddle::framework::BinaryArchive bar; - bar << rec.show << rec.clk << rec.tags << rec.vec_feas << rec.sample_type << rec.auc_category_info_map; - uint32_t feas_len = rec.calc_compress_feas_lens(); //事先计算好压缩后feasign的空间 - bar << feas_len; - bar.Resize(bar.Length() + feas_len); - rec.serialize_to_compress_feas(bar.Finish() - feas_len); //直接在archive内部buffer进行压缩,避免不必要的拷贝 - data.data.assign(bar.Buffer(), bar.Length()); //TODO 这一步拷贝是否也能避免 - - return 0; - } - - virtual int parse_to_sample(const DataItem& data, SampleInstance& instance) const { - instance.id = data.id; - if (data.data.empty()) { - return -1; - } - - int show = 0, clk = 0; - std::string tags; - std::map> vec_feas; - int sample_type = 0; - std::map> auc_category_info_map; - uint32_t feas_len = 0; - - paddle::framework::BinaryArchive bar; - bar.SetReadBuffer(const_cast(&data.data[0]), data.data.size(), nullptr); - bar >> show; - bar >> clk; - bar >> tags; - bar >> vec_feas; - bar >> sample_type; - bar >> auc_category_info_map; - bar >> feas_len; - CHECK((bar.Finish() - bar.Cursor()) == feas_len); - - deserialize_feas_to_ins(bar.Cursor(), feas_len, instance.features); - - instance.labels.resize(1); - instance.labels[0] = clk; - - return 0; - } - -private: - std::shared_ptr _index; - -}; -REGIST_CLASS(DataParser, ArchiveDataParse); - -} -} -} \ No newline at end of file diff --git a/paddle/fluid/train/custom_trainer/feed/dataset/data_reader.cc b/paddle/fluid/train/custom_trainer/feed/dataset/data_reader.cc deleted file mode 100755 index 3ceac7e2..00000000 --- a/paddle/fluid/train/custom_trainer/feed/dataset/data_reader.cc +++ /dev/null @@ -1,194 +0,0 @@ -#include "paddle/fluid/train/custom_trainer/feed/dataset/data_reader.h" - -#include -#include - -#include -#include - -#include "paddle/fluid/train/custom_trainer/feed/io/file_system.h" - -namespace paddle { -namespace custom_trainer { -namespace feed { - -int LineDataParser::parse(const char* str, size_t len, DataItem& data) const { - size_t pos = 0; - while (pos < len && str[pos] != ' ') { - ++pos; - } - if (pos >= len) { - VLOG(2) << "fail to parse line: " << std::string(str, len) << ", strlen: " << len; - return -1; - } - VLOG(5) << "getline: " << str << " , pos: " << pos << ", len: " << len; - data.id.assign(str, pos); - data.data.assign(str + pos + 1, len - pos - 1); - return 0; -} 
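// Illustrative behaviour of the parser above: for the line
// "ins_20191111 1 0 12345:678" (id, then payload) it splits on the first
// space only, yielding data.id == "ins_20191111" and
// data.data == "1 0 12345:678"; a line without any space is rejected with -1.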
-REGIST_CLASS(DataParser, LineDataParser); - -int DataReader::initialize(const YAML::Node& config, std::shared_ptr context) { - _parser.reset(CREATE_INSTANCE(DataParser, config["parser"]["class"].as())); - if (_parser == nullptr) { - VLOG(2) << "fail to get parser: " << config["parser"]["class"].as(); - return -1; - } - if (_parser->initialize(config["parser"], context) != 0) { - VLOG(2) << "fail to initialize parser" << config["parser"]["class"].as(); - return -1; - } - _pipeline_cmd = config["pipeline_cmd"].as(); - return 0; -} - -class LineDataReader : public DataReader { -public: - LineDataReader() {} - virtual ~LineDataReader() {} - virtual int initialize(const YAML::Node& config, std::shared_ptr context) { - if (DataReader::initialize(config, context) != 0) { - return -1; - } - _done_file_name = config["done_file"].as(); - _filename_prefix = config["filename_prefix"].as(""); - - if (config["file_system"] && config["file_system"]["class"]) { - _file_system.reset( - CREATE_INSTANCE(FileSystem, config["file_system"]["class"].as())); - if (_file_system == nullptr || - _file_system->initialize(config["file_system"], context) != 0) { - VLOG(2) << "fail to create class: " - << config["file_system"]["class"].as(); - return -1; - } - } else if (context->file_system != nullptr) { - _file_system = context->file_system; - } else { - _file_system.reset(CREATE_INSTANCE(FileSystem, "LocalFileSystem")); - if (_file_system == nullptr || _file_system->initialize(YAML::Load(""), context) != 0) { - VLOG(2) << "fail to init file system"; - return -1; - } - } - return 0; - } - - //判断样本数据是否已就绪,就绪表明可以开始download - virtual bool is_data_ready(const std::string& data_dir) { - auto done_file_path = _file_system->path_join(data_dir, _done_file_name); - if (_file_system->exists(done_file_path)) { - return true; - } - return false; - } - - virtual std::vector data_file_list(const std::string& data_dir) { - std::vector data_files; - for (auto& filepath : _file_system->list(data_dir)) { - auto filename = _file_system->path_split(filepath).second; - if (filename != _done_file_name && - string::begin_with(filename, _filename_prefix)) { - data_files.push_back(std::move(filepath)); - } - } - return data_files; - } - - //读取数据样本流中 - virtual int read_all(const std::string& data_dir, framework::Channel data_channel) { - auto file_list = data_file_list(data_dir); - return read_all(file_list, data_channel); - } - virtual int read_all(const std::vector& file_list, ::paddle::framework::Channel data_channel) { - data_channel->Open(); - const int file_list_size = file_list.size(); - std::atomic is_failed(false); - - const int max_threads = omp_get_max_threads(); - std::vector> writers; // writer is not thread safe - writers.reserve(max_threads); - for (int i = 0; i < max_threads; ++i) { - writers.emplace_back(data_channel.get()); - } - VLOG(5) << "file_list: " << string::join_strings(file_list, ' '); - #pragma omp parallel for - for (int i = 0; i < file_list_size; ++i) { - if (is_failed) { - continue; - } - const int thread_num = omp_get_thread_num(); - framework::ChannelWriter *writer = nullptr; - if (thread_num < max_threads) { - writer = &writers[thread_num]; - } - const auto& filepath = file_list[i]; - std::shared_ptr fin = _file_system->open_read(filepath, _pipeline_cmd); - if (fin == nullptr) { - VLOG(2) << "fail to open file: " << filepath << ", with cmd: " << _pipeline_cmd; - is_failed = true; - continue; - } - char *buffer = nullptr; - size_t buffer_size = 0; - ssize_t line_len = 0; - while ((line_len = getline(&buffer, 
&buffer_size, fin.get())) != -1) { - // 去掉行尾回车 - if (line_len > 0 && buffer[line_len - 1] == '\n') { - buffer[--line_len] = '\0'; - } - // 忽略空行 - if (line_len <= 0) { - continue; - } - DataItem data_item; - if (_parser->parse(buffer, line_len, data_item) == 0) { - VLOG(5) << "parse data: " << data_item.id << " " << data_item.data << ", filename: " << filepath << ", thread_num: " << thread_num << ", max_threads: " << max_threads; - if (writer == nullptr) { - if (!data_channel->Put(std::move(data_item))) { - LOG(WARNING) << "fail to put data, thread_num: " << thread_num; - is_failed = true; - } - } else { - (*writer) << std::move(data_item); - } - } - } - if (buffer != nullptr) { - free(buffer); - buffer = nullptr; - buffer_size = 0; - } - if (ferror(fin.get()) != 0) { - VLOG(2) << "fail to read file: " << filepath; - is_failed = true; - continue; - } - } - // omp end - - for (int i = 0; i < max_threads; ++i) { - writers[i].Flush(); - if (!writers[i]) { - VLOG(2) << "writer " << i << " is failed"; - is_failed = true; - } - } - data_channel->Close(); - return is_failed ? -1 : 0; - } - - virtual const DataParser* get_parser() { - return _parser.get(); - } - -private: - std::string _done_file_name; // without data_dir - std::string _filename_prefix; - std::shared_ptr _file_system; -}; -REGIST_CLASS(DataReader, LineDataReader); - -} // namespace feed -} // namespace custom_trainer -} // namespace paddle diff --git a/paddle/fluid/train/custom_trainer/feed/dataset/data_reader.h b/paddle/fluid/train/custom_trainer/feed/dataset/data_reader.h deleted file mode 100755 index b190f5ea..00000000 --- a/paddle/fluid/train/custom_trainer/feed/dataset/data_reader.h +++ /dev/null @@ -1,140 +0,0 @@ -/* DataReader - * 对指定数据的读取 - */ -#pragma once -#include -#include -#include -#include -#include -#include -#include "paddle/fluid/framework/channel.h" -#include "paddle/fluid/train/custom_trainer/feed/common/pipeline.h" -#include "paddle/fluid/framework/archive.h" -#include "paddle/fluid/string/string_helper.h" -#include "paddle/fluid/train/custom_trainer/feed/common/registerer.h" - -namespace paddle { -namespace custom_trainer { -namespace feed { - -class TrainerContext; - -struct FeatureItem { - std::vector weights; - std::vector gradients; -public: - FeatureItem() { - } - FeatureItem(uint64_t sign_, uint16_t slot_) { - sign() = sign_; - slot() = slot_; - } - uint64_t& sign() { - return *(uint64_t*)sign_buffer(); - } - const uint64_t& sign() const { - return *(const uint64_t*)sign_buffer(); - } - uint16_t& slot() { - return _slot; - } - const uint16_t& slot() const { - return _slot; - } - -private: - char _sign[sizeof(uint64_t)]; - uint16_t _slot; - - char* sign_buffer() const { - return (char*)_sign; - } -}; - -struct SampleInstance { - std::string id; - std::vector predicts; - std::vector labels; - std::vector features; - std::vector embedx; -}; - -class DataItem { -public: - DataItem() {} - virtual ~DataItem() {} - std::string id; //样本id标识,可用于shuffle - std::string data;//样本数据, maybe压缩格式 -}; - -template -paddle::framework::Archive& operator>>(paddle::framework::Archive& ar, DataItem& x) { - return ar >> x.id >> x.data; -} - -template -paddle::framework::Archive& operator<<(paddle::framework::Archive& ar, const DataItem& x) { - return ar << x.id << x.data; -} - -typedef std::shared_ptr> SampleInstancePipe; -inline SampleInstancePipe make_sample_instance_channel() { - return std::make_shared>(); -} - -class DataParser { -public: - DataParser() {} - virtual ~DataParser() {} - virtual int initialize(const 
YAML::Node& config, std::shared_ptr<TrainerContext> context) = 0; - virtual int parse(const std::string& str, DataItem& data) const { - return parse(str.c_str(), str.size(), data); - } - virtual int parse(const char* str, size_t len, DataItem& data) const = 0; - virtual int parse_to_sample(const DataItem& data, SampleInstance& instance) const = 0; -}; -REGIST_REGISTERER(DataParser); - -class DataReader { -public: - DataReader() {} - virtual ~DataReader() {} - virtual int initialize(const YAML::Node& config, std::shared_ptr<TrainerContext> context); - // whether the sample data is ready; ready means downloading may start - virtual bool is_data_ready(const std::string& data_dir) = 0; - // list the data files under the directory - virtual std::vector<std::string> data_file_list(const std::string& data_dir) = 0; - // read the data under a directory into the sample channel - virtual int read_all(const std::string& data_dir, ::paddle::framework::Channel<DataItem> data_channel) = 0; - // read the data of the given file list into the sample channel - virtual int read_all(const std::vector<std::string>& data_list, ::paddle::framework::Channel<DataItem> data_channel) = 0; - virtual const DataParser* get_parser() { - return _parser.get(); - } -protected: - std::shared_ptr<DataParser> _parser; // data format conversion - std::string _pipeline_cmd; // the file stream is piped through pipeline_cmd before being read in -}; -REGIST_REGISTERER(DataReader); - -class LineDataParser : public DataParser { -public: - LineDataParser() {} - - virtual ~LineDataParser() {} - - virtual int initialize(const YAML::Node& config, std::shared_ptr<TrainerContext> context) { - return 0; - } - - virtual int parse(const char* str, size_t len, DataItem& data) const; - - virtual int parse_to_sample(const DataItem& data, SampleInstance& instance) const { - return 0; - } -}; - -}//namespace feed -}//namespace custom_trainer -}//namespace paddle diff --git a/paddle/fluid/train/custom_trainer/feed/dataset/dataset.cc b/paddle/fluid/train/custom_trainer/feed/dataset/dataset.cc deleted file mode 100644 index a4081c0f..00000000 --- a/paddle/fluid/train/custom_trainer/feed/dataset/dataset.cc +++ /dev/null @@ -1,80 +0,0 @@ -#include "paddle/fluid/train/custom_trainer/feed/dataset/dataset.h" - -namespace paddle { -namespace custom_trainer { -namespace feed { - -int Dataset::initialize( - const YAML::Node& config, std::shared_ptr<TrainerContext> context) { - if (config["data_list"].Type() != YAML::NodeType::Map) { - LOG(FATAL) << "missing data_list config in dataset, or wrong node type; please check"; - return -1; - } - for (auto& data_config : config["data_list"]) { - std::string name = data_config.first.as<std::string>(); - auto data_ptr = std::make_shared<DatasetContainer>(); - if (data_ptr->initialize(data_config.second, context) != 0) { - LOG(FATAL) << "dataset initialize failed, name:" << name; - return -1; - } - _data_containers[name] = data_ptr; - } - return 0; -} - -inline void Dataset::pre_detect_data(uint64_t epoch_id) { - for (auto it = _data_containers.begin(); it != _data_containers.end(); ++it) { - it->second->pre_detect_data(epoch_id); - } - return; -} -inline void Dataset::pre_detect_data( - const std::string& data_name, uint64_t epoch_id) { - _data_containers[data_name]->pre_detect_data(epoch_id); - return; -} - -inline DatasetStatus Dataset::epoch_data_status(uint64_t epoch_id) { - int status = static_cast<int>(DatasetStatus::Ready); - for (auto it = _data_containers.begin(); it != _data_containers.end(); ++it) { - auto d_status = static_cast<int>(it->second->epoch_data_status(epoch_id)); - status = d_status < status ? 
d_status : status; - } - return static_cast<DatasetStatus>(status); -} - -inline DatasetStatus Dataset::epoch_data_status( - const std::string& data_name, uint64_t epoch_id) { - return _data_containers[data_name]->epoch_data_status(epoch_id); -} - -inline std::vector<std::string> Dataset::epoch_data_path( - const std::string& data_name, uint64_t epoch_id) { - return _data_containers[data_name]->epoch_data_path(epoch_id); -} - -inline std::vector<std::string> Dataset::epoch_data_path(uint64_t epoch_id) { - std::vector<std::string> results; - for (auto it = _data_containers.begin(); it != _data_containers.end(); ++it) { - auto items = std::move(it->second->epoch_data_path(epoch_id)); - for (auto& item : items) { - results.emplace_back(item); - } - } - return results; -} - -inline ::paddle::framework::Channel<DataItem> Dataset::fetch_data( - const std::string& data_name, uint64_t epoch_id) { - return _data_containers[data_name]->fetch(epoch_id); -} - -inline const DataParser* Dataset::data_parser(const std::string& data_name) { - auto* data_container = _data_containers[data_name].get(); - return data_container->data_parser(); -} - - -} // namespace feed -} // namespace custom_trainer -} // namespace paddle diff --git a/paddle/fluid/train/custom_trainer/feed/dataset/dataset.h b/paddle/fluid/train/custom_trainer/feed/dataset/dataset.h deleted file mode 100644 index eeda3752..00000000 --- a/paddle/fluid/train/custom_trainer/feed/dataset/dataset.h +++ /dev/null @@ -1,49 +0,0 @@ -#pragma once -#include -#include -#include -#include -#include -#include "paddle/fluid/framework/channel.h" -#include "paddle/fluid/train/custom_trainer/feed/trainer_context.h" -#include "paddle/fluid/train/custom_trainer/feed/common/registerer.h" -#include "paddle/fluid/train/custom_trainer/feed/dataset/dataset_container.h" - -namespace paddle { -namespace custom_trainer { -namespace feed { - -class Dataset { -public: - Dataset() {} - virtual ~Dataset() {} - - virtual int initialize( - const YAML::Node& config, std::shared_ptr<TrainerContext> context); - - // trigger detection of prefetchable data - virtual void pre_detect_data(uint64_t epoch_id); - virtual void pre_detect_data(const std::string& data_name, uint64_t epoch_id); - - // query data status - virtual DatasetStatus epoch_data_status(uint64_t epoch_id); - virtual DatasetStatus epoch_data_status(const std::string& data_name, uint64_t epoch_id); - - // query data paths - virtual std::vector<std::string> epoch_data_path(uint64_t epoch_id); - virtual std::vector<std::string> epoch_data_path(const std::string& data_name, uint64_t epoch_id); - - // return the raw data held by each DataContainer (possibly compressed) - virtual ::paddle::framework::Channel<DataItem> fetch_data( - const std::string& data_name, uint64_t epoch_id); - - // get the DataItem parser - virtual const DataParser* data_parser(const std::string& data_name); - -private: - std::unordered_map<std::string, std::shared_ptr<DatasetContainer>> _data_containers; -}; - -} // namespace feed -} // namespace custom_trainer
} // namespace paddle diff --git a/paddle/fluid/train/custom_trainer/feed/dataset/dataset_container.cc b/paddle/fluid/train/custom_trainer/feed/dataset/dataset_container.cc deleted file mode 100755 index 389c449a..00000000 --- a/paddle/fluid/train/custom_trainer/feed/dataset/dataset_container.cc +++ /dev/null @@ -1,199 +0,0 @@ -/* DatasetContainer - * Holds the samples of one data source and drives their asynchronous loading. - */ -#include -#include -#include -#include -#include -#include "paddle/fluid/string/string_helper.h" -#include "paddle/fluid/train/custom_trainer/feed/trainer_context.h" -#include "paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.h" -#include "paddle/fluid/train/custom_trainer/feed/io/file_system.h" -#include 
"paddle/fluid/train/custom_trainer/feed/dataset/dataset_container.h" -#include "paddle/fluid/train/custom_trainer/feed/shuffler/shuffler.h" - -namespace paddle { -namespace custom_trainer { -namespace feed { - -int DatasetContainer::initialize( - const YAML::Node& config, std::shared_ptr context) { - _dataset_config = config; - _trainer_context = context.get(); - //预取n轮样本数据 - _prefetch_num = config["prefetch_num"].as(); - _dataset_list.resize(_prefetch_num); - for (int i = 0; i < _prefetch_num; ++i) { - _dataset_list[i].reset(new DatasetInfo); - } - - _data_root_paths = config["root_path"].as>(); - _data_split_interval = config["data_spit_interval"].as(); - _data_path_formater = config["data_path_formater"].as(); - std::string shuffler = config["shuffler"]["class"].as(); - _shuffler.reset(CREATE_INSTANCE(Shuffler, shuffler)); - _shuffler->initialize(config, context); - std::string data_reader_class = config["data_reader"].as(); - DataReader* data_reader = CREATE_INSTANCE(DataReader, data_reader_class); - _data_reader.reset(data_reader); - return _data_reader->initialize(config, context); -} - -std::shared_ptr DatasetContainer::dataset(uint64_t timestamp) { - auto* epoch_accessor = _trainer_context->epoch_accessor.get(); - auto data_idx = timestamp / epoch_accessor->epoch_time_interval(); - return _dataset_list[data_idx % _prefetch_num]; -} -std::vector DatasetContainer::epoch_data_path(uint64_t epoch_id) { - std::vector results; - auto* epoch_accessor = _trainer_context->epoch_accessor.get(); - time_t timestamp = epoch_accessor->epoch_timestamp(epoch_id); - size_t data_num = data_num_for_train(timestamp, epoch_accessor->epoch_time_interval(), _data_split_interval); - uint64_t data_timestamp = timestamp % _data_split_interval == 0 ? timestamp : (timestamp / _data_split_interval + 1) * _data_split_interval; - for (int i = 0; i < _data_root_paths.size(); ++i) { - for (int j = 0; j < data_num; ++j) { - std::string path_suffix = format_timestamp(data_timestamp + j * _data_split_interval, _data_path_formater); - std::string data_dir = _trainer_context->file_system->path_join(_data_root_paths[i], path_suffix); - results.emplace_back(data_dir); - } - } - return results; -} - -void DatasetContainer::pre_detect_data(uint64_t epoch_id) { - int status = 0; - auto* epoch_accessor = _trainer_context->epoch_accessor.get(); - time_t timestamp = epoch_accessor->epoch_timestamp(epoch_id); - if (timestamp % epoch_accessor->epoch_time_interval() != 0) { - LOG(FATAL) << "timestamp:" << timestamp << " don't match interval:" << epoch_accessor->epoch_time_interval(); - return; - } - if (_downloader_thread == nullptr) { - _downloader_thread.reset(new std::thread([this, timestamp](){ - async_download_data(timestamp); - })); - } - for (int detect_idx = 0 ; detect_idx < _prefetch_num; ++detect_idx, ++epoch_id) { - if (DatasetStatus::Empty != data_status(timestamp)) { - continue; - } - size_t data_num = data_num_for_train(timestamp, epoch_accessor->epoch_time_interval(), _data_split_interval); - uint64_t data_timestamp = timestamp % _data_split_interval == 0 ? 
timestamp : (timestamp / _data_split_interval + 1) * _data_split_interval; - std::vector data_path_list; - for (int i = 0; i < _data_root_paths.size() && status == 0; ++i) { - for (int j = 0; j < data_num && status == 0; ++j) { - std::string path_suffix = format_timestamp(data_timestamp + j * _data_split_interval, _data_path_formater); - std::string data_dir = _trainer_context->file_system->path_join(_data_root_paths[i], path_suffix); - status = read_data_list(data_dir, data_path_list); - } - } - if (status == 0) { - auto dataset_info = dataset(timestamp); - dataset_info->timestamp = timestamp; - dataset_info->file_path_list = std::move(data_path_list); - dataset_info->status = DatasetStatus::Detected; - VLOG(2) << epoch_accessor->text(epoch_id) << ", data is detected"; - } - timestamp += epoch_accessor->epoch_time_interval(); - } - return; -} - -int DatasetContainer::read_data_list(const std::string& data_dir, std::vector& data_list) { - auto* environment = _trainer_context->environment.get(); - - // 检查数据Ready - int data_status = -1; - if (environment->is_master_node(EnvironmentRole::WORKER)) { - if (_data_reader->is_data_ready(data_dir)) { - data_status = 0; - } - } - paddle::framework::BinaryArchive ar; - ar << data_status; - environment->bcast(ar, 0, EnvironmentRole::WORKER); - ar >> data_status; - if (data_status != 0) { - return -1; - } - - // 读取文件列表 - ar.Clear(); - std::vector data_path_list; - if (environment->is_master_node(EnvironmentRole::WORKER)) { - data_path_list = _data_reader->data_file_list(data_dir); - ar << data_path_list; - } - environment->bcast(ar, 0, EnvironmentRole::WORKER); - ar >> data_path_list; - auto worker_id = environment->rank_id(EnvironmentRole::WORKER); - auto worker_num = environment->node_num(EnvironmentRole::WORKER); - for (int i = worker_id; i < data_path_list.size(); i+=worker_num) { - data_list.push_back(data_path_list[i]); - } - environment->barrier(EnvironmentRole::WORKER); - return 0; -} - -DatasetStatus DatasetContainer::epoch_data_status(uint64_t epoch_id) { - auto* epoch_accessor = _trainer_context->epoch_accessor.get(); - time_t timestamp = epoch_accessor->epoch_timestamp(epoch_id); - return data_status(timestamp); -} - -DatasetStatus DatasetContainer::data_status(uint64_t timestamp) { - auto dataset_info = dataset(timestamp); - if (dataset_info->timestamp != timestamp) { - return DatasetStatus::Empty; - } - return dataset_info->status; -} - -paddle::framework::Channel DatasetContainer::fetch(uint64_t epoch_id) { - paddle::framework::Channel result; - auto* epoch_accessor = _trainer_context->epoch_accessor.get(); - time_t timestamp = epoch_accessor->epoch_timestamp(epoch_id); - if (data_status(timestamp) != DatasetStatus::Ready) { - return result; - } - auto dataset_info = dataset(timestamp); - return dataset_info->data_channel; -} - -void DatasetContainer::async_download_data(uint64_t start_timestamp) { - auto* epoch_accessor = _trainer_context->epoch_accessor.get(); - if (start_timestamp % epoch_accessor->epoch_time_interval() != 0) { - LOG(FATAL) << "timestamp:" << start_timestamp << " don't match interval:" << epoch_accessor->epoch_time_interval(); - return; - } - while (!_stop_download) { - auto dataset_info = dataset(start_timestamp); - while (data_status(start_timestamp) == DatasetStatus::Empty) { - sleep(30); - } - dataset_info->status = DatasetStatus::Downloding; - - VLOG(2) << "Start download data, data_timestap:" << start_timestamp - << ", for epoch:" << epoch_accessor->text(start_timestamp); - const auto& file_list = 
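-        // By this point read_data_list() has already sharded the detected files
-        // round-robin over workers (i = worker_id; i += worker_num), so this
-        // list holds only the local share: with 2 workers and files f0..f3,
-        // worker 0 downloads {f0, f2} and worker 1 downloads {f1, f3}.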
dataset_info->file_path_list; - dataset_info->data_channel->Clear(); - while (_data_reader->read_all(file_list, dataset_info->data_channel) != 0) { - dataset_info->data_channel->Clear(); - VLOG(0) << "Failed download data, data_timestap:" << start_timestamp - << ", for epoch:" << epoch_accessor->text(start_timestamp) << ", Retry it"; - sleep(30); - } - VLOG(2) << "End download data num:" << dataset_info->data_channel->Size() - << ", data_timestap:" << start_timestamp - << ", for epoch:" << epoch_accessor->text(start_timestamp) << ", Start shuffle"; - _shuffler->shuffle(dataset_info->data_channel); - VLOG(2) << "Shuffle done"; - dataset_info->status = DatasetStatus::Ready; - start_timestamp += epoch_accessor->epoch_time_interval(); - } -} - -} // namespace feed -} // namespace custom_trainer -} // namespace paddle diff --git a/paddle/fluid/train/custom_trainer/feed/dataset/dataset_container.h b/paddle/fluid/train/custom_trainer/feed/dataset/dataset_container.h deleted file mode 100644 index aaf76839..00000000 --- a/paddle/fluid/train/custom_trainer/feed/dataset/dataset_container.h +++ /dev/null @@ -1,89 +0,0 @@ -/* DatasetContainer - * 保存一个数据源的样本,并驱动样本的异步加载 - */ -#pragma once -#include -#include -#include -#include -#include -#include "paddle/fluid/framework/channel.h" -#include "paddle/fluid/framework/data_set.h" -#include "paddle/fluid/train/custom_trainer/feed/dataset/data_reader.h" -#include "paddle/fluid/train/custom_trainer/feed/common/runtime_environment.h" - -namespace paddle { -namespace custom_trainer { -namespace feed { - -class Shuffler; - -inline int data_num_for_train(uint64_t train_begin_timestamp, uint32_t train_time_interval, uint32_t data_time_interval) { - uint64_t data_begin_time = train_begin_timestamp; - uint64_t data_end_time = data_begin_time + train_time_interval; - uint64_t end_idx = (data_end_time + data_time_interval - 1) / data_time_interval; - uint64_t begin_idx = (data_begin_time + data_time_interval - 1 ) / data_time_interval; - return end_idx - begin_idx; -} - -enum class DatasetStatus { - Empty = 0, - Detected = 1, - Downloding = 2, - Ready = 3 -}; - -struct DatasetInfo { - uint64_t timestamp = 0; - std::vector file_path_list; - DatasetStatus status = DatasetStatus::Empty; - ::paddle::framework::Channel data_channel = ::paddle::framework::MakeChannel(); -}; - -class DatasetContainer { -public: - DatasetContainer() {} - virtual ~DatasetContainer() { - if (_downloader_thread != nullptr) { - _stop_download = true; - _downloader_thread->join(); - } - } - virtual int initialize( - const YAML::Node& config, std::shared_ptr context); - // 触发可预取的数据判断 - virtual void pre_detect_data(uint64_t epoch_id); - // 获取epoch对应的样本数据目录 - std::vector epoch_data_path(uint64_t epoch_id); - // 获取数据状态 - virtual DatasetStatus epoch_data_status(uint64_t epoch_id); - // 获取特定epoch_i样本,如果数据未ready,Channel内为空指针 - virtual ::paddle::framework::Channel fetch(uint64_t epoch_id); - // 获取DataItem解析器 - virtual const DataParser* data_parser() { - return _data_reader->get_parser(); - } -protected: - virtual DatasetStatus data_status(uint64_t timestamp); - virtual int read_data_list(const std::string& data_dir, std::vector& data_list); - // 异步样本download - virtual void async_download_data(uint64_t start_timestamp); - virtual std::shared_ptr dataset(uint64_t timestamp); - - int _prefetch_num = 0; - bool _stop_download = false; - int _data_split_interval = 60; //样本切分周期(秒) - YAML::Node _dataset_config; - std::string _data_path_formater; - std::vector _data_root_paths; //支持同时读取多个目录 - - TrainerContext* 
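-    // _dataset_list below is a fixed-size prefetch ring: dataset(timestamp)
-    // resolves to _dataset_list[(timestamp / epoch_time_interval) % _prefetch_num],
-    // so at most _prefetch_num epochs are staged at once and a slot is reused
-    // once training moves past its epoch.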
_trainer_context; - std::shared_ptr _shuffler; - std::shared_ptr _data_reader; - std::shared_ptr _downloader_thread; - std::vector> _dataset_list;//预取的数据列表 -}; - -}//namespace feed -}//namespace custom_trainer -}//namespace paddle diff --git a/paddle/fluid/train/custom_trainer/feed/executor/executor.cc b/paddle/fluid/train/custom_trainer/feed/executor/executor.cc deleted file mode 100644 index 246d4a36..00000000 --- a/paddle/fluid/train/custom_trainer/feed/executor/executor.cc +++ /dev/null @@ -1,126 +0,0 @@ -#include -#include "paddle/fluid/train/custom_trainer/feed/executor/executor.h" - -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/platform/init.h" -#include "paddle/fluid/platform/cpu_helper.h" -#include "paddle/fluid/inference/api/details/reset_tensor_array.h" -#include "paddle/fluid/platform/enforce.h" - - -namespace paddle { -namespace custom_trainer { -namespace feed { - -namespace { - -int ReadBinaryFile(const std::string& filename, std::string* contents) { - std::ifstream fin(filename, std::ios::in | std::ios::binary); - if (!fin) { - LOG(FATAL) << "Cannot open file " << filename; - return -1; - } - fin.seekg(0, std::ios::end); - contents->clear(); - contents->resize(fin.tellg()); - fin.seekg(0, std::ios::beg); - fin.read(&(contents->at(0)), contents->size()); - fin.close(); - return 0; -} - -std::unique_ptr Load( - paddle::framework::Executor* /*executor*/, const std::string& model_filename) { - LOG(INFO) << "loading model from " << model_filename; - std::string program_desc_str; - if (ReadBinaryFile(model_filename, &program_desc_str) != 0) { - return nullptr; - } - std::unique_ptr main_program( - new paddle::framework::ProgramDesc(program_desc_str)); - return main_program; -} - -} - - -class SimpleExecutor : public Executor { -public: - SimpleExecutor() {}; - virtual ~SimpleExecutor() {}; - virtual int initialize(YAML::Node exe_config, - std::shared_ptr context_ptr) { - paddle::framework::InitDevices(false); - //if (exe_config["num_threads"]) { - - //} - paddle::platform::SetNumThreads(1); - std::string name = exe_config["name"].as(); - std::string main_program = YamlHelper::get_with_default(exe_config, "main_program", - string::format_string("./model/%s/main_program", name.c_str())); - std::string startup_program = YamlHelper::get_with_default(exe_config, "startup_program", - string::format_string("./model/%s/startup_program", name.c_str())); - try { - _context.reset(new SimpleExecutor::Context(context_ptr->cpu_place)); - _context->startup_program = Load(&_context->executor, startup_program); - if (_context->startup_program == nullptr) { - VLOG(0) << "fail to load startup_program: " << startup_program; - return -1; - } - _context->main_program = Load(&_context->executor, main_program); - if (_context->main_program == nullptr) { - VLOG(0) << "fail to load main_program: " << main_program; - return -1; - } - _context->prepare_context = _context->executor.Prepare(*_context->main_program, 0); - } catch (::paddle::platform::EnforceNotMet& err) { - VLOG(0) << err.what(); - _context.reset(nullptr); - return -1; - } - return 0; - } - virtual int initialize_scope(::paddle::framework::Scope* scope) { - _context->executor.Run(*_context->startup_program, scope, 0, false, true); - _context->executor.CreateVariables(*_context->main_program, scope, 0); - return 0; - } - virtual int run(::paddle::framework::Scope* scope) { - if (_context == nullptr) { - VLOG(2) << "need initialize before run"; - return -1; - } - try { - 
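-            // Both flags below are false because initialize_scope() already ran
-            // the startup program and created the variables once; the same scope
-            // is reused across run() calls, avoiding per-batch variable creation.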
_context->executor.RunPreparedContext(_context->prepare_context.get(), scope, - false, /* don't create local scope each time*/ - false /* don't create variable each time */); - - // For some other vector like containers not cleaned after each batch. - _context->tensor_array_batch_cleaner.CollectNoTensorVars(scope); - _context->tensor_array_batch_cleaner.ResetNoTensorVars(); - } catch (::paddle::platform::EnforceNotMet& err) { - VLOG(2) << err.what(); - return -1; - } - return 0; - } -protected: - struct Context { - Context(const ::paddle::platform::Place& place) : place(place), executor(place) { - } - const ::paddle::platform::Place& place; - ::paddle::framework::Executor executor; - ::std::unique_ptr<::paddle::framework::ProgramDesc> main_program; - ::std::unique_ptr<::paddle::framework::ProgramDesc> startup_program; - ::std::unique_ptr prepare_context; - details::TensorArrayBatchCleaner tensor_array_batch_cleaner; - }; - std::unique_ptr _context; -}; - -REGIST_CLASS(Executor, SimpleExecutor); - -} // namespace feed -} // namespace custom_trainer -} // namespace paddle diff --git a/paddle/fluid/train/custom_trainer/feed/executor/executor.h b/paddle/fluid/train/custom_trainer/feed/executor/executor.h deleted file mode 100644 index 15f73527..00000000 --- a/paddle/fluid/train/custom_trainer/feed/executor/executor.h +++ /dev/null @@ -1,35 +0,0 @@ -#pragma once -#include -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/train/custom_trainer/feed/common/registerer.h" -#include "paddle/fluid/train/custom_trainer/feed/trainer_context.h" - -namespace paddle { -namespace custom_trainer { -namespace feed { - -class Executor { -public: - Executor() {} - virtual ~Executor() {} - - // 初始化,包括进行训练网络&配置加载工作 - virtual int initialize(YAML::Node exe_config, - std::shared_ptr context_ptr) = 0; - - // 初始化scope, 后续反复执行训练,不再初始化 - virtual int initialize_scope(::paddle::framework::Scope* scope) = 0; - - // 执行训练 - virtual int run(::paddle::framework::Scope* scope) = 0; - - // cost time millisecond - virtual uint64_t epoch_cost() const { - return 0; - } -}; -REGIST_REGISTERER(Executor); - -} // namespace feed -} // namespace custom_trainer -} // namespace paddle diff --git a/paddle/fluid/train/custom_trainer/feed/executor/multi_thread_executor.cc b/paddle/fluid/train/custom_trainer/feed/executor/multi_thread_executor.cc deleted file mode 100755 index 1293fc14..00000000 --- a/paddle/fluid/train/custom_trainer/feed/executor/multi_thread_executor.cc +++ /dev/null @@ -1,258 +0,0 @@ -#include "paddle/fluid/platform/timer.h" -#include "paddle/fluid/train/custom_trainer/feed/io/file_system.h" -#include "paddle/fluid/train/custom_trainer/feed/monitor/monitor.h" -#include "paddle/fluid/train/custom_trainer/feed/executor/multi_thread_executor.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/program_desc.h" - -namespace paddle { -namespace custom_trainer { -namespace feed { - -std::once_flag MultiThreadExecutor::_async_delete_flag; -std::shared_ptr MultiThreadExecutor::_async_delete_thread; -paddle::framework::Channel MultiThreadExecutor::_delete_channel; - -int MultiThreadExecutor::initialize(YAML::Node exe_config, - std::shared_ptr context_ptr) { - int ret = 0; - _trainer_context = context_ptr.get(); - _train_data_name = exe_config["train_data_name"].as(); - _train_batch_size = exe_config["train_batch_size"].as(); - - // 暂未使用,后续各流考虑独立线程池,或设置流数据的优先级 - _input_parse_thread_num = exe_config["input_parse_thread_num"].as(); - _push_gradient_thread_num = 
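-    // A minimal YAML sketch of this executor node, limited to the keys read in
-    // this function (values illustrative, not defaults):
-    //   name: update
-    //   class: SimpleExecutor
-    //   train_data_name: feed_data
-    //   train_batch_size: 32
-    //   train_thread_num: 12
-    //   input_parse_thread_num: 10
-    //   push_gradient_thread_num: 10
-    //   need_dump_all_model: false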
exe_config["push_gradient_thread_num"].as(); - _train_thread_num = exe_config["train_thread_num"].as(); - - _need_dump_all_model = exe_config["need_dump_all_model"].as(); - CHECK(_train_thread_num > 0 && _train_batch_size > 0); - _thread_executors.resize(_train_thread_num); - auto e_class = exe_config["class"].as(); - _train_exe_name = exe_config["name"].as(); - if (exe_config["debug_layer_list"]) { - _debug_layer_list = exe_config["debug_layer_list"].as>(); - } - - omp_set_num_threads(_train_thread_num); - #pragma omp parallel for - for (int i = 0; i < _train_thread_num; ++i) { - auto* e_ptr = CREATE_INSTANCE(Executor, e_class); - _thread_executors[i].reset(e_ptr); - if (e_ptr->initialize(exe_config, context_ptr) != 0) { - VLOG(0) << "executor initialize failed, name:" << _train_exe_name - << " class:" << e_class; - ret = -1; - } - } - CHECK(ret == 0); - - // buffer - _scope_obj_pool.reset(new paddle::ps::ObjectPool<::paddle::framework::Scope>( - [this]() -> ::paddle::framework::Scope* { - auto* scope = new ::paddle::framework::Scope(); - _thread_executors[0]->initialize_scope(scope); - return scope; - }, _train_thread_num * 8, 0, _train_thread_num * 8)); - - // 模型网络加载 - std::string model_config_path = _trainer_context->file_system->path_join( - "./model", string::format_string("%s/model.yaml", _train_exe_name.c_str())); - CHECK(_trainer_context->file_system->exists(model_config_path)) - << "miss model config file:" << model_config_path; - _model_config = YAML::LoadFile(model_config_path); - _persistables.clear(); - for (const auto& accessor_config : _model_config["input_accessor"]) { - auto accessor_class = accessor_config["class"].as(); - auto* accessor_ptr = CREATE_INSTANCE(DataInputAccessor, accessor_class); - _input_accessors.emplace_back(accessor_ptr); - CHECK(accessor_ptr->initialize(accessor_config, context_ptr) == 0) - << "InputAccessor init Failed, class:" << accessor_class; - if (accessor_config["table_id"]) { - auto table_id = accessor_config["table_id"].as(); - if (_table_to_accessors.count(table_id) > 0) { - _table_to_accessors[table_id].push_back(accessor_ptr); - } else { - _table_to_accessors[table_id] = {accessor_ptr}; - } - } - CHECK(accessor_ptr->collect_persistables_name(_persistables) == 0) - << "collect_persistables Failed, class:" << accessor_class; - } - std::sort(_persistables.begin(), _persistables.end()); // 持久化变量名一定要排序 - - // Monitor组件 - for (const auto& monitor_config : _model_config["monitor"]) { - auto monitor_class = monitor_config["class"].as(); - auto* monitor_ptr = CREATE_INSTANCE(Monitor, monitor_class); - _monitors.emplace_back(monitor_ptr); - CHECK(monitor_ptr->initialize(monitor_config, context_ptr) == 0) - << "Monitor init Failed, class:" << monitor_class; - } - - // 异步删除池 - std::call_once(_async_delete_flag, [this](){ - _delete_channel = paddle::framework::MakeChannel(); - _delete_channel->SetBlockSize(32); - _async_delete_thread.reset(new std::thread([this]{ - std::vector ctxs; - while (true) { - while (_delete_channel->Read(ctxs)) { - for (auto* ctx : ctxs) { - delete ctx; - } - } - usleep(200000); // 200ms - } - })); - }); - return ret; -} - -int32_t MultiThreadExecutor::save_persistables(const std::string& file_path) { - auto fs = _trainer_context->file_system; - auto file_name = fs->path_split(file_path).second; - fs->remove(file_name); - auto scope_obj = _scope_obj_pool->get(); - for (size_t i = 0; i < _input_accessors.size(); ++i) { - _input_accessors[i]->collect_persistables(scope_obj.get()); - } - framework::ProgramDesc prog; - auto* block 
= prog.MutableBlock(0); - auto* op = block->AppendOp(); - op->SetType("save_combine"); - op->SetInput("X", _persistables); - op->SetAttr("file_path", file_name); - op->CheckAttrs(); - - platform::CPUPlace place; - framework::Executor exe(place); - exe.Run(prog, scope_obj.get(), 0, true, true); - // exe只能将模型产出在本地,这里通过cp方式兼容其他文件系统 - fs->copy(file_name, file_path); - return 0; -} - -paddle::framework::Channel MultiThreadExecutor::run( - paddle::framework::Channel input, const DataParser* parser) { - - uint64_t epoch_id = _trainer_context->epoch_accessor->current_epoch_id(); - auto* environment = _trainer_context->environment.get(); - // 输入流 - PipelineOptions input_pipe_option; - input_pipe_option.need_hold_input_data = true; - input_pipe_option.batch_size = 1; - input_pipe_option.input_output_rate = _train_batch_size; - input_pipe_option.buffer_batch_count = _train_thread_num; - auto input_pipe = std::make_shared>(); - input_pipe->initialize(input_pipe_option, input, - [this, parser](DataItem* item, size_t item_num, - ScopePoolObj* scope, size_t* scope_num, size_t thread_idx) -> int { - *scope_num = 1; - paddle::platform::Timer timer; - timer.Start(); - auto scope_obj = _scope_obj_pool->get(); - auto* scope_context = new ScopeExecutorContext(item_num); - auto* samples = scope_context->samples(); - for (size_t i = 0; i parse_to_sample(item[i], samples[i]) == 0); - } - for (size_t i = 0; i < _input_accessors.size(); ++i) { - _input_accessors[i]->forward(samples, item_num, scope_obj.get()); - } - timer.Pause(); - scope_context->prepare_cost_ms = timer.ElapsedMS(); - int64_t data_for_scope = (int64_t)scope_context; - ScopeHelper::fill_value(scope_obj.get(), _trainer_context->cpu_place, - "scope_context", data_for_scope); - *scope = std::move(scope_obj); - return 0; - }); - - // 训练流 - PipelineOptions train_pipe_option; - train_pipe_option.input_output_rate = 1; - train_pipe_option.buffer_batch_count = _train_thread_num; - auto train_pipe = std::make_shared>(); - train_pipe->connect_to(*input_pipe, train_pipe_option, - [this] (ScopePoolObj* in_items, size_t in_num, - ScopePoolObj* out_items, size_t* out_num, size_t thread_idx) -> int { - auto* executor = _thread_executors[thread_idx].get(); - size_t& out_idx = *out_num; - for (out_idx = 0; out_idx < in_num; ++out_idx) { - auto* scope = in_items[out_idx].get(); - auto* scope_ctx = (ScopeExecutorContext*)(*ScopeHelper::get_value( - scope, _trainer_context->cpu_place, "scope_context")); - paddle::platform::Timer timer; - timer.Start(); - CHECK(executor->run(scope) == 0); - timer.Pause(); - scope_ctx->executor_cost_ms = timer.ElapsedMS(); - out_items[out_idx] = std::move(in_items[out_idx]); - } - return 0; - }); - - // 梯度回传流 - PipelineOptions gradient_pipe_option; - gradient_pipe_option.input_output_rate = 1; - gradient_pipe_option.buffer_batch_count = _train_thread_num; - auto gradient_pipe = std::make_shared>(); - gradient_pipe->connect_to(*train_pipe, gradient_pipe_option, - [epoch_id, this] (ScopePoolObj* in_items, size_t in_num, - int* out_items, size_t* out_num, size_t thread_idx) -> int { - size_t& out_idx = *out_num; - for (out_idx = 0; out_idx < in_num; ++out_idx) { - paddle::platform::Timer timer; - timer.Start(); - auto* scope = in_items[out_idx].get(); - auto* scope_ctx = (ScopeExecutorContext*)(*ScopeHelper::get_value( - scope, _trainer_context->cpu_place, "scope_context")); - auto* samples = scope_ctx->samples(); - auto sample_num = scope_ctx->sample_num(); - - out_items[out_idx] = 0; - 
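-            // Third stage of the run() pipeline (parse -> train -> push
-            // gradient). backward() returns one future per accessor; they are
-            // parked in wait_status and awaited in ~ScopeExecutorContext, so a
-            // scope is recycled only after all of its pushes have finished.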
scope_ctx->wait_status.resize(_input_accessors.size()); - for (size_t i = 0; i < _input_accessors.size(); ++i) { - scope_ctx->wait_status[i] = _input_accessors[i]->backward(samples, sample_num, scope); - } - timer.Pause(); - scope_ctx->push_gradient_cost_ms = timer.ElapsedMS(); - - // Monitor && Debug - for (auto& monitor : _monitors) { - monitor->add_data(epoch_id, this, scope_ctx); - } - if (_debug_layer_list.size() > 0) { - for (auto& layer_name : _debug_layer_list) { - VLOG(2) << "[Debug][Layer]" << ScopeHelper::to_string(scope, layer_name); - } - } - // 所有pipe完成后,再异步回收sample - _delete_channel->Put(scope_ctx); - } - return 0; - }); - - // 等待训练流结束 - std::vector gradient_status; - while (gradient_pipe->read(gradient_status) > 0) { - } - - // 输出相关监控&统计项 - for (auto& monitor : _monitors) { - if (monitor->need_compute_result(epoch_id)) { - monitor->compute_result(); - ENVLOG_WORKER_MASTER_NOTICE("[Monitor]%s, monitor:%s, result:%s", - _train_exe_name.c_str(), monitor->get_name().c_str(), monitor->format_result().c_str()); - _trainer_context->monitor_ssm << _train_exe_name << ":" << - monitor->get_name() << ":" << monitor->format_result() << ","; - monitor->reset(); - } - } - return input_pipe->backup_channel(); -} - -} // namespace feed -} // namespace custom_trainer -} // namespace paddle diff --git a/paddle/fluid/train/custom_trainer/feed/executor/multi_thread_executor.h b/paddle/fluid/train/custom_trainer/feed/executor/multi_thread_executor.h deleted file mode 100644 index 2db7d09f..00000000 --- a/paddle/fluid/train/custom_trainer/feed/executor/multi_thread_executor.h +++ /dev/null @@ -1,103 +0,0 @@ -#pragma once -#include -#include -#include "paddle/fluid/framework/channel.h" -#include "paddle/fluid/train/custom_trainer/feed/executor/executor.h" -#include "paddle/fluid/train/custom_trainer/feed/accessor/input_data_accessor.h" - -namespace paddle { -namespace custom_trainer { -namespace feed { - -class Monitor; -typedef paddle::ps::ObjectPool<::paddle::framework::Scope>::PooledObject ScopePoolObj; - -class ScopeExecutorContext { -public: - ScopeExecutorContext(size_t sample_num) { - _samples = new SampleInstance[sample_num]; - _sample_num = sample_num; - } - virtual ~ScopeExecutorContext() { - for (auto& status : wait_status) { - if (!status.valid()) { - continue; - } - status.wait(); - } - delete[] _samples; - } - inline SampleInstance* samples() { - return _samples; - } - inline size_t sample_num() { - return _sample_num; - } - size_t executor_cost_ms = 0; - size_t prepare_cost_ms = 0; - size_t push_gradient_cost_ms = 0; - std::vector> wait_status; -private: - size_t _sample_num = 0; - SampleInstance* _samples = NULL; -}; - -class MultiThreadExecutor { -public: - MultiThreadExecutor() {} - virtual ~MultiThreadExecutor() {} - - //初始化,包括进行训练网络&配置加载工作 - virtual int initialize(YAML::Node exe_config, - std::shared_ptr context_ptr); - - //执行训练 - virtual paddle::framework::Channel run( - paddle::framework::Channel input, const DataParser* parser); - - virtual int32_t save_persistables(const std::string& filename); - - virtual bool is_dump_all_model() { - return _need_dump_all_model; - } - virtual const std::string& train_exe_name() { - return _train_exe_name; - } - virtual const std::string& train_data_name() { - return _train_data_name; - } - virtual const std::map>& table_accessors() { - return _table_to_accessors; - } - virtual ScopePoolObj fetch_scope() { - ScopePoolObj scope_obj(_scope_obj_pool->get()); - return scope_obj; - } -protected: - std::string _train_data_name; - size_t 
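-    // The in-class values below are fall-back defaults only; initialize()
-    // unconditionally re-reads each one from the executor's YAML node, so the
-    // same-named YAML keys are effectively mandatory.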
_train_batch_size = 32; - size_t _train_thread_num = 12; - size_t _input_parse_thread_num = 10; - size_t _push_gradient_thread_num = 10; - bool _need_dump_all_model = false; - - YAML::Node _model_config; - std::string _train_exe_name; - TrainerContext* _trainer_context = nullptr; - std::vector _debug_layer_list; - std::vector> _monitors; - std::vector> _thread_executors; - std::vector> _input_accessors; - std::map> _table_to_accessors; - std::shared_ptr> _scope_obj_pool; - std::vector _persistables; - - // 异步删除 - static std::once_flag _async_delete_flag; - static std::shared_ptr _async_delete_thread; - static paddle::framework::Channel _delete_channel; -}; - -} // namespace feed -} // namespace custom_trainer -} // namespace paddle diff --git a/paddle/fluid/train/custom_trainer/feed/io/auto_file_system.cc b/paddle/fluid/train/custom_trainer/feed/io/auto_file_system.cc deleted file mode 100644 index 48476a14..00000000 --- a/paddle/fluid/train/custom_trainer/feed/io/auto_file_system.cc +++ /dev/null @@ -1,96 +0,0 @@ -#include "paddle/fluid/train/custom_trainer/feed/io/file_system.h" - -#include -#include - -#include "paddle/fluid/train/custom_trainer/feed/io/shell.h" -#include "paddle/fluid/string/string_helper.h" -#include "glog/logging.h" - -namespace paddle { -namespace custom_trainer { -namespace feed { - -class AutoFileSystem : public FileSystem { -public: - int initialize(const YAML::Node& config, std::shared_ptr context) override { - _file_system.clear(); - if (config && config["file_systems"] && config["file_systems"].Type() == YAML::NodeType::Map) { - for (auto& prefix_fs: config["file_systems"]) { - std::unique_ptr fs(CREATE_INSTANCE(FileSystem, prefix_fs.second["class"].as(""))); - if (fs == nullptr) { - LOG(FATAL) << "fail to create class: " << prefix_fs.second["class"].as(""); - return -1; - } - if (fs->initialize(prefix_fs.second, context) != 0) { - LOG(FATAL) << "fail to initialize class: " << prefix_fs.second["class"].as(""); - return -1; - } - _file_system.emplace(prefix_fs.first.as(""), std::move(fs)); - } - } - if (_file_system.find("default") == _file_system.end()) { - LOG(WARNING) << "miss default file_system, use LocalFileSystem as default"; - std::unique_ptr fs(CREATE_INSTANCE(FileSystem, "LocalFileSystem")); - if (fs == nullptr || fs->initialize(YAML::Load(""), context) != 0) { - return -1; - } - _file_system.emplace("default", std::move(fs)); - } - return 0; - } - - std::shared_ptr open_read(const std::string& path, const std::string& converter) - override { - return get_file_system(path)->open_read(path, converter); - } - - std::shared_ptr open_write(const std::string& path, const std::string& converter) - override { - return get_file_system(path)->open_write(path, converter); - } - - int64_t file_size(const std::string& path) override { - return get_file_system(path)->file_size(path); - } - - void remove(const std::string& path) override { - get_file_system(path)->remove(path); - } - - std::vector list(const std::string& path) override { - return get_file_system(path)->list(path); - } - - std::string tail(const std::string& path, size_t tail_num = 1) override { - return get_file_system(path)->tail(path, tail_num); - } - - bool exists(const std::string& path) override { - return get_file_system(path)->exists(path); - } - - void mkdir(const std::string& path) override { - get_file_system(path)->mkdir(path); - } - - FileSystem* get_file_system(const std::string& path) { - auto pos = path.find_first_of(":"); - if (pos != std::string::npos) { - auto substr = 
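-        // Routing is by path prefix: for "afs:/user/feed/data" the text before
-        // the first ':' is "afs", so the file_system registered under "afs"
-        // handles the call; any path without a recognized prefix falls through
-        // to the "default" entry (a LocalFileSystem unless configured otherwise).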
path.substr(0, pos); // example: afs:/xxx -> afs - auto fs_it = _file_system.find(substr); - if (fs_it != _file_system.end()) { - return fs_it->second.get(); - } - } - return _file_system["default"].get(); - } - -private: - std::unordered_map> _file_system; -}; -REGIST_CLASS(FileSystem, AutoFileSystem); - -} // namespace feed -} // namespace custom_trainer -} // namespace paddle diff --git a/paddle/fluid/train/custom_trainer/feed/io/file_system.cc b/paddle/fluid/train/custom_trainer/feed/io/file_system.cc deleted file mode 100644 index 36afa163..00000000 --- a/paddle/fluid/train/custom_trainer/feed/io/file_system.cc +++ /dev/null @@ -1,78 +0,0 @@ -#include "paddle/fluid/train/custom_trainer/feed/io/file_system.h" -#include - -namespace paddle { -namespace custom_trainer { -namespace feed { - -std::string FileSystem::path_join(const std::string& dir, const std::string& path) { - if (dir.empty()) { - return path; - } - if (dir.back() == '/') { - return dir + path; - } - return dir + '/' + path; -} - -std::pair FileSystem::path_split(const std::string& path) { - size_t pos = path.find_last_of('/'); - if (pos == std::string::npos) { - return {".", path}; - } - return {path.substr(0, pos), path.substr(pos + 1)}; -} - -int FileSystem::copy(const std::string& ori_path, const std::string& dest_path) { - if (!exists(ori_path)) { - return -1; - } - remove(dest_path); - auto ori_file = open_read(ori_path, ""); - auto dest_file = open_write(dest_path, ""); - size_t read_buffer_size = 102400; // 100kb - char* buffer = new char[read_buffer_size]; - while (true) { - size_t read_size = fread(buffer, 1, read_buffer_size, ori_file.get()); - CHECK(ferror(ori_file.get()) == 0) << " File read Failed:" << ori_path; - if (read_size > 0) { - fwrite(buffer, 1, read_size, dest_file.get()); - } - // read done - if (read_size < read_buffer_size) { - break; - } - } - delete[] buffer; - return 0; -} - -int FileSystem::append_line(const std::string& path, - const std::string& line, size_t reserve_line_num) { - std::string tail_data; - if (exists(path)) { - tail_data = paddle::string::trim_spaces(tail(path, reserve_line_num)); - } - if (tail_data.size() > 0) { - tail_data = tail_data + "\n" + line; - } else { - tail_data = line; - } - VLOG(2) << "Append to file:" << path << ", line str:" << line; - while (true) { - remove(path); - { - auto fp = open_write(path, ""); - if (fwrite(tail_data.c_str(), tail_data.length(), 1, &*fp) == 1) { - break; - } - } - sleep(10); - VLOG(0) << "Retry Append to file:" << path << ", line str:" << line; - } - return 0; -} - -} // namespace feed -} // namespace custom_trainer -} // namespace paddle diff --git a/paddle/fluid/train/custom_trainer/feed/io/file_system.h b/paddle/fluid/train/custom_trainer/feed/io/file_system.h deleted file mode 100644 index 9b981249..00000000 --- a/paddle/fluid/train/custom_trainer/feed/io/file_system.h +++ /dev/null @@ -1,42 +0,0 @@ -#pragma once - -#include -#include -#include -#include "paddle/fluid/train/custom_trainer/feed/common/registerer.h" -#include "paddle/fluid/train/custom_trainer/feed/trainer_context.h" -#include - -namespace paddle { -namespace custom_trainer { -namespace feed { - -class FileSystem { -public: - FileSystem() {} - virtual ~FileSystem() {} - virtual int initialize(const YAML::Node& config, std::shared_ptr context) = 0; - virtual std::shared_ptr open_read(const std::string& path, const std::string& converter) = 0; - virtual std::shared_ptr open_write(const std::string& path, const std::string& converter) = 0; - // only support text-file 
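-    // append_line() is a read-modify-write: it keeps the last reserve_line_num
-    // lines via tail(), appends the new line, then rewrites the whole file.
-    // That works on stores without a native append (e.g. the HDFS wrapper), at
-    // the cost of one full rewrite per call.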
- virtual int append_line(const std::string& path, const std::string& line, size_t reserve_line_num); - virtual int64_t file_size(const std::string& path) = 0; - virtual int copy(const std::string& ori_path, const std::string& dest_path); - virtual void remove(const std::string& path) = 0; - virtual std::vector list(const std::string& path) = 0; - virtual std::string tail(const std::string& path, size_t tail_num = 1) = 0; - virtual bool exists(const std::string& path) = 0; - virtual void mkdir(const std::string& path) = 0; - virtual std::string path_join(const std::string& dir, const std::string& path); - template - std::string path_join(const std::string& dir, const std::string& path, const STRS&... paths) { - return path_join(path_join(dir, path), paths...); - } - virtual std::pair path_split(const std::string& path); -protected: -}; -REGIST_REGISTERER(FileSystem); - -} // namespace feed -} // namespace custom_trainer -} // namespace paddle diff --git a/paddle/fluid/train/custom_trainer/feed/io/hadoop_file_system.cc b/paddle/fluid/train/custom_trainer/feed/io/hadoop_file_system.cc deleted file mode 100644 index 6ef87d7e..00000000 --- a/paddle/fluid/train/custom_trainer/feed/io/hadoop_file_system.cc +++ /dev/null @@ -1,197 +0,0 @@ -#include "paddle/fluid/train/custom_trainer/feed/io/file_system.h" - -#include -#include -#include - -#include "paddle/fluid/train/custom_trainer/feed/io/shell.h" -#include "paddle/fluid/string/string_helper.h" -#include "paddle/fluid/string/piece.h" -#include "glog/logging.h" - -namespace paddle { -namespace custom_trainer { -namespace feed { - -class HadoopFileSystem : public FileSystem { -public: - int initialize(const YAML::Node& config, std::shared_ptr context) override { - _buffer_size = config["buffer_size"].as(0); - _hdfs_command = config["hdfs_command"].as("hadoop fs"); - _ugi.clear(); - if (config["ugis"] && config["ugis"].Type() == YAML::NodeType::Map) { - for (const auto& prefix_ugi : config["ugis"]) { - _ugi.emplace(prefix_ugi.first.as(), prefix_ugi.second.as()); - } - } - if (_ugi.find("default") == _ugi.end()) { - LOG(FATAL) << "fail to load default ugi"; - return -1; - } - return 0; - } - - std::shared_ptr open_read(const std::string& path, const std::string& converter) - override { - int err_no = 0; - std::string cmd; - if (string::end_with(path, ".gz")) { - cmd = string::format_string( - "%s -text \"%s\"", hdfs_command(path).c_str(), path.c_str()); - } else { - cmd = string::format_string("%s -cat \"%s\"", hdfs_command(path).c_str(), path.c_str()); - } - - bool is_pipe = true; - shell_add_read_converter(cmd, is_pipe, converter); - return shell_open(cmd, is_pipe, "r", _buffer_size, &err_no); - } - - std::shared_ptr open_write(const std::string& path, const std::string& converter) - override { - int err_no = 0; - std::string cmd = - string::format_string("%s -put - \"%s\"", hdfs_command(path).c_str(), path.c_str()); - bool is_pipe = true; - - if (string::end_with(path, ".gz\"")) { - shell_add_write_converter(cmd, is_pipe, "gzip"); - } - - shell_add_write_converter(cmd, is_pipe, converter); - return shell_open(cmd, is_pipe, "w", _buffer_size, &err_no); - } - - int64_t file_size(const std::string& path) override { - LOG(FATAL) << "not support"; - return 0; - } - - void remove(const std::string& path) override { - if (path == "") { - return; - } - - shell_execute(string::format_string( - "%s -rmr %s &>/dev/null; true", hdfs_command(path).c_str(), path.c_str())); - } - - std::vector list(const std::string& path) override { - if (path == "") { - 
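-        // Every method in this class shells out to the hdfs CLI with a
-        // per-prefix ugi; e.g. open_read() of a ".gz" path becomes roughly
-        //   hadoop fs -Dhadoop.job.ugi="<ugi>" -text "<path>"
-        // read through a pipe, and list() below parses `-ls` output, keeping
-        // line[7], the path column, prefixed with the filesystem prefix.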
return {}; - } - auto paths = split_path(path); - - int err_no = 0; - std::vector list; - do { - err_no = 0; - std::shared_ptr pipe; - pipe = shell_popen( - string::format_string( - "%s -ls %s | ( grep ^- ; [ $? != 2 ] )", - hdfs_command(path).c_str(), - path.c_str()), - "r", - &err_no); - string::LineFileReader reader; - list.clear(); - - while (reader.getline(&*pipe)) { - std::vector line = string::split_string(reader.get()); - if (line.size() != 8) { - continue; - } - list.push_back(get_prefix(paths) + line[7]); - } - } while (err_no == -1); - return list; - } - - std::string tail(const std::string& path, size_t tail_num = 1) override { - if (path == "") { - return ""; - } - - return shell_get_command_output(string::format_string( - "%s -text %s | tail -%u", hdfs_command(path).c_str(), path.c_str(), tail_num)); - } - - bool exists(const std::string& path) override { - std::string test = shell_get_command_output(string::format_string( - "%s -test -e %s ; echo $?", hdfs_command(path).c_str(), path.c_str())); - - if (string::trim_spaces(test) == "0") { - return true; - } - - return false; - } - - void mkdir(const std::string& path) override { - if (path == "") { - return; - } - - shell_execute(string::format_string( - "%s -mkdir %s; true", hdfs_command(path).c_str(), path.c_str())); - } - - std::string hdfs_command(const std::string& path) { - auto paths = split_path(path); - auto it = _ugi.find(std::get<1>(paths).ToString()); - if (it != _ugi.end()) { - return hdfs_command_with_ugi(it->second); - } - VLOG(5) << "path: " << path << ", select default ugi"; - return hdfs_command_with_ugi(_ugi["default"]); - } - - std::string hdfs_command_with_ugi(std::string ugi) { - return string::format_string( - "%s -Dhadoop.job.ugi=\"%s\"", _hdfs_command.c_str(), ugi.c_str()); - } - -private: - std::string get_prefix(const std::tuple& paths) { - if (std::get<1>(paths).len() == 0) { - return std::get<0>(paths).ToString(); - } - return std::get<0>(paths).ToString() + "//" + std::get<1>(paths).ToString(); - } - - // parse "xxx://abc.def:8756/user" as "xxx:", "abc.def:8756", "/user" - // parse "xxx:/user" as "xxx:", "", "/user" - // parse "xxx://abc.def:8756" as "xxx:", "abc.def:8756", "" - // parse "other" as "", "", "other" - std::tuple split_path(string::Piece path) { - std::tuple result{string::SubStr(path, 0, 0), string::SubStr(path, 0, 0), path}; - auto fs_pos = string::Find(path, ':', 0) + 1; - if (path.len() > fs_pos) { - std::get<0>(result) = string::SubStr(path, 0, fs_pos); - path = string::SkipPrefix(path, fs_pos); - if (string::HasPrefix(path, "//")) { - path = string::SkipPrefix(path, 2); - auto end_pos = string::Find(path, '/', 0); - if (end_pos != string::Piece::npos) { - std::get<1>(result) = string::SubStr(path, 0, end_pos); - std::get<2>(result) = string::SkipPrefix(path, end_pos); - } else { - std::get<1>(result) = path; - } - } else { - std::get<2>(result) = path; - } - } - return result; - } - - size_t _buffer_size = 0; - std::string _hdfs_command; - std::unordered_map _ugi; -}; -REGIST_CLASS(FileSystem, HadoopFileSystem); - -} // namespace feed -} // namespace custom_trainer -} // namespace paddle diff --git a/paddle/fluid/train/custom_trainer/feed/io/local_file_system.cc b/paddle/fluid/train/custom_trainer/feed/io/local_file_system.cc deleted file mode 100644 index 0b5e5cce..00000000 --- a/paddle/fluid/train/custom_trainer/feed/io/local_file_system.cc +++ /dev/null @@ -1,122 +0,0 @@ -#include "paddle/fluid/train/custom_trainer/feed/io/file_system.h" - -#include - -#include 
"paddle/fluid/train/custom_trainer/feed/io/shell.h" -#include "paddle/fluid/string/string_helper.h" -#include "glog/logging.h" - -namespace paddle { -namespace custom_trainer { -namespace feed { - -class LocalFileSystem : public FileSystem { -public: - int initialize(const YAML::Node& config, std::shared_ptr context) override { - _buffer_size = config["buffer_size"].as(0); - return 0; - } - - std::shared_ptr open_read(const std::string& path, const std::string& converter) override { - std::string cmd = path; - bool is_pipe = false; - if (string::end_with(path, ".gz")) { - shell_add_read_converter(cmd, is_pipe, "zcat"); - } - - shell_add_read_converter(cmd, is_pipe, converter); - return shell_open(cmd, is_pipe, "r", _buffer_size); - } - - std::shared_ptr open_write(const std::string& path, const std::string& converter) override { - std::string cmd = path; - - shell_execute(string::format_string("mkdir -p $(dirname \"%s\")", path.c_str())); - - bool is_pipe = false; - - if (string::end_with(path, ".gz")) { - shell_add_write_converter(cmd, is_pipe, "gzip"); - } - - shell_add_write_converter(cmd, is_pipe, converter); - return shell_open(cmd, is_pipe, "w", _buffer_size); - } - - int64_t file_size(const std::string& path) override { - struct stat buf; - if (0 != stat(path.c_str(), &buf)) { - LOG(FATAL) << "file stat not zero"; - return -1; - } - return (int64_t)buf.st_size; - } - - void remove(const std::string& path) override { - if (path == "") { - return; - } - - shell_execute(string::format_string("rm -rf %s", path.c_str())); - } - - std::vector list(const std::string& path) override { - if (path == "") { - return {}; - } - int err_no; - std::shared_ptr pipe; - pipe = shell_popen( - string::format_string("find %s -maxdepth 1 -type f", path.c_str()), "r", &err_no); - string::LineFileReader reader; - std::vector list; - - while (reader.getline(&*pipe)) { - list.push_back(reader.get()); - } - - return list; - } - - std::string tail(const std::string& path, size_t tail_num = 1) override { - if (path == "") { - return ""; - } - - return shell_get_command_output(string::format_string("tail -%u %s ", tail_num, path.c_str())); - } - - bool exists(const std::string& path) override { - std::string test_f = shell_get_command_output( - string::format_string("[ -f %s ] ; echo $?", path.c_str())); - - if (string::trim_spaces(test_f) == "0") { - return true; - } - - std::string test_d = shell_get_command_output( - string::format_string("[ -d %s ] ; echo $?", path.c_str())); - - if (string::trim_spaces(test_d) == "0") { - return true; - } - - return false; - } - - void mkdir(const std::string& path) override { - if (path == "") { - return; - } - - shell_execute(string::format_string("mkdir -p %s", path.c_str())); - } - -private: - size_t _buffer_size = 0; -}; -REGIST_CLASS(FileSystem, LocalFileSystem); - -} // namespace feed -} // namespace custom_trainer -} // namespace paddle diff --git a/paddle/fluid/train/custom_trainer/feed/io/shell.cc b/paddle/fluid/train/custom_trainer/feed/io/shell.cc deleted file mode 100644 index 3fc87085..00000000 --- a/paddle/fluid/train/custom_trainer/feed/io/shell.cc +++ /dev/null @@ -1,367 +0,0 @@ -#include "paddle/fluid/train/custom_trainer/feed/io/shell.h" - -namespace paddle { -namespace custom_trainer { -namespace feed { - -void shell_add_write_converter(std::string& path, bool& is_pipe, // NOLINT - const std::string& converter) { - if (converter == "") { - return; - } - - if (!is_pipe) { - path = string::format_string("( %s ) > \"%s\"", converter.c_str(), path.c_str()); 
- is_pipe = true; - } else { - path = string::format_string("%s | %s", converter.c_str(), path.c_str()); - } -} - -void shell_add_read_converter(std::string& path, bool& is_pipe, const std::string& converter) { - if (converter == "") { - return; - } - - if (!is_pipe) { - path = string::format_string("( %s ) < \"%s\"", converter.c_str(), path.c_str()); - is_pipe = true; - } else { - path = string::format_string("%s | %s", path.c_str(), converter.c_str()); - } -} - -std::shared_ptr shell_open( - const std::string& path, - bool is_pipe, - const std::string& mode, - size_t buffer_size, - int* err_no) { - std::shared_ptr fp = nullptr; - - if (!is_pipe) { - fp = shell_fopen(path, mode); - } else { - fp = shell_popen(path, mode, err_no); - } - - if (buffer_size > 0) { - char* buffer = new char[buffer_size]; - CHECK_EQ(0, setvbuf(&*fp, buffer, _IOFBF, buffer_size)); - fp = {&*fp, [fp, buffer](FILE*) mutable { // NOLINT - CHECK(fp.unique()); // NOLINT - fp = nullptr; - delete[] buffer; - }}; - } - - return fp; -} - -std::shared_ptr shell_fopen(const std::string& path, const std::string& mode) { -#if defined _WIN32 || defined __APPLE__ - return nullptr; -#else - if (shell_verbose()) { - LOG(INFO) << "Opening file[" << path << "] with mode[" << mode << "]"; - } - FILE* fp; - if (!(fp = fopen(path.c_str(), mode.c_str()))) { - LOG(FATAL) << "fopen fail, path[" << path << "], mode[" << mode << "]"; - } - return {fp, [path](FILE* fp) { - if (shell_verbose()) { - LOG(INFO) << "Closing file[" << path << "]"; - } - if (0 != fclose(fp)) { - LOG(FATAL) << "fclose fail, path[" << path << "]"; - } - }}; -#endif -} - -// Close all open file descriptors -// The implementation is async signal safe -// Mostly copy from CPython code -static int close_open_fds_internal() { -#if defined _WIN32 || defined __APPLE__ - return 0; -#else - struct linux_dirent { - long d_ino = 0; // NOLINT - off_t d_off; - unsigned short d_reclen = 0; // NOLINT - char d_name[256]; - }; - - int dir_fd = -1; - if ((dir_fd = open("/proc/self/fd", O_RDONLY)) < 0) { - LOG(FATAL) << "proc/self/fd open fail"; - return -1; - } - char buffer[sizeof(linux_dirent)]; - - for (;;) { - int bytes = 0; - if ((bytes = - syscall(SYS_getdents, - dir_fd, - reinterpret_cast(buffer), - sizeof(buffer))) < 0) { - LOG(FATAL) << "syscall fail"; - return -1; - } - - if (bytes == 0) { - break; - } - - linux_dirent* entry = NULL; - - for (int offset = 0; offset < bytes; offset += entry->d_reclen) { - entry = reinterpret_cast(buffer + offset); - int fd = 0; - const char* s = entry->d_name; - - while (*s >= '0' && *s <= '9') { - fd = fd * 10 + (*s - '0'); - s++; - } - - if (s != entry->d_name && fd != dir_fd && fd >= 3) { - close(fd); - } - } - } - - close(dir_fd); - return 0; -#endif -} - -static int shell_popen_fork_internal( - const char* real_cmd, - bool do_read, - int parent_end, - int child_end) { -#if defined _WIN32 || defined __APPLE__ - return 0; -#else - int child_pid = -1; - // Too frequent calls to fork() makes openmpi very slow. Use vfork() instead. - // But vfork() is very dangerous. Be careful. - if ((child_pid = vfork()) < 0) { - return -1; - } - - // The following code is async signal safe (No memory allocation, no access to - // global data, etc.) - if (child_pid != 0) { - return child_pid; - } - - int child_std_end = do_read ? 
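-    // The child wires stdout (fd 1) to the pipe when the parent reads, or
-    // stdin (fd 0) when the parent writes. vfork() is used purely for speed,
-    // so everything up to execl() must stay async-signal-safe: no allocation,
-    // no stdio, only the raw close/dup2 calls below.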
1 : 0; - close(parent_end); - - if (child_end != child_std_end) { - if (dup2(child_end, child_std_end) != child_std_end) { - exit(127); - } - close(child_end); - } - - close_open_fds_internal(); - if (execl("/bin/bash", "bash", "-c", real_cmd, NULL) < 0) { - exit(127); - } - exit(127); -#endif -} - -std::shared_ptr shell_popen(const std::string& cmd, const std::string& mode, int* err_no) { -#if defined _WIN32 || defined __APPLE__ - return nullptr; -#else - bool do_read = mode == "r"; - bool do_write = mode == "w"; - if (!(do_read || do_write)) { - *err_no = -1; - return NULL; - } - - if (shell_verbose()) { - LOG(INFO) << "Opening pipe[" << cmd << "] with mode[" << mode << "]"; - } - - std::string real_cmd = "set -o pipefail; " + cmd; - - int pipe_fds[2]; - if (pipe(pipe_fds) != 0) { - *err_no = -1; - return NULL; - } - int parent_end = 0; - int child_end = 0; - - if (do_read) { - parent_end = pipe_fds[0]; - child_end = pipe_fds[1]; - } else if (do_write) { - parent_end = pipe_fds[1]; - child_end = pipe_fds[0]; - } - - int child_pid = shell_popen_fork_internal(real_cmd.c_str(), do_read, parent_end, child_end); - close(child_end); - fcntl(parent_end, F_SETFD, FD_CLOEXEC); - FILE* fp; - if ((fp = fdopen(parent_end, mode.c_str())) == NULL) { - *err_no = -1; - return NULL; - } - return {fp, [child_pid, cmd, err_no](FILE* fp) { - if (shell_verbose()) { - LOG(INFO) << "Closing pipe[" << cmd << "]"; - } - - if (fclose(fp) != 0) { - *err_no = -1; - } - int wstatus = -1; - waitpid(child_pid, &wstatus, 0); - if (wstatus == 0 || wstatus == (128 + SIGPIPE) * 256 || - (wstatus == -1 && errno == ECHILD)) { - } else { - *err_no = -1; - LOG(WARNING) << "status[" << wstatus << "], cmd[" << cmd << "]" - << ", err_no[" << *err_no << "]"; - } - if (wstatus == -1 && errno == ECHILD) { - LOG(WARNING) << "errno is ECHILD"; - } - }}; -#endif -} - -static int shell_p2open_fork_internal(const char* real_cmd, int pipein_fds[2], int pipeout_fds[2]) { -#if defined _WIN32 || defined __APPLE__ - return 0; -#else - int child_pid = -1; - if ((child_pid = fork()) < 0) { - return -1; - } - - if (child_pid != 0) { - return child_pid; - } - - close(pipein_fds[0]); - close(pipeout_fds[1]); - - if (pipein_fds[1] != 1) { - if (dup2(pipein_fds[1], 1) != 1) { - return -1; - } - close(pipein_fds[1]); - } - - if (pipeout_fds[0] != 0) { - if (dup2(pipeout_fds[0], 0) != 0) { - return -1; - } - close(pipeout_fds[0]); - } - - close_open_fds_internal(); - if (execl("/bin/sh", "sh", "-c", real_cmd, NULL) < 0) { - return -1; - } - exit(127); -#endif -} - -std::pair, std::shared_ptr> shell_p2open(const std::string& cmd) { -#if defined _WIN32 || defined __APPLE__ - return {}; -#else - if (shell_verbose()) { - LOG(INFO) << "Opening bidirectional pipe[" << cmd << "]"; - } - - std::string real_cmd = "set -o pipefail; " + cmd; - - int pipein_fds[2]; - int pipeout_fds[2]; - if (pipe(pipein_fds) != 0) { - return {NULL, NULL}; - } - if (pipe(pipeout_fds) != 0) { - return {NULL, NULL}; - } - - int child_pid = shell_p2open_fork_internal(real_cmd.c_str(), pipein_fds, pipeout_fds); - - close(pipein_fds[1]); - close(pipeout_fds[0]); - fcntl(pipein_fds[0], F_SETFD, FD_CLOEXEC); - fcntl(pipeout_fds[1], F_SETFD, FD_CLOEXEC); - - std::shared_ptr child_life = { - NULL, [child_pid, cmd](void*) { - if (shell_verbose()) { - LOG(INFO) << "Closing bidirectional pipe[" << cmd << "]"; - } - - int wstatus, ret; - - do { - PCHECK((ret = waitpid(child_pid, &wstatus, 0)) >= 0 || - (ret == -1 && errno == EINTR)); - } while (ret == -1 && errno == EINTR); - - 
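-            // The bash child exits with code 128 + SIGPIPE when its pipeline is
-            // torn down early; waitpid() reports that exit code shifted left 8
-            // bits, hence the (128 + SIGPIPE) * 256 value accepted below.
-            // ECHILD is tolerated in case the child was already reaped.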
PCHECK(wstatus == 0 || wstatus == (128 + SIGPIPE) * 256 || - (wstatus == -1 && errno == ECHILD)) - << "status[" << wstatus << "], cmd[" << cmd << "]"; - - if (wstatus == -1 && errno == ECHILD) { - LOG(WARNING) << "errno is ECHILD"; - } - }}; - - FILE* in_fp; - PCHECK((in_fp = fdopen(pipein_fds[0], "r")) != NULL); - FILE* out_fp; - PCHECK((out_fp = fdopen(pipeout_fds[1], "w")) != NULL); - return {{in_fp, [child_life](FILE* fp) { PCHECK(fclose(fp) == 0); }}, - {out_fp, [child_life](FILE* fp) { PCHECK(fclose(fp) == 0); }}}; -#endif -} - -std::string shell_get_command_output(const std::string& cmd) { -#if defined _WIN32 || defined __APPLE__ - return ""; -#else - int err_no = 0; - do { - if (err_no == -1) { - sleep(10); - } - err_no = 0; - std::shared_ptr pipe = shell_popen(cmd, "r", &err_no); - string::LineFileReader reader; - if (reader.getdelim(&*pipe, 0)) { - pipe = nullptr; - if (err_no == 0) { - return reader.get(); - } - } - VLOG(2) << "run shell cmd:" << cmd << ", errno:" << err_no; - } while (err_no == -1); - return ""; -#endif -} - -} // namespace feed -} // namespace custom_trainer -} // namespace paddle diff --git a/paddle/fluid/train/custom_trainer/feed/io/shell.h b/paddle/fluid/train/custom_trainer/feed/io/shell.h deleted file mode 100644 index 7eca3d8e..00000000 --- a/paddle/fluid/train/custom_trainer/feed/io/shell.h +++ /dev/null @@ -1,64 +0,0 @@ -#pragma once - -#include -#include -#ifdef _WIN32 -#include -#else -#include -#endif -#include -#ifndef _WIN32 -#include -#endif -#include -#include -#include -#include "paddle/fluid/platform/port.h" -#include "paddle/fluid/string/string_helper.h" - -namespace paddle { -namespace custom_trainer { -namespace feed { - -inline bool& shell_verbose_internal() { - static bool x = false; - return x; -} - -inline bool shell_verbose() { - return shell_verbose_internal(); -} - -inline void shell_set_verbose(bool x) { - shell_verbose_internal() = x; -} - -extern std::shared_ptr shell_fopen(const std::string& path, const std::string& mode); - -extern std::shared_ptr shell_popen( - const std::string& cmd, - const std::string& mode, - int* err_no); - -extern std::pair, std::shared_ptr> shell_p2open(const std::string& cmd); - -inline void shell_execute(const std::string& cmd) { - int err_no = 0; - do { - err_no = 0; - shell_popen(cmd, "w", &err_no); - } while (err_no == -1); -} - -extern std::string shell_get_command_output(const std::string& cmd); - -extern void shell_add_read_converter(std::string& path, bool& is_pipe, const std::string& converter); - -extern std::shared_ptr shell_open(const std::string& path, bool is_pipe, const std::string& mode, size_t buffer_size, int* err_no = 0); - -extern void shell_add_write_converter(std::string& path, bool& is_pipe, const std::string& converter); - -} // namespace feed -} // namespace custom_trainer -} // namespace paddle diff --git a/paddle/fluid/train/custom_trainer/feed/main.cc b/paddle/fluid/train/custom_trainer/feed/main.cc deleted file mode 100644 index df40840b..00000000 --- a/paddle/fluid/train/custom_trainer/feed/main.cc +++ /dev/null @@ -1,81 +0,0 @@ -#include -#include -#include -#include "paddle/fluid/train/custom_trainer/feed/trainer_context.h" -#include "paddle/fluid/platform/init.h" -#include "paddle/fluid/train/custom_trainer/feed/process/process.h" -#include "paddle/fluid/train/custom_trainer/feed/process/init_env_process.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/pybind/pybind.h" - -using namespace paddle::custom_trainer::feed; - 
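-// Role assignment in main() below is positional: a single-node job takes both
-// roles, otherwise even ranks (rank_id % 2 == 0) become WORKER and odd ranks
-// become PSERVER, which implicitly assumes the job is launched with an even
-// process count split half and half between the two roles.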
-DEFINE_string(feed_trainer_conf_path, "./conf/trainer.yaml", "path of trainer conf"); - -int main(int argc, char* argv[]) { - google::InitGoogleLogging(argv[0]); - //gflags - google::ParseCommandLineFlags(&argc, &argv, true); - std::string gflag_conf = "./conf/gflags.conf"; - google::SetCommandLineOption("flagfile", gflag_conf.c_str()); - - //load trainer config - auto trainer_context_ptr = std::make_shared(); - trainer_context_ptr->cache_dict.reset(new SignCacheDict); - trainer_context_ptr->trainer_config = YAML::LoadFile(FLAGS_feed_trainer_conf_path); - - //environment - auto& config = trainer_context_ptr->trainer_config; - std::string env_class = config["environment"]["environment_class"].as(); - trainer_context_ptr->environment.reset(CREATE_INSTANCE(RuntimeEnvironment, env_class)); - if (trainer_context_ptr->environment->initialize(config["environment"]) != 0) { - return -1; - } - auto* environment = trainer_context_ptr->environment.get(); - environment->wireup(); - VLOG(2) << "node_num: " << environment->node_num(EnvironmentRole::ALL); - if (environment->node_num(EnvironmentRole::ALL) == 1) { - environment->add_role(EnvironmentRole::WORKER); - environment->add_role(EnvironmentRole::PSERVER); - } else if (environment->rank_id(EnvironmentRole::ALL) % 2 == 0) { - environment->add_role(EnvironmentRole::WORKER); - } else { - environment->add_role(EnvironmentRole::PSERVER); - } - trainer_context_ptr->pslib.reset(new PSlib()); - std::string ps_config = config["environment"]["ps"].as(); - trainer_context_ptr->environment->barrier(EnvironmentRole::ALL); - trainer_context_ptr->pslib->initialize(ps_config, environment); - //VLOG(3) << "Node Start With Role:" << role; - - - if (environment->is_role(EnvironmentRole::WORKER)) { - std::vector process_name_list = { - "InitEnvProcess", - "LearnerProcess" - }; - for (const auto& process_name : process_name_list) { - Process* process = CREATE_INSTANCE(Process, process_name); - if (process == NULL) { - VLOG(1) << "Process:" << process_name << " does not exist"; - return -1; - } - if (process->initialize(trainer_context_ptr) != 0) { - VLOG(1) << "Process:" << process_name << " initialize failed"; - return -1; - } - trainer_context_ptr->process_list.push_back(std::shared_ptr(process)); - } - for (auto& process : trainer_context_ptr->process_list) { - process->run(); - } - - } - - //TODO exit control - bool running = true; - while (running) { - sleep(10000); - } - return 0; -} diff --git a/paddle/fluid/train/custom_trainer/feed/model/epoch_donefile.txt b/paddle/fluid/train/custom_trainer/feed/model/epoch_donefile.txt deleted file mode 100644 index 48fa79fd..00000000 --- a/paddle/fluid/train/custom_trainer/feed/model/epoch_donefile.txt +++ /dev/null @@ -1,3 +0,0 @@ -20190710 1562775817 afs:/user/feed/mlarch/feed_multiTarget_model/magnet_duration_model_new_label2/batch_model/20190710_18 21 18 -20190710 1562779976 afs:/user/feed/mlarch/feed_multiTarget_model/magnet_duration_model_new_label2/batch_model/20190710_18 22 18 -20190711 1562783841 afs:/user/feed/mlarch/feed_multiTarget_model/magnet_duration_model_new_label2/batch_model/20190711_0 1565625600 1565625600 diff --git a/paddle/fluid/train/custom_trainer/feed/monitor/auc_monitor.cc b/paddle/fluid/train/custom_trainer/feed/monitor/auc_monitor.cc deleted file mode 100644 index 9b45825b..00000000 --- a/paddle/fluid/train/custom_trainer/feed/monitor/auc_monitor.cc +++ /dev/null @@ -1,154 +0,0 @@ -#include "paddle/fluid/train/custom_trainer/feed/monitor/auc_monitor.h" -#include 
"paddle/fluid/train/custom_trainer/feed/executor/multi_thread_executor.h" - -namespace paddle { -namespace custom_trainer { -namespace feed { - -int AucMonitor::initialize(const YAML::Node& config, std::shared_ptr context_ptr) { - Monitor::initialize(config, context_ptr); - _target_idx = config["target_idx"].as(); - _target_name = config["target"].as(); - _table_size = 1000000; - if (config["table_size"]) { - _table_size = config["table_size"].as(); - } - set_table_size(_table_size); - _compute_interval = config["compute_interval"].as(); - CHECK(_compute_interval % 60 == 0); - return 0; -} - -void AucMonitor::add_data(int epoch_id, - const MultiThreadExecutor* executor, ScopeExecutorContext* ctx) { - auto num = ctx->sample_num(); - auto* samples = ctx->samples(); - CHECK(num > 0); - std::lock_guard lock(_mutex); - for (int i = 0; i < num; ++i) { - auto& instance = samples[i]; - add_unlocked(instance.predicts[_target_idx], std::lround(instance.labels[_target_idx])); - } -} - -bool AucMonitor::need_compute_result(int epoch_id) { - CHECK(_epoch_accessor != nullptr); - uint64_t epoch_time = _epoch_accessor->epoch_timestamp(epoch_id); - return epoch_time % _compute_interval == 0; -} -void AucMonitor::compute_result() { - auto* environment = Monitor::_context_ptr->environment.get(); - double* table[2] = {&_table[0][0], &_table[1][0]}; - for (int i = 0; i < 2; i++) { - environment->all_reduce_in_place(table[i], - _table_size, ReduceOperator::SUM, EnvironmentRole::WORKER); - } - double area = 0; - double fp = 0; - double tp = 0; - for (int i = _table_size - 1; i >= 0; i--) { - double newfp = fp + table[0][i]; - double newtp = tp + table[1][i]; - area += (newfp - fp) * (tp + newtp) / 2; - fp = newfp; - tp = newtp; - } - _auc = area / (fp * tp); - _mae = environment->all_reduce(_local_abserr, - ReduceOperator::SUM, EnvironmentRole::WORKER) / (fp + tp); - _rmse = sqrt(environment->all_reduce(_local_sqrerr, - ReduceOperator::SUM, EnvironmentRole::WORKER) / (fp + tp)); - _actual_ctr = tp / (fp + tp); - _predicted_ctr = environment->all_reduce(_local_pred, - ReduceOperator::SUM, EnvironmentRole::WORKER) / (fp + tp); - _size = fp + tp; - calculate_bucket_error(); -} - -std::string AucMonitor::format_result() { - double copc = 0.0; - if (fabs(_predicted_ctr) > 1e-6) { - copc = _actual_ctr / _predicted_ctr; - } - return paddle::string::format_string("AUC=%.6f BUCKET_ERROR=%.6f MAE=%.6f RMSE=%.6f " - "Actual CTR=%.6f Predicted CTR=%.6f COPC=%.6f INS Count=%.0f", - _auc, - _bucket_error, - _mae, - _rmse, - _actual_ctr, - _predicted_ctr, - copc, - _size); -} - -void AucMonitor::add_unlocked(double pred, int label) { - if (std::isnan(pred)) { - VLOG(2) << "pred[" << pred << "] outside of [0,1]"; - return; - } - CHECK(pred >= 0 && pred <= 1) << "pred[" << pred << "] outside of [0,1]"; - CHECK(label == 0 || label == 1) << "label[" << label << "] invalid"; - _table[label][std::min(int(pred * _table_size), _table_size - 1)]++; - _local_abserr += fabs(pred - label); - _local_sqrerr += (pred - label) * (pred - label); - _local_pred += pred; -} - -void AucMonitor::calculate_bucket_error() { - double last_ctr = -1; - double impression_sum = 0; - double ctr_sum = 0.0; - double click_sum = 0.0; - double error_sum = 0.0; - double error_count = 0; - double* table[2] = {&_table[0][0], &_table[1][0]}; - for (int i = 0; i < _table_size; i++) { - double click = table[1][i]; - double show = table[0][i] + table[1][i]; - double ctr = (double)i / _table_size; - if (fabs(ctr - last_ctr) > kMaxSpan) { - last_ctr = ctr; - 
-void AucMonitor::calculate_bucket_error() {
-    double last_ctr = -1;
-    double impression_sum = 0;
-    double ctr_sum = 0.0;
-    double click_sum = 0.0;
-    double error_sum = 0.0;
-    double error_count = 0;
-    double* table[2] = {&_table[0][0], &_table[1][0]};
-    for (int i = 0; i < _table_size; i++) {
-        double click = table[1][i];
-        double show = table[0][i] + table[1][i];
-        double ctr = (double)i / _table_size;
-        if (fabs(ctr - last_ctr) > kMaxSpan) {
-            last_ctr = ctr;
-            impression_sum = 0.0;
-            ctr_sum = 0.0;
-            click_sum = 0.0;
-        }
-        impression_sum += show;
-        ctr_sum += ctr * show;
-        click_sum += click;
-        double adjust_ctr = ctr_sum / impression_sum;
-        double relative_error = sqrt((1 - adjust_ctr) / (adjust_ctr * impression_sum));
-        if (relative_error < kRelativeErrorBound) {
-            double actual_ctr = click_sum / impression_sum;
-            double relative_ctr_error = fabs(actual_ctr / adjust_ctr - 1);
-            error_sum += relative_ctr_error * impression_sum;
-            error_count += impression_sum;
-            last_ctr = -1;
-        }
-    }
-    _bucket_error = error_count > 0 ? error_sum / error_count : 0.0;
-}
-
-void AucMonitor::set_table_size(int table_size) {
-    CHECK(table_size >= 1);
-    _table_size = table_size;
-    for (int i = 0; i < 2; i++) {
-        _table[i] = std::vector<double>();
-    }
-    reset();
-}
-
-void AucMonitor::reset() {
-    for (int i = 0; i < 2; i++) {
-        _table[i].assign(_table_size, 0.0);
-    }
-    _local_abserr = 0;
-    _local_sqrerr = 0;
-    _local_pred = 0;
-}
-
-REGIST_CLASS(Monitor, AucMonitor);
-
-} // namespace feed
-} // namespace custom_trainer
-} // namespace paddle
diff --git a/paddle/fluid/train/custom_trainer/feed/monitor/auc_monitor.h b/paddle/fluid/train/custom_trainer/feed/monitor/auc_monitor.h
deleted file mode 100644
index c668a731..00000000
--- a/paddle/fluid/train/custom_trainer/feed/monitor/auc_monitor.h
+++ /dev/null
@@ -1,61 +0,0 @@
-#pragma once
-#include <mutex>
-#include <cmath> //std::lround
-#include "paddle/fluid/train/custom_trainer/feed/monitor/monitor.h"
-
-namespace paddle {
-namespace custom_trainer {
-namespace feed {
-
-//TODO: finish AucMonitor
-
-class AucMonitor : public Monitor {
-public:
-    AucMonitor() {}
-    virtual ~AucMonitor() {}
-
-    virtual int initialize(const YAML::Node& config,
-        std::shared_ptr<TrainerContext> context_ptr) override;
-
-    //Adds one record; the Monitor pulls whatever it needs from the Executor
-    virtual void add_data(int epoch_id,
-        const MultiThreadExecutor* executor, ScopeExecutorContext*);
-
-    //Whether to start computing the statistics
-    virtual bool need_compute_result(int epoch_id);
-    //Computes the current result
-    virtual void compute_result();
-    //Formats the current statistics for output
-    virtual std::string format_result();
-
-    virtual void reset();
-
-protected:
-    uint32_t _target_idx;
-    std::string _target_name;
-    std::string _name;
-    std::string _output_var;
-    std::mutex _mutex;
-    double _local_abserr, _local_sqrerr, _local_pred;
-    double _auc;
-    double _mae;
-    double _rmse;
-    double _actual_ctr, _predicted_ctr;
-    double _size;
-    double _bucket_error;
-    int _table_size;
-    void add_unlocked(double pred, int label);
-
-private:
-    void calculate_bucket_error();
-    void set_table_size(int table_size);
-
-    uint32_t _compute_interval;
-    std::vector<double> _table[2];
-    static constexpr double kRelativeErrorBound = 0.05;
-    static constexpr double kMaxSpan = 0.01;
-};
-
-} // namespace feed
-} // namespace custom_trainer
-} // namespace paddle
diff --git a/paddle/fluid/train/custom_trainer/feed/monitor/cost_monitor.cc b/paddle/fluid/train/custom_trainer/feed/monitor/cost_monitor.cc
deleted file mode 100644
index 1c247e0a..00000000
--- a/paddle/fluid/train/custom_trainer/feed/monitor/cost_monitor.cc
+++ /dev/null
@@ -1,32 +0,0 @@
-#include "paddle/fluid/train/custom_trainer/feed/monitor/cost_monitor.h"
-#include "paddle/fluid/train/custom_trainer/feed/executor/multi_thread_executor.h"
-
-namespace paddle {
-namespace custom_trainer {
-namespace feed {
-
-int CostMonitor::initialize(const YAML::Node& config, std::shared_ptr<TrainerContext> context_ptr) {
-    Monitor::initialize(config, context_ptr);
-    if (config["compute_interval"]) {
-        _compute_interval = config["compute_interval"].as<uint32_t>();
-    }
-    return 0;
-}
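// Annotation: add_data() below only counts batches -- the per-batch cost is
// stubbed to 1ms until the TODO is resolved. Assuming ScopeExecutorContext
// were extended with begin/end timestamps (hypothetical members, not in this
// patch), the intended accounting would look like:
//
//     _total_time_ms += ctx->end_time_ms() - ctx->begin_time_ms();
//     ++_total_cnt;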
-void CostMonitor::add_data(int epoch_id,
-    const MultiThreadExecutor* executor, ScopeExecutorContext* ctx) {
-    auto num = ctx->sample_num();
-    auto* samples = ctx->samples();
-    CHECK(executor != nullptr);
-    //TODO use paddle time
-    _total_time_ms += 1;
-    _total_cnt++;
-}
-
-bool CostMonitor::need_compute_result(int epoch_id) {
-    uint64_t epoch_time = _epoch_accessor->epoch_timestamp(epoch_id);
-    return epoch_time % _compute_interval == 0;
-}
-
-} // namespace feed
-} // namespace custom_trainer
-} // namespace paddle
diff --git a/paddle/fluid/train/custom_trainer/feed/monitor/cost_monitor.h b/paddle/fluid/train/custom_trainer/feed/monitor/cost_monitor.h
deleted file mode 100755
index 44e31544..00000000
--- a/paddle/fluid/train/custom_trainer/feed/monitor/cost_monitor.h
+++ /dev/null
@@ -1,54 +0,0 @@
-#pragma once
-#include <mutex>
-#include <cmath> //std::lround
-#include "paddle/fluid/train/custom_trainer/feed/monitor/monitor.h"
-
-namespace paddle {
-namespace custom_trainer {
-namespace feed {
-
-// cost time profile
-class CostMonitor : public Monitor {
-public:
-    CostMonitor() : _total_time_ms(0), _total_cnt(0), _avg_time_ms(0), _compute_interval(0) {}
-    virtual ~CostMonitor() {}
-
-    virtual int initialize(const YAML::Node& config,
-        std::shared_ptr<TrainerContext> context_ptr) override;
-
-    //Adds one record; the Monitor pulls whatever it needs from the Executor
-    virtual void add_data(int epoch_id,
-        const MultiThreadExecutor* executor, ScopeExecutorContext*);
-
-    //Whether to start computing the statistics
-    virtual bool need_compute_result(int epoch_id);
-    //Computes the current result
-    virtual void compute_result() {
-        CHECK(_total_cnt != 0);
-        _avg_time_ms = _total_time_ms / _total_cnt;
-    }
-    //Formats the current statistics for output
-    virtual std::string format_result() {
-        return paddle::string::format_string(
-            "Monitor %s: Cost Time=%lu", Monitor::_name.c_str(), _avg_time_ms);
-    }
-
-    virtual void reset() {
-        _total_time_ms = 0;
-        _total_cnt = 0;
-        _avg_time_ms = 0;
-    }
-
-protected:
-    std::string _name;
-
-private:
-    uint64_t _total_time_ms;
-    uint64_t _total_cnt;
-    uint64_t _avg_time_ms;
-    uint32_t _compute_interval;
-};
-
-} // namespace feed
-} // namespace custom_trainer
-} // namespace paddle
diff --git a/paddle/fluid/train/custom_trainer/feed/monitor/monitor.h b/paddle/fluid/train/custom_trainer/feed/monitor/monitor.h
deleted file mode 100755
index ab698130..00000000
--- a/paddle/fluid/train/custom_trainer/feed/monitor/monitor.h
+++ /dev/null
@@ -1,56 +0,0 @@
-#pragma once
-#include <memory>
-#include "paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.h"
-#include "paddle/fluid/train/custom_trainer/feed/common/registerer.h"
-#include "paddle/fluid/train/custom_trainer/feed/trainer_context.h"
-#include "paddle/fluid/train/custom_trainer/feed/executor/executor.h"
-#include "paddle/fluid/train/custom_trainer/feed/dataset/data_reader.h"
-#include "paddle/fluid/train/custom_trainer/feed/common/runtime_environment.h"
-
-namespace paddle {
-namespace custom_trainer {
-namespace feed {
-class MultiThreadExecutor;
-class ScopeExecutorContext;
-
-class Monitor {
-public:
-    Monitor() {}
-    virtual ~Monitor() {}
-
-    virtual int initialize(const YAML::Node& config,
-        std::shared_ptr<TrainerContext> context_ptr) {
-        _name = config["name"].as<std::string>();
-        _context_ptr = context_ptr;
-        _epoch_accessor = _context_ptr->epoch_accessor.get();
-        return 0;
-    }
-
-    //Adds one record; the Monitor pulls whatever it needs from the Executor
-    virtual void add_data(int epoch_id,
-        const MultiThreadExecutor* executor, ScopeExecutorContext*) = 0;
-
-    //Whether to compute statistics for the given epoch_id
-    virtual bool need_compute_result(int epoch_id) = 0;
-    //Computes the current result
-    virtual void compute_result() = 0;
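    // Annotation on the expected call sequence: MultiThreadExecutor feeds
    // add_data() for every finished scope batch; at each epoch boundary the
    // framework asks need_compute_result(epoch_id) and, only when it returns
    // true, runs compute_result(), logs format_result(), then calls reset()
    // so the next window starts clean.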
-    //Formats the current statistics for output
-    virtual std::string format_result() = 0;
-
-    virtual void reset() = 0;
-
-    const std::string& get_name() {
-        return _name;
-    }
-
-protected:
-    std::string _name;
-    EpochAccessor* _epoch_accessor = nullptr;
-    std::shared_ptr<TrainerContext> _context_ptr;
-};
-
-REGIST_REGISTERER(Monitor);
-
-} // namespace feed
-} // namespace custom_trainer
-} // namespace paddle
diff --git a/paddle/fluid/train/custom_trainer/feed/process/CMakeLists.txt b/paddle/fluid/train/custom_trainer/feed/process/CMakeLists.txt
deleted file mode 100644
index cce01086..00000000
--- a/paddle/fluid/train/custom_trainer/feed/process/CMakeLists.txt
+++ /dev/null
@@ -1 +0,0 @@
-cc_library(custom_trainer_process SRCS process.cc init_env_process.cc DEPS memory)
diff --git a/paddle/fluid/train/custom_trainer/feed/process/data_set_process.h b/paddle/fluid/train/custom_trainer/feed/process/data_set_process.h
deleted file mode 100644
index 304a5aa9..00000000
--- a/paddle/fluid/train/custom_trainer/feed/process/data_set_process.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- *Author: xiexionghang
- *Organizes the reading of training samples
- */
-#pragma once
-#include "paddle/fluid/train/custom_trainer/feed/process/process.h"
-
-namespace paddle {
-namespace custom_trainer {
-namespace feed {
-
-class DatasetProcess : public Process {
-public:
-    DatasetProcess() {}
-    virtual ~DatasetProcess() {}
-    virtual int initialize(std::shared_ptr<TrainerContext> context_ptr);
-private:
-    std::map<std::string, std::shared_ptr<Dataset>> _dataset_map;
-};
-
-} // namespace feed
-} // namespace custom_trainer
-} // namespace paddle
diff --git a/paddle/fluid/train/custom_trainer/feed/process/init_env_process.cc b/paddle/fluid/train/custom_trainer/feed/process/init_env_process.cc
deleted file mode 100644
index 05b398d3..00000000
--- a/paddle/fluid/train/custom_trainer/feed/process/init_env_process.cc
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- *Author: xiexionghang
- *Reads the overall configuration of the training environment and initializes it
- */
-#include "paddle/fluid/platform/init.h"
-#include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/train/custom_trainer/feed/io/file_system.h"
-#include "paddle/fluid/train/custom_trainer/feed/dataset/dataset.h"
-#include "paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.h"
-#include "paddle/fluid/train/custom_trainer/feed/process/init_env_process.h"
-
-namespace paddle {
-namespace custom_trainer {
-namespace feed {
-
-int InitEnvProcess::initialize(std::shared_ptr<TrainerContext> context_ptr) {
-    Process::initialize(context_ptr);
-    paddle::framework::InitDevices(false);
-    context_ptr->cpu_place = paddle::platform::CPUPlace();
-
-    YAML::Node config = _context_ptr->trainer_config;
-
-    //file_system
-    context_ptr->file_system.reset(CREATE_INSTANCE(FileSystem, "AutoFileSystem"));
-    if (context_ptr->file_system->initialize(config["io"], context_ptr) != 0) {
-        return -1;
-    }
-
-    //epoch
-    std::string epoch_class = config["epoch"]["epoch_class"].as<std::string>();
-    context_ptr->epoch_accessor.reset(CREATE_INSTANCE(EpochAccessor, epoch_class));
-    if (context_ptr->epoch_accessor->initialize(config["epoch"], context_ptr) != 0) {
-        return -1;
-    }
-
-    //Dataset
-    context_ptr->dataset.reset(new Dataset());
-    if (context_ptr->dataset->initialize(config["dataset"], context_ptr) != 0) {
-        return -1;
-    }
-
-    VLOG(3) << "Env initialize success";
-    return 0;
-}
-
-int InitEnvProcess::run() {
-    auto* epoch_accessor = _context_ptr->epoch_accessor.get();
-    VLOG(3) << "Trainer Resume From epoch:" << epoch_accessor->current_epoch_id();
-    auto next_epoch_id = epoch_accessor->next_epoch_id(epoch_accessor->current_epoch_id());
-    _context_ptr->dataset->pre_detect_data(next_epoch_id);
-    return 0;
-}
-
-} // namespace feed
-} // namespace custom_trainer
-} // namespace paddle
diff --git a/paddle/fluid/train/custom_trainer/feed/process/init_env_process.h b/paddle/fluid/train/custom_trainer/feed/process/init_env_process.h
deleted file mode 100644
index 340ee681..00000000
--- a/paddle/fluid/train/custom_trainer/feed/process/init_env_process.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- *Author: xiexionghang
- *Reads the overall configuration of the training environment and initializes it
- */
-#pragma once
-#include "paddle/fluid/train/custom_trainer/feed/process/process.h"
-
-namespace paddle {
-namespace custom_trainer {
-namespace feed {
-
-class InitEnvProcess : public Process {
-public:
-    InitEnvProcess() {}
-    virtual ~InitEnvProcess() {}
-    virtual int initialize(std::shared_ptr<TrainerContext> context_ptr);
-    virtual int run();
-};
-
-} // namespace feed
-} // namespace custom_trainer
-} // namespace paddle
diff --git a/paddle/fluid/train/custom_trainer/feed/process/learner_process.cc b/paddle/fluid/train/custom_trainer/feed/process/learner_process.cc
deleted file mode 100755
index bef0234b..00000000
--- a/paddle/fluid/train/custom_trainer/feed/process/learner_process.cc
+++ /dev/null
@@ -1,302 +0,0 @@
-/*
- *Author: xiexionghang
- *Trains the feed samples
- */
-#include <map>
-#include "paddle/fluid/platform/timer.h"
-#include "paddle/fluid/train/custom_trainer/feed/io/file_system.h"
-#include "paddle/fluid/train/custom_trainer/feed/dataset/dataset.h"
-#include "paddle/fluid/train/custom_trainer/feed/accessor/epoch_accessor.h"
-#include "paddle/fluid/train/custom_trainer/feed/process/learner_process.h"
-
-namespace paddle {
-namespace custom_trainer {
-namespace feed {
-
-int LearnerProcess::initialize(std::shared_ptr<TrainerContext> context_ptr) {
-    int ret = Process::initialize(context_ptr);
-    auto& config = _context_ptr->trainer_config;
-    _is_dump_cache_model = config["dump_cache_model"].as<bool>(false);
-    _cache_load_converter = config["load_cache_converter"].as<std::string>("");
-    _startup_dump_inference_base = config["startup_dump_inference_base"].as<bool>(false);
-    if (config["executor"]) {
-        _executors.resize(config["executor"].size());
-        for (size_t i = 0; i < _executors.size(); ++i) {
-            _executors[i].reset(new MultiThreadExecutor());
-            CHECK(_executors[i]->initialize(config["executor"][i], context_ptr) == 0);
-        }
-    }
-    return 0;
-}
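// Annotation on the cache layout consumed by update_cache_model() below:
// an inference base dump holds one "<table_id as %03d>_cache/" directory of
// part files per sparse table, and each line starts with a decimal feature
// sign. Only that leading sign is recovered (strtoul stops at the first
// non-digit), so any payload behind it is ignored when rebuilding the
// SignCacheDict.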
-// Refreshes the cache model stored on every node
-int LearnerProcess::update_cache_model(uint64_t epoch_id, ModelSaveWay way) {
-    auto fs = _context_ptr->file_system;
-    auto* ps_client = _context_ptr->pslib->ps_client();
-    auto* environment = _context_ptr->environment.get();
-    auto* epoch_accessor = _context_ptr->epoch_accessor.get();
-    if (!epoch_accessor->need_save_model(epoch_id, way)) {
-        return 0;
-    }
-    auto* ps_param = _context_ptr->pslib->get_param();
-    if (_is_dump_cache_model && way == ModelSaveWay::ModelSaveInferenceBase) {
-        auto model_dir = epoch_accessor->model_save_path(epoch_id, way);
-        auto& table_param = ps_param->server_param().downpour_server_param().downpour_table_param();
-        for (auto& param : table_param) {
-            if (param.type() != paddle::PS_SPARSE_TABLE) {
-                continue;
-            }
-            auto cache_model_path = fs->path_join(
-                model_dir, string::format_string("%03d_cache/", param.table_id()));
-            if (!fs->exists(cache_model_path)) {
-                continue;
-            }
-            auto& cache_dict = *(_context_ptr->cache_dict.get());
-            cache_dict.clear();
-            cache_dict.reserve(_cache_sign_max_num);
-            auto cache_file_list = fs->list(fs->path_join(cache_model_path, "part*"));
-            for (auto& cache_path : cache_file_list) {
-                auto cache_file = fs->open_read(cache_path, _cache_load_converter);
-                char* buffer = nullptr;
-                size_t buffer_size = 0;
-                ssize_t line_len = 0;
-                while ((line_len = getline(&buffer, &buffer_size, cache_file.get())) != -1) {
-                    if (line_len <= 1) {
-                        continue;
-                    }
-                    char* data_ptr = NULL;
-                    cache_dict.append(strtoul(buffer, &data_ptr, 10));
-                }
-                if (buffer != nullptr) {
-                    free(buffer);
-                }
-            }
-            break;
-        }
-    }
-    return 0;
-}
-
-int LearnerProcess::wait_save_model(uint64_t epoch_id, ModelSaveWay way, bool is_force_dump) {
-    ContextStatusGurad status_guard(_context_ptr, TrainerStatus::Saving);
-    auto fs = _context_ptr->file_system;
-    auto* ps_client = _context_ptr->pslib->ps_client();
-    auto* environment = _context_ptr->environment.get();
-    auto* epoch_accessor = _context_ptr->epoch_accessor.get();
-    if (!environment->is_master_node(EnvironmentRole::WORKER)) {
-        return 0;
-    }
-    if (!is_force_dump && !epoch_accessor->need_save_model(epoch_id, way)) {
-        return 0;
-    }
-    paddle::platform::Timer timer;
-    timer.Start();
-    std::set<uint32_t> table_set;
-    auto model_dir = epoch_accessor->model_save_path(epoch_id, way);
-    for (auto& executor : _executors) {
-        const auto& table_accessors = executor->table_accessors();
-        for (auto& itr : table_accessors) {
-            table_set.insert(itr.first);
-        }
-        auto save_path = fs->path_join(model_dir, executor->train_exe_name() + "_param");
-        ENVLOG_WORKER_MASTER_NOTICE("Start save model, save_path: %s", save_path.c_str());
-        executor->save_persistables(save_path);
-    }
-    int ret_size = 0;
-    auto table_num = table_set.size();
-    std::future<int32_t> rets[table_num];
-    for (auto table_id : table_set) {
-        ENVLOG_WORKER_MASTER_NOTICE("Start save model, table_id: %d", table_id);
-        rets[ret_size++] = ps_client->save(table_id, model_dir, std::to_string((int)way));
-    }
-    int all_ret = 0;
-    for (int i = 0; i < ret_size; ++i) {
-        rets[i].wait();
-        all_ret |= rets[i].get();
-    }
-    timer.Pause();
-    ENVLOG_WORKER_MASTER_NOTICE("Save Model Cost(s): %f", timer.ElapsedSec());
-
-    // Save the cache model; only inference needs the cache model
-    auto* ps_param = _context_ptr->pslib->get_param();
-    if (_is_dump_cache_model && (way == ModelSaveWay::ModelSaveInferenceBase ||
-        way == ModelSaveWay::ModelSaveInferenceDelta)) {
-        auto& table_param = ps_param->server_param().downpour_server_param().downpour_table_param();
-        for (auto& param : table_param) {
-            if (param.type() != paddle::PS_SPARSE_TABLE) {
-                continue;
-            }
-            double cache_threshold = 0.0;
-            auto status = ps_client->get_cache_threshold(param.table_id(), cache_threshold);
-            CHECK(status.get() == 0) << "CacheThreshold Get failed!";
-            status = ps_client->cache_shuffle(param.table_id(), model_dir, std::to_string((int)way),
-                std::to_string(cache_threshold));
-            CHECK(status.get() == 0) << "Cache Shuffler Failed";
-            status = ps_client->save_cache(param.table_id(), model_dir, std::to_string((int)way));
-            auto feature_size = status.get();
-            CHECK(feature_size >= 0) << "Cache Save Failed";
-            auto cache_model_path = fs->path_join(
-                model_dir, string::format_string("%03d_cache/sparse_cache.meta", param.table_id()));
-            auto cache_meta_file = fs->open_write(cache_model_path, "");
-            auto meta = string::format_string("file_prefix:part\npart_num:%d\nkey_num:%d\n",
-                param.sparse_table_cache_file_num(), feature_size);
-            CHECK(fwrite(meta.c_str(), meta.size(), 1, cache_meta_file.get()) == 1) << "Cache Meta Failed";
-            if (feature_size > _cache_sign_max_num) {
-                _cache_sign_max_num = feature_size;
-            }
-        }
-    }
-    _context_ptr->epoch_accessor->update_model_donefile(epoch_id, way);
-
-    return all_ret;
-}
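// Companion note to wait_save_model() above: the sparse_cache.meta it emits
// is three key:value lines --
//     file_prefix:part
//     part_num:<sparse_table_cache_file_num()>
//     key_num:<feature count returned by save_cache>
// and key_num is also folded into _cache_sign_max_num so the next
// update_cache_model() can reserve the SignCacheDict up front.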
-int LearnerProcess::load_model(uint64_t epoch_id) {
-    auto* environment = _context_ptr->environment.get();
-    if (!environment->is_master_node(EnvironmentRole::WORKER)) {
-        return 0;
-    }
-    auto* fs = _context_ptr->file_system.get();
-    std::set<uint32_t> loaded_table_set;
-    auto model_dir = _context_ptr->epoch_accessor->checkpoint_path();
-    paddle::platform::Timer timer;
-    timer.Start();
-    for (auto& executor : _executors) {
-        const auto& table_accessors = executor->table_accessors();
-        for (auto& itr : table_accessors) {
-            if (loaded_table_set.count(itr.first)) {
-                continue;
-            }
-            auto table_model_path = fs->path_join(
-                model_dir, string::format_string("%03d", itr.first));
-            if ((!fs->exists(table_model_path)) || fs->list(table_model_path).size() == 0) {
-                VLOG(2) << "miss table_model:" << table_model_path << ", initialize by default";
-                auto scope = std::move(executor->fetch_scope());
-                CHECK(itr.second[0]->create(scope.get()) == 0);
-            } else {
-                ENVLOG_WORKER_MASTER_NOTICE("Loading model %s", table_model_path.c_str());
-                auto status = _context_ptr->ps_client()->load(itr.first,
-                    model_dir, std::to_string((int)ModelSaveWay::ModelSaveTrainCheckpoint));
-                CHECK(status.get() == 0) << "table load failed, id:" << itr.first;
-            }
-            loaded_table_set.insert(itr.first);
-        }
-    }
-    timer.Pause();
-    ENVLOG_WORKER_MASTER_NOTICE("Finished loading model, cost:%f", timer.ElapsedSec());
-    return 0;
-}
-
-int LearnerProcess::run() {
-    auto* dataset = _context_ptr->dataset.get();
-    auto* environment = _context_ptr->environment.get();
-    auto* epoch_accessor = _context_ptr->epoch_accessor.get();
-    uint64_t epoch_id = epoch_accessor->current_epoch_id();
-
-    ENVLOG_WORKER_MASTER_NOTICE("Resume train with epoch_id:%d %s", epoch_id, _context_ptr->epoch_accessor->text(epoch_id).c_str());
-    // Try to load the model, or fall back to default initialization
-    CHECK(load_model(epoch_id) == 0);
-    environment->barrier(EnvironmentRole::WORKER);
-
-    // Decide whether to dump an inference base first (TODO)
-    if (_startup_dump_inference_base) {
-        wait_save_model(epoch_id, ModelSaveWay::ModelSaveInferenceBase, _startup_dump_inference_base);
-        environment->barrier(EnvironmentRole::WORKER);
-    }
-
-    while (true) {
-        epoch_accessor->next_epoch();
-        _context_ptr->monitor_ssm.str("");
-        bool already_dump_inference_model = false;
-        epoch_id = epoch_accessor->current_epoch_id();
-        std::string epoch_log_title = paddle::string::format_string(
-            "train epoch_id:%d label:%s", epoch_id, epoch_accessor->text(epoch_id).c_str());
-        std::string data_path = paddle::string::to_string(dataset->epoch_data_path(epoch_id));
-        ENVLOG_WORKER_MASTER_NOTICE(" ==== begin %s ====", epoch_accessor->text(epoch_id).c_str());
-        //Step1. Wait until the epoch's samples are ready
-        {
-            ENVLOG_WORKER_MASTER_NOTICE(" %s, wait data ready:%s", epoch_log_title.c_str(), data_path.c_str());
-            dataset->pre_detect_data(epoch_id);
-            while (dataset->epoch_data_status(epoch_id) != DatasetStatus::Ready) {
-                sleep(30);
-                dataset->pre_detect_data(epoch_id);
-                ENVLOG_WORKER_MASTER_NOTICE(" epoch_id:%d data not ready, wait 30s", epoch_id);
-            }
-            ENVLOG_WORKER_MASTER_NOTICE(" Start %s, data is ready", epoch_log_title.c_str());
-            environment->barrier(EnvironmentRole::WORKER);
-        }
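        // Annotation on the channel handoff in Step2: the first executor of
        // an epoch fetches the raw channel from the dataset, and every later
        // executor with the same train_data_name replays the channel returned
        // by the previous run() via backup_input_map, so the epoch's data is
        // downloaded and parsed only once for the whole executor pipeline.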
-        //Step2. Run the training networks
-        {
-            ContextStatusGurad status_guard(_context_ptr, TrainerStatus::Training);
-            std::map<std::string, paddle::framework::Channel<DataItem>> backup_input_map;
-            for (auto& executor : _executors) {
-                environment->barrier(EnvironmentRole::WORKER);
-                paddle::platform::Timer timer;
-                timer.Start();
-                ENVLOG_WORKER_MASTER_NOTICE("Start executor:%s", executor->train_exe_name().c_str());
-                auto data_name = executor->train_data_name();
-                paddle::framework::Channel<DataItem> input_channel;
-                if (backup_input_map.count(data_name)) {
-                    input_channel = backup_input_map[data_name];
-                } else {
-                    input_channel = dataset->fetch_data(data_name, epoch_id);
-                }
-                input_channel = executor->run(input_channel, dataset->data_parser(data_name));
-                timer.Pause();
-                ENVLOG_WORKER_MASTER_NOTICE("End executor:%s, cost:%f", executor->train_exe_name().c_str(), timer.ElapsedSec());
-
-                // Wait for the async gradient pushes to finish
-                _context_ptr->ps_client()->flush();
-                environment->barrier(EnvironmentRole::WORKER);
-                if (executor->is_dump_all_model()) {
-                    already_dump_inference_model = true;
-                    wait_save_model(epoch_id, ModelSaveWay::ModelSaveInferenceDelta);
-                }
-                backup_input_map[data_name] = input_channel;
-                environment->barrier(EnvironmentRole::WORKER);
-            }
-        }
-
-        //Step3. Dump Model For Delta&&Checkpoint
-        {
-            wait_save_model(epoch_id, ModelSaveWay::ModelSaveInferenceBase);
-            environment->barrier(EnvironmentRole::WORKER);
-            update_cache_model(epoch_id, ModelSaveWay::ModelSaveInferenceBase);
-            environment->barrier(EnvironmentRole::WORKER);
-
-            if (epoch_accessor->is_last_epoch(epoch_id)) {
-                wait_save_model(epoch_id, ModelSaveWay::ModelSaveTrainCheckpointBase);
-            } else {
-                wait_save_model(epoch_id, ModelSaveWay::ModelSaveTrainCheckpoint);
-            }
-            environment->barrier(EnvironmentRole::WORKER);
-            if (epoch_accessor->is_last_epoch(epoch_id) &&
-                environment->is_master_node(EnvironmentRole::WORKER)) {
-                paddle::platform::Timer timer;
-                timer.Start();
-                ENVLOG_WORKER_MASTER_NOTICE("Start shrink table");
-                for (auto& executor : _executors) {
-                    const auto& table_accessors = executor->table_accessors();
-                    for (auto& itr : table_accessors) {
-                        CHECK(itr.second[0]->shrink() == 0);
-                    }
-                }
-                timer.Pause();
-                ENVLOG_WORKER_MASTER_NOTICE("End shrink table, cost:%f", timer.ElapsedSec());
-            }
-            environment->barrier(EnvironmentRole::WORKER);
-
-            epoch_accessor->epoch_done(epoch_id);
-            environment->barrier(EnvironmentRole::WORKER);
-        }
-        ENVLOG_WORKER_MASTER_NOTICE(" ==== end %s ====", epoch_accessor->text(epoch_id).c_str());
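        // Save-cadence recap (annotation): inference deltas are dumped inside
        // Step2 right after any executor flagged is_dump_all_model(); Step3
        // then dumps the inference base, refreshes the sign cache, and writes
        // either a plain train checkpoint or, on the last epoch, the
        // checkpoint base followed by a table shrink on the master node.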
-        //Step4. Output Monitor && RunStatus
-        //TODO
-    }
-
-    return 0;
-}
-
-} // namespace feed
-} // namespace custom_trainer
-} // namespace paddle
diff --git a/paddle/fluid/train/custom_trainer/feed/process/learner_process.h b/paddle/fluid/train/custom_trainer/feed/process/learner_process.h
deleted file mode 100644
index 4daca3f6..00000000
--- a/paddle/fluid/train/custom_trainer/feed/process/learner_process.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- *Author: xiexionghang
- *Trains the feed samples
- */
-#pragma once
-#include "paddle/fluid/train/custom_trainer/feed/process/process.h"
-#include "paddle/fluid/train/custom_trainer/feed/executor/multi_thread_executor.h"
-
-namespace paddle {
-namespace custom_trainer {
-namespace feed {
-class LearnerProcess : public Process {
-public:
-    LearnerProcess() {}
-    virtual ~LearnerProcess() {}
-
-    virtual int run();
-    virtual int initialize(std::shared_ptr<TrainerContext> context_ptr);
-
-protected:
-    // Loads all models
-    virtual int load_model(uint64_t epoch_id);
-    // Synchronously saves all models; is_force_dump skips the dump condition and forces a dump
-    virtual int wait_save_model(uint64_t epoch_id, ModelSaveWay way, bool is_force_dump = false);
-    virtual int update_cache_model(uint64_t epoch_id, ModelSaveWay way);
-
-private:
-    bool _is_dump_cache_model;           // whether to dump the cache model
-    uint32_t _cache_sign_max_num = 0;    // maximum number of cache signs
-    std::string _cache_load_converter;   // converter script run before loading the cache
-    bool _startup_dump_inference_base;   // dump an inference base right at startup
-    std::vector<std::shared_ptr<MultiThreadExecutor>> _executors;
-};
-
-} // namespace feed
-} // namespace custom_trainer
-} // namespace paddle
diff --git a/paddle/fluid/train/custom_trainer/feed/process/process.cc b/paddle/fluid/train/custom_trainer/feed/process/process.cc
deleted file mode 100644
index 0e1cd5fc..00000000
--- a/paddle/fluid/train/custom_trainer/feed/process/process.cc
+++ /dev/null
@@ -1,17 +0,0 @@
-#include "paddle/fluid/train/custom_trainer/feed/process/process.h"
-#include "paddle/fluid/train/custom_trainer/feed/process/init_env_process.h"
-#include "paddle/fluid/train/custom_trainer/feed/process/learner_process.h"
-
-namespace paddle {
-namespace custom_trainer {
-namespace feed {
-REGIST_CLASS(Process, InitEnvProcess);
-REGIST_CLASS(Process, LearnerProcess);
-int Process::run() {
-    return 0;
-}
-
-} // namespace feed
-} // namespace custom_trainer
-} // namespace paddle
diff --git a/paddle/fluid/train/custom_trainer/feed/process/process.h b/paddle/fluid/train/custom_trainer/feed/process/process.h
deleted file mode 100644
index 127481e9..00000000
--- a/paddle/fluid/train/custom_trainer/feed/process/process.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#pragma once
-#include "paddle/fluid/train/custom_trainer/feed/common/registerer.h"
-#include "paddle/fluid/train/custom_trainer/feed/trainer_context.h"
-
-namespace paddle {
-namespace custom_trainer {
-namespace feed {
-
-class Process {
-public:
-    Process() {}
-    virtual ~Process() {}
-    virtual int initialize(std::shared_ptr<TrainerContext> context_ptr) {
-        _context_ptr = context_ptr.get();
-        return 0;
-    }
-    virtual int run();
-protected:
-    TrainerContext* _context_ptr = NULL;
-};
-REGIST_REGISTERER(Process);
-
-} // namespace feed
-} // namespace custom_trainer
-} // namespace paddle
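All of the trainer's pluggable stages go through the registerer machinery used above, so adding a stage is one registration plus one name in the process list. A minimal sketch, assuming only that REGIST_CLASS/CREATE_INSTANCE behave as in process.cc and main.cc; MyDumpProcess is hypothetical:

    #include "paddle/fluid/train/custom_trainer/feed/process/process.h"

    namespace paddle {
    namespace custom_trainer {
    namespace feed {

    // Hypothetical stage, e.g. publishing artifacts after training.
    class MyDumpProcess : public Process {
    public:
        virtual int run() {
            // _context_ptr exposes epoch_accessor, file_system, environment, ...
            return 0;
        }
    };
    REGIST_CLASS(Process, MyDumpProcess);

    } // namespace feed
    } // namespace custom_trainer
    } // namespace paddle

    // main() can then instantiate it by name:
    //     Process* p = CREATE_INSTANCE(Process, "MyDumpProcess");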
-TRAINER_PYTHON_HOME="/home/xiexionghang/paddle/py-paddle/" - -#环境准备 -if [ ! -f ${TRAINER_PYTHON_HOME}/python/bin/paddle ];then - echo "Miss File: ${TRAINER_PYTHON_HOME}/python/bin/paddle" - echo "TRAINER_PYTHON_HOME:${TRAINER_PYTHON_HOME} is invalid, Fix it, or Get From here:" - echo "wget ftp://cp01-arch-gr06.epc.baidu.com/home/xiexionghang/paddle/py-paddle.tar.gz" - echo "Then set TRAINER_PYTHON_HOME" - exit 0 -fi -TRAINER_PYTHON_BIN=${TRAINER_PYTHON_HOME}/python/bin/python -# for bad paddle 这里需要想办法解决,paddle的前置目录太多 -if [ ! -f ../../../third_party/install/pslib/lib/libps.so ];then - mkdir -p ../../../third_party/install/pslib/lib/ - ln -s ${TRAINER_PYTHON_HOME}/third_party/install/pslib/lib/libps.so ../../../third_party/install/pslib/lib/libps.so -fi - - -#生成模型配置 -#这里按名匹配 可能会出现匹配错误&兼容性差的问题,最好是先python解析yaml文件 -items=`grep " name:" conf/trainer.yaml | awk -F ':' '{print $2}' |awk '{sub("^ *","");sub(" *$","");print}'` -for item in ${items[@]}; -do - if [ ! -f scripts/${item}.py ];then - echo "Missing model_net config: scripts/${item}.py, skip it $item" - continue - fi - rm -rf model/$item - ${TRAINER_PYTHON_BIN} scripts/create_programs.py scripts/${item}.py - if [ $? -ne 0 ];then - echo "Create model with scripts/${item}.py failed" - exit 1 - fi -done - -#输出package包 -rm -rf package -mkdir -p package/log -cp -r bin conf tool scripts model so package -cp -r ${TRAINER_HODOOP_HOME} package/hadoop-client diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/create_programs.py b/paddle/fluid/train/custom_trainer/feed/scripts/create_programs.py deleted file mode 100644 index d0f54fd7..00000000 --- a/paddle/fluid/train/custom_trainer/feed/scripts/create_programs.py +++ /dev/null @@ -1,203 +0,0 @@ -#!/usr/bin/env python -#-*- coding:utf-8 -*- - -from __future__ import print_function, division -import os -import sys -import paddle -from paddle import fluid -import yaml - -def print_help(this_name): - """Print help - """ - dirname = os.path.dirname(this_name) - print('Usage: {} [model_dir]\n'.format(this_name)) - print(' example: {} {}'.format(this_name, os.path.join(dirname, 'example.py'))) - -class ModelBuilder: - """ - Attributes: - _save_path: Save path of programs - - def _inference(): - Build inference network(without loss and optimizer) - **This function is declared in the network_desc_path file, and will be set in initialize()** - - Returns: - list: inputs - and - list: outputs - pass - - def _loss_function(*outputs): - **This function is declared in the network_desc_path file, and will be set in initialize()** - Args: - *outputs: the second result of inference() - - Returns: - Variable: loss - and - list: labels - pass - """ - - def initialize(self, network_desc_path, save_path=None): - """compile the network description module - Args: - network_desc_path: path - save_path: model save path, default is ./model// - - Returns: - bool: True if succeed else False - """ - if not isinstance(network_desc_path, str): - print('network_desc_path must be str') - return False - - if not network_desc_path.endswith('.py'): - print('network_desc_path must be end with .py') - return False - - if not os.path.exists(network_desc_path): - print('file not exists:', network_desc_path) - return False - - scope = dict() - with open(network_desc_path, 'r') as f: - code = f.read() - compiled = compile(code, network_desc_path, 'exec') - exec(compiled, scope) - - if not 'inference' in scope: - print('inference not defined') - return False - - if not 'loss_function' in scope: - print('loss_function not defined') - 
return False - - if save_path is None: - # example /a/b/c.d -> ./model/c - save_path = os.path.join('./model', os.path.splitext(os.path.split(network_desc_path)[1])[0]) - print('save in the default path:', save_path) - - self._save_path = save_path - - self._inference = scope['inference'] - self._loss_function = scope['loss_function'] - - return True - - def build_and_save(self): - """Build programs and save to _save_path - """ - scope1 = fluid.Scope() - main_program = fluid.Program() - startup_program = fluid.Program() - with fluid.program_guard(main_program, startup_program): - #input_accessor, sparses, inputs, outputs, monitors - inference_info = self._inference() - inputs = inference_info['inputs'] - outputs = inference_info['outputs'] - sparses = inference_info['sparses'] - monitors = inference_info['monitors'] - input_accessor = inference_info['accessors'] - - test_program = main_program.clone(for_test=True) - loss, labels = self._loss_function(*outputs) - - optimizer = fluid.optimizer.SGD(learning_rate=1.0) - params_grads = optimizer.backward(loss) - - if not os.path.exists(self._save_path): - os.makedirs(self._save_path) - - programs = { - 'startup_program': startup_program, - 'main_program': main_program, - 'test_program': test_program, - } - for name, program in programs.items(): - with open(os.path.join(self._save_path, name), 'w') as f: - f.write(program.desc.serialize_to_string()) - with open(os.path.join(self._save_path, name + '.pbtxt'), 'w') as fout: - fout.write(str(program)) - - fluid.io.save_inference_model(self._save_path, - [var.name for var in inputs], - outputs, - executor=None, - main_program=test_program, - model_filename='inference_program', - program_only=True) - with open(os.path.join(self._save_path, 'inference_program'), "rb") as f: - program_desc_str = f.read() - infer_program = fluid.Program.parse_from_string(program_desc_str) - with open(os.path.join(self._save_path, 'inference_program.pbtxt'), 'w') as fout: - fout.write(str(infer_program)) - - params = filter(fluid.io.is_parameter, main_program.list_vars()) - vars = [] - sums=[] - for param in params: - if param.name.find("bn") == 0: - sums.append({"name": param.name, "shape": param.shape}); - else: - vars.append({"name": param.name, "shape": param.shape}); - - for accessor in input_accessor: - if (accessor["input"] == "sparses"): - accessor["input"] = sparses - if (accessor["input"] == "vars"): - accessor["input"] = vars - if (accessor["input"] == "sums"): - accessor["input"] = sums - if (accessor["input"] == "labels"): - accessor["input"] = [ - {"label_name": label.name, "shape": label.shape, "output_name": output.name } - for (label, output) in zip(labels, outputs) ] - - for monitor in monitors: - idx = outputs.index(monitor['target']) - monitor["target_idx"] = idx - monitor["target"] = outputs[idx].name - - model_desc_path = os.path.join(self._save_path, 'model.yaml') - model_desc = { - 'inputs': [{"name": var.name, "shape": var.shape} for var in inputs], - 'outputs': [{"name": var.name, "shape": var.shape} for var in outputs], - 'labels': [{"name": var.name, "shape": var.shape} for var in labels], - 'loss': loss.name, - 'input_accessor': input_accessor, - 'monitor': monitors, - 'aa_Attention': 'Do Not Modify This File Manually, Unless You Really Know It' - } - - with open(model_desc_path, 'w') as f: - yaml.safe_dump(model_desc, f, encoding='utf-8', allow_unicode=True, default_flow_style=None) - - -def main(argv): - """Create programs - Args: - argv: arg list, length should be 2 - """ - if len(argv) < 
2: - print_help(argv[0]) - exit(1) - network_desc_path = argv[1] - - if len(argv) > 2: - save_path = argv[2] - else: - save_path = None - - builder = ModelBuilder() - if not builder.initialize(network_desc_path, save_path): - print_help(argv[0]) - exit(1) - builder.build_and_save() - -if __name__ == "__main__": - main(sys.argv) diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/example.py b/paddle/fluid/train/custom_trainer/feed/scripts/example.py deleted file mode 100644 index 54e1128f..00000000 --- a/paddle/fluid/train/custom_trainer/feed/scripts/example.py +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env python -#-*- coding:utf-8 -*- - -""" -This is an example of network building -""" - -from __future__ import print_function, division -import paddle -from paddle import fluid - -def inference(): - """Build inference network(without loss and optimizer) - - Returns: - list: inputs - and - list: outputs - """ - # TODO: build network here - cvm_input = fluid.layers.data(name='cvm_input', shape=[4488], dtype='float32', stop_gradient=False) - - net = cvm_input - net = fluid.layers.data_norm(input=net, name="bn6048", epsilon=1e-4, - param_attr={"batch_size":1e4, "batch_sum_default":0.0, "batch_square":1e4}) - net = fluid.layers.fc(net, 512, act='relu', name='fc_1') - net = fluid.layers.fc(net, 256, act='relu', name='fc_2') - net = fluid.layers.fc(net, 256, act='relu', name='fc_3') - net = fluid.layers.fc(net, 128, act='relu', name='fc_4') - net = fluid.layers.fc(net, 128, act='relu', name='fc_5') - net = fluid.layers.fc(net, 128, act='relu', name='fc_6') - net = fluid.layers.fc(net, 128, act='relu', name='fc_7') - - ctr_output = fluid.layers.fc(net, 1, act='sigmoid', name='ctr') - return {'accessors': [], 'monitors': [], 'sparses': [], 'inputs': [cvm_input], 'outputs': [ctr_output]} - -def loss_function(ctr_output): - """ - Args: - *outputs: the second result of inference() - - Returns: - Variable: loss - and - list: labels - """ - # TODO: calc loss here - - label = fluid.layers.data(name='label_ctr', shape=ctr_output.shape, dtype='float32') - loss = fluid.layers.square_error_cost(input=ctr_output, label=label) - loss = fluid.layers.mean(loss, name='loss_ctr') - - return loss, [label] diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/join.py b/paddle/fluid/train/custom_trainer/feed/scripts/join.py deleted file mode 100644 index f0317adf..00000000 --- a/paddle/fluid/train/custom_trainer/feed/scripts/join.py +++ /dev/null @@ -1,100 +0,0 @@ -#!/usr/bin/env python -#-*- coding:utf-8 -*- - -""" -This is an example of network building -""" - -from __future__ import print_function, division -import paddle -from paddle import fluid - -def sparse_cvm_dim(sparse_info): - return sparse_info['slot_dim'] * len(sparse_info['slots']) - -def inference(): - """Build inference network(without loss and optimizer) - - Returns: - list: sparse_inputs - and - list: inputs - and - list: outputs - """ - sparse_cvm = { "name": "cvm_input", "slot_dim" : 11, "slots": 
[6048,6002,6145,6202,6201,6121,6738,6119,6146,6120,6147,6122,6123,6118,6142,6143,6008,6148,6151,6127,6144,6094,6083,6952,6739,6150,6109,6003,6099,6149,6129,6203,6153,6152,6128,6106,6251,7082,7515,6951,6949,7080,6066,7507,6186,6007,7514,6125,7506,10001,6006,7023,6085,10000,6098,6250,6110,6124,6090,6082,6067,6101,6004,6191,7075,6948,6157,6126,6188,7077,6070,6111,6087,6103,6107,6194,6156,6005,6247,6814,6158,7122,6058,6189,7058,6059,6115,7079,7081,6833,7024,6108,13342,13345,13412,13343,13350,13346,13409,6009,6011,6012,6013,6014,6015,6019,6023,6024,6027,6029,6031,6050,6060,6068,6069,6089,6095,6105,6112,6130,6131,6132,6134,6161,6162,6163,6166,6182,6183,6185,6190,6212,6213,6231,6233,6234,6236,6238,6239,6240,6241,6242,6243,6244,6245,6354,7002,7005,7008,7010,7012,7013,7015,7016,7017,7018,7019,7020,7045,7046,7048,7049,7052,7054,7056,7064,7066,7076,7078,7083,7084,7085,7086,7087,7088,7089,7090,7099,7100,7101,7102,7103,7104,7105,7109,7124,7126,7136,7142,7143,7144,7145,7146,7147,7148,7150,7151,7152,7153,7154,7155,7156,7157,7047,7050,6253,6254,6255,6256,6257,6259,6260,6261,7170,7185,7186,6751,6755,6757,6759,6760,6763,6764,6765,6766,6767,6768,6769,6770,7502,7503,7504,7505,7510,7511,7512,7513,6806,6807,6808,6809,6810,6811,6812,6813,6815,6816,6817,6819,6823,6828,6831,6840,6845,6875,6879,6881,6888,6889,6947,6950,6956,6957,6959,10006,10008,10009,10010,10011,10016,10017,10018,10019,10020,10021,10022,10023,10024,10029,10030,10031,10032,10033,10034,10035,10036,10037,10038,10039,10040,10041,10042,10044,10045,10046,10051,10052,10053,10054,10055,10056,10057,10060,10066,10069,6820,6821,6822,13333,13334,13335,13336,13337,13338,13339,13340,13341,13351,13352,13353,13359,13361,13362,13363,13366,13367,13368,13369,13370,13371,13375,13376,5700,5702,13400,13401,13402,13403,13404,13406,13407,13408,13410,13417,13418,13419,13420,13422,13425,13427,13428,13429,13430,13431,13433,13434,13436,13437,13326,13330,13331,5717,13442,13451,13452,13455,13456,13457,13458,13459,13460,13461,13462,13463,13464,13465,13466,13467,13468,1104,1106,1107,1108,1109,1110,1111,1112,1113,1114,1115,1116,1117,1119,1120,1121,1122,1123,1124,1125,1126,1127,1128,1129,13812,13813,6740,1490,1491]} - - # TODO: build network here - cvm_input = fluid.layers.data(name='cvm_input', shape=[sparse_cvm_dim(sparse_cvm)], dtype='float32', stop_gradient=False) - net = cvm_input - net = fluid.layers.data_norm(input=net, name="bn6048", epsilon=1e-4, - param_attr={"batch_size":1e4, "batch_sum_default":0.0, "batch_square":1e4}) - lr_x = 1.0 - init_range = 0.2 - fc_layers_size = [511, 255, 255, 127, 127, 127, 127, 1] - fc_layers_act = ["relu"] * (len(fc_layers_size) - 1) + [None] - scales_tmp = [net.shape[1]] + fc_layers_size - scales = [] - for i in range(len(scales_tmp)): - scales.append(init_range / (scales_tmp[i] ** 0.5)) - for i in range(len(fc_layers_size)): - net = fluid.layers.fc( - input = net, - size = fc_layers_size[i], - name = 'fc_' + str(i), - act = fc_layers_act[i], - param_attr = \ - fluid.ParamAttr(learning_rate=lr_x, \ - initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])), - bias_attr = \ - fluid.ParamAttr(learning_rate=lr_x, \ - initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i]))) - ctr_output = fluid.layers.sigmoid(fluid.layers.clip(net, min=-15.0, max=15.0), name="ctr") - - accessors = [ - { "class": "AbacusSparseJoinAccessor", "input": "sparses", "table_id": 0, "need_gradient": False}, - { "class": "DenseInputAccessor", "input": "vars", "table_id": 1, "need_gradient": True, "async_pull": True}, - { 
"class": "DenseInputAccessor", "input": "sums", "table_id": 2, "need_gradient": True, "async_pull": True}, - { "class": "WeightsAdjustAccessor", "input": "ins_weight", - "slot_id": 6002, "adjw_ratio": 20, "adjw_threshold": 1000 }, - { "class": "LabelInputAccessor", "input": "labels"} - - ] - monitors = [ - { "name": "epoch_auc", "class": "AucMonitor", "target": ctr_output, "compute_interval": 600 }, - { "name": "day_auc", "class": "AucMonitor", "target": ctr_output, "compute_interval": 86400 } - ] - - return { - 'accessors': accessors, - 'monitors': monitors, - 'sparses': [sparse_cvm], - 'inputs': [cvm_input], - 'outputs': [ctr_output] - } - -def loss_function(ctr_output): - """ - Args: - *outputs: the second result of inference() - - Returns: - Variable: loss - and - list: labels - """ - # TODO: calc loss here - ins_weight = fluid.layers.data( - name="ins_weight", - shape=[-1, 1], - dtype="float32", - lod_level=0, - append_batch_size=False, - stop_gradient=True) - - label = fluid.layers.data(name='label_ctr', shape=ctr_output.shape, dtype='float32') - loss = fluid.layers.log_loss(input=ctr_output, label=label) - loss = fluid.layers.elementwise_mul(loss, ins_weight) - loss = fluid.layers.mean(loss, name='loss_ctr') - - return loss, [label] diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/model/example/main_program b/paddle/fluid/train/custom_trainer/feed/scripts/model/example/main_program deleted file mode 100644 index cc20c06bd5bff5e10fae6efdcff8d78cffc2aa7c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 73923 zcmeI5Pi!06eaA&bvczXgrfGX^Gp?7NZPv7^P$DTxUL(pMCD`Q9Y=ZcZT@=M+L_SK4 zheKv&C`&Pds^}iJ=&fikMQ_OhEsAze=^lz40`!>6-r7^qQw{-oY)?h{d++y#eDjZ# zVTz*S-yRZq=Er$|=lg!2_ujl8``2GCEYIbB8~m3kl-O$HsAIOBUf=)fH^29_Z{*)t zUMl2SZh28N%YF1#;SbpDeP?@f>v4JC@*9Vy*YDsDuEGzN*dqJ@tp1SQ4_p1HZ@E0h z_-&SL_-@(n^vp{B4m{>2yBDimJc#Vpj%kOj1?Vzs7Zc3w7+{>(~> zB{rp2QJdo{sxmU|1(`Os5;p#rR?@RMy6LeB>nT-;cPo**AH9`3U4ZJxmXW!zgWU)hy!~SyEiyJoHClBGxl>WG*sMz5Mm8(vTK3z-QsI;EZB6N7r}(z&mTBcf z%Qs`4SAS<cch6_GSgZ9EaZNcADC z;jcS}pPIb;v z&z)E0m4i&QywY(}38ULIwm?{Wau|#~O^k)=X-ekNB)u4@V#>Wu76&!_q9d$2%DBg( zF_!3zRjIs2d~#Y7eSy6Z)@2x&@E1cr72Zv(Bk85$T}@iV*sctu4>ro6CPufI0o3xm zs%7OVp-FlzkI;jWo+@@;CI6U6a?J$!Q9j!OVRO2m?HT_{D-vwce50iu5h=<`YHL@ED_Xr&t+2lIfh}H&4$9ySh7*$Ak zl=1Ib_JC#g1j{}Zu2bBbVxXpYU%HJ?!M{lJH5p_KO~EMUmatWpc|WfW@b}o_x4Gqb z`Dhbw<_qk~(}rd9_`OR0Hp_ev`$@HsZCbu%I$gJ8+-6thKWyuy+xK(xYYXpv{afER zuCuG%p6PaNE+`=NHp|c*f*bOclg1+P9|VavTn;cBRI{gL;+@Ab6Z`$P-83!ePM&pyIgyiFQ04GO zz$}+n+{N~SNiUV(5~T1)<5Q^AD>4bu(6Wd!E$AfSG{w{2Jyld`%sAcf3Zo$Ki#}Y#1wSNxjX22#g$CQq7dheBsWgpFKAIBFjK3 z-j7QKTM-H(CXR6O;s*QNclv%_Oo%DfEPSrhx8r(oi>)4;4m4JyYn#n>$AgqJ#vOK} z)pZ-(Z1rkY6RS6O<-2nRn4iSWBl%7&+lM4fsTw+UVfvd35GIt8RztoHMz|M&AXS7K z90qS!f)s5hNRh*RczRTrY}uZTF#gAs)WQ zx|3t9?{Ty9%x*f4e5La&>j}R^yyl=Yb91L(5z2$tj$>C&atpiC<{ufmF0@F~7+r1v;xKHSi-fL6BmzT7vouvc|X5iChsTBQJcER z{KNT+Ox&5P)KF)XTejb}ys-QE;-Pii_8vYd!eD2Di|qPBc&6ZZMm@DUx;07=d$5)U zYSlineDKXd)RdbVE=Nh2i88GZv%f1w@0Jbz7_$J3>@z6gFGvYz$KCq73sJ)QJJMrP3NjQI>3|`N z!$nYwPtHGkQSsZx`2i(<3bw)u9K7Ym=G>h*OlC#Gy1n zHkcs0j2yc2avWN%1Gs{C#W}P}96D(ZRT`tKP=rI(LJ4u`>oW^a9LhIvtp=yaA}aU&^kgIhbk@9ITSAuhyHG^M-qolkwZz}rloH>hbjRZdVsj4MB`Ak zuZB33IP_9;C@voiwmQgJy%Fo7=^R=k4xKcIDvi-qD8iv?uOM+Kap+|B;iW9^(t4;j zX1#!PbXtB|8iy(^)HxI{5r-0oioMA%w<9@C318FoP|~+)>6^}>O2CF5AbvWlaj5$2 zhd7it^ip#u?lL(uhgPZ}NQvvATg0K0=1`?Ex(Y=&RDDB797-HY94fwDt6kK3DCy|5 z{IoOD&zI++RhtttzzcgM8Mu_PK3>*PSmSncnkc_*RpAEMown15Xn=kHPejPS<8*4RfgE^qYX-z-BIcZe^VE;(aG zsZzLttuqOK>{Ak(*%6z6z6i0w6D(ryd$QOlGg2D}7OzKhpV3(BACFU;O8z>!K+95V 
diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/model/example/model.yaml b/paddle/fluid/train/custom_trainer/feed/scripts/model/example/model.yaml
deleted file mode 100644
index 849c39d2..00000000
--- a/paddle/fluid/train/custom_trainer/feed/scripts/model/example/model.yaml
+++ /dev/null
@@ -1,49 +0,0 @@
-inputs:
-- name: cvm_input
-  shape: [-1, 4488]
-labels:
-- name: label_ctr
-  shape: [-1, 1]
-loss: loss_ctr
-outputs:
-- name: ctr.tmp_2
-  shape: [-1, 1]
-vars:
-- name: bn6048.batch_size
-  shape: [4488]
-- name: bn6048.batch_sum
-  shape: [4488]
-- name: bn6048.batch_square_sum
-  shape: [4488]
-- name: fc_1.w_0
-  shape: [4488, 512]
-- name: fc_1.b_0
-  shape: [512]
-- name: fc_2.w_0
-  shape: [512, 256]
-- name: fc_2.b_0
-  shape: [256]
-- name: fc_3.w_0
-  shape: [256, 256]
-- name: fc_3.b_0
-  shape: [256]
-- name: fc_4.w_0
-  shape: [256, 128]
-- name: fc_4.b_0
-  shape: [128]
-- name: fc_5.w_0
-  shape: [128, 128]
-- name: fc_5.b_0
-  shape: [128]
-- name: fc_6.w_0
-  shape: [128, 128]
-- name: fc_6.b_0
-  shape: [128]
-- name: fc_7.w_0
-  shape: [128, 128]
-- name: fc_7.b_0
-  shape: [128]
-- name: ctr.w_0
-  shape: [128, 1]
-- name: ctr.b_0
-  shape: [1]
diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/model/example/startup_program b/paddle/fluid/train/custom_trainer/feed/scripts/model/example/startup_program
deleted file mode 100644
index 259839f93f4f390a55c4589d14f4a4cef3b07652..0000000000000000000000000000000000000000
Binary files a/paddle/fluid/train/custom_trainer/feed/scripts/model/example/startup_program and /dev/null differ
diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/model/example/test_program b/paddle/fluid/train/custom_trainer/feed/scripts/model/example/test_program
deleted file mode 100644
index 147833363f095e719093aa22b4da2ac31b847cfb..0000000000000000000000000000000000000000
Binary files a/paddle/fluid/train/custom_trainer/feed/scripts/model/example/test_program and /dev/null differ
-      strings: " File \"create_programs.py\", line 203, in <module>\n    main(sys.argv)\n"
-    }
-    attrs {
-      name: "op_namescope"
-      type: STRING
-      s: "/"
-    }
-    attrs {
-      name: "op_role"
-      type: INT
-      i: 0
-    }
-    attrs {
-      name: "col"
-      type: INT
-      i: 0
-    }
-  }
-  ops {
-    inputs {
-      parameter: "BatchSize"
-      arguments: "bn6048.batch_size"
-    }
-    inputs {
-      parameter: "BatchSquareSum"
-      arguments: "bn6048.batch_square_sum"
-    }
-    inputs {
-      parameter: "BatchSum"
-      arguments: "bn6048.batch_sum"
-    }
-    inputs {
-      parameter: "X"
-      arguments: "cvm_input"
-    }
-    outputs {
-      parameter: "Means"
-      arguments: "_generated_var_0"
-    }
-    outputs {
-      parameter: "Scales"
-      arguments: "_generated_var_1"
-    }
-    outputs {
-      parameter: "Y"
-      arguments: "_generated_var_2"
-    }
-    type: "data_norm"
-    attrs {
-      name: "epsilon"
-      type: FLOAT
-      f: 9.99999974738e-05
-    }
-    attrs {
-      name: "op_role"
-      type: INT
-      i: 0
-    }
-    attrs {
-      name: "op_namescope"
-      type: STRING
-      s: "/"
-    }
-    attrs {
-      name: "op_callstack"
-      type: STRINGS
-      strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n    attrs=kwargs.get(\"attrs\", None))\n"
-      strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n    return self.main_program.current_block().append_op(*args, **kwargs)\n"
-      strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 3474, in data_norm\n    attrs={\"epsilon\": epsilon})\n"
-      strings: " File \"join.py\", 
line 31, in inference\n param_attr={\"batch_size\":1e4, \"batch_sum_default\":0.0, \"batch_square\":1e4})\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "data_layout" - type: STRING - s: "NCHW" - } - } - ops { - inputs { - parameter: "X" - arguments: "_generated_var_2" - } - inputs { - parameter: "Y" - arguments: "fc_0.w_0" - } - outputs { - parameter: "Out" - arguments: "fc_0.tmp_0" - } - type: "mul" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_0.tmp_0" - } - inputs { - parameter: "Y" - arguments: "fc_0.b_0" - } - outputs { - parameter: "Out" - arguments: "fc_0.tmp_1" - } - type: "elementwise_add" - attrs { - name: "axis" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, 
dim_start=num_flatten_dims)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "y_data_format" - type: STRING - s: "" - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_0.tmp_1" - } - outputs { - parameter: "Out" - arguments: "fc_0.tmp_2" - } - type: "relu" - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n return helper.append_activation(pre_activation)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "is_test" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_0.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_1.w_0" - } - outputs { - parameter: "Out" - arguments: "fc_1.tmp_0" - } - type: "mul" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n 
\"y_num_col_dims\": 1})\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_1.tmp_0" - } - inputs { - parameter: "Y" - arguments: "fc_1.b_0" - } - outputs { - parameter: "Out" - arguments: "fc_1.tmp_1" - } - type: "elementwise_add" - attrs { - name: "axis" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "y_data_format" - type: STRING - s: "" - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_1.tmp_1" - } - outputs { - parameter: "Out" - arguments: "fc_1.tmp_2" - } - type: "relu" - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File 
\"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n return helper.append_activation(pre_activation)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "is_test" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_1.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_2.w_0" - } - outputs { - parameter: "Out" - arguments: "fc_2.tmp_0" - } - type: "mul" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_2.tmp_0" - } - inputs { - parameter: "Y" - arguments: "fc_2.b_0" - } - outputs { - parameter: "Out" - arguments: "fc_2.tmp_1" - } - type: "elementwise_add" - attrs { - name: "axis" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File 
\"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "y_data_format" - type: STRING - s: "" - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_2.tmp_1" - } - outputs { - parameter: "Out" - arguments: "fc_2.tmp_2" - } - type: "relu" - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n return helper.append_activation(pre_activation)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "is_test" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_2.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_3.w_0" - } - outputs { - parameter: "Out" - arguments: "fc_3.tmp_0" - } - type: "mul" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " 
File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_3.tmp_0" - } - inputs { - parameter: "Y" - arguments: "fc_3.b_0" - } - outputs { - parameter: "Out" - arguments: "fc_3.tmp_1" - } - type: "elementwise_add" - attrs { - name: "axis" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "y_data_format" - type: STRING - s: "" - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_3.tmp_1" - } - outputs { - parameter: "Out" - arguments: "fc_3.tmp_2" - } - type: "relu" - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return 
self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n return helper.append_activation(pre_activation)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "is_test" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_3.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_4.w_0" - } - outputs { - parameter: "Out" - arguments: "fc_4.tmp_0" - } - type: "mul" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_4.tmp_0" - } - inputs { - parameter: "Y" - arguments: "fc_4.b_0" - } - outputs { - parameter: "Out" - arguments: "fc_4.tmp_1" - } - type: "elementwise_add" - attrs { - name: "axis" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File 
\"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "y_data_format" - type: STRING - s: "" - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_4.tmp_1" - } - outputs { - parameter: "Out" - arguments: "fc_4.tmp_2" - } - type: "relu" - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n return helper.append_activation(pre_activation)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "is_test" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_4.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_5.w_0" - } - outputs { - parameter: "Out" - arguments: "fc_5.tmp_0" - } - type: "mul" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: 
"op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_5.tmp_0" - } - inputs { - parameter: "Y" - arguments: "fc_5.b_0" - } - outputs { - parameter: "Out" - arguments: "fc_5.tmp_1" - } - type: "elementwise_add" - attrs { - name: "axis" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "y_data_format" - type: STRING - s: "" - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_5.tmp_1" - } - outputs { - parameter: "Out" - arguments: "fc_5.tmp_2" - } - type: "relu" - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File 
\"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n return helper.append_activation(pre_activation)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "is_test" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_5.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_6.w_0" - } - outputs { - parameter: "Out" - arguments: "fc_6.tmp_0" - } - type: "mul" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_6.tmp_0" - } - inputs { - parameter: "Y" - arguments: "fc_6.b_0" - } - outputs { - parameter: "Out" - arguments: "fc_6.tmp_1" - } - type: "elementwise_add" - attrs { - name: "axis" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" 
- type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "y_data_format" - type: STRING - s: "" - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_6.tmp_1" - } - outputs { - parameter: "Out" - arguments: "fc_6.tmp_2" - } - type: "relu" - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n return helper.append_activation(pre_activation)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "is_test" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_6.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_7.w_0" - } - outputs { - parameter: "Out" - arguments: "fc_7.tmp_0" - } - type: "mul" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_namescope" - type: STRING - 
s: "/" - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_7.tmp_0" - } - inputs { - parameter: "Y" - arguments: "fc_7.b_0" - } - outputs { - parameter: "Out" - arguments: "fc_7.tmp_1" - } - type: "elementwise_add" - attrs { - name: "axis" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "y_data_format" - type: STRING - s: "" - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_7.tmp_1" - } - outputs { - parameter: "Out" - arguments: "clip_0.tmp_0" - } - type: "clip" - 
attrs { - name: "min" - type: FLOAT - f: -15.0 - } - attrs { - name: "max" - type: FLOAT - f: 15.0 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 10681, in clip\n outputs={\"Out\": out})\n" - strings: " File \"join.py\", line 52, in inference\n ctr_output = fluid.layers.sigmoid(fluid.layers.clip(net, min=-15.0, max=15.0), name=\"ctr\")\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_role_var" - type: STRINGS - } - } - ops { - inputs { - parameter: "X" - arguments: "clip_0.tmp_0" - } - outputs { - parameter: "Out" - arguments: "ctr.tmp_0" - } - type: "sigmoid" - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/layer_function_generator.py\", line 247, in func\n helper.append_op(type=op_type, inputs={\"X\": x}, outputs={\"Out\": output})\n" - strings: " File \"join.py\", line 52, in inference\n ctr_output = fluid.layers.sigmoid(fluid.layers.clip(net, min=-15.0, max=15.0), name=\"ctr\")\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "is_test" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "X" - arguments: "ctr.tmp_0" - } - outputs { - parameter: "Out" - arguments: "save_infer_model/scale_0" - } - type: "scale" - attrs { - name: "scale" - type: FLOAT - f: 1.0 - } - attrs { - name: "bias" - type: FLOAT - f: 0.0 - } - attrs { - name: "bias_after_scale" - type: BOOLEAN - b: true - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, 
in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 10410, in scale\n \'bias_after_scale\': bias_after_scale\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/io.py\", line 1045, in save_inference_model\n var, 1., name=\"save_infer_model/scale_{}\".format(i))\n" - strings: " File \"create_programs.py\", line 133, in build_and_save\n program_only=True)\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_role_var" - type: STRINGS - } - } - ops { - inputs { - parameter: "X" - arguments: "save_infer_model/scale_0" - } - outputs { - parameter: "Out" - arguments: "fetch" - } - type: "fetch" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/io.py\", line 922, in append_fetch_ops\n attrs={\'col\': i})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/io.py\", line 1091, in save_inference_model\n append_fetch_ops(main_program, fetch_var_names)\n" - strings: " File \"create_programs.py\", line 133, in build_and_save\n program_only=True)\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "col" - type: INT - i: 0 - } - } -} diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/model/join/main_program b/paddle/fluid/train/custom_trainer/feed/scripts/model/join/main_program deleted file mode 100644 index 0e186facc88e9f122550cd77f1d40292f1044893..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 77253 zcmeHQO>7(4eHYh~Ek0W^MO*&hZg$y86H%&`BB>8MM%84MV6z1rcM%`rMG#C!gPwZnt(OA5w8y~xzxRK`dGjTy zfW)QczYaDz^XL11zxVsSuRr^zf1jS6D!mB*3Y8k0Z}0a^+Z_ypZ+!c^?|iF#WA;*| z%u2H}npx@RU#|QyTikWG8XI@(yH?PCZ2H3<{@^nF;1ZjGAAr@Lus7mXKO9;f&oO?B z&2`&mqyE%vU{LcA6l1e8sM3oIl_kc&;7&d04a`RQCcM4IuEtYVC{5j(s@!6;V7O7= z1v3U_7uhtJJ-=0`ydmGnILe9O;cZ2}-4ZD%UxDAvvWwz(@|QIkc}qop3*KH&7Rw8a z9h12!Et}cNd@hrT(QRf%hY4-UcTqxLTqwOLKteH!_2Z*hV~geaR?~n&2Lrv# zbD9PwrzRua&?8-hx?D&$r010t%~CbU%q9onLOT~(X<&U|0cW?63t+s;O36AYXIlUK z%U})V=k$GF)<@Kq3((3j&nhcNRRK0#OkE>8Dri`u;I2fDc-Y(y|Ui* zp^?6o%4Z{H66?=79R(x%l0=40(T1>av2)5!4Mp^(eBF#*&q0kAB?gHusXD!EJzA%i z4F{5mfp0~>jt7pN^iHaiif&kAH{*pcg11;k!4gB+NQELWNw1!~LQhvu>&WWCwy||Q zKp1Qxo(20yJehbb-$qnITKS3wtwUp82CyO#3}jg+fEnN15pig5BCNx#9EmhokrMT@ zT8R!3M}~vAgx%#xT)Ggey1WX(s>A%rCamR8RR3)`=*ZTq6}*Iby*uW0Tr60CZ2)Xp z-8t}wb{JVp7IV~HQ)5a_7zm+D5LZYNvKyfQt`VBxkeOFHn5?g`8>v=^br>ZQ=)^L+ z(spbZ{EyG54vsn466S{4u5Ui&_TJ+l8$~gVZ?lrq-!ml`pCAEsw!j_U<8JWO_PN;` zI!AMs*V$Z%^O$oD>?;knn2Z~ZGDjpxuR=B1pOT$Hyw>sIn980XH7pD$bGR(PsJ5Wj 
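For orientation, the following is a minimal sketch (not the original join.py) of the network that the ProgramDesc dump above corresponds to. The op sequence, the data_norm param_attr, the clip range, the "ctr" output name and the program_only=True export flag are all taken from the op_callstack strings in the dump; the input width (6048, inferred from the bn6048.* parameter names), the hidden-layer sizes and the per-layer initializer scales are assumptions.

import math
import paddle.fluid as fluid

def inference():
    # dense CVM feature input; width 6048 inferred from bn6048.* stat names
    cvm_input = fluid.layers.data(
        name='cvm_input', shape=[6048], dtype='float32')
    # data_norm normalises with accumulated batch statistics
    # (bn6048.batch_size / batch_sum / batch_square_sum in the dump)
    net = fluid.layers.data_norm(
        input=cvm_input, epsilon=1e-4,
        param_attr={"batch_size": 1e4,
                    "batch_sum_default": 0.0,
                    "batch_square": 1e4})
    layer_dims = [511, 255, 255, 127, 127, 127, 127, 1]  # hypothetical sizes
    input_dims = [6048] + layer_dims[:-1]
    scales = [1.0 / math.sqrt(d) for d in input_dims]    # hypothetical rule
    for i, dim in enumerate(layer_dims):
        act = 'relu' if i + 1 < len(layer_dims) else None  # fc_7 is linear
        net = fluid.layers.fc(
            input=net, size=dim, act=act,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.NormalInitializer(
                    loc=0.0, scale=1.0 * scales[i])))
    # clip the logit to [-15, 15], then sigmoid -> ctr.tmp_0 in the dump
    ctr_output = fluid.layers.sigmoid(
        fluid.layers.clip(net, min=-15.0, max=15.0), name="ctr")
    return cvm_input, ctr_output

def build_and_save(save_dir):
    # Exporting with save_inference_model(..., program_only=True) is what
    # appends the trailing scale (save_infer_model/scale_0) and fetch ops
    # seen at the end of the dump.
    main_prog, startup_prog = fluid.Program(), fluid.Program()
    with fluid.program_guard(main_prog, startup_prog):
        cvm_input, ctr_output = inference()
    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(startup_prog)
    fluid.io.save_inference_model(
        save_dir, [cvm_input.name], [ctr_output], exe,
        main_program=main_prog, program_only=True)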
[remainder of the base85 binary payload omitted: undecodable residue; the diff
 header of the next deleted file was lost with it]
[second deleted text-format ProgramDesc, condensed: the training-side variant
 of the same network -- data_norm (epsilon=9.99999974738e-05) followed by the
 fc_0..fc_2 mul -> elementwise_add -> relu stack visible here, with the same
 attrs and op_callstacks except that the relu ops carry is_test=false; the
 dump is cut off mid-op_callstack in the source and continues below]
100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "axis" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_2.tmp_1" - } - outputs { - parameter: "Out" - arguments: "fc_2.tmp_2" - } - type: "relu" - attrs { - name: "is_test" - type: BOOLEAN - b: false - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n return helper.append_activation(pre_activation)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_2.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_3.w_0" - } - outputs { - parameter: "Out" - arguments: "fc_3.tmp_0" - } - type: "mul" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File 
\"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_3.tmp_0" - } - inputs { - parameter: "Y" - arguments: "fc_3.b_0" - } - outputs { - parameter: "Out" - arguments: "fc_3.tmp_1" - } - type: "elementwise_add" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "axis" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_3.tmp_1" - } - outputs { - parameter: "Out" - arguments: "fc_3.tmp_2" - } - type: "relu" - attrs { - name: "is_test" - type: BOOLEAN - b: false - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n return helper.append_activation(pre_activation)\n" - strings: " File \"join.py\", line 51, in inference\n 
initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_3.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_4.w_0" - } - outputs { - parameter: "Out" - arguments: "fc_4.tmp_0" - } - type: "mul" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_4.tmp_0" - } - inputs { - parameter: "Y" - arguments: "fc_4.b_0" - } - outputs { - parameter: "Out" - arguments: "fc_4.tmp_1" - } - type: "elementwise_add" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = 
helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "axis" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_4.tmp_1" - } - outputs { - parameter: "Out" - arguments: "fc_4.tmp_2" - } - type: "relu" - attrs { - name: "is_test" - type: BOOLEAN - b: false - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n return helper.append_activation(pre_activation)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_4.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_5.w_0" - } - outputs { - parameter: "Out" - arguments: "fc_5.tmp_0" - } - type: "mul" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File 
\"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_5.tmp_0" - } - inputs { - parameter: "Y" - arguments: "fc_5.b_0" - } - outputs { - parameter: "Out" - arguments: "fc_5.tmp_1" - } - type: "elementwise_add" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "axis" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_5.tmp_1" - } - outputs { - parameter: "Out" - arguments: "fc_5.tmp_2" - } - type: "relu" - attrs { - name: "is_test" - type: BOOLEAN - b: false - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File 
\"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n return helper.append_activation(pre_activation)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_5.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_6.w_0" - } - outputs { - parameter: "Out" - arguments: "fc_6.tmp_0" - } - type: "mul" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_6.tmp_0" - } - inputs { - parameter: "Y" - arguments: "fc_6.b_0" - } - outputs { - parameter: "Out" - arguments: "fc_6.tmp_1" - } - type: "elementwise_add" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 
135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "axis" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_6.tmp_1" - } - outputs { - parameter: "Out" - arguments: "fc_6.tmp_2" - } - type: "relu" - attrs { - name: "is_test" - type: BOOLEAN - b: false - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n return helper.append_activation(pre_activation)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_6.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_7.w_0" - } - outputs { - parameter: "Out" - arguments: "fc_7.tmp_0" - } - type: "mul" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return 
self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_7.tmp_0" - } - inputs { - parameter: "Y" - arguments: "fc_7.b_0" - } - outputs { - parameter: "Out" - arguments: "fc_7.tmp_1" - } - type: "elementwise_add" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "axis" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_7.tmp_1" - } - outputs { - parameter: "Out" - arguments: "clip_0.tmp_0" - } - type: "clip" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 10681, in 
clip\n outputs={\"Out\": out})\n" - strings: " File \"join.py\", line 52, in inference\n ctr_output = fluid.layers.sigmoid(fluid.layers.clip(net, min=-15.0, max=15.0), name=\"ctr\")\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "max" - type: FLOAT - f: 15.0 - } - attrs { - name: "min" - type: FLOAT - f: -15.0 - } - } - ops { - inputs { - parameter: "X" - arguments: "clip_0.tmp_0" - } - outputs { - parameter: "Out" - arguments: "ctr.tmp_0" - } - type: "sigmoid" - attrs { - name: "is_test" - type: BOOLEAN - b: false - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/layer_function_generator.py\", line 247, in func\n helper.append_op(type=op_type, inputs={\"X\": x}, outputs={\"Out\": output})\n" - strings: " File \"join.py\", line 52, in inference\n ctr_output = fluid.layers.sigmoid(fluid.layers.clip(net, min=-15.0, max=15.0), name=\"ctr\")\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - } - ops { - inputs { - parameter: "Labels" - arguments: "label_ctr" - } - inputs { - parameter: "Predicted" - arguments: "ctr.tmp_0" - } - outputs { - parameter: "Loss" - arguments: "log_loss_0.tmp_0" - } - type: "log_loss" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 11422, in log_loss\n attrs={\'epsilon\': epsilon})\n" - strings: " File \"join.py\", line 95, in loss_function\n loss = fluid.layers.log_loss(input=ctr_output, label=label)\n" - strings: " File \"create_programs.py\", line 108, in build_and_save\n loss, labels = self._loss_function(*outputs)\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " 
File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "epsilon" - type: FLOAT - f: 9.99999974738e-05 - } - } - ops { - inputs { - parameter: "X" - arguments: "log_loss_0.tmp_0" - } - inputs { - parameter: "Y" - arguments: "ins_weight" - } - outputs { - parameter: "Out" - arguments: "elementwise_mul_0" - } - type: "elementwise_mul" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 10367, in _elementwise_op\n \'use_mkldnn\': use_mkldnn})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 10428, in elementwise_mul\n return _elementwise_op(LayerHelper(\'elementwise_mul\', **locals()))\n" - strings: " File \"join.py\", line 96, in loss_function\n loss = fluid.layers.elementwise_mul(loss, ins_weight)\n" - strings: " File \"create_programs.py\", line 108, in build_and_save\n loss, labels = self._loss_function(*outputs)\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "axis" - type: INT - i: -1 - } - } - ops { - inputs { - parameter: "X" - arguments: "elementwise_mul_0" - } - outputs { - parameter: "Out" - arguments: "loss_ctr" - } - type: "mean" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 10756, in mean\n type=\"mean\", inputs={\"X\": x}, attrs={}, outputs={\"Out\": out})\n" - strings: " File \"join.py\", line 97, in loss_function\n loss = fluid.layers.mean(loss, name=\'loss_ctr\')\n" - strings: " File \"create_programs.py\", line 108, in build_and_save\n loss, labels = self._loss_function(*outputs)\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 256 - } - } - ops { - outputs { - 
parameter: "Out" - arguments: "loss_ctr@GRAD" - } - type: "fill_constant" - attrs { - name: "op_role" - type: INT - i: 257 - } - attrs { - name: "value" - type: FLOAT - f: 1.0 - } - attrs { - name: "force_cpu" - type: INT - i: 0 - } - attrs { - name: "shape" - type: LONGS - longs: 1 - } - attrs { - name: "dtype" - type: INT - i: 5 - } - } - ops { - inputs { - parameter: "Out@GRAD" - arguments: "loss_ctr@GRAD" - } - inputs { - parameter: "X" - arguments: "elementwise_mul_0" - } - outputs { - parameter: "X@GRAD" - arguments: "elementwise_mul_0@GRAD" - } - type: "mean_grad" - attrs { - name: "op_role" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "Out@GRAD" - arguments: "elementwise_mul_0@GRAD" - } - inputs { - parameter: "X" - arguments: "log_loss_0.tmp_0" - } - inputs { - parameter: "Y" - arguments: "ins_weight" - } - outputs { - parameter: "X@GRAD" - arguments: "log_loss_0.tmp_0@GRAD" - } - outputs { - parameter: "Y@GRAD" - } - type: "elementwise_mul_grad" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 10367, in _elementwise_op\n \'use_mkldnn\': use_mkldnn})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 10428, in elementwise_mul\n return _elementwise_op(LayerHelper(\'elementwise_mul\', **locals()))\n" - strings: " File \"join.py\", line 96, in loss_function\n loss = fluid.layers.elementwise_mul(loss, ins_weight)\n" - strings: " File \"create_programs.py\", line 108, in build_and_save\n loss, labels = self._loss_function(*outputs)\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 1 - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "axis" - type: INT - i: -1 - } - } - ops { - inputs { - parameter: "Labels" - arguments: "label_ctr" - } - inputs { - parameter: "Loss@GRAD" - arguments: "log_loss_0.tmp_0@GRAD" - } - inputs { - parameter: "Predicted" - arguments: "ctr.tmp_0" - } - outputs { - parameter: "Predicted@GRAD" - arguments: "ctr.tmp_0@GRAD" - } - type: "log_loss_grad" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File 
\"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 11422, in log_loss\n attrs={\'epsilon\': epsilon})\n" - strings: " File \"join.py\", line 95, in loss_function\n loss = fluid.layers.log_loss(input=ctr_output, label=label)\n" - strings: " File \"create_programs.py\", line 108, in build_and_save\n loss, labels = self._loss_function(*outputs)\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 1 - } - attrs { - name: "epsilon" - type: FLOAT - f: 9.99999974738e-05 - } - } - ops { - inputs { - parameter: "Out" - arguments: "ctr.tmp_0" - } - inputs { - parameter: "Out@GRAD" - arguments: "ctr.tmp_0@GRAD" - } - outputs { - parameter: "X@GRAD" - arguments: "clip_0.tmp_0@GRAD" - } - type: "sigmoid_grad" - attrs { - name: "is_test" - type: BOOLEAN - b: false - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/layer_function_generator.py\", line 247, in func\n helper.append_op(type=op_type, inputs={\"X\": x}, outputs={\"Out\": output})\n" - strings: " File \"join.py\", line 52, in inference\n ctr_output = fluid.layers.sigmoid(fluid.layers.clip(net, min=-15.0, max=15.0), name=\"ctr\")\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "Out@GRAD" - arguments: "clip_0.tmp_0@GRAD" - } - inputs { - parameter: "X" - arguments: "fc_7.tmp_1" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_7.tmp_1@GRAD" - } - type: "clip_grad" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 10681, in clip\n outputs={\"Out\": out})\n" - strings: " File \"join.py\", line 52, in inference\n ctr_output = fluid.layers.sigmoid(fluid.layers.clip(net, min=-15.0, max=15.0), name=\"ctr\")\n" - strings: " File \"create_programs.py\", 
line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 1 - } - attrs { - name: "max" - type: FLOAT - f: 15.0 - } - attrs { - name: "min" - type: FLOAT - f: -15.0 - } - } - ops { - inputs { - parameter: "Out@GRAD" - arguments: "fc_7.tmp_1@GRAD" - } - inputs { - parameter: "Y" - arguments: "fc_7.b_0" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_7.tmp_0@GRAD" - } - outputs { - parameter: "Y@GRAD" - arguments: "fc_7.b_0@GRAD" - } - type: "elementwise_add_grad" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - strings: "fc_7.b_0" - strings: "fc_7.b_0@GRAD" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 1 - } - attrs { - name: "axis" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "Out@GRAD" - arguments: "fc_7.tmp_0@GRAD" - } - inputs { - parameter: "X" - arguments: "fc_6.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_7.w_0" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_6.tmp_2@GRAD" - } - outputs { - parameter: "Y@GRAD" - arguments: "fc_7.w_0@GRAD" - } - type: "mul_grad" - attrs { - name: "op_role_var" - type: STRINGS - strings: "fc_7.w_0" - strings: "fc_7.w_0@GRAD" - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 1 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File 
\"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - } - ops { - inputs { - parameter: "Out" - arguments: "fc_6.tmp_2" - } - inputs { - parameter: "Out@GRAD" - arguments: "fc_6.tmp_2@GRAD" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_6.tmp_1@GRAD" - } - type: "relu_grad" - attrs { - name: "is_test" - type: BOOLEAN - b: false - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n return helper.append_activation(pre_activation)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "Out@GRAD" - arguments: "fc_6.tmp_1@GRAD" - } - inputs { - parameter: "Y" - arguments: "fc_6.b_0" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_6.tmp_0@GRAD" - } - outputs { - parameter: "Y@GRAD" - arguments: "fc_6.b_0@GRAD" - } - type: "elementwise_add_grad" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - strings: "fc_6.b_0" - strings: "fc_6.b_0@GRAD" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File 
\"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 1 - } - attrs { - name: "axis" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "Out@GRAD" - arguments: "fc_6.tmp_0@GRAD" - } - inputs { - parameter: "X" - arguments: "fc_5.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_6.w_0" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_5.tmp_2@GRAD" - } - outputs { - parameter: "Y@GRAD" - arguments: "fc_6.w_0@GRAD" - } - type: "mul_grad" - attrs { - name: "op_role_var" - type: STRINGS - strings: "fc_6.w_0" - strings: "fc_6.w_0@GRAD" - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 1 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - } - ops { - inputs { - parameter: "Out" - arguments: "fc_5.tmp_2" - } - inputs { - parameter: "Out@GRAD" - arguments: "fc_5.tmp_2@GRAD" - } - outputs { - parameter: 
"X@GRAD" - arguments: "fc_5.tmp_1@GRAD" - } - type: "relu_grad" - attrs { - name: "is_test" - type: BOOLEAN - b: false - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n return helper.append_activation(pre_activation)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "Out@GRAD" - arguments: "fc_5.tmp_1@GRAD" - } - inputs { - parameter: "Y" - arguments: "fc_5.b_0" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_5.tmp_0@GRAD" - } - outputs { - parameter: "Y@GRAD" - arguments: "fc_5.b_0@GRAD" - } - type: "elementwise_add_grad" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - strings: "fc_5.b_0" - strings: "fc_5.b_0@GRAD" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - 
s: "/" - } - attrs { - name: "op_role" - type: INT - i: 1 - } - attrs { - name: "axis" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "Out@GRAD" - arguments: "fc_5.tmp_0@GRAD" - } - inputs { - parameter: "X" - arguments: "fc_4.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_5.w_0" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_4.tmp_2@GRAD" - } - outputs { - parameter: "Y@GRAD" - arguments: "fc_5.w_0@GRAD" - } - type: "mul_grad" - attrs { - name: "op_role_var" - type: STRINGS - strings: "fc_5.w_0" - strings: "fc_5.w_0@GRAD" - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 1 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - } - ops { - inputs { - parameter: "Out" - arguments: "fc_4.tmp_2" - } - inputs { - parameter: "Out@GRAD" - arguments: "fc_4.tmp_2@GRAD" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_4.tmp_1@GRAD" - } - type: "relu_grad" - attrs { - name: "is_test" - type: BOOLEAN - b: false - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n return helper.append_activation(pre_activation)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - 
strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "Out@GRAD" - arguments: "fc_4.tmp_1@GRAD" - } - inputs { - parameter: "Y" - arguments: "fc_4.b_0" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_4.tmp_0@GRAD" - } - outputs { - parameter: "Y@GRAD" - arguments: "fc_4.b_0@GRAD" - } - type: "elementwise_add_grad" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - strings: "fc_4.b_0" - strings: "fc_4.b_0@GRAD" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 1 - } - attrs { - name: "axis" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "Out@GRAD" - arguments: "fc_4.tmp_0@GRAD" - } - inputs { - parameter: "X" - arguments: "fc_3.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_4.w_0" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_3.tmp_2@GRAD" - } - outputs { - parameter: "Y@GRAD" - arguments: "fc_4.w_0@GRAD" - } - type: "mul_grad" - attrs { - name: "op_role_var" - type: STRINGS - strings: "fc_4.w_0" - strings: "fc_4.w_0@GRAD" - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 1 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return 
self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - } - ops { - inputs { - parameter: "Out" - arguments: "fc_3.tmp_2" - } - inputs { - parameter: "Out@GRAD" - arguments: "fc_3.tmp_2@GRAD" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_3.tmp_1@GRAD" - } - type: "relu_grad" - attrs { - name: "is_test" - type: BOOLEAN - b: false - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n return helper.append_activation(pre_activation)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "Out@GRAD" - arguments: "fc_3.tmp_1@GRAD" - } - inputs { - parameter: "Y" - arguments: "fc_3.b_0" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_3.tmp_0@GRAD" - } - outputs { - parameter: "Y@GRAD" - arguments: "fc_3.b_0@GRAD" - } - type: "elementwise_add_grad" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - strings: "fc_3.b_0" - strings: "fc_3.b_0@GRAD" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File 
\"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 1 - } - attrs { - name: "axis" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "Out@GRAD" - arguments: "fc_3.tmp_0@GRAD" - } - inputs { - parameter: "X" - arguments: "fc_2.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_3.w_0" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_2.tmp_2@GRAD" - } - outputs { - parameter: "Y@GRAD" - arguments: "fc_3.w_0@GRAD" - } - type: "mul_grad" - attrs { - name: "op_role_var" - type: STRINGS - strings: "fc_3.w_0" - strings: "fc_3.w_0@GRAD" - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 1 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - } - ops { - inputs { - parameter: "Out" - arguments: "fc_2.tmp_2" - } - inputs { - parameter: "Out@GRAD" - arguments: "fc_2.tmp_2@GRAD" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_2.tmp_1@GRAD" - } - type: "relu_grad" - attrs { - name: "is_test" - type: BOOLEAN - b: false - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } 
- attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n return helper.append_activation(pre_activation)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "Out@GRAD" - arguments: "fc_2.tmp_1@GRAD" - } - inputs { - parameter: "Y" - arguments: "fc_2.b_0" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_2.tmp_0@GRAD" - } - outputs { - parameter: "Y@GRAD" - arguments: "fc_2.b_0@GRAD" - } - type: "elementwise_add_grad" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - strings: "fc_2.b_0" - strings: "fc_2.b_0@GRAD" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 1 - } - attrs { - name: "axis" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "Out@GRAD" - arguments: 
"fc_2.tmp_0@GRAD" - } - inputs { - parameter: "X" - arguments: "fc_1.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_2.w_0" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_1.tmp_2@GRAD" - } - outputs { - parameter: "Y@GRAD" - arguments: "fc_2.w_0@GRAD" - } - type: "mul_grad" - attrs { - name: "op_role_var" - type: STRINGS - strings: "fc_2.w_0" - strings: "fc_2.w_0@GRAD" - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 1 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - } - ops { - inputs { - parameter: "Out" - arguments: "fc_1.tmp_2" - } - inputs { - parameter: "Out@GRAD" - arguments: "fc_1.tmp_2@GRAD" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_1.tmp_1@GRAD" - } - type: "relu_grad" - attrs { - name: "is_test" - type: BOOLEAN - b: false - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n return helper.append_activation(pre_activation)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n 
builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "Out@GRAD" - arguments: "fc_1.tmp_1@GRAD" - } - inputs { - parameter: "Y" - arguments: "fc_1.b_0" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_1.tmp_0@GRAD" - } - outputs { - parameter: "Y@GRAD" - arguments: "fc_1.b_0@GRAD" - } - type: "elementwise_add_grad" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - strings: "fc_1.b_0" - strings: "fc_1.b_0@GRAD" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 1 - } - attrs { - name: "axis" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "Out@GRAD" - arguments: "fc_1.tmp_0@GRAD" - } - inputs { - parameter: "X" - arguments: "fc_0.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_1.w_0" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_0.tmp_2@GRAD" - } - outputs { - parameter: "Y@GRAD" - arguments: "fc_1.w_0@GRAD" - } - type: "mul_grad" - attrs { - name: "op_role_var" - type: STRINGS - strings: "fc_1.w_0" - strings: "fc_1.w_0@GRAD" - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 1 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File 
\"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - } - ops { - inputs { - parameter: "Out" - arguments: "fc_0.tmp_2" - } - inputs { - parameter: "Out@GRAD" - arguments: "fc_0.tmp_2@GRAD" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_0.tmp_1@GRAD" - } - type: "relu_grad" - attrs { - name: "is_test" - type: BOOLEAN - b: false - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n return helper.append_activation(pre_activation)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "Out@GRAD" - arguments: "fc_0.tmp_1@GRAD" - } - inputs { - parameter: "Y" - arguments: "fc_0.b_0" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_0.tmp_0@GRAD" - } - outputs { - parameter: "Y@GRAD" - arguments: "fc_0.b_0@GRAD" - } - type: "elementwise_add_grad" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - strings: "fc_0.b_0" - strings: "fc_0.b_0@GRAD" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", 
line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 1 - } - attrs { - name: "axis" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "Out@GRAD" - arguments: "fc_0.tmp_0@GRAD" - } - inputs { - parameter: "X" - arguments: "_generated_var_2" - } - inputs { - parameter: "Y" - arguments: "fc_0.w_0" - } - outputs { - parameter: "X@GRAD" - arguments: "_generated_var_2@GRAD" - } - outputs { - parameter: "Y@GRAD" - arguments: "fc_0.w_0@GRAD" - } - type: "mul_grad" - attrs { - name: "op_role_var" - type: STRINGS - strings: "fc_0.w_0" - strings: "fc_0.w_0@GRAD" - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 1 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - } - ops { - inputs { - parameter: "BatchSize" - arguments: "bn6048.batch_size" - } - inputs { - parameter: "BatchSquareSum" - arguments: "bn6048.batch_square_sum" - } - inputs { - parameter: "BatchSum" - arguments: "bn6048.batch_sum" - } - inputs { - parameter: "Means" - arguments: "_generated_var_0" - } - inputs { - parameter: "Scales" - arguments: "_generated_var_1" - } - inputs { - parameter: "X" - arguments: "cvm_input" - } - inputs { 
- parameter: "Y@GRAD" - arguments: "_generated_var_2@GRAD" - } - outputs { - parameter: "BatchSize@GRAD" - arguments: "bn6048.batch_size@GRAD" - } - outputs { - parameter: "BatchSquareSum@GRAD" - arguments: "bn6048.batch_square_sum@GRAD" - } - outputs { - parameter: "BatchSum@GRAD" - arguments: "bn6048.batch_sum@GRAD" - } - outputs { - parameter: "X@GRAD" - arguments: "cvm_input@GRAD" - } - type: "data_norm_grad" - attrs { - name: "data_layout" - type: STRING - s: "NCHW" - } - attrs { - name: "op_role_var" - type: STRINGS - strings: "bn6048.batch_square_sum" - strings: "bn6048.batch_square_sum@GRAD" - strings: "bn6048.batch_sum" - strings: "bn6048.batch_sum@GRAD" - strings: "bn6048.batch_size" - strings: "bn6048.batch_size@GRAD" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 3474, in data_norm\n attrs={\"epsilon\": epsilon})\n" - strings: " File \"join.py\", line 31, in inference\n param_attr={\"batch_size\":1e4, \"batch_sum_default\":0.0, \"batch_square\":1e4})\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 1 - } - attrs { - name: "epsilon" - type: FLOAT - f: 9.99999974738e-05 - } - } -} -version { - version: 0 -} diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/model/join/model.yaml b/paddle/fluid/train/custom_trainer/feed/scripts/model/join/model.yaml deleted file mode 100644 index d31480c4..00000000 --- a/paddle/fluid/train/custom_trainer/feed/scripts/model/join/model.yaml +++ /dev/null @@ -1,109 +0,0 @@ -aa_Attention: Do Not Modify This File Manually, Unless You Really Know It -input_accessor: -- class: AbacusSparseJoinAccessor - input: - - name: cvm_input - slot_dim: 11 - slots: [6048, 6002, 6145, 6202, 6201, 6121, 6738, 6119, 6146, 6120, 6147, 6122, - 6123, 6118, 6142, 6143, 6008, 6148, 6151, 6127, 6144, 6094, 6083, 6952, 6739, - 6150, 6109, 6003, 6099, 6149, 6129, 6203, 6153, 6152, 6128, 6106, 6251, 7082, - 7515, 6951, 6949, 7080, 6066, 7507, 6186, 6007, 7514, 6125, 7506, 10001, 6006, - 7023, 6085, 10000, 6098, 6250, 6110, 6124, 6090, 6082, 6067, 6101, 6004, 6191, - 7075, 6948, 6157, 6126, 6188, 7077, 6070, 6111, 6087, 6103, 6107, 6194, 6156, - 6005, 6247, 6814, 6158, 7122, 6058, 6189, 7058, 6059, 6115, 7079, 7081, 6833, - 7024, 6108, 13342, 13345, 13412, 13343, 13350, 13346, 13409, 6009, 6011, 6012, - 6013, 6014, 6015, 6019, 6023, 6024, 6027, 6029, 6031, 6050, 6060, 6068, 6069, - 6089, 6095, 6105, 6112, 6130, 6131, 6132, 6134, 6161, 6162, 6163, 6166, 6182, - 6183, 6185, 6190, 6212, 6213, 6231, 6233, 6234, 6236, 6238, 6239, 6240, 6241, - 6242, 6243, 6244, 6245, 6354, 7002, 7005, 7008, 7010, 7012, 7013, 7015, 7016, - 7017, 7018, 7019, 7020, 7045, 7046, 7048, 7049, 7052, 7054, 7056, 7064, 7066, - 7076, 7078, 7083, 
-      7084, 7085, 7086, 7087, 7088, 7089, 7090, 7099, 7100, 7101,
-      7102, 7103, 7104, 7105, 7109, 7124, 7126, 7136, 7142, 7143, 7144, 7145, 7146,
-      7147, 7148, 7150, 7151, 7152, 7153, 7154, 7155, 7156, 7157, 7047, 7050, 6253,
-      6254, 6255, 6256, 6257, 6259, 6260, 6261, 7170, 7185, 7186, 6751, 6755, 6757,
-      6759, 6760, 6763, 6764, 6765, 6766, 6767, 6768, 6769, 6770, 7502, 7503, 7504,
-      7505, 7510, 7511, 7512, 7513, 6806, 6807, 6808, 6809, 6810, 6811, 6812, 6813,
-      6815, 6816, 6817, 6819, 6823, 6828, 6831, 6840, 6845, 6875, 6879, 6881, 6888,
-      6889, 6947, 6950, 6956, 6957, 6959, 10006, 10008, 10009, 10010, 10011, 10016,
-      10017, 10018, 10019, 10020, 10021, 10022, 10023, 10024, 10029, 10030, 10031,
-      10032, 10033, 10034, 10035, 10036, 10037, 10038, 10039, 10040, 10041, 10042,
-      10044, 10045, 10046, 10051, 10052, 10053, 10054, 10055, 10056, 10057, 10060,
-      10066, 10069, 6820, 6821, 6822, 13333, 13334, 13335, 13336, 13337, 13338, 13339,
-      13340, 13341, 13351, 13352, 13353, 13359, 13361, 13362, 13363, 13366, 13367,
-      13368, 13369, 13370, 13371, 13375, 13376, 5700, 5702, 13400, 13401, 13402, 13403,
-      13404, 13406, 13407, 13408, 13410, 13417, 13418, 13419, 13420, 13422, 13425,
-      13427, 13428, 13429, 13430, 13431, 13433, 13434, 13436, 13437, 13326, 13330,
-      13331, 5717, 13442, 13451, 13452, 13455, 13456, 13457, 13458, 13459, 13460,
-      13461, 13462, 13463, 13464, 13465, 13466, 13467, 13468, 1104, 1106, 1107, 1108,
-      1109, 1110, 1111, 1112, 1113, 1114, 1115, 1116, 1117, 1119, 1120, 1121, 1122,
-      1123, 1124, 1125, 1126, 1127, 1128, 1129, 13812, 13813, 6740, 1490, 1491]
-  need_gradient: false
-  table_id: 0
-- async_pull: true
-  class: DenseInputAccessor
-  input:
-  - name: fc_0.w_0
-    shape: [4488, 511]
-  - name: fc_0.b_0
-    shape: [511]
-  - name: fc_1.w_0
-    shape: [511, 255]
-  - name: fc_1.b_0
-    shape: [255]
-  - name: fc_2.w_0
-    shape: [255, 255]
-  - name: fc_2.b_0
-    shape: [255]
-  - name: fc_3.w_0
-    shape: [255, 127]
-  - name: fc_3.b_0
-    shape: [127]
-  - name: fc_4.w_0
-    shape: [127, 127]
-  - name: fc_4.b_0
-    shape: [127]
-  - name: fc_5.w_0
-    shape: [127, 127]
-  - name: fc_5.b_0
-    shape: [127]
-  - name: fc_6.w_0
-    shape: [127, 127]
-  - name: fc_6.b_0
-    shape: [127]
-  - name: fc_7.w_0
-    shape: [127, 1]
-  - name: fc_7.b_0
-    shape: [1]
-  need_gradient: true
-  table_id: 1
-- async_pull: true
-  class: DenseInputAccessor
-  input:
-  - name: bn6048.batch_size
-    shape: [4488]
-  - name: bn6048.batch_sum
-    shape: [4488]
-  - name: bn6048.batch_square_sum
-    shape: [4488]
-  need_gradient: true
-  table_id: 2
-- {adjw_ratio: 20, adjw_threshold: 1000, class: WeightsAdjustAccessor, input: ins_weight,
-  slot_id: 6002}
-- class: LabelInputAccessor
-  input:
-  - label_name: label_ctr
-    output_name: ctr.tmp_0
-    shape: [-1, 1]
-inputs:
-- name: cvm_input
-  shape: [-1, 4488]
-labels:
-- name: label_ctr
-  shape: [-1, 1]
-loss: loss_ctr
-monitor:
-- {class: AucMonitor, compute_interval: 600, name: epoch_auc, target: ctr.tmp_0, target_idx: 0}
-- {class: AucMonitor, compute_interval: 86400, name: day_auc, target: ctr.tmp_0, target_idx: 0}
-outputs:
-- name: ctr.tmp_0
-  shape: [-1, 1]
diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/model/join/startup_program b/paddle/fluid/train/custom_trainer/feed/scripts/model/join/startup_program
deleted file mode 100644
index e10ee0452058a41b87af3da2d4e5a3a3ccffcb68..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 29929
[base85-encoded binary patch data, 29929 bytes: not recoverable from this text capture]
-      strings: " File \"create_programs.py\", line 203, in <module>\n main(sys.argv)\n"
-    }
-    attrs {
-      name: "op_namescope"
-      type: STRING
-      s: "/"
-    }
-    attrs {
-      name: "op_role"
-      type: INT
-      i: 0
-    }
-    attrs {
-      name: "use_mkldnn"
-      type: BOOLEAN
-      b: false
-    }
-    attrs {
-      name: "dtype"
-      type: INT
-      i: 5
-    }
-    attrs {
-      name: "seed"
-      type: INT
-      i: 0
-    }
-    attrs {
-      name: "std"
-      type: FLOAT
-      f: 0.017747130245
-    }
-    attrs {
-      name: "mean"
-      type: FLOAT
-      f: 0.0
-    }
-    attrs {
-      name: "shape"
-      type: LONGS
-      longs: 1
-    }
-  }
-  ops {
-    outputs {
-      parameter: "Out"
-      arguments: "fc_7.w_0"
-    }
-    type: "gaussian_random"
-    attrs {
-      name: "op_role_var"
-      type: STRINGS
-    }
-    attrs {
-      name: "op_callstack"
-      type: STRINGS
-      strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1851, in _prepend_op\n attrs=kwargs.get(\"attrs\", None))\n"
-      strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/initializer.py\", line 356, in __call__\n stop_gradient=True)\n"
-      strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1736, in create_parameter\n initializer(param, self)\n"
-      strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper_base.py\", line 328, in create_parameter\n **attr._to_kwargs(with_initializer=True))\n"
-      strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 337, in fc\n attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False)\n"
-      strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n"
-      strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n"
-      strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n"
-      strings: " File \"create_programs.py\", line 203, in <module>\n main(sys.argv)\n"
-    }
-    attrs {
-      name: "op_namescope"
-      type: STRING
-      s: "/"
-    }
-    attrs {
- name: "op_role" - type: INT - i: 0 - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "dtype" - type: INT - i: 5 - } - attrs { - name: "seed" - type: INT - i: 0 - } - attrs { - name: "std" - type: FLOAT - f: 0.017747130245 - } - attrs { - name: "mean" - type: FLOAT - f: 0.0 - } - attrs { - name: "shape" - type: LONGS - longs: 127 - longs: 1 - } - } - ops { - outputs { - parameter: "Out" - arguments: "fc_6.b_0" - } - type: "gaussian_random" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1851, in _prepend_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/initializer.py\", line 356, in __call__\n stop_gradient=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1736, in create_parameter\n initializer(param, self)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper_base.py\", line 328, in create_parameter\n **attr._to_kwargs(with_initializer=True))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 128, in append_bias_op\n attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "dtype" - type: INT - i: 5 - } - attrs { - name: "seed" - type: INT - i: 0 - } - attrs { - name: "std" - type: FLOAT - f: 0.017747130245 - } - attrs { - name: "mean" - type: FLOAT - f: 0.0 - } - attrs { - name: "shape" - type: LONGS - longs: 127 - } - } - ops { - outputs { - parameter: "Out" - arguments: "fc_6.w_0" - } - type: "gaussian_random" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1851, in _prepend_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/initializer.py\", line 356, in __call__\n stop_gradient=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1736, in create_parameter\n initializer(param, self)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper_base.py\", line 328, in create_parameter\n 
**attr._to_kwargs(with_initializer=True))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 337, in fc\n attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "dtype" - type: INT - i: 5 - } - attrs { - name: "seed" - type: INT - i: 0 - } - attrs { - name: "std" - type: FLOAT - f: 0.017747130245 - } - attrs { - name: "mean" - type: FLOAT - f: 0.0 - } - attrs { - name: "shape" - type: LONGS - longs: 127 - longs: 127 - } - } - ops { - outputs { - parameter: "Out" - arguments: "fc_5.b_0" - } - type: "gaussian_random" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1851, in _prepend_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/initializer.py\", line 356, in __call__\n stop_gradient=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1736, in create_parameter\n initializer(param, self)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper_base.py\", line 328, in create_parameter\n **attr._to_kwargs(with_initializer=True))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 128, in append_bias_op\n attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "dtype" - type: INT - i: 5 - } - attrs { - name: "seed" - type: INT - i: 0 - } - attrs { - name: "std" - type: FLOAT - f: 0.017747130245 - } - attrs { - name: "mean" - type: FLOAT - f: 0.0 - } - attrs { - name: "shape" - type: LONGS - longs: 127 - } - } - ops { - outputs { - parameter: "Out" - arguments: "fc_5.w_0" - } - type: "gaussian_random" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: 
STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1851, in _prepend_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/initializer.py\", line 356, in __call__\n stop_gradient=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1736, in create_parameter\n initializer(param, self)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper_base.py\", line 328, in create_parameter\n **attr._to_kwargs(with_initializer=True))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 337, in fc\n attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "dtype" - type: INT - i: 5 - } - attrs { - name: "seed" - type: INT - i: 0 - } - attrs { - name: "std" - type: FLOAT - f: 0.017747130245 - } - attrs { - name: "mean" - type: FLOAT - f: 0.0 - } - attrs { - name: "shape" - type: LONGS - longs: 127 - longs: 127 - } - } - ops { - outputs { - parameter: "Out" - arguments: "fc_4.b_0" - } - type: "gaussian_random" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1851, in _prepend_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/initializer.py\", line 356, in __call__\n stop_gradient=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1736, in create_parameter\n initializer(param, self)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper_base.py\", line 328, in create_parameter\n **attr._to_kwargs(with_initializer=True))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 128, in append_bias_op\n attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - 
strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "dtype" - type: INT - i: 5 - } - attrs { - name: "seed" - type: INT - i: 0 - } - attrs { - name: "std" - type: FLOAT - f: 0.017747130245 - } - attrs { - name: "mean" - type: FLOAT - f: 0.0 - } - attrs { - name: "shape" - type: LONGS - longs: 127 - } - } - ops { - outputs { - parameter: "Out" - arguments: "fc_4.w_0" - } - type: "gaussian_random" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1851, in _prepend_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/initializer.py\", line 356, in __call__\n stop_gradient=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1736, in create_parameter\n initializer(param, self)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper_base.py\", line 328, in create_parameter\n **attr._to_kwargs(with_initializer=True))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 337, in fc\n attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "dtype" - type: INT - i: 5 - } - attrs { - name: "seed" - type: INT - i: 0 - } - attrs { - name: "std" - type: FLOAT - f: 0.017747130245 - } - attrs { - name: "mean" - type: FLOAT - f: 0.0 - } - attrs { - name: "shape" - type: LONGS - longs: 127 - longs: 127 - } - } - ops { - outputs { - parameter: "Out" - arguments: "fc_3.b_0" - } - type: "gaussian_random" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1851, in _prepend_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/initializer.py\", line 356, in __call__\n stop_gradient=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1736, in create_parameter\n initializer(param, self)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper_base.py\", line 328, in create_parameter\n **attr._to_kwargs(with_initializer=True))\n" - strings: " File 
\"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 128, in append_bias_op\n attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "dtype" - type: INT - i: 5 - } - attrs { - name: "seed" - type: INT - i: 0 - } - attrs { - name: "std" - type: FLOAT - f: 0.0125244855881 - } - attrs { - name: "mean" - type: FLOAT - f: 0.0 - } - attrs { - name: "shape" - type: LONGS - longs: 127 - } - } - ops { - outputs { - parameter: "Out" - arguments: "fc_3.w_0" - } - type: "gaussian_random" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1851, in _prepend_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/initializer.py\", line 356, in __call__\n stop_gradient=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1736, in create_parameter\n initializer(param, self)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper_base.py\", line 328, in create_parameter\n **attr._to_kwargs(with_initializer=True))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 337, in fc\n attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "dtype" - type: INT - i: 5 - } - attrs { - name: "seed" - type: INT - i: 0 - } - attrs { - name: "std" - type: FLOAT - f: 0.0125244855881 - } - attrs { - name: "mean" - type: FLOAT - f: 0.0 - } - attrs { - name: "shape" - type: LONGS - longs: 255 - longs: 127 - } - } - ops { - outputs { - parameter: "Out" - arguments: "fc_2.b_0" - } - type: "gaussian_random" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File 
\"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1851, in _prepend_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/initializer.py\", line 356, in __call__\n stop_gradient=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1736, in create_parameter\n initializer(param, self)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper_base.py\", line 328, in create_parameter\n **attr._to_kwargs(with_initializer=True))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 128, in append_bias_op\n attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "dtype" - type: INT - i: 5 - } - attrs { - name: "seed" - type: INT - i: 0 - } - attrs { - name: "std" - type: FLOAT - f: 0.0125244855881 - } - attrs { - name: "mean" - type: FLOAT - f: 0.0 - } - attrs { - name: "shape" - type: LONGS - longs: 255 - } - } - ops { - outputs { - parameter: "Out" - arguments: "fc_2.w_0" - } - type: "gaussian_random" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1851, in _prepend_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/initializer.py\", line 356, in __call__\n stop_gradient=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1736, in create_parameter\n initializer(param, self)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper_base.py\", line 328, in create_parameter\n **attr._to_kwargs(with_initializer=True))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 337, in fc\n attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", 
line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "dtype" - type: INT - i: 5 - } - attrs { - name: "seed" - type: INT - i: 0 - } - attrs { - name: "std" - type: FLOAT - f: 0.0125244855881 - } - attrs { - name: "mean" - type: FLOAT - f: 0.0 - } - attrs { - name: "shape" - type: LONGS - longs: 255 - longs: 255 - } - } - ops { - outputs { - parameter: "Out" - arguments: "fc_1.b_0" - } - type: "gaussian_random" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1851, in _prepend_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/initializer.py\", line 356, in __call__\n stop_gradient=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1736, in create_parameter\n initializer(param, self)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper_base.py\", line 328, in create_parameter\n **attr._to_kwargs(with_initializer=True))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 128, in append_bias_op\n attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "dtype" - type: INT - i: 5 - } - attrs { - name: "seed" - type: INT - i: 0 - } - attrs { - name: "std" - type: FLOAT - f: 0.00884747877717 - } - attrs { - name: "mean" - type: FLOAT - f: 0.0 - } - attrs { - name: "shape" - type: LONGS - longs: 255 - } - } - ops { - outputs { - parameter: "Out" - arguments: "fc_1.w_0" - } - type: "gaussian_random" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1851, in _prepend_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/initializer.py\", line 356, in __call__\n stop_gradient=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1736, in create_parameter\n initializer(param, self)\n" - strings: " File 
\"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper_base.py\", line 328, in create_parameter\n **attr._to_kwargs(with_initializer=True))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 337, in fc\n attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "dtype" - type: INT - i: 5 - } - attrs { - name: "seed" - type: INT - i: 0 - } - attrs { - name: "std" - type: FLOAT - f: 0.00884747877717 - } - attrs { - name: "mean" - type: FLOAT - f: 0.0 - } - attrs { - name: "shape" - type: LONGS - longs: 511 - longs: 255 - } - } - ops { - outputs { - parameter: "Out" - arguments: "fc_0.b_0" - } - type: "gaussian_random" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1851, in _prepend_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/initializer.py\", line 356, in __call__\n stop_gradient=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1736, in create_parameter\n initializer(param, self)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper_base.py\", line 328, in create_parameter\n **attr._to_kwargs(with_initializer=True))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 128, in append_bias_op\n attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "dtype" - type: INT - i: 5 - } - attrs { - name: "seed" - type: INT - i: 0 - } - attrs { - name: "std" - type: FLOAT - f: 0.00298540713266 - } - attrs { - name: "mean" - type: FLOAT - f: 0.0 - } - attrs { - name: "shape" - type: LONGS - longs: 511 - } - } - ops { - outputs { - parameter: "Out" 
- arguments: "fc_0.w_0" - } - type: "gaussian_random" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1851, in _prepend_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/initializer.py\", line 356, in __call__\n stop_gradient=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1736, in create_parameter\n initializer(param, self)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper_base.py\", line 328, in create_parameter\n **attr._to_kwargs(with_initializer=True))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 337, in fc\n attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "dtype" - type: INT - i: 5 - } - attrs { - name: "seed" - type: INT - i: 0 - } - attrs { - name: "std" - type: FLOAT - f: 0.00298540713266 - } - attrs { - name: "mean" - type: FLOAT - f: 0.0 - } - attrs { - name: "shape" - type: LONGS - longs: 4488 - longs: 511 - } - } - ops { - outputs { - parameter: "Out" - arguments: "bn6048.batch_square_sum" - } - type: "fill_constant" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1851, in _prepend_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/initializer.py\", line 189, in __call__\n stop_gradient=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1736, in create_parameter\n initializer(param, self)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper_base.py\", line 328, in create_parameter\n **attr._to_kwargs(with_initializer=True))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 3456, in data_norm\n dtype=input.dtype)\n" - strings: " File \"join.py\", line 31, in inference\n param_attr={\"batch_size\":1e4, \"batch_sum_default\":0.0, \"batch_square\":1e4})\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - 
type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "force_cpu" - type: BOOLEAN - b: false - } - attrs { - name: "value" - type: FLOAT - f: 10000.0 - } - attrs { - name: "shape" - type: LONGS - longs: 4488 - } - attrs { - name: "dtype" - type: INT - i: 5 - } - } - ops { - outputs { - parameter: "Out" - arguments: "bn6048.batch_sum" - } - type: "fill_constant" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1851, in _prepend_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/initializer.py\", line 189, in __call__\n stop_gradient=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1736, in create_parameter\n initializer(param, self)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper_base.py\", line 328, in create_parameter\n **attr._to_kwargs(with_initializer=True))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 3448, in data_norm\n dtype=input.dtype)\n" - strings: " File \"join.py\", line 31, in inference\n param_attr={\"batch_size\":1e4, \"batch_sum_default\":0.0, \"batch_square\":1e4})\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "force_cpu" - type: BOOLEAN - b: false - } - attrs { - name: "value" - type: FLOAT - f: 0.0 - } - attrs { - name: "shape" - type: LONGS - longs: 4488 - } - attrs { - name: "dtype" - type: INT - i: 5 - } - } - ops { - outputs { - parameter: "Out" - arguments: "bn6048.batch_size" - } - type: "fill_constant" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1851, in _prepend_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/initializer.py\", line 189, in __call__\n stop_gradient=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1736, in create_parameter\n initializer(param, self)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper_base.py\", line 328, in create_parameter\n **attr._to_kwargs(with_initializer=True))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 3440, in data_norm\n dtype=input.dtype)\n" - strings: " File \"join.py\", line 31, in inference\n param_attr={\"batch_size\":1e4, \"batch_sum_default\":0.0, \"batch_square\":1e4})\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File 
\"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "force_cpu" - type: BOOLEAN - b: false - } - attrs { - name: "value" - type: FLOAT - f: 10000.0 - } - attrs { - name: "shape" - type: LONGS - longs: 4488 - } - attrs { - name: "dtype" - type: INT - i: 5 - } - } -} -version { - version: 0 -} diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/model/join/test_program b/paddle/fluid/train/custom_trainer/feed/scripts/model/join/test_program deleted file mode 100644 index 6985108a684fee618d154dc5be249186aed2ed9c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 34269 zcmeHQ&5s*N6(472GIr%-9A{U_8U)^6K|G^b+wJ)z!XSdQ0to_A4g?|8^w?!jGwtrC zyFKv`E#bB+aptfH2?;LY#F6vfxgfOny>a9(ScO;h%I>n;?G4h*teJVa#BSGA)vJ2- ztM}{osvi5~ch~Yu#aE+$x$*Gouz1;l8Tn5Ncsi%XPK_wTL=!6!ugdGe_Zt z?1b_k8?@|RJjo!YcaFQ7jg>}K0_U`9TFUcMxS+LqPa z0-*|QO$=eKDC)uPHeX|G9{bezgpvfj^nkKM4gu)!3CkC8{BY&|r5arEq@DwTh)8e_6KK zlyh(!=76z0&cQ2D_t#-)<8WTjQAC_9ROGCY#gZsUbCw{e$vbMs=V)?TKg|iT%3CP~ z&QN%v7-|J=&yy182S<8)#z^anFawIP3M`0iHD^PSUJwT(1GDskSmWG+*!#U1y+<7| zr*&W%mLlkYKDYA~9bsX2CM=-fxCrKUQ(yPW$ZS~ zJuPXd`4(IKA-BD#lm`AXyZ)qYyFC4yS$dP@ewzA|X1UO@W83oluy4G{ZpbgLed-V5 z;BU_ zp~Huc@AZysuU8N3PRHf-;Pl<`r})VC>aKGL55>o3ZNDBlF^2-}W4p(r(cg64fzzpX zL%YwP`QdRbI5oB^uH*5Fxqr~usyJT74g&6VEI(is_-Ds)7`2X{*|!0df+gmy_1ri_;|>;?@zCVZ@aV~)Z*_Rr9=Ngb zuu%hD82upEn=C9^{>7-}cF4g*8jW$J31l`c%a$rk1j>v5X>HQ*%_zeeK@g|9N2islWgyo~Chv-4gI2tYZYr zJ8bz!V8IEaUKx+&<^I4;gT)){_R#WRvf93Db)0?#E$57T>`vDY+uZ5~+f5752F$_Y z^&c-8Q2aDqe5&6wW%~o6QYL_2I4b@A3qYk^2_@aOTn@&h76BTW0u6pMT3u5(v^xuj z9M6dz+Xd(eTXG|+X-{g8Vav4L?XI|7FSxb%*Twa3xv{;N5>{X$5Pm2 zme;w<`>>@xbD$SsjsQH~I<@2m1~}BWg|)LJ9u89v(_K#)M4#o!Aj-1cE*yIPdj8Op zw(rbsL?30T9XfV2X^}pEU=N+>!NUq{ffiWJVJzV20&LM~<8AjqhsglmnAz>vIk96f z*TKF~mg}S(RZ&dHw3T$fCHC~L3sxHU)(ZG3^miB*(|Sw z+#Lv;(TS{>N3s0dbyg5a-X4sA{5A0LUxbf~W19W(3&F?TZ%EitxROAQG}e%`!$ZK3 zzny>grXg_;>7TM36)T?v!lc6z3zM&edJBzuAEl74nOKYX5>>Mn)jM%wvh;p!{COe zLU2s-$&P}2Db8;c7nzOH&Bhr;9qUY1!CGQ1;sG;_$*QA1#AJ!d5|bq+dx1<=A&-tt zQ%qJJ{r}rtkn4*i%Ke|{UPr_vJN|O{eQFJjy7oX9UEyQHi zF<&|-n{n&Pg4%RhOm=j;R|+(V$r6(#CQD2f-cbJk-UOvHSwuRG$>Jq}8t|#>G07)O zK3Vd~l27&meX87xWm@F|_VzTn0)NMK^OXpN^_hrg54fs@QP4dZ-PnLYLK3TCrU)79@I+J~wPqOeJhMJ|jm{xHjD|dNCkE4O5gEX@0zHxfw#286t%}G% z+A^S01A1~*(;v=ESUQ!|^d?(|VUvjugYOvPo8#fPkBtus?2br!IDreLB7#Md^}W2N zrsyn^cKwIoWpUu;)A15MkNn!WzUz2qt2^)@A)xPBJ?`<)j(t6kb#rH5LWU?L(PA=9 zmLF8((}1@i$*}le6otgT0+5~p91Y*!5|E^xj3ElX6H3X%-+EH&#XKHWx|U!1FE7!) 
diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/model/join/test_program.pbtxt b/paddle/fluid/train/custom_trainer/feed/scripts/model/join/test_program.pbtxt deleted file mode 100644 index f0a296c8..00000000 --- a/paddle/fluid/train/custom_trainer/feed/scripts/model/join/test_program.pbtxt +++ /dev/null @@ -1,2305 +0,0 @@ -blocks { - idx: 0 - parent_idx: -1 - vars { - name: "fc_7.tmp_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 1 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_7.tmp_1" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 1 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "bn6048.batch_square_sum" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 4488 - } - } - } - persistable: true - } - vars { - name: "fc_0.tmp_1" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 511 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_7.w_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 127 - dims: 1 - } - } - } - persistable: true - } - vars { - name: "fc_1.tmp_2" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 255 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_7.b_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 1 - } - } - } - persistable: true - } - vars { - name: "fc_6.tmp_2" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_0.tmp_2" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 511 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "_generated_var_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 4488 - } - } - } - } - vars { - name: "fc_4.w_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 127 - dims: 127 - } - } - } - persistable: true - } - vars { - name: "clip_0.tmp_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 1 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_3.w_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 255 - dims: 127 - } - } - } - persistable: true - } - vars { - name: "fc_2.tmp_2" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 255 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "cvm_input" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 4488 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_0.tmp_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 511 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "bn6048.batch_sum" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 4488 - } - } - } - persistable: true - } - vars { - name: "fc_2.tmp_1" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 255 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_5.tmp_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - lod_level: 0 - } - }
- persistable: false - } - vars { - name: "fc_0.b_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 511 - } - } - } - persistable: true - } - vars { - name: "fc_1.b_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 255 - } - } - } - persistable: true - } - vars { - name: "fc_5.b_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 127 - } - } - } - persistable: true - } - vars { - name: "bn6048.batch_size" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 4488 - } - } - } - persistable: true - } - vars { - name: "fc_2.w_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 255 - dims: 255 - } - } - } - persistable: true - } - vars { - name: "ctr.tmp_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 1 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_1.tmp_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 255 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_2.b_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 255 - } - } - } - persistable: true - } - vars { - name: "fc_4.tmp_2" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "_generated_var_1" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 4488 - } - } - } - } - vars { - name: "_generated_var_2" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 4488 - } - lod_level: 0 - } - } - } - vars { - name: "fc_3.tmp_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_3.b_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 127 - } - } - } - persistable: true - } - vars { - name: "fc_6.tmp_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_3.tmp_2" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_4.tmp_1" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_3.tmp_1" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_6.b_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 127 - } - } - } - persistable: true - } - vars { - name: "fc_1.w_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 511 - dims: 255 - } - } - } - persistable: true - } - vars { - name: "fc_1.tmp_1" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 255 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_5.w_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 127 - dims: 127 - } - } - } - persistable: true - } - vars { - name: "fc_5.tmp_1" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 
- dims: 127 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_2.tmp_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 255 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_4.tmp_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_5.tmp_2" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_6.w_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 127 - dims: 127 - } - } - } - persistable: true - } - vars { - name: "fc_6.tmp_1" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_0.w_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 4488 - dims: 511 - } - } - } - persistable: true - } - vars { - name: "fc_4.b_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 127 - } - } - } - persistable: true - } - ops { - inputs { - parameter: "BatchSize" - arguments: "bn6048.batch_size" - } - inputs { - parameter: "BatchSquareSum" - arguments: "bn6048.batch_square_sum" - } - inputs { - parameter: "BatchSum" - arguments: "bn6048.batch_sum" - } - inputs { - parameter: "X" - arguments: "cvm_input" - } - outputs { - parameter: "Means" - arguments: "_generated_var_0" - } - outputs { - parameter: "Scales" - arguments: "_generated_var_1" - } - outputs { - parameter: "Y" - arguments: "_generated_var_2" - } - type: "data_norm" - attrs { - name: "data_layout" - type: STRING - s: "NCHW" - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 3474, in data_norm\n attrs={\"epsilon\": epsilon})\n" - strings: " File \"join.py\", line 31, in inference\n param_attr={\"batch_size\":1e4, \"batch_sum_default\":0.0, \"batch_square\":1e4})\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "epsilon" - type: FLOAT - f: 9.99999974738e-05 - } - } - ops { - inputs { - parameter: "X" - arguments: "_generated_var_2" - } - inputs { - parameter: "Y" - arguments: "fc_0.w_0" - } - outputs { - parameter: "Out" - arguments: "fc_0.tmp_0" - } - type: "mul" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - 
attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_0.tmp_0" - } - inputs { - parameter: "Y" - arguments: "fc_0.b_0" - } - outputs { - parameter: "Out" - arguments: "fc_0.tmp_1" - } - type: "elementwise_add" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "axis" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_0.tmp_1" - } - outputs { - parameter: "Out" - arguments: "fc_0.tmp_2" - } - type: "relu" - attrs { - name: "is_test" - type: INT - i: 1 - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_mkldnn" - 
type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n return helper.append_activation(pre_activation)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_0.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_1.w_0" - } - outputs { - parameter: "Out" - arguments: "fc_1.tmp_0" - } - type: "mul" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_1.tmp_0" - } - inputs { - parameter: "Y" - arguments: "fc_1.b_0" - } - outputs { - parameter: "Out" - arguments: "fc_1.tmp_1" - } - type: "elementwise_add" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "x_data_format" - 
type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "axis" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_1.tmp_1" - } - outputs { - parameter: "Out" - arguments: "fc_1.tmp_2" - } - type: "relu" - attrs { - name: "is_test" - type: INT - i: 1 - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n return helper.append_activation(pre_activation)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_1.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_2.w_0" - } - outputs { - parameter: "Out" - arguments: "fc_2.tmp_0" - } - type: "mul" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - 
name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_2.tmp_0" - } - inputs { - parameter: "Y" - arguments: "fc_2.b_0" - } - outputs { - parameter: "Out" - arguments: "fc_2.tmp_1" - } - type: "elementwise_add" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "axis" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_2.tmp_1" - } - outputs { - parameter: "Out" - arguments: 
"fc_2.tmp_2" - } - type: "relu" - attrs { - name: "is_test" - type: INT - i: 1 - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n return helper.append_activation(pre_activation)\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_2.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_3.w_0" - } - outputs { - parameter: "Out" - arguments: "fc_3.tmp_0" - } - type: "mul" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"join.py\", line 51, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_3.tmp_0" - } - inputs { - parameter: "Y" - arguments: "fc_3.b_0" - } - 
-    outputs {
-      parameter: "Out"
-      arguments: "fc_3.tmp_1"
-    }
-    type: "elementwise_add"
-    [… attrs as in the earlier elementwise_add ops: axis=1, use_mkldnn=false, empty x/y_data_format, op_role_var, op_role=0, op_namescope="/", and the same op_callstack through join.py line 51 (inference) and create_programs.py (build_and_save/main) …]
-  }
-  [… the deleted ops continue the same pattern up the stack: relu fc_3.tmp_1 → fc_3.tmp_2, then fc_4, fc_5, and fc_6 each as mul (x_num_col_dims=1, y_num_col_dims=1, scale_x=scale_y=scale_out=1.0) → elementwise_add (axis=1) → relu (is_test=1, use_cudnn=false, use_mkldnn=false), and fc_7 as mul → elementwise_add only (fc_6.tmp_2 × fc_7.w_0 → fc_7.tmp_0 → fc_7.tmp_1); every op carries the same op_role/op_namescope attributes and join.py/create_programs.py op_callstack …]
-  ops {
-    inputs {
-      parameter: "X"
-      arguments: "fc_7.tmp_1"
-    }
-    outputs {
-      parameter: "Out"
-      arguments: "clip_0.tmp_0"
-    }
-    type: "clip"
-    [… op_role_var, op_callstack (join.py line 52: ctr_output = fluid.layers.sigmoid(fluid.layers.clip(net, min=-15.0, max=15.0), name="ctr")), op_namescope="/", op_role=0 …]
-    attrs {
-      name: "max"
-      type: FLOAT
-      f: 15.0
-    }
-    attrs {
-      name: "min"
-      type: FLOAT
-      f: -15.0
-    }
-  }
-  ops {
-    inputs {
-      parameter: "X"
-      arguments: "clip_0.tmp_0"
-    }
-    outputs {
-      parameter: "Out"
-      arguments: "ctr.tmp_0"
-    }
-    type: "sigmoid"
-    [… is_test=1, use_cudnn=false, use_mkldnn=false, op_role_var, op_callstack (join.py line 52 via layer_function_generator), op_namescope="/", op_role=0 …]
-  }
-}
-version {
-  version: 0
-}
diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/model/update/inference_program b/paddle/fluid/train/custom_trainer/feed/scripts/model/update/inference_program
deleted file mode 100644
index c3d142f21b4713c1e64d69c44a4f7c8f65b2d7ed..0000000000000000000000000000000000000000
GIT binary patch
[… 27921-byte base85-encoded payload of the deleted binary program dump omitted …]
diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/model/update/inference_program.pbtxt b/paddle/fluid/train/custom_trainer/feed/scripts/model/update/inference_program.pbtxt
deleted file mode 100644
index 9c57a27b..00000000
--- a/paddle/fluid/train/custom_trainer/feed/scripts/model/update/inference_program.pbtxt
+++ /dev/null
@@ -1,1799 +0,0 @@
-blocks {
-  idx: 0
-  parent_idx: -1
-  [… deleted vars, all FP32 LOD_TENSORs except feed (FEED_MINIBATCH) and fetch (FETCH_LIST); persistable parameters: fc_0.w_0 3672x511, fc_0.b_0 511, fc_1.w_0 511x255, fc_1.b_0 255, fc_2.w_0 255x127, fc_2.b_0 127, fc_3.w_0 127x127, fc_3.b_0 127, fc_4.w_0 127x127, fc_4.b_0 127, fc_5.w_0 127x1, fc_5.b_0 1; non-persistable activations: cvm_input -1x3672, fc_0.tmp_0/1/2 -1x511, fc_1.tmp_0/1/2 -1x255, fc_2/fc_3/fc_4.tmp_0/1/2 -1x127, fc_5.tmp_0/1 -1x1, clip_0.tmp_0 -1x1, ctr.tmp_0 -1x1, save_infer_model/scale_0 -1x1 …]
-  [… deleted ops: feed → cvm_input (col=0, op_callstack through fluid/io.py prepend_feed_ops/save_inference_model); fc_0 … fc_4 each as mul (x_num_col_dims=1, y_num_col_dims=1, scale_x=scale_y=scale_out=1.0) → elementwise_add (axis=1) → relu (is_test=1, use_cudnn=false); fc_5 as mul → elementwise_add only; then clip (fc_5.tmp_1 → clip_0.tmp_0, min=-15.0, max=15.0) and sigmoid (clip_0.tmp_0 → ctr.tmp_0), both recorded from update.py line 49: ctr_output = fluid.layers.sigmoid(fluid.layers.clip(net, min=-15.0, max=15.0), name="ctr"); every op carries the same use_mkldnn=false, op_role=0, op_namescope="/" attributes and an op_callstack through update.py:inference() and create_programs.py:build_and_save()/main() …]
-  ops {
-    inputs {
-      parameter: "X"
-      arguments: "ctr.tmp_0"
-    }
-    outputs {
-      parameter: "Out"
-      arguments: "save_infer_model/scale_0"
-    }
-    type: "scale"
-    attrs {
name: "scale" - type: FLOAT - f: 1.0 - } - attrs { - name: "bias" - type: FLOAT - f: 0.0 - } - attrs { - name: "bias_after_scale" - type: BOOLEAN - b: true - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 10410, in scale\n \'bias_after_scale\': bias_after_scale\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/io.py\", line 1045, in save_inference_model\n var, 1., name=\"save_infer_model/scale_{}\".format(i))\n" - strings: " File \"create_programs.py\", line 133, in build_and_save\n program_only=True)\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_role_var" - type: STRINGS - } - } - ops { - inputs { - parameter: "X" - arguments: "save_infer_model/scale_0" - } - outputs { - parameter: "Out" - arguments: "fetch" - } - type: "fetch" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/io.py\", line 922, in append_fetch_ops\n attrs={\'col\': i})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/io.py\", line 1091, in save_inference_model\n append_fetch_ops(main_program, fetch_var_names)\n" - strings: " File \"create_programs.py\", line 133, in build_and_save\n program_only=True)\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "col" - type: INT - i: 0 - } - } -} diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/model/update/main_program b/paddle/fluid/train/custom_trainer/feed/scripts/model/update/main_program deleted file mode 100644 index bfa0c8961f2f60e85ad2bcb23bb37403146cb0ef..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 57844 zcmeHQO>7(25f-gK^4XFo+KCl9X@~_{QL0Rllw>-XXKukH6z zp3Okl_u%VwF^y)pJMC;?Xxh# zQv5`J^}L)6wtCVCa5CG$CBgGDC(g^ga#frcY^sU9J_EfjC(cWGE#wH&GIe|prsWCT z9)1G1#Yq`$_*+@S$JynUUSSJn&vDeYYoot0=yOios2)Bb+6=wr zIojaOg?=QVDeE*#KS$f)evUpPJTBogYwSYXGyn+khEAU*pb=cGE5d1-+%rqjvkJbk zBEA#|QwpYk&-WTR&h+*HqM&_jVn=vh&;)n9tE_1D_Em9`M;Ivjfbed-9ynTbaX8C( zpGY4__fPN@KBVv+)yS}f>jswioTvx4ZSb4`Uk3yUH`N^46atvM&4vP4_=c#G@2C2w$13^Nm*qeOka|#HRzC+Bw;y z!`;<0t_gPc9Y8eKKDk9a(b#pydB6t2(6Ai!j&AH9xan5}4G|LLWY8N@Apxuyk=4C_3yY8?<{%CBY@($TW&?QO 
diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/model/update/main_program.pbtxt b/paddle/fluid/train/custom_trainer/feed/scripts/model/update/main_program.pbtxt
deleted file mode 100644
index 092c484a..00000000
--- a/paddle/fluid/train/custom_trainer/feed/scripts/model/update/main_program.pbtxt
+++ /dev/null
@@ -1,3791 +0,0 @@
-blocks { - idx: 0 - parent_idx: -1 - vars { - name: "fc_0.tmp_0@GRAD" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 511 - } - lod_level: 0 - } - } - } - vars { - name: "fc_1.w_0@GRAD" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 511 - dims: 255 - } - } - } - } - vars { - name: "fc_1.tmp_0@GRAD" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 255 - } - lod_level: 0 - } - } - } - vars { - name: "fc_1.tmp_1@GRAD"
- type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 255 - } - lod_level: 0 - } - } - } - vars { - name: "fc_2.b_0@GRAD" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 127 - } - lod_level: 0 - } - } - } - vars { - name: "fc_2.tmp_0@GRAD" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - lod_level: 0 - } - } - } - vars { - name: "fc_3.b_0@GRAD" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 127 - } - lod_level: 0 - } - } - } - vars { - name: "fc_2.tmp_1@GRAD" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - lod_level: 0 - } - } - } - vars { - name: "fc_3.tmp_0@GRAD" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - lod_level: 0 - } - } - } - vars { - name: "cvm_input@GRAD" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 3672 - } - } - } - } - vars { - name: "fc_3.tmp_1@GRAD" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - lod_level: 0 - } - } - } - vars { - name: "fc_4.w_0@GRAD" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 127 - dims: 127 - } - } - } - } - vars { - name: "fc_3.tmp_2@GRAD" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - } - } - } - vars { - name: "fc_0.tmp_2@GRAD" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 511 - } - } - } - } - vars { - name: "fc_4.b_0@GRAD" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 127 - } - lod_level: 0 - } - } - } - vars { - name: "fc_4.tmp_1@GRAD" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - lod_level: 0 - } - } - } - vars { - name: "fc_3.w_0@GRAD" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 127 - dims: 127 - } - } - } - } - vars { - name: "fc_5.w_0@GRAD" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 127 - dims: 1 - } - } - } - } - vars { - name: "fc_4.w_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 127 - dims: 127 - } - } - } - persistable: true - } - vars { - name: "fc_0.tmp_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 511 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_0.b_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 511 - } - } - } - persistable: true - } - vars { - name: "fc_1.b_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 255 - } - } - } - persistable: true - } - vars { - name: "fc_2.tmp_2" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "cvm_input" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 3672 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_3.tmp_2" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_2.tmp_1" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - lod_level: 0 
- } - } - persistable: false - } - vars { - name: "fc_2.w_0@GRAD" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 255 - dims: 127 - } - } - } - } - vars { - name: "fc_1.tmp_2@GRAD" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 255 - } - } - } - } - vars { - name: "fc_3.tmp_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "loss_ctr@GRAD" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 1 - } - } - } - } - vars { - name: "fc_0.tmp_1@GRAD" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 511 - } - lod_level: 0 - } - } - } - vars { - name: "fc_4.tmp_0@GRAD" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - lod_level: 0 - } - } - } - vars { - name: "fc_4.tmp_2@GRAD" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - } - } - } - vars { - name: "fc_3.w_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 127 - dims: 127 - } - } - } - persistable: true - } - vars { - name: "clip_0.tmp_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 1 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_2.w_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 255 - dims: 127 - } - } - } - persistable: true - } - vars { - name: "fc_3.b_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 127 - } - } - } - persistable: true - } - vars { - name: "fc_4.tmp_2" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_5.b_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 1 - } - } - } - persistable: true - } - vars { - name: "log_loss_0.tmp_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 1 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_0.b_0@GRAD" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 511 - } - lod_level: 0 - } - } - } - vars { - name: "fc_1.tmp_2" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 255 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_0.tmp_1" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 511 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_0.w_0@GRAD" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 3672 - dims: 511 - } - } - } - } - vars { - name: "fc_0.tmp_2" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 511 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_2.tmp_2@GRAD" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - } - } - } - vars { - name: "fc_2.tmp_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_4.tmp_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - lod_level: 0 - } - } - 
persistable: false - } - vars { - name: "fc_0.w_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 3672 - dims: 511 - } - } - } - persistable: true - } - vars { - name: "fc_4.b_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 127 - } - } - } - persistable: true - } - vars { - name: "fc_4.tmp_1" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_1.w_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 511 - dims: 255 - } - } - } - persistable: true - } - vars { - name: "fc_1.tmp_1" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 255 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_5.w_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 127 - dims: 1 - } - } - } - persistable: true - } - vars { - name: "fc_5.tmp_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 1 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_1.tmp_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 255 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "ctr.tmp_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 1 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_5.tmp_1" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 1 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "ins_weight" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 1 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_5.tmp_0@GRAD" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 1 - } - lod_level: 0 - } - } - } - vars { - name: "fc_3.tmp_1" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "label_ctr" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 1 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "elementwise_mul_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 1 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_2.b_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 127 - } - } - } - persistable: true - } - vars { - name: "elementwise_mul_0@GRAD" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 1 - } - lod_level: 0 - } - } - } - vars { - name: "loss_ctr" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 1 - } - } - } - persistable: false - } - vars { - name: "fc_5.tmp_1@GRAD" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 1 - } - } - } - } - vars { - name: "log_loss_0.tmp_0@GRAD" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 1 - } - lod_level: 0 - } - } - } - vars { - name: "ctr.tmp_0@GRAD" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 1 - } - } - } - } - vars { - name: "clip_0.tmp_0@GRAD" - type { - type: LOD_TENSOR - 
lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 1 - } - lod_level: 0 - } - } - } - vars { - name: "fc_1.b_0@GRAD" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 255 - } - lod_level: 0 - } - } - } - vars { - name: "fc_5.b_0@GRAD" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 1 - } - lod_level: 0 - } - } - } - ops { - inputs { - parameter: "X" - arguments: "cvm_input" - } - inputs { - parameter: "Y" - arguments: "fc_0.w_0" - } - outputs { - parameter: "Out" - arguments: "fc_0.tmp_0" - } - type: "mul" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_0.tmp_0" - } - inputs { - parameter: "Y" - arguments: "fc_0.b_0" - } - outputs { - parameter: "Out" - arguments: "fc_0.tmp_1" - } - type: "elementwise_add" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"update.py\", line 47, in 
inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "axis" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_0.tmp_1" - } - outputs { - parameter: "Out" - arguments: "fc_0.tmp_2" - } - type: "relu" - attrs { - name: "is_test" - type: BOOLEAN - b: false - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n return helper.append_activation(pre_activation)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_0.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_1.w_0" - } - outputs { - parameter: "Out" - arguments: "fc_1.tmp_0" - } - type: "mul" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * 
scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_1.tmp_0" - } - inputs { - parameter: "Y" - arguments: "fc_1.b_0" - } - outputs { - parameter: "Out" - arguments: "fc_1.tmp_1" - } - type: "elementwise_add" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "axis" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_1.tmp_1" - } - outputs { - parameter: "Out" - arguments: "fc_1.tmp_2" - } - type: "relu" - attrs { - name: "is_test" - type: BOOLEAN - b: false - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n 
return helper.append_activation(pre_activation)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_1.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_2.w_0" - } - outputs { - parameter: "Out" - arguments: "fc_2.tmp_0" - } - type: "mul" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_2.tmp_0" - } - inputs { - parameter: "Y" - arguments: "fc_2.b_0" - } - outputs { - parameter: "Out" - arguments: "fc_2.tmp_1" - } - type: "elementwise_add" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File 
\"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "axis" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_2.tmp_1" - } - outputs { - parameter: "Out" - arguments: "fc_2.tmp_2" - } - type: "relu" - attrs { - name: "is_test" - type: BOOLEAN - b: false - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n return helper.append_activation(pre_activation)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_2.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_3.w_0" - } - outputs { - parameter: "Out" - arguments: "fc_3.tmp_0" - } - type: "mul" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File 
\"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_3.tmp_0" - } - inputs { - parameter: "Y" - arguments: "fc_3.b_0" - } - outputs { - parameter: "Out" - arguments: "fc_3.tmp_1" - } - type: "elementwise_add" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "axis" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_3.tmp_1" - } - outputs { - parameter: "Out" - arguments: "fc_3.tmp_2" - } - type: "relu" - attrs { - name: "is_test" - type: BOOLEAN - b: false - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File 
\"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n return helper.append_activation(pre_activation)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_3.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_4.w_0" - } - outputs { - parameter: "Out" - arguments: "fc_4.tmp_0" - } - type: "mul" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_4.tmp_0" - } - inputs { - parameter: "Y" - arguments: "fc_4.b_0" - } - outputs { - parameter: "Out" - arguments: "fc_4.tmp_1" - } - type: "elementwise_add" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return 
self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "axis" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_4.tmp_1" - } - outputs { - parameter: "Out" - arguments: "fc_4.tmp_2" - } - type: "relu" - attrs { - name: "is_test" - type: BOOLEAN - b: false - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n return helper.append_activation(pre_activation)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_4.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_5.w_0" - } - outputs { - parameter: "Out" - arguments: "fc_5.tmp_0" - } - type: "mul" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", 
None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_5.tmp_0" - } - inputs { - parameter: "Y" - arguments: "fc_5.b_0" - } - outputs { - parameter: "Out" - arguments: "fc_5.tmp_1" - } - type: "elementwise_add" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "axis" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_5.tmp_1" - } - outputs { - parameter: "Out" - arguments: "clip_0.tmp_0" - } - type: "clip" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return 
self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 10681, in clip\n outputs={\"Out\": out})\n" - strings: " File \"update.py\", line 49, in inference\n ctr_output = fluid.layers.sigmoid(fluid.layers.clip(net, min=-15.0, max=15.0), name=\"ctr\")\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "max" - type: FLOAT - f: 15.0 - } - attrs { - name: "min" - type: FLOAT - f: -15.0 - } - } - ops { - inputs { - parameter: "X" - arguments: "clip_0.tmp_0" - } - outputs { - parameter: "Out" - arguments: "ctr.tmp_0" - } - type: "sigmoid" - attrs { - name: "is_test" - type: BOOLEAN - b: false - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/layer_function_generator.py\", line 247, in func\n helper.append_op(type=op_type, inputs={\"X\": x}, outputs={\"Out\": output})\n" - strings: " File \"update.py\", line 49, in inference\n ctr_output = fluid.layers.sigmoid(fluid.layers.clip(net, min=-15.0, max=15.0), name=\"ctr\")\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - } - ops { - inputs { - parameter: "Labels" - arguments: "label_ctr" - } - inputs { - parameter: "Predicted" - arguments: "ctr.tmp_0" - } - outputs { - parameter: "Loss" - arguments: "log_loss_0.tmp_0" - } - type: "log_loss" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 11422, in log_loss\n attrs={\'epsilon\': epsilon})\n" - strings: " File \"update.py\", line 90, in loss_function\n loss = fluid.layers.log_loss(input=ctr_output, label=label)\n" - strings: " File 
\"create_programs.py\", line 108, in build_and_save\n loss, labels = self._loss_function(*outputs)\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "epsilon" - type: FLOAT - f: 9.99999974738e-05 - } - } - ops { - inputs { - parameter: "X" - arguments: "log_loss_0.tmp_0" - } - inputs { - parameter: "Y" - arguments: "ins_weight" - } - outputs { - parameter: "Out" - arguments: "elementwise_mul_0" - } - type: "elementwise_mul" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 10367, in _elementwise_op\n \'use_mkldnn\': use_mkldnn})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 10428, in elementwise_mul\n return _elementwise_op(LayerHelper(\'elementwise_mul\', **locals()))\n" - strings: " File \"update.py\", line 91, in loss_function\n loss = fluid.layers.elementwise_mul(loss, ins_weight)\n" - strings: " File \"create_programs.py\", line 108, in build_and_save\n loss, labels = self._loss_function(*outputs)\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "axis" - type: INT - i: -1 - } - } - ops { - inputs { - parameter: "X" - arguments: "elementwise_mul_0" - } - outputs { - parameter: "Out" - arguments: "loss_ctr" - } - type: "mean" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 10756, in mean\n type=\"mean\", inputs={\"X\": x}, attrs={}, outputs={\"Out\": out})\n" - strings: " File \"update.py\", line 92, in loss_function\n loss = fluid.layers.mean(loss, name=\'loss_ctr\')\n" - strings: " File \"create_programs.py\", line 108, in build_and_save\n loss, labels = self._loss_function(*outputs)\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " 
File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 256 - } - } - ops { - outputs { - parameter: "Out" - arguments: "loss_ctr@GRAD" - } - type: "fill_constant" - attrs { - name: "op_role" - type: INT - i: 257 - } - attrs { - name: "value" - type: FLOAT - f: 1.0 - } - attrs { - name: "force_cpu" - type: INT - i: 0 - } - attrs { - name: "shape" - type: LONGS - longs: 1 - } - attrs { - name: "dtype" - type: INT - i: 5 - } - } - ops { - inputs { - parameter: "Out@GRAD" - arguments: "loss_ctr@GRAD" - } - inputs { - parameter: "X" - arguments: "elementwise_mul_0" - } - outputs { - parameter: "X@GRAD" - arguments: "elementwise_mul_0@GRAD" - } - type: "mean_grad" - attrs { - name: "op_role" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "Out@GRAD" - arguments: "elementwise_mul_0@GRAD" - } - inputs { - parameter: "X" - arguments: "log_loss_0.tmp_0" - } - inputs { - parameter: "Y" - arguments: "ins_weight" - } - outputs { - parameter: "X@GRAD" - arguments: "log_loss_0.tmp_0@GRAD" - } - outputs { - parameter: "Y@GRAD" - } - type: "elementwise_mul_grad" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 10367, in _elementwise_op\n \'use_mkldnn\': use_mkldnn})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 10428, in elementwise_mul\n return _elementwise_op(LayerHelper(\'elementwise_mul\', **locals()))\n" - strings: " File \"update.py\", line 91, in loss_function\n loss = fluid.layers.elementwise_mul(loss, ins_weight)\n" - strings: " File \"create_programs.py\", line 108, in build_and_save\n loss, labels = self._loss_function(*outputs)\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 1 - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "axis" - type: INT - i: -1 - } - } - ops { - inputs { - parameter: "Labels" - arguments: "label_ctr" - } - inputs { - parameter: "Loss@GRAD" - arguments: "log_loss_0.tmp_0@GRAD" - } - inputs { - parameter: "Predicted" - arguments: "ctr.tmp_0" - } - outputs { - parameter: "Predicted@GRAD" - arguments: "ctr.tmp_0@GRAD" - } - type: "log_loss_grad" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File 
\"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 11422, in log_loss\n attrs={\'epsilon\': epsilon})\n" - strings: " File \"update.py\", line 90, in loss_function\n loss = fluid.layers.log_loss(input=ctr_output, label=label)\n" - strings: " File \"create_programs.py\", line 108, in build_and_save\n loss, labels = self._loss_function(*outputs)\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 1 - } - attrs { - name: "epsilon" - type: FLOAT - f: 9.99999974738e-05 - } - } - ops { - inputs { - parameter: "Out" - arguments: "ctr.tmp_0" - } - inputs { - parameter: "Out@GRAD" - arguments: "ctr.tmp_0@GRAD" - } - outputs { - parameter: "X@GRAD" - arguments: "clip_0.tmp_0@GRAD" - } - type: "sigmoid_grad" - attrs { - name: "is_test" - type: BOOLEAN - b: false - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/layer_function_generator.py\", line 247, in func\n helper.append_op(type=op_type, inputs={\"X\": x}, outputs={\"Out\": output})\n" - strings: " File \"update.py\", line 49, in inference\n ctr_output = fluid.layers.sigmoid(fluid.layers.clip(net, min=-15.0, max=15.0), name=\"ctr\")\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "Out@GRAD" - arguments: "clip_0.tmp_0@GRAD" - } - inputs { - parameter: "X" - arguments: "fc_5.tmp_1" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_5.tmp_1@GRAD" - } - type: "clip_grad" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 10681, in 
clip\n outputs={\"Out\": out})\n" - strings: " File \"update.py\", line 49, in inference\n ctr_output = fluid.layers.sigmoid(fluid.layers.clip(net, min=-15.0, max=15.0), name=\"ctr\")\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 1 - } - attrs { - name: "max" - type: FLOAT - f: 15.0 - } - attrs { - name: "min" - type: FLOAT - f: -15.0 - } - } - ops { - inputs { - parameter: "Out@GRAD" - arguments: "fc_5.tmp_1@GRAD" - } - inputs { - parameter: "Y" - arguments: "fc_5.b_0" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_5.tmp_0@GRAD" - } - outputs { - parameter: "Y@GRAD" - arguments: "fc_5.b_0@GRAD" - } - type: "elementwise_add_grad" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - strings: "fc_5.b_0" - strings: "fc_5.b_0@GRAD" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 1 - } - attrs { - name: "axis" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "Out@GRAD" - arguments: "fc_5.tmp_0@GRAD" - } - inputs { - parameter: "X" - arguments: "fc_4.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_5.w_0" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_4.tmp_2@GRAD" - } - outputs { - parameter: "Y@GRAD" - arguments: "fc_5.w_0@GRAD" - } - type: "mul_grad" - attrs { - name: "op_role_var" - type: STRINGS - strings: "fc_5.w_0" - strings: "fc_5.w_0@GRAD" - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 1 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File 
\"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - } - ops { - inputs { - parameter: "Out" - arguments: "fc_4.tmp_2" - } - inputs { - parameter: "Out@GRAD" - arguments: "fc_4.tmp_2@GRAD" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_4.tmp_1@GRAD" - } - type: "relu_grad" - attrs { - name: "is_test" - type: BOOLEAN - b: false - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n return helper.append_activation(pre_activation)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "Out@GRAD" - arguments: "fc_4.tmp_1@GRAD" - } - inputs { - parameter: "Y" - arguments: "fc_4.b_0" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_4.tmp_0@GRAD" - } - outputs { - parameter: "Y@GRAD" - arguments: "fc_4.b_0@GRAD" - } - type: "elementwise_add_grad" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - 
strings: "fc_4.b_0" - strings: "fc_4.b_0@GRAD" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 1 - } - attrs { - name: "axis" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "Out@GRAD" - arguments: "fc_4.tmp_0@GRAD" - } - inputs { - parameter: "X" - arguments: "fc_3.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_4.w_0" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_3.tmp_2@GRAD" - } - outputs { - parameter: "Y@GRAD" - arguments: "fc_4.w_0@GRAD" - } - type: "mul_grad" - attrs { - name: "op_role_var" - type: STRINGS - strings: "fc_4.w_0" - strings: "fc_4.w_0@GRAD" - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 1 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - } - ops { - inputs { - parameter: "Out" - 
arguments: "fc_3.tmp_2" - } - inputs { - parameter: "Out@GRAD" - arguments: "fc_3.tmp_2@GRAD" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_3.tmp_1@GRAD" - } - type: "relu_grad" - attrs { - name: "is_test" - type: BOOLEAN - b: false - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n return helper.append_activation(pre_activation)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "Out@GRAD" - arguments: "fc_3.tmp_1@GRAD" - } - inputs { - parameter: "Y" - arguments: "fc_3.b_0" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_3.tmp_0@GRAD" - } - outputs { - parameter: "Y@GRAD" - arguments: "fc_3.b_0@GRAD" - } - type: "elementwise_add_grad" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - strings: "fc_3.b_0" - strings: "fc_3.b_0@GRAD" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - 
strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 1 - } - attrs { - name: "axis" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "Out@GRAD" - arguments: "fc_3.tmp_0@GRAD" - } - inputs { - parameter: "X" - arguments: "fc_2.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_3.w_0" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_2.tmp_2@GRAD" - } - outputs { - parameter: "Y@GRAD" - arguments: "fc_3.w_0@GRAD" - } - type: "mul_grad" - attrs { - name: "op_role_var" - type: STRINGS - strings: "fc_3.w_0" - strings: "fc_3.w_0@GRAD" - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 1 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - } - ops { - inputs { - parameter: "Out" - arguments: "fc_2.tmp_2" - } - inputs { - parameter: "Out@GRAD" - arguments: "fc_2.tmp_2@GRAD" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_2.tmp_1@GRAD" - } - type: "relu_grad" - attrs { - name: "is_test" - type: BOOLEAN - b: false - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n return helper.append_activation(pre_activation)\n" - strings: " File 
\"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "Out@GRAD" - arguments: "fc_2.tmp_1@GRAD" - } - inputs { - parameter: "Y" - arguments: "fc_2.b_0" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_2.tmp_0@GRAD" - } - outputs { - parameter: "Y@GRAD" - arguments: "fc_2.b_0@GRAD" - } - type: "elementwise_add_grad" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - strings: "fc_2.b_0" - strings: "fc_2.b_0@GRAD" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 1 - } - attrs { - name: "axis" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "Out@GRAD" - arguments: "fc_2.tmp_0@GRAD" - } - inputs { - parameter: "X" - arguments: "fc_1.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_2.w_0" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_1.tmp_2@GRAD" - } - outputs { - parameter: "Y@GRAD" - arguments: "fc_2.w_0@GRAD" - } - type: "mul_grad" - attrs { - name: "op_role_var" - type: STRINGS - strings: "fc_2.w_0" - strings: "fc_2.w_0@GRAD" - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 1 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File 
\"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - } - ops { - inputs { - parameter: "Out" - arguments: "fc_1.tmp_2" - } - inputs { - parameter: "Out@GRAD" - arguments: "fc_1.tmp_2@GRAD" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_1.tmp_1@GRAD" - } - type: "relu_grad" - attrs { - name: "is_test" - type: BOOLEAN - b: false - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n return helper.append_activation(pre_activation)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "Out@GRAD" - arguments: "fc_1.tmp_1@GRAD" - } - inputs { - parameter: "Y" - arguments: "fc_1.b_0" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_1.tmp_0@GRAD" - } - outputs { - parameter: "Y@GRAD" - arguments: "fc_1.b_0@GRAD" - } - type: "elementwise_add_grad" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - strings: "fc_1.b_0" - strings: "fc_1.b_0@GRAD" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File 
\"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 1 - } - attrs { - name: "axis" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "Out@GRAD" - arguments: "fc_1.tmp_0@GRAD" - } - inputs { - parameter: "X" - arguments: "fc_0.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_1.w_0" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_0.tmp_2@GRAD" - } - outputs { - parameter: "Y@GRAD" - arguments: "fc_1.w_0@GRAD" - } - type: "mul_grad" - attrs { - name: "op_role_var" - type: STRINGS - strings: "fc_1.w_0" - strings: "fc_1.w_0@GRAD" - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 1 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - } - ops { - inputs { - parameter: "Out" - arguments: "fc_0.tmp_2" - } - inputs { - parameter: "Out@GRAD" - arguments: "fc_0.tmp_2@GRAD" - } - outputs { - 
parameter: "X@GRAD" - arguments: "fc_0.tmp_1@GRAD" - } - type: "relu_grad" - attrs { - name: "is_test" - type: BOOLEAN - b: false - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n return helper.append_activation(pre_activation)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "Out@GRAD" - arguments: "fc_0.tmp_1@GRAD" - } - inputs { - parameter: "Y" - arguments: "fc_0.b_0" - } - outputs { - parameter: "X@GRAD" - arguments: "fc_0.tmp_0@GRAD" - } - outputs { - parameter: "Y@GRAD" - arguments: "fc_0.b_0@GRAD" - } - type: "elementwise_add_grad" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - strings: "fc_0.b_0" - strings: "fc_0.b_0@GRAD" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - 
type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 1 - } - attrs { - name: "axis" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "Out@GRAD" - arguments: "fc_0.tmp_0@GRAD" - } - inputs { - parameter: "X" - arguments: "cvm_input" - } - inputs { - parameter: "Y" - arguments: "fc_0.w_0" - } - outputs { - parameter: "X@GRAD" - arguments: "cvm_input@GRAD" - } - outputs { - parameter: "Y@GRAD" - arguments: "fc_0.w_0@GRAD" - } - type: "mul_grad" - attrs { - name: "op_role_var" - type: STRINGS - strings: "fc_0.w_0" - strings: "fc_0.w_0@GRAD" - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 1 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - } -} -version { - version: 0 -} diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/model/update/model.yaml b/paddle/fluid/train/custom_trainer/feed/scripts/model/update/model.yaml deleted file mode 100644 index 3bcc23d3..00000000 --- a/paddle/fluid/train/custom_trainer/feed/scripts/model/update/model.yaml +++ /dev/null @@ -1,90 +0,0 @@ -aa_Attention: Do Not Modify This File Manually, Unless You Really Know It -input_accessor: -- class: AbacusSparseUpdateAccessor - input: - - name: cvm_input - slot_dim: 9 - slots: [6048, 6002, 6145, 6202, 6201, 6121, 6738, 6119, 6146, 6120, 6147, 6122, - 6123, 6118, 6142, 6143, 6008, 6148, 6151, 6127, 6144, 6094, 6083, 6952, 6739, - 6150, 6109, 6003, 6099, 6149, 6129, 6203, 6153, 6152, 6128, 6106, 6251, 7082, - 7515, 6951, 6949, 7080, 6066, 7507, 6186, 6007, 7514, 6125, 7506, 10001, 6006, - 7023, 6085, 10000, 6098, 6250, 6110, 6124, 6090, 6082, 6067, 6101, 6004, 6191, - 7075, 6948, 6157, 6126, 6188, 7077, 6070, 6111, 6087, 6103, 6107, 6194, 6156, - 6005, 6247, 6814, 6158, 7122, 6058, 6189, 7058, 6059, 6115, 7079, 7081, 6833, - 7024, 6108, 13342, 13345, 13412, 13343, 13350, 13346, 13409, 6009, 6011, 6012, - 6013, 6014, 6015, 6019, 6023, 6024, 6027, 6029, 6031, 6050, 6060, 6068, 6069, - 6089, 6095, 6105, 6112, 6130, 6131, 6132, 6134, 6161, 6162, 6163, 6166, 6182, - 6183, 6185, 6190, 6212, 6213, 6231, 6233, 6234, 6236, 6238, 6239, 6240, 6241, - 
6242, 6243, 6244, 6245, 6354, 7002, 7005, 7008, 7010, 7012, 7013, 7015, 7016, - 7017, 7018, 7019, 7020, 7045, 7046, 7048, 7049, 7052, 7054, 7056, 7064, 7066, - 7076, 7078, 7083, 7084, 7085, 7086, 7087, 7088, 7089, 7090, 7099, 7100, 7101, - 7102, 7103, 7104, 7105, 7109, 7124, 7126, 7136, 7142, 7143, 7144, 7145, 7146, - 7147, 7148, 7150, 7151, 7152, 7153, 7154, 7155, 7156, 7157, 7047, 7050, 6253, - 6254, 6255, 6256, 6257, 6259, 6260, 6261, 7170, 7185, 7186, 6751, 6755, 6757, - 6759, 6760, 6763, 6764, 6765, 6766, 6767, 6768, 6769, 6770, 7502, 7503, 7504, - 7505, 7510, 7511, 7512, 7513, 6806, 6807, 6808, 6809, 6810, 6811, 6812, 6813, - 6815, 6816, 6817, 6819, 6823, 6828, 6831, 6840, 6845, 6875, 6879, 6881, 6888, - 6889, 6947, 6950, 6956, 6957, 6959, 10006, 10008, 10009, 10010, 10011, 10016, - 10017, 10018, 10019, 10020, 10021, 10022, 10023, 10024, 10029, 10030, 10031, - 10032, 10033, 10034, 10035, 10036, 10037, 10038, 10039, 10040, 10041, 10042, - 10044, 10045, 10046, 10051, 10052, 10053, 10054, 10055, 10056, 10057, 10060, - 10066, 10069, 6820, 6821, 6822, 13333, 13334, 13335, 13336, 13337, 13338, 13339, - 13340, 13341, 13351, 13352, 13353, 13359, 13361, 13362, 13363, 13366, 13367, - 13368, 13369, 13370, 13371, 13375, 13376, 5700, 5702, 13400, 13401, 13402, 13403, - 13404, 13406, 13407, 13408, 13410, 13417, 13418, 13419, 13420, 13422, 13425, - 13427, 13428, 13429, 13430, 13431, 13433, 13434, 13436, 13437, 13326, 13330, - 13331, 5717, 13442, 13451, 13452, 13455, 13456, 13457, 13458, 13459, 13460, - 13461, 13462, 13463, 13464, 13465, 13466, 13467, 13468, 1104, 1106, 1107, 1108, - 1109, 1110, 1111, 1112, 1113, 1114, 1115, 1116, 1117, 1119, 1120, 1121, 1122, - 1123, 1124, 1125, 1126, 1127, 1128, 1129, 13812, 13813, 6740, 1490, 1491] - need_gradient: true - table_id: 0 -- async_pull: true - class: DenseInputAccessor - input: - - name: fc_0.w_0 - shape: [3672, 511] - - name: fc_0.b_0 - shape: [511] - - name: fc_1.w_0 - shape: [511, 255] - - name: fc_1.b_0 - shape: [255] - - name: fc_2.w_0 - shape: [255, 127] - - name: fc_2.b_0 - shape: [127] - - name: fc_3.w_0 - shape: [127, 127] - - name: fc_3.b_0 - shape: [127] - - name: fc_4.w_0 - shape: [127, 127] - - name: fc_4.b_0 - shape: [127] - - name: fc_5.w_0 - shape: [127, 1] - - name: fc_5.b_0 - shape: [1] - need_gradient: true - table_id: 5 -- {adjw_ratio: 20, adjw_threshold: 1000, class: WeightsAdjustAccessor, input: ins_weight, - slot_id: 6002} -- class: LabelInputAccessor - input: - - label_name: label_ctr - output_name: ctr.tmp_0 - shape: [-1, 1] -inputs: -- name: cvm_input - shape: [-1, 3672] -labels: -- name: label_ctr - shape: [-1, 1] -loss: loss_ctr -monitor: -- {class: AucMonitor, compute_interval: 600, name: epoch_auc, target: ctr.tmp_0, target_idx: 0} -- {class: AucMonitor, compute_interval: 86400, name: day_auc, target: ctr.tmp_0, target_idx: 0} -outputs: -- name: ctr.tmp_0 - shape: [-1, 1] diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/model/update/startup_program b/paddle/fluid/train/custom_trainer/feed/scripts/model/update/startup_program deleted file mode 100644 index d826974e2c46010b9ad18c09893a96774c97694c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 19253 [base85 binary payload truncated in extraction; only unusable fragments survive]
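The DenseInputAccessor block in the deleted model.yaml gives the full dense parameter list: six fc layers carrying the 3672-wide cvm_input through 511, 255, 127, 127, 127 down to a single logit, matching the mul_grad/elementwise_add_grad/relu_grad chain in the deleted main program above. Below is a sketch of the stack those shapes and the update.py:47 callstack frames imply; the scales list is computed inside update.py and its values are not visible here, so it is left as a parameter:

    import paddle.fluid as fluid

    def inference(cvm_input, scales):
        # fc_0 .. fc_5; sizes from the model.yaml shapes; relu on all but the last
        # (the program has relu_grad ops for fc_0..fc_4 only; fc_5 feeds clip/sigmoid)
        layer_sizes = [511, 255, 127, 127, 127, 1]
        acts = ['relu'] * 5 + [None]
        net = cvm_input  # shape [-1, 3672]
        for i, (size, act) in enumerate(zip(layer_sizes, acts)):
            net = fluid.layers.fc(
                input=net, size=size, act=act,
                param_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.NormalInitializer(
                        loc=0.0, scale=1.0 * scales[i])))
        return net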
[the hunk header of the next deleted file was also lost in extraction; its program dump resumes mid-callstack] main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "dtype" - type: INT - i: 5 - } - attrs { - name: "seed" - type: INT - i: 0 - } - attrs { - name: "std" - type: FLOAT - f: 0.017747130245 - } - attrs { - name: "mean" - type: FLOAT - f: 0.0 - } - attrs { - name: "shape" - type: LONGS - longs: 1 - } - } - ops { - outputs { - parameter: "Out" - arguments: "fc_5.w_0" - } - type: "gaussian_random" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1851, in _prepend_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/initializer.py\", line 356, in __call__\n stop_gradient=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1736, in create_parameter\n initializer(param, self)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper_base.py\", line 328, in create_parameter\n **attr._to_kwargs(with_initializer=True))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 337, in fc\n attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "dtype" - type: INT - i: 5 - } - attrs { - name: "seed" - type: INT - i: 0 - } - attrs { - name: "std" - type: FLOAT - f: 0.017747130245 - } - attrs { - name: "mean" - type: FLOAT - f: 0.0 - } - attrs { - name: "shape" - type: LONGS - longs: 127 - longs: 1 - } - } - ops { - outputs { - parameter: "Out" - arguments: "fc_4.b_0" - } - type: "gaussian_random" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File 
\"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1851, in _prepend_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/initializer.py\", line 356, in __call__\n stop_gradient=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1736, in create_parameter\n initializer(param, self)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper_base.py\", line 328, in create_parameter\n **attr._to_kwargs(with_initializer=True))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 128, in append_bias_op\n attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "dtype" - type: INT - i: 5 - } - attrs { - name: "seed" - type: INT - i: 0 - } - attrs { - name: "std" - type: FLOAT - f: 0.017747130245 - } - attrs { - name: "mean" - type: FLOAT - f: 0.0 - } - attrs { - name: "shape" - type: LONGS - longs: 127 - } - } - ops { - outputs { - parameter: "Out" - arguments: "fc_4.w_0" - } - type: "gaussian_random" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1851, in _prepend_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/initializer.py\", line 356, in __call__\n stop_gradient=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1736, in create_parameter\n initializer(param, self)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper_base.py\", line 328, in create_parameter\n **attr._to_kwargs(with_initializer=True))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 337, in fc\n attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", 
line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "dtype" - type: INT - i: 5 - } - attrs { - name: "seed" - type: INT - i: 0 - } - attrs { - name: "std" - type: FLOAT - f: 0.017747130245 - } - attrs { - name: "mean" - type: FLOAT - f: 0.0 - } - attrs { - name: "shape" - type: LONGS - longs: 127 - longs: 127 - } - } - ops { - outputs { - parameter: "Out" - arguments: "fc_3.b_0" - } - type: "gaussian_random" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1851, in _prepend_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/initializer.py\", line 356, in __call__\n stop_gradient=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1736, in create_parameter\n initializer(param, self)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper_base.py\", line 328, in create_parameter\n **attr._to_kwargs(with_initializer=True))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 128, in append_bias_op\n attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "dtype" - type: INT - i: 5 - } - attrs { - name: "seed" - type: INT - i: 0 - } - attrs { - name: "std" - type: FLOAT - f: 0.017747130245 - } - attrs { - name: "mean" - type: FLOAT - f: 0.0 - } - attrs { - name: "shape" - type: LONGS - longs: 127 - } - } - ops { - outputs { - parameter: "Out" - arguments: "fc_3.w_0" - } - type: "gaussian_random" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1851, in _prepend_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/initializer.py\", line 356, in __call__\n stop_gradient=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1736, in create_parameter\n initializer(param, self)\n" - strings: " File 
\"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper_base.py\", line 328, in create_parameter\n **attr._to_kwargs(with_initializer=True))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 337, in fc\n attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "dtype" - type: INT - i: 5 - } - attrs { - name: "seed" - type: INT - i: 0 - } - attrs { - name: "std" - type: FLOAT - f: 0.017747130245 - } - attrs { - name: "mean" - type: FLOAT - f: 0.0 - } - attrs { - name: "shape" - type: LONGS - longs: 127 - longs: 127 - } - } - ops { - outputs { - parameter: "Out" - arguments: "fc_2.b_0" - } - type: "gaussian_random" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1851, in _prepend_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/initializer.py\", line 356, in __call__\n stop_gradient=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1736, in create_parameter\n initializer(param, self)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper_base.py\", line 328, in create_parameter\n **attr._to_kwargs(with_initializer=True))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 128, in append_bias_op\n attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "dtype" - type: INT - i: 5 - } - attrs { - name: "seed" - type: INT - i: 0 - } - attrs { - name: "std" - type: FLOAT - f: 0.0125244855881 - } - attrs { - name: "mean" - type: FLOAT - f: 0.0 - } - attrs { - name: "shape" - type: LONGS - longs: 127 - } - } - ops { - outputs { - parameter: "Out" 
- arguments: "fc_2.w_0" - } - type: "gaussian_random" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1851, in _prepend_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/initializer.py\", line 356, in __call__\n stop_gradient=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1736, in create_parameter\n initializer(param, self)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper_base.py\", line 328, in create_parameter\n **attr._to_kwargs(with_initializer=True))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 337, in fc\n attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "dtype" - type: INT - i: 5 - } - attrs { - name: "seed" - type: INT - i: 0 - } - attrs { - name: "std" - type: FLOAT - f: 0.0125244855881 - } - attrs { - name: "mean" - type: FLOAT - f: 0.0 - } - attrs { - name: "shape" - type: LONGS - longs: 255 - longs: 127 - } - } - ops { - outputs { - parameter: "Out" - arguments: "fc_1.b_0" - } - type: "gaussian_random" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1851, in _prepend_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/initializer.py\", line 356, in __call__\n stop_gradient=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1736, in create_parameter\n initializer(param, self)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper_base.py\", line 328, in create_parameter\n **attr._to_kwargs(with_initializer=True))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 128, in append_bias_op\n attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in 
build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "dtype" - type: INT - i: 5 - } - attrs { - name: "seed" - type: INT - i: 0 - } - attrs { - name: "std" - type: FLOAT - f: 0.00884747877717 - } - attrs { - name: "mean" - type: FLOAT - f: 0.0 - } - attrs { - name: "shape" - type: LONGS - longs: 255 - } - } - ops { - outputs { - parameter: "Out" - arguments: "fc_1.w_0" - } - type: "gaussian_random" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1851, in _prepend_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/initializer.py\", line 356, in __call__\n stop_gradient=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1736, in create_parameter\n initializer(param, self)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper_base.py\", line 328, in create_parameter\n **attr._to_kwargs(with_initializer=True))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 337, in fc\n attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "dtype" - type: INT - i: 5 - } - attrs { - name: "seed" - type: INT - i: 0 - } - attrs { - name: "std" - type: FLOAT - f: 0.00884747877717 - } - attrs { - name: "mean" - type: FLOAT - f: 0.0 - } - attrs { - name: "shape" - type: LONGS - longs: 511 - longs: 255 - } - } - ops { - outputs { - parameter: "Out" - arguments: "fc_0.b_0" - } - type: "gaussian_random" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1851, in _prepend_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/initializer.py\", line 356, in __call__\n stop_gradient=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1736, in create_parameter\n initializer(param, self)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper_base.py\", 
line 328, in create_parameter\n **attr._to_kwargs(with_initializer=True))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 128, in append_bias_op\n attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "dtype" - type: INT - i: 5 - } - attrs { - name: "seed" - type: INT - i: 0 - } - attrs { - name: "std" - type: FLOAT - f: 0.00330049172044 - } - attrs { - name: "mean" - type: FLOAT - f: 0.0 - } - attrs { - name: "shape" - type: LONGS - longs: 511 - } - } - ops { - outputs { - parameter: "Out" - arguments: "fc_0.w_0" - } - type: "gaussian_random" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1851, in _prepend_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/initializer.py\", line 356, in __call__\n stop_gradient=True)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1736, in create_parameter\n initializer(param, self)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper_base.py\", line 328, in create_parameter\n **attr._to_kwargs(with_initializer=True))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 337, in fc\n attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "dtype" - type: INT - i: 5 - } - attrs { - name: "seed" - type: INT - i: 0 - } - attrs { - name: "std" - type: FLOAT - f: 0.00330049172044 - } - attrs { - name: "mean" - type: FLOAT - f: 0.0 - } - attrs { - name: "shape" - type: LONGS - longs: 3672 - longs: 511 - } - } -} -version { - version: 0 -} diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/model/update/test_program 
b/paddle/fluid/train/custom_trainer/feed/scripts/model/update/test_program deleted file mode 100644
index bc6263eadd520f558313e807611ff4a0775bbcf8..0000000000000000000000000000000000000000
GIT binary patch (base85-encoded payload omitted: literal 0 on the new side, literal 24920 bytes on the deleted side)
diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/model/update/test_program.pbtxt b/paddle/fluid/train/custom_trainer/feed/scripts/model/update/test_program.pbtxt
deleted file mode 100644
index 1aaba4ef..00000000
--- a/paddle/fluid/train/custom_trainer/feed/scripts/model/update/test_program.pbtxt
+++ /dev/null
@@ -1,1642 +0,0 @@
-blocks { - idx: 0 - parent_idx: -1 - vars { - name: "fc_4.w_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 127 - dims: 127 - } - } - } - persistable: true - } - vars { - name: "fc_0.tmp_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 511 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_3.tmp_1" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_0.b_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 511 - } - } - } - persistable: true - } - vars { - name: "fc_1.b_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 255 - } - } - } - persistable: true - } - vars { - name: "fc_2.tmp_2" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "cvm_input" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 3672 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_3.tmp_2" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_2.tmp_1" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_1.tmp_1" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 255 - } - lod_level: 0 - }
- } - persistable: false - } - vars { - name: "fc_4.tmp_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_4.b_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 127 - } - } - } - persistable: true - } - vars { - name: "fc_0.tmp_1" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 511 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_1.tmp_2" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 255 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_4.tmp_1" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_5.w_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 127 - dims: 1 - } - } - } - persistable: true - } - vars { - name: "fc_5.tmp_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 1 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "ctr.tmp_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 1 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_0.tmp_2" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 511 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_5.tmp_1" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 1 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_4.tmp_2" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_5.b_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 1 - } - } - } - persistable: true - } - vars { - name: "fc_1.w_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 511 - dims: 255 - } - } - } - persistable: true - } - vars { - name: "fc_3.b_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 127 - } - } - } - persistable: true - } - vars { - name: "fc_0.w_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 3672 - dims: 511 - } - } - } - persistable: true - } - vars { - name: "fc_1.tmp_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 255 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_2.w_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 255 - dims: 127 - } - } - } - persistable: true - } - vars { - name: "clip_0.tmp_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 1 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_3.w_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 127 - dims: 127 - } - } - } - persistable: true - } - vars { - name: "fc_3.tmp_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - lod_level: 0 - } - } - persistable: false - } - vars { - name: "fc_2.tmp_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: -1 - dims: 127 - } - lod_level: 0 - } - } - persistable: 
false - } - vars { - name: "fc_2.b_0" - type { - type: LOD_TENSOR - lod_tensor { - tensor { - data_type: FP32 - dims: 127 - } - } - } - persistable: true - } - ops { - inputs { - parameter: "X" - arguments: "cvm_input" - } - inputs { - parameter: "Y" - arguments: "fc_0.w_0" - } - outputs { - parameter: "Out" - arguments: "fc_0.tmp_0" - } - type: "mul" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_0.tmp_0" - } - inputs { - parameter: "Y" - arguments: "fc_0.b_0" - } - outputs { - parameter: "Out" - arguments: "fc_0.tmp_1" - } - type: "elementwise_add" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File 
\"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "axis" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_0.tmp_1" - } - outputs { - parameter: "Out" - arguments: "fc_0.tmp_2" - } - type: "relu" - attrs { - name: "is_test" - type: INT - i: 1 - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n return helper.append_activation(pre_activation)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_0.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_1.w_0" - } - outputs { - parameter: "Out" - arguments: "fc_1.tmp_0" - } - type: "mul" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File 
\"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_1.tmp_0" - } - inputs { - parameter: "Y" - arguments: "fc_1.b_0" - } - outputs { - parameter: "Out" - arguments: "fc_1.tmp_1" - } - type: "elementwise_add" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "axis" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_1.tmp_1" - } - outputs { - parameter: "Out" - arguments: "fc_1.tmp_2" - } - type: "relu" - attrs { - name: "is_test" - type: INT - i: 1 - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n return helper.append_activation(pre_activation)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 
100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_1.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_2.w_0" - } - outputs { - parameter: "Out" - arguments: "fc_2.tmp_0" - } - type: "mul" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_2.tmp_0" - } - inputs { - parameter: "Y" - arguments: "fc_2.b_0" - } - outputs { - parameter: "Out" - arguments: "fc_2.tmp_1" - } - type: "elementwise_add" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"update.py\", line 47, in inference\n 
initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "axis" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_2.tmp_1" - } - outputs { - parameter: "Out" - arguments: "fc_2.tmp_2" - } - type: "relu" - attrs { - name: "is_test" - type: INT - i: 1 - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n return helper.append_activation(pre_activation)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_2.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_3.w_0" - } - outputs { - parameter: "Out" - arguments: "fc_3.tmp_0" - } - type: "mul" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - 
strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_3.tmp_0" - } - inputs { - parameter: "Y" - arguments: "fc_3.b_0" - } - outputs { - parameter: "Out" - arguments: "fc_3.tmp_1" - } - type: "elementwise_add" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "axis" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_3.tmp_1" - } - outputs { - parameter: "Out" - arguments: "fc_3.tmp_2" - } - type: "relu" - attrs { - name: "is_test" - type: INT - i: 1 - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n return 
helper.append_activation(pre_activation)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_3.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_4.w_0" - } - outputs { - parameter: "Out" - arguments: "fc_4.tmp_0" - } - type: "mul" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_4.tmp_0" - } - inputs { - parameter: "Y" - arguments: "fc_4.b_0" - } - outputs { - parameter: "Out" - arguments: "fc_4.tmp_1" - } - type: "elementwise_add" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File 
\"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "axis" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_4.tmp_1" - } - outputs { - parameter: "Out" - arguments: "fc_4.tmp_2" - } - type: "relu" - attrs { - name: "is_test" - type: INT - i: 1 - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 159, in append_activation\n attrs=act)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 360, in fc\n return helper.append_activation(pre_activation)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_4.tmp_2" - } - inputs { - parameter: "Y" - arguments: "fc_5.w_0" - } - outputs { - parameter: "Out" - arguments: "fc_5.tmp_0" - } - type: "mul" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "x_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "force_fp32_output" - type: BOOLEAN - b: false - } - attrs { - name: "y_num_col_dims" - type: INT - i: 1 - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File 
\"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 345, in fc\n \"y_num_col_dims\": 1})\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "scale_out" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_x" - type: FLOAT - f: 1.0 - } - attrs { - name: "scale_y" - type: FLOATS - floats: 1.0 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_5.tmp_0" - } - inputs { - parameter: "Y" - arguments: "fc_5.b_0" - } - outputs { - parameter: "Out" - arguments: "fc_5.tmp_1" - } - type: "elementwise_add" - attrs { - name: "y_data_format" - type: STRING - s: "" - } - attrs { - name: "x_data_format" - type: STRING - s: "" - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 135, in append_bias_op\n attrs={\'axis\': dim_start})\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 358, in fc\n pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims)\n" - strings: " File \"update.py\", line 47, in inference\n initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in \n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "axis" - type: INT - i: 1 - } - } - ops { - inputs { - parameter: "X" - arguments: "fc_5.tmp_1" - } - outputs { - parameter: "Out" - arguments: "clip_0.tmp_0" - } - type: "clip" - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/nn.py\", line 10681, in clip\n outputs={\"Out\": out})\n" - strings: " File \"update.py\", line 49, in 
inference\n ctr_output = fluid.layers.sigmoid(fluid.layers.clip(net, min=-15.0, max=15.0), name=\"ctr\")\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in <module>\n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - attrs { - name: "max" - type: FLOAT - f: 15.0 - } - attrs { - name: "min" - type: FLOAT - f: -15.0 - } - } - ops { - inputs { - parameter: "X" - arguments: "clip_0.tmp_0" - } - outputs { - parameter: "Out" - arguments: "ctr.tmp_0" - } - type: "sigmoid" - attrs { - name: "is_test" - type: INT - i: 1 - } - attrs { - name: "use_cudnn" - type: BOOLEAN - b: false - } - attrs { - name: "use_mkldnn" - type: BOOLEAN - b: false - } - attrs { - name: "op_role_var" - type: STRINGS - } - attrs { - name: "op_callstack" - type: STRINGS - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/framework.py\", line 1780, in append_op\n attrs=kwargs.get(\"attrs\", None))\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layer_helper.py\", line 43, in append_op\n return self.main_program.current_block().append_op(*args, **kwargs)\n" - strings: " File \"/home/xiexionghang/paddle/py-paddle/python/lib/python2.7/site-packages/paddle/fluid/layers/layer_function_generator.py\", line 247, in func\n helper.append_op(type=op_type, inputs={\"X\": x}, outputs={\"Out\": output})\n" - strings: " File \"update.py\", line 49, in inference\n ctr_output = fluid.layers.sigmoid(fluid.layers.clip(net, min=-15.0, max=15.0), name=\"ctr\")\n" - strings: " File \"create_programs.py\", line 100, in build_and_save\n inference_info = self._inference()\n" - strings: " File \"create_programs.py\", line 200, in main\n builder.build_and_save()\n" - strings: " File \"create_programs.py\", line 203, in <module>\n main(sys.argv)\n" - } - attrs { - name: "op_namescope" - type: STRING - s: "/" - } - attrs { - name: "op_role" - type: INT - i: 0 - } - } -} -version { - version: 0 -}
diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/start_feed_trainer.sh b/paddle/fluid/train/custom_trainer/feed/scripts/start_feed_trainer.sh
deleted file mode 100755
index d31f61e9..00000000
--- a/paddle/fluid/train/custom_trainer/feed/scripts/start_feed_trainer.sh
+++ /dev/null
@@ -1,50 +0,0 @@
-#!/bin/bash
-BIN_FILE=feed_trainer
-work_dir=`pwd`
-
-function usage() {
-    echo -e "\033[41mUSAGE: sh scripts/start_feed_trainer.sh [run_mode]\033[0m"
-    echo "run_mode=mpi, run job in mpi cluster"
-    echo "run_mode=mpi_tmp, run 1 node with mpi in /tmp"
-    echo "run_mode=local, run 1 node in local"
-    echo "Example: sh scripts/start_feed_trainer.sh local"
-    exit 0
-}
-if [ $# -lt 1 ];then
-    run_mode="mpi"
-else
-    run_mode="$1"
-fi
-
-export PATH=/usr/local/openmpi/bin:$PATH
-export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/openmpi/lib/
-if [ "${run_mode}" = "mpi" ];then
-    mpirun mv package/* .
-    export HADOOP_HOME="./hadoop-client/hadoop"
-    export PATH=$HADOOP_HOME/bin/:./bin:$PATH
-    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./so
-    mpirun sed -i 's/LocalRuntimeEnvironment/MPIRuntimeEnvironment/g' conf/*.yaml
-    export HADOOP_HOME="./hadoop-client/hadoop"
-    export PATH=$HADOOP_HOME/bin/:/bin:$PATH
-    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./so
-
-    GLOG_logtostderr=0 mpirun -npernode 2 -timestamp-output -tag-output --prefix $work_dir ./bin/feed_trainer --log_dir=log
-elif [ "${run_mode}" = "mpi_tmp" ];then
-    mv package/* .
-    mkdir -p temp
-    export HADOOP_HOME="$work_dir/hadoop-client/hadoop"
-    export PATH=$HADOOP_HOME/bin/:/bin:$PATH
-    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${work_dir}/so
-    sed -i 's/LocalRuntimeEnvironment/MPIRuntimeEnvironment/g' conf/*.yaml
-    mpirun -npernode 2 -timestamp-output -tag-output --prefix $work_dir --mca orte_tmpdir_base ${work_dir}/temp scripts/start_feed_trainer.sh random_log
-elif [ "${run_mode}" = "local" ];then
-    sed -i 's/MPIRuntimeEnvironment/LocalRuntimeEnvironment/g' conf/*.yaml
-    export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${work_dir}/so
-    mkdir -p log
-    ./bin/feed_trainer --log_dir=log
-elif [ "${run_mode}" = "random_log" ];then
-    log_dir="log/log.${RANDOM}"
-    mkdir -p ${log_dir}
-    ./bin/feed_trainer --log_dir=${log_dir}
-else
-    usage
-fi
diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/submit_mpi.sh b/paddle/fluid/train/custom_trainer/feed/scripts/submit_mpi.sh
deleted file mode 100755
index df7c6a86..00000000
--- a/paddle/fluid/train/custom_trainer/feed/scripts/submit_mpi.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-
-export PATH=/bin/:$PATH
-set -x
-
-source conf/env.conf
-
-echo "# This file is automatically generated. Don't change it." > conf/qsub_f.conf
-echo "SERVER=$MPI_SERVER" >> conf/qsub_f.conf
-echo "QUEUE=$MPI_QUEUE" >> conf/qsub_f.conf
-echo "PRIORITY=$MPI_PRIORITY" >> conf/qsub_f.conf
-
-export HADOOP_HOME=$HADOOP_HOME
-
-sh scripts/compake_runable_package.sh
-
-$HPC_HOME/bin/qsub_f \
-    -N $MPI_JOB_NAME \
-    --conf conf/qsub_f.conf \
-    --hdfs $HADOOP_FS \
-    --ugi $HADOOP_UGI \
-    --hout $HDFS_ROOT \
-    --files package \
-    -l nodes=$MPI_NODE_NUM,walltime=$MPI_WALL_TIME,pmem-hard=$MPI_NODE_MEM,pcpu-soft=180,pnetin-soft=1000,pnetout-soft=1000 \
-    scripts/start_feed_trainer.sh
-
-if [ $? -ne 0 ]; then
-    echo -e "qsub_f failed, please check the config or get help from abacus RD\n"
-    exit -1
-fi
-
-exit 0
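Aside: the gaussian_random "std" attrs recorded in the deleted pbtxt dumps above (0.00330049, 0.00884748, 0.01252449, 0.01774713) are exactly what the scale loop in the update.py hunk below computes, init_range / sqrt(fan_in). A quick check in Python (fan-in widths taken from the hunks in this patch):

```python
# Reproduce the per-layer init std values seen in the deleted pbtxt dumps.
init_range = 0.2
fan_ins = [3672, 511, 255, 127, 127, 127]  # input width of fc_0 .. fc_5
for i, fan_in in enumerate(fan_ins):
    # Both the weight and the bias of layer i use scales[i] in update.py.
    print('fc_%d: std = %.8f' % (i, init_range / fan_in ** 0.5))
# fc_0: 0.00330049  fc_1: 0.00884748  fc_2: 0.01252449  fc_3..fc_5: 0.01774713
```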
diff --git a/paddle/fluid/train/custom_trainer/feed/scripts/update.py b/paddle/fluid/train/custom_trainer/feed/scripts/update.py
deleted file mode 100644
index 4be67e8c..00000000
--- a/paddle/fluid/train/custom_trainer/feed/scripts/update.py
+++ /dev/null
@@ -1,95 +0,0 @@
-#!/usr/bin/env python
-#-*- coding:utf-8 -*-
-
-"""
-This is an example of network building
-"""
-
-from __future__ import print_function, division
-import paddle
-from paddle import fluid
-
-def sparse_cvm_dim(sparse_info):
-    return sparse_info['slot_dim'] * len(sparse_info['slots'])
-
-def inference():
-    """Build inference network(without loss and optimizer)
-
-    Returns:
-        list: inputs
-        and
-        list: outputs
-    """
-    sparse_cvm = { "name": "cvm_input", "slot_dim" : 9, "slots": [6048,6002,6145,6202,6201,6121,6738,6119,6146,6120,6147,6122,6123,6118,6142,6143,6008,6148,6151,6127,6144,6094,6083,6952,6739,6150,6109,6003,6099,6149,6129,6203,6153,6152,6128,6106,6251,7082,7515,6951,6949,7080,6066,7507,6186,6007,7514,6125,7506,10001,6006,7023,6085,10000,6098,6250,6110,6124,6090,6082,6067,6101,6004,6191,7075,6948,6157,6126,6188,7077,6070,6111,6087,6103,6107,6194,6156,6005,6247,6814,6158,7122,6058,6189,7058,6059,6115,7079,7081,6833,7024,6108,13342,13345,13412,13343,13350,13346,13409,6009,6011,6012,6013,6014,6015,6019,6023,6024,6027,6029,6031,6050,6060,6068,6069,6089,6095,6105,6112,6130,6131,6132,6134,6161,6162,6163,6166,6182,6183,6185,6190,6212,6213,6231,6233,6234,6236,6238,6239,6240,6241,6242,6243,6244,6245,6354,7002,7005,7008,7010,7012,7013,7015,7016,7017,7018,7019,7020,7045,7046,7048,7049,7052,7054,7056,7064,7066,7076,7078,7083,7084,7085,7086,7087,7088,7089,7090,7099,7100,7101,7102,7103,7104,7105,7109,7124,7126,7136,7142,7143,7144,7145,7146,7147,7148,7150,7151,7152,7153,7154,7155,7156,7157,7047,7050,6253,6254,6255,6256,6257,6259,6260,6261,7170,7185,7186,6751,6755,6757,6759,6760,6763,6764,6765,6766,6767,6768,6769,6770,7502,7503,7504,7505,7510,7511,7512,7513,6806,6807,6808,6809,6810,6811,6812,6813,6815,6816,6817,6819,6823,6828,6831,6840,6845,6875,6879,6881,6888,6889,6947,6950,6956,6957,6959,10006,10008,10009,10010,10011,10016,10017,10018,10019,10020,10021,10022,10023,10024,10029,10030,10031,10032,10033,10034,10035,10036,10037,10038,10039,10040,10041,10042,10044,10045,10046,10051,10052,10053,10054,10055,10056,10057,10060,10066,10069,6820,6821,6822,13333,13334,13335,13336,13337,13338,13339,13340,13341,13351,13352,13353,13359,13361,13362,13363,13366,13367,13368,13369,13370,13371,13375,13376,5700,5702,13400,13401,13402,13403,13404,13406,13407,13408,13410,13417,13418,13419,13420,13422,13425,13427,13428,13429,13430,13431,13433,13434,13436,13437,13326,13330,13331,5717,13442,13451,13452,13455,13456,13457,13458,13459,13460,13461,13462,13463,13464,13465,13466,13467,13468,1104,1106,1107,1108,1109,1110,1111,1112,1113,1114,1115,1116,1117,1119,1120,1121,1122,1123,1124,1125,1126,1127,1128,1129,13812,13813,6740,1490,1491]}
-    # TODO: build network here
-    cvm_input = fluid.layers.data(name='cvm_input', shape=[sparse_cvm_dim(sparse_cvm)], dtype='float32', stop_gradient=False)
-
-    net = cvm_input
-    lr_x = 1.0
-    init_range = 0.2
-    fc_layers_size = [511, 255, 127, 127, 127, 1]
-    fc_layers_act = ["relu"] * (len(fc_layers_size) - 1) + [None]
-    scales_tmp = [net.shape[1]] + fc_layers_size
-    scales = []
-    for i in range(len(scales_tmp)):
-        scales.append(init_range / (scales_tmp[i] ** 0.5))
-    for i in range(len(fc_layers_size)):
-        net = fluid.layers.fc(
-                input = net,
-                size = fc_layers_size[i],
-                name = 'fc_' + str(i),
-                act = fc_layers_act[i],
-                param_attr = \
-                    fluid.ParamAttr(learning_rate=lr_x, \
-                        initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])),
-                bias_attr = \
-                    fluid.ParamAttr(learning_rate=lr_x, \
-                        initializer=fluid.initializer.NormalInitializer(loc=0.0, scale=1.0 * scales[i])))
-
-    ctr_output = fluid.layers.sigmoid(fluid.layers.clip(net, min=-15.0, max=15.0), name="ctr")
-
-    accessors = [
-        { "class": "AbacusSparseUpdateAccessor", "input": "sparses", "table_id": 0, "need_gradient": True},
-        { "class": "DenseInputAccessor", "input": "vars", "table_id": 3, "need_gradient": True, "async_pull": True},
-        { "class": "WeightsAdjustAccessor", "input": "ins_weight",
-          "slot_id": 6002, "adjw_ratio": 20, "adjw_threshold": 1000},
-        { "class": "LabelInputAccessor", "input": "labels"}
-    ]
-    monitors = [
-        { "name": "epoch_auc", "class": "AucMonitor", "target": ctr_output, "compute_interval": 600 },
-        { "name": "day_auc", "class": "AucMonitor", "target": ctr_output, "compute_interval": 86400 }
-    ]
-
-    return {
-        'accessors': accessors,
-        'monitors': monitors,
-        'sparses': [sparse_cvm],
-        'inputs': [cvm_input],
-        'outputs': [ctr_output]
-    }
-
-def loss_function(ctr_output):
-    """
-    Args:
-        *outputs: the second result of inference()
-
-    Returns:
-        Variable: loss
-        and
-        list: labels
-    """
-    # TODO: calc loss here
-    ins_weight = fluid.layers.data(
-        name="ins_weight",
-        shape=[-1, 1],
-        dtype="float32",
-        lod_level=0,
-        append_batch_size=False,
-        stop_gradient=True)
-
-    label = fluid.layers.data(name='label_ctr', shape=ctr_output.shape, dtype='float32')
-    loss = fluid.layers.log_loss(input=ctr_output, label=label)
-    loss = fluid.layers.elementwise_mul(loss, ins_weight)
-    loss = fluid.layers.mean(loss, name='loss_ctr')
-
-    return loss, [label]
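The fc stack in update.py draws each layer's weights from a normal distribution whose scale is init_range divided by the square root of that layer's fan-in, so wider layers start with proportionally smaller weights. A worked example of those scales (the 4410 input width below, slot_dim 9 times 490 slots, is illustrative, not a count taken from the slots list):

# scale_i = init_range / sqrt(fan_in_i), where fan_in_i is the input
# width of fc_i. Input width 4410 is illustrative only.
init_range = 0.2
fc_layers_size = [511, 255, 127, 127, 127, 1]
fan_ins = [4410] + fc_layers_size[:-1]
for i, fan_in in enumerate(fan_ins):
    print("fc_%d: scale = %.5f" % (i, init_range / fan_in ** 0.5))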
diff --git a/paddle/fluid/train/custom_trainer/feed/shuffler/shuffler.cc b/paddle/fluid/train/custom_trainer/feed/shuffler/shuffler.cc
deleted file mode 100644
index e4a0876f..00000000
--- a/paddle/fluid/train/custom_trainer/feed/shuffler/shuffler.cc
+++ /dev/null
@@ -1,248 +0,0 @@
-#include "paddle/fluid/framework/archive.h"
-#include "paddle/fluid/train/custom_trainer/feed/trainer_context.h"
-#include "paddle/fluid/train/custom_trainer/feed/shuffler/shuffler.h"
-#include <algorithm>
-
-namespace paddle {
-namespace custom_trainer {
-namespace feed {
-
-
-int Shuffler::initialize(YAML::Node config,
-    std::shared_ptr<TrainerContext> context_ptr) {
-    _trainer_context = context_ptr.get();
-    _shuffle_key_func = shuffle_key_factory(config["shuffle_key_func"].as<std::string>("RANDOM"));
-    return 0;
-}
-
-class LocalShuffler : public Shuffler {
-public:
-    LocalShuffler() {}
-    virtual ~LocalShuffler() {}
-    virtual int shuffle(::paddle::framework::Channel<DataItem>& data_channel) {
-        std::vector<DataItem> data_items(data_channel->Size());
-        data_channel->ReadAll(data_items);
-        std::shuffle(data_items.begin(), data_items.end(), local_random_engine());
-        data_channel->Open();
-        data_channel->Clear();
-        data_channel->WriteMove(data_items.size(), &data_items[0]);
-        data_channel->Close();
-        return 0;
-    }
-};
-REGIST_CLASS(Shuffler, LocalShuffler);
-
-class GlobalShuffler : public Shuffler {
-public:
-    GlobalShuffler() {}
-    virtual ~GlobalShuffler() {}
-    virtual int initialize(YAML::Node config,
-        std::shared_ptr<TrainerContext> context_ptr) {
-        Shuffler::initialize(config, context_ptr);
-        _max_concurrent_num = config["max_concurrent_num"].as<uint32_t>(6);  // maximum number of concurrent sends
-        _max_package_size = config["max_package_size"].as<uint32_t>(256);    // items per package, sent in one call
-        _shuffle_data_msg_type = config["shuffle_data_msg_type"].as<int>(3); // c2c msg type
-        _finish_msg_type = config["finish_msg_type"].as<int>(4);             // c2c msg type
-
-        reset_channel();
-        auto binded = std::bind(&GlobalShuffler::get_client2client_msg, this, std::placeholders::_1, std::placeholders::_2, std::placeholders::_3);
-        _trainer_context->pslib->ps_client()->registe_client2client_msg_handler(_shuffle_data_msg_type, binded);
-        _trainer_context->pslib->ps_client()->registe_client2client_msg_handler(_finish_msg_type, binded);
-        return 0;
-    }
-
-    // Every worker must call shuffle, and a shuffler can run only one shuffle task at a time.
-    virtual int shuffle(::paddle::framework::Channel<DataItem>& data_channel) {
-        uint32_t send_count = 0;
-        uint32_t package_size = _max_package_size;
-        uint32_t concurrent_num = _max_concurrent_num;
-        ::paddle::framework::Channel<DataItem> input_channel = ::paddle::framework::MakeChannel<DataItem>(data_channel);
-        data_channel.swap(input_channel);
-        set_channel(data_channel);
-
-        _item_send_count = 0;
-        _item_receive_count = 0;
-        auto* environment = _trainer_context->environment.get();
-        auto worker_num = environment->node_num(EnvironmentRole::WORKER);
-        std::vector<std::vector<std::future<int32_t>>> waits(concurrent_num);
-        std::vector<DataItem> send_buffer(package_size);
-        std::vector<std::vector<DataItem>> send_buffer_worker(worker_num);
-
-        int status = 0;  // >0: finish; =0: running; <0: fail
-        while (status == 0) {
-            // update status
-            // Throttle the shuffle while training is running;
-            // run at full speed while in the wait state.
-            if (_trainer_context->is_status(TrainerStatus::Training)) {
-                concurrent_num = 1;
-                package_size = _max_concurrent_num / 2;
-            } else {
-                package_size = _max_package_size;
-                concurrent_num = _max_concurrent_num;
-            }
-            for (uint32_t current_wait_idx = 0; status == 0 && current_wait_idx < concurrent_num; ++current_wait_idx) {
-                auto read_size = input_channel->Read(package_size, send_buffer.data());
-                if (read_size == 0) {
-                    status = 1;
-                    break;
-                }
-                _item_send_count += read_size;
-                for (int i = 0; i < worker_num; ++i) {
-                    send_buffer_worker[i].clear();
-                }
-                for (int i = 0; i < read_size; ++i) {
-                    auto worker_idx = _shuffle_key_func(send_buffer[i].id) % worker_num;
-                    send_buffer_worker[worker_idx].push_back(std::move(send_buffer[i]));
-                }
-                for (auto& wait_s : waits[current_wait_idx]) {
-                    if (wait_s.get() != 0) {
-                        LOG(WARNING) << "fail to send shuffle data";
-                        status = -1;
-                        break;
-                    }
-                }
-                if (status != 0) {
-                    break;
-                }
-                waits[current_wait_idx].clear();
-                for (int i = 0; i < worker_num; ++i) {
-                    if (!send_buffer_worker[i].empty()) {
-                        waits[current_wait_idx].push_back(send_shuffle_data(i, send_buffer_worker[i]));
-                    }
-                }
-            }
-        }
-        for (auto& waits_s : waits) {
-            for (auto& wait_s : waits_s) {
-                if (wait_s.get() != 0) {
-                    LOG(WARNING) << "fail to send shuffle data";
-                    status = -1;
-                }
-            }
-        }
-        VLOG(2) << "start send finish, worker_num: " << worker_num;
-        waits[0].clear();
-        for (int i = 0; i < worker_num; ++i) {
-            waits[0].push_back(send_finish(i));
-        }
-        VLOG(2) << "wait all finish";
-        for (int i = 0; i < worker_num; ++i) {
-            if (waits[0][i].get() != 0) {
-                LOG(WARNING) << "fail to send finish " << i;
-                status = -1;
-            }
-        }
-        VLOG(2) << "finish shuffler_send_channel, total_send:" << _item_send_count;
-        return status < 0 ? status : 0;
-    }
-
-private:
-    /*
-       1. Some c2c send_shuffle_data requests may arrive before the channel is
-          set; they block in wait_channel until it is.
-       2. shuffle() calls set_channel, which resets wait_num first and then
-          unlocks the channel.
-       3. Once finish requests from all workers have arrived, reset the channel
-          first, then let all of them return together.
-    */
-    bool wait_channel() {
-        VLOG(5) << "wait_channel";
-        std::lock_guard<bthread::Mutex> lock(_channel_mutex);
-        return _out_channel != nullptr;
-    }
-    void reset_channel() {
-        VLOG(5) << "reset_channel";
-        _channel_mutex.lock();
-        if (_out_channel != nullptr) {
-            _out_channel->Close();
-        }
-        _out_channel = nullptr;
-    }
-    void reset_wait_num() {
-        _wait_num_mutex.lock();
-        _wait_num = _trainer_context->environment->node_num(EnvironmentRole::WORKER);
-        VLOG(5) << "reset_wait_num: " << _wait_num;
-    }
-    void set_channel(paddle::framework::Channel<DataItem>& channel) {
-        VLOG(5) << "set_channel";
-        // Reset wait_num before any node starts writing to the channel.
-        CHECK(_out_channel == nullptr);
-        _out_channel = channel;
-        reset_wait_num();
-        _channel_mutex.unlock();
-    }
-
-    int32_t finish_write_channel() {
-        int wait_num = --_wait_num;
-        VLOG(5) << "finish_write_channel, wait_num: " << wait_num;
-        // Synchronize all workers: reset the channel after all writes
-        // finish and before the c2c_msg replies return.
-        if (wait_num == 0) {
-            reset_channel();
-            VLOG(2) << "finish shuffle_receive_channel, receive_count: " << _item_receive_count;
-            _wait_num_mutex.unlock();
-        } else {
-            std::lock_guard<bthread::Mutex> lock(_wait_num_mutex);
-        }
-        return 0;
-    }
-    int32_t write_to_channel(std::vector<DataItem>&& items) {
-        size_t items_size = items.size();
-        _item_receive_count += items_size;
-        return _out_channel->Write(std::move(items)) == items_size ? 0 : -1;
-    }
-
-    int32_t get_client2client_msg(int msg_type, int from_client, const std::string& msg) {
-        // wait channel
-        if (!wait_channel()) {
-            LOG(FATAL) << "out_channel is null";
-            return -1;
-        }
-        VLOG(5) << "get c2c msg, type: " << msg_type << ", from_client: " << from_client << ", msg_size: " << msg.size();
-        if (msg_type == _shuffle_data_msg_type) {
-            paddle::framework::BinaryArchive ar;
-            ar.SetReadBuffer(const_cast<char*>(msg.data()), msg.size(), [](char*){});
-            std::vector<DataItem> items;
-            ar >> items;
-            return write_to_channel(std::move(items));
-        } else if (msg_type == _finish_msg_type) {
-            return finish_write_channel();
-        }
-        LOG(FATAL) << "no such msg type: " << msg_type;
-        return -1;
-    }
-
-    std::future<int32_t> send_shuffle_data(int to_client_id, std::vector<DataItem>& items) {
-        // The server side also runs a client; worker nodes take the even client ids.
-        to_client_id = 2 * to_client_id;
-        VLOG(5) << "send_shuffle_data, to_client_id: " << to_client_id << ", items_size: " << items.size();
-        paddle::framework::BinaryArchive ar;
-        ar << items;
-        return _trainer_context->pslib->ps_client()->send_client2client_msg(_shuffle_data_msg_type, to_client_id,
-            std::string(ar.Buffer(), ar.Length()));
-    }
-
-    std::future<int32_t> send_finish(int to_client_id) {
-        // The server side also runs a client; worker nodes take the even client ids.
-        to_client_id = 2 * to_client_id;
-        VLOG(5) << "send_finish, to_client_id: " << to_client_id;
-        static const std::string empty_str;
-        return _trainer_context->pslib->ps_client()->send_client2client_msg(_finish_msg_type, to_client_id, empty_str);
-    }
-
-    uint32_t _max_package_size = 0;
-    uint32_t _max_concurrent_num = 0;
-    int _shuffle_data_msg_type = 3;
-    int _finish_msg_type = 4;
-
-    bthread::Mutex _channel_mutex;
-    paddle::framework::Channel<DataItem> _out_channel = nullptr;
-
-    bthread::Mutex _wait_num_mutex;
-    std::atomic<int> _wait_num;
-    std::atomic<uint64_t> _item_send_count;
-    std::atomic<uint64_t> _item_receive_count;
-};
-REGIST_CLASS(Shuffler, GlobalShuffler);
-
-} // namespace feed
-} // namespace custom_trainer
-} // namespace paddle
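GlobalShuffler routes every record to the worker picked by shuffle_key_func(id) % worker_num and batches records per destination before sending them as c2c messages. A minimal Python sketch of that partitioning step (names are illustrative, not the trainer's API):

# Partition one package of (id, payload) records across worker_num
# destinations, the way GlobalShuffler::shuffle fills send_buffer_worker.
def partition(package, worker_num, shuffle_key_func=hash):
    buffers = [[] for _ in range(worker_num)]
    for item_id, payload in package:
        buffers[shuffle_key_func(item_id) % worker_num].append((item_id, payload))
    return buffers

buffers = partition([("k1", b"..."), ("k2", b"...")], worker_num=4)
print([len(b) for b in buffers])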
diff --git a/paddle/fluid/train/custom_trainer/feed/shuffler/shuffler.h b/paddle/fluid/train/custom_trainer/feed/shuffler/shuffler.h
deleted file mode 100644
index 86394235..00000000
--- a/paddle/fluid/train/custom_trainer/feed/shuffler/shuffler.h
+++ /dev/null
@@ -1,69 +0,0 @@
-#pragma once
-#include "paddle/fluid/train/custom_trainer/feed/dataset/data_reader.h"
-
-namespace paddle {
-namespace custom_trainer {
-namespace feed {
-
-class TrainerContext;
-
-inline double current_realtime() {
-    struct timespec tp;
-    clock_gettime(CLOCK_REALTIME, &tp);
-    return tp.tv_sec + tp.tv_nsec * 1e-9;
-}
-
-inline std::default_random_engine& local_random_engine() {
-    struct engine_wrapper_t {
-        std::default_random_engine engine;
-        engine_wrapper_t() {
-            static std::atomic<unsigned long> x(0);
-            std::seed_seq sseq = {x++, x++, x++, (unsigned long)(current_realtime() * 1000)};
-            engine.seed(sseq);
-        }
-    };
-    thread_local engine_wrapper_t r;
-    return r.engine;
-}
-
-inline uint64_t shuffle_key_random(const std::string& /*key*/) {
-    return local_random_engine()();
-}
-
-inline uint64_t shuffle_key_hash(const std::string& key) {
-    static std::hash<std::string> hasher;
-    return hasher(key);
-}
-
-inline uint64_t shuffle_key_numeric(const std::string& key) {
-    return strtoull(key.c_str(), NULL, 10);
-}
-
-typedef uint64_t (*ShuffleKeyFunc)(const std::string& key);
-inline ShuffleKeyFunc shuffle_key_factory(const std::string& name) {
-    if (name == "NUMERIC") {
-        return &shuffle_key_numeric;
-    } else if (name == "HASH") {
-        return &shuffle_key_hash;
-    }
-    return &shuffle_key_random;
-}
-
-
-class Shuffler {
-public:
-    Shuffler() {}
-    virtual ~Shuffler() {}
-    virtual int initialize(YAML::Node config,
-        std::shared_ptr<TrainerContext> context_ptr);
-    virtual int shuffle(::paddle::framework::Channel<DataItem>& data_channel) = 0;
-protected:
-    ShuffleKeyFunc _shuffle_key_func;
-    TrainerContext* _trainer_context;
-};
-
-REGIST_REGISTERER(Shuffler);
-
-} // namespace feed
-} // namespace custom_trainer
-} // namespace paddle
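shuffle_key_factory in shuffler.h selects among three key functions: RANDOM ignores the key entirely, HASH hashes the key string, and NUMERIC parses it as a decimal id. A Python mirror of the factory (illustrative only; the real functions return a uint64_t):

# Mirror of shuffle_key_factory: maps a config string to a key function.
import random

def shuffle_key_factory(name):
    if name == "NUMERIC":
        return lambda key: int(key)
    if name == "HASH":
        return lambda key: hash(key) & 0xFFFFFFFFFFFFFFFF
    return lambda key: random.getrandbits(64)   # RANDOM: key is ignored

key_func = shuffle_key_factory("HASH")
print(key_func("13342") % 4)   # destination worker for a 4-worker job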
diff --git a/paddle/fluid/train/custom_trainer/feed/temp/feed_trainer.cpp b/paddle/fluid/train/custom_trainer/feed/temp/feed_trainer.cpp
deleted file mode 100644
index 1087f567..00000000
--- a/paddle/fluid/train/custom_trainer/feed/temp/feed_trainer.cpp
+++ /dev/null
@@ -1,115 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <time.h>
-#include <fstream>
-
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/init.h"
-#include "paddle/fluid/platform/place.h"
-#include "paddle/fluid/platform/profiler.h"
-
-namespace paddle {
-namespace train {
-
-void ReadBinaryFile(const std::string& filename, std::string* contents) {
-  std::ifstream fin(filename, std::ios::in | std::ios::binary);
-  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
-  fin.seekg(0, std::ios::end);
-  contents->clear();
-  contents->resize(fin.tellg());
-  fin.seekg(0, std::ios::beg);
-  fin.read(&(contents->at(0)), contents->size());
-  fin.close();
-}
-
-std::unique_ptr<paddle::framework::ProgramDesc> Load(
-    paddle::framework::Executor* executor, const std::string& model_filename) {
-  VLOG(3) << "loading model from " << model_filename;
-  std::string program_desc_str;
-  ReadBinaryFile(model_filename, &program_desc_str);
-
-  std::unique_ptr<paddle::framework::ProgramDesc> main_program(
-      new paddle::framework::ProgramDesc(program_desc_str));
-  return main_program;
-}
-
-}  // namespace train
-}  // namespace paddle
-
-int main() {
-  paddle::framework::InitDevices(false);
-
-  const auto cpu_place = paddle::platform::CPUPlace();
-
-  paddle::framework::Executor executor(cpu_place);
-  paddle::framework::Scope scope;
-  auto startup_program = paddle::train::Load(&executor, "startup_program");
-  auto train_program = paddle::train::Load(&executor, "main_program");
-
-  std::string loss_name = "";
-  for (auto op_desc : train_program->Block(0).AllOps()) {
-    if (op_desc->Type() == "mean") {
-      loss_name = op_desc->Output("Out")[0];
-      break;
-    }
-  }
-
-  PADDLE_ENFORCE_NE(loss_name, "", "loss not found");
-
-  // init all parameters
-  executor.Run(*startup_program, &scope, 0);
-
-  // prepare data
-  auto x_var = scope.Var("x");
-  auto x_tensor = x_var->GetMutable<paddle::framework::LoDTensor>();
-  x_tensor->Resize({2, 13});
-
-  auto x_data = x_tensor->mutable_data<float>(cpu_place);
-  for (int i = 0; i < 2 * 13; ++i) {
-    x_data[i] = static_cast<float>(i);
-  }
-
-  auto y_var = scope.Var("y");
-  auto y_tensor = y_var->GetMutable<paddle::framework::LoDTensor>();
-  y_tensor->Resize({2, 1});
-  auto y_data = y_tensor->mutable_data<float>(cpu_place);
-  for (int i = 0; i < 2 * 1; ++i) {
-    y_data[i] = static_cast<float>(i);
-  }
-
-  auto loss_var = scope.Var(loss_name);
-
-  paddle::platform::ProfilerState pf_state;
-  pf_state = paddle::platform::ProfilerState::kCPU;
-  paddle::platform::EnableProfiler(pf_state);
-  clock_t t1 = clock();
-
-  for (int i = 0; i < 10; ++i) {
-    executor.Run(*train_program, &scope, 0, false, true);
-    std::cout << "step: " << i << " loss: "
-              << loss_var->Get<paddle::framework::LoDTensor>().data<float>()[0]
-              << std::endl;
-  }
-
-  clock_t t2 = clock();
-  paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kTotal,
-                                    "run_paddle_op_profiler");
-  std::cout << "run_time = " << t2 - t1 << std::endl;
-  return 0;
-}
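feed_trainer.cpp expects two serialized programs, startup_program and main_program, and finds the loss by locating the first mean op. One plausible way to produce those files, sketched against the fluid 1.x Python API (the tiny network here is illustrative, not the production model):

# Hedged sketch: build a minimal mean-loss network and dump the
# default startup/main programs to the files the C++ demo loads.
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
pred = fluid.layers.fc(input=x, size=1)
loss = fluid.layers.mean(fluid.layers.square_error_cost(input=pred, label=y))
fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

with open("startup_program", "wb") as f:
    f.write(fluid.default_startup_program().desc.serialize_to_string())
with open("main_program", "wb") as f:
    f.write(fluid.default_main_program().desc.serialize_to_string())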
$2 = "1"; - $3 = "0"; - print $0; - } -} diff --git a/paddle/fluid/train/custom_trainer/feed/tool/gdbinit b/paddle/fluid/train/custom_trainer/feed/tool/gdbinit deleted file mode 100644 index 1979250b..00000000 --- a/paddle/fluid/train/custom_trainer/feed/tool/gdbinit +++ /dev/null @@ -1,697 +0,0 @@ -# -# STL GDB evaluators/views/utilities - 1.03 -# -# The new GDB commands: -# are entirely non instrumental -# do not depend on any "inline"(s) - e.g. size(), [], etc -# are extremely tolerant to debugger settings -# -# This file should be "included" in .gdbinit as following: -# source stl-views.gdb or just paste it into your .gdbinit file -# -# The following STL containers are currently supported: -# -# std::vector -- via pvector command -# std::list -- via plist or plist_member command -# std::map -- via pmap or pmap_member command -# std::multimap -- via pmap or pmap_member command -# std::set -- via pset command -# std::multiset -- via pset command -# std::deque -- via pdequeue command -# std::stack -- via pstack command -# std::queue -- via pqueue command -# std::priority_queue -- via ppqueue command -# std::bitset -- via pbitset command -# std::string -- via pstring command -# std::widestring -- via pwstring command -# -# The end of this file contains (optional) C++ beautifiers -# Make sure your debugger supports $argc -# -# Simple GDB Macros writen by Dan Marinescu (H-PhD) - License GPL -# Inspired by intial work of Tom Malnar, -# Tony Novac (PhD) / Cornell / Stanford, -# Gilad Mishne (PhD) and Many Many Others. -# Contact: dan_c_marinescu@yahoo.com (Subject: STL) -# -# Modified to work with g++ 4.3 by Anders Elton -# Also added _member functions, that instead of printing the entire class in map, prints a member. - - - -# -# std::vector<> -# - -define pvector - if $argc == 0 - help pvector - else - set $size = $arg0._M_impl._M_finish - $arg0._M_impl._M_start - set $capacity = $arg0._M_impl._M_end_of_storage - $arg0._M_impl._M_start - set $size_max = $size - 1 - end - if $argc == 1 - set $i = 0 - while $i < $size - printf "elem[%u]: ", $i - p *($arg0._M_impl._M_start + $i) - set $i++ - end - end - if $argc == 2 - set $idx = $arg1 - if $idx < 0 || $idx > $size_max - printf "idx1, idx2 are not in acceptable range: [0..%u].\n", $size_max - else - printf "elem[%u]: ", $idx - p *($arg0._M_impl._M_start + $idx) - end - end - if $argc == 3 - set $start_idx = $arg1 - set $stop_idx = $arg2 - if $start_idx > $stop_idx - set $tmp_idx = $start_idx - set $start_idx = $stop_idx - set $stop_idx = $tmp_idx - end - if $start_idx < 0 || $stop_idx < 0 || $start_idx > $size_max || $stop_idx > $size_max - printf "idx1, idx2 are not in acceptable range: [0..%u].\n", $size_max - else - set $i = $start_idx - while $i <= $stop_idx - printf "elem[%u]: ", $i - p *($arg0._M_impl._M_start + $i) - set $i++ - end - end - end - if $argc > 0 - printf "Vector size = %u\n", $size - printf "Vector capacity = %u\n", $capacity - printf "Element " - whatis $arg0._M_impl._M_start - end -end - -document pvector - Prints std::vector information. - Syntax: pvector - Note: idx, idx1 and idx2 must be in acceptable range [0...size()-1]. 
- Examples: - pvector v - Prints vector content, size, capacity and T typedef - pvector v 0 - Prints element[idx] from vector - pvector v 1 2 - Prints elements in range [idx1..idx2] from vector -end - -# -# std::list<> -# - -define plist - if $argc == 0 - help plist - else - set $head = &$arg0._M_impl._M_node - set $current = $arg0._M_impl._M_node._M_next - set $size = 0 - while $current != $head - if $argc == 2 - printf "elem[%u]: ", $size - p *($arg1*)($current + 1) - end - if $argc == 3 - if $size == $arg2 - printf "elem[%u]: ", $size - p *($arg1*)($current + 1) - end - end - set $current = $current._M_next - set $size++ - end - printf "List size = %u \n", $size - if $argc == 1 - printf "List " - whatis $arg0 - printf "Use plist to see the elements in the list.\n" - end - end -end - -document plist - Prints std::list information. - Syntax: plist : Prints list size, if T defined all elements or just element at idx - Examples: - plist l - prints list size and definition - plist l int - prints all elements and list size - plist l int 2 - prints the third element in the list (if exists) and list size -end - -define plist_member - if $argc == 0 - help plist_member - else - set $head = &$arg0._M_impl._M_node - set $current = $arg0._M_impl._M_node._M_next - set $size = 0 - while $current != $head - if $argc == 3 - printf "elem[%u]: ", $size - p (*($arg1*)($current + 1)).$arg2 - end - if $argc == 4 - if $size == $arg3 - printf "elem[%u]: ", $size - p (*($arg1*)($current + 1)).$arg2 - end - end - set $current = $current._M_next - set $size++ - end - printf "List size = %u \n", $size - if $argc == 1 - printf "List " - whatis $arg0 - printf "Use plist_member to see the elements in the list.\n" - end - end -end - -document plist_member - Prints std::list information. 
- Syntax: plist : Prints list size, if T defined all elements or just element at idx - Examples: - plist_member l int member - prints all elements and list size - plist_member l int member 2 - prints the third element in the list (if exists) and list size -end - - -# -# std::map and std::multimap -# - -define pmap - if $argc == 0 - help pmap - else - set $tree = $arg0 - set $i = 0 - set $node = $tree._M_t._M_impl._M_header._M_left - set $end = $tree._M_t._M_impl._M_header - set $tree_size = $tree._M_t._M_impl._M_node_count - if $argc == 1 - printf "Map " - whatis $tree - printf "Use pmap to see the elements in the map.\n" - end - if $argc == 3 - while $i < $tree_size - set $value = (void *)($node + 1) - printf "elem[%u].left: ", $i - p *($arg1*)$value - set $value = $value + sizeof($arg1) - printf "elem[%u].right: ", $i - p *($arg2*)$value - if $node._M_right != 0 - set $node = $node._M_right - while $node._M_left != 0 - set $node = $node._M_left - end - else - set $tmp_node = $node._M_parent - while $node == $tmp_node._M_right - set $node = $tmp_node - set $tmp_node = $tmp_node._M_parent - end - if $node._M_right != $tmp_node - set $node = $tmp_node - end - end - set $i++ - end - end - if $argc == 4 - set $idx = $arg3 - set $ElementsFound = 0 - while $i < $tree_size - set $value = (void *)($node + 1) - if *($arg1*)$value == $idx - printf "elem[%u].left: ", $i - p *($arg1*)$value - set $value = $value + sizeof($arg1) - printf "elem[%u].right: ", $i - p *($arg2*)$value - set $ElementsFound++ - end - if $node._M_right != 0 - set $node = $node._M_right - while $node._M_left != 0 - set $node = $node._M_left - end - else - set $tmp_node = $node._M_parent - while $node == $tmp_node._M_right - set $node = $tmp_node - set $tmp_node = $tmp_node._M_parent - end - if $node._M_right != $tmp_node - set $node = $tmp_node - end - end - set $i++ - end - printf "Number of elements found = %u\n", $ElementsFound - end - if $argc == 5 - set $idx1 = $arg3 - set $idx2 = $arg4 - set $ElementsFound = 0 - while $i < $tree_size - set $value = (void *)($node + 1) - set $valueLeft = *($arg1*)$value - set $valueRight = *($arg2*)($value + sizeof($arg1)) - if $valueLeft == $idx1 && $valueRight == $idx2 - printf "elem[%u].left: ", $i - p $valueLeft - printf "elem[%u].right: ", $i - p $valueRight - set $ElementsFound++ - end - if $node._M_right != 0 - set $node = $node._M_right - while $node._M_left != 0 - set $node = $node._M_left - end - else - set $tmp_node = $node._M_parent - while $node == $tmp_node._M_right - set $node = $tmp_node - set $tmp_node = $tmp_node._M_parent - end - if $node._M_right != $tmp_node - set $node = $tmp_node - end - end - set $i++ - end - printf "Number of elements found = %u\n", $ElementsFound - end - printf "Map size = %u\n", $tree_size - end -end - -document pmap - Prints std::map or std::multimap information. Works for std::multimap as well. 
- Syntax: pmap : Prints map size, if T defined all elements or just element(s) with val(s) - Examples: - pmap m - prints map size and definition - pmap m int int - prints all elements and map size - pmap m int int 20 - prints the element(s) with left-value = 20 (if any) and map size - pmap m int int 20 200 - prints the element(s) with left-value = 20 and right-value = 200 (if any) and map size -end - - -define pmap_member - if $argc == 0 - help pmap_member - else - set $tree = $arg0 - set $i = 0 - set $node = $tree._M_t._M_impl._M_header._M_left - set $end = $tree._M_t._M_impl._M_header - set $tree_size = $tree._M_t._M_impl._M_node_count - if $argc == 1 - printf "Map " - whatis $tree - printf "Use pmap to see the elements in the map.\n" - end - if $argc == 5 - while $i < $tree_size - set $value = (void *)($node + 1) - printf "elem[%u].left: ", $i - p (*($arg1*)$value).$arg2 - set $value = $value + sizeof($arg1) - printf "elem[%u].right: ", $i - p (*($arg3*)$value).$arg4 - if $node._M_right != 0 - set $node = $node._M_right - while $node._M_left != 0 - set $node = $node._M_left - end - else - set $tmp_node = $node._M_parent - while $node == $tmp_node._M_right - set $node = $tmp_node - set $tmp_node = $tmp_node._M_parent - end - if $node._M_right != $tmp_node - set $node = $tmp_node - end - end - set $i++ - end - end - if $argc == 6 - set $idx = $arg5 - set $ElementsFound = 0 - while $i < $tree_size - set $value = (void *)($node + 1) - if *($arg1*)$value == $idx - printf "elem[%u].left: ", $i - p (*($arg1*)$value).$arg2 - set $value = $value + sizeof($arg1) - printf "elem[%u].right: ", $i - p (*($arg3*)$value).$arg4 - set $ElementsFound++ - end - if $node._M_right != 0 - set $node = $node._M_right - while $node._M_left != 0 - set $node = $node._M_left - end - else - set $tmp_node = $node._M_parent - while $node == $tmp_node._M_right - set $node = $tmp_node - set $tmp_node = $tmp_node._M_parent - end - if $node._M_right != $tmp_node - set $node = $tmp_node - end - end - set $i++ - end - printf "Number of elements found = %u\n", $ElementsFound - end - printf "Map size = %u\n", $tree_size - end -end - -document pmap_member - Prints std::map or std::multimap information. Works for std::multimap as well. 
- Syntax: pmap : Prints map size, if T defined all elements or just element(s) with val(s) - Examples: - pmap_member m class1 member1 class2 member2 - prints class1.member1 : class2.member2 - pmap_member m class1 member1 class2 member2 lvalue - prints class1.member1 : class2.member2 where class1 == lvalue -end - - -# -# std::set and std::multiset -# - -define pset - if $argc == 0 - help pset - else - set $tree = $arg0 - set $i = 0 - set $node = $tree._M_t._M_impl._M_header._M_left - set $end = $tree._M_t._M_impl._M_header - set $tree_size = $tree._M_t._M_impl._M_node_count - if $argc == 1 - printf "Set " - whatis $tree - printf "Use pset to see the elements in the set.\n" - end - if $argc == 2 - while $i < $tree_size - set $value = (void *)($node + 1) - printf "elem[%u]: ", $i - p *($arg1*)$value - if $node._M_right != 0 - set $node = $node._M_right - while $node._M_left != 0 - set $node = $node._M_left - end - else - set $tmp_node = $node._M_parent - while $node == $tmp_node._M_right - set $node = $tmp_node - set $tmp_node = $tmp_node._M_parent - end - if $node._M_right != $tmp_node - set $node = $tmp_node - end - end - set $i++ - end - end - if $argc == 3 - set $idx = $arg2 - set $ElementsFound = 0 - while $i < $tree_size - set $value = (void *)($node + 1) - if *($arg1*)$value == $idx - printf "elem[%u]: ", $i - p *($arg1*)$value - set $ElementsFound++ - end - if $node._M_right != 0 - set $node = $node._M_right - while $node._M_left != 0 - set $node = $node._M_left - end - else - set $tmp_node = $node._M_parent - while $node == $tmp_node._M_right - set $node = $tmp_node - set $tmp_node = $tmp_node._M_parent - end - if $node._M_right != $tmp_node - set $node = $tmp_node - end - end - set $i++ - end - printf "Number of elements found = %u\n", $ElementsFound - end - printf "Set size = %u\n", $tree_size - end -end - -document pset - Prints std::set or std::multiset information. Works for std::multiset as well. - Syntax: pset : Prints set size, if T defined all elements or just element(s) having val - Examples: - pset s - prints set size and definition - pset s int - prints all elements and the size of s - pset s int 20 - prints the element(s) with value = 20 (if any) and the size of s -end - - - -# -# std::dequeue -# - -define pdequeue - if $argc == 0 - help pdequeue - else - set $size = 0 - set $start_cur = $arg0._M_impl._M_start._M_cur - set $start_last = $arg0._M_impl._M_start._M_last - set $start_stop = $start_last - while $start_cur != $start_stop - p *$start_cur - set $start_cur++ - set $size++ - end - set $finish_first = $arg0._M_impl._M_finish._M_first - set $finish_cur = $arg0._M_impl._M_finish._M_cur - set $finish_last = $arg0._M_impl._M_finish._M_last - if $finish_cur < $finish_last - set $finish_stop = $finish_cur - else - set $finish_stop = $finish_last - end - while $finish_first != $finish_stop - p *$finish_first - set $finish_first++ - set $size++ - end - printf "Dequeue size = %u\n", $size - end -end - -document pdequeue - Prints std::dequeue information. 
- Syntax: pdequeue : Prints dequeue size, if T defined all elements - Deque elements are listed "left to right" (left-most stands for front and right-most stands for back) - Example: - pdequeue d - prints all elements and size of d -end - - - -# -# std::stack -# - -define pstack - if $argc == 0 - help pstack - else - set $start_cur = $arg0.c._M_impl._M_start._M_cur - set $finish_cur = $arg0.c._M_impl._M_finish._M_cur - set $size = $finish_cur - $start_cur - set $i = $size - 1 - while $i >= 0 - p *($start_cur + $i) - set $i-- - end - printf "Stack size = %u\n", $size - end -end - -document pstack - Prints std::stack information. - Syntax: pstack : Prints all elements and size of the stack - Stack elements are listed "top to buttom" (top-most element is the first to come on pop) - Example: - pstack s - prints all elements and the size of s -end - - - -# -# std::queue -# - -define pqueue - if $argc == 0 - help pqueue - else - set $start_cur = $arg0.c._M_impl._M_start._M_cur - set $finish_cur = $arg0.c._M_impl._M_finish._M_cur - set $size = $finish_cur - $start_cur - set $i = 0 - while $i < $size - p *($start_cur + $i) - set $i++ - end - printf "Queue size = %u\n", $size - end -end - -document pqueue - Prints std::queue information. - Syntax: pqueue : Prints all elements and the size of the queue - Queue elements are listed "top to bottom" (top-most element is the first to come on pop) - Example: - pqueue q - prints all elements and the size of q -end - - - -# -# std::priority_queue -# - -define ppqueue - if $argc == 0 - help ppqueue - else - set $size = $arg0.c._M_impl._M_finish - $arg0.c._M_impl._M_start - set $capacity = $arg0.c._M_impl._M_end_of_storage - $arg0.c._M_impl._M_start - set $i = $size - 1 - while $i >= 0 - p *($arg0.c._M_impl._M_start + $i) - set $i-- - end - printf "Priority queue size = %u\n", $size - printf "Priority queue capacity = %u\n", $capacity - end -end - -document ppqueue - Prints std::priority_queue information. - Syntax: ppqueue : Prints all elements, size and capacity of the priority_queue - Priority_queue elements are listed "top to buttom" (top-most element is the first to come on pop) - Example: - ppqueue pq - prints all elements, size and capacity of pq -end - - - -# -# std::bitset -# - -define pbitset - if $argc == 0 - help pbitset - else - p /t $arg0._M_w - end -end - -document pbitset - Prints std::bitset information. - Syntax: pbitset : Prints all bits in bitset - Example: - pbitset b - prints all bits in b -end - - - -# -# std::string -# - -define pstring - if $argc == 0 - help pstring - else - printf "String \t\t\t= \"%s\"\n", $arg0._M_data() - printf "String size/length \t= %u\n", $arg0._M_rep()._M_length - printf "String capacity \t= %u\n", $arg0._M_rep()._M_capacity - printf "String ref-count \t= %d\n", $arg0._M_rep()._M_refcount - end -end - -document pstring - Prints std::string information. - Syntax: pstring - Example: - pstring s - Prints content, size/length, capacity and ref-count of string s -end - -# -# std::wstring -# - -define pwstring - if $argc == 0 - help pwstring - else - call printf("WString \t\t= \"%ls\"\n", $arg0._M_data()) - printf "WString size/length \t= %u\n", $arg0._M_rep()._M_length - printf "WString capacity \t= %u\n", $arg0._M_rep()._M_capacity - printf "WString ref-count \t= %d\n", $arg0._M_rep()._M_refcount - end -end - -document pwstring - Prints std::wstring information. 
-    Syntax: pwstring <wstring>
-    Example:
-    pwstring s - Prints content, size/length, capacity and ref-count of wstring s
-end
-
-#
-# C++ related beautifiers (optional)
-#
-
-set print pretty on
-set print object on
-set print static-members on
-set print vtbl on
-set print demangle on
-set demangle-style gnu-v3
-set print sevenbit-strings off
-
-set follow-fork-mode child
-set detach-on-fork off
diff --git a/paddle/fluid/train/custom_trainer/feed/tool/ins_weight.py b/paddle/fluid/train/custom_trainer/feed/tool/ins_weight.py
deleted file mode 100755
index 8b4d87c3..00000000
--- a/paddle/fluid/train/custom_trainer/feed/tool/ins_weight.py
+++ /dev/null
@@ -1,122 +0,0 @@
-#!/usr/bin/python
-import sys
-import re
-import math
-
-del_text_slot = True
-g_ratio = 1
-w_ratio = 0.01
-slots_str = "6048 6145 6202 6201 6121 6119 6146 6120 6147 6122 6123 6118 6142 6143 6008 6148 6151 6127 6144 6150 6109 6003 6096 6149 6129 6203 6153 6152 6128 6106 6251 7082 7515 7080 6066 7507 6186 6007 7514 6054 6125 7506 10001 6006 6080 7023 6085 10000 6250 6110 6124 6090 6082 6067 7516 6101 6004 6191 6188 6070 6194 6247 6814 7512 10007 6058 6189 6059 7517 10005 7510 7024 7502 7503 6183 7511 6060 6806 7504 6185 6810 6248 10004 6815 6182 10068 6069 6073 6196 6816 7513 6071 6809 6072 6817 6190 7505 6813 6192 6807 6808 6195 6826 6184 6197 6068 6812 7107 6811 6823 6824 6819 6818 6821 6822 6820 6094 6083 6952 6099 6951 6949 6098 7075 6948 6157 6126 7077 6111 6087 6103 6107 6156 6005 6158 7122 6155 7058 6115 7079 7081 6833 6108 6840 6837 7147 7129 6097 6231 6957 7145 6956 7143 6130 7149 7142 6212 6827 7144 6089 6161 7055 6233 6105 7057 6237 6828 6850 6163 7124 6354 6162 7146 6830 7123 6160 6235 7056 6081 6841 6132 6954 6131 6236 6831 6845 6832 6953 6839 6950 7125 7054 6138 6166 6076 6851 6353 7076 7148 6858 6842 6860 7126 6829 6835 7078 6866 6869 6871 7052 6134 6855 6947 6862 6215 6852 7128 6092 6112 6213 6232 6863 6113 6165 6214 6216 6873 6865 6870 6077 6234 6861 6164 6217 7127 6218 6962 7053 7051 6961 6002 6738 6739 10105 7064 6751 6770 7100 6014 6765 6755 10021 10022 6010 10056 6011 6756 10055 6768 10024 6023 10003 6769 10002 6767 6759 10018 6024 6064 6012 6050 10042 6168 6253 10010 10020 6015 6018 10033 10041 10039 10031 10016 6764 7083 7152 7066 6171 7150 7085 6255 10044 10008 7102 6167 6240 6238 6095 10017 10046 6019 6031 6763 6256 6169 6254 10034 7108 7186 6257 10019 6757 10040 6025 7019 7086 10029 10011 7104 6261 6013 6766 10106 7105 7153 7089 6057 7134 7151 7045 7005 7008 7101 6035 7137 10023 6036 6172 7099 7087 6239 7185 6170 10006 6243 6350 7103 7090 7157 6259 7171 6875 7084 7154 6242 6260 7155 7017 7048 7156 6959 7047 10053 7135 6244 7136 10030 7063 6760 7016 7065 7179 6881 7018 6876 10081 10052 10054 10038 6886 10069 7004 10051 7007 7109 10057 6029 6888 10009 6889 7021 10047 6245 6878 10067 6879 6884 7180 7182 10071 7002 6880 6890 6887 10061 6027 6877 6892 10060 6893 7050 10036 7049 10012 10025 7012 7183 10058 7181 10086 6891 6258 6894 6883 7046 6037 7106 10043 10048 10045 10087 6885 10013 10028 7187 10037 10035 10050 6895 7011 7170 7172 10026 10063 10095 10082 10084 6960 10092 10075 6038 7010 7015 10015 10027 10064 7184 10014 10059 7013 7020 10072 10066 10080 6896 10083 10090 6039 10049 7164 7165 10091 10099 6963 7166 10079 10103 7006 7009 7169 6034 7028 7029 7030 7034 7035 7036 7040 7041 7042 10032 6009 6241 7003 7014 7088 13326 13330 13331 13352 13353 6198"
-slot_whitelist = slots_str.split(" ")
-
-def calc_ins_weight(params, label):
-    """calc ins weight"""
-    global g_ratio
-    global w_ratio
-    slots = []
-    s_clk_num = 0
-    s_show_num = 0
-    active = 0
-    attclk_num = 0
-    attshow_num = 0
-    attclk_avg = 0
-    for items in params:
-        if len(items) != 2:
-            continue
-        slot_name = items[0]
-        slot_val = items[1]
-        if slot_name not in slots:
-            slots.append(slot_name)
-        if slot_name == "session_click_num":
-            s_clk_num = int(slot_val)
-        if slot_name == "session_show_num":
-            s_show_num = int(slot_val)
-        if slot_name == "activity":
-            active = float(slot_val) / 10000.0
-    w = 1
-    # for inactive user
-    if active >= 0 and active < 0.4 and s_show_num >= 0 and s_show_num < 20:
-        w = math.log(w_ratio * (420 - (active * 50 + 1) * (s_show_num + 1)) + math.e)
-    if label == "0":
-        w = 1 + (w - 1) * g_ratio
-    return w
-
-def filter_whitelist_slot(tmp_line):
-    terms = tmp_line.split()
-    line = "%s %s %s" % (terms[0], terms[1], terms[2])
-    for item in terms[3:]:
-        feasign = item.split(':')
-        if len(feasign) == 2 and \
-                feasign[1] in slot_whitelist:
-            line = "%s %s" % (line, item)
-    return line
-
-def get_sample_type(line):
-    # vertical_type = 20
-    # if line.find("13038012583501790:6738") > 0:
-    #     return 30
-    # vertical_type = 0/5/1/2/9/11/13/16/29/-1
-    if (line.find("7408512894065610:6738") > 0) or \
-        (line.find("8815887816424655:6738") > 0) or \
-        (line.find("7689987878537419:6738") > 0) or \
-        (line.find("7971462863009228:6738") > 0) or \
-        (line.find("9941787754311891:6738") > 0) or \
-        (line.find("10504737723255509:6738") > 0) or \
-        (line.find("11067687692199127:6738") > 0) or \
-        (line.find("11912112645614554:6738") > 0) or \
-        (line.find("15571287443748071:6738") > 0) or \
-        (line.find("7127025017546227:6738") > 0):
-        return 20
-    return -1
-
-def main():
-    """ins adjust"""
-    global del_text_slot
-    for l in sys.stdin:
-        l = l.rstrip("\n")
-        items = l.split(" ")
-        if len(items) < 3:
-            continue
-        label = items[2]
-        lines = l.split("\t")
-        line = lines[0]
-        # streaming ins include all ins, sample_type only handle NEWS ins
-        sample_type = -1
-        if 'NEWS' in l:
-            sample_type = get_sample_type(line)
-        #line = filter_whitelist_slot(tmp_line)
-        if len(lines) >= 4:
-            if 'VIDEO' in lines[3]:
-                continue
-            params = lines[2]
-            params = params.split(" ")
-            m = [tuple(i.split(":")) for i in params]
-            if m is None or len(m) == 0:
-                if sample_type > 0:
-                    print "%s $%s *1" % (line, sample_type)
-                else:
-                    print "%s *1" % line
-                sys.stdout.flush()
-                continue
-            weight = calc_ins_weight(m, label)
-            if sample_type > 0:
-                print "%s $%s *%s" % (line, sample_type, weight)
-            else:
-                print "%s *%s" % (line, weight)
-            sys.stdout.flush()
-        else:
-            if sample_type > 0:
-                print "%s $%s *1" % (line, sample_type)
-            else:
-                print "%s *1" % line
-            sys.stdout.flush()
-
-if __name__ == "__main__":
-    if len(sys.argv) > 1:
-        if sys.argv[1] == "0":
-            del_text_slot = False
-        if len(sys.argv) > 2:
-            g_ratio = float(sys.argv[2])
-        if len(sys.argv) > 3:
-            w_ratio = float(sys.argv[3])
-    main()
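For an inactive user (activity below 0.4 and fewer than 20 session shows), calc_ins_weight sets w = log(w_ratio * (420 - (active*50 + 1) * (s_show_num + 1)) + e). A quick numeric check with the script's default w_ratio = 0.01:

# The weight is highest for brand-new users and decays toward 1 as
# activity or session shows grow.
import math

def ins_weight(active, s_show_num, w_ratio=0.01):
    return math.log(w_ratio * (420 - (active * 50 + 1) * (s_show_num + 1)) + math.e)

print(ins_weight(0.0, 0))    # ~1.93
print(ins_weight(0.3, 19))   # ~1.31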
diff --git a/paddle/fluid/train/custom_trainer/feed/tool/xbox_compressor_mf.py b/paddle/fluid/train/custom_trainer/feed/tool/xbox_compressor_mf.py
deleted file mode 100755
index b306ddfe..00000000
--- a/paddle/fluid/train/custom_trainer/feed/tool/xbox_compressor_mf.py
+++ /dev/null
@@ -1,162 +0,0 @@
-#!/usr/bin/python
-"""
-xbox model compressor
-"""
-
-import sys
-import math
-import time
-import re
-
-#WISE
-#SHOW_COMPRESS_RATIO : 8192
-#CLICK_COMPRESS_RATIO : 8192
-#LR_COMPRESS_RATIO : 1048576
-#MIO_COMPRESS_RATIO:8192
-
-#PC
-#MIO_COMPRESS_RATIO : 1024
-#SHOW_COMPRESS_RATIO : 128
-#CLICK_COMPRESS_RATIO : 1024
-#LR_COMPRESS_RATIO : 8192
-
-#STAMP_COL = 2
-SHOW_COL = 3
-CLICK_COL = 4
-LR_W_COL = 5
-LR_G2SUM_COL = 6
-FM_COL = 9
-
-#DAY_SPAN = 300
-
-#show clk lr = float
-SHOW_RATIO = 1
-#SHOW_RATIO = 1024
-CLK_RATIO = 8
-#CLK_RATIO = 1024
-LR_RATIO = 1024
-MF_RATIO = 1024
-
-base_update_threshold = 0.965
-base_xbox_clk_cof = 1
-base_xbox_nonclk_cof = 0.2
-
-def as_num(x):
-    y = '{:.5f}'.format(x)
-    return (y)
-
-def compress_show(xx):
-    """
-    compress show
-    """
-    preci = SHOW_RATIO
-
-    x = float(xx)
-    return str(int(math.floor(x * preci + 0.5)))
-
-
-def compress_clk(xx):
-    """
-    compress clk
-    """
-    preci = CLK_RATIO
-
-    x = float(xx)
-    clk = int(math.floor(x * preci + 0.5))
-    if clk == 0:
-        return ""
-    return str(clk)
-
-
-def compress_lr(xx):
-    """
-    compress lr
-    """
-    preci = LR_RATIO
-
-    x = float(xx)
-    lr = int(math.floor(x * preci + 0.5))
-    if lr == 0:
-        return ""
-    return str(lr)
-
-def compress_mf(xx):
-    """
-    compress mf
-    """
-    preci = MF_RATIO
-
-    x = float(xx)
-    return int(math.floor(x * preci + 0.5))
-
-
-def show_clk_score(show, clk):
-    """
-    calculate show_clk score
-    """
-    return (show - clk) * 0.2 + clk
-
-
-for l in sys.stdin:
-    cols = re.split(r'\s+', l.strip())
-    key = cols[0].strip()
-
-    #day = int(cols[STAMP_COL].strip())
-    #cur_day = int(time.time()/3600/24)
-    #if (day + DAY_SPAN) <= cur_day:
-    #    continue
-
-    # cvm features
-    show = cols[SHOW_COL]
-    click = cols[CLICK_COL]
-    pred = ""
-
-    f_show = float(show)
-    f_clk = float(click)
-    """
-    if f_show != 0:
-        show_log = math.log(f_show)
-    else:
-        show_log = 0
-
-    if f_clk != 0:
-        click_log = math.log(f_clk) - show_log
-    else:
-        click_log = 0
-    """
-    show_log = f_show
-    click_log = f_clk
-    #print f_show, f_clk
-    #if show_clk_score(f_show, f_clk) < base_update_threshold:
-    #    continue
-
-    #show = compress_show(show)
-    show = compress_show(show_log)
-    #clk = compress_clk(click)
-    clk = compress_clk(click_log)
-
-    # personal lr weight
-    lr_w = cols[LR_W_COL].strip()
-    lr_wei = compress_lr(lr_w)
-
-    # fm weight
-    fm_wei = []
-    fm_sum = 0
-    if len(cols) > 7:
-        #fm_dim = int(cols[FM_COL].strip())
-        #if fm_dim != 0:
-        for v in xrange(FM_COL, len(cols), 1):
-            mf_v = compress_mf(cols[v])
-            #print mf_v
-            fm_wei.append(str(mf_v))
-            fm_sum += (mf_v * mf_v)
-
-    sys.stdout.write("%s\t%s\t%s\t%s" % (key, show, clk, pred))
-    sys.stdout.write("\t")
-    sys.stdout.write("%s" % lr_wei)
-    if len(fm_wei) > 0 and fm_sum > 0:
-        sys.stdout.write("\t%s" % "\t".join(fm_wei))
-    else:
-        sys.stdout.write("\t[\t]")
-    sys.stdout.write("\n")
-
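Each compress_* helper quantizes a float to int(floor(x * RATIO + 0.5)); dividing by the same ratio on the way back bounds the round-trip error by 1/(2 * RATIO). For example:

# compress_lr quantizes with LR_RATIO = 1024, so the decoded value is
# within 1/2048 of the original weight.
import math

LR_RATIO = 1024
w = 0.123456
v = int(math.floor(w * LR_RATIO + 0.5))   # -> 126
print(v, v * 1.0 / LR_RATIO)              # -> 126 0.123046875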
diff --git a/paddle/fluid/train/custom_trainer/feed/tool/xbox_decompressor_mf.awk b/paddle/fluid/train/custom_trainer/feed/tool/xbox_decompressor_mf.awk
deleted file mode 100755
index 61b2f831..00000000
--- a/paddle/fluid/train/custom_trainer/feed/tool/xbox_decompressor_mf.awk
+++ /dev/null
@@ -1,52 +0,0 @@
-#!/bin/awk -f
-{
-    OFS="\t";
-    SHOW_RATIO = 1;
-    CLK_RATIO = 8;
-    LR_RATIO = 1024;
-    MF_RATIO = 1024;
-}
-
-function decompress_show(x) {
-    x = x * 1.0 / SHOW_RATIO;
-    return x;
-}
-
-function decompress_clk(x) {
-    if (x == "") {
-        x = 0;
-    }
-    x = x * 1.0 / CLK_RATIO;
-    return x;
-}
-
-function decompress_lr(x) {
-    return x * 1.0 / LR_RATIO;
-}
-
-function decompress_mf(x) {
-    return x * 1.0 / MF_RATIO;
-}
-
-function show_clk_sore(show, clk, nonclk_coeff, clk_coeff) {
-    return (show - clk) * nonclk_coeff + clk * clk_coeff;
-}
-
-#key, show, clk, pred, lr_w, mf_w or [\t]
-{
-    l = split($0, a, "\t");
-
-    show = decompress_show(a[2]);
-    click = decompress_clk(a[3]);
-    lr = decompress_lr(a[5]);
-    printf("%s\t0\t0\t%s\t%s\t%s\t0\t", a[1], show, click, lr);
-    if (l == 7) {
-        printf("0\n");
-    } else {
-        printf("%d", l-5)
-        for(i = 6; i <= l; i++) {
-            printf("\t%s", decompress_mf(a[i]));
-        }
-        printf("\t0\n");
-    }
-}
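The awk decompressor hard-codes the same four ratios as the compressor (SHOW 1, CLK 8, LR 1024, MF 1024); if the two tools ever disagree, every decoded value is silently rescaled. A paired round trip makes the coupling explicit (illustrative, not part of either tool):

import math

RATIOS = {"show": 1, "clk": 8, "lr": 1024, "mf": 1024}

def encode(field, x):
    return int(math.floor(x * RATIOS[field] + 0.5))

def decode(field, v):
    return v * 1.0 / RATIOS[field]

for field, x in [("show", 37.0), ("clk", 3.25), ("lr", -0.042), ("mf", 0.5)]:
    assert abs(decode(field, encode(field, x)) - x) <= 0.5 / RATIOS[field]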
printf("%d", l-5) - for(i = 6; i <= l; i++) { - printf("\t%s", decompress_mf(a[i])); - } - printf("\t0\n"); - } -} diff --git a/paddle/fluid/train/custom_trainer/feed/tool/xbox_pb_converter b/paddle/fluid/train/custom_trainer/feed/tool/xbox_pb_converter deleted file mode 100755 index 04d925a88cd58ee3719a34b3e51fcacd3c8757da..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6983561 zcmcG13t&{m_5Kad)uKsJG@{m3qXvBKZW2fk>V_n60|8e85*6HpWD^2;tjPv~&uB=^ zx<;eYmimm(qP1=66RFhz^0HbZzW8Y?sgH7(R7rgx^~L`?Gjn&(-s}qgu~#p1zwgeS zIdkUBnKN^5l4}c!rVU9-(#&Is_H&Jp`&GA~`g5Pa^p9K1)@=AUT{}%10X$rA zbcgP_W6~Z~w3*t(T}>N`bQoO^QN$3pyin;05@}{?N=PZy7d_nSQl{sgHcDVK)wC3W z={b_sq3%0QTOfEd-8xl3t!uhjlYEZEqvy1AcJXgC6}kZD`6y69*L`OkE^_ppF(W2@Y^H#knVQdCi06cy^7aA$HRU|Frh3tY z%yVlh&aSDhYid5bIeWs{6Ee?ftUt#kkVMyAi6C(5Mem~|2{5;u3?+=80+FJ!Z{9S3C{Cy_TpYH(ua|Li1STcw;&B6rDGY=H*mJ%oQE?9CmmHPeHLjGa5vIP zIG@1vETlO|JCQcyd>!Ybz!xB$igP*6=W(8a^KqP&IK4Pu!b!(yr0F<)IPV1>gOp6> zDxAN-HI2hR;rf?I$zBSPK7n%u&P#C8@r+7eM%n{>4bEJg1;9(xy$WDDeg%9i(q^2e z;rbpTaXg9hS2%A}`H8!U{=spq;{J$qCC;~SHsL%M=XjiMoPWl-26^v5`X16RaOUGW z9jSR727EKlCvnp86w<%od=+OG&IRC#kp36xe57>z9_f6feueKvT8=YK+(myOoeBIJ z&N7_O0$1Xki|ezHdT>s}8Nm5#aO-hCg6nFecOkt8>6JJua9)A)cQ|L@d<*yffO9FX z>1adxHqKv4sQnh#+tu~kNdJX%A-D-RU%^So^++E_`Yp~DT(84vQ};Hg^f27Z!g&nv zS4dw(8pKKd_I{jntOouoPV<+5iC##sRJ4@hsoc^S@IfN#Y40g|;~94@yXKdZIryIIsd4U&1LwtAjq_}rm*Sj&(>w}sk!%8_#W?3Ge3?p5 zMp~z?&E!h+0;wJ6Scc8Z^VG#WlVE9$x?ZW$VMt5V_4!C=;as9HB}d_$gp=kfejEdE zInFjlE@s#5?3ykQS6H|3KLehI^Jc|oTJBrn10XKI`4rAma8Ac*9-DCC#(4(L=Wx=o z73Y08&mn+==HRvB3jR%{l>7!~KEto$dLhmZ#odYYZgm|(Itu4)3crFhLtQ_k(t4!3 z)peIjDOrW{Rfdnm^$9pn#d$o=k8x(>{5ekZ_%km4igT$VT2;DSrTbNCChw|?A@CRf zhwGt884`Xo0Mhjq9V;y{nNvPnp!=M`}~o7vbJDDj!ilXV=#OF9QB4(rHNhDFcq*(m%kR zNKQn0EzS;{bexIvcAO4{b)?xihv8feJ`d++oc9BNhx7xSbUXlz|1}@ZhjH-;&ODrV z&`lhFlZh4pw+`30Al-oUD5RrsF2H#quz~Y-Twj9p6r>qA=~#iY6z5vtM{%Bx^Ff?n zgF7APbGTlGla8O@bg65?H{rSg>4iw|Vm#f8cWg{XOvrA~vviyWlJlnD{msZ&-pt=O zT=K>SH$5KljpL7^J{hZ}S1%B}*_USzi7kixB^^gu>ZNf_hX>`H3@hxOF3T~;2#r53 zr|FZgQ1x~#UMQX)CjQm5k1XZW7@}hwoE9(V2uX|hp7Sde|IeW^f5}F-U;xDtQgXVB zrJUxNrZK?FxnJdfT*;ZbRLW`UXOAkU^P<@H9)Gmdr^Awe6?oE*V@_=T*DUoGsCw6` z{9PBz{BFhXQswtr%AY@m>OD@|H$~<*=T-h@OZoSp zdR_oW~&m`6TnOtBY4G_Vz2)uA5K})!TVNtloyh z9(a4nhtqL943+cglGeIaIc|4sId3RAyKPd=U#k4ysQjgu#I~!*Qod=or(yv9R6A00 zuPqcM6sdA(ZuqJ8pZ$^j55>Qw{DXqpc<7yuKUn(F9A67W`D3*m7CU)M@X*g>DW}=L z3t%r?4>X6<@li@F@07H57`m49+&e2)KhHpZUe2#n{vqf@&QDZ)wkl^pjd#;8tv)O^ z|7m797I~&CeuTyT`xNg*`DD)(%AS>(Xf28_QTA-=;c>;Uw2a497JDs9tbNff@|phjeoK1+#qo`h=QwS(vO}}plg7$+U1gE;LQH5}&jD3V z3hL$K2x=6^d8(WpmhoN+p2lO#fLr8q$?V#TDt~iLtiOF%`E!*`GsjmB0$9?2yG73r zf+zcNS4#b)s|WWezDKp!5jkmc+^#R(z*Lp2IEn`41&e z$%#_VuJacNrbY3KEcr*P@p!$;Z&&&4D*waaNe_!H@()92@&4MS@*j^y5b57BQu>)G zioe>@zXhtnt1R-Iq4LjB<@64hkoK!Ppf)-XbU5rZ`G17@;g;dUw&-6GEbKJ@9mHt62nRyqm`doyU8uE3KPU$CYPUL zayovEa!CJcEq2>z>Bl=|c5V16vfh+qrM;Q*p9h}epRJbo=MwPLU-r3De$x(HM1D9M zOTX+>{A!Cl$H8v7{L7U*OHP&Y%soF>Fy{FBmqmWs&!giOnyW2U`85^SxmCR-mVT^-f25-c9l_=NNYWa4N6ufW_`6j3r88sY z*{XQE>PNTAe*^T*^$hijW3Q6mZ?Vr&miQCP0de@wknL@^HSd`5-dBoGvBZI=qT?uk&(EcPJgT(m;K`pX zw%C7_%HOH{vswO^5R%IoR^{A`{4~#Y|2khVzTxu5`=We|JB$B0f1K3Y3X7dw0-od? 
zEj$iQ@U$=*Eua8I3kTs0z{Q4C^er^STFpnHNp*!-5x8*TOhwDc1uIf#^K07bBUz~V zT45Se^B&zB<`XhCOJ{QuxhGc9npjD##H)YtgVGpTr}hR;HY2;jY#ZR!046f2+KrlkdlNc61?TfBMYL5KwAF2cw{l|A?lHXpS5ILgEA6_&+7iCzf^je@J{A);7%W z!s^zm1`#vCL=_VWqs?q!uDao29Sc|WwQBsBML_BzJF8(HJ3^MO)_eqUUMNNZ1&M(= z-YHngf}Ku`*s5Vptb%#ZIWbbEetO8pguP4E>%3+$mY0EUPKTr5dY&>{FLe znpRYjjA*AU;@HYl5i%Qe0)}9|%1Rrs(@x|-$+{ac8l1}BWHzykI**{uA>)R^63Ft; zT9j~?f4U(XyJ1zwS~d?C5_4+pi_@S2Cr~@ArWJ&DqIB6P!!T5p#Qw%5$mYr7rWQeS zk)#fN4BePkbT-Oh+>{?1NRgN|N$T1iM@Sst`w7vyp%10Q>TskB&Ah(UX{MftWa+XJ zUIf{p6JcquiE)gI|Goiy*iG1~H;7?WiH8-f(wOr#WW;9c*fbrRp<`t_R;FVi9V^u_ zzLJb)3K_E+TJWPsz~`jbb!OWgns5PP32CKgfi zrAvsI;?ojMY=i^E;9PSC=_gs9*VydC1a|K zi;hM#7hxyKN3p>QGRbPtKTRy4c1I3+`9k`I&|1Pr!D7f1F=B{Vp~Q6g;{+1AWZsi= zMSGa3j^R?8)ga}qMJ=pmHW7$=NaqM#XJnc|n&}B7YX-}x%fzxJ#2|@hATa}zgn`E< zDs;UfL^MhIzD)O+vopVyp1iYWR#to{)Gn7TP64~*JVsTMurUyQP{5QN36m_!>#~8jrhs7peb3e<&D*pW} zAaei?OQmtAo^?)T9V5XVNjw^f8Q8gk74lLlNTQDw(sfQS+)OHT3tg{?%dC6z2Aev_ zRFBB!eMC80FS_B3$Tx!y^}?qF5gmmbJ*3_MqKxPgH~w3RFLvYSN&GH1ewxJV-1vzS z=V1==SAL3K4MRAek!=`toq<~2O{9!AJ-^sTOaO*j)>JPqwaaq2zQ$YbE9qt^#}?It zb~!zE2kn9dPzlxU6eOzZ2B)Bm-NWsIC$ixEPC?SB&U6aOK>m%6$=Nt{ol}s^QKOuK zGOE917uG9j^%7e$ry{Fd>8$LD zmBxh1)U*m?LPZAkEkx7m*X(f~eV4=uNX$?{&>tzB@N^K;l#9d+6=H1RTT!7z1xeh5 z#0(uoy6W@yx+rz=6dd>`>5n75s$|22eSN<0b?OUnwhdfBzQdj120YskG9xy79;bLs z|Jg(Ie%$_QHLBDU4-eC^102Z(7@L#94JmA7g7EM#_12v<9GDkF zAmi$)(E7YVt^Q)-B+$wU?Rg|uMLzT43_8kHS5};iXJh9MQ;X9oNBmuuPylbR9#aLN zd-WiVE~FUbJR&sCe-o}Q&cpfuc1lct7?HmAtsMtJ)V8irIWUQMA3Q}@T|@vod&TL{e)XR~E5IzSmm9K1dABiDVIH?x+SNJ& z*_z>Q=tj>J*$9^-J$NFv1y5@mcoG?JHN2zl!Kh$j>yd<*r9iu~Kpg`{xci^~g*J}Q z(8eoM`)LCc(wLVBF9rVLyVHZr+YdjQj%TnS~|AiXvMJ`-daC-_!@5!TtR&NjDU=49dIz*lVcXhZlxt~nIYAjyytqpGSz^;mH z@>S%)*u=8vh+!AKE!a-FxrO9Ee?W3#THk7cscKK@TPMlFBs&(4fXflFiKYJ;PE#`B z)RvOCLpVJqR=*#dyxIX-l20BErNZ~$x102U`|*=PIlm1lH8WBEf7wd0Ze?tGD<42Q z|E*R&!ib69{4st~XyG!{f2);wx|K+JE4ZHCPb=JesO~N#M`g_}95c!{8dGrtxn+v+ zl4zH4DI_)l+pXa$#aLZ}9h+*K} zFd1YyprVC|)Kmng<*YdxyG3f)oh;ACpqMIpj9(5E_-9Py`ml~0QS}pCb1pKgPoz>b zh6Fd?oSncLkR!rndT@Wk7gg`G+4m+`|h{ zZTVjuOAFu-EJ?>7ETvK!%8F02q1>Gg<@vv+K`9%V2?{W=8QWHxPxQsr`aDP!;W(Wx zlR5Zg141qSF4oJJ=l;=Q`ZT;Rj)4zH=&~Ia;ZRWR#8HXZMq?rlmb&a~0fB{x z!50lyBMi})CvCw<5aKjeW>rfw01 za=2AhM`ZC|0{^PTI>kZ{A7z`bj*?CDc$ zBojCsA;OgE-LfX zt#?bl8NN*|Pc3@3n(QsQ!6{m&s#A-WsS!3#R?!PH;TxS%XT!ZqovjN)6V$(SP-8hy z!wWnezE{m&WEWqeM(Q%S?)ba8xL7czCL~* z0NaFl?U&;)eHBt^#eHhd9cai}%6z~ZX~Wyk#s-95f&w^$tzST}ntEtHPa?r~_^w-s z2`ovL3@m(_^hz|LK6eOekSZ_VhAQGnU%=cA$W61?Hbfs$O19TukYRTAhvafYqkJ=s%N zP6xAiX^!dmiJmm17dZRyd~@H~_eb%nMllG1=m2znM`D5IvQdyj-qvBhak+upJ|j;v zsX>6!wUXn`ysvzSt6#Ke~A0=hO#UC&ghiD~-d6ZFDF#Yml$nS)jt z56^#EJHTnN2B(?#RP)4K4!gLzegS30DiMN-iqE!;=-KV{_}*2&s3`gK&l& z+VRf6WdI59&hx;R_Mku!3(R~2V>DAhbkV*@L2n<9A8qMdU^uiUYCljUM0K=E<{kx^ zjE4tRRT4*exO!9^)m|;Tt_#p&!^K!kH-dEtKBa?;5qwey>k(Y8gUb+nR0mfe_!k{) zMDRf!1cUwcI@lgqNb z^fgzJsX=v#e!%E#S2qI9m7`ScYwYusf0D>_=eRaEl!fhg=87Lt=E!Z5S!R1QrFe60 zaeY#QiwUZcH1Ocb##fV#-bu!5Q;hfUYz7}{c@w@^m$>DQlA5*2 z1eLi2fw+)^p%B+q>Zs_?Gh(zjeNHdgKGXahXZ}BC+Wl}`97a@Cj>-;1Be0cKCCB?X zS6Mh45jVS0-1fjixKv1_$`>7c1XLa`fS`|*Cal9ZG`e#c2P9nz^FDXzmLKrkb$LoJ=NB8@{E^P4P4GJaKub6z^)v|ISOoF zxibBSBO>!en0m|qw=tN}Wc#m`5}Y1QMza2Z(iOOG5Nh{%3dci{aKRc>+4_*eGad{J04?MwiKH499l_8zqAzVe3)w3 z9Wd2IlzL1=UA?C!EQZ%a1e&Lfa)3@TX82`~QHLQtzcjI4E#G3G> z@XzF%g$4ciMcE=rI62?r>_oyVnU;YI?Jh0-v>l8&^rV?zQO(io3|#p%f*a9qR;V8> z=#4dXg^4(K8F&ma(xQ$>UqNqcNr0*>RWDD$R6hCYz?U~u?QrdbN8wx4CPZcYUIryu zCJk)7MVElaEK_UfYqHr^q{ewaOQ~#-42|KHi=@>-XStffuecb9^xDG&erZv4KNEJt zlXObEBaXL}Lvptn?ABCzDuA$jj&EF4p4;p7J*kAfRVCN%gON8pLLG7g^v`HpodXFp 
zW5w5}3c;mT|F4jrg^N&}_q7E#vq!WA6{c+X>Be7Qf4G~cVTfDnLEUx&HR|+-%z4pR z&8s;ui=p#!?0G?)q9L*`ZT0F(u@UJ8Yhgc%CL9X8p^A7?vs;ZHM+6p&*`eojE7Ik# zHV?@{zP6h+M<*izV$}SAT9ga(5AY~2f;$bknB;f4k zu>LIRHAJ%}f}5CAVcwK~Ai}5z`zM*o{Dwd|>w19s71d63 z9ix{^lwavuqoM?j03YkR5@Alif*=$Bj9|S!rMWX3RCCyfi8RTNELPVLb2-MT3;aq7 z0NtU01CNazZ1n<^p;^Owd=8($Cb;IQJ4dKNR+BUAPlXCM=Y#qh;9l;dB}`7om)clBVXR5jHiaFfTz(NSdMwo$_m-bTm-%P;;OYExMn%d+4Z5 z%?lB`Y*Cb%--Eb})STPHa!J$-bEqj{UNSY1bS`fS;|^#XhQ%{aw^OfxhmxEMg8^$B z?qMBcySc9>=Zb#n`@LJ@=&b$R?>!ypdRp|WN1 z9%rPk?(U`AW5|ZSfvr2EJZ4PYh|Lu|+cC90WHps?apn#VmX^gZ7{cT-36|%oR+1|B5+BPvv91erXx|7Z;~rW`)5`GhM{4YO)Q{} zgp25)cMP+g`|{4goG)(V*Ovk6vh^kxrM5uK@BT%QAo2&t#M zC3DnSc1etbKJ~V?_GSY~QMKNZ1J!76$>mN-GC^JFDOsp|cEKYFvc@S$?x`YO&}^V+ z)H^z+u{=)~3_RUHzN^zk>Y$!dQ9ojOAiR)c#P{oZC@Rt{&_y+vJv10-YD}&2V*G;* zFuYM+WTR^p<(vsCQ^#Vqc0N?Yu%$Izl$|<==|!%3cRsTkByNl7RG^hX&sjv>iaubg zt?0QVhPTr(t`OU&I|d#rkY1+FW}?+FLy*uXZ|)#q8-bBTfZJ5b{f5QW&cd^#e)Bz< zW9nn&)$7^&9eYm;scjHkv==dXbL@pT{az?Zru0ZgoBDoQs@JA&N#v?C5*C!65U=pW zFnSd32RG`S!!iASdcPGtFr`RV)rgDpy+bFSom<`h5!fk`vsrXtE{I0;ljqrr&cgT*)tB6!Z-a+#Dtk4$i0cWAs@R2)^ z;A@q4EK%NnCmJ$+4!jftbaZd!v~pD;ojy=JcH*)i_91aDY*N zYY2GvgYRMDwG8>?0oAtwozBzhKWc9+8`V|E2aumef~_-Oz1M{_@g3%|$n$e`RP^5t zTSar3wam@xR7>2fOPOW4SxQw)So?Sn&{ws6yju{CPlT?;fGV^|e)Q&Qs7BWDeRzd} zC~Z|zK(U-tTyLF*Y5bkSMN;@B6vh}Y6WFPp=xMQ#S@IV6huv-{!OOoS+fHr-xjuC> zRa#%4k_}la2%j?A!FH_abpTfd-e3PIIh^;bR4i0CL%A>~L=GThbNM4?>8+aX~dQGK9)>NRYXH!>Y5^z9> zlYlK!2$LKOP$0TUr#`Z{H`DEvn8#sNYI^&)Rv(1|^i?(SFP3gO7t~$gA&MYd8zdhx z#S6q{(YifFNvfXV*tq&(t*>WI0e0&*q5+~`^cm5lOL!!)9ud}2ToqKi+hMCHk5#gr zXak#2--2ARu&K7}GxnB9uFv2+_G(eZTbe_4b7noy!mhKe zZL8UYd>-9ci$~_Z2|XUN{11Rtm<4vi0>*v;?p%H?3CD!qDly5$avg`0S#lHOK^^bF z6jdYyt!~T?+>*-{dEd{R^g;{j^D&VXKFo3BXcMj1sf|K%c^zIL<1 zN#%znzZQklakGiZ>Q!+FkoPw8SoPOE8!VLvdT zn#NTHOB4|C73wco3&2aEzc8cuP_2=^9-ptoSBm8sn!tK>5%uC^bmL^)>eZ>DJW046 zt8uGWhcQn-$kp`;X{ugNLD+KSI2sDol*}14ock{ikzpJ@e=V4Cu_M^NCLi2VuY(#9 zfqn+8MO}&_vR(%EPRBnOo%`6RF})GD2uuje@b3q(vQVPS5b~acj54Uus4RL|O#MmL zf&IaMLW5x{iXcz!E6a8S7ekH{alpt1>~dfHjtz3_CWbz+LJbi?V5l}r{%0pF zif=7W!jUZvn`7h1HNBYH711ONJ5-Kv)5b8a4Qhw_8U?!+Qg6(XQwe+Ik2da$g$Rrq zdVT7<5X8JtxQ>U`%#wXr_?lEiI@BGavtX8fXX7SBlMsoi1vaX}s!KE?F?9t~P5(MS zRlo8!ZlZOe=jxaSi`_Vqr6zg*M53cGw_nY#{}QzpM^5~FQu2W zXwf!8L0}{6aDSVTEjps0)A)xC?g5Y_HdR9sIhtZM+Enlw)Wrn^f>!zHBzVLOpKI0j zLhUD|WMwx-!10Hr@?%Jj-;^Z<^HV_WzQjLP?FCi0LnnYSn6aV%FqCcRA zDBAy;;(Y^Ae6xY(J8 zR88m>;MjE^4}@tFz_V{RyFB|!f+;*ZPUD+Le5Z9M@cowj5PW?^>YrHqa`1f(;rY&Eimrh|ajTh4N*=oL;*(To`)3orMNputytu0!BA9i4PvXX*pqDawFTTO$Z zj+36<4UU^p{^cvbf0NE^v9eC&W{S6}x&}{fVpM|Riy;DaeRj}uDq zgf}^(8`howOpR50DViSpTB80V^i@T~+bI3ID=AhpL6eSK?Vdt%p$euX@OnTuXm4&NZ$3+MbhVl%!MbrWe{ zF>Msm5+nOE_B_wX-Wui8wg7}}9oQ(m-oP~%`HA@|@&P;+qW^pnxCnn4%OWh#WfA_? 
z2H(L^sCR05v%zjCu;>QpytcV?Y{=o;?3s;e(2 z=;$<{2+KoE;jb{Jo}UFVa;dyb5|`>9op-2s!IFd`nnHC9iG%%Taooknk7$9nx?M7d zB@G863#jifn4>alLFyEbJ72&@k&6ArtWpKe#5NW5Y?huM~pIxDxW>BA5qOC zV>Pc6qPpc0B<_%?rn6+Ss7`~MkR~cQlfw1O|5R23FYZTHJJz$JsGbnl=;Bmy?X6Qh z;wmTscaIdFYH=NChpnPd#v^MFH%qBOZq`4T)hkS4t(fYz!|+O&^-n~#xb~2YOg-xg z^tt}T^$&4fQhHYBj2#u%Us*C)T=#Nl|8MuKUtiRZw059py@$F9QO!R-Ra95$6pyHG zhgnoLl(20C=h|Vb=tO4CaECO1t~&Z+N0&r8B!9}apC(p=nI<)B`h58utj>i?U4_^<0R zgfW=)w8F%H4(q_9e4AAap*G|&Pp_VF6Zs{!ilr~L7TG++n`_s!fyHO2N05WVD12lV zC)zMZW@9qm*AD1F0gvVGjG_dKC&+o5VqCB%k#wL9O`7a`iGvu1(jN67=(nqx!8a4T zHw74%d$}Ql8@gsWF5}v3d^`Z8R<}B%ICodJSF*0ZlKO_~)R+ghdStzK&)W+(Q|apy zrS+~5PyUglpE@v-&FvI+V%{&u`g+u6{s4&GoEVl3H`Hx8@r& zJvB#x^d2w?Kp_cSI@Gtw?w_tjDc%O!>r_`?00rjJqz>SNe@ObR2vZ^*Dgvk?GuecI zaBYxS65IVk0;7xf)(Pk?*^KI=LGH<-%vkJ`hQ(*+r(v;~4I4HVTj_;RbJDT6O@I-L zGl)fm(F7Ls2}KKdUX8ko_gE3a$$9)=(~<*#I(8zjl-ujb|C<`v_n~J=Vihg33%-Pc+Rpe`#o&QRu4NU9LR^4uAOea(K==n8c~N{aiNw@Kx&!QT`VgpN z4KxjM6oilJ?>~PmfmRhSs32H)!bR7jdG$+a5hGQwhqs&l;0{)2Gq(`A@%`=clbA9h zxqKfQPxQ>6Z}X=jz<3YB7DKJujGw2YR#A+(&rubeHa9ZmvE=gRcH6wprgQ#Wht69G zW2Ohj(}il4O@>u;6>~%0Hg9Cg@yX>Mfe=#Z zEN*^y5KSv8?nJ4z`AwAJe>M#IN5$}EaUc&}w>qLYPY55zZv%w!`OuQapN9F=jnfD| zuY3J4{W1g6#}7uc51!N4=Tx~us^Ie^{)U%0f5Xv2t7O%2Anb{AQ5!z*Ub6Cu)Tfww z{!&g-+Tw>Ip^gbRqnPI-6z?2~v}-9SeuRRbwJ4{~wAbdQwU~F%i)<0^F85Y)KT z7H2WFYYAIiB`r>5!k)=3Ud|SMZi}nn<*h{gx#315(yrP{DfbxQLDfQA0EM6t^5(tv@Ai>`O-o;n7u!k4Vo6W2LHqs z-Z&POFST1B&(<;FSII3rcm&c;vWXwSuY&xEAC>qMs92n!VozyfHx(u{Oj`XLapTY? zF%ac2%>sdFJwjN6?oiLsNwH+#p%yJqXUm-sJ&DERl_gV{(8dHKScS(Wt-SF-1jtvR z;ab>l{Fa;pj;U+eySbfP36B69OY*}e{w&I`&~zCjbI$YRn=FU5MlmOp&wn%@0}MynD3g0nFkw$5c;s{W7^L~g6e#O}I40Dr z4hAmbFPxy+_(_QBGHLQcV79r1O)dhyK)B=*f_~=L0;b{+B%H~FWlX@yV!o}TcK~KW zPDT_A>5K2uCcnt?nTwc$#t6-Db=_^ue~~n1zld706JbyBH1^%mNEnya*uOxQ5uIW; z#$RbnQg9c$jE%W!?lzWp>kC3zDY8MxV}!j4T&Lhk*`Jy43KNnjyGU- zPa-7h^#Pp-tGojeEYMAVQizteo{ePCi6`hv-ekh3OfY0qGLDBc{jVU=j-wr^isNU5 zV3rI3-&7O=&Zz9mfim3eN+~Pn5R^e5g34aUJTtF)O+wk1VxK}4|H@VNKy;N$#S;z{ z`3Wj=H5J7~`e;o>csDg+F&mBMAQNw4SS9-~dv{F}6G4MiG#hlRp6=!;A$RLTcg$cAXQJO)azxXn;lI9BG8A%D>z z501D4MaJH;Lf9iR>R=tKWIwk1B1G%RXfP8RnPA9s0wSZs&@g1wuG^8SW`mBEEPoa< zItMl@qm1TDmrv5P^EpUUM&uv(an^GQ86AM$uXZ^jA)|yZGx&S7^H?e7| z-uY#E6|-bCbXeK?`~}c-G~pIjc_?F*)@M-V&sup_d#aqsD!4x`;zXlG(bN|0Lp9Uu>|W0{tdi%6de7f8Ri>P`z5(MW0*913 zX367FQZ*`5&c&;cb2j=|TAjI&wmLCA$0`}ZcHRTsX>|^7M9xm>bMg+P?y|Pz6DXMSn-rbdQK9ye}ZX~Ht&NiODU`8*nosRCL@<< z5GZQ8`3uo8u*!m~Oaq>o#+G&RK03J+`-;2Z3Y1<`60kYyExQ=5 zwaF7V+_>Wk*2FSd=)?CX8U+&8ow7mC2&hr~B20BUu{wf(HWYcg!kFF}=clLaEdOC1 zRMiWolPSq1Y|2iA?P3<7b*c!5=S=HK@*TWaR|Q_$@}Dzn2Gmcpnn4*w;ih_$tSkd=wL+@Lz4` z*B~F|@I_t`u!-Bmn3Znm%Zb0z8hnY8#wwP7E(sbtv5;S>f1btpkp#OqCekl)BK=Zr@a3w=%f5_<&=9=f<2MnzKL@qrog>7}&Qz#X#{u6?=|D)wKq6@Hiq zu|UJUP?OgDrFZ(*!>r>50L?580Ajh8OZb2I1N3H;SeA`=XUSKIw>76C*IDG>sCFgY zR&XN(_(8(Lq{Gj<>TkUCvu)gf!2Ng}WfP%K7EZzq#T6gtp0GUGnX6mjl5wPGQsC*S zn>)Th#w+l}C-l9Z(AH5#1V+z4{V6n8{;l%>K$vSPEF;_k>nAakuyT=WP;9pH^2xzNy}Rr|qnznx}EDi$5i3inBzw`S!F zA!~^;kJQQbS~jvLuBNRLEbl-2$G!m zV#@%mEF!-S6T3o`582TBSZjh8Jp;dX+`_U^ZKF5zriXJ^{Gv8_{;1{y5UrYLlRF zY*(i}!m>IHPK_N7P7M^X@pKoCdf3HsIkMW}Z%-iv=hG`I)MgzoCL)yXn-Xsm2OexI z6$BfYtq#}#thqg(0qY+{sWs=Y`k~QNNX)>I73x(_X@1FOg=%6^F7hS820jlZib8zM zRd)zE$g5>u^?EA0eaeEnY2N)sGw%Y?a@c{$Zcy+ep`3icV*pP$p#Nq%V0xkM< z(}2mwixZ5VkbyTOUY}%qfD1P=qAiC(yenj&YnjvQg`=PRi-JbPgkoQ)`zJWhRDI!F z#tnDm#h}}ZW_rz1*K=bEyheL$NGpk9vMgZ-QQvuI;9Z>6RaM_%w>H?VN9ODW=bMoY zs+~;ZWrUhYi?8B9p~`-*LF_PCflRZpmTJ$TP=;!L4qLuUg@fgszsJ56!WuzXql$`Q zK#)JvK0aOo>Jj33$`sBRWn@JyfH}&m}rLkJD^TTLX%H zil3i1o+tbz3}Le{@5B5YAFlt;qzfd+^*dB4!-SWIfP7@??Bk78epzS~@|u^pCyK 
z(rr#lPd;V0^pQ?k0bE|jNnUj=CS;_mwHF1@9QpQQ&#R#>T^7w&kE&Rfj zQNBXo75U3aWIzxMz^xiGfd`7^u88i~OuCkQLC5A@k#dv-FeBGfhL9TxnPu?lZA2fH zhUnk;&n5yeP_H0Kv{nPAKHc(_28@R5)ff(1O#MAm6WphL40$LWzZ{g_$ug$D_c2Xr zVBxdWBGdmF6VTWiH0FBgf!d|Wbv^ax?K1BYN9=Zw3##z1-Q0)m+-I3950m)+Xea%a zNxU*-`WM(qi);)&%&c$H+w3aLVUM1W0#bK;z-fX{qAKfStAfV=; z-DtLM)UgrHc!mo%=t@;=@lcj9^A;`>{}xp8GR}rAjt?(bI^!3}K()!N)`JqL*7S%t zePj-gf|f)cMP3~?SyC+6jV$hbrL@4o?sBFk&pNc_Dg#TsU(4Pb{x({^VkoS`a++NI z>FbZ}vBX(z(Y+6SINYljZf2h7&;iQX%Q*nCi<{wXy2L~9Pi!dm_#}PTJ|8h|z?@xl zUk<8x4nnXIoM{D>+j4>1?+rQtTm}Jn9~=a}2os|uah=?D4j`Of^)6Ob00!&|#u65y zUpVvngL6klI2hqM@}ix6v1gNjpGY5)Azj!))}Y>2tW+lICGgU;0XCjR8aFYQA1YuC zQe2s%K9u!ML~2TNav*&|dYK-HFbqajj?f@+pfvA{SouJg&Gx8uPGQSFAoRXYfTluQ zF=%aNJn`liBwFb5o~;eVhOF?F<;4JFeq-TmaK6rD@1!%jutS9IYtUO zICAX*7=Tz2v9KwoZYY5mHBm7&1QAjBPIO58LmxU3y#OlDNT)htA`3(2i8&SfVgi2w z+N-`Bi$(QdH&(s*NpB1;h`{Yhj0%}?`I2opxEwt$4VS+n$;IV?32C^nm%ET_%eQFB zf`~-`9B3}zBKMDltW(k{E%QP@B^|m;sptx21W{o0;4QV+d5|8$O6gBh5_;w>`iNW9 zU8U37Bj>#U>wj24R{WH@Tf*x?W_Ad3c0FKwbjKadeIj&^1<*L!8p3x9u;zjz8$aRa zCr58mEQQ_MwaWdxosbsjBV_DHadZ#qRGxmc2VJz(esl~dOywGS!DlcyD66|N7X;EO z2C)C{P~uol_Mhs0(&uDI_u7l;iG1pj-J^*C*GKtVthS(x^beRwyL+FCBeEX z>Lg@X$ka?{y5t@M3cSbyqA;#+CB2CGSSA&vdN{dE0xL&T>ZNV24J&n*gnw33__(jD zcOf(hYd{YOC{<_&0ETLM3^rdLnt+1Sg;wem0Z&U5>6DkZP#D_*CrC{UPaF!`d0iSZ zhW{a7a0NeftrmOWXh-Q5bqo{9P}^34v#?cVI+Z8k5_eE{P`}B!gzmfAq^+V?#v{>4 zLT3s*cB@yP;p1&p(KOx-z*jUy?!XpfV!{vh^c&rloQ#*6aCPmMJx~>Fst@wwz(>9I-0$(ygC)SiKxpCg8 zHc2L542BSB!9QmL-Jzl4EkZ@O=RBb6b9P6n&IvMYLt;$?k96irepU-s9k?2WLr85B z5W(sTfM{eP1L~wg>a|^aDS@b6B}u{#z zjQ>LHonGEE8)j)7xZ})#; z?+ZIgr{uWBZf2UiEx0VSHkQrjrj1i_dG7}JC02xV??|#Y(ijIZ$0ikMn?&CTAU)&V zZ>8NG8t+bai@J6vdAvJO@YPlf=N&*V7?SPq!!z2sR$k_{J?c9>#-!mZJ>H(Z0&j;0 za9b>{{y{R&>Y8wWHKmT7ajj#rMbYjyxj+YS_rzYLUC*%sp8X+cSdTU-FaDcKM4u^w z%VtMQkD$6{9f!_!AabDXbgyiG3=f8Y)waLYgcU}wuVq^vr#-!z4P3Wlbzbbx>R|oF zYjx1+I48dHSUZ(wOWU1IFU&4VS+|DuU1HCC_>RsplwtU@mok6j_CMrgvSkQNJ> zI&Hk*V0U%>V^_2dlwT@oOSb7p(xDC4NZMKg*T$@j4KxX5dUfAyU=qwpOm%W9Vs8mJ zc2|&#|HkE?X4Fn#YQhp$sn=PPR4ow5G?Ut@{Sz5#*5N3fBmh+;NlDC~C}tAup1xIz zV|j&W;eAORx0k?mvi=nL`7(Ff0ju_FDJWL0O1&?`5Gb{+nBPO59zs+3%)7QUXaYto++rfhVG8ENm9d55ZDj*#z z3eM`kJM-~i-8%4KyrYxATt*7X|2aFE3~jr1`aw}TE>k!o5(V3vn8eb*(sNG1M^2=r ztw|(ky7~w&O*&oUOrG|FrO%G;r-VJT9Vuc5iERi_0~o}m+eA2*VO#DFKRtX);&+lg zPvQ%w=yDB=%b6~bYBr>m(5ZgMCGVO7A$dRupC;2f^}{C*uCfuUW+mGEIk=Z+fJ+*| zB@fo*S&c8plbT@-LGTPp(}^*9NA+v_TJj^UVhZ&(mhf}Zdm=i}Hd*+i0|cwZ9+lT4 z<|GC)8{>TFJHpK1tE6#rryb{Jb_Q-}Db}kL~3}M0_%yHf#@~iM=YSRUFHyqA{eb80Mk}-2@I2%sySetHd4S zGlM;92E!N+sB0nfSk5}`*SZ0J41QVCnr=a|(J{A z_pd`KeD%=!-UB-8){UqGRPWo$N!us6K(1uZ>6qc~h5H;W#&scCcwnY$}dc_T5>rSJtR^oOL;t29X;N!1;_g0l=| zTBUQ<8yuK8$G{3Qnu)10Da0jv+Y=~EgCo1a_OJ@UDGqf>Fy@I9+b;}{9*Py0`ynaz zHgzN+{;$!q^tAhYYP)wyfqvU17kQrpUwuj&@p!wVU*38K*iDIYj?N|9k5RH%qI<}2 z;X_iN-dTH;KW{%gZSIFU4sIBSutdAnl}?Hl*xr(}R$$L6=N|&LVUz_AuY`SYEFpg3 zL|~w`P!1;{-hh(!yW^7%nd|)4VB66pc*Y&a7mzWd8~skmcu5_~&XAfry2GU*T~j~y zNZQfVGg76!*UwA)MXV8JlJ?;VDQ?$FPTy`hlHB90Lr{M?g9ooIq}Ggr|H6Jv7BdXq zZ#hVK|9``4&Sq@MeON#)Cd7c#qc#T}-z4F3X7D(HYj0asK2(YSC2HdIF;3dKvd~sZ zn(@DC41>?of}{b!ALn1;``YJU6OXtL6Nh+;x%cGk8&o{Y9a-&JN;#~JeW;tm-3%UEbKGLs(XJh0jue8-1PRJ;LUj*IdKf< ztViVw#<-jBdrj$&uHiG}(!ew|_5fQOr6|Nwc__%rTy_Y`rcm6YM$z=!YxmoydpAvB zD&1Q(Uz6#c_eKic=hk}Y{&s|i?vuSa?;$6JZVbX1=$;_@11)vvt~vz7WiFeCvNqk$ zNoVX?fKLwH-aSJw9R8Eul;}{+9l~B^POHRt(xGM{F6YrldCsGkC5FGMp`4>z0Nsft0HZ_3V3QVfmdn+Xbd#N zKSh2v4(K=g#;yz8g`=%lZNh-uN88sJ!YP9$bl!Gf#W7MJUpunv-{#iu%v?W?Q#sCi zj2C4nykIcQbsKb$*P(H^c@pNt}Mt}kTB6M-SanuW>m6urtc#gHeJv9XGU9G^^*^&7+1Kz=k5;D?OkkD*gX z+m`@?;IX9ieHa*B`Bh-_Z2Zo8Eiih<+Q8^(e+!H*dmu16bVp!xX*DjGaSg65nv1{r 
zfYlb=6Cqr>0dStbliQnovYxFQ78DBOj>XbAAIYjAeLWRI<D=1#4%>k1*P1El*yVi+udC2J%@(P@kt+ zt5d;R$qZt%SyK8w`gzuBZkhhKVLy28@^BnMc`1q`DDna+&Ro#?{TWt-nY*b$L*)rn zf7dy5zb*e!EB1wGZ2?5f*gj5(;-@V-34a@LjyGgBkuo&`pV5XNt~vk$?;-1lfX&&1 zols?N9~3;V!EKxei1|5)+$q)~0-ItjC1iaG80%bxA>)FBM;TKNE-?L@p4``mq!iw()`KB0ePcb%0|5gLJL<%KAQ=p|-T4eM1 zItKni3-PxA+=mt-d;1-5+W6qw4n1wNTT&{@L}29K>pj5gEo%Gn$Z>a4w~@Kic{ z;_na0TTRt;?m+(eudoHDAqB??XM+!vUC8j?jAWbh{+lFBZpb&e8ptD9ks8QA9m1w9 z|6GJoYX(XpHd%fu^L0Z7Xia?pZ7}>H7PCr5BVVnIfz+cuNB}EVpHadBgImCxHAWEZEvMvpI3gZ7q1`SW-|> zS{vS4TmC(+FMQAT=v?;%R_sz6{0_nIP|#UfOdqq(*cW{1f6Ti5J!Jm}V^%)&@_#dC zb?)!f-+?jf@eGA`Y|MhMl_GyznmijosY25FF{r%|>cH?E3lGMjKgzJ{0Q`0HnaT=X zyXsV3_Sm){*X?z&z80?Lgz@n(z{EgaReV@@5DI98XvcZ!u2-D0GR||z@8At98S*Ov ziHavz4B)MNz~NnP6zcPJoe7;tlsGIrUm|v+&{9D<{yQj02XbQU#-nkO1eNLf@WZO* zorrQWFa?8=)CL+Mt1>HOd_BczRc$}X@`DQt?5NMJ z4x3eS_bO=hjj!=Dd9Nb~*7(QW1TPVN?5sUNfdR)iO$hK#aE-*L&fG^z#L!}lhk6(C zG&7v573vr0pEwJ>0#vZ`#yF*si3gT5BQ2^jA6AYhwHL!9Uo17sVXfeXBLQDcD<013 zfO9fat@7najo}#M_^%u@xD$;Vp2w`v$91>v&f47k$L{?M?3BqdTn(K`Z3GDD@i4Hd zL&XMhl- zz*oiLjW8Zh0dY6xfjB&Ok5qVpUMgGEMaXlh{1rPpYfMIJ*TWE}ZkbRKwGB&U4(&sBbsf*&54^u-JyLv&ws36Jb;3#$op^zfI=g7U;!4O-~^R7 z^69pTIq^x7f09)BBRMJOG*CZpC?jq1*~)gU2H|D0YKk*FY4Cu9S(-~+x_*a_BS5dx zq8%=PATSCm%I&9NDCdZ_UUkbLIDK)Kz6pwga-!HK<0RI0(Pb!P`afJrx+jvcjbQBS zh_(e+vg6BpJ*I!wZDiuO;L3S8ZRB5l13JW5|5-@nCvH|^z+Uy`AYaHVT6R4rR3&4P zqi#TTIhyZ2+yg?7;fL+Vg*brTtl2elBEQ@eV+e)T&zJ=0^N5z*%% zx9KZVwC8t7b+u!~4p5|hLhB9CQI)(-4PTZQY1;7075d0iK2t4gq2-gPJ zR7KX{Qxd8h>w+o|noPX`e`t*W7|W2&M**C=$C0ai$B@JAvJeazp9{)C1iFb$Y?B+= z$iTt)X89b}aiZ!FM@^(&zR}2&E{M_+$X<9H{KqE#c#&_)8JfkYwK)5_(Wn9E8epx; zZL1-BvhB(*2I0LeK*vKroNS@&sDUBVzbl9W3U#Pj=c_0K@H*8eHq72%M|{CMc0{vjPua&{Sx(tcyQNX7O677_=2wcTgV*U+{=wBUdwmE2uGV z2A1ORDc3z)R15q9WBP4!_JA`foIQ{q<~@WwoITLVP5e=(z@+gr#qJ0FoEb6ireHsz z+J4@vTwZoK0~C**3-#j$vsH3_H<*$|>Tu~3BSFb&Oz0URLh2JHER?puH@-w1 zLkgJ)*ia~5p(vRL!=#Raoff4A>*cCRhq^%IZ(lVznJncT|5WL>c_&mRo~Jk$=%_Du zNxXiMcv_+Z(-YOT26fH6fu1UZS>;(&K`-%qzZQlDegD_62Eho&Xc3j~gLm#Q4>an2{-m3lrICcMn8!TmGZ?o-Hu{(Zrl zI!G?L+zI82Q4e-#{bDd!{J-%9H=yU~`a9qY-k+iHj`@OGM#4dGcW+THS-I5TB|IKN zO_7@Bb}x9>qgJ6OIMS%@=4N>kdpWhR{Fii5Nu!nq1i0+ARCgC*SaYs#K(oy9 zZGHGSuY7Z>xd1TKcntJLbR*QdE!v5{NW%<}p>dKLxj1C{`{u~XK*b?Chg0^uUeF(^ zl7SYh(nF2~TIyERiyyq5QbSm3An~-M5x7WqBsfuz#^neA?EE*FpzEt9M@w-z2DO?4 zoPO8iV_G|W4pTC5ruCE@#M>l}GLS+8vjm#{w&hbh!^#c;x9s^>NzZb4y!e0$?nfbn z9@Hzw>vx}t@q#9~X9#La*0Zd0ZOz`t%Ef1gmNT7?({7{x5I;;U*$HUyW?j)e$Wlwu z9su)$HO{>R%s_jMAm8oqXfQTxHCf{C(DA>brF3Ed-*qPv8e^)62PeE<-vkQ34h0Tt znHku9$huDl`MwUEyY}nAP|lL^4-F&3p~Cd%0;5Em>H%<&i1>RS-xp)oHj>dx-oqpL)VbKS`9aYZUDo4xvwHeUJmGu2U`90D43;Akd{ce zLyL1_0Jc+m8OKqaJbo`{P-IfCy^PHF0xbXOfSiDq3w!?JC(x~J!haC&d?N^(9eA$@ zh1Jo6pq!Yf6+MiCNZgr;!;z?YqW%VMdLatGxjj-eH$2RK(dJ>Szho%mEUPp>29VFo zQ3v4`z4K}4+nhydJ}rk@^RtI0ILD!GbqV+joJJ7&Ige0>og?iS2S2UU+GG{&$^2Js z<+6&37;TX#cLrZoF*=6`aM!sNVZN15e+>UP5Yxq%j}~^QSJ(($N84XhFOv110#8T* z%<{zjyO;KbyU=o3n1X51f~gnx53Xhq&N==W{`yBj{k_>N?_Yv2czh7}rIsvU12P1= zTq$-Y_39oJ;is}}j>J|^(Ht3!qsZ#wzo4pnH2~}OZ_XunEVdV&$NVP06TOkqze^x=7m9y}6E5X3cvUi3;z19 z$18FYKqXT`#^!XADuy6r9i3-!`OQ?)Ct=0;YaA>X2}b);mKWy(k`Wr z_qQuSzXE;HInBgLk2hTy#GF3v9hUjrhrM0e6AJuo7pO}NB*mPvj!mN{B3@ij$uRHO z&9bCHo<811Fh1My@$PFS;T)Md*$#ep%*PuAjr-sDcxqRt{tozf&t@pR13sRrfqrPt z*yCEr@Za)q!lt87(GKW7h0$DIXT)%^np;)s4->>YJBvLH;=*Y-unPSIVXVW;Pamdc ze351By18RQ)lE!e@43#e<{^|Q<$l$H&a%+>bv7kR=lv@ULun$0ds z<-)30UzqJ-Qi1lcD9TBQ{wDkuf|gaUpzE{ZERBaZ&YSk$S`okz&^C_PBK6$(uj!t_ z`nW&@T=W}-rR>r7qZSrX;D;h2kEyam1T9%E`ZUJZ%OG{Qf$MK{95M=L&5`2o%?Vu9P|K$#dnT;kA{wXvO`M;$U7 z&_70W8DrH={Gn5S9WN+35zGK28_#7T+e`wpHk8dxh(*kSmT?+E$zLoSjkYk=l%JZ| 
z$+V|_^%0GiP7AG7#H5j>YMS_y3qh|%fwt&M#7K!LAz(dD$^|U$^SFs5B};{lC;-mb z9UB+y4qt|)k63B}t4>dTxQQ7uu?T8R$iXiR<%C@P=HoXXzXkX$!0#yhj>2ytehcwi zjNjt6=nCKjX8NG5mY{`I@CzVB&@VJt!Ww+^DgyF#aZLg%y0T-RPgO14_efTgN(^UE zlYoA8l~B4Ch#bq`gDgJ?Jh=`m$>w;jpEokCX+%WN2&ny~sG^QZhcXsW+M;KpFo6Qr z#&Hr0+|D;X5PvW5q$IfTLqegwDXhDe0DlW|NlE_c+Ezv z3wE=BRkRXe8HlvQxavc)i;v18+t+nB3t=$8u#jzEIkUf}YUx*V(R1|&S*dE@gSX&< z*iQX5>OO3lxX}3FU7ZPgg$)A?xlI^UAv4|o#U5e|x2u2$zIZ;wOAbN2?~5cI)bz`@ zt|aT=1v}OoqoFZ%GkRht-tVEs%k@!u3&A8IY7D(6KC*+)&xedfl}N|R8pEtL~^i&86n#0qN_ z>P1%&Y)un?3eaN%sL&Q&ju=TbB_utT1p*coD`Z)MpU%6qBe`*FAhvv>^@M8LKc(oS z(Z!O25ysNNEYQBlE+%=|?84DHPsN(b94z$9!kz{-v!-$b05E}o+L|4={ry1U=HVZBSyNCkVFJDisHMxxwQIoJzGsT>anS{ncFEC9u+U@u6 zc#uK;5&XfrwM`D0*!3JGnGx?@k)z&AZ~s=L|191+>3XmO4&L$dko6xS{ZE)AAZ>#p z(tJMK0jodN?MN0rShCLyUn+4X5i*KFRxvnRJ%JhyeT-pyRv!&GYW_}myKwCMVfb4r z;Zx`DhVYE}zro*`^9SG|P&#MYw^4Mo*T(n&Xgez`$iv=MjVeS}#R7R~=XG%DTjF~$ zF&3z4**Pg4MOpw09T)2mKnN;)DHXv+4pw02?{w6rO!jIpEQ2~A5RN(Xnit=8b~x`s zZ-Kq!ek7d3>)r(Wke<*f-eSw$NaDBA2h5?KOgCQ zO=^*N9ad8vN<2enMy=w-LTgbKM+&QQ~or`QJs|Ku4%mcVWEH-X9m<>|XY+_Ay zDmyL=0il@sXIzv8xaTMFs^TNy$ZD8(CBu|Cw54ltCw)<+fJtiL6Pd|wzkvd`fn$>! z;HVl(G(hP<=otC}w}9Ie7rxdQWvhW2P68bga0D)mpgC}fY=R0Za6rUCI}RUThi$@f zRzX*IT*=KQND(7&Z zq_6$kRZ{dI-}QKZ5Squkh6zCNlg#NGCH+rG#|{_}k{z-c2w;ayepsy%+1Me|$xZxG z2f#?cQn4U5$&l~e1Fv+tl2N`<1j%mg9Qd&0@MVXphQY-4S50I+-JsxN@~-P(aNP{A z2U084B(}-`KQe5zsb7A>3Vb}UFHCY)s2Bv})cl-GsVV8W8_Pl!SD{cgCQ)=n_zFM{ zTIb3>goAxVE|RhO8>o)mV3>HEIM^7vv>m1cTF6y&zX3RtjowMdt9>0ta7j&MiElm( z&Jqr^v?}<9Z21y|Hjajs0W99Y;!1O{-JA-V1rDx;wy00Fwbm~>9+f~iKHdzYAXdEY z>7gP42483ZIbw!+R-qOfa~SxLvz0yEK*WsHs=q)3xfffrz7kBg*S`xK+l()(@!_58 z{pvu#^?nHLLMW_f-q)zjQr`j%BmKSuH*a=A!x$i@gRmZMztYXX4BLJb1t(Lc6M1{u z>RSLgt|yzj&X(T82^WuuV+hZdsmv%ljx@SW6Z5;|IXzv7i3PHpiD{0s!n&pTc&BL; ztoB*BAp`n*R54(4jq+YCPn0PGTw+z4KYE=3alMI;&LvQ_fni%>-dFH?xa5ayOiG9n zP$OyOZ-P3k0R}rba)>jfGm4dd^$Uu`Bg6|Z*%bf_ucCwN7`}b80Wj;$K-l z?hB7{*Eo<;ncs|OPkd@N<{hrKz!#15wgtaM2HM4@-UtLRyyQvb0cZvtY-~(_s~dH0 z5ICztvhoHALNd-`g6N^WK%}P_pa5&1-wG*24}&W>E{Qm=lFN{t3Z&gI*yM&sC43LT zm^9*H(aL=pfbwnlp0*g)?qdY>ko<{Fqm3yqbb2w9)`3zY6_%k#vw`Q%_@V%B63XE* zoKyG3KSNu%H0Q34VNuzxfpjbiNAjTe`t!)_6R z0IE980`xvl2c<2*`9wIgi5!4b9 z#EUi@uu$t1U)ySA8AoTlE733qgx=Qyo4OB;gBu$V@7WIaPCwUs@JY3j+M?E@48{Ok z$3PR7nj(`mO=USD>^PIWab@{)4%CUE9IBEw?185T4yYBuhWbi#;v7IJ;w%iHpci`g zBvYy?kKzG4!ZAy8V7d~N+nbhEk-i!8hv4rG3m}VLvIC`Uj*vs%ayzly{N3=kbbbJT z&x23&bG42V=wmZL$69f|^t433?y-AWVBzCH1bEvCM~3%yBBE6o&7kNogBZp{f4XjB zdAacALkH6J0?oy$NXl^!4{2b_$4+KdaXQeGG*D=qHV}oGAnqfD0t=_&*MYjp)(H3Q zJeM{tV>Aj`9Emjp&bQ>bz1yQUg;mAl!#FTU5wYK6=pHvQXU<5S0?Xak&Jtqp<%l@k z+r3_bd)q*tUbysB@O$x^$#fpgFVJyU0^F`x{qr9HRYyhz>tRoH4 zT#YW@T*R<=BNl%6Fks>{uqq$Kr|C0jpxwdVwzpS7)3K2JJ@LU(;P*3)iDOdpNh|DB zT0L#0^kMMYiAUPiTua3EYAVO{&4-ypu&;DXq9?;)(EI)@;hPWB#=}cRAGCvW*ro*0 z32S4+@y$P(a44sWB8D^zf0Yxe`FucVY+3Hy{gaqlqhQG;H$jF9DCL z$l6Xw0ztu!3K|qNYSctP69r8KG))q6GX#ku5*L&)$cN&B+F=n$z)qAq*Ty*Rj^i@w zjE;`uf+&iRumy0Lh#Sg4P-djs7QvOc=6~N)b#Hek!3F&O=i>qT-dawbI(4=>wcM!v z3C{U)-)erGfhb%E6u49PRtbjCbc?pd$hkO~`Y~1?)lMSj(=ifINLXorCy)jP!V3Vj zv;FSADB%F$h^1!-c*-KjaG(ai$r%pOqesQ~G$(ZMibS&lZncJs85mwhA1sYPB$kLe zy~vQD!J?s}(=HTf#xH0r{#81Dw`i7wx)Og2fU*Q?l~16)*Ik?#IeDPrq8ALpK=~~3 z__17%Au4Y|+u(0q`Me`dWeC4i4*wMuHD15Iw<82}K=&5A?MPCux8-}HUSCh$gTBTN zn%gVk{TSX=*e5~dqz7-AJ9u$y>((_RzDFHjdIo5&4#oB|jNtjgW*1}9if-7Z&?-Yt4Bhqt^$_@TWpF6aqZUvXswqMU!qdbTx9iV1rQcE|`Age+GRJStgIi1HjxlyAlHWF5IWJ+~QUm z6#D~40P{)g6B3^e0}nZax)Dv1v zPme~w!!_^KA)XnlU-k_5sG@Xf4dt!sPmhOxwHoE zE8aq9`UF1EW6Ib3emDXxQ<~{lA~hkD+yzlx1g+rlFi?xtb_0oLX0nXxfY$GVIo3|u z9>}@+zkUB4`w6}M4dgj8-v+h`I}xS3_5nS-b^0i1%iz=8OTjz&ID 
z6@4c14aSyPS7fy0K?47#b((z64C;Mw15|k_Mk#V2zu6;{qpy>rt_Q|g*Tb{S=Q&e^ z{YdQ0!!X=l+$skk>5r^Jda9P3=A~*_=wMZjU(rH zt;(;pzt>{_J#CTBV*;5s^tnhpOs*uE*~|Ts8l^QgsTE4O@HE*q_5upOpufxV5HoaD zcWV;ia#md(3zEJUswH@H?g^%kCoe@G`$7rm0tKl=s#W(Hm1JC+UPhT##B6n$sc>mQ zi*1tNP9hMoZ2%uYqGT+v*@Xx8g9&Nj(a7)9Xba##kDpRRGy#7WT2d*eK9&szj>R=f!K&+0Zc`~%t%|De5sKZ)?i z#KQkX)6&C#k?0c<>OJG5mm)$jQQzp%uEl7Pt_7%wU?_(BJhXrpJ;%60H~*Xa&U>X> zCX?g7_*ih(z8a4#184EomI_0@rL#S{{cSJkCZ;E-0m4%Pjm!*NPco#vi8}i?Bb4!R zQr#J}7QEMCP-E7Wycy`8&9^`joPyRiS|mu5?(VQqR(DRyjMHU)v(C9*?_P01A+g`C z1Xo0s#Sjj7p7O8pKT(@ktaIt(o%pm^mR-Ei8M_ecjxe7=y|qi-5`}I_>n%Q86WJSn^b$5{udlh^aws4<|@Z0Vz1@PkFP%y`i>h zjEZm<^Sq^1ME=pO!Z{|YsJ$nTV!u^{V{f$2(h)>6QKLCr21AZT z@A!~o)T(yq9BrE~L|HIWOaO2ViXr7U>r&j9?qQ66Im91!2(#*8;}HW?Lvz?&(sh^`Y2K%=`^kFhuNJvooYt9md zs6CAnbQ!Is&e4+!PazWBJ=9VfgEzKBF;MHFgUBi?b(6v>8i~K_Vw(Hk(6Qy;j8%RU zPiGXVm!#3uKKgMQO^tKE|6QDg1wZ)*Sn$ObA*d*IOdU2d?RxO7tCFH4_Ujdq%;pZ) z4aD%wrWh|@57&2_1qtC9AaM?n_yGd`zaks4s~{hV(_pOFVt4)PH(v!&kD@aoG^HH& z9=f|_Mpw*ZTAC(Po|F=2;-gJMl&JQYWq^P}CqDrSZ3Nr@))d++jV_%#L8A>%T@>NN zaQoDB-C;eIlg@S5r98#ZWzRh&^_hfa%pe+%g|{FYqq;Y<#0n10_CDyb~dN!7p=xG)h`}Nhj~r@ z>cE%UUpJ{+NsDbIsa-4Y?gdXgt|w_4eV!;p$2|0__Z%=}@8w4!d%^bwq=r9Lhm$?{ zhwT;o350)?hOY-EO~cnD+oYWe-7@zQ$M)=D@arDhSM=;7d|gpJ_MU*$Ht7x>D$+Ko zQn&n@G5T*}M`exgr453!*ckPpo+jl)os`Sr$b(8T>gNx-f;VD!?D>c-6%6t0TnDGZwr}7`@e;&P%7ggQ0o1+gnF)j`yIXytNsE3beAvX%-kN%iN=tlPS*_#5so-n z&qRb*v}>CY-~KGNe`HMi`|gGCOZ&^-WCwklQ`U`Mbs3VV@wWPxri&Xm0N2EjeSI`Uqky%+4-ZAswmt50BH0S!m_>n$f` zW;TXJz827JMg12mCQX`Dum9l(`NM9>LcTNjxPgZxga*rC%g6nF$QxtX-!XH)Li z=ZBwI`C}WY63zek<=Fh2V)8BLbalZJo<|vSe)w5jAZsYb{*SSoemNK8&YrrBRR^AL zHKiPm7ydzSgg!WL`hrGK%|KNa`TqB&NWE^K@V|e_z#kJ2zkPo!rzD-U7vQG-H6(Z_ zCGk}JRJQr;<}#}etUkg zFVEb`tHiO9^M^8fVrDBw2(~!oC617D5sp+eq*+0nOw8|BA^M;jVvqI?b^Z#%+o6K1 z+2{qlZ~XGi^fG5+W(Ed33wI68N?QuTv?>289u28kXW1WN7}Wtl4qpMV-$d+O6%w`aTi`h#kP0Yr)n7QM>8~V)>DU73EdMtj2{w#jo{1W*6 ze;WRPc=$hnsv7>7cJOg8?NNs$*r!%X*_9}&E?iT{JO{kW<+)I06IZV9MGdBjkxuFd3$ z0AzoN3%AIa!>OnAZKjs3&L0nkYf|IE4Tj$#ij>2h0QLHxc{xAFPG4bLYX6`ywzrK6 zU6L-_BUKk_E_>VRqRg5El|nURe*{Lw#nS3|h+3JgEa%?Lxxk^!c{r1%z07$cvmU<& zix`andQug*?Vs7Ped-*0n_xi{(97$kNz;Dai?QvWvX||@eQ$^UcWl2)Z2Kv3?LQBZ z-kbiP+p+x)-49BOru#Q6j_rQlFKgaNld;dS^nc|~9s1v){iN9TyT-Nu0yym#bbt5l zU(m7r0^ict+cCgAY1B}l_JvpqocyaWpgarfm4*HXLN1I*=C{A#X&W%VW$#i)Ndub= zhrbQIAzkd7Nz;>B22}!-$BlqA^QAjuBcgNx!`q%h|ha9zrH;X(9wD>ouhx_zS6Hfs53Czy)DY<$d`7^P8 z6S>8hbbgBuX_p|4#_$pls2%+~vnRpRlLDnG<7-rMhCO&LZ{-}CsWQ--ld^!X&d^NU zG=KjJ#Ta|-#z5cToP?x64)tSarg>C4ZQhJ%+yl(l1LedwTxrp^e#V)r3c@VaR{{5Hnmp9 z28w8fIvz60q}9)oJ6hD0$a3xHu5c>)p~=(PczuYG`1$dgO)&`d{_3un@jAtf*9%xV zf}`|TZI@?m@M~XiOPL*jA=1WY-j7Gz zb{G!gzw5iX1cdd^`{8iBS7TfG1(}m6M^h<*GiY4qf{V(Wg~r${i8D47uL9cH@}-2w z0jU6Si|kKH-%)rh3kBeggnDP+X~uG~MR<_?9lOwv(BK?o@U*;Zp}~d7 zREyl4Pt|v%o#fcexmV|u**~gVOrg-=hwebm_3|pbu%EgF6lN1Ii^u`lGUw=X4+UQX zH*tqc^Z$*jz$JEl0X0Y@ohh|)E1c)~mVhZCmN&l#f|uD}vxB{!`>`#|ujLtiP!^h) zSuQjYa!^Q0(M#rYiTug|x$&yVVy zGW#R7otz=mSizhHZqAP?XsTlu_wC3D$g7czao``e_)k(9!(9Msj69@0HJ+rnA-CM4 z#s>1KM~!^g2~uOZ&{n9?bs?zn4Bv&W_9_7WybrZV4468F4wFyv03fE|T2s)aeqdv? 
zt=FKD;J&UkRKE{;C$fYjM>i~cmHMwplz&Z=C!sm<1 z!u-y_yWPMepX9cK=c_%33c#a7u7Oz)h56$%fSH5$`e?CzP;63t_+-|#SB1Y5o~e&~ z!I!rw!S*WjC;Wwe3?FaGP2;cE)+#Y}zMId9()^EXYXsgSk4DS=*_0a!Xr`Z+p&Z~J zhZLjnXixo!a1LvNqA$+Hk@msQp|4}dsMnto18{RXhAwz&Le(+nkN007CKdY^N^b=} zeX@upU~PJ}z*0;?L4O-8%yFfa|3+XXm04DHtJT<=WaY28wwsl|VftH<RozY^5Y_#id1J%+x#`y&H{y@36LJ_comRrJGUtyBIYwn4a0Pau1LK2;kv(u% zIbD+r*a_p7h5HJdiVsvumH;t+&sN+&QTVi!zC{)zxL8p|6pf%Irv)-KL5j=P*|xzp2z- zZDm8;-&doB!0(qOejeyn>Lddn9L*h{S{Aw(L6c8mGs?0fC|8!aRb{d$g$Y7)=*Up= zX=OHnjmL-=r|Hixg?c1JZ1oCMFX$E7=u%$)f}MU(z3Rpn@qT8zcYpLfTn8vI9HRZV zEhM_Kztg@vo?9`N6~(#R^LN>A)8M5AMlU{{LwrI^vCi9tDdUAsUH+caHkmL42zrrk1F|0q7)*vAp$MxOIRwF?}hS_Pc^?P zp^yEhN1jsq`=6zcW&anQ`ro5Z(lO`|0!G`^;-eB0ayROzj^YLBpw$VQR-^F2SCUBx z7NwQf<)hf;M08om7C;Dgn>r6YuWQgGEZPuRM9-=#n!s17@$~(XczVn@r{w?2uc5l>w_4%niz!z zzD27#V;zqi2a_D1KWWI}a#5s#43Wf+xA5tkBk=Srq>z7&wSW;Z^0P&Zh)j&M@tF6? zqwG5*6EmCgQu}LaxSY1~3F1fXSPbTFvzPnfbWS*xG@`y%Z{uT7F#=qU@v$HrnvPXh zl~%4oso#bA=BV#j5(9IsRK{@Y=@t8lxmJ8YuMBgpyMRoYvn2BYA~kO%af7uL_41Sf zl_j8BloIDDIaw56oxj8BCsqWcY8SCrEjD<`#r?NJSXciDvPEN=g{ZmiGOalKlj8g@1J6X;D6&7)jr;)5Ng9dV9pxXWUTp*Oz(b-~{;S@C02uYh zD;4>1EU?9X10nPx(m5R9ET9DEL5GY!ml`eVL|FJ3`2;;dO4?h|U$7Ortt_z>Kfs>> z4gB`X+=in3t#e>CTz8DU*UpM`+<;`C;8S??EUjEg=OTYaN0P!`(@*pOO26AdjT>*+ z9}KY!dMQ6S%W>D^t+BLM^9kDt@;Cr3!chQR_1IHOdRt43viQkuUqfE#p@B-xVT zfXrM4|5&i{+s8-#E_Rjtr}cfTJ+z#IV)R(2$_pmb`5kJ)U3N`ir#j54Y{W--)A270 z*CYc}yde~=iPQoJThtnMs@6`{YR%f6S|fwiyOM_clc3uU!uGT{3mJKd+UD4Pu5Rnc zrzeevLH&j=1q@#E2L{Hl-pXrjK2#6ln~PW}1XrYCwt$&U!(%STQGUT{96B)dSDM#^ z17#e$!f-kju)-8zXuMpk80{Dy8r0va{93*EC_U5S1SAG$sD!{iPI+tH`k5E<3z(G~ z>DB^OE7TX?G2AS&yb4;-%HCw